Skip to content

Commit b9372bc

Browse files
authored
Add server instructions and make all instructions externalized and overridable (#161)
1 parent e310cb2 commit b9372bc

16 files changed

Lines changed: 791 additions & 406 deletions

File tree

packages/datacommons-mcp/.env.sample

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,20 @@ DC_TYPE=base
4343
# Default: "base_and_custom" for custom DC
4444
# DC_SEARCH_SCOPE=base_and_custom
4545

46+
# =============================================================================
47+
# LESS COMMONLY USED OPTIONAL CONFIGURATION
48+
# =============================================================================
49+
50+
# Path to directory containing markdown file overrides for server instructions and/or tool descriptions.
51+
# Supports partial overrides: only create files for the specific instructions or tools you want to replace.
52+
# The system will fall back to package defaults for any file not found here.
53+
#
54+
# Expected structure inside this directory:
55+
# - server.md
56+
# - tools/{tool_name}.md
57+
# DC_INSTRUCTIONS_DIR=/path/to/custom/instructions
58+
59+
4660
# =============================================================================
4761
# NON-PROD ROOTS (optional, base DC only)
4862
# =============================================================================
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
# Copyright 2026 Google LLC.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""
15+
Core application module for the DC MCP server.
16+
"""
17+
18+
import json
19+
import logging
20+
from collections.abc import Callable
21+
from typing import Any
22+
23+
from fastmcp import FastMCP
24+
from fastmcp.tools.tool import Tool
25+
from pydantic import ValidationError
26+
27+
from datacommons_mcp import settings
28+
from datacommons_mcp.clients import create_dc_client
29+
from datacommons_mcp.utils import read_external_content, read_package_content
30+
from datacommons_mcp.version import __version__
31+
32+
# Configure logging
33+
logger = logging.getLogger(__name__)  # Module-level logger named after this module
34+
35+
MCP_SERVER_NAME = "DC MCP Server"  # Name passed to the FastMCP constructor
36+
DEFAULT_INSTRUCTIONS_PACKAGE = "datacommons_mcp.instructions"  # Package used as the fallback source for instruction files
37+
SERVER_INSTRUCTION_FILE = "server.md"  # File name of the server-level instructions
38+
39+
40+
class DCApp:
    """Core application wrapper for Data Commons MCP.

    Constructing an instance loads the DC settings, creates the DC client and
    builds the FastMCP server with its server-level instructions. Tools are
    added afterwards through ``register_tool``.

    Attributes set by ``__init__``:
        settings: The object returned by ``settings.get_dc_settings()``.
        client: The object returned by ``create_dc_client(self.settings)``.
        mcp: The ``FastMCP`` server instance.
    """

    def __init__(self) -> None:
        """Initialize the application.

        Raises:
            ValidationError: If loading the settings fails validation. The
                error is logged and re-raised.
            Exception: Anything raised while creating the DC client. The
                error is logged and re-raised unchanged.
        """
        # Load settings
        try:
            self.settings = settings.get_dc_settings()
            settings_dict = self.settings.model_dump()
            # Only log whether an API key is configured, never its value.
            settings_dict["api_key"] = (
                "<SET>" if settings_dict.get("api_key") else "<NOT_SET>"
            )
            logger.info("Loaded DC settings:\n%s", json.dumps(settings_dict, indent=2))
        except ValidationError as e:
            logger.error("Settings error: %s", e)
            raise

        # Create client
        try:
            self.client = create_dc_client(self.settings)
        except Exception as e:
            logger.error("Failed to create DC client: %s", e)
            raise

        # Load Server Instructions
        server_instructions = self._load_instruction(SERVER_INSTRUCTION_FILE)

        self.mcp = FastMCP(
            MCP_SERVER_NAME,
            version=__version__,
            instructions=server_instructions,
        )

    def _load_instruction(self, filename: str) -> str:
        """Load the markdown content of an instruction file.

        Priority:
        1. DC_INSTRUCTIONS_DIR/{filename} (if set and exists)
        2. Package default: datacommons_mcp/instructions/{filename}

        Args:
            filename: Instruction file name, e.g. ``server.md`` or a path
                below the instructions directory.

        Returns:
            The override content when ``read_external_content`` returns
            something other than ``None``; otherwise whatever
            ``read_package_content`` returns for the package default.
        """
        # Check specific override
        if self.settings.instructions_dir:
            content = read_external_content(self.settings.instructions_dir, filename)
            # Compare against None (not truthiness) so an intentionally empty
            # override file still takes precedence over the package default.
            if content is not None:
                logger.info(
                    "Loaded custom instruction for %s from %s",
                    filename,
                    self.settings.instructions_dir,
                )
                return content
            logger.debug(
                "Custom instruction file %s not found in %s, falling back to default.",
                filename,
                self.settings.instructions_dir,
            )

        # Fallback to package resources
        return read_package_content(DEFAULT_INSTRUCTIONS_PACKAGE, filename)

    def register_tool(self, func: Callable[..., Any], instruction_file: str) -> None:
        """Register a tool with instructions loaded from a file.

        A warning is logged when the loaded description is empty; the tool is
        still registered in that case.

        Args:
            func: The tool function to register.
            instruction_file: Path to instruction file relative to instructions dir.
        """
        description = self._load_instruction(instruction_file)
        if not description:
            # Not every callable exposes __name__ (e.g. functools.partial
            # objects), so fall back to repr() instead of raising
            # AttributeError while merely trying to log a warning.
            tool_name = getattr(func, "__name__", None) or repr(func)
            logger.warning(
                "No description found for tool %s from file %s",
                tool_name,
                instruction_file,
            )

        # Create tool from function and add description
        tool = Tool.from_function(func, description=description)
        self.mcp.add_tool(tool)
117+
118+
119+
# Create global app instance
120+
app = DCApp()  # Runs at import time: loads settings, creates the DC client and builds the FastMCP server

packages/datacommons-mcp/datacommons_mcp/data_models/settings.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,12 @@ class DCSettings(BaseSettings):
5353
description="Whether to use the legacy search-indicators endpoint (True) or the client library (False) for fetching indicators.",
5454
)
5555

56+
instructions_dir: str | None = Field(
57+
default=None,
58+
alias="DC_INSTRUCTIONS_DIR",
59+
description="Directory containing custom instruction files (markdown overrides)",
60+
)
61+
5662

5763
class BaseDCSettings(DCSettings):
5864
"""Settings for base Data Commons instance."""

packages/datacommons-mcp/datacommons_mcp/instructions/__init__.py

Whitespace-only changes.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Act as a Data Commons Research Assistant. This server provides direct access to a massive, unified knowledge graph of aggregated statistical data from authoritative regional and global sources like the UN, World Bank, and Census Bureau. Use it to transform natural language queries into precise statistical insights by identifying specific indicators and retrieving their observations. It contains historical and recent data points on topics like demographics, economics, health, and environment across various geographic levels. It does not contain information on topics like real-time news, subjective viewpoints, or private corporate data. Crucially, every data point retrieved must be attributed to its original source provided in the tool output; never present statistics as "known facts" without citing the specific organization or dataset they originated from. Prioritize data integrity and transparency, ensuring that users understand both the metric and the provenance of the information provided.

packages/datacommons-mcp/datacommons_mcp/instructions/tools/__init__.py

Whitespace-only changes.
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
Fetches observations for a statistical variable from Data Commons.
2+
3+
**CRITICAL: Always validate variable-place combinations first**
4+
- You **MUST** call `search_indicators` first to verify that the variable exists for the specified place
5+
- Only use DCIDs returned by `search_indicators` - never guess or assume variable-place combinations
6+
- This ensures data availability and prevents errors from invalid combinations
7+
8+
This tool can operate in two primary modes:
9+
1. **Single Place Mode**: Get data for one specific place (e.g., "Population of California").
10+
2. **Child Places Mode**: Get data for all child places of a certain type within a parent place (e.g., "Population of all counties in California").
11+
12+
### Core Logic & Rules
13+
14+
* **Variable Selection**: You **must** provide the `variable_dcid`.
15+
* Variable DCIDs are unique identifiers for statistical variables in Data Commons and are returned by prior calls to the
16+
`search_indicators` tool.
17+
18+
* **Place Selection**: You **must** provide the `place_dcid`.
19+
* **Important Note for Bilateral Data**: When fetching data for bilateral variables (e.g., exports from one country to another),
20+
the `variable_dcid` often encodes one of the places (e.g., `TradeExports_FRA` refers to exports *to* France).
21+
In such cases, the `place_dcid` parameter in `get_observations` should specify the *other* place involved in the bilateral relationship
22+
(e.g., the exporter country, such as 'USA' for exports *from* USA).
23+
The `search_indicators` tool's `places_with_data` field can help identify which place is the appropriate observation source for `place_dcid`.
24+
25+
* **Mode Selection**:
26+
* To get data for the specified place (e.g., California), **do not** provide `child_place_type`.
27+
* To get data for all its children (e.g., all counties in California), you **must also** provide the `child_place_type` (e.g., "County").
28+
**CRITICAL:** Before calling `get_observations` with `child_place_type`, you **MUST** first call `search_indicators` with child sampling to determine the correct child place type.
29+
**Child Type Determination Logic:**
30+
1. Use the `dcid_place_type_mappings` field from the `search_indicators` response to examine the types of sampled child places
31+
2. Use the type that is common to ALL sampled child places
32+
3. If more than one type is common to all child places, use the most specific type
33+
4. If there is no common type across all sampled child places, use the majority type (50%+ threshold) if there's a clear majority
34+
5. If there is no common type and no clear majority, this tool cannot be called with child-place mode - fall back to single-place mode `get_observations` calls for each place
35+
**Note:** If you used child sampling in `search_indicators` to validate variable existence, you should still get data for ALL children of that type, not just the sampled subset.
36+
37+
* **Data Volume Constraint**: When using **Child Places Mode** (when `child_place_type` is set), you **must** be conservative with your date range to avoid requesting too much data.
38+
* Avoid requesting `'all'` data via the `date` parameter.
39+
* **Instead, you must either request the `'latest'` data or provide a specific, bounded date range.**
40+
41+
* **Date Filtering**: The tool filters observations by date using the following priority:
42+
1. **`date`**: The `date` parameter is optional (it defaults to 'latest') and can be one of the enum values 'all', 'latest', 'range', or a date string in the format 'YYYY', 'YYYY-MM', or 'YYYY-MM-DD'.
43+
2. **Date Range**: If `date` is set to 'range', you must specify a date range using `date_range_start` and/or `date_range_end`.
44+
* If only `date_range_start` is specified, then the response will contain all observations starting at and after that date (inclusive).
45+
* If only `date_range_end` is specified, then the response will contain all observations before and up to that date (inclusive).
46+
* If both are specified, the response contains observations within the provided range (inclusive).
47+
* Dates must be in `YYYY`, `YYYY-MM`, or `YYYY-MM-DD` format.
48+
3. **Default Behavior**: If you do not provide **any** date parameters (`date`, `date_range_start`, or `date_range_end`), the tool will automatically fetch only the `'latest'` observation.
49+
50+
Args:
51+
variable_dcid (str, required): The unique identifier (DCID) of the statistical variable.
52+
place_dcid (str, required): The DCID of the place.
53+
child_place_type (str, optional): The type of child places to get data for. **Use this to switch to Child Places Mode.**
54+
source_override (str, optional): An optional source ID to force the use of a specific data source.
55+
date (str, optional): An optional date filter. Accepts 'all', 'latest', 'range', or single date values of the format 'YYYY', 'YYYY-MM', or 'YYYY-MM-DD'. Defaults to 'latest' if no date parameters are provided.
56+
date_range_start (str, optional): The start date for a range (inclusive). **Used only if `date` is set to 'range'.**
57+
date_range_end (str, optional): The end date for a range (inclusive). **Used only if `date` is set to 'range'.**
58+
59+
Returns:
60+
The fetched observation data including:
61+
- `variable`: Details about the statistical variable requested.
62+
- `place_observations`: A list of observations, one entry per place. Each entry contains:
63+
- `place`: Details about the observed place (DCID, name, type).
64+
- `time_series`: A list of `(date, value)` tuples, where `date` is a string (e.g., "2022-01-01") and `value` is a float.
65+
- `source_metadata`: Information about the primary data source used.
66+
- `alternative_sources`: Details about other available data sources.

0 commit comments

Comments
 (0)