This guide explains how to integrate new media data sources into CultureMech.
Adding a new media source involves 4 steps:
- Tracking - Add source to `MEDIA_SOURCES.tsv`
- Fetching - Create fetcher to download raw data
- Importing - Create importer to convert to CultureMech schema
- Building - Add commands to `project.justfile`
Before starting integration, document the source:
source_id source_name url api_url record_count data_format access_method priority
NEWSOURCE New Source Database https://... https://api... 1000 JSON REST API 1

Key fields:
- `source_id`: Short uppercase identifier (used in code)
- `download_status`: NOT_STARTED → IN_PROGRESS → COMPLETE
- `priority`: 1 (highest) to 5 (lowest)
Create src/culturemech/fetch/{source}_fetcher.py:
"""
{Source Name} API fetcher.
Fetches media data from {Source} API/database.
"""
import argparse
import json
import time
from pathlib import Path
from typing import Any, Dict, List, Optional
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
class {Source}Fetcher:
"""Fetch data from {Source} API."""
BASE_URL = "https://api.example.com"
def __init__(self, output_dir: Path, delay: float = 0.5):
"""Initialize fetcher with rate limiting."""
self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True)
self.delay = delay
# Session with retry strategy
self.session = requests.Session()
retries = Retry(
total=3,
backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504],
)
self.session.mount("https://", HTTPAdapter(max_retries=retries))
def fetch_all_media(self, limit: Optional[int] = None) -> List[Dict]:
"""Fetch all media from API."""
# Implementation here
pass
def save_json(self, data: Any, filename: str):
"""Save data to JSON file."""
output_path = self.output_dir / filename
with open(output_path, "w") as f:
json.dump(data, f, indent=2, ensure_ascii=False)
print(f"✓ Saved to {output_path}")
def main():
"""CLI entry point."""
parser = argparse.ArgumentParser(description="Fetch media from {Source}")
parser.add_argument("-o", "--output", type=Path, default="data/raw/{source}")
parser.add_argument("-l", "--limit", type=int, help="Limit for testing")
args = parser.parse_args()
fetcher = {Source}Fetcher(output_dir=args.output)
media = fetcher.fetch_all_media(limit=args.limit)
fetcher.save_json(media, "{source}_media.json")
if __name__ == "__main__":
    main()

For scraped sources, add ethical scraping practices:
import time
from bs4 import BeautifulSoup
class {Source}Scraper:
def __init__(self, delay: float = 2.0):
"""Initialize with 2s delay for ethical scraping."""
self.delay = delay
# Check robots.txt first
        # Implement caching to avoid re-scraping

Ethical Guidelines:
- Check `robots.txt` before scraping
- Implement 1-2 second delays between requests
- Cache pages locally
- Provide attribution in README
- Contact site administrators if scraping large amounts
Create src/culturemech/import/{source}_importer.py:
"""
{Source} to CultureMech Importer
Converts {Source} data to CultureMech YAML format.
"""
import argparse
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Any, Optional

import yaml
logger = logging.getLogger(__name__)
class {Source}Importer:
"""Import {Source} data into CultureMech format."""
def __init__(self, raw_data_dir: Path, output_dir: Path):
"""Initialize importer."""
self.raw_dir = Path(raw_data_dir)
self.output_dir = Path(output_dir)
# Load raw data
self.media = self._load_json("{source}_media.json")
def _load_json(self, filename: str) -> dict:
"""Load JSON file from raw data directory."""
with open(self.raw_dir / filename) as f:
return json.load(f)
def import_all(self, limit: Optional[int] = None) -> list[Path]:
"""Import all media to CultureMech format."""
generated = []
media_list = self.media[:limit] if limit else self.media
for medium in media_list:
try:
yaml_path = self.import_medium(medium)
if yaml_path:
generated.append(yaml_path)
logger.info(f"✓ Imported {yaml_path.name}")
except Exception as e:
logger.error(f"✗ Error: {e}")
return generated
def import_medium(self, medium: dict) -> Optional[Path]:
"""Convert single medium to CultureMech YAML."""
# Map to CultureMech schema
recipe = {
"id": f"{source.upper()}_{medium['id']}",
"name": medium['name'],
"description": medium.get('description'),
"ingredients": self._map_ingredients(medium),
"preparation_steps": self._map_steps(medium),
"provenance": self._create_provenance(medium),
}
# Save to YAML
output_path = self._get_output_path(recipe)
with open(output_path, "w") as f:
yaml.dump(recipe, f, default_flow_style=False, sort_keys=False)
return output_path
def _map_ingredients(self, medium: dict) -> list[dict]:
"""Map ingredients to CultureMech format."""
# Implementation here
pass
def _create_provenance(self, medium: dict) -> dict:
"""Create provenance record."""
return {
"source_database": "{Source}",
"source_id": medium['id'],
"source_url": f"https://example.com/{medium['id']}",
"import_date": datetime.now().isoformat(),
}
def main():
"""CLI entry point."""
parser = argparse.ArgumentParser(description="Import {Source} media")
parser.add_argument("-i", "--input", type=Path, default="data/raw/{source}")
parser.add_argument("-o", "--output", type=Path, default="kb/media")
parser.add_argument("-l", "--limit", type=int, help="Limit for testing")
args = parser.parse_args()
importer = {Source}Importer(args.input, args.output)
importer.import_all(limit=args.limit)
if __name__ == "__main__":
    main()

Add to `project.justfile`:
# Download raw {Source} data; pass a limit to fetch only a few records.
[group('Data')]
fetch-{source}-raw limit="":
    #!/usr/bin/env bash
    # Shebang recipes run as one script, so abort on the first failure.
    set -euo pipefail
    echo "Fetching {Source} data..."
    mkdir -p {{raw_data_dir}}/{source}
    uv run python -m culturemech.fetch.{source}_fetcher \
        --output {{raw_data_dir}}/{source} \
        {{ if limit != "" { "--limit " + limit } else { "" } }}
# Convert raw {Source} data to CultureMech YAML, fetching it first if missing.
[group('Import')]
import-{source} limit="":
    #!/usr/bin/env bash
    # Stop if the fetch (or any other step) fails instead of running the
    # importer against missing data.
    set -euo pipefail
    echo "Importing {Source} media..."
    if [ ! -f "{{raw_data_dir}}/{source}/{source}_media.json" ]; then
        echo "⚠ Raw data not found. Fetching..."
        just fetch-{source}-raw {{limit}}
    fi
    uv run python -m culturemech.import.{source}_importer \
        -i {{raw_data_dir}}/{source} \
        -o {{kb_dir}} \
        {{ if limit != "" { "--limit " + limit } else { "" } }}

Create `data/raw/{source}/README.md`:
# {Source Name} Raw Data
## Source Information
- **Official Name**: {Full Name}
- **URL**: https://example.com
- **API Documentation**: https://api.example.com/docs
- **License**: {License Type}
- **Record Count**: ~{X} media recipes
## Data Files
- `{source}_media.json` - All media records
- `{source}_ingredients.json` - Ingredient mappings (optional)
- `fetch_stats.json` - Fetch metadata
## Provenance
**Fetch Date**: {YYYY-MM-DD}
**Fetcher**: `src/culturemech/fetch/{source}_fetcher.py`
**Fetch Command**: `just fetch-{source}-raw`
## Data Structure
```json
{
"id": "unique_id",
"name": "Media Name",
"ingredients": [...],
"description": "..."
}
```

- {Any special considerations}
- {Known limitations}
- {Data quality issues}
- {Citation to paper/database}
- {Link to terms of service}
## Data Quality Checklist
Before marking source as COMPLETE:
- [ ] Raw data fetched successfully
- [ ] Fetch stats recorded (date, count, version)
- [ ] Import creates valid YAML files
- [ ] Schema validation passes (`just validate-all`)
- [ ] Chemical mappings applied (CHEBI IDs where possible)
- [ ] Provenance documented in each recipe
- [ ] Cross-references checked (deduplicate against existing media)
- [ ] README created with full provenance
- [ ] Build commands tested
- [ ] Statistics updated in main README
## Cross-Referencing
To avoid duplicates, check against existing sources:
```python
def check_duplicates(new_media: dict, existing_sources: list[str]) -> bool:
    """Check if media already exists in CultureMech.

    Args:
        new_media: Candidate medium record.
        existing_sources: Sources to compare against.

    Returns:
        True if the medium duplicates an existing one.

    Raises:
        NotImplementedError: Until the comparison below is implemented.
    """
    # 1. Exact name match
    # 2. Fuzzy name match (Levenshtein distance)
    # 3. Ingredient composition similarity (Jaccard index)
    # 4. Cross-reference IDs (e.g., DSMZ:1 appears in multiple sources)
    # Raise instead of falling through: an implicit None is falsy and would be
    # read by callers as "no duplicate found".
    raise NotImplementedError("check_duplicates() is an outline, not an implementation")
```

Create `data/processed/media_crossref.tsv` to track equivalencies.
Always test with small limits first:
```bash
# Test fetch (10 records)
just fetch-{source}-raw 10
# Test import (10 records)
just import-{source} 10
# Validate
just validate-all
# Check statistics
just count-recipes
just show-raw-data-stats
```

```python
def fetch_all_media(self):
    all_media = []
    page = 1
    while True:
        batch = self.fetch_page(page)
        if not batch:
            break
        all_media.extend(batch)
        page += 1
        time.sleep(self.delay)
    return all_media
```

```python
def flatten_ingredients(self, medium: dict) -> list:
    ingredients = []
    for comp in medium.get('composition', []):
        ingredients.append({
            "chemical": self.map_chemical(comp['name']),
            "quantity": comp.get('amount'),
            "units": comp.get('unit'),
        })
    return ingredients
```

```python
# "import" is a reserved word, so `from culturemech.import... import ...`
# is a SyntaxError; load the module by name instead.
import importlib

ChemicalMapper = importlib.import_module(
    "culturemech.import.chemical_mappings"
).ChemicalMapper

mapper = ChemicalMapper(
    microbe_media_param_dir="data/raw/microbe-media-param",
    mediadive_dir="data/raw/mediadive"
)
chebi_id = mapper.lookup(ingredient_name)
```

- Increase `--delay` parameter
- Implement exponential backoff
- Use caching to avoid re-fetching
- Check robots.txt compliance
- Add user-agent header
- Contact site administrator
- Consider manual curation instead
- Check LinkML schema: `src/culturemech/schema/culturemech.yaml`
- Use `just validate-schema file.yaml` for debugging
- Common issues: missing required fields, wrong enum values
- Fallback to name-only (no CHEBI ID)
- Log unmapped ingredients for manual curation
- Contribute mappings back to MicrobeMediaParam
- LinkML Schema: `src/culturemech/schema/culturemech.yaml`
- Existing Importers: `src/culturemech/import/`
- Chemical Mapper: `src/culturemech/import/chemical_mappings.py`
- MEDIA_SOURCES.tsv: `data/MEDIA_SOURCES.tsv`
Open an issue or consult existing importers for patterns.