Skip to content

Add docs on how to integrate deep research into a repo #36

@cmungall

Description

@cmungall

Currently it's possible to do "deep research lite" by instructing the agent to simply search the web for papers about X and summarize them, but this is likely "lazier" than full deep research.

Note that it's now possible to use deep research via the API; I'm including a script that has worked in another repo. Think about strategies for others to integrate it. Maybe we need a PyPI package? Also consider things like timeouts. We absolutely want this to only be triggered by ai-controllers to prevent cost issues.

We also need to experiment with MCPs and this API, cc @dosumis

#!/usr/bin/env python3
"""CLI wrapper for OpenAI Deep Research API for gene research."""

import sys
from pathlib import Path
from typing import Optional

import click
from openai import OpenAI


# Default developer prompt, used when --system-prompt-file is not given.
# Built from explicit pieces so that significant whitespace (e.g. the two
# trailing spaces on item 2) cannot be lost by an editor or formatter.
_DEFAULT_SYSTEM_PROMPT = (
    "You are a molecular biologist and gene annotation expert conducting comprehensive research to support GO annotation curation.\n"
    "\n"
    "Provide detailed, well-cited information focusing on:\n"
    "1. Gene function and molecular mechanisms\n"
    "2. Cellular localization and subcellular components  \n"
    "3. Biological processes involvement\n"
    "4. Disease associations and phenotypes\n"
    "5. Protein domains and structural features\n"
    "6. Expression patterns and regulation\n"
    "7. Evolutionary conservation\n"
    "8. Key experimental evidence and literature\n"
    "\n"
    "Format as a comprehensive research report with citations suitable for Gene Ontology annotation curation."
)


def _part_text(part) -> str:
    """Return the ``text`` of one content part, or its string form if it has none."""
    return part.text if hasattr(part, "text") else str(part)


def _extract_report_text(final_output) -> str:
    """Return the report text carried by the last output item of a response.

    Every content part is included (joined by blank lines) so nothing is
    dropped when the item has more than one part. Items without usable
    ``content`` fall back to their string form.
    """
    content = getattr(final_output, "content", None)
    if not content:
        return str(final_output)
    if isinstance(content, list):
        return "\n\n".join(_part_text(part) for part in content)
    return str(content)


def _extract_annotations(final_output) -> list:
    """Collect the ``annotations`` of every content part of the output item."""
    content = getattr(final_output, "content", None)
    if not content or not isinstance(content, list):
        return []
    collected = []
    for part in content:
        collected.extend(getattr(part, "annotations", None) or [])
    return collected


def _reasoning_text(step) -> str:
    """Return the readable text of one reasoning item.

    Looks at both ``content`` and ``summary``; falls back to ``str(step)``
    when neither holds anything, instead of writing a literal "None"/"[]".
    """
    # NOTE(review): reasoning items in the OpenAI SDK appear to expose their
    # text under `summary` (and, in some versions, `content`) -- confirm
    # against the installed SDK version.
    texts = []
    for field in ("content", "summary"):
        parts = getattr(step, field, None)
        if isinstance(parts, list):
            texts.extend(_part_text(part) for part in parts)
        elif parts:
            texts.append(str(parts))
    return "\n\n".join(texts) if texts else str(step)


# CLI entry point.
#
# Parameters (all supplied by click from the command line):
#   gene_symbol        -- gene to research; also used in output file names.
#   organism           -- organism name used in the query and default path.
#   model              -- model identifier passed to the Responses API.
#   output_dir         -- target directory; defaults to genes/<organism>/<gene>/.
#   api_key            -- OpenAI API key (option or OPENAI_API_KEY env var).
#   save_reasoning     -- when set, also write <gene>-reasoning.md.
#   system_prompt_file -- optional file whose contents replace the default prompt.
#   timeout            -- seconds to wait for the API response.
#
# Side effects: creates the output directory, performs one API call, and
# writes <gene>-deep-research.md plus, when available, <gene>-citations.md
# and <gene>-reasoning.md. Exits with status 1 on a missing API key or on any
# error during the request or while writing results.
#
# The function docstring below is what click prints for --help, so it is kept
# user-facing; developer notes live in this comment instead.
@click.command()
@click.argument("gene_symbol", type=str)
@click.option("--organism", default="human", help="Organism (default: human)")
@click.option("--model", default="o3-deep-research-2025-06-26",
              help="Deep research model to use")
@click.option("--output-dir", type=click.Path(),
              help="Output directory for results (defaults to genes/<organism>/<gene>/)")
@click.option("--api-key", envvar="OPENAI_API_KEY",
              help="OpenAI API key (or set OPENAI_API_KEY env var)")
@click.option("--save-reasoning", is_flag=True, default=False,
              help="Save reasoning steps to debug file (default: off)")
@click.option("--system-prompt-file", type=click.Path(exists=True),
              help="File containing custom system prompt (optional)")
@click.option("--timeout", type=float, default=3600.0, show_default=True,
              help="Seconds to wait for the API response before giving up")
def research_gene(
    gene_symbol: str,
    organism: str,
    model: str,
    output_dir: Optional[str],
    api_key: Optional[str],
    save_reasoning: bool,
    system_prompt_file: Optional[str],
    timeout: float = 3600.0,
):
    """Research a gene using OpenAI's Deep Research API.

    This tool performs comprehensive research on a given gene symbol,
    gathering information about its function, structure, disease associations,
    and relevant literature.

    Examples:
        deep-research CFAP300
        deep-research TP53 --organism human --output-dir custom/path
    """
    if not api_key:
        click.echo("Error: OpenAI API key required. Set OPENAI_API_KEY environment variable or use --api-key option.", err=True)
        sys.exit(1)

    # Deep research requests are long-running ("several minutes" per the
    # progress message below), so the request timeout is explicit and
    # configurable rather than left to the client library's default.
    client = OpenAI(api_key=api_key, timeout=timeout)

    # Determine output directory
    if output_dir:
        output_path = Path(output_dir)
    else:
        output_path = Path("genes") / organism / gene_symbol
    output_path.mkdir(parents=True, exist_ok=True)

    # Load system prompt (custom or default)
    if system_prompt_file:
        system_message = Path(system_prompt_file).read_text(encoding="utf-8")
    else:
        system_message = _DEFAULT_SYSTEM_PROMPT

    user_query = f"""Research the {organism} gene {gene_symbol}. Provide a comprehensive report covering function, localization, processes, domains, disease associations, expression, conservation, and relevant GO terms."""

    click.echo(f"Starting deep research on {gene_symbol} ({organism})...")
    click.echo(f"Output will be saved to: {output_path}")

    # Top-level boundary: any failure in the request or while writing results
    # is reported once and turned into exit status 1.
    try:
        # Make the Deep Research API call
        click.echo("⏳ This may take several minutes to complete...")
        response = client.responses.create(
            model=model,
            input=[
                {
                    "role": "developer",
                    "content": [{"type": "input_text", "text": system_message}]
                },
                {
                    "role": "user",
                    "content": [{"type": "input_text", "text": user_query}]
                }
            ],
            tools=[
                {"type": "web_search_preview"}
            ]
        )

        # An empty output list would otherwise surface as an opaque
        # "list index out of range" from the [-1] lookup below.
        if not response.output:
            raise ValueError("API response contained no output items")

        # The final report is the last item of the response output.
        final_output = response.output[-1]
        report_text = _extract_report_text(final_output)

        # Save the research report
        report_file = output_path / f"{gene_symbol}-deep-research.md"
        with open(report_file, "w", encoding="utf-8") as f:
            f.write(f"# Deep Research Report: {gene_symbol} ({organism})\n\n")
            f.write("Generated using OpenAI Deep Research API\n\n")
            f.write("---\n\n")
            f.write(report_text)

        click.echo(f"✅ Research completed and saved to: {report_file}")

        # Extract and save citations if available
        annotations = _extract_annotations(final_output)
        if annotations:
            citations_file = output_path / f"{gene_symbol}-citations.md"
            with open(citations_file, "w", encoding="utf-8") as f:
                f.write(f"# Citations for {gene_symbol} Research\n\n")
                for i, annotation in enumerate(annotations, 1):
                    f.write(f"{i}. {annotation}\n")
            click.echo(f"📚 Citations saved to: {citations_file}")

        # Save reasoning steps if available and requested
        if save_reasoning:
            reasoning_steps = [
                item for item in response.output
                if getattr(item, "type", None) == "reasoning"
            ]
            if reasoning_steps:
                reasoning_file = output_path / f"{gene_symbol}-reasoning.md"
                with open(reasoning_file, "w", encoding="utf-8") as f:
                    f.write(f"# Reasoning Steps for {gene_symbol} Research\n\n")
                    for i, step in enumerate(reasoning_steps, 1):
                        f.write(f"## Step {i}\n\n")
                        f.write(f"{_reasoning_text(step)}\n\n")
                click.echo(f"🧠 Reasoning steps saved to: {reasoning_file}")
            else:
                click.echo("ℹ️  No reasoning steps available to save")

    except Exception as e:
        click.echo(f"❌ Error during research: {e}", err=True)
        sys.exit(1)


# Invoke the click command only when this file is executed as a script.
if __name__ == "__main__":
    research_gene()

Metadata

Metadata

Assignees

No one assigned

    Labels

    codex · documentation (Improvements or additions to documentation) · enhancement (New feature or request)

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions