pytc-client/server_api/chatbot/agent_cli.py at 5aa4eaa7264dd8fb62518661d43d416c5e224fd9 · PytorchConnectomics/pytc-client · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""
Standalone Agent Test Script
Tests the multi-agent chatbot system without starting the full app.
Shows all RAG retrievals, tool calls, and final responses.

Usage:
  python agent_cli.py                     # interactive mode
  python agent_cli.py -b                  # batch: run 20-question PyTC eval
  python agent_cli.py "your question"     # single question
"""

import os
import sys
import time
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from server_api.chatbot.chatbot import build_chain


# ── Failed questions from 40-question test ──────────────────────────────────

BATCH_QUESTIONS = [
    # Test #10 - Fabricated CLI flags --batch-size, --checkpoint-interval
    "Give me the command to train on CREMI with batch size 2 and save checkpoints every 5000 iterations",

    # Test #14 - Didn't override scheduler explicitly
    "Train on MitoEM with the WarmupCosineLR scheduler and a base learning rate of 0.002",

    # Test #17 - Wrong override format --inference.AUG_NUM=8
    "Generate an inference command for CREMI. Use configs/CREMI/CREMI-Base.yaml and checkpoint outputs/CREMI/checkpoint_100000.pth.tar with 8 TTA augmented views",

    # Test #32 - Fabricated scripts/evaluate.py
    "How do I evaluate synapse detection results for the CREMI challenge?",
]


def run_batch():
    """Run the 20-question batch test. Agent is built once and reused."""
    print("Building agent (one-time)...")
    agent, reset_search_counter = build_chain()
    print(f"Running {len(BATCH_QUESTIONS)} tests...\n")

    for i, q in enumerate(BATCH_QUESTIONS, 1):
        reset_search_counter()

        print(f"\n{'='*80}")
        print(f"TEST {i}/{len(BATCH_QUESTIONS)}")
        print(f"Q: {q}")
        print(f"{'='*80}\n")

        t0 = time.time()
        try:
            result = agent.invoke({"messages": [("user", q)]})
            response = result["messages"][-1].content
        except Exception as e:
            response = f"[ERROR] {e}"
        elapsed = time.time() - t0

        print(f"\n{'─'*80}")
        print("RESPONSE:")
        print(f"{'─'*80}")
        print(response)
        print(f"\n({elapsed:.1f}s)")

    print(f"\n{'#'*80}")
    print(f"BATCH COMPLETE — {len(BATCH_QUESTIONS)} questions answered")
    print(f"{'#'*80}")


def run_single(question: str):
    """Test the agent with a single question."""
    print(f"\n{'='*80}")
    print(f"QUESTION: {question}")
    print(f"{'='*80}\n")
    agent, reset_search_counter = build_chain()
    reset_search_counter()
    result = agent.invoke({"messages": [("user", question)]})
    response = result["messages"][-1].content
    print(f"\n{'─'*80}")
    print("FINAL RESPONSE:")
    print(f"{'─'*80}")
    print(response)
    print(f"\n{'='*80}\n")


def interactive_mode():
    """Interactive mode for testing custom questions."""
    print("\n" + "="*80)
    print("INTERACTIVE AGENT TEST MODE")
    print("="*80)
    print("Type your questions to test the agent.")
    print("Type 'quit' or 'exit' to stop.\n")
    agent, reset_search_counter = build_chain()
    while True:
        try:
            question = input("\nYour question: ").strip()
            if question.lower() in ['quit', 'exit', 'q']:
                break
            if not question:
                continue
            reset_search_counter()
            result = agent.invoke({"messages": [("user", question)]})
            response = result["messages"][-1].content
            print(f"\n{'─'*60}")
            print(response)
            print(f"{'─'*60}")
        except KeyboardInterrupt:
            break
        except Exception as e:
            print(f"\nError: {e}")
            import traceback
            traceback.print_exc()


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Test the chatbot agent")
    parser.add_argument("-b", "--batch", action="store_true", help="Run 20-question graded batch test")
    parser.add_argument("-i", "--interactive", action="store_true", help="Interactive mode")
    parser.add_argument("question", nargs="*", help="Single question to test")
    args = parser.parse_args()

    if args.batch:
        run_batch()
    elif args.interactive:
        interactive_mode()
    elif args.question:
        run_single(" ".join(args.question))
    else:
        interactive_mode()