Skip to content

Commit a45306e

Browse files
karena408claude
andcommitted
Inline course subrepos into monorepo
Convert the 10 nested git repos under Learner Tooling/, Long Courses/, and Short Courses/ from gitlinks (submodule pointers without a .gitmodules file, which rendered as empty folders on GitHub) into regular tracked directories. Exclude *.mp4 to keep the repo light; the two Gemini CLI podcast files are linked to their upstream location in sc-gemini-cli-files/README.md. Fix the broken Gemini CLI thumbnail path in the top-level README. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 3de9359 commit a45306e

842 files changed

Lines changed: 177101 additions & 11 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,3 +107,6 @@ yarn-error.log*
107107
Thumbs.db
108108
ehthumbs.db
109109
Desktop.ini
110+
111+
# Large media assets — fetch from upstream course repos instead (see each course's README)
112+
*.mp4

Learner Tooling/database-viewer

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Copy this file to .env and fill in your values
2+
OPENAI_API_KEY=your-openai-api-key-here
3+
FLASK_ENV=development
4+
SECRET_KEY=change-this-in-production
5+
CHROMA_PERSIST_DIR=./chroma_data
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Environment variables (never commit secrets)
2+
.env
3+
.env.local
4+
.env.*.local
5+
6+
# ChromaDB data
7+
chroma_data/
8+
9+
# Python
10+
__pycache__/
11+
*.py[cod]
12+
*.pyo
13+
*.pyd
14+
.Python
15+
*.egg-info/
16+
dist/
17+
build/
18+
.eggs/
19+
*.egg
20+
21+
# Virtual environments
22+
venv/
23+
.venv/
24+
env/
25+
ENV/
26+
27+
# IDE
28+
.vscode/
29+
.idea/
30+
*.swp
31+
*.swo
32+
33+
# OS
34+
.DS_Store
35+
Thumbs.db
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# ChromaDB Code Search UI
2+
3+
A Flask web application for searching, browsing, and visualizing Python code using semantic embeddings and ChromaDB. Built as a companion app for the **Context Engineering with Chroma** course.
4+
5+
## Purpose
6+
7+
This app serves two roles in the course:
8+
9+
1. **Teaching material** — Students ingest this codebase into ChromaDB using AST-based chunking pipelines they build in the labs. The well-structured Python code (models, services, routes, utils) makes it an ideal target for practicing chunking strategies.
10+
2. **Interactive tool** — Once ingested, students launch this app to explore their collections, run searches, and see how their chunking and metadata decisions affect retrieval quality.
11+
12+
## Features
13+
14+
- **Semantic search** — Natural language queries over code using OpenAI embeddings (`text-embedding-3-small`)
15+
- **Regex search** — Structural pattern matching across the codebase with analysis and explanation
16+
- **Collection explorer** — Paginated chunk browser with filters by file path, chunk type, and symbol name
17+
- **Code statistics** — Construct detection, size distributions, and symbol rankings
18+
- **Embedding visualizer** — 2D PCA projections of chunk embeddings to explore clustering
19+
- **Smart suggestions** — Context-aware query suggestions based on collection metadata
20+
- **Query history and bookmarks** — Persistent search history with color-coded bookmarks
21+
- **Interactive tutorials** — Guided tours with spotlight overlays for onboarding
22+
23+
## Project Structure
24+
25+
```
26+
app/
27+
├── app.py # Flask application factory and entry point
28+
├── config.py # Dataclass-based configuration (env vars, defaults)
29+
├── requirements.txt # Python dependencies
30+
├── .env.example # Environment variable template
31+
32+
├── models/ # Data models
33+
│ ├── chunk.py # Chunk, ChunkMetadata, ChunkType
34+
│ ├── search_result.py # SearchResult, SearchResultSet, ResultFormatter
35+
│ └── query_history.py # QueryRecord, Bookmark, HistoryManager
36+
37+
├── routes/ # Flask blueprints (one per feature)
38+
│ ├── search.py # Semantic and regex search endpoints
39+
│ ├── collections.py # Collection CRUD and ingestion triggers
40+
│ ├── explorer.py # Paginated chunk browsing with filters
41+
│ ├── similarity.py # Pairwise similarity matrix computation
42+
│ ├── history.py # Query history and bookmarks API
43+
│ ├── regex_tester.py # Regex testing and analysis
44+
│ ├── suggestions.py # Smart query suggestions
45+
│ ├── statistics.py # Code metrics and analytics
46+
│ ├── visualizer.py # 2D embedding visualization
47+
│ └── tutorial.py # Interactive guided tours
48+
49+
├── services/ # Business logic layer
50+
│ ├── chroma_client.py # ChromaDB connection manager (singleton)
51+
│ ├── search_service.py # Search strategies (semantic + regex)
52+
│ ├── collection_service.py # Collection management and stats
53+
│ ├── ingestion_service.py # AST parsing and code chunking pipeline
54+
│ ├── similarity_service.py # Vector similarity computations
55+
│ ├── statistics_service.py # Code metrics and analysis
56+
│ ├── visualization_service.py # PCA and random projection reducers
57+
│ ├── suggestion_service.py # Multi-strategy suggestion generator
58+
│ └── tutorial_service.py # Tutorial builder and manager
59+
60+
├── utils/ # Utilities and helpers
61+
│ ├── validators.py # Input validation (queries, paths, regex)
62+
│ ├── regex_engine.py # Regex analysis and human-readable explanation
63+
│ ├── code_parser.py # Lightweight regex-based Python parser
64+
│ ├── text_splitter.py # Token-based text splitting
65+
│ └── formatters.py # Display formatting (scores, code, paths)
66+
67+
├── templates/ # Jinja2 HTML templates
68+
│ ├── base.html # Base layout with navbar and tutorial engine
69+
│ ├── index.html # Dashboard (collection cards)
70+
│ ├── search.html # Search interface
71+
│ ├── explorer.html # Chunk browser
72+
│ └── collection.html # Collection detail page
73+
74+
└── static/
75+
└── css/style.css # Custom styles
76+
```
77+
78+
## Design Patterns
79+
80+
The codebase intentionally demonstrates several software design patterns, making it a richer target for code search exercises:
81+
82+
- **Strategy** — `SearchStrategy`, `SimilarityComputer`, `DimensionReducer`, `SuggestionStrategy`
83+
- **Singleton** — `ChromaClientManager` for a single DB connection
84+
- **Factory** — `get_reducer()`, `get_similarity_computer()`, `get_tutorial_builder()`
85+
- **Builder** — Tutorial builders (`DashboardTutorialBuilder`, `CollectionTutorialBuilder`)
86+
- **Facade** — `SearchService`, `SuggestionService`, `StatisticsService` wrapping multiple strategies
87+
88+
## Setup
89+
90+
1. Install dependencies:
91+
```bash
92+
pip install -r requirements.txt
93+
```
94+
95+
2. Configure environment variables (copy `.env.example` to `.env`):
96+
```
97+
OPENAI_API_KEY=sk-your-key-here
98+
CHROMA_PERSIST_DIR=./chroma_data
99+
```
100+
101+
3. Run the app:
102+
```bash
103+
python app.py
104+
```
105+
106+
## Dependencies
107+
108+
| Package | Purpose |
109+
|---------|---------|
110+
| flask | Web framework |
111+
| chromadb | Vector database |
112+
| openai | Embedding API |
113+
| tiktoken | Token counting |
114+
| tree-sitter | AST parsing |
115+
| tree-sitter-python | Python grammar for tree-sitter |
116+
| python-dotenv | Environment variable management |
117+
| pathspec | `.gitignore` pattern matching |
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
"""Flask application entry point for the ChromaDB Code Search UI."""
2+
3+
import os
4+
import sys
5+
import logging
6+
from flask import Flask, jsonify
7+
from dotenv import load_dotenv
8+
9+
# Ensure the app directory is on the Python path
10+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
11+
12+
from config import get_config
13+
from routes import register_blueprints
14+
15+
16+
class ReverseProxied:
    """WSGI middleware that sets SCRIPT_NAME from a configured URL prefix.

    When Flask runs behind a reverse proxy at a URL prefix (e.g. /flask/),
    this middleware tells Flask about the prefix so that url_for() generates
    correct URLs. Set SCRIPT_NAME=/flask in the environment to activate.

    Args:
        app: The WSGI callable to wrap.
        script_name: URL prefix the application is served under. A trailing
            slash is ignored; an empty value (or a bare "/") leaves every
            request untouched.
    """

    def __init__(self, app, script_name=""):
        self.app = app
        self.script_name = script_name

    def __call__(self, environ, start_response):
        """Rewrite SCRIPT_NAME/PATH_INFO in *environ*, then call the wrapped app.

        Returns:
            Whatever the wrapped WSGI application returns.
        """
        # SCRIPT_NAME must not end with "/" so that SCRIPT_NAME + PATH_INFO
        # reproduces the request path and PATH_INFO keeps its leading slash.
        prefix = self.script_name.rstrip("/")
        if prefix:
            environ["SCRIPT_NAME"] = prefix
            path_info = environ.get("PATH_INFO", "")
            # Strip only on a path-segment boundary: "/flask" and "/flask/x"
            # are under the prefix, "/flaskfoo" is a different path.
            if path_info == prefix or path_info.startswith(prefix + "/"):
                environ["PATH_INFO"] = path_info[len(prefix):]
        return self.app(environ, start_response)
35+
36+
37+
def create_app() -> Flask:
    """Build the Flask application and attach middleware, routes and handlers.

    Returns:
        The configured Flask instance.
    """
    # load_dotenv() runs first so get_config() sees values from a .env file.
    load_dotenv()
    settings = get_config()

    flask_app = Flask(__name__)
    flask_app.secret_key = settings.secret_key

    # Wrap the WSGI callable only when a URL prefix is supplied via SCRIPT_NAME.
    url_prefix = os.environ.get("SCRIPT_NAME", "")
    if url_prefix:
        flask_app.wsgi_app = ReverseProxied(flask_app.wsgi_app, script_name=url_prefix)

    # Debug-level logging when the config says debug, INFO otherwise.
    log_level = logging.DEBUG if settings.debug else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    )

    # Attach every route blueprint to the application.
    register_blueprints(flask_app)

    # JSON error responses for unknown routes and unhandled server errors.
    @flask_app.errorhandler(404)
    def not_found(error):
        payload = jsonify({"error": "Not found"})
        return payload, 404

    @flask_app.errorhandler(500)
    def internal_error(error):
        payload = jsonify({"error": "Internal server error"})
        return payload, 500

    startup_message = (
        f"ChromaDB UI started in {settings.environment.value} mode "
        f"on {settings.host}:{settings.port}"
    )
    flask_app.logger.info(startup_message)

    return flask_app
74+
75+
76+
# Created at import time, so importing this module builds the application.
app = create_app()


if __name__ == "__main__":
    # Direct execution: serve with host/port/debug taken from the config.
    run_settings = get_config()
    app.run(
        host=run_settings.host,
        port=run_settings.port,
        debug=run_settings.debug,
    )
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
"""Application configuration with environment-aware settings."""
2+
3+
import os
4+
from dataclasses import dataclass, field
5+
from enum import Enum
6+
from typing import Optional
7+
from pathlib import Path
8+
9+
10+
class Environment(Enum):
    """Modes the application can run in.

    Each member's value is the lowercase string that ``Environment(<value>)``
    accepts when looking a member up.
    """

    DEVELOPMENT = "development"
    PRODUCTION = "production"
    TESTING = "testing"
15+
16+
17+
class EmbeddingModel(Enum):
    """Embedding model identifiers selectable for ChromaDB.

    Member values are the model-name strings; ``DEFAULT`` carries the literal
    value ``"default"``.
    """

    OPENAI_SMALL = "text-embedding-3-small"
    OPENAI_LARGE = "text-embedding-3-large"
    DEFAULT = "default"
22+
23+
24+
@dataclass
class ChromaConfig:
    """Settings for the ChromaDB connection.

    Note that constructing an instance has a filesystem side effect:
    ``persist_directory`` is created if it does not already exist.
    """

    persist_directory: str = "./chroma_data"
    default_collection: str = "code_collection"
    embedding_model: EmbeddingModel = EmbeddingModel.OPENAI_SMALL
    batch_size: int = 100
    max_results: int = 20

    def __post_init__(self):
        """Create the persistence directory, including missing parents."""
        storage_dir = Path(self.persist_directory)
        storage_dir.mkdir(parents=True, exist_ok=True)
36+
37+
38+
@dataclass
class SearchConfig:
    """Default limits and tunables for search.

    Every field is a plain value with a default; this class performs no
    validation of its own.
    """

    # Result-count defaults and ceiling.
    default_n_results: int = 10
    max_n_results: int = 50
    # Accepted query length bounds.
    min_query_length: int = 2
    max_query_length: int = 500
    score_precision: int = 4
    # Regex-search specific limits.
    regex_max_results: int = 100
    regex_timeout_seconds: float = 5.0
48+
49+
50+
@dataclass
class IngestionConfig:
    """Defaults for the code ingestion pipeline.

    Every field is a plain value with a default; this class performs no
    validation of its own.
    """

    max_tokens_per_chunk: int = 1000
    # File-selection defaults: extensions to include, names to ignore.
    supported_extensions: tuple = (".py",)
    ignore_patterns: tuple = ("__pycache__", ".git", ".env", "node_modules")
    batch_size: int = 100
    # Tokenizer defaults.
    tokenizer_model: str = "text-embedding-3-small"
    fallback_encoding: str = "cl100k_base"
59+
60+
61+
@dataclass
class ExportConfig:
    """Defaults for the export service.

    Every field is a plain value with a default; this class performs no
    validation of its own.
    """

    default_format: str = "json"
    supported_formats: tuple = ("json", "csv")
    max_export_chunks: int = 10000
67+
68+
69+
@dataclass
class DiffConfig:
    """Defaults for the collection diff service.

    Every field is a plain value with a default; this class performs no
    validation of its own.
    """

    similarity_threshold: float = 0.98
    max_diff_results: int = 50
    include_modified_by_default: bool = True
75+
76+
77+
def _environment_from_env() -> Environment:
    """Resolve the application environment from the FLASK_ENV variable.

    The raw value is stripped and lower-cased before lookup, and an unset or
    blank variable falls back to "development". This keeps a bare
    ``FLASK_ENV=`` line or a value such as "Production" from raising.

    Raises:
        ValueError: If the normalised value names no Environment member.
    """
    raw_value = (os.getenv("FLASK_ENV") or "").strip().lower()
    return Environment(raw_value or "development")


@dataclass
class AppConfig:
    """Root application configuration combining all sub-configs.

    ``environment``, ``secret_key`` and ``openai_api_key`` are read from the
    process environment (FLASK_ENV, SECRET_KEY, OPENAI_API_KEY) each time an
    instance is created. ``debug`` is not an init argument; it is derived
    from ``environment`` after construction.
    """
    environment: Environment = field(default_factory=_environment_from_env)
    secret_key: str = field(default_factory=lambda:
        os.getenv("SECRET_KEY", "dev-secret-key-change-in-production")
    )
    host: str = "0.0.0.0"
    port: int = 5000
    debug: bool = field(init=False)
    openai_api_key: Optional[str] = field(default_factory=lambda:
        os.getenv("OPENAI_API_KEY")
    )
    chroma: ChromaConfig = field(default_factory=ChromaConfig)
    search: SearchConfig = field(default_factory=SearchConfig)
    ingestion: IngestionConfig = field(default_factory=IngestionConfig)
    export: ExportConfig = field(default_factory=ExportConfig)
    diff: DiffConfig = field(default_factory=DiffConfig)

    def __post_init__(self):
        """Derive ``debug``: true only in the development environment."""
        self.debug = self.environment == Environment.DEVELOPMENT

    @classmethod
    def from_environment(cls) -> "AppConfig":
        """Factory: build config from environment variables.

        Returns:
            A new AppConfig whose Chroma persist directory comes from
            CHROMA_PERSIST_DIR (default "./chroma_data"); all other fields
            use their declared defaults.
        """
        return cls(
            chroma=ChromaConfig(
                persist_directory=os.getenv("CHROMA_PERSIST_DIR", "./chroma_data"),
            ),
        )
109+
110+
111+
def get_config() -> AppConfig:
    """Return a newly built AppConfig populated from environment variables."""
    current_config = AppConfig.from_environment()
    return current_config
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
"""Data models for the ChromaDB UI application."""
2+
3+
from models.chunk import Chunk, ChunkType, ChunkMetadata
4+
from models.search_result import SearchResult, SearchResultSet, SortOrder, ResultFormatter

0 commit comments

Comments
 (0)