diff --git a/.github/workflows/llm-cloud-run.yml b/.github/workflows/llm-cloud-run.yml
index e4ec26b8..4383eb8e 100644
--- a/.github/workflows/llm-cloud-run.yml
+++ b/.github/workflows/llm-cloud-run.yml
@@ -3,6 +3,11 @@ name: Gemini-LLM GCS Artifact Test
 on:
   workflow_dispatch:
     inputs:
+      seed_notes:
+        description: "Seed notes.json into BigQuery before analysis"
+        required: false
+        type: boolean
+        default: false
       use_prod_service_url:
         description: "Call the Cloud Run service after downloading artifacts"
         required: false
@@ -20,7 +25,7 @@ permissions:
   contents: read
 
 env:
-  GCP_PROJECT_ID: moz-testops-tools
+  GCP_PROJECT_ID: moz-mobile-tools
   SERVICE_URL: ${{ secrets.LLM_PROD_SERVICE_URL }}
   CRASH_URI: gs://testops-llm-artifacts/crashes/minidumps/examples/crash_example.txt
   ANR_URI: gs://testops-llm-artifacts/anr/examples/anr_example.txt
@@ -28,7 +33,39 @@ env:
   LOCAL_ARTIFACT_DIR: artifacts
 
 jobs:
+  seed-notes:
+    if: ${{ inputs.seed_notes == true }}
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v6
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install dependencies
+        run: pip install -r llm-cloud-run/requirements.txt
+
+      - name: Authenticate to Google Cloud (JSON key)
+        uses: google-github-actions/auth@v3
+        with:
+          credentials_json: ${{ secrets.GCP_SA_VERTEX_AI }}
+
+      - name: Seed notes into BigQuery
+        working-directory: llm-cloud-run
+        env:
+          GCP_PROJECT: ${{ env.GCP_PROJECT_ID }}
+          BQ_PROJECT: ${{ env.GCP_PROJECT_ID }}
+          BQ_DATASET: vertex_ai_tool
+        run: python seed_notes.py
+
   manual-run:
+    needs: [seed-notes]
+    # Run even when seed-notes is skipped, but not if it failed or the run was cancelled
+    if: ${{ always() && !failure() && !cancelled() }}
     runs-on: ubuntu-latest
 
     steps:
diff --git a/llm-cloud-run/seed_notes.py b/llm-cloud-run/seed_notes.py
new file mode 100644
index 00000000..355db09c
--- /dev/null
+++ b/llm-cloud-run/seed_notes.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+"""Seed BigQuery tables from notes.json.
+
+Reads each note, generates an embedding via Vertex AI, and inserts both
+the note and its embedding into the configured BigQuery tables.
+
+Usage (requires GCP credentials via ADC or service account):
+
+    # Uses defaults from Settings / environment
+    python seed_notes.py
+
+    # Point at a different JSON file
+    python seed_notes.py --notes-file path/to/notes.json
+
+    # Dry-run: log what would be inserted without touching BigQuery
+    python seed_notes.py --dry-run
+
+Environment variables honoured (see common/config.py):
+    GCP_PROJECT, GCP_LOCATION, BQ_PROJECT, BQ_DATASET,
+    BQ_NOTES_TABLE, BQ_EMBEDDINGS_TABLE, EMBEDDING_MODEL, ...
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import sys
+import time
+from pathlib import Path
+
+# ── project imports (same packages the Cloud Run service uses) ──────
+from common.config import Settings
+from common.logging_utils import setup_logging
+from llm.embeddings import embed_text
+from storage.bigquery_repo import BigQueryNotesRepository
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_NOTES_FILE = Path(__file__).parent / "notes.json"
+
+
+def load_notes(path: Path) -> list[dict]:
+    """Return the list of note dicts stored in the JSON file at *path*.
+
+    Raises:
+        ValueError: if the top-level JSON value is not an array.
+    """
+    # Decode as UTF-8 explicitly instead of relying on the locale default.
+    with open(path, encoding="utf-8") as fh:
+        data = json.load(fh)
+    if not isinstance(data, list):
+        raise ValueError(f"Expected a JSON array in {path}, got {type(data).__name__}")
+    return data
+
+
+def _validate_notes(notes: list[dict]) -> None:
+    """Raise ValueError if an entry lacks the fields that seed() reads.
+
+    Checking every entry before the first write keeps a malformed note
+    later in the file from leaving the tables partially seeded.
+    """
+    for idx, note in enumerate(notes, start=1):
+        if not isinstance(note, dict):
+            raise ValueError(
+                f"Note #{idx} must be a JSON object, got {type(note).__name__}"
+            )
+        missing = [key for key in ("id", "content") if key not in note]
+        if missing:
+            raise ValueError(
+                f"Note #{idx} is missing required key(s): {', '.join(missing)}"
+            )
+
+
+def seed(
+    settings: Settings,
+    repo: BigQueryNotesRepository,
+    notes: list[dict],
+    *,
+    dry_run: bool = False,
+) -> None:
+    """Embed each note and insert the note and its embedding via *repo*.
+
+    Args:
+        settings: Passed to embed_text(); its embedding_model is logged.
+        repo: Repository whose insert_note() and insert_embedding() are
+            called once per note.
+        notes: Note dicts with "id" and "content" keys and an optional
+            "source" key.
+        dry_run: When true, only log what would be inserted; no embedding
+            is requested and the repository is not called.
+
+    Raises:
+        ValueError: if a note is not an object or lacks "id"/"content".
+    """
+    _validate_notes(notes)
+
+    total = len(notes)
+    logger.info("Seeding %d note(s) (dry_run=%s)", total, dry_run)
+
+    for idx, note in enumerate(notes, start=1):
+        note_id = note["id"]
+        content = note["content"]
+        source = note.get("source")
+
+        logger.info("[%d/%d] Processing note %s …", idx, total, note_id)
+
+        if dry_run:
+            logger.info(" [DRY-RUN] Would insert note and embedding for %s", note_id)
+            continue
+
+        # ── generate embedding ──────────────────────────────────────
+        logger.info(" Generating embedding (%s) …", settings.embedding_model)
+        embedding = embed_text(settings, content)
+        logger.info(" Embedding dimension: %d", len(embedding))
+
+        # ── insert into BigQuery ────────────────────────────────────
+        repo.insert_note(note_id=note_id, content=content, source=source)
+        logger.info(" Inserted note %s", note_id)
+
+        repo.insert_embedding(note_id=note_id, embedding=embedding)
+        logger.info(" Inserted embedding for %s", note_id)
+
+        # Be polite to the Vertex AI quota (embedding API).
+        if idx < total:
+            time.sleep(0.25)
+
+    logger.info("Done – %d note(s) processed.", total)
+
+
+def main() -> None:
+    """Parse the CLI arguments, load the notes file and seed BigQuery."""
+    setup_logging()
+
+    parser = argparse.ArgumentParser(description="Seed BigQuery notes from JSON")
+    parser.add_argument(
+        "--notes-file",
+        type=Path,
+        default=DEFAULT_NOTES_FILE,
+        help=f"Path to the notes JSON file (default: {DEFAULT_NOTES_FILE})",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Log actions without writing to BigQuery",
+    )
+    args = parser.parse_args()
+
+    # is_file() also rejects a directory, which open() cannot read.
+    if not args.notes_file.is_file():
+        logger.error("Notes file not found: %s", args.notes_file)
+        sys.exit(1)
+
+    notes = load_notes(args.notes_file)
+    if not notes:
+        logger.warning("Notes file is empty, nothing to do.")
+        return
+
+    settings = Settings()
+    logger.info(
+        "BigQuery target: %s.%s (notes=%s, embeddings=%s)",
+        settings.effective_bq_project,
+        settings.bq_dataset,
+        settings.bq_notes_table,
+        settings.bq_embeddings_table,
+    )
+
+    repo = BigQueryNotesRepository(settings)
+    seed(settings, repo, notes, dry_run=args.dry_run)
+
+
+if __name__ == "__main__":
+    main()