feat(training): add live progress feedback to fine-tuning pipeline

Francisco · Francisco · commit bb19cea216d6 · 2026-04-16T14:43:57.000-03:00
- Add ProgressEmitter callback to unsloth_train.py — emits structured
  PROGRESS:{...} lines to stdout on every logging step containing
  step, total_steps, epoch, loss, and learning_rate
- Update worker.py stdout loop to parse PROGRESS: lines and write
  metrics to job.metrics with db.commit() on each step — users polling
  client.training.retrieve(job_id) now get live feedback instead of
  no visibility between dispatch and completion
- Fix HF_HUB_OFFLINE comment — clarify that the current value "0" permits
  downloads; the sovereignty guard comment was contradicting the code
- Update CI workflow — add Rust toolchain install, cargo cache, and
  maturin develop --release build step to test job so fc_parser
  extension is available during unit tests on both Python 3.11 and 3.12
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -123,6 +123,25 @@ jobs:
           pip install --require-hashes -r sandbox_reqs_hashed.txt
           pip install pytest pytest-cov
 
+      - name: "🦀 Install Rust toolchain"
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: "📦 Cache Rust build artifacts"
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            rust/fc_parser/target
+          key: ${{ runner.os }}-rust-fc-parser-${{ matrix.python-version }}-${{ hashFiles('rust/fc_parser/Cargo.toml', 'rust/fc_parser/src/**') }}
+          restore-keys: |
+            ${{ runner.os }}-rust-fc-parser-${{ matrix.python-version }}-
+
+      - name: "🦀 Build fc_parser Rust extension"
+        run: |
+          pip install maturin
+          cd rust/fc_parser && maturin develop --release
+
       - name: "✅ Run Pytest with Coverage"
         run: pytest tests/ --cov=src --cov-report=term-missing
 
@@ -193,12 +212,6 @@ jobs:
 
   # ──────────────────────────────────────────────────────────────────────────
   #  🧪  Build & Publish Staging Images (DEV BRANCH ONLY)
-  #
-  #  Pushes images tagged :dev-<sha> to Docker Hub.
-  #  These are never tagged :latest and never affect production deployments.
-  #  After a successful build, automatically updates docker-compose.yml in
-  #  projectdavid-platform-staging so the staging environment always tracks
-  #  the latest dev images without manual intervention.
   # ──────────────────────────────────────────────────────────────────────────
   build_staging:
     name: "🧪 Build & Publish Staging Images"
@@ -524,11 +537,6 @@ jobs:
           echo "| inference-worker | \`${{ steps.get_version.outputs.VERSION }}\` |" >> $GITHUB_STEP_SUMMARY
           echo "| router | \`${{ steps.get_version.outputs.VERSION }}\` |" >> $GITHUB_STEP_SUMMARY
 
-      # ── Update projectdavid-platform with new production image tags ─────────
-      # Updates ALL docker-compose yml files in both the root and
-      # projectdavid_platform/ directory — two instances of each file.
-      # Requires PLATFORM_REPO_PAT secret with write access to
-      # project-david-ai/projectdavid-platform.
       - name: "📦 Update projectdavid-platform with production image tags"
         env:
           PLATFORM_REPO_PAT: ${{ secrets.PLATFORM_REPO_PAT }}
@@ -540,7 +548,6 @@ jobs:
           git config user.email "ci@projectdavid.ai"
           git config user.name "Project David CI"
 
-          # All yml files carrying image tags — root level and package level.
           YML_FILES=(
             "docker-compose.yml"
             "docker-compose.gpu.yml"
@@ -559,14 +566,11 @@ jobs:
               echo "Skipping missing file: $FILE"
               continue
             fi
-            # Replace pinned semver tags:  thanosprime/projectdavid-core-*:X.Y.Z
             sed -i "s|thanosprime/projectdavid-core-\([^:]*\):[0-9]*\.[0-9]*\.[0-9]*|thanosprime/projectdavid-core-\1:${VERSION}|g" "$FILE"
-            # Replace :latest tags if present
             sed -i "s|thanosprime/projectdavid-core-\([^:]*\):latest|thanosprime/projectdavid-core-\1:${VERSION}|g" "$FILE"
             echo "Updated: $FILE"
           done
 
-          # Update PINNED_IMAGES.md if it exists
           if [ -f "PINNED_IMAGES.md" ]; then
             sed -i "s|:[0-9]*\.[0-9]*\.[0-9]*|:${VERSION}|g" PINNED_IMAGES.md
             sed -i "s|:latest|:${VERSION}|g" PINNED_IMAGES.md
diff --git a/src/api/training/unsloth_train.py b/src/api/training/unsloth_train.py
@@ -1,10 +1,10 @@
+import json
 import os
 
 # ─── SOVEREIGNTY GUARD ────────────────────────────────────────────────────────
-# Prevent any HuggingFace hub download attempts at runtime.
-# Only models already present in the local HF cache are permitted.
-# If the requested model is not cached, this will raise a clear error
-# rather than attempting a download — enforcing airgap compliance.
+# HF_HUB_OFFLINE = "1" enforces cache-only mode — no downloads permitted.
+# It is currently "0", which explicitly allows HuggingFace hub downloads.
+# For production sovereign deployments this should be "1".
 os.environ["HF_HUB_OFFLINE"] = "0"
 # ──────────────────────────────────────────────────────────────────────────────
 
@@ -15,6 +15,7 @@
 
 import unsloth  # noqa: F401 — must precede trl/transformers/peft
 from datasets import load_dataset
+from transformers import TrainerCallback
 from trl import SFTConfig, SFTTrainer
 from unsloth import FastLanguageModel, is_bfloat16_supported
 
@@ -37,6 +38,33 @@
 }
 
 
+# ─── PROGRESS EMITTER ─────────────────────────────────────────────────────────
+class ProgressEmitter(TrainerCallback):
+    """
+    Emits one structured PROGRESS: line to stdout per Trainer log event; the
+    worker parses them into job.metrics ("loss" is null if none was logged).
+    Output format (one line per log event):
+        PROGRESS:{"step": 5, "total_steps": 20, "epoch": 0.25, "loss": 1.423, "learning_rate": 0.0002}
+    """
+
+    def on_log(self, args, state, control, logs=None, **kwargs):
+        if not logs:
+            return
+        # End-of-training summary logs "train_loss", not "loss"; don't fake a 0.
+        loss = logs.get("loss", logs.get("train_loss"))
+        progress = {
+            "step": state.global_step,
+            "total_steps": state.max_steps,
+            "epoch": round(state.epoch or 0, 3),
+            "loss": None if loss is None else round(loss, 4),
+            "learning_rate": logs.get("learning_rate"),
+        }
+        print(f"PROGRESS:{json.dumps(progress)}", flush=True)
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", required=True)
@@ -138,6 +166,7 @@ def format_prompts(examples):
         model=model,
         train_dataset=dataset,
         processing_class=tokenizer,
+        callbacks=[ProgressEmitter()],
         args=SFTConfig(
             dataset_text_field="text",
             per_device_train_batch_size=p["per_device_train_batch_size"],
diff --git a/src/api/training/worker.py b/src/api/training/worker.py
@@ -6,6 +6,8 @@
 Responsibilities:
   - Listen to the Redis training job queue
   - Dispatch training jobs as direct subprocesses (GPU claimed for duration)
+  - Parse PROGRESS: lines from unsloth_train.py and write live metrics
+    to job.metrics so users get feedback during training
 
 Ray is NOT used here. inference_worker owns the Ray cluster (HEAD node)
 and manages all inference GPU reservations via Ray Serve.
@@ -54,6 +56,12 @@ def process_job(job_id: str, user_id: str):
     """
     Core training job logic — runs as a direct subprocess.
 
+    Progress feedback:
+        unsloth_train.py emits PROGRESS:{...} lines on every logging step.
+        This loop parses those lines and writes them to job.metrics so users
+        polling client.training.retrieve(job_id) get live loss and step count
+        rather than a black hole between dispatch and completion.
+
     All imports are local to keep the module lightweight and avoid
     import-time side effects from SQLAlchemy / ORM modules.
     """
@@ -130,8 +138,26 @@ def _get_samba_client():
             cmd, stdout=_subprocess.PIPE, stderr=_subprocess.STDOUT, text=True
         )
 
+        # ─── STDOUT LOOP WITH PROGRESS PARSING ───────────────────────────────
+        # unsloth_train.py emits PROGRESS:{...} lines on every logging step.
+        # We parse these and write them to job.metrics so polling clients
+        # get live feedback. Non-PROGRESS lines are logged normally.
         for line in process.stdout:
-            logging_utility.info(f"[{job_id}] {line.strip()}")
+            line = line.strip()
+            logging_utility.info(f"[{job_id}] {line}")
+
+            if line.startswith("PROGRESS:"):
+                try:
+                    metrics = json.loads(line[9:])
+                    job.metrics = metrics
+                    job.updated_at = int(_time.time())
+                    db.commit()
+                except Exception as parse_err:
+                    logging_utility.warning(
+                        f"[{job_id}] Failed to parse PROGRESS line: {parse_err}"
+                    )
+        # ─────────────────────────────────────────────────────────────────────
+
         process.wait()
 
         if process.returncode == 0:
@@ -142,7 +168,6 @@ def _get_samba_client():
                 name=f"FT: {job.base_model}",
                 base_model=job.base_model,
                 storage_path=model_rel_path,
-                # node_id removed — FK references compute_nodes (legacy, Phase 5 drop)
                 status=_StatusEnum.active,
                 created_at=int(_time.time()),
                 updated_at=int(_time.time()),