Add backfill job for missing ML classification and translations (#155)

ksy36 · web-flow · commit 852f6ffa3c8b · 2026-03-25T10:04:53.000-04:00
* Add backfill job for missing ML classification and translations

* Add joblock for backfill job

* Code review changes
diff --git a/server/reportmanager/cron.py b/server/reportmanager/cron.py
@@ -102,3 +102,8 @@ def import_reports():
         )
 
     call_command("import_reports_from_bigquery", since=since)
+
+
+@app.task(ignore_result=True)
+def backfill_missing_report_data() -> None:
+    """Run the ``backfill_missing_report_data`` management command as a task."""
+    call_command("backfill_missing_report_data")
diff --git a/server/reportmanager/management/commands/backfill_missing_report_data.py b/server/reportmanager/management/commands/backfill_missing_report_data.py
@@ -0,0 +1,207 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""Backfill missing ML classifications and translations from BigQuery.
+
+This command queries BigQuery for ML classification results and translations
+that are missing from the database and updates ReportEntry records accordingly.
+
+Background
+----------
+The broken_site_report_ml ETL job in docker-etl performs two operations:
+1. Gets ML classification from bugbug for each report
+2. Translates reports using ML.TRANSLATE
+
+However, some reports in the local database may be missing this data after
+we import them with import_reports_from_bigquery due to failures in the ETL pipeline,
+e.g. bugbug not returning classification results or the job being stopped
+for whatever reason.
+
+By the time the ETL job receives the results, the reports may already have
+been imported into the dashboard DB. This backfill job periodically queries
+BigQuery for missing data and updates the local database.
+
+Impact on Clustering
+--------------------
+Reports with ml_valid_probability=NULL are excluded from clustering entirely
+and assigned to domain-based buckets.
+
+When reports receive new ML classifications or translations, they need to be re-triaged.
+Updated reports that are not assigned to a cluster have their bucket_id cleared
+so that triage_new_reports can reassign them to the proper cluster-based or
+domain-based bucket.
+
+Note: this backfill job only selects reports with a missing ML classification;
+reports that are missing only a translation are not selected. ML.TRANSLATE can
+fail to translate text, but that is an edge case that mainly happens when the
+text is too long (e.g. the entire contents of an HTML page) or contains
+unprocessable characters. Once a missing classification is received, the job
+also fills in the translation if it was missing, but a missing ML
+classification is the deciding factor for updates.
+
+"""
+
+from dataclasses import dataclass
+from itertools import batched
+from logging import getLogger
+
+from django.conf import settings
+from django.core.management import BaseCommand
+from google.cloud import bigquery
+from google.oauth2 import service_account
+
+from reportmanager.locking import JobLockError, acquire_job_lock
+from reportmanager.models import JobLock, ReportEntry
+from reportmanager.utils import preprocess_text, transform_ml_label
+
+LOG = getLogger("reportmanager.backfill")
+
+
+@dataclass
+class BackfillData:
+    """Values fetched from BigQuery for a single report.
+
+    One instance is built per BigQuery result row and later copied onto the
+    matching ReportEntry.
+    """
+
+    # Result of transform_ml_label() for the row's label and probability;
+    # None when no usable classification could be derived.
+    ml_valid_probability: float | None
+    # Value of the row's language_code column; stored on the report as
+    # comments_original_language when a translation is applied.
+    language_code: str | None
+    # Value of the row's translated_text column; None means there is no
+    # translation to apply.
+    translated_text: str | None
+
+
+class Command(BaseCommand):
+    """Backfill ML classifications and translations missing from ReportEntry.
+
+    Selects local reports that have a non-empty comment but no
+    ``ml_valid_probability``, looks them up in BigQuery in batches of
+    ``BQ_BATCH_SIZE`` UUIDs, and writes back whatever classification or
+    translation data is found. The run is wrapped in the ``BACKFILL`` job lock.
+    """
+
+    help = "Backfill missing ML classification and translations from BigQuery"
+
+    # Number of report UUIDs sent to BigQuery per query.
+    BQ_BATCH_SIZE = 5000
+    # ``batch_size`` passed to ``bulk_update`` when writing changes back.
+    DB_BATCH_SIZE = 1000
+
+    def handle(self, *args, **options) -> None:
+        """Run the backfill while holding the ``BACKFILL`` job lock.
+
+        A ``JobLockError`` is logged as a warning instead of propagating.
+        """
+        try:
+            with acquire_job_lock(JobLock.LockTypes.BACKFILL):
+                self.run_backfill()
+        except JobLockError as e:
+            # Lazy %-args: the message is only formatted if it is emitted.
+            LOG.warning("Cannot start backfill: %s", e)
+
+    def run_backfill(self) -> None:
+        """Fetch missing ML data from BigQuery and store it on the reports."""
+        # Only reports with a non-empty comment and no classification yet.
+        pending = ReportEntry.objects.filter(
+            ml_valid_probability__isnull=True, comments__isnull=False
+        ).exclude(comments="")
+
+        # Evaluate the queryset once so the logged total and the processed
+        # rows always agree (a separate count() could race with imports).
+        all_reports = list(pending)
+
+        if not all_reports:
+            LOG.info("No reports need ML backfill")
+            return
+
+        LOG.info("Found %d reports needing ML backfill", len(all_reports))
+
+        client = self._make_client()
+
+        # The SQL text does not depend on the batch; only the @uuids query
+        # parameter changes, so build it once outside the loop.
+        query: str = f"""
+                SELECT r.uuid,
+                       c.label as ml_label, c.probability as ml_probability,
+                       t.language_code, t.translated_text
+                FROM `{settings.BIGQUERY_TABLE}` as r
+                INNER JOIN `{settings.BIGQUERY_CLASSIFICATION_TABLE}` c
+                    ON r.uuid = c.report_uuid
+                LEFT JOIN `{settings.BIGQUERY_TRANSLATIONS_TABLE}` t
+                    ON r.uuid = t.report_uuid
+                WHERE r.uuid IN UNNEST(@uuids)
+            """
+
+        total_updated = 0
+        report_batches = batched(all_reports, self.BQ_BATCH_SIZE)
+
+        for batch_num, report_batch in enumerate(report_batches, 1):
+            LOG.info(
+                "Processing batch %d (total %d reports)...",
+                batch_num,
+                len(report_batch),
+            )
+
+            bq_data = self._fetch_bq_data(
+                client, query, [str(report.uuid) for report in report_batch]
+            )
+
+            LOG.info("Fetched data for %d reports from BigQuery", len(bq_data))
+
+            if not bq_data:
+                continue
+
+            changed_reports: list[ReportEntry] = []
+            for report in report_batch:
+                data = bq_data.get(str(report.uuid))
+                if data is not None and self._apply_backfill(report, data):
+                    changed_reports.append(report)
+
+            if not changed_reports:
+                continue
+
+            ReportEntry.objects.bulk_update(
+                changed_reports,
+                [
+                    "ml_valid_probability",
+                    "comments_translated",
+                    "comments_original_language",
+                    "comments_preprocessed",
+                    "bucket_id",
+                ],
+                batch_size=self.DB_BATCH_SIZE,
+            )
+            total_updated += len(changed_reports)
+            LOG.info(
+                "Updated %d reports in batch (cleared buckets for re-triaging)",
+                len(changed_reports),
+            )
+
+        LOG.info("Backfill complete: %d reports updated", total_updated)
+
+    @staticmethod
+    def _make_client() -> bigquery.Client:
+        """Build a BigQuery client from the Django settings."""
+        params = {
+            "project": settings.BIGQUERY_PROJECT,
+        }
+
+        # Explicit credentials are only passed when a service account is
+        # configured; otherwise the client is created without them.
+        if svc_acct := getattr(settings, "BIGQUERY_SERVICE_ACCOUNT", None):
+            params["credentials"] = (
+                service_account.Credentials.from_service_account_info(svc_acct)
+            )
+
+        return bigquery.Client(**params)
+
+    @staticmethod
+    def _fetch_bq_data(
+        client: bigquery.Client, query: str, uuids: list[str]
+    ) -> dict[str, BackfillData]:
+        """Run ``query`` for ``uuids`` and return the rows keyed by report UUID."""
+        job_config = bigquery.QueryJobConfig(
+            query_parameters=[
+                bigquery.ArrayQueryParameter("uuids", "STRING", uuids)
+            ]
+        )
+
+        result = client.query(query, job_config=job_config)
+
+        # Keys are normalized with str() because callers look rows up by
+        # str(report.uuid). If several rows share a UUID, the last one wins.
+        return {
+            str(row.uuid): BackfillData(
+                ml_valid_probability=transform_ml_label(
+                    row.ml_label, row.ml_probability
+                ),
+                language_code=row.language_code,
+                translated_text=row.translated_text,
+            )
+            for row in result
+        }
+
+    @staticmethod
+    def _apply_backfill(report: ReportEntry, data: BackfillData) -> bool:
+        """Copy missing values from ``data`` onto ``report``.
+
+        Existing values on the report are never overwritten. Returns True if
+        any field was changed (the caller is responsible for saving).
+        """
+        updated = False
+
+        if report.ml_valid_probability is None and data.ml_valid_probability is not None:
+            report.ml_valid_probability = data.ml_valid_probability
+            updated = True
+
+        if report.comments_translated is None and data.translated_text is not None:
+            report.comments_translated = data.translated_text
+            report.comments_original_language = data.language_code
+            report.comments_preprocessed = preprocess_text(data.translated_text)
+            updated = True
+
+        # Clear the bucket so the report gets re-triaged, but only when the
+        # report has no cluster assigned.
+        if updated and report.cluster_id is None:
+            report.bucket_id = None
+
+        return updated
diff --git a/server/reportmanager/management/commands/import_reports_from_bigquery.py b/server/reportmanager/management/commands/import_reports_from_bigquery.py
@@ -14,6 +14,7 @@
 from google.oauth2 import service_account
 
 from reportmanager.models import ReportEntry
+from reportmanager.utils import transform_ml_label
 from webcompat.models import Report
 
 LOG = getLogger("reportmanager.import")
@@ -57,19 +58,7 @@ def handle(self, *args, **options):
         )
 
         for row in result:
-            # The BugBot ML prediction can assign two labels, invalid or valid,
-            # with a probability between 0 and 1. Having two labels makes
-            # filtering and sorting harder, so let's transform "invalid 95%"
-            # into "valid 5%".
-            # There is a rare chance that a bug will have no score. In this case,
-            # we just assign None, which will get treated as invalid in the
-            # frontend.
-            ml_valid_probability = None
-            match row.ml_label:
-                case "invalid":
-                    ml_valid_probability = 1 - row.ml_probability
-                case "valid":
-                    ml_valid_probability = row.ml_probability
+            ml_valid_probability = transform_ml_label(row.ml_label, row.ml_probability)
 
             report_obj = Report(
                 app_name=row.app_name,
diff --git a/server/reportmanager/migrations/0016_joblock.py b/server/reportmanager/migrations/0016_joblock.py
@@ -31,7 +31,7 @@ class Migration(migrations.Migration):
             fields=[
                 ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                 ('singleton_key', models.PositiveSmallIntegerField(default=1, editable=False, help_text='Singleton key constrained to value 1 by check constraint', unique=True)),
-                ('lock_name', models.CharField(blank=True, choices=[('clustering', 'Clustering'), ('cleanup', 'Cleanup')], help_text='Name of operation holding the lock', max_length=50)),
+                ('lock_name', models.CharField(blank=True, choices=[('clustering', 'Clustering'), ('cleanup', 'Cleanup'), ('backfill', 'Backfill')], help_text='Name of operation holding the lock', max_length=50)),
                 ('acquired_at', models.DateTimeField(blank=True, null=True)),
                 ('acquired_by', models.CharField(blank=True, help_text='hostname:pid of process holding lock', max_length=255)),
             ],
diff --git a/server/reportmanager/models.py b/server/reportmanager/models.py
@@ -536,6 +536,7 @@ class JobLock(models.Model):
     class LockTypes(models.TextChoices):
         CLUSTERING = "clustering", "Clustering"
         CLEANUP = "cleanup", "Cleanup"
+        BACKFILL = "backfill", "Backfill"
 
     # Locks older than 3 hours are considered stale
     STALE_LOCK_HOURS = 3
diff --git a/server/reportmanager/utils.py b/server/reportmanager/utils.py
@@ -5,11 +5,42 @@
 import re
 
 
-def preprocess_text(text):
+def preprocess_text(text: str | None) -> str:
+    """Normalize free-form text into a single trimmed line.
+
+    Returns an empty string for ``None`` or empty input. Otherwise HTML
+    entities are unescaped, leading/trailing whitespace is stripped, and
+    every run of whitespace is collapsed into a single space.
+    """
     if not text or text == "":
         return ""
 
     text = html.unescape(text)
     text = str(text).strip()
     text = re.sub(r"\s+", " ", text)
     return text
+
+
+def transform_ml_label(
+    ml_label: str | None, ml_probability: float | None
+) -> float | None:
+    """Express a BugBot prediction as the probability that a report is valid.
+
+    BugBot reports one of two labels, "valid" or "invalid", together with a
+    probability between 0 and 1. Two labels are awkward to filter and sort
+    on, so everything is folded onto a single "valid" scale: for example
+    "invalid 95%" becomes "valid 5%".
+
+    A prediction may come without a label or score. That case yields None,
+    which the frontend treats as invalid.
+
+    Args:
+        ml_label: The ML label ("invalid" or "valid"), or None if missing
+        ml_probability: The probability value (0-1), or None if missing
+
+    Returns:
+        The probability that the report is valid, or None if the label is
+        not recognized or the probability is missing
+    """
+    if ml_label == "valid":
+        return ml_probability
+
+    if ml_label == "invalid" and ml_probability is not None:
+        # Flip the score so it is expressed on the "valid" scale.
+        return 1 - ml_probability
+
+    # Unrecognized or missing label, or an "invalid" label without a score.
+    return None
diff --git a/server/server/settings.py b/server/server/settings.py
@@ -315,6 +315,10 @@ def resolver_context_processor(request):
         "task": "reportmanager.cron.unhide_buckets",
         "schedule": 60,
     },
+    "Backfill missing report data evry 12 hours": {
+        "task": "reportmanager.cron.backfill_missing_report_data",
+        "schedule": 60 * 60 * 12,
+    },
 }
 
 # Email
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -1,6 +1,8 @@
 """Tests for reportmanager.utils."""
 
-from reportmanager.utils import preprocess_text
+import pytest
+
+from reportmanager.utils import preprocess_text, transform_ml_label
 
 
 class TestPreprocessText:
@@ -83,3 +85,31 @@ def test_combined_transformations(self):
         # Multiple issues in one string
         input_text = "\t  The &amp; symbol   is\nescaped  "
         assert preprocess_text(input_text) == "The & symbol is escaped"
+
+
+class TestTransformMLLabel:
+    """Tests for transform_ml_label function.
+
+    Covers both labels across the probability range, plus the cases that
+    must yield None: a missing or unrecognized label and a missing score.
+    """
+
+    def test_valid_label_with_high_probability(self):
+        """Test transformation of 'valid' label with high probability."""
+        assert transform_ml_label("valid", 0.95) == 0.95
+
+    def test_valid_label_with_low_probability(self):
+        """Test transformation of 'valid' label with low probability."""
+        assert transform_ml_label("valid", 0.12) == 0.12
+
+    def test_valid_label_with_mid_probability(self):
+        """Test transformation of 'valid' label with mid probability."""
+        assert transform_ml_label("valid", 0.53) == 0.53
+
+    def test_invalid_label_with_high_probability(self):
+        """Test transformation of 'invalid' label with high probability."""
+        # approx: 1 - 0.95 is not exactly 0.05 in binary floating point.
+        assert transform_ml_label("invalid", 0.95) == pytest.approx(0.05)
+
+    def test_invalid_label_mid_probability(self):
+        """Test transformation of 'invalid' label with mid probability."""
+        assert transform_ml_label("invalid", 0.6) == pytest.approx(0.4)
+
+    def test_invalid_label_with_none_probability_returns_none(self):
+        """Test 'invalid' label without a probability returns None."""
+        assert transform_ml_label("invalid", None) is None
+
+    def test_valid_label_with_none_probability_returns_none(self):
+        """Test 'valid' label without a probability returns None."""
+        assert transform_ml_label("valid", None) is None
+
+    def test_none_label_returns_none(self):
+        """Test transformation of None label returns None."""
+        assert transform_ml_label(None, 0.5) is None
+
+    def test_empty_string_label_returns_none(self):
+        """Test transformation of empty string label returns None."""
+        assert transform_ml_label("", 0.5) is None
+
+    def test_unknown_label_returns_none(self):
+        """Test transformation of an unrecognized label returns None."""
+        assert transform_ml_label("maybe", 0.5) is None

Original file line number	Diff line number	Diff line change
`@@ -102,3 +102,8 @@ def import_reports():`
`102`	`102`	`)`
`103`	`103`
`104`	`104`	`call_command("import_reports_from_bigquery", since=since)`
	`105`	`+`
	`106`	`+`
	`107`	`+@app.task(ignore_result=True)`
	`108`	`+def backfill_missing_report_data():`
	`109`	`+ call_command("backfill_missing_report_data")`