Skip to content

Commit 852f6ff

Browse files
authored
Add backfill job for missing ML classification and translations (#155)
* Add backfill job for missing ML classification and translations * Add joblock for backfill job * Code review changes
1 parent 0282a2a commit 852f6ff

8 files changed

Lines changed: 283 additions & 16 deletions

File tree

server/reportmanager/cron.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,3 +102,8 @@ def import_reports():
102102
)
103103

104104
call_command("import_reports_from_bigquery", since=since)
105+
106+
107+
@app.task(ignore_result=True)
def backfill_missing_report_data():
    """Run the ``backfill_missing_report_data`` management command as a task.

    The task result is not stored (``ignore_result=True``).
    """
    command_name = "backfill_missing_report_data"
    call_command(command_name)
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
# This Source Code Form is subject to the terms of the Mozilla Public
2+
# License, v. 2.0. If a copy of the MPL was not distributed with this
3+
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
4+
5+
"""Backfill missing ML classifications and translations from BigQuery.
6+
7+
This command queries BigQuery for ML classification results and translations
8+
that are missing from the database and updates ReportEntry records accordingly.
9+
10+
Background
11+
----------
12+
The broken_site_report_ml ETL job in docker-etl performs two operations:
13+
1. Gets ML classification from bugbug for each report
14+
2. Translates reports using ML.TRANSLATE
15+
16+
However, some reports in the local database may be missing this data after
17+
we import them with import_reports_from_bigquery due to failures in the ETL pipeline,
18+
e.g. bugbug not returning classification results or the job being stopped
19+
for whatever reason.
20+
21+
By the time the ETL job receives the results, reports might already
22+
be imported into the dashboard DB. This backfill job periodically queries
23+
BigQuery for missing data and updates the local database.
24+
25+
Impact on Clustering
26+
--------------------
27+
Reports with ml_valid_probability=NULL are excluded from clustering entirely
28+
and assigned to domain-based buckets.
29+
30+
When reports receive new ML classifications or translations, they need to be re-triaged.
31+
Updated reports that are not assigned to a cluster have their bucket_id cleared so
32+
triage_new_reports can reassign them to the proper cluster-based or domain-based bucket.
33+
34+
Note: this backfill job only selects reports with a missing ML classification
35+
(not reports with a missing translation) to fetch updates for. It's possible that ML.TRANSLATE is unable
36+
to translate text, but it's rather an edge case and mainly happens because text
37+
is too long (i.e. entire html page contents) or contains unprocessable characters.
38+
Once missing classification is received the job also checks if translation
39+
was missing and updates it, however missing ML classification is the
40+
deciding factor for updates.
41+
42+
"""
43+
44+
from dataclasses import dataclass
45+
from itertools import batched
46+
from logging import getLogger
47+
48+
from django.conf import settings
49+
from django.core.management import BaseCommand
50+
from google.cloud import bigquery
51+
from google.oauth2 import service_account
52+
53+
from reportmanager.locking import JobLockError, acquire_job_lock
54+
from reportmanager.models import JobLock, ReportEntry
55+
from reportmanager.utils import preprocess_text, transform_ml_label
56+
57+
LOG = getLogger("reportmanager.backfill")
58+
59+
60+
@dataclass
61+
class BackfillData:
62+
ml_valid_probability: float | None
63+
language_code: str | None
64+
translated_text: str | None
65+
66+
67+
class Command(BaseCommand):
    """Backfill ML classification and translation data onto existing reports.

    Selects ``ReportEntry`` rows that have no ``ml_valid_probability`` but do
    have a non-empty comment, looks their UUIDs up in BigQuery in batches and
    writes back whatever classification/translation data is found. The whole
    run is guarded by the ``BACKFILL`` job lock.
    """

    help = "Backfill missing ML classification and translations from BigQuery"

    # Maximum number of report UUIDs sent to BigQuery in a single query.
    BQ_BATCH_SIZE = 5000
    # ``batch_size`` passed to ``bulk_update`` when writing reports back.
    DB_BATCH_SIZE = 1000

    # Model fields persisted by ``bulk_update`` for every updated report.
    _UPDATE_FIELDS = [
        "ml_valid_probability",
        "comments_translated",
        "comments_original_language",
        "comments_preprocessed",
        "bucket_id",
    ]

    def handle(self, *args, **options) -> None:
        """Run the backfill under the job lock; log and exit if the lock fails."""
        try:
            with acquire_job_lock(JobLock.LockTypes.BACKFILL):
                self.run_backfill()
        except JobLockError as e:
            LOG.warning("Cannot start backfill: %s", e)

    def run_backfill(self) -> None:
        """Fetch missing ML data from BigQuery and store it on local reports."""
        # Only reports with a non-empty comment are candidates. Materialize the
        # queryset once so the logged total matches what is actually processed.
        pending: list[ReportEntry] = list(
            ReportEntry.objects.filter(
                ml_valid_probability__isnull=True, comments__isnull=False
            ).exclude(comments="")
        )

        if not pending:
            LOG.info("No reports need ML backfill")
            return

        LOG.info("Found %d reports needing ML backfill", len(pending))

        client = self._make_client()
        total_updated = 0

        for batch_num, report_batch in enumerate(
            batched(pending, self.BQ_BATCH_SIZE), 1
        ):
            LOG.info(
                "Processing batch %d (total %d reports)...",
                batch_num,
                len(report_batch),
            )

            bq_data = self._fetch_backfill_data(
                client, [str(report.uuid) for report in report_batch]
            )
            LOG.info("Fetched data for %d reports from BigQuery", len(bq_data))

            if not bq_data:
                continue

            changed: list[ReportEntry] = []
            for report in report_batch:
                data = bq_data.get(str(report.uuid))
                if data is not None and self._apply_backfill(report, data):
                    changed.append(report)

            if not changed:
                continue

            ReportEntry.objects.bulk_update(
                changed,
                self._UPDATE_FIELDS,
                batch_size=self.DB_BATCH_SIZE,
            )
            total_updated += len(changed)
            LOG.info(
                "Updated %d reports in batch (cleared buckets for re-triaging)",
                len(changed),
            )

        LOG.info("Backfill complete: %d reports updated", total_updated)

    @staticmethod
    def _make_client() -> bigquery.Client:
        """Build a BigQuery client from the Django settings."""
        params = {
            "project": settings.BIGQUERY_PROJECT,
        }

        # Explicit service-account credentials are optional; without them the
        # client is constructed with the project only.
        if svc_acct := getattr(settings, "BIGQUERY_SERVICE_ACCOUNT", None):
            params["credentials"] = (
                service_account.Credentials.from_service_account_info(svc_acct)
            )

        return bigquery.Client(**params)

    @staticmethod
    def _fetch_backfill_data(
        client: bigquery.Client, uuids: list[str]
    ) -> dict[str, BackfillData]:
        """Query BigQuery for classification/translation rows of ``uuids``.

        Returns a mapping from report UUID (as ``str``) to the fetched data.
        Only reports with a classification row are returned (INNER JOIN); the
        translation columns are ``None`` when no translation row exists
        (LEFT JOIN). If several rows share a UUID, the last one wins.
        """
        query: str = f"""
            SELECT r.uuid,
            c.label as ml_label, c.probability as ml_probability,
            t.language_code, t.translated_text
            FROM `{settings.BIGQUERY_TABLE}` as r
            INNER JOIN `{settings.BIGQUERY_CLASSIFICATION_TABLE}` c
            ON r.uuid = c.report_uuid
            LEFT JOIN `{settings.BIGQUERY_TRANSLATIONS_TABLE}` t
            ON r.uuid = t.report_uuid
            WHERE r.uuid IN UNNEST(@uuids)
        """

        # The UUID list is passed as a query parameter, not interpolated.
        job_config = bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ArrayQueryParameter("uuids", "STRING", uuids)
            ]
        )

        bq_data: dict[str, BackfillData] = {}
        for row in client.query(query, job_config=job_config):
            # Key by ``str`` so lookups with ``str(report.uuid)`` always match.
            bq_data[str(row.uuid)] = BackfillData(
                ml_valid_probability=transform_ml_label(
                    row.ml_label, row.ml_probability
                ),
                language_code=row.language_code,
                translated_text=row.translated_text,
            )
        return bq_data

    @staticmethod
    def _apply_backfill(report: ReportEntry, data: BackfillData) -> bool:
        """Copy missing values from ``data`` onto ``report`` in memory.

        Existing values on the report are never overwritten. Returns ``True``
        when at least one field was filled in and the report must be saved.
        """
        updated = False

        if (
            report.ml_valid_probability is None
            and data.ml_valid_probability is not None
        ):
            report.ml_valid_probability = data.ml_valid_probability
            updated = True

        if report.comments_translated is None and data.translated_text is not None:
            report.comments_translated = data.translated_text
            report.comments_original_language = data.language_code
            report.comments_preprocessed = preprocess_text(data.translated_text)
            updated = True

        # Clear bucket assignment to re-triage these reports; reports that
        # already belong to a cluster keep their bucket.
        if updated and report.cluster_id is None:
            report.bucket_id = None

        return updated

server/reportmanager/management/commands/import_reports_from_bigquery.py

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from google.oauth2 import service_account
1515

1616
from reportmanager.models import ReportEntry
17+
from reportmanager.utils import transform_ml_label
1718
from webcompat.models import Report
1819

1920
LOG = getLogger("reportmanager.import")
@@ -57,19 +58,7 @@ def handle(self, *args, **options):
5758
)
5859

5960
for row in result:
60-
# The BugBot ML prediction can assign two labels, invalid or valid,
61-
# with a probability between 0 and 1. Having two labels makes
62-
# filtering and sorting harder, so let's transform "invalid 95%"
63-
# into "valid 5%".
64-
# There is a rare chance that a bug will have no score. In this case,
65-
# we just assign None, which will get treated as invalid in the
66-
# frontend.
67-
ml_valid_probability = None
68-
match row.ml_label:
69-
case "invalid":
70-
ml_valid_probability = 1 - row.ml_probability
71-
case "valid":
72-
ml_valid_probability = row.ml_probability
61+
ml_valid_probability = transform_ml_label(row.ml_label, row.ml_probability)
7362

7463
report_obj = Report(
7564
app_name=row.app_name,

server/reportmanager/migrations/0016_joblock.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ class Migration(migrations.Migration):
3131
fields=[
3232
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
3333
('singleton_key', models.PositiveSmallIntegerField(default=1, editable=False, help_text='Singleton key constrained to value 1 by check constraint', unique=True)),
34-
('lock_name', models.CharField(blank=True, choices=[('clustering', 'Clustering'), ('cleanup', 'Cleanup')], help_text='Name of operation holding the lock', max_length=50)),
34+
('lock_name', models.CharField(blank=True, choices=[('clustering', 'Clustering'), ('cleanup', 'Cleanup'), ('backfill', 'Backfill')], help_text='Name of operation holding the lock', max_length=50)),
3535
('acquired_at', models.DateTimeField(blank=True, null=True)),
3636
('acquired_by', models.CharField(blank=True, help_text='hostname:pid of process holding lock', max_length=255)),
3737
],

server/reportmanager/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,7 @@ class JobLock(models.Model):
536536
class LockTypes(models.TextChoices):
537537
CLUSTERING = "clustering", "Clustering"
538538
CLEANUP = "cleanup", "Cleanup"
539+
BACKFILL = "backfill", "Backfill"
539540

540541
# Locks older than 3 hours are considered stale
541542
STALE_LOCK_HOURS = 3

server/reportmanager/utils.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,42 @@
55
import re
66

77

8-
def preprocess_text(text):
8+
def preprocess_text(text: str | None) -> str:
99
if not text or text == "":
1010
return ""
1111

1212
text = html.unescape(text)
1313
text = str(text).strip()
1414
text = re.sub(r"\s+", " ", text)
1515
return text
16+
17+
18+
def transform_ml_label(
19+
ml_label: str | None, ml_probability: float | None
20+
) -> float | None:
21+
"""Transform ML label and probability into a valid probability.
22+
23+
The BugBot ML prediction can assign two labels, "invalid" or "valid",
24+
with a probability between 0 and 1. Having two labels makes filtering
25+
and sorting harder, so we transform "invalid 95%" into "valid 5%".
26+
27+
There is a chance that a bug will have no label and score. In this case,
28+
we just assign None, which will get treated as invalid in the
29+
frontend.
30+
31+
Args:
32+
ml_label: The ML label ("invalid" or "valid"), or None if missing
33+
ml_probability: The probability value (0-1), or None if missing
34+
35+
Returns:
36+
The probability that the report is valid, or None if label is unknown
37+
"""
38+
ml_valid_probability: float | None = None
39+
match ml_label:
40+
case "invalid":
41+
ml_valid_probability = (
42+
1 - ml_probability if ml_probability is not None else None
43+
)
44+
case "valid":
45+
ml_valid_probability = ml_probability
46+
return ml_valid_probability

server/server/settings.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,10 @@ def resolver_context_processor(request):
315315
"task": "reportmanager.cron.unhide_buckets",
316316
"schedule": 60,
317317
},
318+
"Backfill missing report data evry 12 hours": {
319+
"task": "reportmanager.cron.backfill_missing_report_data",
320+
"schedule": 60 * 60 * 12,
321+
},
318322
}
319323

320324
# Email

tests/test_utils.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
"""Tests for reportmanager.utils."""
22

3-
from reportmanager.utils import preprocess_text
3+
import pytest
4+
5+
from reportmanager.utils import preprocess_text, transform_ml_label
46

57

68
class TestPreprocessText:
@@ -83,3 +85,31 @@ def test_combined_transformations(self):
8385
# Multiple issues in one string
8486
input_text = "\t The & symbol is\nescaped "
8587
assert preprocess_text(input_text) == "The & symbol is escaped"
88+
89+
90+
class TestTransformMLLabel:
    """Tests for transform_ml_label function."""

    def test_valid_label_with_high_probability(self):
        """Test transformation of 'valid' label with high probability."""
        assert transform_ml_label("valid", 0.95) == 0.95

    def test_valid_label_with_low_probability(self):
        """Test transformation of 'valid' label with a lower probability."""
        assert transform_ml_label("valid", 0.53) == 0.53

    def test_invalid_label_with_high_probability(self):
        """Test transformation of 'invalid' label with high probability."""
        assert transform_ml_label("invalid", 0.95) == pytest.approx(0.05)

    def test_invalid_label_mid_probability(self):
        """Test transformation of 'invalid' label with mid probability."""
        assert transform_ml_label("invalid", 0.6) == pytest.approx(0.4)

    def test_none_label_returns_none(self):
        """Test transformation of None label returns None."""
        assert transform_ml_label(None, 0.5) is None

    def test_empty_string_label_returns_none(self):
        """Test transformation of empty string label returns None."""
        assert transform_ml_label("", 0.5) is None

    def test_unknown_label_returns_none(self):
        """Test that a label other than 'valid'/'invalid' returns None."""
        assert transform_ml_label("maybe", 0.5) is None

    def test_invalid_label_with_none_probability_returns_none(self):
        """Test that 'invalid' with a missing probability returns None."""
        assert transform_ml_label("invalid", None) is None

    def test_valid_label_with_none_probability_returns_none(self):
        """Test that 'valid' with a missing probability returns None."""
        assert transform_ml_label("valid", None) is None

0 commit comments

Comments
 (0)