1+ from datetime import datetime
12from itertools import batched
23from logging import getLogger
34
910 ClusteringConfig ,
1011 ClusterReport ,
1112)
12- from reportmanager .models import Bucket , ClusteringJob , ClusteringJobType , ReportEntry
13+ from reportmanager .models import (
14+ Bucket ,
15+ BucketHit ,
16+ ClusteringJob ,
17+ ClusteringJobType ,
18+ ReportEntry ,
19+ )
1320
1421LOG = getLogger ("reportmanager.triage" )
1522
@@ -75,10 +82,10 @@ def cluster_unmatched_reports(
7582def apply_domain_bucketing_fallback (
7683 unmatched_reports : list [ClusterReport ],
7784 report_entries : dict [int , ReportEntry ],
78- ) -> int :
85+ ) -> tuple [ int , list [ tuple [ int , datetime ]]] :
7986 """Add unclustered reports to default domain-based buckets."""
8087 if not unmatched_reports :
81- return 0
88+ return 0 , []
8289
8390 LOG .info (
8491 f"Applying domain-based bucketing to { len (unmatched_reports )} reports that didn't cluster" # noqa
@@ -98,6 +105,7 @@ def apply_domain_bucketing_fallback(
98105 existing_buckets .update ({bucket ["domain" ]: bucket ["id" ] for bucket in buckets })
99106
100107 entries_to_update = []
108+ bucket_hits = []
101109 buckets_created = 0
102110
103111 for report in unmatched_reports :
@@ -118,14 +126,15 @@ def apply_domain_bucketing_fallback(
118126 existing_buckets [report .domain ] = bucket_id
119127
120128 entry = report_entries [report .id ]
121- entry .bucket_id = bucket_id
129+ entry .bucket_id = bucket_id # type: ignore[attr-defined]
122130 entries_to_update .append (entry )
131+ bucket_hits .append ((bucket_id , entry .reported_at ))
123132
124133 if entries_to_update :
125134 ReportEntry .objects .bulk_update (entries_to_update , ["bucket_id" ])
126135
127136 LOG .info (f"Applied domain-based bucketing to { len (entries_to_update )} reports" )
128- return buckets_created
137+ return buckets_created , bucket_hits
129138
130139
131140def get_cluster_bucket (
@@ -184,6 +193,7 @@ def run_triage(job: ClusteringJob) -> None:
184193 unmatched_reports = []
185194 low_quality_reports = []
186195 entries_to_update = []
196+ bucket_hits = []
187197
188198 for report in unbucketed_reports :
189199 if report .ok_to_cluster :
@@ -194,6 +204,7 @@ def run_triage(job: ClusteringJob) -> None:
194204 entry .cluster_id = cluster_id
195205 entry .bucket_id = bucket_id
196206 entries_to_update .append (entry )
207+ bucket_hits .append ((bucket_id , entry .reported_at ))
197208 else :
198209 # Track unmatched reports for further clustering
199210 unmatched_reports .append (report )
@@ -219,7 +230,13 @@ def run_triage(job: ClusteringJob) -> None:
219230 # Fall back to domain-based bucketing for reports that still don't have clusters
220231 # and low-quality reports
221232 remaining = still_unmatched + low_quality_reports
222- fallback_buckets = apply_domain_bucketing_fallback (remaining , report_entries )
233+ fallback_buckets , fallback_bucket_hits = apply_domain_bucketing_fallback (
234+ remaining , report_entries
235+ )
236+
237+ all_bucket_hits = bucket_hits + fallback_bucket_hits
238+ if all_bucket_hits :
239+ BucketHit .bulk_increment_counts (all_bucket_hits )
223240
224241 total_buckets = buckets_created + fallback_buckets
225242 complete_job (job , success = True , buckets_created = total_buckets )
0 commit comments