|
| 1 | +""" |
| 2 | +Backfill operations for pghistory event tables. |
| 3 | +
|
| 4 | +Used by management commands and one-shot data migrations. Errors propagate |
| 5 | +to the caller so misconfigured backfills fail loudly. |
| 6 | +""" |
| 7 | +import logging |
| 8 | +import time |
| 9 | + |
| 10 | +from django.db import connection |
| 11 | +from django.utils import timezone |
| 12 | + |
| 13 | +logger = logging.getLogger(__name__) |
| 14 | + |
| 15 | + |
def get_excluded_fields(model_name):
    """Return the field names excluded from pghistory tracking for *model_name*.

    Mirrors the per-model exclusion configuration: password/secret columns
    and noisy auto-updated columns. Models with no exclusions get [].
    """
    if model_name in ("Dojo_User", "Cred_User"):
        return ["password"]
    if model_name == "Product":
        return ["updated"]
    if model_name == "Notification_Webhooks":
        return ["header_name", "header_value"]
    return []
| 25 | + |
| 26 | + |
def get_table_names(model_name):
    """Return ``(source_table, event_table)`` for a tracked model.

    The default convention is ``dojo_<model_name.lower()>`` for the source
    table; the override map below lists the tables whose names do not follow
    that convention (CamelCase M2M/through tables) plus the explicitly
    tracked models, exactly as in the original branch-per-model mapping.
    In every case the pghistory event table is the source table name with
    an ``event`` suffix.

    Args:
        model_name: Model name as used by the backfill commands.

    Returns:
        tuple: (table_name, event_table_name)
    """
    overrides = {
        "Dojo_User": "dojo_dojo_user",
        "Product_Type": "dojo_product_type",
        "Finding_Group": "dojo_finding_group",
        "Risk_Acceptance": "dojo_risk_acceptance",
        "Finding_Template": "dojo_finding_template",
        "Cred_User": "dojo_cred_user",
        "Notification_Webhooks": "dojo_notification_webhooks",
        # M2M through table: Django creates dojo_finding_reviewers for Finding.reviewers
        "FindingReviewers": "dojo_finding_reviewers",
        # Tag through tables (tagulous auto-generated)
        "FindingTags": "dojo_finding_tags",
        "FindingInheritedTags": "dojo_finding_inherited_tags",
        "ProductTags": "dojo_product_tags",
        "EngagementTags": "dojo_engagement_tags",
        "EngagementInheritedTags": "dojo_engagement_inherited_tags",
        "TestTags": "dojo_test_tags",
        "TestInheritedTags": "dojo_test_inherited_tags",
        "EndpointTags": "dojo_endpoint_tags",
        "EndpointInheritedTags": "dojo_endpoint_inherited_tags",
        "FindingTemplateTags": "dojo_finding_template_tags",
        "AppAnalysisTags": "dojo_app_analysis_tags",
        "ObjectsProductTags": "dojo_objects_product_tags",
    }
    table_name = overrides.get(model_name, f"dojo_{model_name.lower()}")
    return table_name, f"{table_name}event"
| 95 | + |
| 96 | + |
def check_tables_exist(table_name, event_table_name):
    """Return ``(table_exists, event_table_exists)`` booleans.

    Probes ``information_schema.tables`` once per table name on the default
    Django database connection.
    """
    exists_sql = """
            SELECT EXISTS (
                SELECT FROM information_schema.tables
                WHERE table_name = %s
            )
        """
    flags = []
    with connection.cursor() as cursor:
        for name in (table_name, event_table_name):
            cursor.execute(exists_sql, [name])
            flags.append(cursor.fetchone()[0])

    return flags[0], flags[1]
| 117 | + |
| 118 | + |
def process_model_backfill(
    model_name,
    batch_size=10000,
    *,
    dry_run=False,
    progress_callback=None,
):
    """
    Process a single model's backfill using PostgreSQL COPY.

    For every source row that does not yet have an 'initial_backfill' event,
    a snapshot row is streamed into the model's pghistory event table via the
    psycopg3 COPY protocol, in batches of *batch_size*. Excluded fields
    (passwords, headers, ...) are never copied. Errors propagate to the
    caller.

    Args:
        model_name: Name of the model to backfill
        batch_size: Number of records to process in each batch
        dry_run: If True, only show what would be done without creating events
        progress_callback: Optional callable that receives (message, style) tuples
            for progress updates. If None, uses logger.info

    Returns:
        tuple: (processed_count, records_per_second)

    """
    if progress_callback is None:
        # Default: route styled messages to the matching logger level.
        def progress_callback(msg, style=None):
            if style == "ERROR":
                logger.error(msg)
            elif style == "WARNING":
                logger.warning(msg)
            elif style == "DEBUG":
                logger.debug(msg)
            else:
                # SUCCESS and unstyled messages are both informational.
                logger.info(msg)

    table_name, event_table_name = get_table_names(model_name)

    table_exists, event_table_exists = check_tables_exist(table_name, event_table_name)

    if not table_exists:
        progress_callback(f"  Table {table_name} not found")
        return 0, 0.0

    if not event_table_exists:
        progress_callback(
            f"  Event table {event_table_name} not found. "
            f"Is {model_name} tracked by pghistory?",
            "DEBUG",
        )
        return 0, 0.0

    # Table names come from get_table_names(), not user input, so f-string
    # SQL is safe here.
    with connection.cursor() as cursor:
        cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
        total_count = cursor.fetchone()[0]

    if total_count == 0:
        progress_callback(f"  No records found for {model_name}")
        return 0, 0.0

    progress_callback(f"  Found {total_count:,} records")

    excluded_fields = get_excluded_fields(model_name)

    with connection.cursor() as cursor:
        cursor.execute(f"SELECT COUNT(*) FROM {event_table_name} WHERE pgh_label = 'initial_backfill'")
        existing_count = cursor.fetchone()[0]

    # Rows that still need a backfill event (idempotent re-runs skip done rows).
    with connection.cursor() as cursor:
        cursor.execute(f"""
            SELECT COUNT(*) FROM {table_name} t
            WHERE NOT EXISTS (
                SELECT 1 FROM {event_table_name} e
                WHERE e.pgh_obj_id = t.id AND e.pgh_label = 'initial_backfill'
            )
        """)
        backfill_count = cursor.fetchone()[0]

    progress_callback(f"  Records with initial_backfill events: {existing_count:,}")
    progress_callback(f"  Records needing initial_backfill events: {backfill_count:,}")

    if backfill_count == 0:
        progress_callback(f"  ✓ All {total_count:,} records already have initial_backfill events", "SUCCESS")
        return total_count, 0.0

    if dry_run:
        progress_callback(f"  Would process {backfill_count:,} records using COPY...")
        return backfill_count, 0.0

    # Event-table columns in physical order; pgh_id is the serial PK and is
    # assigned by the database, so it is excluded from the COPY column list.
    with connection.cursor() as cursor:
        cursor.execute("""
            SELECT column_name
            FROM information_schema.columns
            WHERE table_name = %s AND column_name != 'pgh_id'
            ORDER BY ordinal_position
        """, [event_table_name])
        event_columns = [row[0] for row in cursor.fetchall()]

    with connection.cursor() as cursor:
        cursor.execute(f"""
            SELECT t.id FROM {table_name} t
            WHERE NOT EXISTS (
                SELECT 1 FROM {event_table_name} e
                WHERE e.pgh_obj_id = t.id AND e.pgh_label = 'initial_backfill'
            )
            ORDER BY t.id
        """)
        ids_to_process = [row[0] for row in cursor.fetchall()]

    if not ids_to_process:
        progress_callback("  No records need backfill")
        return 0, 0.0

    processed = 0
    batch_start_time = time.time()
    model_start_time = time.time()

    with connection.cursor() as cursor:
        cursor.execute("""
            SELECT column_name
            FROM information_schema.columns
            WHERE table_name = %s
            ORDER BY ordinal_position
        """, [table_name])
        source_columns = [row[0] for row in cursor.fetchall()]

    # Drop excluded (sensitive/noisy) fields from the snapshot.
    source_columns = [col for col in source_columns if col not in excluded_fields]

    if "id" in source_columns:
        id_column_index = source_columns.index("id")
    else:
        id_column_index = 0
        progress_callback("  Warning: 'id' column not found in source columns, using first column", "WARNING")

    # Loop invariants hoisted out of the batch loop: the SELECT column list
    # and the COPY statement never change between batches.
    columns_str = ", ".join(source_columns)
    copy_sql = f"COPY {event_table_name} ({', '.join(event_columns)}) FROM STDIN WITH (FORMAT text, DELIMITER E'\\t')"

    for i in range(0, len(ids_to_process), batch_size):
        batch_ids = ids_to_process[i:i + batch_size]

        if i > 0 and i % (batch_size * 10) == 0:
            progress_callback(f"  Processing batch starting at index {i:,}...")

        placeholders = ", ".join(["%s"] * len(batch_ids))
        query = f"""
            SELECT {columns_str} FROM {table_name} t
            WHERE t.id IN ({placeholders})
            ORDER BY t.id
        """

        with connection.cursor() as cursor:
            cursor.execute(query, batch_ids)
            batch_rows = cursor.fetchall()

        if not batch_rows:
            progress_callback(f"  No records found for batch at index {i}")
            continue

        with connection.cursor() as cursor:
            # Unwrap Django's cursor to reach psycopg3's native COPY support.
            raw_cursor = cursor.cursor

            records = []
            for row in batch_rows:
                source_values = dict(zip(source_columns, row))

                row_data = []
                for col in event_columns:
                    if col == "pgh_created_at":
                        row_data.append(timezone.now())
                    elif col == "pgh_label":
                        row_data.append("initial_backfill")
                    elif col == "pgh_obj_id":
                        row_data.append(row[id_column_index])
                    elif col == "pgh_context_id":
                        # No request context exists for a backfill event.
                        row_data.append(None)
                    elif col in source_values:
                        row_data.append(source_values[col])
                    else:
                        # Event-only column with no source counterpart.
                        row_data.append(None)

                records.append(tuple(row_data))

            with raw_cursor.copy(copy_sql) as copy:
                for record in records:
                    copy.write_row(record)
            progress_callback("  COPY operation completed using write_row")

            # Commit per batch so partial progress survives a later failure.
            raw_cursor.connection.commit()

            raw_cursor.execute(f"SELECT COUNT(*) FROM {event_table_name} WHERE pgh_label = 'initial_backfill'")
            count = raw_cursor.fetchone()[0]
            progress_callback(f"  Records in event table after batch: {count}")

        batch_processed = len(batch_rows)
        processed += batch_processed

        batch_end_time = time.time()
        batch_duration = batch_end_time - batch_start_time
        batch_records_per_second = batch_processed / batch_duration if batch_duration > 0 else 0

        progress = (processed / backfill_count) * 100
        progress_callback(
            f"  Processed {processed:,}/{backfill_count:,} records ({progress:.1f}%) - "
            f"Last batch: {batch_duration:.2f}s ({batch_records_per_second:.1f} records/sec)",
        )

        batch_start_time = time.time()

    model_end_time = time.time()
    total_duration = model_end_time - model_start_time
    records_per_second = processed / total_duration if total_duration > 0 else 0

    progress_callback(
        f"  ✓ Completed {model_name}: {processed:,} records in {total_duration:.2f}s "
        f"({records_per_second:.1f} records/sec)",
        "SUCCESS",
    )
    return processed, records_per_second
| 338 | + |
| 339 | + |
def get_tracked_models():
    """Return the names of all models tracked by pghistory.

    Core models come first, followed by the tagulous-generated tag through
    tables; order matters for callers that iterate and report per model.
    """
    core_models = [
        "Dojo_User", "Endpoint", "Engagement", "Finding", "Finding_Group",
        "Product_Type", "Product", "Test", "Risk_Acceptance",
        "Finding_Template", "Cred_User", "Notification_Webhooks",
        "FindingReviewers",  # M2M through table for Finding.reviewers
        "Location", "URL",
    ]
    # Tag through tables (tagulous auto-generated)
    tag_tables = [
        "FindingTags",
        "FindingInheritedTags",
        "ProductTags",
        "EngagementTags",
        "EngagementInheritedTags",
        "TestTags",
        "TestInheritedTags",
        "EndpointTags",
        "EndpointInheritedTags",
        "FindingTemplateTags",
        "AppAnalysisTags",
        "ObjectsProductTags",
    ]
    return core_models + tag_tables
0 commit comments