DefectDojo
diff --git a/dojo/api_v2/serializers.py b/dojo/api_v2/serializers.py
Lines changed: 33 additions & 19 deletions
diff --git a/dojo/db_migrations/0263_language_type_unique_language.py b/dojo/db_migrations/0263_language_type_unique_language.py
Lines changed: 97 additions & 0 deletions
diff --git a/dojo/db_migrations/max_migration.txt b/dojo/db_migrations/max_migration.txt
Lines changed: 1 addition & 1 deletion
@@ -12,6 +12,7 @@
 from django.contrib.auth.models import Group, Permission
 from django.contrib.auth.password_validation import validate_password
 from django.core.exceptions import PermissionDenied, ValidationError
+from django.db import transaction
 from django.db.utils import IntegrityError
 from django.urls import reverse
 from django.utils import timezone
@@ -2880,38 +2881,51 @@ def save(self):
                 deserialized = json.loads(data)
         except Exception:
             msg = "Invalid format"
-            raise Exception(msg)
+            raise serializers.ValidationError(msg)
 
-        # Filter out ignored keys
-        language_names = [name for name in deserialized if name not in {"header", "SUM"}]
-        # Prepopulate existing Language_Type objects
-        existing_types = {
+        # Filter out ignored keys and deduplicate
+        language_names = list(dict.fromkeys(
+            name for name in deserialized if name not in {"header", "SUM"}
+        ))
+        # Ensure any new Language_Type records exist (ignore conflicts from
+        # concurrent requests or already-existing types)
+        Language_Type.objects.bulk_create(
+            [Language_Type(language=name) for name in language_names],
+            ignore_conflicts=True,
+        )
+        # Single query to fetch all Language_Type objects we need (indexed lookup)
+        language_types = {
             lt.language: lt
             for lt in Language_Type.objects.filter(language__in=language_names)
         }
-        # Determine which Language_Type objects need to be created
-        new_language_names = [name for name in language_names if name not in existing_types]
-        new_types = [Language_Type(language=name) for name in new_language_names]
-        Language_Type.objects.bulk_create(new_types)
-        # Add newly created Language_Type objects to cache
-        for lt in Language_Type.objects.filter(language__in=new_language_names):
-            existing_types[lt.language] = lt
-        # Delete all Languages for this product
-        Languages.objects.filter(product=product).delete()
-        # Prepare Languages objects for bulk insert
-        languages_to_create = [
+        # Prepare Languages objects for upsert
+        languages_to_upsert = [
             Languages(
                 product=product,
-                language=existing_types[name],
+                language=language_types[name],
                 files=deserialized[name].get("nFiles", 0),
                 blank=deserialized[name].get("blank", 0),
                 comment=deserialized[name].get("comment", 0),
                 code=deserialized[name].get("code", 0),
             )
             for name in language_names
         ]
-        # Bulk insert all Languages in one query
-        Languages.objects.bulk_create(languages_to_create)
+        # Upsert Languages and remove stale ones atomically
+        try:
+            with transaction.atomic():
+                Languages.objects.bulk_create(
+                    languages_to_upsert,
+                    update_conflicts=True,
+                    unique_fields=["language", "product"],
+                    update_fields=["files", "blank", "comment", "code"],
+                )
+                # Remove languages no longer present in the file
+                Languages.objects.filter(product=product).exclude(
+                    language__in=language_types.values(),
+                ).delete()
+        except IntegrityError as e:
+            msg = f"Failed to import languages due to a data integrity issue: {e}"
+            raise serializers.ValidationError(msg)
 
     def validate(self, data):
         if is_scan_file_too_large(data["file"]):
 
@@ -0,0 +1,97 @@
+import logging
+
+from django.db import migrations, models
+from django.db.models import Count, Min
+
+logger = logging.getLogger(__name__)
+
+
+def deduplicate_language_types(apps, schema_editor):
+    """
+    Deduplicate Language_Type records by language name.
+
+    For each set of duplicates, keep the lowest-ID record, reassign all
+    Languages FK references to it, and delete the duplicates. Where a product
+    has Languages rows for more than one record of the same set, only the
+    lowest-ID Languages row is kept; the others are deleted *before* the
+    reassignment so the update cannot collide on (language, product).
+
+    A summary is logged when at least one Language_Type record was removed.
+
+    Args:
+        apps: App registry supplied by RunPython; used to look up the models.
+        schema_editor: Supplied by RunPython; not used here.
+    """
+    Language_Type = apps.get_model("dojo", "Language_Type")
+    Languages = apps.get_model("dojo", "Languages")
+
+    # Find language names that have duplicate Language_Type records
+    dupes = (
+        Language_Type.objects
+        .values("language")
+        .annotate(cnt=Count("id"), min_id=Min("id"))
+        .filter(cnt__gt=1)
+    )
+
+    # Counters for the summary log message at the end
+    total_reassigned = 0
+    total_deleted_types = 0
+    total_deleted_languages = 0
+
+    for dupe in dupes:
+        # The lowest ID in the set is the record that survives
+        canonical_id = dupe["min_id"]
+        duplicate_ids = list(
+            Language_Type.objects
+            .filter(language=dupe["language"])
+            .exclude(id=canonical_id)
+            .values_list("id", flat=True),
+        )
+        group_ids = [canonical_id, *duplicate_ids]
+
+        # A product may have Languages rows for several records of this set.
+        # Pointing them all at the canonical record would produce duplicate
+        # (language, product) pairs, and the UPDATE itself fails if that pair
+        # is unique in the database. Remove the surplus rows first, keeping
+        # the lowest-ID Languages record per product.
+        colliding_products = (
+            Languages.objects
+            .filter(language_id__in=group_ids)
+            .values("product_id")
+            .annotate(cnt=Count("id"), min_id=Min("id"))
+            .filter(cnt__gt=1)
+        )
+        for entry in colliding_products:
+            # delete() returns a (count, details) pair; only the count is used
+            deleted_count, _ = (
+                Languages.objects
+                .filter(
+                    language_id__in=group_ids,
+                    product_id=entry["product_id"],
+                )
+                .exclude(id=entry["min_id"])
+                .delete()
+            )
+            total_deleted_languages += deleted_count
+
+        # Reassign the remaining Languages FKs from duplicates to the canonical record
+        reassigned = Languages.objects.filter(
+            language_id__in=duplicate_ids,
+        ).update(language_id=canonical_id)
+        total_reassigned += reassigned
+
+        # Delete the duplicate Language_Type records
+        deleted_count, _ = Language_Type.objects.filter(id__in=duplicate_ids).delete()
+        total_deleted_types += deleted_count
+
+    if total_deleted_types:
+        logger.info(
+            "Deduplicated Language_Type: removed %d duplicate types, "
+            "reassigned %d Languages FK references, "
+            "removed %d duplicate Languages records",
+            total_deleted_types,
+            total_reassigned,
+            total_deleted_languages,
+        )
+
+
+def noop_reverse(apps, schema_editor):
+    """Reverse step for the data migration: does nothing and returns None."""
+    return None
+
+
+class Migration(migrations.Migration):
+    """
+    Merge duplicate Language_Type rows, then make Language_Type.language unique.
+
+    The data step runs first so the unique constraint can be added without
+    being violated by existing duplicates.
+    """
+
+    # Non-atomic so the data step is committed before the schema change
+    # starts. On PostgreSQL, updating/deleting rows and then running ALTER
+    # TABLE on the same table inside one transaction can fail with
+    # "cannot ALTER TABLE ... because it has pending trigger events".
+    atomic = False
+
+    dependencies = [
+        ("dojo", "0262_remove_system_settings_credentials"),
+    ]
+
+    operations = [
+        # atomic=True keeps the deduplication all-or-nothing in its own
+        # transaction even though the migration as a whole is non-atomic.
+        migrations.RunPython(deduplicate_language_types, noop_reverse, atomic=True),
+        migrations.AlterField(
+            model_name="language_type",
+            name="language",
+            field=models.CharField(max_length=100, unique=True),
+        ),
+    ]
@@ -1 +1 @@
-0262_remove_system_settings_credentials
+0263_language_type_unique_language
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1 +1 @@` |
| `1` | | `-0262_remove_system_settings_credentials` |
| | `1` | `+0263_language_type_unique_language` |