Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 4 additions & 10 deletions killed-in-gaza.json
Original file line number Diff line number Diff line change
Expand Up @@ -25080,7 +25080,10 @@
"dob": "1948-03-20",
"sex": "m",
"id": "938867033",
"source": "u"
"source": "u",
"duplicate_ids": [
"700187263"
]
},
{
"en_name": "Majid Mohammed Hassan Abu Ouda",
Expand Down Expand Up @@ -226052,15 +226055,6 @@
"id": "700189525",
"source": "u"
},
{
"en_name": "Abdel Raouf Hassan Awad Al-Najjar",
"name": "عبد الرؤوف حسن عوض النجار",
"age": 77,
"dob": "1948-03-20",
"sex": "m",
"id": "700187263",
"source": "u"
},
{
"en_name": "Ismail Mousa Jibril Al-Hammadin",
"name": "اسماعيل موسي جبريل الحمادين",
Expand Down
2 changes: 1 addition & 1 deletion killed-in-gaza.min.json

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions scripts/data/common/killed-in-gaza/duplicates.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/**
 * Hand-maintained registry of records known to describe the same person.
 *
 * Each entry maps a canonical ID (the record that is kept) to the list of
 * duplicate IDs that are removed and merged into that canonical record.
 *
 * The merged IDs are surfaced on the canonical record through a
 * `duplicate_ids` field, so API consumers can trace consolidated records.
 */
export const knownDuplicates: Record<string, string[]> = {
  // Abdel Raouf Hassan Awad Al-Najjar: assumed to be one person recorded
  // under two IDs.
  // See: https://github.com/TechForPalestine/palestine-datasets/issues/599
  "938867033": [
    "700187263",
  ],
};
102 changes: 98 additions & 4 deletions scripts/data/v2/killed-in-gaza.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import toEnName from "arabic-name-to-en";
import { writeJson } from "../../utils/fs";
import { ApiResource } from "../../../types/api.types";
import { readCsv } from "../../utils/csv";
import { knownDuplicates } from "../common/killed-in-gaza/duplicates";

const jsonFileName = "killed-in-gaza.json";

Expand All @@ -27,7 +28,7 @@ interface MappedRecord extends Record<string, string | number> {

const sexMapping = {
M: "m",
F: "f"
F: "f",
};

const namesFallbackTranslated = new Map<string, number>();
Expand Down Expand Up @@ -176,18 +177,111 @@ const validateJson = (json: Array<Record<string, number | string>>) => {
}
};

/**
 * Every ID that appears as a value in the knownDuplicates map, flattened
 * into one set so membership can be tested with a single lookup.
 */
const allKnownDuplicateIds = new Set<string>(
  Object.values(knownDuplicates).reduce<string[]>(
    (collected, duplicateIds) => collected.concat(duplicateIds),
    []
  )
);

/**
 * Group records by a `name|dob|sex` key and report every group that contains
 * more than one record. Such groups are potential duplicates that should be
 * resolved in the duplicates config.
 *
 * Records whose ID is in allKnownDuplicateIds are left out of the grouping.
 *
 * @param json records to inspect
 * @returns map from grouping key to the records sharing it, limited to keys
 *          shared by two or more records
 */
const detectUnknownDuplicates = (
  json: MappedRecord[]
): Map<string, MappedRecord[]> => {
  const groups = new Map<string, MappedRecord[]>();

  json
    // Known duplicates are handled by consolidation, not reported here
    .filter((entry) => !allKnownDuplicateIds.has(entry.id))
    .forEach((entry) => {
      const groupKey = `${entry.name}|${entry.dob}|${entry.sex}`;
      const bucket = groups.get(groupKey);
      if (bucket === undefined) {
        groups.set(groupKey, [entry]);
      } else {
        bucket.push(entry);
      }
    });

  // Keep only the keys that more than one record maps to
  return new Map(
    Array.from(groups).filter(([, members]) => members.length > 1)
  );
};

/**
 * Consolidate known duplicates: remove duplicate records and add
 * `duplicate_ids` to the canonical record.
 *
 * A duplicate record is only removed when the canonical record it merges into
 * is present in `json`. If the canonical record is missing, the duplicate is
 * kept (with a warning) so that a mistake in the config or the source data
 * cannot silently remove a person from the output.
 *
 * The input array and its records are not modified: a canonical record is
 * returned as a shallow copy carrying its own copy of the duplicate ID list.
 * Records that need no change are returned as the same object instances.
 *
 * @param json records to consolidate
 * @returns a new array with removable duplicates dropped and canonical
 *          records annotated with `duplicate_ids`
 */
const consolidateKnownDuplicates = (json: MappedRecord[]): MappedRecord[] => {
  const presentIds = new Set<string>(json.map((record) => record.id));

  // Reverse index (duplicate ID -> canonical ID) so we can check that the
  // record a duplicate merges into actually exists before dropping it.
  const canonicalIdByDuplicateId = new Map<string, string>();
  for (const [canonicalId, duplicateIds] of Object.entries(knownDuplicates)) {
    for (const duplicateId of duplicateIds) {
      canonicalIdByDuplicateId.set(duplicateId, canonicalId);
    }
  }

  const result: MappedRecord[] = [];

  for (const record of json) {
    if (allKnownDuplicateIds.has(record.id)) {
      const canonicalId = canonicalIdByDuplicateId.get(record.id);
      if (canonicalId !== undefined && presentIds.has(canonicalId)) {
        // Non-canonical record whose canonical counterpart is present: drop it
        continue;
      }
      // Dropping this record would lose the person entirely, so keep it
      console.warn(
        `Known duplicate ${record.id} kept: canonical record ${canonicalId} was not found`
      );
      result.push(record);
      continue;
    }

    // Own-property check so an ID can never resolve to an inherited
    // Object.prototype member
    const duplicateIds = Object.prototype.hasOwnProperty.call(
      knownDuplicates,
      record.id
    )
      ? knownDuplicates[record.id]
      : undefined;

    if (duplicateIds === undefined) {
      result.push(record);
      continue;
    }

    // Canonical record: annotate a copy rather than the caller's object, and
    // copy the ID list so the output does not alias the config array
    const annotated: Record<string, unknown> = {
      ...record,
      duplicate_ids: [...duplicateIds],
    };
    result.push(annotated as MappedRecord);
  }

  return result;
};

const generateJsonFromTranslatedCsv = async () => {
const [headerKeys, ...rows] = readCsv(
"scripts/data/common/killed-in-gaza/output/result.csv"
);
const jsonArray = formatToJson(headerKeys, rows);
validateJson(jsonArray);

// Detect unknown duplicates (same name+dob+sex, different IDs) and error
const unknownDupes = detectUnknownDuplicates(jsonArray);
if (unknownDupes.size > 0) {
const dupeDetails = Array.from(unknownDupes.entries())
.map(([key, records]) => {
const ids = records.map((r) => r.id).join(", ");
return ` ${records[0].en_name} (${key}): [${ids}]`;
})
.join("\n");
throw new Error(
`Found ${unknownDupes.size} unknown duplicate group(s) with same name+dob+sex but different IDs.\n` +
`Add them to scripts/data/common/killed-in-gaza/duplicates.ts and remove duplicates from raw.csv.\n` +
`Duplicate groups:\n${dupeDetails}`
);
}

// Consolidate known duplicates
const consolidated = consolidateKnownDuplicates(jsonArray);
const removedCount = jsonArray.length - consolidated.length;
if (removedCount > 0) {
console.log(
`Consolidated ${removedCount} known duplicate record(s) from ${jsonArray.length} total`
);
}

// sort by descending ID
jsonArray.sort((a, b) => b.id.localeCompare(a.id));
writeJson(ApiResource.KilledInGazaV2, jsonFileName, jsonArray);
consolidated.sort((a, b) => b.id.localeCompare(a.id));
writeJson(ApiResource.KilledInGazaV2, jsonFileName, consolidated);

console.log(
`generated JSON file with ${jsonArray.length} records: ${jsonFileName}`
`generated JSON file with ${consolidated.length} records: ${jsonFileName}`
);

const logLines: string[] = [];
Expand Down
1 change: 1 addition & 0 deletions types/killed-in-gaza.types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export type KilledInGaza = {
sex: "m" | "f";
en_name: string;
source: "h" | "c";
duplicate_ids?: string[];
};

export type MarqueePerson = Pick<KilledInGaza, "id" | "name" | "age" | "sex">;
Loading