Skip to content

Commit efc2a99

Browse files
better metadata handling when deduplicating
1 parent 70c3064 commit efc2a99

1 file changed

Lines changed: 8 additions & 1 deletion

File tree

WDoc/utils/batch_file_loader.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -428,8 +428,15 @@ def deterministic_sorter(doc_dict: DocDict) -> int:
428428
for k, v in doc.metadata.items():
429429
if "hash" in k:
430430
continue
431-
if k in deduped[ch].metadata and v == deduped[ch].metadata[k]:
431+
elif k == "source_tag":
432+
if "source_tag" in deduped[ch].metadata:
433+
deduped[ch].metadata[k] += " " + v
434+
else:
435+
deduped[ch].metadata[k] += v
432436
continue
437+
elif k in deduped[ch].metadata:
438+
if v == deduped[ch].metadata[k]:
439+
continue
433440
elif k not in deduped[ch].metadata:
434441
deduped[ch].metadata[k] = v
435442
elif isinstance(v, list) and isinstance(deduped[ch].metadata[k], list):

0 commit comments

Comments
 (0)