|
5 | 5 | import copy |
6 | 6 | import json |
7 | 7 | from concurrent.futures import ThreadPoolExecutor |
| 8 | +from dataclasses import replace |
8 | 9 | from typing import Any, Optional, Union |
9 | 10 |
|
10 | 11 | from jinja2 import meta |
@@ -318,24 +319,25 @@ def run(self, documents: list[Document], page_range: Optional[list[Union[str, in |
318 | 319 | successful_documents = [] |
319 | 320 | failed_documents = [] |
320 | 321 | for document, result in zip(documents, results): |
| 322 | + new_meta = {**document.meta} |
321 | 323 | if "error" in result: |
322 | | - document.meta["metadata_extraction_error"] = result["error"] |
323 | | - document.meta["metadata_extraction_response"] = None |
324 | | - failed_documents.append(document) |
| 324 | + new_meta["metadata_extraction_error"] = result["error"] |
| 325 | + new_meta["metadata_extraction_response"] = None |
| 326 | + failed_documents.append(replace(document, meta=new_meta)) |
325 | 327 | continue |
326 | 328 |
|
327 | 329 | parsed_metadata = self._extract_metadata(result["replies"][0].text) |
328 | 330 | if "error" in parsed_metadata: |
329 | | - document.meta["metadata_extraction_error"] = parsed_metadata["error"] |
330 | | - document.meta["metadata_extraction_response"] = result["replies"][0] |
331 | | - failed_documents.append(document) |
| 331 | + new_meta["metadata_extraction_error"] = parsed_metadata["error"] |
| 332 | + new_meta["metadata_extraction_response"] = result["replies"][0] |
| 333 | + failed_documents.append(replace(document, meta=new_meta)) |
332 | 334 | continue |
333 | 335 |
|
334 | 336 | for key in parsed_metadata: |
335 | | - document.meta[key] = parsed_metadata[key] |
| 337 | + new_meta[key] = parsed_metadata[key] |
336 | 338 | # Remove metadata_extraction_error and metadata_extraction_response if present from previous runs |
337 | | - document.meta.pop("metadata_extraction_error", None) |
338 | | - document.meta.pop("metadata_extraction_response", None) |
339 | | - successful_documents.append(document) |
| 339 | + new_meta.pop("metadata_extraction_error", None) |
| 340 | + new_meta.pop("metadata_extraction_response", None) |
| 341 | + successful_documents.append(replace(document, meta=new_meta)) |
340 | 342 |
|
341 | 343 | return {"documents": successful_documents, "failed_documents": failed_documents} |
0 commit comments