diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 2d36004a3..48cd22ff8 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -2315,7 +2315,7 @@ definitions: properties: type: type: string - enum: [ KeyTransformation ] + enum: [KeyTransformation] prefix: title: Key Prefix description: Prefix to add for object keys. If not provided original keys remain unchanged. diff --git a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py index f55675e0a..cb35646fb 100644 --- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py @@ -2,23 +2,19 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. # import logging +import mimetypes import os import traceback from datetime import datetime from io import BytesIO, IOBase -from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast import backoff import dpath import nltk +import pi_heif # Required for handling HEIF images import requests -from unstructured.file_utils.filetype import ( - EXT_TO_FILETYPE, - FILETYPE_TO_MIMETYPE, - STR_TO_FILETYPE, - FileType, - detect_filetype, -) +from unstructured.file_utils.filetype import FileType, detect_filetype from airbyte_cdk.models import FailureType from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig @@ -141,25 +137,38 @@ async def infer_schema( format = _extract_format(config) with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle: filetype = self._get_filetype(file_handle, file) - if filetype not in self._supported_file_types() and not format.skip_unprocessable_files: - raise self._create_parse_error( - file, - self._get_file_type_error_message(filetype), + if ( + isinstance(filetype, str) or filetype not in self._supported_file_types() + ) and not format.skip_unprocessable_files: + error_message = self._get_file_type_error_message(filetype) + logger.error(f"File {file.uri} has unsupported type: {error_message}") + raise AirbyteTracedException( + message=error_message, + internal_message="Please check the logged errors for more information.", + failure_type=FailureType.config_error, ) return { "content": { - "type": "string", + "type": ["null", "string"], "description": "Content of the file as markdown. Might be null if the file could not be parsed", }, "document_key": { - "type": "string", + "type": ["null", "string"], "description": "Unique identifier of the document, e.g. the file path", }, "_ab_source_file_parse_error": { - "type": "string", + "type": ["null", "string"], "description": "Error message if the file could not be parsed even though the file is supported", }, + "_ab_source_file_last_modified": { + "type": ["null", "string"], + "description": "Last modified timestamp of the source file", + }, + "_ab_source_file_url": { + "type": ["null", "string"], + "description": "URL or path to the source file", + }, } def parse_records( @@ -178,20 +187,57 @@ def parse_records( "content": markdown, "document_key": file.uri, "_ab_source_file_parse_error": None, + "_ab_source_file_last_modified": file.last_modified.strftime( + "%Y-%m-%dT%H:%M:%S.%fZ" + ), + "_ab_source_file_url": file.uri, } except RecordParseError as e: # RecordParseError is raised when the file can't be parsed because of a problem with the file content (either the file is not supported or the file is corrupted) # if the skip_unprocessable_files flag is set, we log a warning and pass the error as part of the document # otherwise, we raise the error to fail the sync + exception_str = str(e) + if format.skip_unprocessable_files: + logger.warning( + f"File {file.uri} caused an error during parsing: {exception_str}." + ) + error_message = f"Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename={file.uri} message={exception_str}" + yield { + "content": None, + "document_key": file.uri, + "_ab_source_file_parse_error": error_message, + "_ab_source_file_last_modified": file.last_modified.strftime( + "%Y-%m-%dT%H:%M:%S.%fZ" + ), + "_ab_source_file_url": file.uri, + } + logger.warning(f"File {file.uri} cannot be parsed. Skipping it.") + else: + logger.error( + f"File {file.uri} caused an error during parsing: {exception_str}." + ) + raise AirbyteTracedException( + message="Please check the logged errors for more information.", + internal_message=exception_str, + failure_type=FailureType.config_error, + ) + except AirbyteTracedException as e: if format.skip_unprocessable_files: exception_str = str(e) - logger.warn(f"File {file.uri} caused an error during parsing: {exception_str}.") + logger.warning( + f"File {file.uri} caused an error during parsing: {exception_str}." + ) + error_message = f"Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename={file.uri} message={exception_str}" yield { "content": None, "document_key": file.uri, - "_ab_source_file_parse_error": exception_str, + "_ab_source_file_parse_error": error_message, + "_ab_source_file_last_modified": file.last_modified.strftime( + "%Y-%m-%dT%H:%M:%S.%fZ" + ), + "_ab_source_file_url": file.uri, } - logger.warn(f"File {file.uri} cannot be parsed. Skipping it.") + logger.warning(f"File {file.uri} cannot be parsed. Skipping it.") else: raise e except Exception as e: @@ -218,14 +264,21 @@ def _read_file( filetype: FileType | None = self._get_filetype(file_handle, remote_file) if filetype is None or filetype not in self._supported_file_types(): - raise self._create_parse_error( - remote_file, - self._get_file_type_error_message(filetype), + error_message = self._get_file_type_error_message(filetype) + logger.error(f"File {remote_file.uri} has unsupported type: {error_message}") + raise AirbyteTracedException( + message="Please check the logged errors for more information.", + internal_message=error_message, + failure_type=FailureType.config_error, ) if filetype in {FileType.MD, FileType.TXT}: - file_content: bytes = file_handle.read() - decoded_content: str = optional_decode(file_content) - return decoded_content + try: + file_content: bytes = file_handle.read() + decoded_content: str = optional_decode(file_content) + return decoded_content + except Exception as e: + logger.error(f"Error reading {filetype} file: {str(e)}") + raise self._create_parse_error(remote_file, str(e)) if format.processing.mode == "local": return self._read_file_locally( file_handle, @@ -335,10 +388,16 @@ def _read_file_remotely( data = self._params_to_dict(format.parameters, strategy) - file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])} + mime_type = ( + mimetypes.guess_type(f"file.{filetype.name.lower()}")[0] + if filetype + else "application/octet-stream" + ) + + files = cast(Any, {"files": ("filename", file_handle, mime_type)}) response = requests.post( - f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data + f"{format.api_url}/general/v0/general", headers=headers, data=data, files=files ) if response.status_code == 422: @@ -364,24 +423,25 @@ def _read_file_locally( # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point) raise Exception("unstructured library is not available") - file: Any = file_handle - # before the parsing logic is entered, the file is read completely to make sure it is in local memory file_handle.seek(0) - file_handle.read() + file_content = file_handle.read() file_handle.seek(0) try: - if filetype == FileType.PDF: - # for PDF, read the file into a BytesIO object because some code paths in pdf parsing are doing an instance check on the file object and don't work with file-like objects - file_handle.seek(0) - with BytesIO(file_handle.read()) as file: - file_handle.seek(0) + # For all file types, create a fresh BytesIO to avoid issues with file-like objects + with BytesIO(file_content) as file: + if filetype == FileType.PDF: elements = unstructured_partition_pdf(file=file, strategy=strategy) - elif filetype == FileType.DOCX: - elements = unstructured_partition_docx(file=file) - elif filetype == FileType.PPTX: - elements = unstructured_partition_pptx(file=file) + elif filetype == FileType.DOCX: + elements = unstructured_partition_docx(file=file) + elif filetype == FileType.PPTX: + elements = unstructured_partition_pptx(file=file) + else: + raise self._create_parse_error( + remote_file, + f"Unsupported file type {filetype} for local processing", + ) except Exception as e: raise self._create_parse_error(remote_file, str(e)) @@ -405,8 +465,13 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT 2. Use the file name if available 3. Use the file content """ - if remote_file.mime_type and remote_file.mime_type in STR_TO_FILETYPE: - return STR_TO_FILETYPE[remote_file.mime_type] + if remote_file.mime_type: + for file_type in FileType: + if ( + mimetypes.guess_type(f"file.{file_type.name.lower()}")[0] + == remote_file.mime_type + ): + return file_type # set name to none, otherwise unstructured will try to get the modified date from the local file system if hasattr(file, "name"): @@ -415,27 +480,67 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT # detect_filetype is either using the file name or file content # if possible, try to leverage the file name to detect the file type # if the file name is not available, use the file content - file_type: FileType | None = None + detected_type: FileType | None = None try: - file_type = detect_filetype( - filename=remote_file.uri, - ) + detected_type = detect_filetype(remote_file.uri) except Exception: # Path doesn't exist locally. Try something else... pass - if file_type and file_type != FileType.UNK: - return file_type + if detected_type and detected_type != FileType.UNK: + return detected_type - type_based_on_content = detect_filetype(file=file) - file.seek(0) # detect_filetype is reading to read the file content, so we need to reset + file.seek(0) + try: + file_content = file.read(4096) # Read a sample of the file to detect type + file.seek(0) + + if isinstance(file_content, bytes) and file_content.startswith(b"%PDF-"): + return FileType.PDF + + if isinstance(file_content, bytes) and file_content.startswith(b"PK\x03\x04"): + if ( + b"ppt/" in file_content + or b"application/vnd.openxmlformats-officedocument.presentationml" + in file_content + ): + return FileType.PPTX + elif b"word/" in file_content or b"[Content_Types].xml" in file_content: + return FileType.DOCX + + if file_content and isinstance(file_content, bytes): + try: + content_str = file_content.decode("utf-8", errors="ignore") + if ( + content_str.lstrip().startswith("#") + or remote_file.mime_type == "text/markdown" + or remote_file.uri.endswith(".md") + ): + return FileType.MD + elif content_str.strip() and not any( + c for c in content_str[:100] if ord(c) > 127 + ): + return FileType.TXT + except UnicodeDecodeError: + pass # Not a text file + + type_based_on_content = FileType.UNK + except Exception as e: + type_based_on_content = FileType.UNK + file.seek(0) # Reset file position after reading if type_based_on_content and type_based_on_content != FileType.UNK: return type_based_on_content - extension = "." + remote_file.uri.split(".")[-1].lower() - if extension in EXT_TO_FILETYPE: - return EXT_TO_FILETYPE[extension] + if "." in remote_file.uri: + extension = "." + remote_file.uri.split(".")[-1].lower() + if extension == ".csv": + return FileType.CSV + for file_type in FileType: + if file_type.name.lower() == extension[1:].lower(): + return file_type + if remote_file.uri.endswith(".md") or remote_file.mime_type == "text/markdown": + return FileType.MD return None @@ -444,7 +549,7 @@ def _supported_file_types(self) -> List[Any]: def _get_file_type_error_message( self, - file_type: FileType | None, + file_type: Union[FileType, str, None], ) -> str: supported_file_types = ", ".join([str(type) for type in self._supported_file_types()]) return f"File type {file_type or 'None'!s} is not supported. Supported file types are {supported_file_types}" diff --git a/airbyte_cdk/sources/file_based/schema_helpers.py b/airbyte_cdk/sources/file_based/schema_helpers.py index 1b653db67..3089b3cef 100644 --- a/airbyte_cdk/sources/file_based/schema_helpers.py +++ b/airbyte_cdk/sources/file_based/schema_helpers.py @@ -54,6 +54,12 @@ def __lt__(self, other: Any) -> bool: def get_comparable_type(value: Any) -> Optional[ComparableType]: + if isinstance(value, list): + non_null_types = [item for item in value if item != "null"] + if non_null_types: + return get_comparable_type(non_null_types[0]) + else: + return ComparableType.NULL if value == "null": return ComparableType.NULL if value == "boolean": @@ -121,6 +127,8 @@ def merge_schemas(schema1: SchemaType, schema2: SchemaType) -> SchemaType: def _is_valid_type(t: JsonSchemaSupportedType) -> bool: + if isinstance(t, list): + return all(get_comparable_type(item) is not None for item in t) return t == "array" or get_comparable_type(t) is not None @@ -128,7 +136,33 @@ def _choose_wider_type(key: str, t1: Mapping[str, Any], t2: Mapping[str, Any]) - t1_type = t1["type"] t2_type = t2["type"] - if (t1_type == "array" or t2_type == "array") and t1 != t2: + if isinstance(t1_type, list) and isinstance(t2_type, list): + if set(t1_type).issubset(set(t2_type)): + return t2 + elif set(t2_type).issubset(set(t1_type)): + return t1 + else: + combined_types = list(set(t1_type).union(set(t2_type))) + result = dict(t1) + result["type"] = combined_types + return result + elif isinstance(t1_type, list): + if t2_type in t1_type: + return t1 + else: + combined_types = list(set(t1_type + [t2_type])) + result = dict(t1) + result["type"] = combined_types + return result + elif isinstance(t2_type, list): + if t1_type in t2_type: + return t2 + else: + combined_types = list(set(t2_type + [t1_type])) + result = dict(t2) + result["type"] = combined_types + return result + elif (t1_type == "array" or t2_type == "array") and t1 != t2: raise SchemaInferenceError( FileBasedSourceError.SCHEMA_INFERENCE_ERROR, details="Cannot merge schema for unequal array types.", @@ -149,20 +183,38 @@ def _choose_wider_type(key: str, t1: Mapping[str, Any], t2: Mapping[str, Any]) - detected_types=f"{t1},{t2}", ) else: - comparable_t1 = get_comparable_type( - TYPE_PYTHON_MAPPING[t1_type][0] - ) # accessing the type_mapping value - comparable_t2 = get_comparable_type( - TYPE_PYTHON_MAPPING[t2_type][0] - ) # accessing the type_mapping value - if not comparable_t1 and comparable_t2: - raise SchemaInferenceError( - FileBasedSourceError.UNRECOGNIZED_TYPE, key=key, detected_types=f"{t1},{t2}" - ) - return max( - [t1, t2], - key=lambda x: ComparableType(get_comparable_type(TYPE_PYTHON_MAPPING[x["type"]][0])), - ) # accessing the type_mapping value + if not isinstance(t1_type, list) and not isinstance(t2_type, list): + comparable_t1 = get_comparable_type( + TYPE_PYTHON_MAPPING[t1_type][0] + ) # accessing the type_mapping value + comparable_t2 = get_comparable_type( + TYPE_PYTHON_MAPPING[t2_type][0] + ) # accessing the type_mapping value + if not comparable_t1 and comparable_t2: + raise SchemaInferenceError( + FileBasedSourceError.UNRECOGNIZED_TYPE, key=key, detected_types=f"{t1},{t2}" + ) + return max( + [t1, t2], + key=lambda x: ComparableType( + get_comparable_type(TYPE_PYTHON_MAPPING[x["type"]][0]) + ), + ) # accessing the type_mapping value + + combined_types = [] + if isinstance(t1_type, list): + combined_types.extend(t1_type) + else: + combined_types.append(t1_type) + + if isinstance(t2_type, list): + combined_types.extend(t2_type) + else: + combined_types.append(t2_type) + + result = dict(t1) + result["type"] = list(set(combined_types)) + return result def is_equal_or_narrower_type(value: Any, expected_type: str) -> bool: diff --git a/poetry.lock b/poetry.lock index aec560f76..2faa46dfc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,18 @@ # This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. +[[package]] +name = "aiofiles" +version = "24.1.0" +description = "File support for asyncio." +optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"file-based\"" +files = [ + {file = "aiofiles-24.1.0-py3-none-any.whl", hash = "sha256:b4ec55f4195e3eb5d7abd1bf7e061763e864dd4954231fb8539a0ef8bb8260e5"}, + {file = "aiofiles-24.1.0.tar.gz", hash = "sha256:22a075c9e5a3810f0c2e48f3008c94d68c65d763b9b03857924c99e57355166c"}, +] + [[package]] name = "aiohappyeyeballs" version = "2.4.4" @@ -1077,6 +1090,22 @@ files = [ {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, ] +[[package]] +name = "eval-type-backport" +version = "0.2.2" +description = "Like `typing._eval_type`, but lets older Python versions use newer typing features." +optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"file-based\"" +files = [ + {file = "eval_type_backport-0.2.2-py3-none-any.whl", hash = "sha256:cb6ad7c393517f476f96d456d0412ea80f0a8cf96f6892834cd9340149111b0a"}, + {file = "eval_type_backport-0.2.2.tar.gz", hash = "sha256:f0576b4cf01ebb5bd358d02314d31846af5e07678387486e2c798af0e7d849c1"}, +] + +[package.extras] +tests = ["pytest"] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -1474,6 +1503,29 @@ files = [ {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, ] +[[package]] +name = "html5lib" +version = "1.1" +description = "HTML parser based on the WHATWG HTML specification" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"file-based\"" +files = [ + {file = "html5lib-1.1-py2.py3-none-any.whl", hash = "sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d"}, + {file = "html5lib-1.1.tar.gz", hash = "sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f"}, +] + +[package.dependencies] +six = ">=1.9" +webencodings = "*" + +[package.extras] +all = ["chardet (>=2.2)", "genshi", "lxml"] +chardet = ["chardet (>=2.2)"] +genshi = ["genshi"] +lxml = ["lxml"] + [[package]] name = "httpcore" version = "1.0.7" @@ -2599,6 +2651,19 @@ files = [ ] markers = {main = "(extra == \"vector-db-based\" or extra == \"file-based\") and (python_version <= \"3.11\" or python_version >= \"3.12\")", dev = "python_version <= \"3.11\" or python_version >= \"3.12\""} +[[package]] +name = "nest-asyncio" +version = "1.6.0" +description = "Patch asyncio to allow nested event loops" +optional = true +python-versions = ">=3.5" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"file-based\"" +files = [ + {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"}, + {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"}, +] + [[package]] name = "nltk" version = "3.9.1" @@ -2673,6 +2738,22 @@ files = [ {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, ] +[[package]] +name = "olefile" +version = "0.47" +description = "Python package to parse, read and write Microsoft OLE2 files (Structured Storage or Compound Document, Microsoft Office)" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"file-based\"" +files = [ + {file = "olefile-0.47-py2.py3-none-any.whl", hash = "sha256:543c7da2a7adadf21214938bb79c83ea12b473a4b6ee4ad4bf854e7715e13d1f"}, + {file = "olefile-0.47.zip", hash = "sha256:599383381a0bf3dfbd932ca0ca6515acd174ed48870cbf7fee123d698c192c1c"}, +] + +[package.extras] +tests = ["pytest", "pytest-cov"] + [[package]] name = "openai" version = "0.27.9" @@ -2986,14 +3067,78 @@ Jinja2 = ">=2.11.0" MarkupSafe = ">=1.1.1" pygments = ">=2.12.0" +[[package]] +name = "pi-heif" +version = "0.22.0" +description = "Python interface for libheif library" +optional = false +python-versions = ">=3.9" +groups = ["main", "dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "pi_heif-0.22.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:fca84436339eee2c91ff09cd7e301cfa2a0f7a9d83d5bc6a9d1db8587221d239"}, + {file = "pi_heif-0.22.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:46b0fcf876d85c8684d3bc1a0b7a4e4bc5673b72084807dc6bf85caa2da9173b"}, + {file = "pi_heif-0.22.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d85a8b09e28f3234a9a64796fc3ed71516b14a9ba08cad416ebd0db251e5f263"}, + {file = "pi_heif-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21416131308fabaeadbd1eae4d4daf218443832409f91ea6571edb64a0dc8d1c"}, + {file = "pi_heif-0.22.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d308f32ec557ec9f8cfee1225d83d391ffc72a1a8f03106a5805693c02359678"}, + {file = "pi_heif-0.22.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:94359418200d7ed61f1910c5b3318fcaf0bb6e25c3e6361fbf986b320d4b7e80"}, + {file = "pi_heif-0.22.0-cp310-cp310-win_amd64.whl", hash = "sha256:0292a1c4b58a7bfeaad0e315ca713beee3051600cf2c100a0fa96fb32377c8fd"}, + {file = "pi_heif-0.22.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:98dab5eb6bd70bdbe8ce021b4287c42ca779f6ee6d6f6fc91609d950e135d6dd"}, + {file = "pi_heif-0.22.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:ed1731ebece9dcaea50db251b891318ebfc6971161664cca1fd1367e75aa815f"}, + {file = "pi_heif-0.22.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d92149bad299390a96f29dc584bc0020c88d36d3edf073f03a6ac6b595673f63"}, + {file = "pi_heif-0.22.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd9f1688caa359ad9c6a66fc167fa41fa24dc0fa8ceed65be2c31563d42eb700"}, + {file = "pi_heif-0.22.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6339784cd447664faa4705373b7f4d7bc9c4133bc0e0a1140516614cd047e9a8"}, + {file = "pi_heif-0.22.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:2c5cfa7b8610750751cd414f7e276093080b38e1728d721f5d315f03a9ebd25c"}, + {file = "pi_heif-0.22.0-cp311-cp311-win_amd64.whl", hash = "sha256:e739bfe4a1785e34b52eecf092d5c511b673f20f053c728472167fe3ddcbe202"}, + {file = "pi_heif-0.22.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:fe7b539c1924973de96a58477dab29475ed8bfbc81cb4588db9655e3661710ba"}, + {file = "pi_heif-0.22.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:322fd33c75ccf1208f08d07aea06c7582eed6e577a3400fe6efcbaab0c1677ff"}, + {file = "pi_heif-0.22.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3965be305b4a5bbe4c7585f45feeab18ed18228e729a970e9b8a09b25434c885"}, + {file = "pi_heif-0.22.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebd91145a1ab9229ce330e5a7cb8a95c875c16a1cb1f2b0b5ed86e61a9fb6bd4"}, + {file = "pi_heif-0.22.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ed229d31a4e0037f0ba417a21f403fb8f965a40e3e5abaedafe717f6b710f544"}, + {file = "pi_heif-0.22.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6d95b90d5b005c35839120e934bfa5746fdf88ba344d1e58a814a33e5e9f057c"}, + {file = "pi_heif-0.22.0-cp312-cp312-win_amd64.whl", hash = "sha256:943dee9b05c768acbc06662b327518b2a257dd08ced79dce7c11fab5ac2d5c4b"}, + {file = "pi_heif-0.22.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:95dd7ec2cbcef6ef1110c6ba539fa7e1489a023589076ca8b3eebcb1e38d256c"}, + {file = "pi_heif-0.22.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:0e635dceb40424b5d88c7a2183d8dabb844c7776118df12f275ead2a10d275f6"}, + {file = "pi_heif-0.22.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f668c27a564c7373a462c0484d49166084ec608b65f9d6763fef7a1c80eee8c0"}, + {file = "pi_heif-0.22.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24ea5ba8cbd871ae09a856dbb9a7e6376ba70b5207085d0302f539574614b9e0"}, + {file = "pi_heif-0.22.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a89b57cd839b09ee749d12397d2027e20fe7a64a44883688ab44a873b16b507b"}, + {file = "pi_heif-0.22.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93acd60ef14e3ea835b7e3dafe284c07116349b0df05507520f10520c3ad09c1"}, + {file = "pi_heif-0.22.0-cp313-cp313-win_amd64.whl", hash = "sha256:6415b0005216ad08f86d0ef75ec24e13e60bf5f45273ab54a4a22f008b9f41ac"}, + {file = "pi_heif-0.22.0-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:3f85ac3c0e2fb18af10e5b9789dcfd73f091b1d6ea2090d70d6e87f8744b8fe9"}, + {file = "pi_heif-0.22.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2635cbcf35206dd3d7f6453df8a6a5cd6a83bcdc9818d999b7342837482d614e"}, + {file = "pi_heif-0.22.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:893a49c195563a9bbbef571daad995110b47e3e6b624b92269c281cf1b70b8da"}, + {file = "pi_heif-0.22.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b160a20dd6fa9d951a556006f02ec601a433ec4002953fdb67025f42e5fa89ea"}, + {file = "pi_heif-0.22.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4e2508317837ad6da6b6e2ba154faab766a0cdc189a86dd45b4b7decd641bfa5"}, + {file = "pi_heif-0.22.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a7a1666070cffce08027b4309fb7f270c0e3a4715a3e5a7a7202b05f65a849f2"}, + {file = "pi_heif-0.22.0-cp39-cp39-win_amd64.whl", hash = "sha256:c73e651cb17b7da3a740881c479e224084c95380df0d9d4f72d4858a422e80ae"}, + {file = "pi_heif-0.22.0-pp310-pypy310_pp73-macosx_13_0_x86_64.whl", hash = "sha256:6b83ec2f6db2dd61e09940006ee0a854eb58d91a52023be057da13a08a9f0517"}, + {file = "pi_heif-0.22.0-pp310-pypy310_pp73-macosx_14_0_arm64.whl", hash = "sha256:f33211fa2afa756b13a63e21aeab577cdc7ddb18a929a012cbbcd3b7d8a772d0"}, + {file = "pi_heif-0.22.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a82bb03e5ab429b6aee5f1446c7c1925b1fb4fd58d74c960c7995734285db269"}, + {file = "pi_heif-0.22.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:79d72744708949bd9028516d860bd2c341371bca13aa2196e4f2267263834608"}, + {file = "pi_heif-0.22.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:7bb583f93bb4c1dfaf3b6e689a9fa0de7c83182730c16ec8798c459cf8c3e8cf"}, + {file = "pi_heif-0.22.0-pp39-pypy39_pp73-macosx_13_0_x86_64.whl", hash = "sha256:052fffb0b65c51adf90993a696dd51dddc5f5707d5f40e7bd9f4ad958bb505d9"}, + {file = "pi_heif-0.22.0-pp39-pypy39_pp73-macosx_14_0_arm64.whl", hash = "sha256:b326a48001a97906e5eb4110113d0cfe1203704f3572100dd177782568c9fc32"}, + {file = "pi_heif-0.22.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8cc68012a870d5e39d8fd5468dfd1d452ca10388cab5fac30f90ddfa0772a3e"}, + {file = "pi_heif-0.22.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:350c49ac597d1b8cdaa8a35f2c0901a3847067b9d0a9fdc07d2d6851e5d63382"}, + {file = "pi_heif-0.22.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f3de6fb5a58cf271897adc31e045db45003ae1e32116efc30fa20c72e1c90b2b"}, + {file = "pi_heif-0.22.0.tar.gz", hash = "sha256:489ddda3c9fed948715a9c8642c6ee24c3b438a7fbf85b3a8f097d632d7082a8"}, +] + +[package.dependencies] +pillow = ">=10.1.0" + +[package.extras] +tests = ["defusedxml", "numpy", "packaging", "pympler", "pytest"] +tests-min = ["defusedxml", "packaging", "pytest"] + [[package]] name = "pillow" version = "11.1.0" description = "Python Imaging Library (Fork)" -optional = true +optional = false python-versions = ">=3.9" -groups = ["main"] -markers = "(extra == \"vector-db-based\" or extra == \"file-based\") and (python_version <= \"3.11\" or python_version >= \"3.12\")" +groups = ["main", "dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pillow-11.1.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:e1abe69aca89514737465752b4bcaf8016de61b3be1397a8fc260ba33321b3a8"}, {file = "pillow-11.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c640e5a06869c75994624551f45e5506e4256562ead981cce820d5ab39ae2192"}, @@ -3571,6 +3716,30 @@ files = [ [package.extras] diagrams = ["jinja2", "railroad-diagrams"] +[[package]] +name = "pypdf" +version = "5.4.0" +description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" +optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"file-based\"" +files = [ + {file = "pypdf-5.4.0-py3-none-any.whl", hash = "sha256:db994ab47cadc81057ea1591b90e5b543e2b7ef2d0e31ef41a9bfe763c119dab"}, + {file = "pypdf-5.4.0.tar.gz", hash = "sha256:9af476a9dc30fcb137659b0dec747ea94aa954933c52cf02ee33e39a16fe9175"}, +] + +[package.dependencies] +typing_extensions = {version = ">=4.0", markers = "python_version < \"3.11\""} + +[package.extras] +crypto = ["cryptography"] +cryptodome = ["PyCryptodome"] +dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "pytest-socket", "pytest-timeout", "pytest-xdist", "wheel"] +docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] +full = ["Pillow (>=8.0.0)", "cryptography"] +image = ["Pillow (>=8.0.0)"] + [[package]] name = "pyproject-flake8" version = "6.1.0" @@ -3939,21 +4108,41 @@ files = [ {file = "python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"}, ] +[[package]] +name = "python-oxmsg" +version = "0.0.2" +description = "Extract attachments from Outlook .msg files." +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"file-based\"" +files = [ + {file = "python_oxmsg-0.0.2-py3-none-any.whl", hash = "sha256:22be29b14c46016bcd05e34abddfd8e05ee82082f53b82753d115da3fc7d0355"}, + {file = "python_oxmsg-0.0.2.tar.gz", hash = "sha256:a6aff4deb1b5975d44d49dab1d9384089ffeec819e19c6940bc7ffbc84775fad"}, +] + +[package.dependencies] +click = "*" +olefile = "*" +typing_extensions = ">=4.9.0" + [[package]] name = "python-pptx" -version = "0.6.21" -description = "Generate and manipulate Open XML PowerPoint (.pptx) files" +version = "1.0.2" +description = "Create, read, and update PowerPoint 2007+ (.pptx) files." optional = true -python-versions = "*" +python-versions = ">=3.8" groups = ["main"] markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"file-based\"" files = [ - {file = "python-pptx-0.6.21.tar.gz", hash = "sha256:7798a2aaf89563565b3c7120c0acfe9aff775db0db3580544e3bf4840c2e378f"}, + {file = "python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba"}, + {file = "python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095"}, ] [package.dependencies] lxml = ">=3.1.0" Pillow = ">=3.3.2" +typing-extensions = ">=4.9.0" XlsxWriter = ">=0.5.7" [[package]] @@ -4787,22 +4976,6 @@ postgresql-psycopgbinary = ["psycopg[binary] (>=3.0.7)"] pymysql = ["pymysql"] sqlcipher = ["sqlcipher3_binary"] -[[package]] -name = "tabulate" -version = "0.9.0" -description = "Pretty-print tabular data" -optional = true -python-versions = ">=3.7" -groups = ["main"] -markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"file-based\"" -files = [ - {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}, - {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}, -] - -[package.extras] -widechars = ["wcwidth"] - [[package]] name = "tenacity" version = "8.5.0" @@ -5084,6 +5257,22 @@ files = [ mypy-extensions = ">=0.3.0" typing-extensions = ">=3.7.4" +[[package]] +name = "typing-inspection" +version = "0.4.0" +description = "Runtime typing introspection tools" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"file-based\"" +files = [ + {file = "typing_inspection-0.4.0-py3-none-any.whl", hash = "sha256:50e72559fcd2a6367a19f7a7e610e6afcb9fac940c650290eed893d61386832f"}, + {file = "typing_inspection-0.4.0.tar.gz", hash = "sha256:9765c87de36671694a67904bf2c96e395be9c6439bb6c87b5142569dcdd65122"}, +] + +[package.dependencies] +typing-extensions = ">=4.12.0" + [[package]] name = "tzdata" version = "2025.1" @@ -5115,15 +5304,15 @@ test = ["coverage", "pytest", "pytest-cov"] [[package]] name = "unstructured" -version = "0.10.27" +version = "0.17.2" description = "A library that prepares raw documents for downstream ML tasks." optional = true -python-versions = ">=3.7.0" +python-versions = ">=3.9.0" groups = ["main"] markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"file-based\"" files = [ - {file = "unstructured-0.10.27-py3-none-any.whl", hash = "sha256:3a8a8e44302388ddc39c184059e8b4458f1cdc58032540b9af7d85f6c3eca3be"}, - {file = "unstructured-0.10.27.tar.gz", hash = "sha256:f567b5c4385993a9ab48db5563dd7b413aac4f2002bb22e6250496ea8f440f5e"}, + {file = "unstructured-0.17.2-py3-none-any.whl", hash = "sha256:527dd26a4b273aebef2f9119c9d4f0d0ce17640038d92296d23abe89be123840"}, + {file = "unstructured-0.17.2.tar.gz", hash = "sha256:af18c3caef0a6c562cf77e34ee8b6ff522b605031d2336ffe565df66f126aa46"}, ] [package.dependencies] @@ -5133,68 +5322,70 @@ chardet = "*" dataclasses-json = "*" emoji = "*" filetype = "*" +html5lib = "*" langdetect = "*" lxml = "*" nltk = "*" numpy = "*" -python-docx = {version = ">=1.0.1", optional = true, markers = "extra == \"docx\""} +psutil = "*" +python-docx = {version = ">=1.1.2", optional = true, markers = "extra == \"docx\""} python-iso639 = "*" python-magic = "*" -python-pptx = {version = "<=0.6.21", optional = true, markers = "extra == \"pptx\""} +python-oxmsg = "*" +python-pptx = {version = ">=1.0.1", optional = true, markers = "extra == \"pptx\""} rapidfuzz = "*" requests = "*" -tabulate = "*" +tqdm = "*" typing-extensions = "*" +unstructured-client = "*" +wrapt = "*" [package.extras] -airtable = ["pyairtable"] -all-docs = ["markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.0.1)", "python-pptx (<=0.6.21)", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] -azure = ["adlfs", "fsspec (==2023.9.1)"] -azure-cognitive-search = ["azure-search-documents"] -bedrock = ["boto3", "langchain"] -biomed = ["bs4"] -box = ["boxfs", "fsspec (==2023.9.1)"] -confluence = ["atlassian-python-api"] +all-docs = ["effdet", "google-cloud-vision", "markdown", "networkx", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (>=0.8.10)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] csv = ["pandas"] -delta-table = ["deltalake", "fsspec (==2023.9.1)"] -discord = ["discord-py"] -doc = ["python-docx (>=1.0.1)"] -docx = ["python-docx (>=1.0.1)"] -dropbox = ["dropboxdrivefs", "fsspec (==2023.9.1)"] -elasticsearch = ["elasticsearch", "jq"] -embed-huggingface = ["huggingface", "langchain", "sentence-transformers"] +doc = ["python-docx (>=1.1.2)"] +docx = ["python-docx (>=1.1.2)"] epub = ["pypandoc"] -gcs = ["bs4", "fsspec (==2023.9.1)", "gcsfs"] -github = ["pygithub (>1.58.0)"] -gitlab = ["python-gitlab"] -google-drive = ["google-api-python-client"] huggingface = ["langdetect", "sacremoses", "sentencepiece", "torch", "transformers"] -image = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)"] -jira = ["atlassian-python-api"] -local-inference = ["markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.0.1)", "python-pptx (<=0.6.21)", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] +image = ["effdet", "google-cloud-vision", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypdf", "unstructured-inference (>=0.8.10)", "unstructured.pytesseract (>=0.3.12)"] +local-inference = ["effdet", "google-cloud-vision", "markdown", "networkx", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (>=0.8.10)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] md = ["markdown"] -msg = ["msg-parser"] -notion = ["htmlBuilder", "notion-client"] -odt = ["pypandoc", "python-docx (>=1.0.1)"] -onedrive = ["Office365-REST-Python-Client (<2.4.3)", "bs4", "msal"] -openai = ["langchain", "openai", "tiktoken"] +odt = ["pypandoc", "python-docx (>=1.1.2)"] org = ["pypandoc"] -outlook = ["Office365-REST-Python-Client (<2.4.3)", "msal"] -paddleocr = ["unstructured.paddleocr (==2.6.1.3)"] -pdf = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)"] -ppt = ["python-pptx (<=0.6.21)"] -pptx = ["python-pptx (<=0.6.21)"] -reddit = ["praw"] +paddleocr = ["paddlepaddle (>=3.0.0b1)", "unstructured.paddleocr (==2.10.0)"] +pdf = ["effdet", "google-cloud-vision", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypdf", "unstructured-inference (>=0.8.10)", "unstructured.pytesseract (>=0.3.12)"] +ppt = ["python-pptx (>=1.0.1)"] +pptx = ["python-pptx (>=1.0.1)"] rst = ["pypandoc"] rtf = ["pypandoc"] -s3 = ["fsspec (==2023.9.1)", "s3fs"] -salesforce = ["simple-salesforce"] -sharepoint = ["Office365-REST-Python-Client (<2.4.3)", "msal"] -slack = ["slack-sdk"] tsv = ["pandas"] -wikipedia = ["wikipedia"] xlsx = ["networkx", "openpyxl", "pandas", "xlrd"] +[[package]] +name = "unstructured-client" +version = "0.32.3" +description = "Python Client SDK for Unstructured API" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"file-based\"" +files = [ + {file = "unstructured_client-0.32.3-py3-none-any.whl", hash = "sha256:50b8198a3c3f984bdb53d848be7665d352093a99841858976f596cc2105903ec"}, + {file = "unstructured_client-0.32.3.tar.gz", hash = "sha256:1426d03325f7b93daad524ad2b954f1e7cceb0c15e67a4f4e88b49220dd2472c"}, +] + +[package.dependencies] +aiofiles = ">=24.1.0" +cryptography = ">=3.1" +eval-type-backport = ">=0.2.0" +httpx = ">=0.27.0" +nest-asyncio = ">=1.6.0" +pydantic = ">=2.10.3" +pypdf = ">=4.0" +python-dateutil = ">=2.8.2" +requests-toolbelt = ">=1.0.0" +typing-inspection = ">=0.4.0" + [[package]] name = "unstructured-pytesseract" version = "0.3.13" @@ -5263,6 +5454,19 @@ files = [ [package.dependencies] bracex = ">=2.1.1" +[[package]] +name = "webencodings" +version = "0.5.1" +description = "Character encoding aliases for legacy web content" +optional = true +python-versions = "*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"file-based\"" +files = [ + {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, + {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, +] + [[package]] name = "werkzeug" version = "3.1.3" @@ -5367,6 +5571,96 @@ files = [ [package.dependencies] tzdata = {version = ">=2020.1", markers = "sys_platform == \"win32\""} +[[package]] +name = "wrapt" +version = "1.17.2" +description = "Module for decorators, wrappers and monkey patching." +optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"file-based\"" +files = [ + {file = "wrapt-1.17.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3d57c572081fed831ad2d26fd430d565b76aa277ed1d30ff4d40670b1c0dd984"}, + {file = "wrapt-1.17.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b5e251054542ae57ac7f3fba5d10bfff615b6c2fb09abeb37d2f1463f841ae22"}, + {file = "wrapt-1.17.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:80dd7db6a7cb57ffbc279c4394246414ec99537ae81ffd702443335a61dbf3a7"}, + {file = "wrapt-1.17.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a6e821770cf99cc586d33833b2ff32faebdbe886bd6322395606cf55153246c"}, + {file = "wrapt-1.17.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b60fb58b90c6d63779cb0c0c54eeb38941bae3ecf7a73c764c52c88c2dcb9d72"}, + {file = "wrapt-1.17.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b870b5df5b71d8c3359d21be8f0d6c485fa0ebdb6477dda51a1ea54a9b558061"}, + {file = "wrapt-1.17.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4011d137b9955791f9084749cba9a367c68d50ab8d11d64c50ba1688c9b457f2"}, + {file = "wrapt-1.17.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:1473400e5b2733e58b396a04eb7f35f541e1fb976d0c0724d0223dd607e0f74c"}, + {file = "wrapt-1.17.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3cedbfa9c940fdad3e6e941db7138e26ce8aad38ab5fe9dcfadfed9db7a54e62"}, + {file = "wrapt-1.17.2-cp310-cp310-win32.whl", hash = "sha256:582530701bff1dec6779efa00c516496968edd851fba224fbd86e46cc6b73563"}, + {file = "wrapt-1.17.2-cp310-cp310-win_amd64.whl", hash = "sha256:58705da316756681ad3c9c73fd15499aa4d8c69f9fd38dc8a35e06c12468582f"}, + {file = "wrapt-1.17.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ff04ef6eec3eee8a5efef2401495967a916feaa353643defcc03fc74fe213b58"}, + {file = "wrapt-1.17.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4db983e7bca53819efdbd64590ee96c9213894272c776966ca6306b73e4affda"}, + {file = "wrapt-1.17.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9abc77a4ce4c6f2a3168ff34b1da9b0f311a8f1cfd694ec96b0603dff1c79438"}, + {file = "wrapt-1.17.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b929ac182f5ace000d459c59c2c9c33047e20e935f8e39371fa6e3b85d56f4a"}, + {file = "wrapt-1.17.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f09b286faeff3c750a879d336fb6d8713206fc97af3adc14def0cdd349df6000"}, + {file = "wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a7ed2d9d039bd41e889f6fb9364554052ca21ce823580f6a07c4ec245c1f5d6"}, + {file = "wrapt-1.17.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:129a150f5c445165ff941fc02ee27df65940fcb8a22a61828b1853c98763a64b"}, + {file = "wrapt-1.17.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1fb5699e4464afe5c7e65fa51d4f99e0b2eadcc176e4aa33600a3df7801d6662"}, + {file = "wrapt-1.17.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9a2bce789a5ea90e51a02dfcc39e31b7f1e662bc3317979aa7e5538e3a034f72"}, + {file = "wrapt-1.17.2-cp311-cp311-win32.whl", hash = "sha256:4afd5814270fdf6380616b321fd31435a462019d834f83c8611a0ce7484c7317"}, + {file = "wrapt-1.17.2-cp311-cp311-win_amd64.whl", hash = "sha256:acc130bc0375999da18e3d19e5a86403667ac0c4042a094fefb7eec8ebac7cf3"}, + {file = "wrapt-1.17.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d5e2439eecc762cd85e7bd37161d4714aa03a33c5ba884e26c81559817ca0925"}, + {file = "wrapt-1.17.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fc7cb4c1c744f8c05cd5f9438a3caa6ab94ce8344e952d7c45a8ed59dd88392"}, + {file = "wrapt-1.17.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8fdbdb757d5390f7c675e558fd3186d590973244fab0c5fe63d373ade3e99d40"}, + {file = "wrapt-1.17.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb1d0dbf99411f3d871deb6faa9aabb9d4e744d67dcaaa05399af89d847a91d"}, + {file = "wrapt-1.17.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d18a4865f46b8579d44e4fe1e2bcbc6472ad83d98e22a26c963d46e4c125ef0b"}, + {file = "wrapt-1.17.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc570b5f14a79734437cb7b0500376b6b791153314986074486e0b0fa8d71d98"}, + {file = "wrapt-1.17.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6d9187b01bebc3875bac9b087948a2bccefe464a7d8f627cf6e48b1bbae30f82"}, + {file = "wrapt-1.17.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:9e8659775f1adf02eb1e6f109751268e493c73716ca5761f8acb695e52a756ae"}, + {file = "wrapt-1.17.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e8b2816ebef96d83657b56306152a93909a83f23994f4b30ad4573b00bd11bb9"}, + {file = "wrapt-1.17.2-cp312-cp312-win32.whl", hash = "sha256:468090021f391fe0056ad3e807e3d9034e0fd01adcd3bdfba977b6fdf4213ea9"}, + {file = "wrapt-1.17.2-cp312-cp312-win_amd64.whl", hash = "sha256:ec89ed91f2fa8e3f52ae53cd3cf640d6feff92ba90d62236a81e4e563ac0e991"}, + {file = "wrapt-1.17.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6ed6ffac43aecfe6d86ec5b74b06a5be33d5bb9243d055141e8cabb12aa08125"}, + {file = "wrapt-1.17.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:35621ae4c00e056adb0009f8e86e28eb4a41a4bfa8f9bfa9fca7d343fe94f998"}, + {file = "wrapt-1.17.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a604bf7a053f8362d27eb9fefd2097f82600b856d5abe996d623babd067b1ab5"}, + {file = "wrapt-1.17.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cbabee4f083b6b4cd282f5b817a867cf0b1028c54d445b7ec7cfe6505057cf8"}, + {file = "wrapt-1.17.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:49703ce2ddc220df165bd2962f8e03b84c89fee2d65e1c24a7defff6f988f4d6"}, + {file = "wrapt-1.17.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8112e52c5822fc4253f3901b676c55ddf288614dc7011634e2719718eaa187dc"}, + {file = "wrapt-1.17.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fee687dce376205d9a494e9c121e27183b2a3df18037f89d69bd7b35bcf59e2"}, + {file = "wrapt-1.17.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:18983c537e04d11cf027fbb60a1e8dfd5190e2b60cc27bc0808e653e7b218d1b"}, + {file = "wrapt-1.17.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:703919b1633412ab54bcf920ab388735832fdcb9f9a00ae49387f0fe67dad504"}, + {file = "wrapt-1.17.2-cp313-cp313-win32.whl", hash = "sha256:abbb9e76177c35d4e8568e58650aa6926040d6a9f6f03435b7a522bf1c487f9a"}, + {file = "wrapt-1.17.2-cp313-cp313-win_amd64.whl", hash = "sha256:69606d7bb691b50a4240ce6b22ebb319c1cfb164e5f6569835058196e0f3a845"}, + {file = "wrapt-1.17.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:4a721d3c943dae44f8e243b380cb645a709ba5bd35d3ad27bc2ed947e9c68192"}, + {file = "wrapt-1.17.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:766d8bbefcb9e00c3ac3b000d9acc51f1b399513f44d77dfe0eb026ad7c9a19b"}, + {file = "wrapt-1.17.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e496a8ce2c256da1eb98bd15803a79bee00fc351f5dfb9ea82594a3f058309e0"}, + {file = "wrapt-1.17.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d615e4fe22f4ad3528448c193b218e077656ca9ccb22ce2cb20db730f8d306"}, + {file = "wrapt-1.17.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a5aaeff38654462bc4b09023918b7f21790efb807f54c000a39d41d69cf552cb"}, + {file = "wrapt-1.17.2-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a7d15bbd2bc99e92e39f49a04653062ee6085c0e18b3b7512a4f2fe91f2d681"}, + {file = "wrapt-1.17.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e3890b508a23299083e065f435a492b5435eba6e304a7114d2f919d400888cc6"}, + {file = "wrapt-1.17.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:8c8b293cd65ad716d13d8dd3624e42e5a19cc2a2f1acc74b30c2c13f15cb61a6"}, + {file = "wrapt-1.17.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c82b8785d98cdd9fed4cac84d765d234ed3251bd6afe34cb7ac523cb93e8b4f"}, + {file = "wrapt-1.17.2-cp313-cp313t-win32.whl", hash = "sha256:13e6afb7fe71fe7485a4550a8844cc9ffbe263c0f1a1eea569bc7091d4898555"}, + {file = "wrapt-1.17.2-cp313-cp313t-win_amd64.whl", hash = "sha256:eaf675418ed6b3b31c7a989fd007fa7c3be66ce14e5c3b27336383604c9da85c"}, + {file = "wrapt-1.17.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5c803c401ea1c1c18de70a06a6f79fcc9c5acfc79133e9869e730ad7f8ad8ef9"}, + {file = "wrapt-1.17.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f917c1180fdb8623c2b75a99192f4025e412597c50b2ac870f156de8fb101119"}, + {file = "wrapt-1.17.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ecc840861360ba9d176d413a5489b9a0aff6d6303d7e733e2c4623cfa26904a6"}, + {file = "wrapt-1.17.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb87745b2e6dc56361bfde481d5a378dc314b252a98d7dd19a651a3fa58f24a9"}, + {file = "wrapt-1.17.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:58455b79ec2661c3600e65c0a716955adc2410f7383755d537584b0de41b1d8a"}, + {file = "wrapt-1.17.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4e42a40a5e164cbfdb7b386c966a588b1047558a990981ace551ed7e12ca9c2"}, + {file = "wrapt-1.17.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:91bd7d1773e64019f9288b7a5101f3ae50d3d8e6b1de7edee9c2ccc1d32f0c0a"}, + {file = "wrapt-1.17.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:bb90fb8bda722a1b9d48ac1e6c38f923ea757b3baf8ebd0c82e09c5c1a0e7a04"}, + {file = "wrapt-1.17.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:08e7ce672e35efa54c5024936e559469436f8b8096253404faeb54d2a878416f"}, + {file = "wrapt-1.17.2-cp38-cp38-win32.whl", hash = "sha256:410a92fefd2e0e10d26210e1dfb4a876ddaf8439ef60d6434f21ef8d87efc5b7"}, + {file = "wrapt-1.17.2-cp38-cp38-win_amd64.whl", hash = "sha256:95c658736ec15602da0ed73f312d410117723914a5c91a14ee4cdd72f1d790b3"}, + {file = "wrapt-1.17.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:99039fa9e6306880572915728d7f6c24a86ec57b0a83f6b2491e1d8ab0235b9a"}, + {file = "wrapt-1.17.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2696993ee1eebd20b8e4ee4356483c4cb696066ddc24bd70bcbb80fa56ff9061"}, + {file = "wrapt-1.17.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:612dff5db80beef9e649c6d803a8d50c409082f1fedc9dbcdfde2983b2025b82"}, + {file = "wrapt-1.17.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:62c2caa1585c82b3f7a7ab56afef7b3602021d6da34fbc1cf234ff139fed3cd9"}, + {file = "wrapt-1.17.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c958bcfd59bacc2d0249dcfe575e71da54f9dcf4a8bdf89c4cb9a68a1170d73f"}, + {file = "wrapt-1.17.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc78a84e2dfbc27afe4b2bd7c80c8db9bca75cc5b85df52bfe634596a1da846b"}, + {file = "wrapt-1.17.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ba0f0eb61ef00ea10e00eb53a9129501f52385c44853dbd6c4ad3f403603083f"}, + {file = "wrapt-1.17.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1e1fe0e6ab7775fd842bc39e86f6dcfc4507ab0ffe206093e76d61cde37225c8"}, + {file = "wrapt-1.17.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c86563182421896d73858e08e1db93afdd2b947a70064b813d515d66549e15f9"}, + {file = "wrapt-1.17.2-cp39-cp39-win32.whl", hash = "sha256:f393cda562f79828f38a819f4788641ac7c4085f30f1ce1a68672baa686482bb"}, + {file = "wrapt-1.17.2-cp39-cp39-win_amd64.whl", hash = "sha256:36ccae62f64235cf8ddb682073a60519426fdd4725524ae38874adf72b5f2aeb"}, + {file = "wrapt-1.17.2-py3-none-any.whl", hash = "sha256:b18f2d1533a71f069c7f82d524a52599053d4c7166e9dd374ae2136b7f40f7c8"}, + {file = "wrapt-1.17.2.tar.gz", hash = "sha256:41388e9d4d1522446fe79d3213196bd9e3b301a336965b9e27ca2788ebd122f3"}, +] + [[package]] name = "xlsxwriter" version = "3.2.0" @@ -5513,11 +5807,11 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", type = ["pytest-mypy"] [extras] -file-based = ["avro", "fastavro", "markdown", "pdf2image", "pdfminer.six", "pyarrow", "pytesseract", "python-calamine", "python-snappy", "unstructured", "unstructured.pytesseract"] +file-based = ["avro", "fastavro", "markdown", "pdf2image", "pdfminer.six", "pi-heif", "pyarrow", "pytesseract", "python-calamine", "python-snappy", "unstructured", "unstructured.pytesseract"] sql = ["sqlalchemy"] vector-db-based = ["cohere", "langchain", "openai", "tiktoken"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "9854ff162fb8407d116438ded5068bb03510e5692c62d81059c376c30c417948" +content-hash = "0f1e011ecf256c0a7d7da5f7ffcadbb4dfb9a70ff16fb365ed8702bbddebbbf1" diff --git a/pyproject.toml b/pyproject.toml index 90d018fed..e9fd7152e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,7 +73,7 @@ nltk = { version = "3.9.1", optional = true } # This will ensure that even when you run poetry install or pip install, the compatible version of numpy will always be chosen. # airbyte-ci will try to install latest version when --use-local-cdk is used, resulting in the conflict. numpy = "<2" -unstructured = { version = "0.10.27", extras = ["docx", "pptx"], optional = true } +unstructured = { version = "0.17.2", extras = ["docx", "pptx"], optional = true } "unstructured.pytesseract" = { version = ">=0.3.12", optional = true } pyjwt = "^2.8.0" cryptography = ">=44.0.0,<45.0.0" # Constrained as transitive dependency due to a bug in newer versions @@ -84,6 +84,7 @@ sqlalchemy = {version = "^2.0,!=2.0.36", optional = true } xmltodict = ">=0.13,<0.15" anyascii = "^0.3.2" whenever = "^0.6.16" +pi-heif = "^0.22.0" [tool.poetry.group.dev.dependencies] freezegun = "*" @@ -106,9 +107,10 @@ types-python-dateutil = "^2.9.0.20241003" types-pyyaml = "^6.0.12.20240917" types-cachetools = "^5.5.0.20240820" deptry = "^0.23.0" +pi-heif = "^0.22.0" [tool.poetry.extras] -file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "python-snappy"] +file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "python-snappy", "pi-heif"] vector-db-based = ["langchain", "openai", "cohere", "tiktoken"] sql = ["sqlalchemy"] @@ -154,7 +156,7 @@ lint-fix = { cmd = "poetry run ruff check --fix .", help = "Auto-fix any lint is lint-fix-unsafe = { cmd = "poetry run ruff check --fix --unsafe-fixes .", help = "Lint-fix modified files, including 'unsafe' fixes. It is recommended to first commit any pending changes and then always manually review any unsafe changes applied." } # ruff fix everything (ignoring non-Python fixes) -ruff-fix = { sequence = ["lint-fix", "_format-fix-ruff"] , help = "Lint-fix and format-fix all code." } +ruff-fix = { sequence = ["lint-fix", "_format-fix-ruff"], help = "Lint-fix and format-fix all code." } # Combined Check and Fix tasks diff --git a/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py b/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py index c0db46e7a..5ff089528 100644 --- a/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +++ b/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py @@ -28,8 +28,14 @@ "type": ["null", "string"], "description": "Error message if the file could not be parsed even though the file is supported", }, - "_ab_source_file_last_modified": {"type": "string"}, - "_ab_source_file_url": {"type": "string"}, + "_ab_source_file_last_modified": { + "type": ["null", "string"], + "description": "Last modified timestamp of the source file", + }, + "_ab_source_file_url": { + "type": ["null", "string"], + "description": "URL or path to the source file", + }, }, }