feat: Add DocumentTypeRouter (#321)

sjrl · anakin87 · web-flow · commit 05b92eeb02b8 · 2025-06-12T13:29:43.000+02:00
* Add DocumentTypeRouter

* small changes

* Add docs and tests

* add header

* Formatting

* Update docs/pydoc/config/routers_api.yml

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* Update haystack_experimental/components/routers/document_type_router.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* PR comments

---------

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
diff --git a/docs/pydoc/config/routers_api.yml b/docs/pydoc/config/routers_api.yml
@@ -0,0 +1,30 @@
+# pydoc configuration for the experimental "Routers" API reference page.
+loaders:
+  - type: haystack_pydoc_tools.loaders.CustomPythonLoader
+    search_path: [../../../]
+    # Modules whose docstrings are loaded for this page.
+    modules:
+      [
+        "haystack_experimental.components.routers.document_type_router",
+      ]
+    ignore_when_discovered: ["__init__"]
+processors:
+  - type: filter
+    expression:
+    documented_only: true
+    do_not_filter_modules: false
+    skip_empty_modules: true
+  - type: smart
+  - type: crossref
+# Rendering settings: page metadata (title, slug, category, order) and markdown output options.
+renderer:
+  type: haystack_pydoc_tools.renderers.ReadmeCoreRenderer
+  excerpt: Routers is a group of components that route queries or Documents to other components that can handle them best.
+  category_slug: experiments-api
+  title: Routers
+  slug: experimental-routers-api
+  order: 42
+  markdown:
+    descriptive_class_title: false
+    classdef_code_block: false
+    descriptive_module_title: true
+    add_method_class_prefix: true
+    add_member_class_prefix: false
+    # Name of the generated markdown file.
+    filename: experimental_routers_api.md
diff --git a/haystack_experimental/components/routers/__init__.py b/haystack_experimental/components/routers/__init__.py
@@ -0,0 +1,17 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+from typing import TYPE_CHECKING
+
+from lazy_imports import LazyImporter
+
+# Maps each submodule of this package to the public names it exports.
+# Passed to LazyImporter below.
+_import_structure = {
+    "document_type_router": ["DocumentTypeRouter"],
+}
+
+# Static type checkers follow the real import so they can resolve `DocumentTypeRouter`.
+if TYPE_CHECKING:
+    from .document_type_router import DocumentTypeRouter
+else:
+    # At runtime, replace this package's entry in `sys.modules` with a LazyImporter built from
+    # `_import_structure` (presumably it resolves the listed names on first access - confirm
+    # against the `lazy_imports` documentation).
+    sys.modules[__name__] = LazyImporter(name=__name__, module_file=__file__, import_structure=_import_structure)
diff --git a/haystack_experimental/components/routers/document_type_router.py b/haystack_experimental/components/routers/document_type_router.py
@@ -0,0 +1,166 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import mimetypes
+import re
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from haystack import component
+from haystack.components.routers.file_type_router import CUSTOM_MIMETYPES
+from haystack.dataclasses import Document
+
+
+@component
+class DocumentTypeRouter:
+    """
+    Categorizes documents by MIME types based on their metadata.
+
+    DocumentTypeRouter is used to dynamically route documents within a pipeline based on their MIME types.
+    It supports exact MIME type matches and regex patterns.
+
+    MIME types can be extracted directly from document metadata or inferred from file paths using standard or
+    user-supplied MIME type mappings.
+
+    ### Usage example
+
+    ```python
+    from haystack_experimental.components.routers import DocumentTypeRouter
+    from haystack.dataclasses import Document
+
+    docs = [
+        Document(content="Example text", meta={"file_path": "example.txt"}),
+        Document(content="Another document", meta={"mime_type": "application/pdf"}),
+        Document(content="Unknown type")
+    ]
+
+    router = DocumentTypeRouter(
+        mime_type_meta_field="mime_type",
+        file_path_meta_field="file_path",
+        mime_types=["text/plain", "application/pdf"]
+    )
+
+    result = router.run(documents=docs)
+    print(result)
+    ```
+
+    Expected output:
+    ```python
+    {
+        "text/plain": [Document(...)],
+        "application/pdf": [Document(...)],
+        "unclassified": [Document(...)]
+    }
+    ```
+    """
+
+    def __init__(
+        self,
+        *,
+        mime_type_meta_field: Optional[str] = None,
+        file_path_meta_field: Optional[str] = None,
+        mime_types: List[str],
+        additional_mimetypes: Optional[Dict[str, str]] = None,
+    ) -> None:
+        """
+        Initialize the DocumentTypeRouter component.
+
+        :param mime_type_meta_field:
+            Optional name of the metadata field that holds the MIME type.
+
+        :param file_path_meta_field:
+            Optional name of the metadata field that holds the file path. Used to infer the MIME type if
+            `mime_type_meta_field` is not provided or missing in a document.
+
+        :param mime_types:
+            A list of MIME types or regex patterns to classify the input documents.
+            (for example: `["text/plain", "audio/x-wav", "image/jpeg"]`).
+
+        :param additional_mimetypes:
+            Optional dictionary mapping MIME types to file extensions to enhance or override the standard
+            `mimetypes` module. Useful when working with uncommon or custom file types.
+            For example: `{"application/vnd.custom-type": ".custom"}`.
+
+        :raises ValueError: If `mime_types` is empty or if both `mime_type_meta_field` and `file_path_meta_field` are
+            not provided.
+        """
+        if not mime_types:
+            raise ValueError("The list of mime types cannot be empty.")
+
+        if mime_type_meta_field is None and file_path_meta_field is None:
+            raise ValueError(
+                "At least one of 'mime_type_meta_field' or 'file_path_meta_field' must be provided to determine MIME "
+                "types."
+            )
+        self.mime_type_meta_field = mime_type_meta_field
+        self.file_path_meta_field = file_path_meta_field
+
+        if additional_mimetypes:
+            for mime, ext in additional_mimetypes.items():
+                mimetypes.add_type(mime, ext)
+
+        self._mime_type_patterns = []
+        for mime_type in mime_types:
+            try:
+                pattern = re.compile(mime_type)
+            except re.error:
+                raise ValueError(f"Invalid regex pattern '{mime_type}'.")
+            self._mime_type_patterns.append(pattern)
+
+        component.set_output_types(
+            self,
+            unclassified=List[Document],
+            **dict.fromkeys(mime_types, List[Document]),
+        )
+        self.mime_types = mime_types
+        self.additional_mimetypes = additional_mimetypes
+
+    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
+        """
+        Categorize input documents into groups based on their MIME type.
+
+        MIME types can either be directly available in document metadata or derived from file paths using the
+        standard Python `mimetypes` module and custom mappings.
+
+        :param documents:
+            A list of documents to be categorized.
+
+        :returns:
+            A dictionary where the keys are MIME types (or `"unclassified"`) and the values are lists of documents.
+        """
+        mime_types = defaultdict(list)
+
+        for doc in documents:
+            mime_type = doc.meta.get(self.mime_type_meta_field) if self.mime_type_meta_field else None
+            file_path = doc.meta.get(self.file_path_meta_field) if self.file_path_meta_field else None
+
+            if mime_type is None and file_path:
+                # if mime_type is not provided, try to guess it from the file path
+                mime_type = self._get_mime_type(Path(file_path))
+
+            matched = False
+            if mime_type:
+                for pattern in self._mime_type_patterns:
+                    if pattern.fullmatch(mime_type):
+                        mime_types[pattern.pattern].append(doc)
+                        matched = True
+                        break
+            if not matched:
+                mime_types["unclassified"].append(doc)
+
+        return dict(mime_types)
+
+    def _get_mime_type(self, path: Path) -> Optional[str]:
+        """
+        Get the MIME type of the provided file path.
+
+        :param path: The file path to get the MIME type for.
+
+        :returns: The MIME type of the provided file path, or `None` if the MIME type cannot be determined.
+        """
+        extension = path.suffix.lower()
+        mime_type = mimetypes.guess_type(path.as_posix())[0]
+        # lookup custom mappings if the mime type is not found
+        return CUSTOM_MIMETYPES.get(extension, mime_type)
diff --git a/test/components/routers/__init__.py b/test/components/routers/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
diff --git a/test/components/routers/test_document_type_router.py b/test/components/routers/test_document_type_router.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>`
	`2`	`+#`
	`3`	`+# SPDX-License-Identifier: Apache-2.0`