diff --git a/integrations/hanlp/pyproject.toml b/integrations/hanlp/pyproject.toml index c116ee4880..3c0ea2f03f 100644 --- a/integrations/hanlp/pyproject.toml +++ b/integrations/hanlp/pyproject.toml @@ -7,7 +7,7 @@ name = "hanlp-haystack" dynamic = ["version"] description = 'An integration of Han Language Processing - HanLP as a ChineseDocumentSplitter component.' readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.10" license = "Apache-2.0" keywords = [] authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }] @@ -15,7 +15,6 @@ classifiers = [ "License :: OSI Approved :: Apache Software License", "Development Status :: 4 - Beta", "Programming Language :: Python", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -24,7 +23,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "haystack-ai>=2.13.1", + "haystack-ai>=2.22.0", "hanlp>=2.1.1" ] @@ -88,7 +87,6 @@ line-length = 120 skip-string-normalization = true [tool.ruff] -target-version = "py39" line-length = 120 [tool.ruff.lint] @@ -138,10 +136,6 @@ ignore = [ "RUF001", "RUF002", ] -unfixable = [ - # Don't touch unused imports - "F401", -] [tool.ruff.lint.isort] known-first-party = ["haystack_integrations"] diff --git a/integrations/hanlp/src/haystack_integrations/components/preprocessors/hanlp/chinese_document_splitter.py b/integrations/hanlp/src/haystack_integrations/components/preprocessors/hanlp/chinese_document_splitter.py index e727065e34..7afc3c412e 100644 --- a/integrations/hanlp/src/haystack_integrations/components/preprocessors/hanlp/chinese_document_splitter.py +++ b/integrations/hanlp/src/haystack_integrations/components/preprocessors/hanlp/chinese_document_splitter.py @@ -2,8 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Callable from copy import deepcopy -from typing import Any, Callable, Literal, Optional +from typing import Any, Literal from haystack import Document, component, logging from haystack.core.serialization import default_from_dict, default_to_dict @@ -59,7 +60,7 @@ def __init__( split_overlap: int = 200, split_threshold: int = 0, respect_sentence_boundary: bool = False, - splitting_function: Optional[Callable] = None, + splitting_function: Callable | None = None, granularity: Literal["coarse", "fine"] = "coarse", ): """ @@ -406,7 +407,7 @@ def _create_docs_from_splits( """ documents: list[Document] = [] - for i, (txt, split_idx) in enumerate(zip(text_splits, splits_start_idxs)): + for i, (txt, split_idx) in enumerate(zip(text_splits, splits_start_idxs, strict=True)): copied_meta = deepcopy(meta) copied_meta["page_number"] = splits_pages[i] copied_meta["split_id"] = i diff --git a/integrations/hanlp/tests/test_chinese_document_splitter.py b/integrations/hanlp/tests/test_chinese_document_splitter.py index 6f18448174..f3b84f944b 100644 --- a/integrations/hanlp/tests/test_chinese_document_splitter.py +++ b/integrations/hanlp/tests/test_chinese_document_splitter.py @@ -123,7 +123,7 @@ def test_metadata_copied_to_split_documents(self): splitter.warm_up() result = splitter.run(documents=documents) assert len(result["documents"]) == 2 - for doc, split_doc in zip(documents, result["documents"]): + for doc, split_doc in zip(documents, result["documents"], strict=True): assert doc.meta.items() <= split_doc.meta.items() @pytest.mark.integration @@ -136,7 +136,7 @@ def test_source_id_stored_in_metadata(self): splitter.warm_up() result = splitter.run(documents=documents) assert len(result["documents"]) == 2 - for doc, split_doc in zip(documents, result["documents"]): + for doc, split_doc in zip(documents, result["documents"], strict=True): assert doc.id == split_doc.meta["source_id"] @pytest.mark.integration