feat: Add source attribution reporting on DocumentIndexingPipeline + Tests (#490)

eob · douglas-reid · web-flow · commit 372dca482ec8 · 2023-07-21T18:12:41.000Z
Right now the document indexing pipeline does not properly include
source material attribution.

This PR adds and tests that inclusion. The source attribution is both
automatic (file_id, block_id, page) and user-controlled (via the metadata
argument passed in at index request time).

---------

Co-authored-by: Douglas Reid <douglas-reid@users.noreply.github.com>
diff --git a/src/steamship/agents/functional/output_parser.py b/src/steamship/agents/functional/output_parser.py
@@ -28,6 +28,8 @@ def _extract_action_from_function_call(self, text: str, context: AgentContext) -
         wrapper = json.loads(text)
         fc = wrapper.get("function_call")
         name = fc.get("name", "")
+        if name.startswith("functions."):
+            name = name[len("functions.") :]  # occasionally, OpenAI prepends "functions."
         tool = self.tools_lookup_dict.get(name, None)
         if tool is None:
             raise RuntimeError(
diff --git a/src/steamship/agents/tools/question_answering/vector_search_qa_tool.py b/src/steamship/agents/tools/question_answering/vector_search_qa_tool.py
@@ -2,12 +2,13 @@
 import logging
 from typing import Any, List, Optional, Union
 
-from steamship import Block, Tag, Task
+from steamship import Block, DocTag, Tag, Task
 from steamship.agents.llms import OpenAI
 from steamship.agents.logging import AgentLogging
 from steamship.agents.schema import AgentContext
 from steamship.agents.tools.question_answering.vector_search_tool import VectorSearchTool
 from steamship.agents.utils import get_llm, with_llm
+from steamship.data import TagKind
 from steamship.utils.repl import ToolREPL
 
 DEFAULT_QUESTION_ANSWERING_PROMPT = (
@@ -45,11 +46,16 @@ def answer_question(self, question: str, context: AgentContext) -> List[Block]:
         task.wait()
 
         source_texts = []
+        source_metadata = []
 
         for item in task.output.items:
             if item.tag and item.tag.text:
                 item_data = {"text": item.tag.text}
                 source_texts.append(self.source_document_prompt.format(**item_data))
+                _metadata = {}
+                if item.tag.value:
+                    _metadata.update(item.tag.value)
+                source_metadata.append(_metadata)
 
         final_prompt = self.question_answering_prompt.format(
             **{"source_text": "\n".join(source_texts), "question": question}
@@ -65,8 +71,16 @@ def answer_question(self, question: str, context: AgentContext) -> List[Block]:
                 "prompt": final_prompt,
             },
         )
-
-        return get_llm(context, default=OpenAI(client=context.client)).complete(prompt=final_prompt)
+        output_blocks = get_llm(context, default=OpenAI(client=context.client)).complete(
+            prompt=final_prompt
+        )
+        for output_block in output_blocks:
+            if output_block.tags is None:
+                output_block.tags = []
+            output_block.tags.append(
+                Tag(kind=TagKind.DOCUMENT, name=DocTag.SOURCE, value={"sources": source_metadata})
+            )
+        return output_blocks
 
     def run(self, tool_input: List[Block], context: AgentContext) -> Union[List[Block], Task[Any]]:
         """Answers questions with the assistance of an Embedding Index plugin.
diff --git a/src/steamship/data/tags/tag_constants.py b/src/steamship/data/tags/tag_constants.py
@@ -73,6 +73,7 @@ class DocTag(str, Enum):
     CHAPTER = "chapter"
     TEXT = "text"
     CHAT = "chat"
+    METADATA = "metadata"
 
     @staticmethod
     def from_html_tag(tagname: Optional[str]) -> Optional["DocTag"]:  # noqa: C901
diff --git a/src/steamship/invocable/mixins/blockifier_mixin.py b/src/steamship/invocable/mixins/blockifier_mixin.py
@@ -38,6 +38,7 @@ def blockify(
 
         _mime_type = mime_type or file.mime_type
         if not _mime_type:
+            update_file_status(self.client, file, "Failed Blockifying")
             raise SteamshipError(
                 message=f"No MIME Type found for file {file.id}. Unable to blockify."
             )
@@ -54,6 +55,7 @@ def blockify(
             plugin_instance = self.client.use_plugin("markdown-blockifier-default")
 
         if not plugin_instance:
+            update_file_status(self.client, file, "Failed Blockifying")
             raise SteamshipError(
                 message=f"Unable to blockify file {file.id}. MIME Type {_mime_type} unsupported"
             )
diff --git a/src/steamship/invocable/mixins/file_importer_mixin.py b/src/steamship/invocable/mixins/file_importer_mixin.py
@@ -95,7 +95,7 @@ def import_url_to_file_and_task(self, url: str) -> Tuple[File, Optional[Task]]:
 
     @post("/import_url")
     def import_url(self, url: str) -> File:
-        """Import the URL to a Steamship File. Actual import will be scheduled Async."""
+        """Import the URL to a Steamship File. Actual import will be scheduled async."""
         file, task = self.import_url_to_file_and_task(url)
         return file
 
diff --git a/src/steamship/invocable/mixins/indexer_mixin.py b/src/steamship/invocable/mixins/indexer_mixin.py
@@ -1,11 +1,12 @@
 from typing import Optional, cast
 
 from steamship import Block, DocTag, File, Steamship, Tag
-from steamship.data import TagValueKey
+from steamship.data import TagKind, TagValueKey
 from steamship.data.plugin.index_plugin_instance import EmbeddingIndexPluginInstance, SearchResults
 from steamship.invocable import post
 from steamship.invocable.package_mixin import PackageMixin
 from steamship.utils.file_tags import update_file_status
+from steamship.utils.text_chunker import chunk_text
 
 DEFAULT_EMBEDDING_INDEX_CONFIG = {
     "embedder": {
@@ -71,13 +72,17 @@ def _get_index(self, index_handle: Optional[str] = None) -> EmbeddingIndexPlugin
     def index_text(
         self, text: str, metadata: Optional[dict] = None, index_handle: Optional[str] = None
     ) -> bool:
+        """Load text into an embedding index.
+
+        Optional arguments:
+        - index_handle (uses your default index if blank)
+        - metadata (returned on embedding results for source attribution)
+        """
         tags = []
-        for i in range(0, len(text), self.context_window_size):
-            # Calculate the extent of the window plus the overlap at the edges
-            min_range = max(0, i - self.context_window_overlap)
-            max_range = i + self.context_window_size + self.context_window_overlap
-            chunk = text[min_range:max_range]
-            tags.append(Tag(text=chunk, metadata=metadata))
+        for chunk in chunk_text(
+            text, chunk_size=self.context_window_size, chunk_overlap=self.context_window_overlap
+        ):
+            tags.append(Tag(text=chunk, value=metadata))
         self._get_index(index_handle).insert(tags)
         return True
 
@@ -88,9 +93,9 @@ def _index_block(
         _metadata = {}
         if metadata:
             _metadata.update(metadata)
+
         _metadata.update(
             {
-                "source": "",
                 "file_id": block.file_id,
                 "block_id": block.id,
                 "page": page_id,
@@ -103,13 +108,18 @@ def _index_block(
     def index_block(
         self, block_id: str, metadata: Optional[dict] = None, index_handle: Optional[str] = None
     ):
+        """Load a Steamship Block into an embedding index.
+
+        Optional arguments:
+        - index_handle (uses your default index if blank)
+        - metadata (returned on embedding results for source attribution)
+        """
         block = Block.get(self.client, _id=block_id)
         page_id = self._get_page(block)
         _metadata = {}
         _metadata.update(metadata)
         _metadata.update(
             {
-                "source": "",
                 "file_id": block.file_id,
                 "block_id": block.id,
                 "page": page_id,
@@ -122,11 +132,29 @@ def index_block(
     def index_file(
         self, file_id: str, metadata: Optional[dict] = None, index_handle: Optional[str] = None
     ) -> bool:
+        """Load a Steamship File into an embedding index.
+
+        Optional arguments:
+        - index_handle (uses your default index if blank)
+        - metadata (returned on embedding results for source attribution)
+        """
         file = File.get(self.client, _id=file_id)
         update_file_status(self.client, file, "Indexing")
 
+        _metadata = {}
+        if file.mime_type:
+            _metadata["mime_type"] = file.mime_type
+
+        for tag in file.tags or []:
+            if tag.kind == TagKind.DOCUMENT and tag.name == DocTag.TITLE:
+                if title := tag.value.get(TagValueKey.STRING_VALUE):
+                    _metadata["title"] = title
+
+        if metadata:
+            _metadata.update(metadata)
+
         for block in file.blocks or []:
-            self._index_block(block, metadata=metadata, index_handle=index_handle)
+            self._index_block(block, metadata=_metadata, index_handle=index_handle)
 
         update_file_status(self.client, file, "Indexed")
         return True
@@ -135,6 +163,11 @@ def index_file(
     def search_index(
         self, query: str, index_handle: Optional[str] = None, k: int = 5
     ) -> SearchResults:
+        """Search an embedding index.
+
+        Optional arguments:
+        - index_handle (uses your default index if blank)
+        """
         index = self._get_index(index_handle)
         task = index.search(query, k)
         return task.wait()
diff --git a/src/steamship/invocable/mixins/indexer_pipeline_mixin.py b/src/steamship/invocable/mixins/indexer_pipeline_mixin.py
@@ -53,6 +53,18 @@ def index_url(
         index_handle: Optional[str] = None,
         mime_type: Optional[str] = None,
     ) -> Task:
+        """Load a URL into an embedding index.
+
+        URL Types supported:
+        - PDF (Text)
+        - TXT and Markdown
+        - YouTube (Though failure rate is high)
+
+        Optional arguments:
+        - mime_type (if it can be guessed by the Content-Type header or the URL schema)
+        - index_handle (uses your default index if blank)
+        - metadata (returned on embedding results for source attribution)
+        """
         # Step 1: Import the URL
         file, task = self.importer_mixin.import_url_to_file_and_task(url)
 
@@ -66,13 +78,14 @@ def index_url(
         )
 
         # Step 3: Index the File
+        _metadata = {"url": url}
+        if metadata is not None:
+            _metadata.update(metadata)
+
         index_task = self.invocable.invoke_later(
             method="index_file",
             wait_on_tasks=[blockify_task],
-            arguments={
-                "file_id": file.id,
-                "index_handle": index_handle,
-            },
+            arguments={"file_id": file.id, "index_handle": index_handle, "metadata": _metadata},
         )
 
         # Step 4: Set the File Status to 'indexed'
diff --git a/src/steamship/utils/repl.py b/src/steamship/utils/repl.py
@@ -19,7 +19,7 @@
     from termcolor import colored  # noqa: F401
 except ImportError:
 
-    def colored(text: str, **kwargs):
+    def colored(text: str, color: str, **kwargs):
         print(text)
 
 
diff --git a/src/steamship/utils/text_chunker.py b/src/steamship/utils/text_chunker.py
@@ -0,0 +1,35 @@
+import logging
+
+
+def chunk_text(text: str, chunk_size: int = 200, chunk_overlap: int = 50):
+    """Chunk text for embedding and insertion into an embedding index.
+
+    Yields windows of at most `chunk_size` characters. Each window starts
+    `chunk_size - chunk_overlap` characters after the previous one, so
+    neighbouring windows share `chunk_overlap` characters.
+
+    Out-of-range arguments are corrected with a logged warning instead of raising:
+    - chunk_size < 1 becomes 200
+    - chunk_overlap < 0 becomes 0
+    - chunk_overlap >= chunk_size becomes chunk_size - 1
+    """
+    if chunk_size < 1:
+        logging.warning(f"chunk_size was {chunk_size}. Setting to 200")
+        chunk_size = 200
+
+    if chunk_overlap < 0:
+        logging.warning(f"chunk_overlap was {chunk_overlap}. Setting to 0")
+        chunk_overlap = 0
+
+    if chunk_overlap >= chunk_size:
+        # An overlap as large as the chunk leaves a step of 0, which range() rejects.
+        logging.warning(
+            f"chunk_overlap was {chunk_overlap}. Setting to chunk_size - 1 ({chunk_size - 1})"
+        )
+        chunk_overlap = chunk_size - 1
+
+    # Always >= 1 after the corrections above.
+    step_size = chunk_size - chunk_overlap
+
+    for i in range(0, len(text), step_size):
+        yield text[i : i + chunk_size]
diff --git a/tests/steamship_tests/agents/tools/test_fact_learner_tools.py b/tests/steamship_tests/agents/tools/test_fact_learner_tools.py
@@ -14,10 +14,10 @@ def test_fact_learner_agent_service(client: Steamship):
         agent.invoke("prompt", prompt="please remember that my name is Inigo Montoya")
         agent.invoke("prompt", prompt="please remember that I am skilled swordsman")
 
-        answer_blocks = agent.invoke("prompt", prompt="what is my name?")
+        answer_blocks = agent.invoke("prompt", prompt="Is my name Inigo?")
         assert len(answer_blocks) == 1
-        assert "Inigo Montoya" in Block(**answer_blocks[0]).text
+        assert "yes" in Block(**answer_blocks[0]).text.lower()
 
         answer_blocks = agent.invoke("prompt", prompt="what do I know how to do well?")
         assert len(answer_blocks) == 1
-        assert "sword" in Block(**answer_blocks[0]).text
+        assert "sword" in Block(**answer_blocks[0]).text.lower()
diff --git a/tests/steamship_tests/agents/tools/test_vector_search_qa_tool.py b/tests/steamship_tests/agents/tools/test_vector_search_qa_tool.py
@@ -0,0 +1,41 @@
+import pytest
+
+from steamship import DocTag, Steamship
+from steamship.agents.llms import OpenAI
+from steamship.agents.schema import AgentContext
+from steamship.agents.tools.question_answering import VectorSearchQATool
+from steamship.agents.utils import with_llm
+from steamship.data import TagKind
+from steamship.invocable.mixins.indexer_mixin import IndexerMixin
+
+
+@pytest.mark.usefixtures("client")
+def test_vector_search_qa_tool(client: Steamship):
+    """Tests that VectorSearchQATool answers are tagged with their source metadata"""
+    indexer = IndexerMixin(client)
+    assert indexer.index_text("Mario was a very fun game.", metadata={"nintendo": True})
+
+    tool = VectorSearchQATool()
+    context = with_llm(
+        OpenAI(client=client),
+        AgentContext.get_or_create(client=client, context_keys={"id": "test"}),
+    )
+    res = tool.answer_question("What was Mario?", context)
+    assert len(res) == 1
+    assert "game" in res[0].text
+
+    assert res[0].tags is not None
+    sources_tag = None
+    for tag in res[0].tags:
+        if tag.kind == TagKind.DOCUMENT and tag.name == DocTag.SOURCE:
+            sources_tag = tag
+
+    assert sources_tag
+    assert sources_tag.value
+
+    assert sources_tag.value.get("sources") is not None
+    sources = sources_tag.value.get("sources")
+    assert len(sources) == 1
+
+    # This is the metadata passed to index_text at the top
+    assert sources[0].get("nintendo") is True
diff --git a/tests/steamship_tests/app/integration/test_e2e_mixins_indexer_pipeline.py b/tests/steamship_tests/app/integration/test_e2e_mixins_indexer_pipeline.py
@@ -2,7 +2,7 @@
 from steamship_tests import PACKAGES_PATH
 from steamship_tests.utils.deployables import deploy_package
 
-from steamship import Steamship, Task, TaskState
+from steamship import MimeTypes, Steamship, Task, TaskState
 from steamship.data.plugin.index_plugin_instance import SearchResults
 
 
@@ -28,6 +28,37 @@ def test_indexer_pipeline_mixin(client: Steamship):
         assert len(result.items) == 1
         winner = result.items[0]
         assert winner.tag.text
+        assert winner.tag.value
+        assert winner.tag.value.get("block_id")  # It has a block id stamped
+        assert winner.tag.value.get("file_id")  # It has a file id stamped
+        assert winner.tag.value.get("page") == 0
+
+        # LOAD THE TAO TE CHING PDF
+        # This will test metadata
+        pdf_url2 = "https://www.with.org/tao_te_ching_en.pdf"
+
+        index_task2 = instance.invoke("index_url", url=pdf_url2, metadata={"is_tao": True})
+        index_task2 = Task.parse_obj(index_task2)
+        index_task2.client = client
+
+        assert index_task2.task_id
+        assert index_task2.state == TaskState.waiting
+
+        index_task2.wait()
+
+        result = instance.invoke("search_index", query="Tao", k=1)
+        result = SearchResults.parse_obj(result)
+        assert len(result.items) == 1
+        winner = result.items[0]
+        assert winner.tag.text
+
+        assert winner.tag.value
+        assert winner.tag.value.get("block_id")  # It has a block id stamped
+        assert winner.tag.value.get("file_id")  # It has a file id stamped
+        assert winner.tag.value.get("page") is not None
+        assert winner.tag.value.get("is_tao") is True
+        assert winner.tag.value.get("url") == pdf_url2
+        assert winner.tag.value.get("mime_type") == MimeTypes.PDF
 
         # NOTE NOTE NOTE
         # This portion of the test will be added.. but commented out, to be run only on localhost on an as-needed
diff --git a/tests/steamship_tests/app/unit/test_indexer_mixin.py b/tests/steamship_tests/app/unit/test_indexer_mixin.py
diff --git a/tests/steamship_tests/utils/test_text_chunker.py b/tests/steamship_tests/utils/test_text_chunker.py

Original file line number	Diff line number	Diff line change
`@@ -38,6 +38,7 @@ def blockify(`
`38`	`38`
`39`	`39`	`_mime_type = mime_type or file.mime_type`
`40`	`40`	`if not _mime_type:`
	`41`	`+ update_file_status(self.client, file, "Failed Blockifying")`
`41`	`42`	`raise SteamshipError(`
`42`	`43`	`message=f"No MIME Type found for file {file.id}. Unable to blockify."`
`43`	`44`	`)`
`@@ -54,6 +55,7 @@ def blockify(`
`54`	`55`	`plugin_instance = self.client.use_plugin("markdown-blockifier-default")`
`55`	`56`
`56`	`57`	`if not plugin_instance:`
	`58`	`+ update_file_status(self.client, file, "Failed Blockifying")`
`57`	`59`	`raise SteamshipError(`
`58`	`60`	`message=f"Unable to blockify file {file.id}. MIME Type {_mime_type} unsupported"`
`59`	`61`	`)`