|
| 1 | +# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai> |
| 2 | +# |
| 3 | +# SPDX-License-Identifier: Apache-2.0 |
| 4 | + |
| 5 | +from typing import Any, Dict, List, Optional, Union |
| 6 | + |
| 7 | +from haystack import Document, component, logging |
| 8 | +from haystack.components.builders.answer_builder import AnswerBuilder as HaystackAnswerBuilder |
| 9 | +from haystack.dataclasses.chat_message import ChatMessage |
| 10 | + |
| 11 | +from haystack_experimental.dataclasses import GeneratedAnswer |
| 12 | + |
| 13 | +logger = logging.getLogger(__name__) |
| 14 | + |
| 15 | + |
@component
class AnswerBuilder(HaystackAnswerBuilder):
    """
    Converts a query and Generator replies into a `GeneratedAnswer` object.

    AnswerBuilder parses Generator replies using custom regular expressions.
    Check out the usage example below to see how it works.
    Optionally, it can also take documents and metadata from the Generator to add to the `GeneratedAnswer` object.
    AnswerBuilder works with both non-chat and chat Generators.

    ### Usage example

    ```python
    from haystack.components.builders import AnswerBuilder

    builder = AnswerBuilder(pattern="Answer: (.*)")
    builder.run(query="What's the answer?", replies=["This is an argument. Answer: This is the answer."])
    ```
    """

    @component.output_types(answers=List[GeneratedAnswer])
    def run(  # pylint: disable=too-many-positional-arguments
        self,
        query: str,
        replies: Union[List[str], List[ChatMessage]],
        meta: Optional[List[Dict[str, Any]]] = None,
        documents: Optional[List[Document]] = None,
        pattern: Optional[str] = None,
        reference_pattern: Optional[str] = None,
    ) -> Dict[str, List[GeneratedAnswer]]:
        """
        Turns the output of a Generator into `GeneratedAnswer` objects using regular expressions.

        The metadata of each answer is the metadata of the reply (for `ChatMessage` replies) updated with the
        matching entry of `meta`; entries of `meta` take precedence on key collisions. The full list of
        `replies` is additionally stored under the `all_messages` metadata key.

        :param query:
            The input query used as the Generator prompt.
        :param replies:
            The output of the Generator. Can be a list of strings or a list of `ChatMessage` objects.
        :param meta:
            The metadata returned by the Generator. If not specified, the generated answer will contain no metadata
            other than the one carried by the reply itself and the `all_messages` key.
        :param documents:
            The documents used as the Generator inputs. If specified, they are added to
            the `GeneratedAnswer` objects.
            If both `documents` and `reference_pattern` are specified, the documents referenced in the
            Generator output are extracted from the input documents and added to the `GeneratedAnswer` objects.
        :param pattern:
            The regular expression pattern to extract the answer text from the Generator.
            If not specified, the pattern configured on the component is used; if that is not set either,
            the entire response is used as the answer.
            The regular expression can have one capture group at most.
            If present, the capture group text
            is used as the answer. If no capture group is present, the whole match is used as the answer.
                Examples:
                    `[^\\n]+$` finds "this is an answer" in a string "this is an argument.\\nthis is an answer".
                    `Answer: (.*)` finds "this is an answer" in a string
                    "this is an argument. Answer: this is an answer".
        :param reference_pattern:
            The regular expression pattern used for parsing the document references.
            If not specified, the pattern configured on the component is used; if that is not set either,
            no parsing is done, and all documents are referenced.
            References need to be specified as indices of the input documents and start at [1].
            References outside of `1..len(documents)` are skipped with a warning.
            Example: `\\[(\\d+)\\]` finds "1" in a string "this is an answer[1]".

        :returns: A dictionary with the following keys:
            - `answers`: The answers received from the output of the Generator.

        :raises ValueError:
            If `meta` is given and its length differs from the length of `replies`.
        """
        if not meta:
            # The placeholder dicts are never mutated: they are merged into a fresh dict per reply below.
            meta = [{}] * len(replies)
        elif len(replies) != len(meta):
            raise ValueError(f"Number of replies ({len(replies)}), and metadata ({len(meta)}) must match.")

        # Only a pattern passed at call time is validated here.
        if pattern:
            AnswerBuilder._check_num_groups_in_regex(pattern)

        # Call-time arguments override the values configured on the instance.
        pattern = pattern or self.pattern
        reference_pattern = reference_pattern or self.reference_pattern
        all_answers = []

        replies_to_iterate = replies
        meta_to_iterate = meta

        if self.last_message_only and replies:
            # Slicing keeps reply and metadata aligned while restricting the loop to the final reply.
            replies_to_iterate = replies[-1:]
            meta_to_iterate = meta[-1:]

        for reply, given_metadata in zip(replies_to_iterate, meta_to_iterate):
            # Extract content from ChatMessage objects if reply is a ChatMessage, else use the string as is
            if isinstance(reply, ChatMessage):
                extracted_reply = reply.text or ""
                extracted_metadata = reply.meta
            else:
                extracted_reply = str(reply)
                extracted_metadata = {}

            # Build a new dict so neither the reply's metadata nor the caller's `meta` entry is mutated.
            extracted_metadata = {**extracted_metadata, **given_metadata}
            extracted_metadata["all_messages"] = replies

            referenced_docs = []
            if documents:
                if reference_pattern:
                    reference_idxs = AnswerBuilder._extract_reference_idxs(extracted_reply, reference_pattern)
                else:
                    reference_idxs = list(range(len(documents)))

                for idx in reference_idxs:
                    # Explicit bounds check instead of relying on IndexError: a negative index (e.g. produced
                    # by a "[0]" reference) would otherwise silently select a document from the end of the list.
                    if 0 <= idx < len(documents):
                        referenced_docs.append(documents[idx])
                    else:
                        logger.warning(
                            "Document index '{index}' referenced in Generator output is out of range. ", index=idx + 1
                        )

            answer_string = AnswerBuilder._extract_answer_string(extracted_reply, pattern)
            answer = GeneratedAnswer(
                data=answer_string, query=query, documents=referenced_docs, meta=extracted_metadata
            )
            all_answers.append(answer)

        return {"answers": all_answers}
0 commit comments