Skip to content

Commit 3461d01

Browse files
pk-zipstack and claude committed
UN-2836 [FEAT] Return full text contents of input file in API response
Add `include_extracted_text` parameter to API deployment endpoints that returns the full extracted text of each input file at the top level of each file result, independent of `include_metadata` and the `ENABLE_HIGHLIGHT_API_DEPLOYMENT` configuration flag. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3b1a343 commit 3461d01

5 files changed

Lines changed: 46 additions & 2 deletions

File tree

backend/api_v2/api_deployment_views.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,9 @@ def post(
8282
timeout = serializer.validated_data.get(ApiExecution.TIMEOUT_FORM_DATA)
8383
include_metadata = serializer.validated_data.get(ApiExecution.INCLUDE_METADATA)
8484
include_metrics = serializer.validated_data.get(ApiExecution.INCLUDE_METRICS)
85+
include_extracted_text = serializer.validated_data.get(
86+
ApiExecution.INCLUDE_EXTRACTED_TEXT
87+
)
8588
use_file_history = serializer.validated_data.get(ApiExecution.USE_FILE_HISTORY)
8689
tag_names = serializer.validated_data.get(ApiExecution.TAGS)
8790
llm_profile_id = serializer.validated_data.get(ApiExecution.LLM_PROFILE_ID)
@@ -118,6 +121,7 @@ def post(
118121
timeout=timeout,
119122
include_metadata=include_metadata,
120123
include_metrics=include_metrics,
124+
include_extracted_text=include_extracted_text,
121125
use_file_history=use_file_history,
122126
tag_names=tag_names,
123127
llm_profile_id=llm_profile_id,
@@ -172,6 +176,9 @@ def get(
172176
execution_id = serializer.validated_data.get(ApiExecution.EXECUTION_ID)
173177
include_metadata = serializer.validated_data.get(ApiExecution.INCLUDE_METADATA)
174178
include_metrics = serializer.validated_data.get(ApiExecution.INCLUDE_METRICS)
179+
include_extracted_text = serializer.validated_data.get(
180+
ApiExecution.INCLUDE_EXTRACTED_TEXT
181+
)
175182

176183
# Fetch execution status
177184
response: ExecutionResponse = DeploymentHelper.get_execution_status(execution_id)
@@ -231,7 +238,10 @@ def get(
231238
)
232239
if not enable_highlight:
233240
response.remove_result_metadata_keys(["highlight_data"])
234-
response.remove_result_metadata_keys(["extracted_text"])
241+
if not include_extracted_text:
242+
response.remove_result_metadata_keys(["extracted_text"])
243+
if include_extracted_text:
244+
response.promote_extracted_text()
235245
if not include_metadata:
236246
response.remove_result_metadata_keys()
237247
if not include_metrics:

backend/api_v2/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ class ApiExecution:
55
TIMEOUT_FORM_DATA: str = "timeout"
66
INCLUDE_METADATA: str = "include_metadata"
77
INCLUDE_METRICS: str = "include_metrics"
8+
INCLUDE_EXTRACTED_TEXT: str = "include_extracted_text"
89
USE_FILE_HISTORY: str = "use_file_history" # Undocumented parameter
910
EXECUTION_ID: str = "execution_id"
1011
TAGS: str = "tags"

backend/api_v2/deployment_helper.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ def execute_workflow(
153153
timeout: int,
154154
include_metadata: bool = False,
155155
include_metrics: bool = False,
156+
include_extracted_text: bool = False,
156157
use_file_history: bool = False,
157158
tag_names: list[str] = [],
158159
llm_profile_id: str | None = None,
@@ -272,7 +273,10 @@ def execute_workflow(
272273
)
273274
if not enable_highlight:
274275
result.remove_result_metadata_keys(["highlight_data"])
275-
result.remove_result_metadata_keys(["extracted_text"])
276+
if not include_extracted_text:
277+
result.remove_result_metadata_keys(["extracted_text"])
278+
if include_extracted_text:
279+
result.promote_extracted_text()
276280
if not include_metadata:
277281
result.remove_result_metadata_keys()
278282
if not include_metrics:

backend/api_v2/serializers.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,9 @@ class ExecutionRequestSerializer(TagParamsSerializer):
210210
If -1 it corresponds to async execution. Defaults to -1
211211
include_metadata (bool): Flag to include metadata in API response
212212
include_metrics (bool): Flag to include metrics in API response
213+
include_extracted_text (bool): Flag to include the full extracted text
214+
of the input file in the API response. The extracted text is returned
215+
at the top level of each file result, independent of include_metadata.
213216
use_file_history (bool): Flag to use FileHistory to save and retrieve
214217
responses quickly. This is undocumented to the user and can be
215218
helpful for demos.
@@ -232,6 +235,7 @@ class ExecutionRequestSerializer(TagParamsSerializer):
232235
)
233236
include_metadata = BooleanField(default=False)
234237
include_metrics = BooleanField(default=False)
238+
include_extracted_text = BooleanField(default=False)
235239
use_file_history = BooleanField(default=False)
236240

237241
presigned_urls = ListField(child=URLField(), required=False)
@@ -408,6 +412,7 @@ class ExecutionQuerySerializer(Serializer):
408412
execution_id = CharField(required=True)
409413
include_metadata = BooleanField(default=False)
410414
include_metrics = BooleanField(default=False)
415+
include_extracted_text = BooleanField(default=False)
411416

412417
def validate_execution_id(self, value):
413418
"""Trim spaces, validate UUID format, and check if execution_id exists."""

backend/workflow_manager/workflow_v2/dto.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,30 @@ def remove_result_metadata_keys(self, keys_to_remove: list[str] = []) -> None:
6969

7070
self._remove_specific_keys(result=result, keys_to_remove=keys_to_remove)
7171

72+
def promote_extracted_text(self) -> None:
    """Copy ``extracted_text`` from metadata to the top level of each item.

    For every dict item in ``self.result`` whose ``"result"`` value is a
    dict containing a ``"metadata"`` dict, the metadata's
    ``"extracted_text"`` value (when not ``None``) is copied to
    ``item["extracted_text"]``. The metadata itself is left untouched, so
    the text can be returned independently of ``include_metadata``.

    After promotion, the extracted text appears as:
        result[i]["extracted_text"] = "..."

    Items that do not have the expected shape are skipped silently; if
    ``self.result`` is not a list the method does nothing.
    """
    if not isinstance(self.result, list):
        return

    for item in self.result:
        if not isinstance(item, dict):
            continue

        result = item.get("result")
        if not isinstance(result, dict):
            continue

        # "metadata" may be present but None (or otherwise not a dict);
        # a default passed to .get() only covers the missing-key case, so
        # guard the type explicitly like the levels above.
        metadata = result.get("metadata")
        if not isinstance(metadata, dict):
            continue

        extracted_text = metadata.get("extracted_text")
        if extracted_text is not None:
            item["extracted_text"] = extracted_text
7296
def remove_result_metrics(self) -> None:
7397
"""Removes the 'metrics' key from the 'result' dictionary within each
7498
'result' dictionary in the 'result' list attribute of the instance.

0 commit comments

Comments
 (0)