Skip to content

Commit 05ab71c

Browse files
added missed delta changes (#760)
1 parent e4a051d commit 05ab71c

8 files changed

Lines changed: 131 additions & 58 deletions

File tree

backend/workflow_manager/endpoint_v2/constants.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ class SourceKey:
4444
FILE_EXTENSIONS = "fileExtensions"
4545
PROCESS_SUB_DIRECTORIES = "processSubDirectories"
4646
MAX_FILES = "maxFiles"
47-
ROOT_FOLDER = "rootFolder"
47+
FOLDERS = "folders"
4848

4949

5050
class DestinationKey:
@@ -57,6 +57,8 @@ class DestinationKey:
5757
PATH = "path"
5858
OUTPUT_FOLDER = "outputFolder"
5959
OVERWRITE_OUTPUT_DOCUMENT = "overwriteOutput"
60+
FILE_PATH = "filePath"
61+
EXECUTION_ID = "executionId"
6062

6163

6264
class OutputJsonKey:
@@ -71,8 +73,16 @@ class FileType:
7173

7274
class FilePattern:
    """Glob patterns grouped by document category."""

    # PDF files.
    PDF_DOCUMENTS = ["*.pdf"]
    # Plain-text and Word-style documents.
    TEXT_DOCUMENTS = ["*.txt", "*.doc", "*.docx"]
    # Image files; both ".tif" and ".tiff" spellings are matched.
    IMAGES = ["*.jpg", "*.jpeg", "*.png", "*.gif", "*.bmp", "*.tif", "*.tiff"]
7686

7787

7888
class SourceConstant:

backend/workflow_manager/endpoint_v2/database_utils.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def get_sql_values_for_query(
5959
for column in values:
6060
if cls_name == DBConnectionClass.SNOWFLAKE:
6161
col = column.lower()
62-
type_x = column_types[col]
62+
type_x = column_types.get(col, "")
6363
if type_x == "VARIANT":
6464
values[column] = values[column].replace("'", "\\'")
6565
sql_values[column] = f"parse_json($${values[column]}$$)"
@@ -162,6 +162,10 @@ def get_column_types(
162162
def get_columns_and_values(
163163
column_mode_str: str,
164164
data: Any,
165+
file_path: str,
166+
execution_id: str,
167+
file_path_name: str = "file_path",
168+
execution_id_name: str = "execution_id",
165169
include_timestamp: bool = False,
166170
include_agent: bool = False,
167171
agent_name: Optional[str] = AgentName.UNSTRACT_DBWRITER.value,
@@ -214,7 +218,8 @@ def get_columns_and_values(
214218
values[single_column_name] = data
215219
else:
216220
values[single_column_name] = json.dumps(data)
217-
221+
values[file_path_name] = file_path
222+
values[execution_id_name] = execution_id
218223
return values
219224

220225
@staticmethod

backend/workflow_manager/endpoint_v2/destination.py

Lines changed: 35 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import json
44
import logging
55
import os
6-
from typing import Any, Optional
6+
from typing import Any, Optional, Union
77

88
import fsspec
99
import magic
@@ -36,6 +36,9 @@
3636
from workflow_manager.workflow_v2.models.file_history import FileHistory
3737
from workflow_manager.workflow_v2.models.workflow import Workflow
3838

39+
from backend.exceptions import UnstractFSException
40+
from unstract.connectors.exceptions import ConnectorError
41+
3942
logger = logging.getLogger(__name__)
4043

4144

@@ -182,16 +185,17 @@ def handle_output(
182185
self.insert_into_db(input_file_path=input_file_path)
183186
elif connection_type == WorkflowEndpoint.ConnectionType.API:
184187
result = self.get_result(file_history)
185-
metadata = self.get_metadata(file_history)
188+
exec_metadata = self.get_metadata(file_history)
186189
self._handle_api_result(
187-
file_name=file_name, error=error, result=result, metadata=metadata
190+
file_name=file_name, error=error, result=result, metadata=exec_metadata
188191
)
189192
elif connection_type == WorkflowEndpoint.ConnectionType.MANUALREVIEW:
190193
self._push_data_to_queue(file_name, workflow, input_file_path)
191194
if self.execution_service:
192195
self.execution_service.publish_log(
193196
message=f"File '{file_name}' processed successfully"
194197
)
198+
195199
if use_file_history and not file_history:
196200
FileHistoryHelper.create_file_history(
197201
cache_key=file_hash.file_hash,
@@ -222,31 +226,33 @@ def copy_output_to_output_directory(self) -> None:
222226
destination_volume_path = os.path.join(
223227
self.execution_dir, ToolExecKey.OUTPUT_DIR
224228
)
225-
destination_fs.create_dir_if_not_exists(input_dir=output_directory)
226-
destination_fsspec = destination_fs.get_fsspec_fs()
227-
228-
# Traverse local directory and create the same structure in the
229-
# output_directory
230-
for root, dirs, files in os.walk(destination_volume_path):
231-
for dir_name in dirs:
232-
destination_fsspec.mkdir(
233-
os.path.join(
229+
230+
try:
231+
destination_fs.create_dir_if_not_exists(input_dir=output_directory)
232+
233+
# Traverse local directory and create the same structure in the
234+
# output_directory
235+
for root, dirs, files in os.walk(destination_volume_path):
236+
for dir_name in dirs:
237+
current_dir = os.path.join(
234238
output_directory,
235239
os.path.relpath(root, destination_volume_path),
236240
dir_name,
237241
)
238-
)
242+
destination_fs.create_dir_if_not_exists(input_dir=current_dir)
239243

240-
for file_name in files:
241-
source_path = os.path.join(root, file_name)
242-
destination_path = os.path.join(
243-
output_directory,
244-
os.path.relpath(root, destination_volume_path),
245-
file_name,
246-
)
247-
normalized_path = os.path.normpath(destination_path)
248-
with open(source_path, "rb") as source_file:
249-
destination_fsspec.write_bytes(normalized_path, source_file.read())
244+
for file_name in files:
245+
source_path = os.path.join(root, file_name)
246+
destination_path = os.path.join(
247+
output_directory,
248+
os.path.relpath(root, destination_volume_path),
249+
file_name,
250+
)
251+
destination_fs.upload_file_to_storage(
252+
source_path=source_path, destination_path=destination_path
253+
)
254+
except ConnectorError as e:
255+
raise UnstractFSException(core_err=e) from e
250256

251257
def insert_into_db(self, input_file_path: str) -> None:
252258
"""Insert data into the database."""
@@ -276,7 +282,10 @@ def insert_into_db(self, input_file_path: str) -> None:
276282
if not data:
277283
return
278284
# Remove metadata from result
279-
data.pop("metadata", None)
285+
# Tool text-extractor returns data in the form of string.
286+
# Don't pop out metadata in this case.
287+
if isinstance(data, dict):
288+
data.pop("metadata", None)
280289
values = DatabaseUtils.get_columns_and_values(
281290
column_mode_str=column_mode,
282291
data=data,
@@ -401,7 +410,7 @@ def get_result(self, file_history: Optional[FileHistory] = None) -> Optional[Any
401410
output_file = os.path.join(self.execution_dir, WorkflowFileType.INFILE)
402411
metadata: dict[str, Any] = self.get_workflow_metadata()
403412
output_type = self.get_output_type(metadata)
404-
result: Optional[Any] = None
413+
result: Union[dict[str, Any], str] = ""
405414
try:
406415
# TODO: SDK handles validation; consider removing here.
407416
mime = magic.Magic()
@@ -431,7 +440,7 @@ def get_metadata(
431440
"""Get metadata from the output file.
432441
433442
Returns:
434-
Union[dict[str, Any], str]: Meta data.
443+
Union[dict[str, Any], str]: Metadata.
435444
"""
436445
if file_history and file_history.metadata:
437446
return self.parse_string(file_history.metadata)

backend/workflow_manager/endpoint_v2/queue_utils.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import logging
22
from dataclasses import dataclass
33
from enum import Enum
4-
from typing import Any
4+
from typing import Any, Optional
55

66
from utils.constants import Common
77
from workflow_manager.endpoint_v2.exceptions import UnstractQueueException
@@ -34,8 +34,18 @@ def get_queue_inst(connector_settings: dict[str, Any] = {}) -> UnstractQueue:
3434
@dataclass
class QueueResult:
    """Record describing one file's result as it is placed on a queue.

    ``whisper_hash`` is the only optional field; it defaults to ``None``
    and is therefore declared after the required fields.
    """

    file: str
    status: QueueResultStatus
    result: Any
    workflow_id: str
    file_content: str
    whisper_hash: Optional[str] = None

    def to_dict(self) -> Any:
        """Return the fields as a plain dictionary.

        Values are returned as stored on the instance (no copying or
        conversion), keyed by field name.
        """
        # Key order is fixed explicitly so the produced mapping keeps
        # "whisper_hash" in second position regardless of field order.
        exported = (
            "file",
            "whisper_hash",
            "status",
            "result",
            "workflow_id",
            "file_content",
        )
        return {name: getattr(self, name) for name in exported}

backend/workflow_manager/workflow_v2/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,4 @@ class WorkflowMessages:
5858
)
5959
FILE_MARKER_CLEAR_SUCCESS = "File marker cleared successfully."
6060
FILE_MARKER_CLEAR_FAILED = "Failed to clear file marker."
61+
WORKFLOW_EXECUTION_NOT_FOUND = "Workflow execution not found."

backend/workflow_manager/workflow_v2/dto.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,58 @@ def __post_init__(self) -> None:
4141
self.message = self.message or None
4242
self.status_api = self.status_api or None
4343

44+
def remove_result_metadata_keys(self, keys_to_remove: list[str] = []) -> None:
    """Remove metadata keys from every entry of ``self.result``.

    Each element of ``self.result`` that is a dict holding a dict under
    its ``"result"`` key has the listed keys removed from that inner
    dict's ``"metadata"``. If ``keys_to_remove`` is empty, the
    ``"metadata"`` key itself is removed.

    Entries that are not dicts, or whose ``"result"`` is not a dict, are
    skipped; they do not stop the remaining entries from being processed.
    Nothing happens when ``self.result`` is not a list.

    Args:
        keys_to_remove (list[str]): Keys to be removed from 'metadata'.
            The default list is only read here, never mutated.
    """
    if not isinstance(self.result, list):
        return

    for item in self.result:
        # Skip (rather than abort on) entries of an unexpected shape so
        # one odd entry cannot leave later entries with their metadata.
        if not isinstance(item, dict):
            continue

        result = item.get("result")
        if not isinstance(result, dict):
            continue

        self._remove_specific_keys(result=result, keys_to_remove=keys_to_remove)
64+
65+
def _remove_specific_keys(self, result: dict, keys_to_remove: list[str]) -> None:
66+
"""Removes specified keys from the 'metadata' dictionary within the
67+
provided 'result' dictionary. If 'keys_to_remove' is empty, the
68+
'metadata' dictionary is cleared.
69+
70+
Args:
71+
result (dict): The dictionary containing the 'metadata' key.
72+
keys_to_remove (List[str]): List of keys to be removed from 'metadata'.
73+
"""
74+
metadata = result.get("metadata", {})
75+
if keys_to_remove:
76+
for key in keys_to_remove:
77+
metadata.pop(key, None)
78+
else:
79+
metadata = {}
80+
self._update_metadata(result=result, metadata=metadata)
81+
82+
def _update_metadata(self, result: dict, metadata: dict) -> None:
83+
"""Updates the 'metadata' key in the provided 'result' dictionary. If
84+
'metadata' is empty, removes the 'metadata' key from 'result'.
85+
86+
Args:
87+
result (dict): The dictionary to be updated.
88+
metadata (dict): The new metadata dictionary to be set. If empty, 'metadata'
89+
is removed.
90+
"""
91+
if metadata:
92+
result["metadata"] = metadata
93+
else:
94+
result.pop("metadata", None)
95+
4496

4597
@dataclass
4698
class AsyncResultData:

backend/workflow_manager/workflow_v2/execution.py

Lines changed: 7 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -242,14 +242,15 @@ def execute(self, run_id: str, file_name: str, single_step: bool = False) -> Non
242242
execution_time = end_time - start_time
243243
logger.info(f"Execution {self.execution_id} stopped")
244244
raise exception
245-
except Exception as exc:
245+
except Exception as exception:
246246
end_time = time.time()
247247
execution_time = end_time - start_time
248-
message = str(exc)[:EXECUTION_ERROR_LENGTH]
248+
message = str(exception)[:EXECUTION_ERROR_LENGTH]
249249
logger.error(
250-
f"Execution {self.execution_id} ran for {execution_time:.4f}s, {exc}"
250+
f"Execution {self.execution_id} ran for {execution_time:.4f}s, "
251+
f" Error {exception}"
251252
)
252-
raise WorkflowExecutionError(message) from exc
253+
raise WorkflowExecutionError(message) from exception
253254

254255
def publish_initial_workflow_logs(self, total_files: int) -> None:
255256
"""Publishes the initial logs for the workflow.
@@ -320,26 +321,6 @@ def execute_input_file(
320321
execution_type = ExecutionType.COMPLETE
321322
if single_step:
322323
execution_type = ExecutionType.STEP
323-
self.execute_uncached_input(
324-
run_id=run_id, file_name=file_name, single_step=single_step
325-
)
326-
self.publish_log(f"Tool executed successfully for '{file_name}'")
327-
self._handle_execution_type(execution_type)
328-
329-
def execute_uncached_input(
330-
self, run_id: str, file_name: str, single_step: bool
331-
) -> None:
332-
"""Executes the uncached input file.
333-
334-
Args:
335-
run_id (str): UUID for a single run of a file
336-
file_name (str): The name of the file to be executed.
337-
single_step (bool): Flag indicating whether to execute in
338-
single step mode.
339-
340-
Returns:
341-
None
342-
"""
343324
self.publish_log(
344325
"No entries found in cache, executing the tools"
345326
f"running the tool(s) for {file_name}"
@@ -350,6 +331,8 @@ def execute_uncached_input(
350331
component=LogComponent.SOURCE,
351332
)
352333
self.execute(run_id, file_name, single_step)
334+
self.publish_log(f"Tool executed successfully for '{file_name}'")
335+
self._handle_execution_type(execution_type)
353336

354337
def initiate_tool_execution(
355338
self,

backend/workflow_manager/workflow_v2/workflow_helper.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -542,6 +542,7 @@ def execute_workflow(
542542
scheduled: bool = False,
543543
execution_mode: Optional[tuple[str, str]] = None,
544544
pipeline_id: Optional[str] = None,
545+
use_file_history: bool = True,
545546
**kwargs: dict[str, Any],
546547
) -> Optional[list[Any]]:
547548
"""Asynchronous Execution By celery.
@@ -557,7 +558,8 @@ def execute_workflow(
557558
WorkflowExecution Mode. Defaults to None.
558559
pipeline_id (Optional[str], optional): Id of pipeline.
559560
Defaults to None.
560-
include_metadata (bool): Whether to include metadata in the prompt output
561+
use_file_history (bool): Use FileHistory table to return results on already
562+
processed files. Defaults to True
561563
562564
Kwargs:
563565
log_events_id (str): Session ID of the user, helps establish
@@ -598,6 +600,7 @@ def execute_workflow(
598600
workflow_execution=workflow_execution,
599601
execution_mode=execution_mode,
600602
hash_values_of_files=hash_values,
603+
use_file_history=use_file_history,
601604
)
602605
except Exception as error:
603606
error_message = traceback.format_exc()

0 commit comments

Comments
 (0)