Skip to content

Commit a712e75

Browse files
committed
dags: Split description out of "Status" column
Implements: AP-684
1 parent e05813d commit a712e75

3 files changed

Lines changed: 19 additions & 12 deletions

File tree

mokelumne/dags/gen_llm_image_descriptions.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
logger = logging.getLogger(__name__)
3434

3535

36-
RunStatus = namedtuple('RunStatus', ('tind_id', 'status', 'path'))
36+
RunStatus = namedtuple('RunStatus', ('tind_id', 'status', 'description', 'path'))
3737

3838
SUPPORTED_IMAGE_TYPES = {"image/jpeg", "image/png", "image/gif", "image/webp"}
3939
"""The supported image MIME types we will fetch."""
@@ -213,15 +213,16 @@ def fetch_image_to_record_directory(run_id: str, fetcher: ImageFetcher,
213213
return RunStatus(
214214
tind_id=tind_id,
215215
path="",
216-
status=f"skipped: Unsupported file type {file_md.get('mime')}",
216+
status="skipped",
217+
description=f"Unsupported file type {file_md.get('mime')}",
217218
)
218219

219220
path = str(fetcher.fetch_one_image_for_record(tind_id, run_id))
220221
except Exception as ex: # pylint: disable=broad-exception-caught
221222
logger.warning("Fetcher encountered exception", exc_info=ex)
222-
return RunStatus(tind_id=tind_id, status=f'failed: {str(ex)}', path='')
223+
return RunStatus(tind_id=tind_id, status="failed", description=str(ex), path="")
223224

224-
return RunStatus(tind_id=tind_id, status="fetched", path=path)
225+
return RunStatus(tind_id=tind_id, status="fetched", description="", path=path)
225226

226227
@task
227228
def write_status_to_fetched_csv(
@@ -233,13 +234,16 @@ def write_status_to_fetched_csv(
233234
fetched_path = run_dir(context["run_id"]) / "fetched.csv"
234235
with fetched_path.open("w", encoding="utf-8") as csv_file:
235236
writer = csv.writer(csv_file)
236-
writer.writerow((*records["Record ID"], "Image Path"))
237237

238238
status_col = records["Record ID"].index("Status")
239+
records["Record ID"].insert(status_col + 1, "Status Description")
240+
241+
writer.writerow((*records["Record ID"], "Image Path"))
239242

240243
for status in statuses:
241-
record = [*records[status[0]], *status[2:]]
244+
record = [*records[status[0]], *status[3:]]
242245
record[status_col] = status[1]
246+
record.insert(status_col + 1, status[2])
243247
writer.writerow(record)
244248

245249
processed = read_csv_to_process()
@@ -308,6 +312,7 @@ def transform_results(
308312
"Image Name": Path(record["Image Path"]).name,
309313
"Collection name": record["Collection name"],
310314
"Status": record["Status"],
315+
"Status description": record.get("Status description", ""),
311316
"520__a-1": record.get("Description", ""),
312317
"5880_a": f"Image description generated by AI ({ENV.get('AWS_MODEL_LABEL')})"
313318
" and reviewed on [MM/YYYY].",
@@ -336,8 +341,8 @@ def write_output_csv(processed_dicts: list[list[dict[str, str]]]) -> None:
336341
writer.writeheader()
337342
writer.writerows(all_results)
338343

339-
prompt = get_prompt()
340344
batches = read_and_batch_csv()
345+
prompt = get_prompt()
341346
batch_results = invoke_llm_on_batch_with_prompt.partial(prompt=prompt).expand(
342347
batch=batches
343348
)

mokelumne/util/image_describer.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ def describe(self, record: dict[str, str]) -> dict[str, str]:
4343
len(encoded),
4444
record_meta,
4545
)
46-
record["Status"] = "failure: file size exceeds limit"
46+
record["Status"] = "failure"
47+
record["Status description"] = "file size exceeds limit"
4748
return record
4849

4950
image_msg = HumanMessage(
@@ -55,13 +56,15 @@ def describe(self, record: dict[str, str]) -> dict[str, str]:
5556
[self.sys_msg, image_msg], config={"callbacks": [self.langfuse_handler]}
5657
)
5758
except ClientError as exc:
58-
record["Status"] = f"failure: {exc.response['Error']['Message']}"
59+
record["Status"] = "failure"
60+
record["Status description"] = exc.response["Error"]["Message"]
5961
return record
6062

6163
if hasattr(result, "content"):
6264
record["Status"] = "success"
6365
record["Description"] = str(result.content)
6466
else:
65-
record["Status"] = "failure: no content in response"
67+
record["Status"] = "failure"
68+
record["Status description"] = "no content in response"
6669

6770
return record

test/unit/test_image_describer.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,7 @@ def test_client_error(self):
7878
describer = image_describer.ImageDescriber(model, TEST_PROMPT)
7979
result = describer.describe(NORMAL_RECORD_FIXTURE)
8080
assert "failure" in result["Status"]
81-
# NOTE: When we separate the Status column, this will need to change.
82-
assert err in result["Status"]
81+
assert err in result["Status description"]
8382

8483
def test_size_error(self):
8584
"""Test case where the record's image is too large."""

0 commit comments

Comments
 (0)