Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 32 additions & 18 deletions dsc/workflows/digitized_theses/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,20 +189,35 @@ def _get_item_submissions_from_synced_batch(self) -> list[ItemSubmission]:
)
continue

# track identifier as 'seen'
seen_item_identifiers.append(item_identifier)
item_submissions.append(
ItemSubmission(
batch_id=self.batch_id,
item_identifier=item_identifier,
workflow_name=self.workflow_name,
status=(
ItemSubmissionStatus.CREATE_SUCCESS
if theses_subfolder in ["replacement-theses", "new-theses"]
else ItemSubmissionStatus.CREATE_SKIPPED
),
)

# create an instance of ItemSubmission
item_submission = ItemSubmission(
batch_id=self.batch_id,
item_identifier=item_identifier,
workflow_name=self.workflow_name,
)

if theses_subfolder == "replacement-theses":
try:
dspace_item = self._get_item_from_dspace(
item_submission.item_identifier
)
item_submission.dspace_handle = dspace_item.handle
item_submission.status = ItemSubmissionStatus.CREATE_SUCCESS
item_submission.status_details = "Replacement thesis"
except exceptions.DSpaceClientSearchError as exception:
item_submission.status = ItemSubmissionStatus.CREATE_SKIPPED
item_submission.status_details = str(exception)
elif theses_subfolder == "new-theses":
item_submission.status = ItemSubmissionStatus.CREATE_SUCCESS
item_submission.status_details = "New thesis"
else:
item_submission.status = ItemSubmissionStatus.CREATE_SKIPPED
item_submission.status_details = "Skipped thesis"
item_submissions.append(item_submission)

return item_submissions

def _create_batch_in_s3(self) -> list[ItemSubmission]:
Expand Down Expand Up @@ -256,7 +271,7 @@ def _create_batch_in_s3(self) -> list[ItemSubmission]:
requests.exceptions.HTTPError,
exceptions.ItemMetadataNotFoundError,
) as exception:
item_submission.status = "create_failed"
item_submission.status = ItemSubmissionStatus.CREATE_FAILED
item_submission.status_details = str(exception)
item_submissions.append(item_submission)
continue
Expand All @@ -265,28 +280,27 @@ def _create_batch_in_s3(self) -> list[ItemSubmission]:
try:
dspace_item = self._get_item_from_dspace(item_submission.item_identifier)
except exceptions.DSpaceClientSearchError as exception:
item_submission.status = "create_skipped"
item_submission.status = ItemSubmissionStatus.CREATE_FAILED
item_submission.status_details = str(exception)
item_submissions.append(item_submission)
continue

# check if item submission is a 'Replacement thesis'
if dspace_item and not self._is_replacement_thesis(dspace_item):
item_submission.dspace_handle = dspace_item.handle
item_submission.status = "create_skipped"
item_submission.status = ItemSubmissionStatus.CREATE_SKIPPED
item_submission.status_details = "Cannot replace the electronic version submitted by the student author." # noqa: E501
item_submissions.append(item_submission)
continue

if dspace_item and self._is_replacement_thesis(dspace_item):
item_submission.dspace_handle = dspace_item.handle
item_submission.status = "create_success"
item_submission.status = ItemSubmissionStatus.CREATE_SUCCESS
item_submission.status_details = "Replacement thesis"
item_submissions.append(item_submission)
else:
item_submission.status = "create_success"
item_submission.status = ItemSubmissionStatus.CREATE_SUCCESS
Comment thread
jonavellecuerdo marked this conversation as resolved.
item_submission.status_details = "New thesis"
item_submissions.append(item_submission)
item_submissions.append(item_submission)

self._move_batch_files_to_theses_subfolders(
item_submissions, batch_location=tmp_batch_path
Expand Down
104 changes: 101 additions & 3 deletions tests/workflows/digitized_theses/test_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from lxml import etree

from dsc import exceptions
from dsc.db.models import ItemSubmissionStatus
from dsc.item_submission import ItemSubmission
from dsc.workflows.digitized_theses import (
DigitizedTheses,
Expand Down Expand Up @@ -110,7 +111,8 @@ def alma_sru_response_no_record():


@pytest.fixture
def mock_s3_digitized_theses(mocked_s3, s3_client):
def mock_s3_digitized_theses_dsc(mocked_s3, s3_client):
"""Mock batch for digitized theses in DSC S3 bucket."""
for source_metadata_file in glob.glob(
"tests/fixtures/digitized-theses/batch-aaa/**/*.xml", recursive=True
):
Expand Down Expand Up @@ -171,6 +173,102 @@ def test_workflow_update_batch_id():
assert workflow._update_batch_id(batch_id="batch-aaa") == "batch-aaa-20250101T090000Z"


@patch("dsc.workflows.digitized_theses.workflow.DigitizedTheses._get_item_from_dspace")
def test_workflow_get_item_submissions_from_synced_batch_replacement(
    mock_workflow_get_item_from_dspace, mock_s3_digitized_theses_dsc
):
    """Verify a replacement thesis in a synced batch yields a successful submission.

    This test uses mock_s3_digitized_theses_dsc, which represents a previously
    created batch in the DSC S3 bucket (i.e., contents organized into
    theses subfolders). The DSpace lookup is patched to return an item
    that exposes a handle, so the generated ItemSubmission is expected to
    carry that handle along with the 'Replacement thesis' status details.
    """
    # stub the DSpace lookup with an item exposing only a handle
    mock_workflow_get_item_from_dspace.return_value = MagicMock(handle="1721.1/157651")

    item_submissions = DigitizedTheses(
        batch_id="batch-aaa"
    )._get_item_submissions_from_synced_batch()

    expected_item_submission = ItemSubmission(
        batch_id="batch-aaa",
        item_identifier="05588126",
        workflow_name="digitized-theses",
        dspace_handle="1721.1/157651",
        status=ItemSubmissionStatus.CREATE_SUCCESS,
        status_details="Replacement thesis",
    )
    assert item_submissions == [expected_item_submission]


@patch("dsc.workflows.digitized_theses.workflow.DigitizedTheses._get_item_from_dspace")
def test_workflow_get_item_submissions_from_synced_batch_replacement_not_found(
    mock_workflow_get_item_from_dspace,
    mock_s3_digitized_theses_dsc,
):
    """Verify a failed DSpace lookup marks the replacement thesis as skipped.

    The patched DSpace lookup raises DSpaceClientSearchError; the generated
    ItemSubmission is expected to have no handle, a CREATE_SKIPPED status,
    and the exception message as its status details.
    """
    search_error = exceptions.DSpaceClientSearchError("Error occurred")
    mock_workflow_get_item_from_dspace.side_effect = search_error

    item_submissions = DigitizedTheses(
        batch_id="batch-aaa"
    )._get_item_submissions_from_synced_batch()

    expected_item_submission = ItemSubmission(
        batch_id="batch-aaa",
        item_identifier="05588126",
        workflow_name="digitized-theses",
        dspace_handle=None,
        status=ItemSubmissionStatus.CREATE_SKIPPED,
        status_details="Error occurred",
    )
    assert item_submissions == [expected_item_submission]


@patch("dsc.workflows.digitized_theses.workflow.S3Client.files_iter")
def test_workflow_get_item_submissions_from_synced_batch_new(mock_s3client_files_iter):
    """Verify a file under 'new-theses' yields a successful 'New thesis' submission.

    S3Client.files_iter is patched to return a single metadata file path
    located in the 'new-theses' subfolder of the batch.
    """
    synced_file = (
        "tests/fixtures/digitized-theses/batch-aaa/new-theses/05588126/05588126.xml"
    )
    mock_s3client_files_iter.return_value = [synced_file]

    item_submissions = DigitizedTheses(
        batch_id="batch-aaa"
    )._get_item_submissions_from_synced_batch()

    expected_item_submission = ItemSubmission(
        batch_id="batch-aaa",
        item_identifier="05588126",
        workflow_name="digitized-theses",
        dspace_handle=None,
        status=ItemSubmissionStatus.CREATE_SUCCESS,
        status_details="New thesis",
    )
    assert item_submissions == [expected_item_submission]


@patch("dsc.workflows.digitized_theses.workflow.S3Client.files_iter")
def test_workflow_get_item_submissions_from_synced_batch_skipped(
    mock_s3client_files_iter,
):
    """Verify a file under 'skipped-theses' yields a skipped submission.

    S3Client.files_iter is patched to return a single metadata file path
    located in the 'skipped-theses' subfolder of the batch.
    """
    synced_file = (
        "tests/fixtures/digitized-theses/batch-aaa/skipped-theses/05588126/05588126.xml"
    )
    mock_s3client_files_iter.return_value = [synced_file]

    item_submissions = DigitizedTheses(
        batch_id="batch-aaa"
    )._get_item_submissions_from_synced_batch()

    expected_item_submission = ItemSubmission(
        batch_id="batch-aaa",
        item_identifier="05588126",
        workflow_name="digitized-theses",
        dspace_handle=None,
        status=ItemSubmissionStatus.CREATE_SKIPPED,
        status_details="Skipped thesis",
    )
    assert item_submissions == [expected_item_submission]

Comment thread
jonavellecuerdo marked this conversation as resolved.

@patch("dsc.workflows.digitized_theses.workflow.requests")
def test_workflow_download_metadata_from_alma(
mock_requests, alma_sru_response_single_record, tmp_path
Expand Down Expand Up @@ -371,7 +469,7 @@ def test_workflow_submit_items_handles_errors(
)


def test_workflow_load_batch_manifest(mock_s3_digitized_theses):
def test_workflow_load_batch_manifest(mock_s3_digitized_theses_dsc):
workflow = DigitizedTheses(batch_id="batch-aaa")
assert workflow._load_batch_manifest() == defaultdict(
dict,
Expand All @@ -385,7 +483,7 @@ def test_workflow_load_batch_manifest(mock_s3_digitized_theses):


@freeze_time("2025-01-01 09:00:00")
def test_workflow_get_transformed_metadata(mock_s3_digitized_theses):
def test_workflow_get_transformed_metadata(mock_s3_digitized_theses_dsc):
workflow = DigitizedTheses(batch_id="batch-aaa")
item_metadata = workflow._get_transformed_metadata(
source_metadata_file="tests/fixtures/digitized-theses/batch-aaa/replacement-theses/05588126/05588126.xml"
Expand Down
Loading