diff --git a/dsc/workflows/digitized_theses/workflow.py b/dsc/workflows/digitized_theses/workflow.py index 7d3b4c4..6ad4481 100644 --- a/dsc/workflows/digitized_theses/workflow.py +++ b/dsc/workflows/digitized_theses/workflow.py @@ -189,20 +189,35 @@ def _get_item_submissions_from_synced_batch(self) -> list[ItemSubmission]: ) continue + # track identifier as 'seen' seen_item_identifiers.append(item_identifier) - item_submissions.append( - ItemSubmission( - batch_id=self.batch_id, - item_identifier=item_identifier, - workflow_name=self.workflow_name, - status=( - ItemSubmissionStatus.CREATE_SUCCESS - if theses_subfolder in ["replacement-theses", "new-theses"] - else ItemSubmissionStatus.CREATE_SKIPPED - ), - ) + + # create an instance of ItemSubmission + item_submission = ItemSubmission( + batch_id=self.batch_id, + item_identifier=item_identifier, + workflow_name=self.workflow_name, ) + if theses_subfolder == "replacement-theses": + try: + dspace_item = self._get_item_from_dspace( + item_submission.item_identifier + ) + item_submission.dspace_handle = dspace_item.handle + item_submission.status = ItemSubmissionStatus.CREATE_SUCCESS + item_submission.status_details = "Replacement thesis" + except exceptions.DSpaceClientSearchError as exception: + item_submission.status = ItemSubmissionStatus.CREATE_SKIPPED + item_submission.status_details = str(exception) + elif theses_subfolder == "new-theses": + item_submission.status = ItemSubmissionStatus.CREATE_SUCCESS + item_submission.status_details = "New thesis" + else: + item_submission.status = ItemSubmissionStatus.CREATE_SKIPPED + item_submission.status_details = "Skipped thesis" + item_submissions.append(item_submission) + return item_submissions def _create_batch_in_s3(self) -> list[ItemSubmission]: @@ -256,7 +271,7 @@ def _create_batch_in_s3(self) -> list[ItemSubmission]: requests.exceptions.HTTPError, exceptions.ItemMetadataNotFoundError, ) as exception: - item_submission.status = "create_failed" + 
item_submission.status = ItemSubmissionStatus.CREATE_FAILED item_submission.status_details = str(exception) item_submissions.append(item_submission) continue @@ -265,7 +280,7 @@ def _create_batch_in_s3(self) -> list[ItemSubmission]: try: dspace_item = self._get_item_from_dspace(item_submission.item_identifier) except exceptions.DSpaceClientSearchError as exception: - item_submission.status = "create_skipped" + item_submission.status = ItemSubmissionStatus.CREATE_FAILED item_submission.status_details = str(exception) item_submissions.append(item_submission) continue @@ -273,20 +288,19 @@ def _create_batch_in_s3(self) -> list[ItemSubmission]: # check if item submission is a 'Replacement thesis' if dspace_item and not self._is_replacement_thesis(dspace_item): item_submission.dspace_handle = dspace_item.handle - item_submission.status = "create_skipped" + item_submission.status = ItemSubmissionStatus.CREATE_SKIPPED item_submission.status_details = "Cannot replace the electronic version submitted by the student author." 
# noqa: E501 item_submissions.append(item_submission) continue if dspace_item and self._is_replacement_thesis(dspace_item): item_submission.dspace_handle = dspace_item.handle - item_submission.status = "create_success" + item_submission.status = ItemSubmissionStatus.CREATE_SUCCESS item_submission.status_details = "Replacement thesis" - item_submissions.append(item_submission) else: - item_submission.status = "create_success" + item_submission.status = ItemSubmissionStatus.CREATE_SUCCESS item_submission.status_details = "New thesis" - item_submissions.append(item_submission) + item_submissions.append(item_submission) self._move_batch_files_to_theses_subfolders( item_submissions, batch_location=tmp_batch_path diff --git a/tests/workflows/digitized_theses/test_workflow.py b/tests/workflows/digitized_theses/test_workflow.py index 7133104..ffc4d5e 100644 --- a/tests/workflows/digitized_theses/test_workflow.py +++ b/tests/workflows/digitized_theses/test_workflow.py @@ -11,6 +11,7 @@ from lxml import etree from dsc import exceptions +from dsc.db.models import ItemSubmissionStatus from dsc.item_submission import ItemSubmission from dsc.workflows.digitized_theses import ( DigitizedTheses, @@ -110,7 +111,8 @@ def alma_sru_response_no_record(): @pytest.fixture -def mock_s3_digitized_theses(mocked_s3, s3_client): +def mock_s3_digitized_theses_dsc(mocked_s3, s3_client): + """Mock batch for digitized theses in DSC S3 bucket.""" for source_metadata_file in glob.glob( "tests/fixtures/digitized-theses/batch-aaa/**/*.xml", recursive=True ): @@ -171,6 +173,102 @@ def test_workflow_update_batch_id(): assert workflow._update_batch_id(batch_id="batch-aaa") == "batch-aaa-20250101T090000Z" +@patch("dsc.workflows.digitized_theses.workflow.DigitizedTheses._get_item_from_dspace") +def test_workflow_get_item_submissions_from_synced_batch_replacement( + mock_workflow_get_item_from_dspace, mock_s3_digitized_theses_dsc +): + """Verify workflow can get item submissions from synced batch. 
+ + This test uses mock_s3_digitized_theses_dsc, which represents a previously + created batch in the DSC S3 bucket (i.e., contents organized into + theses subfolders). This test shows the workflow's ability to + generate ItemSubmissions based on the contents of the existing + batch in the DSC S3 bucket. + """ + mock_response = MagicMock() + mock_response.handle = "1721.1/157651" + mock_workflow_get_item_from_dspace.return_value = mock_response + + workflow = DigitizedTheses(batch_id="batch-aaa") + results = workflow._get_item_submissions_from_synced_batch() + + assert results == [ + ItemSubmission( + batch_id="batch-aaa", + item_identifier="05588126", + workflow_name="digitized-theses", + dspace_handle="1721.1/157651", + status=ItemSubmissionStatus.CREATE_SUCCESS, + status_details="Replacement thesis", + ) + ] + + +@patch("dsc.workflows.digitized_theses.workflow.DigitizedTheses._get_item_from_dspace") +def test_workflow_get_item_submissions_from_synced_batch_replacement_not_found( + mock_workflow_get_item_from_dspace, + mock_s3_digitized_theses_dsc, +): + mock_workflow_get_item_from_dspace.side_effect = exceptions.DSpaceClientSearchError( + "Error occurred" + ) + workflow = DigitizedTheses(batch_id="batch-aaa") + results = workflow._get_item_submissions_from_synced_batch() + + assert results == [ + ItemSubmission( + batch_id="batch-aaa", + item_identifier="05588126", + workflow_name="digitized-theses", + dspace_handle=None, + status=ItemSubmissionStatus.CREATE_SKIPPED, + status_details="Error occurred", + ) + ] + + +@patch("dsc.workflows.digitized_theses.workflow.S3Client.files_iter") +def test_workflow_get_item_submissions_from_synced_batch_new(mock_s3client_files_iter): + mock_s3client_files_iter.return_value = [ + "tests/fixtures/digitized-theses/batch-aaa/new-theses/05588126/05588126.xml" + ] + workflow = DigitizedTheses(batch_id="batch-aaa") + results = workflow._get_item_submissions_from_synced_batch() + + assert results == [ + ItemSubmission( + 
batch_id="batch-aaa", + item_identifier="05588126", + workflow_name="digitized-theses", + dspace_handle=None, + status=ItemSubmissionStatus.CREATE_SUCCESS, + status_details="New thesis", + ) + ] + + +@patch("dsc.workflows.digitized_theses.workflow.S3Client.files_iter") +def test_workflow_get_item_submissions_from_synced_batch_skipped( + mock_s3client_files_iter, +): + mock_s3client_files_iter.return_value = [ + "tests/fixtures/digitized-theses/batch-aaa/skipped-theses/05588126/05588126.xml" + ] + workflow = DigitizedTheses(batch_id="batch-aaa") + results = workflow._get_item_submissions_from_synced_batch() + + assert results == [ + ItemSubmission( + batch_id="batch-aaa", + item_identifier="05588126", + workflow_name="digitized-theses", + dspace_handle=None, + status=ItemSubmissionStatus.CREATE_SKIPPED, + status_details="Skipped thesis", + ) + ] + + @patch("dsc.workflows.digitized_theses.workflow.requests") def test_workflow_download_metadata_from_alma( mock_requests, alma_sru_response_single_record, tmp_path @@ -371,7 +469,7 @@ def test_workflow_submit_items_handles_errors( ) -def test_workflow_load_batch_manifest(mock_s3_digitized_theses): +def test_workflow_load_batch_manifest(mock_s3_digitized_theses_dsc): workflow = DigitizedTheses(batch_id="batch-aaa") assert workflow._load_batch_manifest() == defaultdict( dict, @@ -385,7 +483,7 @@ def test_workflow_load_batch_manifest(mock_s3_digitized_theses): @freeze_time("2025-01-01 09:00:00") -def test_workflow_get_transformed_metadata(mock_s3_digitized_theses): +def test_workflow_get_transformed_metadata(mock_s3_digitized_theses_dsc): workflow = DigitizedTheses(batch_id="batch-aaa") item_metadata = workflow._get_transformed_metadata( source_metadata_file="tests/fixtures/digitized-theses/batch-aaa/replacement-theses/05588126/05588126.xml"