Skip to content

Commit ef5ee3e

Browse files
authored
remove duplicate resource task (#3436)
1 parent 0f2fc14 commit ef5ee3e

3 files changed

Lines changed: 1 addition & 69 deletions

File tree

learning_resources/tasks.py

Lines changed: 1 addition & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from celery.exceptions import Ignore
1212
from django.conf import settings
1313
from django.db import OperationalError
14-
from django.db.models import Count, Q
14+
from django.db.models import Q
1515
from django.utils import timezone
1616

1717
from learning_resources.constants import LearningResourceType
@@ -51,7 +51,6 @@
5151
)
5252
from learning_resources_search.constants import (
5353
CONTENT_FILE_TYPE,
54-
COURSE_TYPE,
5554
SEARCH_CONN_EXCEPTIONS,
5655
)
5756
from learning_resources_search.exceptions import RetryError
@@ -65,36 +64,6 @@
6564
CLEANUP_RETRY_EXCEPTIONS = (*SEARCH_CONN_EXCEPTIONS, OperationalError)
6665

6766

68-
@app.task(bind=True)
69-
def remove_duplicate_resources(self):
70-
"""Remove duplicate unpublished resources"""
71-
from vector_search.tasks import generate_embeddings
72-
73-
duplicates = (
74-
LearningResource.objects.values("readable_id")
75-
.annotate(count_id=Count("id"))
76-
.filter(count_id__gt=1)
77-
)
78-
embed_tasks = []
79-
for duplicate in duplicates:
80-
unpublished_resources = LearningResource.objects.filter(
81-
readable_id=duplicate["readable_id"],
82-
published=False,
83-
).values_list("id", flat=True)
84-
published_resources = list(
85-
LearningResource.objects.filter(
86-
readable_id=duplicate["readable_id"],
87-
published=True,
88-
).values_list("id", flat=True)
89-
)
90-
# keep the most recently created resource, delete the rest
91-
LearningResource.objects.filter(id__in=unpublished_resources).delete()
92-
embed_tasks.append(
93-
generate_embeddings.si(published_resources, COURSE_TYPE, overwrite=True)
94-
)
95-
self.replace(celery.chain(*embed_tasks))
96-
97-
9867
@app.task
9968
def update_next_start_date_and_prices():
10069
"""Update expired next start dates and prices"""

learning_resources/tasks_test.py

Lines changed: 0 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from learning_resources.factories import (
1818
ContentFileFactory,
1919
LearningResourceFactory,
20-
LearningResourcePlatformFactory,
2120
LearningResourceRunFactory,
2221
)
2322
from learning_resources.models import ContentFile, LearningResource
@@ -27,7 +26,6 @@
2726
get_youtube_data,
2827
get_youtube_transcripts,
2928
marketing_page_for_resources,
30-
remove_duplicate_resources,
3129
scrape_marketing_pages,
3230
sync_canvas_courses,
3331
update_next_start_date_and_prices,
@@ -839,37 +837,6 @@ def test_sync_canvas_courses(settings, mocker, django_assert_num_queries, canvas
839837
assert mock_ingest_course.call_count == 2
840838

841839

842-
def test_remove_duplicate_resources(mocker, mocked_celery):
843-
"""
844-
Test that remove_duplicate_resources removes duplicate unpublished resources
845-
while keeping the published resource.
846-
"""
847-
duplicate_id = "duplicate_id"
848-
849-
for platform_type in [PlatformType.edx, PlatformType.xpro, PlatformType.youtube]:
850-
LearningResourceFactory.create(
851-
readable_id=duplicate_id,
852-
published=False,
853-
platform=LearningResourcePlatformFactory.create(code=platform_type.name),
854-
)
855-
856-
published_reasource = LearningResourceFactory.create(
857-
readable_id=duplicate_id,
858-
published=True,
859-
platform=LearningResourcePlatformFactory.create(
860-
code=platform_type.mitxonline.name
861-
),
862-
)
863-
generate_embeddings_mock = mocker.patch(
864-
"vector_search.tasks.generate_embeddings", autospec=True
865-
)
866-
assert LearningResource.objects.filter(readable_id=duplicate_id).count() == 4
867-
with pytest.raises(mocked_celery.replace_exception_class):
868-
remove_duplicate_resources()
869-
assert generate_embeddings_mock.mock_calls[0].args[0] == [published_reasource.id]
870-
assert LearningResource.objects.filter(readable_id=duplicate_id).count() == 1
871-
872-
873840
@pytest.mark.parametrize(
874841
("etl_source", "archive_path", "overwrite"),
875842
[

main/settings_celery.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -169,10 +169,6 @@
169169
"SCRAPE_MARKETING_PAGES_SCHEDULE_SECONDS", 60 * 60 * 12
170170
), # default is every 12 hours
171171
},
172-
"remove-duplicate-courses-every-6-hours": {
173-
"task": "learning_resources.tasks.remove_duplicate_resources",
174-
"schedule": crontab(minute=0, hour=9), # 5:00am EST
175-
},
176172
"daily_embed_new_learning_resources": {
177173
"task": "vector_search.tasks.embed_new_learning_resources",
178174
"schedule": get_int(

0 commit comments

Comments
 (0)