Improvements and fixes for issues in the vector search app (#3505)

shanbady · Copilot · web-flow · commit 94957adef1d9 · 2026-06-23T12:55:01.000-04:00
* making resource type a distinct list

* add test

* move tune collections out of hot path

* specify prefetch limit on the outer prefetch

* adjust gauss decay param for custom score boost

* moving get collection outside of loop section

* fix indexed param (payload index path `runs[].level.code` -> `runs[].level[].code`)

* check for collection before tuning

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>

* moving get collection outside of loop section

---------

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
diff --git a/main/settings_celery.py b/main/settings_celery.py
@@ -186,6 +186,10 @@
             "task": "vector_search.tasks.sync_topics",
             "schedule": crontab(minute=0, hour="6,18,23"),  # 2am 2pm and 7pm EST
         },
+        "daily_tune_qdrant_collections": {
+            "task": "vector_search.tasks.tune_qdrant_collections",
+            "schedule": crontab(minute=0, hour=10),  # once per day
+        },
         "weekly_check_missing_embeddings": {
             "task": "vector_search.tasks.embeddings_healthcheck",
             "schedule": crontab(
diff --git a/vector_search/constants.py b/vector_search/constants.py
@@ -68,7 +68,7 @@
     "course_feature": models.PayloadSchemaType.KEYWORD,
     "topics[].name": models.PayloadSchemaType.KEYWORD,
     "ocw_topics": models.PayloadSchemaType.KEYWORD,
-    "runs[].level.code": models.PayloadSchemaType.KEYWORD,
+    "runs[].level[].code": models.PayloadSchemaType.KEYWORD,
     "departments[].department_id": models.PayloadSchemaType.KEYWORD,
     "platform.code": models.PayloadSchemaType.KEYWORD,
     "offered_by.code": models.PayloadSchemaType.KEYWORD,
diff --git a/vector_search/tasks.py b/vector_search/tasks.py
@@ -47,10 +47,22 @@
     vector_point_id,
     vector_point_key,
 )
+from vector_search.utils import (
+    tune_qdrant_collections as tune_qdrant_collections_util,
+)
 
 log = logging.getLogger(__name__)
 
 
+@app.task
+def tune_qdrant_collections():
+    """
+    Tune optimizer settings for Qdrant collections.
+    """
+    log.info("Running Qdrant collection tuning task")
+    tune_qdrant_collections_util()
+
+
 def _replace_with_chain(task, task_signatures):
     """
     Replace a task with a chain only when there is work to do.
@@ -347,7 +359,9 @@ def embed_new_learning_resources(self):
     ).exclude(resource_type=CONTENT_FILE_TYPE)
 
     resource_types = list(
-        new_learning_resources.values_list("resource_type", flat=True)
+        new_learning_resources.order_by("resource_type")
+        .values_list("resource_type", flat=True)
+        .distinct()
     )
     tasks = []
     for resource_type in resource_types:
diff --git a/vector_search/tasks_test.py b/vector_search/tasks_test.py
@@ -168,6 +168,7 @@ def test_embed_new_learning_resources(mocker, mocked_celery):
         embed_new_learning_resources.delay()
     list(mocked_celery.group.call_args[0][0])
 
+    assert generate_embeddings_mock.si.call_count == 1
     embedded_ids = generate_embeddings_mock.si.mock_calls[0].args[0]
     assert sorted(new_resource_ids) == sorted(embedded_ids)
 
diff --git a/vector_search/utils.py b/vector_search/utils.py
@@ -204,6 +204,27 @@ def create_qdrant_collections(force_recreate):
     update_qdrant_indexes()
 
 
+def tune_qdrant_collections():
+    """Tune optimizer settings for Qdrant collections."""
+    if not all([settings.QDRANT_HOST, settings.QDRANT_BASE_COLLECTION_NAME]):
+        logger.warning(
+            "Skipping Qdrant collection tuning: "
+            "QDRANT_HOST and QDRANT_BASE_COLLECTION_NAME must be set"
+        )
+        return
+
+    client = qdrant_client()
+    collections = [
+        RESOURCES_COLLECTION_NAME,
+        CONTENT_FILES_COLLECTION_NAME,
+        TOPICS_COLLECTION_NAME,
+    ]
+    for collection_name in collections:
+        if not client.collection_exists(collection_name=collection_name):
+            continue
+        tune_collection(client, collection_name)
+
+
 def create_qdrant_collection(collection_name, force_recreate):
     """
     Create or recreate a QDrant collection
@@ -244,7 +265,6 @@ def create_qdrant_collection(collection_name, force_recreate):
             ),
             hnsw_config=models.HnswConfigDiff(on_disk=False),
         )
-    tune_collection(client, collection_name)
 
 
 def update_qdrant_indexes():
@@ -260,8 +280,8 @@ def update_qdrant_indexes():
     ]:
         indexes = index[0]
         collection_name = index[1]
+        collection = client.get_collection(collection_name=collection_name)
         for index_field in indexes:
-            collection = client.get_collection(collection_name=collection_name)
             if (
                 index_field not in collection.payload_schema
                 or indexes[index_field]
@@ -1317,9 +1337,9 @@ def custom_score_formula(collection_name: str) -> list[models.MultExpression]:
                         models.GaussDecayExpression(
                             gauss_decay=models.DecayParamsExpression(
                                 x="$score",  # decay over the relevance score itself
-                                target=1.0,  # cosine "perfect match" — full boost
+                                target=0.4,  # full boost at this target
                                 scale=0.2,
-                                midpoint=0.5,
+                                midpoint=0.2,
                             )
                         ),
                     ]
diff --git a/vector_search/views.py b/vector_search/views.py
@@ -187,6 +187,7 @@ async def _build_search_params(  # noqa: PLR0913
             prefetch_params = [
                 models.Prefetch(
                     query=custom_formula_query,
+                    limit=prefetch_limit,
                     prefetch=[
                         models.Prefetch(
                             filter=search_filter,
@@ -198,6 +199,7 @@ async def _build_search_params(  # noqa: PLR0913
                 ),
                 models.Prefetch(
                     query=custom_formula_query,
+                    limit=prefetch_limit,
                     prefetch=[
                         models.Prefetch(
                             filter=search_filter,