Skip to content

Commit 041edaf

Browse files
shanbady and Copilot authored
Facet counts and aggregations for Vector search (#3210)
* adding aggregation generation method
* adding aggregations to response
* adding some optimizations and aggregations to response
* regen spec
* add published back to learning resources serializer
* spec update
* show facets on frontend
* fixing aggregation counts
* fix test
* fix typechecks
* remove unused test
* adding tests for aggregations
* Update vector_search/serializers.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* Update vector_search/views.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* fixing 'with_payload' for group by
* switch to safe getter
* correct comment about dropping admin params
* switching collection param map to constant
* adding aggregation params for contentfiles
* regenerate spec
* adding fix for hybrid search offset
* fix tests for new expected response
* fix contentfile metadata
* make hits and get_results same for both serializers
* fixing skip with relation to offsets
* tune prefetch multiplier
* gather count with hits
* adding fix for fields returned by contentfile endpoint
* default hits to list
* Update vector_search/utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* restore and update js test for vector hybrid search facet results
* move published to resource specific serializer field
* update spec
---------
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent c24763c commit 041edaf

11 files changed

Lines changed: 731 additions & 99 deletions

File tree

frontends/api/src/generated/v0/api.ts

Lines changed: 105 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

frontends/main/src/app-pages/SearchPage/SearchPage.test.tsx

Lines changed: 57 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -152,50 +152,6 @@ describe("SearchPage", () => {
152152
},
153153
)
154154

155-
test("Vector Hybrid Search passes correct params and hides count", async () => {
156-
setMockApiResponses({
157-
search: {
158-
count: 700,
159-
metadata: {
160-
aggregations: {
161-
resource_type_group: [{ key: "course", doc_count: 100 }],
162-
},
163-
suggestions: [],
164-
},
165-
results: factories.learningResources.resources({ count: 5 }).results,
166-
},
167-
})
168-
169-
// Authenticate as path editor (admin)
170-
setMockResponse.get(urls.userMe.get(), {
171-
is_learning_path_editor: true,
172-
is_authenticated: true,
173-
})
174-
175-
renderWithProviders(<SearchPage />, { url: "?vector_search=true&q=test" })
176-
177-
await waitFor(() => {
178-
const call = makeRequest.mock.calls.find(([_method, url]) => {
179-
return url.includes(urls.search.vectorResources())
180-
})
181-
expect(call).toBeDefined()
182-
})
183-
184-
const call = makeRequest.mock.calls.find(([_method, url]) =>
185-
url.includes(urls.search.vectorResources()),
186-
)
187-
invariant(call)
188-
const fullUrl = new URL(call[1], "http://mit.edu")
189-
const apiSearchParams = fullUrl.searchParams
190-
191-
expect(apiSearchParams.get("hybrid_search")).toBe("true")
192-
expect(apiSearchParams.get("q")).toBe("test")
193-
194-
// Ensure count is hidden
195-
const hideCountText = screen.queryByText("700 results")
196-
expect(hideCountText).toBeNull()
197-
})
198-
199155
test("Toggling facets", async () => {
200156
setMockApiResponses({
201157
search: {
@@ -1060,4 +1016,61 @@ describe("UniversalAIBanner", () => {
10601016
expect(screen.queryByText("Universal AI")).not.toBeInTheDocument()
10611017
expect(screen.queryByText("New on MIT Learn")).not.toBeInTheDocument()
10621018
})
1019+
1020+
test("Vector Hybrid Search passes correct params and renders expected count/facets", async () => {
1021+
setMockApiResponses({
1022+
search: {
1023+
count: 700,
1024+
metadata: {
1025+
aggregations: {
1026+
resource_type_group: [{ key: "course", doc_count: 100 }],
1027+
},
1028+
suggestions: [],
1029+
},
1030+
results: factories.learningResources.resources({ count: 5 }).results,
1031+
},
1032+
})
1033+
1034+
// Authenticate as path editor (admin)
1035+
setMockResponse.get(urls.userMe.get(), {
1036+
is_learning_path_editor: true,
1037+
is_authenticated: true,
1038+
})
1039+
1040+
renderWithProviders(<SearchPage />, { url: "?vector_search=true&q=test" })
1041+
1042+
await waitFor(() => {
1043+
const call = makeRequest.mock.calls.find(([_method, url]) => {
1044+
return url.includes(urls.search.vectorResources())
1045+
})
1046+
expect(call).toBeDefined()
1047+
})
1048+
1049+
const call = makeRequest.mock.calls.find(([_method, url]) =>
1050+
url.includes(urls.search.vectorResources()),
1051+
)
1052+
invariant(call)
1053+
const fullUrl = new URL(call[1], "http://mit.edu")
1054+
const apiSearchParams = fullUrl.searchParams
1055+
1056+
expect(apiSearchParams.get("hybrid_search")).toBe("true")
1057+
expect(apiSearchParams.get("q")).toBe("test")
1058+
1059+
// Ensure count is visible
1060+
const countText = await screen.findByText("700 results")
1061+
expect(countText).toBeVisible()
1062+
1063+
// Ensure facets are visible
1064+
await waitFor(() => {
1065+
const tabs = screen.getAllByRole("tab")
1066+
expect(
1067+
tabs.map((tab) => (tab.textContent || "").replace(/\s/g, "")),
1068+
).toEqual([
1069+
"All(100)",
1070+
"Courses(100)",
1071+
"Programs(0)",
1072+
"LearningMaterials(0)",
1073+
])
1074+
})
1075+
})
10631076
})

frontends/main/src/page-components/SearchDisplay/SearchDisplay.tsx

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -516,8 +516,8 @@ const searchModeDropdownOptions = Object.entries(
516516

517517
/**
518518
* Extracts only the fields supported by the vector search API from a broader
519-
* search params object, dropping admin-only params (e.g., aggregations,
520-
* content_file_score_weight) that the vector endpoint does not accept.
519+
* search params object, dropping admin-only params (e.g., content_file_score_weight)
520+
* that the vector endpoint does not accept.
521521
*
522522
* The `as` casts for enum arrays are safe because the v0 and v1 generated
523523
* clients define separate (but structurally identical) enum types for the same
@@ -526,6 +526,7 @@ const searchModeDropdownOptions = Object.entries(
526526
const toVectorSearchParams = (
527527
params: ReturnType<typeof getSearchParams>,
528528
): VectorSearchRequest => ({
529+
aggregations: params.aggregations as VectorSearchRequest["aggregations"],
529530
certification: params.certification,
530531
certification_type:
531532
params.certification_type as VectorSearchRequest["certification_type"],
@@ -625,10 +626,13 @@ const SearchDisplay: React.FC<SearchDisplayProps> = ({
625626
const wantsVectorSearch = searchParams.get("vector_search") === "true"
626627
const isVectorSearch = wantsVectorSearch && user?.is_learning_path_editor
627628

629+
const queryOptions = isVectorSearch
630+
? learningResourceQueries.vectorSearch(toVectorSearchParams(allParams))
631+
: learningResourceQueries.search(allParams as LRSearchRequest)
632+
633+
// @ts-expect-error Typescript has trouble unifying the different query key types
628634
const { data, isLoading, isFetching } = useQuery({
629-
...(isVectorSearch
630-
? learningResourceQueries.vectorSearch(toVectorSearchParams(allParams))
631-
: learningResourceQueries.search(allParams as LRSearchRequest)),
635+
...queryOptions,
632636
enabled: !wantsVectorSearch || !isUserLoading,
633637
placeholderData: keepPreviousData,
634638
select: (timedData: {
@@ -985,9 +989,7 @@ const SearchDisplay: React.FC<SearchDisplayProps> = ({
985989
* the count when data is loaded even if count is same as previous
986990
* count.
987991
*/}
988-
{isFetching || isLoading || isVectorSearch
989-
? ""
990-
: `${data?.count} results`}
992+
{isFetching || isLoading ? "" : `${data?.count} results`}
991993
</VisuallyHidden>
992994
<UniversalAIBanner searchParams={searchParams} />
993995
<Stack direction="row" justifyContent="space-between">

main/settings.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -822,10 +822,10 @@ def get_all_config_keys():
822822
QDRANT_CLIENT_TIMEOUT = get_int(name="QDRANT_CLIENT_TIMEOUT", default=10)
823823

824824
VECTOR_HYBRID_SEARCH_PREFETCH_MULTIPLIER = get_int(
825-
name="VECTOR_HYBRID_SEARCH_PREFETCH_MULTIPLIER", default=20
825+
name="VECTOR_HYBRID_SEARCH_PREFETCH_MULTIPLIER", default=5
826826
)
827827
VECTOR_HYBRID_SEARCH_PREFETCH_MAX_LIMIT = get_int(
828-
name="VECTOR_HYBRID_SEARCH_PREFETCH_MAX_LIMIT", default=10000
828+
name="VECTOR_HYBRID_SEARCH_PREFETCH_MAX_LIMIT", default=500
829829
)
830830
# toggle to use requests (default for local) or webdriver which renders js elements
831831
EMBEDDINGS_EXTERNAL_FETCH_USE_WEBDRIVER = get_bool(

openapi/specs/v0.yaml

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -827,6 +827,58 @@ paths:
827827
description: Vector Search for content
828828
summary: Content File Vector Search
829829
parameters:
830+
- in: query
831+
name: aggregations
832+
schema:
833+
type: array
834+
items:
835+
enum:
836+
- key
837+
- course_number
838+
- platform
839+
- offered_by
840+
- file_extension
841+
- content_feature_type
842+
- run_readable_id
843+
- resource_readable_id
844+
- run_title
845+
- edx_module_id
846+
- content_type
847+
- description
848+
- title
849+
- url
850+
- file_type
851+
- summary
852+
- flashcards
853+
- checksum
854+
type: string
855+
description: |-
856+
* `key` - Key
857+
* `course_number` - Course Number
858+
* `platform` - Platform
859+
* `offered_by` - Offered By
860+
* `file_extension` - File Extension
861+
* `content_feature_type` - Content Feature Type
862+
* `run_readable_id` - Run Readable Id
863+
* `resource_readable_id` - Resource Readable Id
864+
* `run_title` - Run Title
865+
* `edx_module_id` - Edx Module Id
866+
* `content_type` - Content Type
867+
* `description` - Description
868+
* `title` - Title
869+
* `url` - Url
870+
* `file_type` - File Type
871+
* `summary` - Summary
872+
* `flashcards` - Flashcards
873+
* `checksum` - Checksum
874+
description: "aggregations for facet counts \n\n* `key` - Key\n\
875+
* `course_number` - Course Number\n* `platform` - Platform\n* `offered_by`\
876+
\ - Offered By\n* `file_extension` - File Extension\n* `content_feature_type`\
877+
\ - Content Feature Type\n* `run_readable_id` - Run Readable Id\n* `resource_readable_id`\
878+
\ - Resource Readable Id\n* `run_title` - Run Title\n* `edx_module_id` -\
879+
\ Edx Module Id\n* `content_type` - Content Type\n* `description` - Description\n\
880+
* `title` - Title\n* `url` - Url\n* `file_type` - File Type\n* `summary`\
881+
\ - Summary\n* `flashcards` - Flashcards\n* `checksum` - Checksum"
830882
- in: query
831883
name: collection_name
832884
schema:
@@ -961,6 +1013,61 @@ paths:
9611013
description: Vector Search for learning resources
9621014
summary: Vector Search
9631015
parameters:
1016+
- in: query
1017+
name: aggregations
1018+
schema:
1019+
type: array
1020+
items:
1021+
enum:
1022+
- readable_id
1023+
- resource_type
1024+
- certification
1025+
- certification_type
1026+
- professional
1027+
- free
1028+
- course_feature
1029+
- topic
1030+
- ocw_topic
1031+
- level
1032+
- department
1033+
- platform
1034+
- offered_by
1035+
- delivery
1036+
- title
1037+
- url
1038+
- resource_type_group
1039+
- resource_category
1040+
- published
1041+
type: string
1042+
description: |-
1043+
* `readable_id` - Readable Id
1044+
* `resource_type` - Resource Type
1045+
* `certification` - Certification
1046+
* `certification_type` - Certification Type
1047+
* `professional` - Professional
1048+
* `free` - Free
1049+
* `course_feature` - Course Feature
1050+
* `topic` - Topic
1051+
* `ocw_topic` - Ocw Topic
1052+
* `level` - Level
1053+
* `department` - Department
1054+
* `platform` - Platform
1055+
* `offered_by` - Offered By
1056+
* `delivery` - Delivery
1057+
* `title` - Title
1058+
* `url` - Url
1059+
* `resource_type_group` - Resource Type Group
1060+
* `resource_category` - Resource Category
1061+
* `published` - Published
1062+
description: "aggregations for facet counts \n\n* `readable_id`\
1063+
\ - Readable Id\n* `resource_type` - Resource Type\n* `certification` -\
1064+
\ Certification\n* `certification_type` - Certification Type\n* `professional`\
1065+
\ - Professional\n* `free` - Free\n* `course_feature` - Course Feature\n\
1066+
* `topic` - Topic\n* `ocw_topic` - Ocw Topic\n* `level` - Level\n* `department`\
1067+
\ - Department\n* `platform` - Platform\n* `offered_by` - Offered By\n*\
1068+
\ `delivery` - Delivery\n* `title` - Title\n* `url` - Url\n* `resource_type_group`\
1069+
\ - Resource Type Group\n* `resource_category` - Resource Category\n* `published`\
1070+
\ - Published"
9641071
- in: query
9651072
name: certification
9661073
schema:
@@ -1255,6 +1362,13 @@ paths:
12551362
schema:
12561363
type: boolean
12571364
nullable: true
1365+
- in: query
1366+
name: published
1367+
schema:
1368+
type: boolean
1369+
default: true
1370+
description: If the resource is published. We default to True unless passed
1371+
in
12581372
- in: query
12591373
name: q
12601374
schema:

vector_search/constants.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@
4545
"title": "title",
4646
"url": "url",
4747
"resource_type_group": "resource_type_group",
48+
"resource_category": "resource_category",
49+
"published": "published",
4850
}
4951

5052

@@ -71,6 +73,7 @@
7173
"url": models.PayloadSchemaType.KEYWORD,
7274
"title": models.PayloadSchemaType.KEYWORD,
7375
"resource_type_group": models.PayloadSchemaType.KEYWORD,
76+
"resource_category": models.PayloadSchemaType.KEYWORD,
7477
}
7578

7679
"""
@@ -92,3 +95,14 @@
9295
QDRANT_TOPIC_INDEXES = {
9396
"name": models.PayloadSchemaType.KEYWORD,
9497
}
98+
99+
100+
CONTENT_FILES_RETRIEVE_PAYLOAD = True
101+
RESOURCES_RETRIEVE_PAYLOAD = ["readable_id"]
102+
103+
104+
COLLECTION_PARAM_MAP = {
105+
RESOURCES_COLLECTION_NAME: QDRANT_RESOURCE_PARAM_MAP,
106+
TOPICS_COLLECTION_NAME: QDRANT_TOPICS_PARAM_MAP,
107+
CONTENT_FILES_COLLECTION_NAME: QDRANT_CONTENT_FILE_PARAM_MAP,
108+
}

0 commit comments

Comments
 (0)