Skip to content

Commit 57d7a30

Browse files
committed
More text_all search tweaks
- On reprovision, run the migration first, then delete+insert, so that the correct schema is ensured.
- Remove the fuzzy operator, which doesn't play well with multiple tokens in a query.
- Don't split words on numbers, so that unique words like `a56bd3e` used in our tests are retained.
1 parent 5180d8e commit 57d7a30

5 files changed

Lines changed: 31 additions & 36 deletions

File tree

components/renku_data_services/search/reprovision.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,9 @@ def log_counter(c: int) -> None:
102102
logger.info(f"Starting reprovisioning with ID {reprovisioning.id}")
103103
started = datetime.now()
104104
await self._search_updates_repo.clear_all()
105+
if migrate_solr_schema:
106+
await migrator.migrate(entity_schema.all_migrations)
107+
105108
async with DefaultSolrClient(self._solr_config) as client:
106109
res = await client.delete("_type:*")
107110
if res.status_code != 200:
@@ -119,9 +122,6 @@ def log_counter(c: int) -> None:
119122
exc_info=False,
120123
)
121124

122-
if migrate_solr_schema:
123-
await migrator.migrate(entity_schema.all_migrations)
124-
125125
all_users = self._user_repo.get_all_users(requested_by=admin)
126126
counter = await self.__update_entities(all_users, "user", started, counter, log_counter)
127127
logger.info(f"Done adding user entities to search_updates table. Record count: {counter}.")

components/renku_data_services/search/solr_token.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ def public_only() -> SolrToken:
186186

187187
def content_all(text: str) -> SolrToken:
188188
"""Search the content_all field."""
189-
terms_str = "(" + __escape_query(text) + "~)"
189+
terms_str = "(" + __escape_query(text) + ")"
190190
return SolrToken(f"{Fields.content_all}:{terms_str}")
191191

192192

components/renku_data_services/solr/solr_schema.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,13 @@ class Filters:
8383
flattenGraph = Filter(name="flattenGraph")
8484
word = Filter(
8585
name="wordDelimiterGraph",
86-
settings={"splitOnCaseChange": "1", "catenateNumbers": "1", "catenateAll": "1", "preserveOriginal": "1"},
86+
settings={
87+
"splitOnCaseChange": "1",
88+
"catenateNumbers": "1",
89+
"catenateAll": "1",
90+
"preserveOriginal": "1",
91+
"splitOnNumerics": "0",
92+
},
8793
)
8894

8995
@classmethod

test/bases/renku_data_services/data_api/test_search.py

Lines changed: 16 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -236,31 +236,6 @@ async def test_projects(
236236
assert_search_result(result, [p1, p3, p2], check_order=True)
237237

238238

239-
# TODO: figure out how to run search tests fully parallel
240-
@pytest.mark.xdist_group("search")
241-
@pytest.mark.asyncio
242-
async def test_distance(
243-
search_reprovision: SearchReprovisionCall,
244-
create_project_model: CreateProjectCall,
245-
search_query: SearchQueryCall,
246-
sanic_client_with_solr: SanicASGITestClient,
247-
app_manager_instance: TestDependencyManager,
248-
) -> None:
249-
"""Search should be lenient to simple typos, distance=2."""
250-
p1 = await create_project_model(
251-
sanic_client_with_solr, "Project Bike Z", visibility="public", description="a bike with a bike"
252-
)
253-
await search_reprovision(app_manager_instance)
254-
255-
result = await search_query(sanic_client_with_solr, "mikin type:project")
256-
assert result.items == []
257-
258-
result = await search_query(sanic_client_with_solr, "mike type:project")
259-
assert result.items is not None
260-
assert len(result.items) == 1
261-
assert __entity_id(result.items[0]) == p1.id
262-
263-
264239
# TODO: figure out how to run search tests fully parallel
265240
@pytest.mark.xdist_group("search")
266241
@pytest.mark.asyncio
@@ -303,15 +278,28 @@ async def test_search_project_with_dash(
303278
sanic_client_with_solr: SanicASGITestClient,
304279
app_manager_instance: TestDependencyManager,
305280
) -> None:
306-
p1 = await create_project_model(sanic_client_with_solr, "project-with-dash")
281+
p1 = await create_project_model(sanic_client_with_solr, "project-with-dash-a0ie84eb")
307282
await search_reprovision(app_manager_instance)
308283

284+
result = await search_query(sanic_client_with_solr, "project-with-dash-a0ie84eb", regular_user)
285+
assert_search_result(result, [p1], check_order=False)
309286
result = await search_query(sanic_client_with_solr, "project-with-dash", regular_user)
310287
assert_search_result(result, [p1], check_order=False)
311-
result = await search_query(sanic_client_with_solr, "project-with-hash", regular_user)
288+
result = await search_query(sanic_client_with_solr, "project-with", regular_user)
289+
assert_search_result(result, [p1], check_order=False)
290+
291+
result = await search_query(sanic_client_with_solr, "project", regular_user)
292+
assert_search_result(result, [p1], check_order=False)
293+
result = await search_query(sanic_client_with_solr, "with", regular_user)
294+
assert_search_result(result, [p1], check_order=False)
295+
result = await search_query(sanic_client_with_solr, "dash", regular_user)
296+
assert_search_result(result, [p1], check_order=False)
297+
result = await search_query(sanic_client_with_solr, "a0ie84eb", regular_user)
312298
assert_search_result(result, [p1], check_order=False)
313299

314300

301+
302+
315303
# TODO: figure out how to run search tests fully parallel
316304
@pytest.mark.xdist_group("search")
317305
@pytest.mark.asyncio
@@ -344,6 +332,7 @@ async def test_search_domain_and_email(
344332
assert_search_result(result, [p1], check_order=False)
345333

346334

335+
347336
def __entity_id(e: SearchEntity) -> str:
348337
match e.root:
349338
case SearchProject() as p:

test/components/renku_data_services/search/test_solr_token.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ def test_created_by_exists() -> None:
130130

131131

132132
def test_content_all() -> None:
133-
assert st.content_all("abc") == "content_all:(abc~)"
134-
assert st.content_all("a+b+c") == "content_all:(a\\+b\\+c~)"
135-
assert st.content_all("ab cd") == "content_all:(ab\\ cd~)"
136-
assert st.content_all("ab    cd") == "content_all:(ab\\ \\ \\ \\ cd~)"
133+
assert st.content_all("abc") == "content_all:(abc)"
134+
assert st.content_all("a+b+c") == "content_all:(a\\+b\\+c)"
135+
assert st.content_all("ab cd") == "content_all:(ab\\ cd)"
136+
assert st.content_all("ab    cd") == "content_all:(ab\\ \\ \\ \\ cd)"

0 commit comments

Comments
 (0)