Skip to content

Commit 6ecd18c

Browse files
committed
Fix solr text field config
WIP: re-indexing after a migration on startup no longer works
1 parent 1a114d7 commit 6ecd18c

5 files changed

Lines changed: 69 additions & 8 deletions

File tree

bases/renku_data_services/data_api/main.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -176,6 +176,7 @@ async def setup_rclone_validator(app: Sanic) -> None:
176176
@app.after_server_start
177177
async def ready(app: Sanic) -> None:
178178
"""Application ready event handler."""
179+
logger.info(f">>>>>> SHOULD RUN RE-INDEX NOW. {getattr(app.ctx, "solr_reindex", False)}")
179180
if getattr(app.ctx, "solr_reindex", False):
180181
logger.info("Creating solr reindex task, as required by migrations.")
181182
app.add_task(solr_reindex(dependency_manager.search_reprovisioning))

components/renku_data_services/search/solr_token.py

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
"""Model for creating solr lucene queries."""
22

3-
import re
43
from collections.abc import Iterable
54
from datetime import UTC, datetime
65
from typing import NewType
@@ -186,9 +185,8 @@ def public_only() -> SolrToken:
186185

187186

188187
def content_all(text: str) -> SolrToken:
189-
"""Search the content_all field with fuzzy searching each term."""
190-
terms: list[SolrToken] = list(map(lambda s: SolrToken(__escape_query(s)), re.split("\\s+", text)))
191-
terms_str = "(" + " ".join(terms) + ")"
188+
"""Search the content_all field."""
189+
terms_str = "(" + __escape_query(text) + "~)"
192190
return SolrToken(f"{Fields.content_all}:{terms_str}")
193191

194192

components/renku_data_services/solr/entity_schema.py

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -60,21 +60,21 @@ class Analyzers:
6060
"""A collection of analyzers."""
6161

6262
text_index: Final[Analyzer] = Analyzer(
63-
tokenizer=Tokenizers.uax29UrlEmail,
63+
tokenizer=Tokenizers.whitespace,
6464
filters=[
65+
Filters.word,
66+
Filters.flattenGraph,
6567
Filters.lowercase,
66-
Filters.stop,
6768
Filters.english_minimal_stem,
6869
Filters.ascii_folding,
6970
Filters.edgeNgram(2, 8, True),
7071
],
7172
)
7273

7374
text_query: Final[Analyzer] = Analyzer(
74-
tokenizer=Tokenizers.uax29UrlEmail,
75+
tokenizer=Tokenizers.whitespace,
7576
filters=[
7677
Filters.lowercase,
77-
Filters.stop,
7878
Filters.english_minimal_stem,
7979
Filters.ascii_folding,
8080
],
@@ -189,4 +189,9 @@ class FieldTypes:
189189
],
190190
requires_reindex=True,
191191
),
192+
SchemaMigration(
193+
version=15,
194+
commands=[ReplaceCommand(FieldTypes.text), ReplaceCommand(FieldTypes.text_all)],
195+
requires_reindex=True,
196+
),
192197
]

components/renku_data_services/solr/solr_schema.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,11 @@ class Filters:
8080
english_minimal_stem = Filter(name="englishMinimalStem")
8181
classic = Filter(name="classic")
8282
ngram = Filter(name="nGram")
83+
flattenGraph = Filter(name="flattenGraph")
84+
word = Filter(
85+
name="wordDelimiterGraph",
86+
settings={"splitOnCaseChange": "1", "catenateNumbers": "1", "catenateAll": "1", "preserveOriginal": "1"},
87+
)
8388

8489
@classmethod
8590
def edgeNgram(cls, min_gram_size: int = 3, maxGramSize: int = 6, preserve_original: bool = True) -> Filter:

test/bases/renku_data_services/data_api/test_search.py

Lines changed: 52 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -292,6 +292,58 @@ async def test_search_by_entity_type(
292292
assert item.root.type == field.value
293293

294294

295+
# TODO: figure out how to run search tests fully parallel
296+
@pytest.mark.xdist_group("search")
297+
@pytest.mark.asyncio
298+
async def test_search_project_with_dash(
299+
create_project_model: CreateProjectCall,
300+
regular_user: UserInfo,
301+
search_query: SearchQueryCall,
302+
search_reprovision: SearchReprovisionCall,
303+
sanic_client_with_solr: SanicASGITestClient,
304+
app_manager_instance: TestDependencyManager,
305+
) -> None:
306+
p1 = await create_project_model(sanic_client_with_solr, "project-with-dash")
307+
await search_reprovision(app_manager_instance)
308+
309+
result = await search_query(sanic_client_with_solr, "project-with-dash", regular_user)
310+
assert_search_result(result, [p1], check_order=False)
311+
result = await search_query(sanic_client_with_solr, "project-with-hash", regular_user)
312+
assert_search_result(result, [p1], check_order=False)
313+
314+
315+
# TODO: figure out how to run search tests fully parallel
316+
@pytest.mark.xdist_group("search")
317+
@pytest.mark.asyncio
318+
async def test_search_domain_and_email(
319+
create_project_model: CreateProjectCall,
320+
regular_user: UserInfo,
321+
search_query: SearchQueryCall,
322+
search_reprovision: SearchReprovisionCall,
323+
sanic_client_with_solr: SanicASGITestClient,
324+
app_manager_instance: TestDependencyManager,
325+
) -> None:
326+
p1 = await create_project_model(
327+
sanic_client_with_solr,
328+
"bob's project",
329+
description="Bob's e-mail address is bob@microsoft.com - keep it safe. Check the homepage at https://bob.me/about!",
330+
)
331+
await search_reprovision(app_manager_instance)
332+
333+
result = await search_query(sanic_client_with_solr, "bob@microsoft.com", regular_user)
334+
assert_search_result(result, [p1], check_order=False)
335+
result = await search_query(sanic_client_with_solr, "https://bob.me/about", regular_user)
336+
assert_search_result(result, [p1], check_order=False)
337+
result = await search_query(sanic_client_with_solr, "bob.me", regular_user)
338+
assert_search_result(result, [p1], check_order=False)
339+
result = await search_query(sanic_client_with_solr, "e-mail", regular_user)
340+
assert_search_result(result, [p1], check_order=False)
341+
result = await search_query(sanic_client_with_solr, "email", regular_user)
342+
assert_search_result(result, [p1], check_order=False)
343+
result = await search_query(sanic_client_with_solr, "bob's project", regular_user)
344+
assert_search_result(result, [p1], check_order=False)
345+
346+
295347
def __entity_id(e: SearchEntity) -> str:
296348
match e.root:
297349
case SearchProject() as p:

0 commit comments

Comments
 (0)