Skip to content

Commit 55d631c

Browse files
committed
more works on word threads
1 parent d742dd6 commit 55d631c

8 files changed

Lines changed: 46 additions & 12 deletions

File tree

app/src/components/SearchTips.vue

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,10 @@
9191
{{ $t('searchTips.multipleAttributes') }}
9292
<code class="code-block" aria-label="code example">charles:pos:PROPN:ner:PERS</code>.
9393
</li>
94+
<li>
95+
{{ $t('searchTips.multiWordLemma') }}
96+
<code class="code-block" aria-label="code example">lemma:parce_que</code>.
97+
</li>
9498
</ol>
9599
</section>
96100

app/src/locales/en.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,7 @@
379379
"wordAttributeExample": "syntax. For example:",
380380
"combineLemma": "You can combine lemma and attribute searches:",
381381
"multipleAttributes": "You can use multiple attributes:",
382+
"multiWordLemma": "Multi-word lemmas (e.g. the French compound \"parce que\") are stored with whitespace collapsed to underscores. Query them with an underscore instead of a space:",
382383
"metadataSearches": "Metadata Searches",
383384
"metadataDescription": "Metadata searches work with the same conventions as word searches. Regular expressions are also supported for pattern matching:",
384385
"metadataPlainTokens": "Plain tokens are interpreted as tokens",

app/src/locales/es.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,7 @@
379379
"wordAttributeExample": "sintaxis. Por ejemplo:",
380380
"combineLemma": "Se puede combinar la búsqueda de vocablo y atributo:",
381381
"multipleAttributes": "Se pueden emplear múltiples atributos:",
382+
"multiWordLemma": "Los lemas de varias palabras (p. ej. el compuesto francés «parce que») se almacenan con los espacios reemplazados por subrayados. Búsquelos con un subrayado en lugar de un espacio:",
382383
"metadataSearches": "Búsquedas de Metadatos",
383384
"metadataDescription": "Las búsquedas de metadatos emplean las mismas convenciones que las de palabras. También soporta el uso de expresiones regulares para concordar patrones:",
384385
"metadataPlainTokens": "Las muestras sencillas se interpretan como muestras",

app/src/locales/fr.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,7 @@
379379
"wordAttributeExample": "syntaxe. Par exemple :",
380380
"combineLemma": "Vous pouvez combiner les recherches de lemme et d'attribut :",
381381
"multipleAttributes": "Vous pouvez utiliser plusieurs attributs :",
382+
"multiWordLemma": "Les lemmes composés de plusieurs mots (par exemple « parce que ») sont stockés avec les espaces remplacés par des soulignements. Utilisez un souligné à la place de l'espace :",
382383
"metadataSearches": "Recherches de métadonnées",
383384
"metadataDescription": "Les recherches de métadonnées fonctionnent avec les mêmes conventions que les recherches de mots. Les expressions régulières sont également supportées pour la recherche par motifs :",
384385
"metadataPlainTokens": "Les tokens simples sont interprétés comme des tokens",

docs/query_syntax.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ When specifying a query, one can select a query method to constrain the relation
2323
#### Lemma and word attribute Searches
2424
If your text collection contains lemma and/or word attribute information (usually in `<w>` tags), then PhiloLogic allows you to query words based on lemma and/or word attribute value.
2525
1. For simple lemma searching, just prepend the lemma with `lemma:` such as in `lemma:have`. Regexes are permitted on the token portion of the search, e.g. `lemma:constitut.*`.
26+
- Multi-word lemmas (e.g. the French compound `parce que`) are stored with whitespace collapsed to underscores. Query them as `lemma:parce_que`, not `lemma:parce que`.
2627
2. For word attribute searching, use the `word:attribute:attribute` syntax such as in `love:pos:NOUN`. Regexes are permitted on the token portion of the search, e.g. `lov.*:pos:NOUN`.
2728
3. You can combine lemma searching with word attribute filtering. Just prepend your token with `lemma:` such as in `lemma:love:pos:NOUN`. Regexes are permitted on the token portion of the search, e.g. `lemma:lov.*:pos:NOUN`.
2829
4. Note that you cannot combine multiple word attribute filters on one token, such as in `charles:pos:PROPN:ner:PERS`.

python/philologic/loadtime/LoadFilters.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def process_file():
7878
record.attrib["pos"] = parsed_word.pos_
7979
record.attrib["tag"] = parsed_word.tag_
8080
record.attrib["ent_type"] = parsed_word.ent_type_
81-
record.attrib["lemma"] = parsed_word.lemma_
81+
record.attrib["lemma"] = "_".join((parsed_word.lemma_ or record.name).lower().split())
8282
print(record, file=tmp_file)
8383

8484
spacy_sentence.tensor = None

python/philologic/runtime/threads.py

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
import hashlib
3232
import os
3333
import time
34-
from typing import Dict, List, Optional, Tuple
34+
from typing import Callable, Dict, List, Optional, Tuple
3535

3636
import numba
3737
import numpy as np
@@ -763,6 +763,9 @@ def detect_threads(
763763
stopwords: Optional[set] = None,
764764
corpus_idf: Optional[Dict[str, float]] = None,
765765
corpus_idf_default: float = 1.0,
766+
# Lazy alternative to corpus_idf: called only on a cache miss, returns
767+
# (idf_map, default). Avoids the ~380 ms full-map load on cache hits.
768+
corpus_idf_loader: Optional[Callable[[], Tuple[Dict[str, float], float]]] = None,
766769
top_n_threads: Optional[int] = None,
767770
min_cluster_size: int = 10,
768771
min_cluster_size_floor: int = 5,
@@ -784,12 +787,16 @@ def detect_threads(
784787
cache_path = _thread_cache_path(db_path, cache_key)
785788
cached = _load_intermediates(cache_path)
786789

787-
if cached is not None:
790+
# A cache entry is only usable if it carries candidate_idf — older entries
791+
# (pre-lazy-idf) lack it, so treat those as a miss to auto-upgrade.
792+
if cached is not None and "candidate_idf" in cached:
788793
flat_ids = cached["flat_ids"]
789794
indptr = cached["indptr"]
790795
years = cached["years"]
791796
candidate_ids = cached["candidate_ids"]
792797
counts_at_cand = cached["counts_at_cand"]
798+
candidate_idf = cached["candidate_idf"]
799+
corpus_idf_default = float(cached["corpus_idf_default"])
793800
raw_K = int(cached["raw_K"])
794801
dist = cached["dist"]
795802
v_blob, v_offsets = _load_vocab(db_path, count_lemmas)
@@ -820,6 +827,20 @@ def detect_threads(
820827
# w in candidate_ids.
821828
counts_at_cand = counts_full[candidate_ids].astype(np.float64)
822829

830+
# Corpus-IDF for the candidate words only. The full corpus idf map is
831+
# ~676k entries and costs ~380 ms to load — but we only need the K
832+
# candidates' values, and only on a cache MISS. Loading lazily here
833+
# (rather than eagerly in the caller) means the common slider-rerun
834+
# path (cache hit) never touches the idf map on any worker.
835+
if corpus_idf is None and corpus_idf_loader is not None:
836+
corpus_idf, corpus_idf_default = corpus_idf_loader()
837+
cidf = corpus_idf or {}
838+
cand_names = [_vocab_name(int(c), v_blob, v_offsets, count_lemmas)
839+
for c in candidate_ids]
840+
candidate_idf = np.array(
841+
[cidf.get(n, corpus_idf_default) for n in cand_names], dtype=np.float64
842+
)
843+
823844
n_vocab = len(v_offsets) - 1
824845
dist = _build_distance(flat_ids, indptr, candidate_ids, n_vocab=n_vocab)
825846
if dist is not None:
@@ -829,14 +850,17 @@ def detect_threads(
829850
cache_path,
830851
flat_ids=flat_ids, indptr=indptr, years=years,
831852
candidate_ids=candidate_ids, counts_at_cand=counts_at_cand,
853+
candidate_idf=candidate_idf,
854+
corpus_idf_default=np.array(corpus_idf_default, dtype=np.float64),
832855
raw_K=np.array(raw_K, dtype=np.int64), dist=dist,
833856
)
834857

835-
# Materialize counts as a vocab-id → count dict for the downstream
836-
# per-thread word ranking. Consumer code (`counts[w]` for w in words ⊆
837-
# candidate_ids) works identically against a dict.
858+
# Materialize counts + idf as vocab-id → value dicts for the downstream
859+
# per-thread word ranking. Both are keyed by candidate vocab id.
838860
counts = {int(candidate_ids[i]): float(counts_at_cand[i])
839861
for i in range(len(candidate_ids))}
862+
idf_by_vocab = {int(candidate_ids[i]): float(candidate_idf[i])
863+
for i in range(len(candidate_ids))}
840864

841865
# Derived values — same in both cache hit / miss paths.
842866
n_hits = len(indptr) - 1
@@ -1008,7 +1032,6 @@ def _pick(method: str):
10081032
union_words.extend(words)
10091033
union_words = sorted(set(union_words))
10101034
cid_order = list(kept_clusters.keys())
1011-
cidf = corpus_idf or {}
10121035

10131036
# Bags are already in flat (flat_ids, indptr) form from _build_hit_bags;
10141037
# the intensity kernel can use them directly per thread.
@@ -1031,10 +1054,12 @@ def _pick(method: str):
10311054
if max_v <= 0:
10321055
continue
10331056

1034-
# Rank thread words by corpus-IDF c-TF-IDF: count × whole-corpus rarity
1057+
# Rank thread words by corpus-IDF c-TF-IDF: count × whole-corpus rarity.
1058+
# idf comes from the precomputed per-candidate array (cached), keyed by
1059+
# vocab id — no full corpus-idf map needed on this path.
10351060
name_of = {w: _vocab_name(w, v_blob, v_offsets, count_lemmas) for w in words}
10361061
ctfidf = {
1037-
w: thread_word_counts[cid][w] * cidf.get(name_of[w], corpus_idf_default)
1062+
w: thread_word_counts[cid][w] * idf_by_vocab.get(w, corpus_idf_default)
10381063
for w in words
10391064
}
10401065
ranked_words = sorted(words, key=lambda w: -ctfidf[w])

www/scripts/get_threads.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,6 @@ def get_threads(request, config):
109109

110110
metadata = dict(request.metadata or {})
111111
stopwords = _load_stopwords(request, config, count_lemmas)
112-
corpus_idf, corpus_idf_default = _load_corpus_idf(config.db_path, count_lemmas)
113112

114113
def _int_or_none(name, lo, hi):
115114
raw = getattr(request, name, "")
@@ -129,8 +128,10 @@ def _int_or_none(name, lo, hi):
129128
db, config.db_path + "/data", q, count_lemmas, attribute, attribute_value,
130129
metadata,
131130
stopwords=stopwords,
132-
corpus_idf=corpus_idf,
133-
corpus_idf_default=corpus_idf_default,
131+
# Lazy: detect_threads only loads the corpus-idf map on a cache miss,
132+
# so the common slider-rerun (cache hit) skips the ~380 ms full-map
133+
# load entirely — on every worker, warm or cold.
134+
corpus_idf_loader=lambda: _load_corpus_idf(config.db_path, count_lemmas),
134135
top_n_threads=top_n_threads,
135136
min_cluster_size_override=min_words,
136137
# The network view is the spatial twin of the streamgraph — built from

0 commit comments

Comments
 (0)