more work on word threads

clovis · clovis · commit 55d631c67b6a · 2026-05-22T09:41:20.000-05:00
diff --git a/app/src/components/SearchTips.vue b/app/src/components/SearchTips.vue
@@ -91,6 +91,10 @@
                             {{ $t('searchTips.multipleAttributes') }}
                             <code class="code-block" aria-label="code example">charles:pos:PROPN:ner:PERS</code>.
                         </li>
+                        <li>
+                            {{ $t('searchTips.multiWordLemma') }}
+                            <code class="code-block" aria-label="code example">lemma:parce_que</code>.
+                        </li>
                     </ol>
                 </section>
 
diff --git a/app/src/locales/en.json b/app/src/locales/en.json
@@ -379,6 +379,7 @@
         "wordAttributeExample": "syntax. For example:",
         "combineLemma": "You can combine lemma and attribute searches:",
         "multipleAttributes": "You can use multiple attributes:",
+        "multiWordLemma": "Multi-word lemmas (e.g. the French compound \"parce que\") are stored with whitespace collapsed to underscores. Query them with an underscore instead of a space:",
         "metadataSearches": "Metadata Searches",
         "metadataDescription": "Metadata searches work with the same conventions as word searches. Regular expressions are also supported for pattern matching:",
         "metadataPlainTokens": "Plain tokens are interpreted as tokens",
diff --git a/app/src/locales/es.json b/app/src/locales/es.json
@@ -379,6 +379,7 @@
         "wordAttributeExample": "sintaxis. Por ejemplo:",
         "combineLemma": "Se puede combinar la búsqueda de vocablo y atributo:",
         "multipleAttributes": "Se pueden emplear múltiples atributos:",
+        "multiWordLemma": "Los lemas de varias palabras (p. ej. el compuesto francés «parce que») se almacenan con los espacios reemplazados por subrayados. Búsquelos con un subrayado en lugar de un espacio:",
         "metadataSearches": "Búsquedas de Metadatos",
         "metadataDescription": "Las búsquedas de metadatos emplean las mismas convenciones que las de palabras. También soporta el uso de expresiones regulares para concordar patrones:",
         "metadataPlainTokens": "Las muestras sencillas se interpretan como muestras",
diff --git a/app/src/locales/fr.json b/app/src/locales/fr.json
@@ -379,6 +379,7 @@
         "wordAttributeExample": "syntaxe. Par exemple :",
         "combineLemma": "Vous pouvez combiner les recherches de lemme et d'attribut :",
         "multipleAttributes": "Vous pouvez utiliser plusieurs attributs :",
+        "multiWordLemma": "Les lemmes composés de plusieurs mots (par exemple « parce que ») sont stockés avec les espaces remplacés par des tirets bas. Utilisez un tiret bas à la place de l'espace :",
         "metadataSearches": "Recherches de métadonnées",
         "metadataDescription": "Les recherches de métadonnées fonctionnent avec les mêmes conventions que les recherches de mots. Les expressions régulières sont également supportées pour la recherche par motifs :",
         "metadataPlainTokens": "Les tokens simples sont interprétés comme des tokens",
diff --git a/docs/query_syntax.md b/docs/query_syntax.md
@@ -23,6 +23,7 @@ When specifying a query, one can select a query method to constrain the relation
 #### Lemma and word attribute Searches
 If you text collection contains lemma and/or word attribute information (usually in <w> tags), then PhiloLogic allows you to query words based on lemma and/or word attribute value
 1. For simple lemma searching, just preprend the lemma with `lemma:` such as in `lemma:have`. Regexes are permitted on the token portion of the search, e.g. `lemma:constitut.*`.
+   - Multi-word lemmas (e.g. the French compound `parce que`) are stored with whitespace collapsed to underscores. Query them as `lemma:parce_que`, not `lemma:parce que`.
 2. For word attribute searching, use the `word:attribute:attribute` syntax such as in `love:pos:NOUN`. Regexes are permitted on the token portion of the search, e.g. `lov.*:pos:NOUN`.
 3. You can combine lemma searching with word attribute filtering. Just preprend your token with `lemma:` such as in `lemma:love:pos:NOUN`. Regexes are permitted on the token portion of the search, e.g. `lemma:lov.*:pos:NOUN`.
 4. Note that you cannot combine multiple word attributes filters on one token, such as in `charles:pos:PROPN:ner:PERS`.
diff --git a/python/philologic/loadtime/LoadFilters.py b/python/philologic/loadtime/LoadFilters.py
@@ -78,7 +78,7 @@ def process_file():
                 record.attrib["pos"] = parsed_word.pos_
                 record.attrib["tag"] = parsed_word.tag_
                 record.attrib["ent_type"] = parsed_word.ent_type_
-                record.attrib["lemma"] = parsed_word.lemma_
+                record.attrib["lemma"] = "_".join((parsed_word.lemma_ or record.name).lower().split())
                 print(record, file=tmp_file)
 
             spacy_sentence.tensor = None
diff --git a/python/philologic/runtime/threads.py b/python/philologic/runtime/threads.py
@@ -31,7 +31,7 @@
 import hashlib
 import os
 import time
-from typing import Dict, List, Optional, Tuple
+from typing import Callable, Dict, List, Optional, Tuple
 
 import numba
 import numpy as np
@@ -763,6 +763,9 @@ def detect_threads(
     stopwords: Optional[set] = None,
     corpus_idf: Optional[Dict[str, float]] = None,
     corpus_idf_default: float = 1.0,
+    # Lazy alternative to corpus_idf: called only on a cache miss, returns
+    # (idf_map, default). Avoids the ~380 ms full-map load on cache hits.
+    corpus_idf_loader: Optional[Callable[[], Tuple[Dict[str, float], float]]] = None,
     top_n_threads: Optional[int] = None,
     min_cluster_size: int = 10,
     min_cluster_size_floor: int = 5,
@@ -784,12 +787,16 @@ def detect_threads(
     cache_path = _thread_cache_path(db_path, cache_key)
     cached = _load_intermediates(cache_path)
 
-    if cached is not None:
+    # A cache entry is only usable if it carries candidate_idf — older entries
+    # (pre-lazy-idf) lack it, so treat those as a miss to auto-upgrade.
+    if cached is not None and "candidate_idf" in cached:
         flat_ids = cached["flat_ids"]
         indptr = cached["indptr"]
         years = cached["years"]
         candidate_ids = cached["candidate_ids"]
         counts_at_cand = cached["counts_at_cand"]
+        candidate_idf = cached["candidate_idf"]
+        corpus_idf_default = float(cached["corpus_idf_default"])
         raw_K = int(cached["raw_K"])
         dist = cached["dist"]
         v_blob, v_offsets = _load_vocab(db_path, count_lemmas)
@@ -820,6 +827,20 @@ def detect_threads(
         # w in candidate_ids.
         counts_at_cand = counts_full[candidate_ids].astype(np.float64)
 
+        # Corpus-IDF for the candidate words only. The full corpus idf map is
+        # ~676k entries and costs ~380 ms to load — but we only need the K
+        # candidates' values, and only on a cache MISS. Loading lazily here
+        # (rather than eagerly in the caller) means the common slider-rerun
+        # path (cache hit) never touches the idf map on any worker.
+        if corpus_idf is None and corpus_idf_loader is not None:
+            corpus_idf, corpus_idf_default = corpus_idf_loader()
+        cidf = corpus_idf or {}
+        cand_names = [_vocab_name(int(c), v_blob, v_offsets, count_lemmas)
+                      for c in candidate_ids]
+        candidate_idf = np.array(
+            [cidf.get(n, corpus_idf_default) for n in cand_names], dtype=np.float64
+        )
+
         n_vocab = len(v_offsets) - 1
         dist = _build_distance(flat_ids, indptr, candidate_ids, n_vocab=n_vocab)
         if dist is not None:
@@ -829,14 +850,17 @@ def detect_threads(
                 cache_path,
                 flat_ids=flat_ids, indptr=indptr, years=years,
                 candidate_ids=candidate_ids, counts_at_cand=counts_at_cand,
+                candidate_idf=candidate_idf,
+                corpus_idf_default=np.array(corpus_idf_default, dtype=np.float64),
                 raw_K=np.array(raw_K, dtype=np.int64), dist=dist,
             )
 
-    # Materialize counts as a vocab-id → count dict for the downstream
-    # per-thread word ranking. Consumer code (`counts[w]` for w in words ⊆
-    # candidate_ids) works identically against a dict.
+    # Materialize counts + idf as vocab-id → value dicts for the downstream
+    # per-thread word ranking. Both are keyed by candidate vocab id.
     counts = {int(candidate_ids[i]): float(counts_at_cand[i])
               for i in range(len(candidate_ids))}
+    idf_by_vocab = {int(candidate_ids[i]): float(candidate_idf[i])
+                    for i in range(len(candidate_ids))}
 
     # Derived values — same in both cache hit / miss paths.
     n_hits = len(indptr) - 1
@@ -1008,7 +1032,6 @@ def _pick(method: str):
         union_words.extend(words)
     union_words = sorted(set(union_words))
     cid_order = list(kept_clusters.keys())
-    cidf = corpus_idf or {}
 
     # Bags are already in flat (flat_ids, indptr) form from _build_hit_bags;
     # the intensity kernel can use them directly per thread.
@@ -1031,10 +1054,12 @@ def _pick(method: str):
         if max_v <= 0:
             continue
 
-        # Rank thread words by corpus-IDF c-TF-IDF: count × whole-corpus rarity
+        # Rank thread words by corpus-IDF c-TF-IDF: count × whole-corpus rarity.
+        # idf comes from the precomputed per-candidate array (cached), keyed by
+        # vocab id — no full corpus-idf map needed on this path.
         name_of = {w: _vocab_name(w, v_blob, v_offsets, count_lemmas) for w in words}
         ctfidf = {
-            w: thread_word_counts[cid][w] * cidf.get(name_of[w], corpus_idf_default)
+            w: thread_word_counts[cid][w] * idf_by_vocab.get(w, corpus_idf_default)
             for w in words
         }
         ranked_words = sorted(words, key=lambda w: -ctfidf[w])
diff --git a/www/scripts/get_threads.py b/www/scripts/get_threads.py
@@ -109,7 +109,6 @@ def get_threads(request, config):
 
     metadata = dict(request.metadata or {})
     stopwords = _load_stopwords(request, config, count_lemmas)
-    corpus_idf, corpus_idf_default = _load_corpus_idf(config.db_path, count_lemmas)
 
     def _int_or_none(name, lo, hi):
         raw = getattr(request, name, "")
@@ -129,8 +128,10 @@ def _int_or_none(name, lo, hi):
         db, config.db_path + "/data", q, count_lemmas, attribute, attribute_value,
         metadata,
         stopwords=stopwords,
-        corpus_idf=corpus_idf,
-        corpus_idf_default=corpus_idf_default,
+        # Lazy: detect_threads only loads the corpus-idf map on a cache miss,
+        # so the common slider-rerun (cache hit) skips the ~380 ms full-map
+        # load entirely — on every worker, warm or cold.
+        corpus_idf_loader=lambda: _load_corpus_idf(config.db_path, count_lemmas),
         top_n_threads=top_n_threads,
         min_cluster_size_override=min_words,
         # The network view is the spatial twin of the streamgraph — built from