1616_FORMS_FLAT_FILES = ("lemmas" , "word_attributes" , "lemma_word_attributes" )
1717
1818
19- def get_lmdb_env (lmdb_path : str ) -> lmdb .Environment :
19+ def _open_lmdb (lmdb_path : str ) -> lmdb .Environment :
2020 """Open a read-only LMDB environment. Caller should close it when done."""
2121 return lmdb .open (lmdb_path , readonly = True , lock = False , readahead = False )
2222
2323
24- def _get_norm_env (freq_file : str ) -> lmdb .Environment :
25- """Open the normalized word frequencies LMDB. Caller should close it when done."""
26- return get_lmdb_env (freq_file + ".lmdb" )
27-
28-
29- def _get_forms_env (db_path : str ) -> lmdb .Environment | None :
30- """Open the word_forms.lmdb env, or return None if it doesn't exist."""
31- lmdb_path = os .path .join (db_path , "frequencies" , "word_forms.lmdb" )
32- if not os .path .exists (lmdb_path ):
33- return None
34- return get_lmdb_env (lmdb_path )
35-
36-
3724def _norm_key (token : str , lowercase : bool = True ) -> bytes :
3825 if lowercase :
3926 token = token .lower ()
@@ -239,9 +226,10 @@ def expand_query_not(split, freq_file, dest_fh, ascii_conversion, lowercase=True
239226 forms, and writes the result to dest_fh.
240227 Groups are separated by blank lines (consumed by get_word_groups()).
241228 """
242- env = _get_norm_env (freq_file )
229+ env = _open_lmdb (freq_file + ".lmdb" )
243230 db_path = os .path .normpath (os .path .join (os .path .dirname (freq_file ), ".." ))
244- forms_env = _get_forms_env (db_path )
231+ forms_lmdb_path = os .path .join (db_path , "frequencies" , "word_forms.lmdb" )
232+ forms_env = _open_lmdb (forms_lmdb_path ) if os .path .exists (forms_lmdb_path ) else None
245233 first = True
246234
247235 try :
@@ -350,18 +338,13 @@ def build_metadata_word_index(db_path: str) -> int:
350338 return len (index )
351339
352340
353- def _get_metadata_index_env (db_path : str ) -> lmdb .Environment :
354- """Open the metadata_word_index.lmdb env. Caller should close it when done."""
355- lmdb_path = os .path .join (db_path , "frequencies" , _META_LMDB_NAME )
356- return get_lmdb_env (lmdb_path )
357-
358341
359342def metadata_word_lookup (db_path : str , field : str , term : str ) -> list [str ]:
360343 """Look up metadata values containing term as a whole word.
361344
362345 Returns list of original metadata values from the inverted word index.
363346 """
364- env = _get_metadata_index_env ( db_path )
347+ env = _open_lmdb ( os . path . join ( db_path , "frequencies" , _META_LMDB_NAME ) )
365348 try :
366349 key = f"{ field } \x00 { term } " .encode ("utf-8" )
367350 with env .begin (buffers = True ) as txn :
@@ -380,7 +363,7 @@ def metadata_word_regex_scan(db_path: str, field: str, pattern: str) -> list[str
380363 indexed word. Returns deduplicated list of original metadata values
381364 from all matching words.
382365 """
383- env = _get_metadata_index_env ( db_path )
366+ env = _open_lmdb ( os . path . join ( db_path , "frequencies" , _META_LMDB_NAME ) )
384367 try :
385368 field_prefix = f"{ field } \x00 " .encode ("utf-8" )
386369 compiled = re .compile (pattern )
@@ -417,7 +400,7 @@ def metadata_word_prefix_scan(db_path: str, field: str, prefix: str,
417400 Returns deduplicated list of original metadata values from all matching words.
418401 Used for metadata autocomplete.
419402 """
420- env = _get_metadata_index_env ( db_path )
403+ env = _open_lmdb ( os . path . join ( db_path , "frequencies" , _META_LMDB_NAME ) )
421404 try :
422405 key_prefix = f"{ field } \x00 { prefix } " .encode ("utf-8" )
423406 seen : set [str ] = set ()
@@ -464,7 +447,7 @@ def expand_autocomplete(kind: str, token: str, frequency_file: str, db_path: str
464447 raw_token = token [1 :- 1 ] if kind == "QUOTE" else token
465448 if not raw_token :
466449 return []
467- env = _get_norm_env (frequency_file )
450+ env = _open_lmdb (frequency_file + ".lmdb" )
468451 try :
469452 with env .begin (buffers = True ) as txn :
470453 if _is_regex_pattern (raw_token ):
@@ -483,7 +466,8 @@ def expand_autocomplete(kind: str, token: str, frequency_file: str, db_path: str
483466 elif kind in ("LEMMA" , "ATTR" , "LEMMA_ATTR" ):
484467 if not token :
485468 return []
486- scan_env = _get_forms_env (db_path ) or get_lmdb_env (os .path .join (db_path , "words.lmdb" ))
469+ forms_lmdb_path = os .path .join (db_path , "frequencies" , "word_forms.lmdb" )
470+ scan_env = _open_lmdb (forms_lmdb_path ) if os .path .exists (forms_lmdb_path ) else _open_lmdb (os .path .join (db_path , "words.lmdb" ))
487471 try :
488472 with scan_env .begin (buffers = True ) as txn :
489473 if _is_regex_pattern (token ):
0 commit comments