22Core corpus management functionality.
33"""
44
5+ import json
56import logging
67from pathlib import Path
7- from typing import List , Dict , Optional , Any , Union
8- from collections import defaultdict
9- import json
8+ from typing import Any , Dict , List , Optional , Union
9+
1010import lxml .etree as ET
11+
1112from corpus_module .query import CorpusQuery
1213
1314logger = logging .getLogger (__name__ )
@@ -39,21 +40,27 @@ def __init__(
3940 Create new corpus with optional input data.
4041
4142 Args:
42- topdir: Input directory with files/subdirs as possible corpus components
43+ topdir: Input directory with files/subdirs as possible corpus
44+ components
4345 infiles: List of files to use (alternative to globstr)
44- globstr: Create infiles using globbing under topdir (requires topdir)
46+ globstr: Create infiles using globbing under topdir (requires
47+ topdir)
4548 outfile: Output file path
4649 mkdir: Make topdir if doesn't exist (default=False)
4750 make_descendants: Makes AmiCorpusContainers for directories on tree
4851 **kwargs: Dict of per-corpus user-specified properties
4952 """
5053 self .topdir = Path (topdir ) if topdir else None
5154 if self .topdir and not self .topdir .is_dir ():
52- raise ValueError (f"AmiCorpus() requires valid directory { self .topdir } " )
55+ raise ValueError (
56+ f"AmiCorpus() requires valid directory { self .topdir } "
57+ )
5358
5459 self .container_by_file = dict ()
5560 # rootnode
56- self .ami_container = self .create_corpus_container (self .topdir , make_descendants = make_descendants , mkdir = mkdir )
61+ self .ami_container = self .create_corpus_container (
62+ self .topdir , make_descendants = make_descendants , mkdir = mkdir
63+ )
5764 self .infiles = infiles
5865 self .outfile = Path (outfile ) if outfile else None
5966 self .globstr = globstr
@@ -89,18 +96,24 @@ def __str__(self):
8996 def _make_infiles (self ):
9097 """Create infiles list from globstr if not provided."""
9198 if self .infiles :
92- logger .info (f "taking infiles from list" )
99+ logger .info ("taking infiles from list" )
93100 else :
94101 if self .topdir and self .globstr :
95- self .infiles = self ._posix_glob (f"{ self .topdir } /{ self .globstr } " , recursive = True )
102+ self .infiles = self ._posix_glob (
103+ f"{ self .topdir } /{ self .globstr } " , recursive = True
104+ )
96105 if self .infiles is None :
97- logger .error (f "self.infiles is None" )
106+ logger .error ("self.infiles is None" )
98107 return
99108 logger .info (f"inputting { len (self .infiles )} files" )
100109 return self .infiles
101110
102111 def create_corpus_container (
103- self , file : Optional [Union [str , Path ]], bib_type : str = "None" , make_descendants : bool = False , mkdir : bool = False
112+ self ,
113+ file : Optional [Union [str , Path ]],
114+ bib_type : str = "None" ,
115+ make_descendants : bool = False ,
116+ mkdir : bool = False ,
104117 ):
105118 """
106119 Create container as child of self.
@@ -141,7 +154,9 @@ def make_descendants(self, file: Optional[Union[str, Path]] = None):
141154 if file is None :
142155 file = self .root_dir
143156 if file is None or not Path (file ).is_dir ():
144- logger .error (f"Cannot make file children for { file } " )
157+ logger .error (
158+ f"Cannot make file children for { file } "
159+ )
145160 return
146161 files = self ._get_children (file )
147162 for f in files :
@@ -155,7 +170,10 @@ def make_special(self, kwargs: Dict[str, Any]):
155170
156171 @classmethod
157172 def make_datatables (
158- cls , indir : Union [str , Path ], outdir : Optional [Union [str , Path ]] = None , outfile_h : Optional [Union [str , Path ]] = None
173+ cls ,
174+ indir : Union [str , Path ],
175+ outdir : Optional [Union [str , Path ]] = None ,
176+ outfile_h : Optional [Union [str , Path ]] = None ,
159177 ):
160178 """
161179 Create a JQuery DataTables HTML file from an AmiCorpus.
@@ -180,15 +198,25 @@ def make_datatables(
180198
181199 if epmc_infile .exists ():
182200 cls .read_json_create_write_html_table (
183- epmc_infile , outfile_h , wanted_keys = None , datatables = datatables , table_id = None , config_ini = config_ini
201+ epmc_infile ,
202+ outfile_h ,
203+ wanted_keys = None ,
204+ datatables = datatables ,
205+ table_id = None ,
206+ config_ini = config_ini ,
184207 )
185208 return
186209
187210 # Try alternative filename
188211 infile = Path (indir ) / EUPMC_RESULTS_JSON
189212 if infile .exists ():
190213 cls .read_json_create_write_html_table (
191- infile , outfile_h , wanted_keys = None , datatables = datatables , table_id = None , config_ini = config_ini
214+ infile ,
215+ outfile_h ,
216+ wanted_keys = None ,
217+ datatables = datatables ,
218+ table_id = None ,
219+ config_ini = config_ini ,
192220 )
193221 return
194222
@@ -226,7 +254,8 @@ def read_json_create_write_html_table(
226254 from datatables_module import Datatables
227255
228256 htmlx , tbody = Datatables .create_table (
229- labels = list (data [0 ].keys ()) if data else [], table_id = table_id or "corpus_table"
257+ labels = list (data [0 ].keys ()) if data else [],
258+ table_id = table_id or "corpus_table" ,
230259 )
231260
232261 # Add data rows
@@ -240,7 +269,9 @@ def read_json_create_write_html_table(
240269
241270 # Write to file
242271 with open (outfile_h , "w" , encoding = "utf-8" ) as f :
243- f .write (ET .tostring (htmlx , encoding = "unicode" , pretty_print = True ))
272+ f .write (
273+ ET .tostring (htmlx , encoding = "unicode" , pretty_print = True )
274+ )
244275
245276 def list_files (self , globstr : str ) -> List [Path ]:
246277 """
@@ -253,11 +284,18 @@ def list_files(self, globstr: str) -> List[Path]:
253284 List of matching file paths
254285 """
255286 if globstr and self .root_dir :
256- return self ._posix_glob (str (self .root_dir / globstr ), recursive = True )
287+ return self ._posix_glob (
288+ str (self .root_dir / globstr ), recursive = True
289+ )
257290 return []
258291
259292 def create_datatables_html_with_filenames (
260- self , html_glob : str , labels : List [str ], table_id : str , outpath : Optional [Union [str , Path ]] = None , debug : bool = True
293+ self ,
294+ html_glob : str ,
295+ labels : List [str ],
296+ table_id : str ,
297+ outpath : Optional [Union [str , Path ]] = None ,
298+ debug : bool = True ,
261299 ):
262300 """
263301 Create DataTables HTML with filenames.
@@ -276,11 +314,15 @@ def create_datatables_html_with_filenames(
276314
277315 from datatables_module import Datatables
278316
279- self .datables_html , tbody = Datatables ._create_html_for_datatables (labels , table_id )
317+ self .datables_html , tbody = Datatables ._create_html_for_datatables (
318+ labels , table_id
319+ )
280320
281321 for html_file in html_files :
282322 if outpath :
283- offset = self ._get_relative_path (html_file , Path (outpath ).parent , walk_up = True )
323+ offset = self ._get_relative_path (
324+ html_file , Path (outpath ).parent , walk_up = True
325+ )
284326 else :
285327 offset = html_file .name
286328
@@ -295,7 +337,13 @@ def create_datatables_html_with_filenames(
295337
296338 if outpath :
297339 with open (outpath , "w" , encoding = "utf-8" ) as f :
298- f .write (ET .tostring (self .datables_html , encoding = "unicode" , pretty_print = True ))
340+ f .write (
341+ ET .tostring (
342+ self .datables_html ,
343+ encoding = "unicode" ,
344+ pretty_print = True ,
345+ )
346+ )
299347
300348 return self .datables_html
301349
@@ -310,7 +358,9 @@ def make_infiles(self, maxfiles: int = 999999999) -> List[Path]:
310358 List of file paths
311359 """
312360 if self .globstr :
313- self .infiles = self ._posix_glob (self .globstr , recursive = True )[:maxfiles ]
361+ self .infiles = self ._posix_glob (
362+ self .globstr , recursive = True
363+ )[:maxfiles ]
314364 return self .infiles
315365
316366 def _make_outfile (self ):
@@ -339,13 +389,20 @@ def get_or_create_corpus_query(
339389 """
340390 corpus_query = self .corpus_queries .get (query_id )
341391 if not corpus_query :
342- corpus_query = CorpusQuery (query_id = query_id , phrasefile = phrasefile , phrases = phrases , outfile = outfile )
392+ corpus_query = CorpusQuery (
393+ query_id = query_id ,
394+ phrasefile = phrasefile ,
395+ phrases = phrases ,
396+ outfile = outfile ,
397+ )
343398 self .corpus_queries [query_id ] = corpus_query
344399 corpus_query .corpus = self
345400
346401 return corpus_query
347402
348- def search_files_with_queries (self , query_ids : Union [str , List [str ]], debug : bool = True ) -> Dict [str , Any ]:
403+ def search_files_with_queries (
404+ self , query_ids : Union [str , List [str ]], debug : bool = True
405+ ) -> Dict [str , Any ]:
349406 """
350407 Run queries. Assumes queries have been loaded and are recallable by ID.
351408
@@ -360,18 +417,30 @@ def search_files_with_queries(self, query_ids: Union[str, List[str]], debug: boo
360417 if isinstance (query_ids , str ):
361418 query_ids = [query_ids ]
362419 elif not isinstance (query_ids , list ):
363- raise ValueError (f"queries requires id/s as list or str, found { type (query_ids )} " )
420+ raise ValueError (
421+ f"queries requires id/s as list or str, found { type (query_ids )} "
422+ )
364423
365424 for query_id in query_ids :
366425 query = self .corpus_queries .get (query_id )
367426 if query is None :
368- logger .error (f"cannot find query: { query_id } " )
427+ err_msg = (
428+ "cannot find query: "
429+ + str (query_id )
430+ )
431+ logger .error (err_msg )
369432 continue
370- logger .debug (f"outfile==> { query .outfile } " )
433+ dbg_msg = (
434+ "outfile==> "
435+ + str (query .outfile )
436+ )
437+ logger .debug (dbg_msg )
371438
372439 # This would need to be implemented based on the search functionality
373440 # For now, we'll create a placeholder
374- logger .info (f"Running query: { query_id } " )
441+ logger .info (
442+ f"Running query: { query_id } "
443+ )
375444
376445 return html_by_query_id
377446
@@ -384,7 +453,9 @@ def _get_children(self, directory: Union[str, Path]) -> List[Path]:
384453 """Get child files/directories of a directory."""
385454 return list (Path (directory ).iterdir ())
386455
387- def _get_relative_path (self , file_path : Path , base_path : Path , walk_up : bool = False ) -> str :
456+ def _get_relative_path (
457+ self , file_path : Path , base_path : Path , walk_up : bool = False
458+ ) -> str :
388459 """Get relative path from base path."""
389460 try :
390461 return str (file_path .relative_to (base_path ))
@@ -416,14 +487,16 @@ def __init__(
416487 exist_ok: Whether to allow existing directory
417488 """
418489 if not isinstance (ami_corpus , AmiCorpus ):
419- raise ValueError (f"ami_corpus has wrong type { type (ami_corpus )} " )
490+ raise ValueError (
491+ f"ami_corpus has wrong type { type (ami_corpus )} "
492+ )
420493
421494 self .ami_corpus = ami_corpus
422495 self .file = Path (file )
423496 self .ami_corpus .container_by_file [self .file ] = self
424497
425498 if not file :
426- logger .error (f "No file argument" )
499+ logger .error ("No file argument" )
427500 return None
428501
429502 self .bib_type = bib_type
@@ -437,13 +510,21 @@ def child_containers(self) -> List["AmiCorpusContainer"]:
437510 if self .ami_corpus and self .file and self .file .is_dir ():
438511 child_nodes = self .ami_corpus ._get_children (self .file )
439512 for child_node in child_nodes :
440- child_container = AmiCorpusContainer (self .ami_corpus , child_node )
441- child_container .bib_type = "" if child_node .is_dir () else "file"
513+ child_container = AmiCorpusContainer (
514+ self .ami_corpus , child_node
515+ )
516+ child_container .bib_type = (
517+ "" if child_node .is_dir () else "file"
518+ )
442519 child_containers .append (child_container )
443520 return child_containers
444521
445522 def create_corpus_container (
446- self , filename : str , bib_type : str = "unknown" , make_descendants : bool = False , mkdir : bool = False
523+ self ,
524+ filename : str ,
525+ bib_type : str = "unknown" ,
526+ make_descendants : bool = False ,
527+ mkdir : bool = False ,
447528 ):
448529 """
449530 Create a child container and optionally its actual directory.
@@ -469,11 +550,18 @@ def create_corpus_container(
469550 logger .error (f"{ path } exists but is not a directory" )
470551 return None
471552
472- corpus_container = AmiCorpusContainer (self .ami_corpus , path , bib_type = bib_type , mkdir = mkdir )
553+ corpus_container = AmiCorpusContainer (
554+ self .ami_corpus , path , bib_type = bib_type , mkdir = mkdir
555+ )
473556 self .child_container_list .append (corpus_container )
474557 return corpus_container
475558
476- def create_document (self , filename : str , text : Optional [str ] = None , type : str = "unknown" ) -> Path :
559+ def create_document (
560+ self ,
561+ filename : str ,
562+ text : Optional [str ] = None ,
563+ type : str = "unknown" ,
564+ ) -> Path :
477565 """
478566 Create document file with name and self as parent.
479567
0 commit comments