
import pandas as pd
from rich.console import Console
-from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
+from rich.progress import (
+    Progress,
+    SpinnerColumn,
+    TextColumn,
+    BarColumn,
+    TaskProgressColumn,
+)
from rich.table import Table
from rich import print
from dotenv import load_dotenv
@@ -21,14 +27,15 @@

app = typer.Typer()

+
@app.command()
def evaluate(
    driver: Optional[str] = typer.Argument(
        default='pymupdf',
        help='The Parxy driver to evaluate. If omitted defaults to pymupdf.',
    ),
    metrics: Optional[List[str]] = typer.Option(
-        ["sequence_matcher"],
+        ['sequence_matcher'],
        '--metric',
        '-m',
        help='The metric to evaluate.',
@@ -67,38 +74,40 @@ def evaluate(
        dir_okay=True,
    ),
):
-
    logging.basicConfig(
        level=logging.WARNING,
-        format="%(asctime)s : %(levelname)s : %(name)s : %(message)s"
+        format='%(asctime)s : %(levelname)s : %(name)s : %(message)s',
    )
-
-    metrics_name = [metric.lower().strip().replace("-", "_").replace(" ", "_")
-                    for metric in metrics if get_metric(metric)]
-
+
+    metrics_name = [
+        metric.lower().strip().replace('-', '_').replace(' ', '_')
+        for metric in metrics
+        if get_metric(metric)
+    ]
+
    if all_metrics is True:
        metrics_name = get_metrics_name()

    if not os.path.exists(input_folder):
-        logging.debug(f"The specified input folder [{input_folder}] does not exist!")
+        logging.debug(f'The specified input folder [{input_folder}] does not exist!')
        raise typer.Exit(code=422)
    if not os.path.exists(golden_folder):
-        logging.debug(f"The specified golden folder [{golden_folder}] does not exist!")
+        logging.debug(f'The specified golden folder [{golden_folder}] does not exist!')
        raise typer.Exit(code=422)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    if len(metrics_name) == 0:
-        logging.debug(f"The specified metrics are not implemented!")
+        logging.debug(f'The specified metrics are not implemented!')
        raise typer.Exit(code=422)
-
+
    metrics_fn = list([get_metric(metric) for metric in metrics_name])

    console = Console()
-
-    logging.debug(f"Input folder: {input_folder}")
-    logging.debug(f"Output folder: {output_folder}")
-    logging.debug(f"Metrics: {metrics_name}")
+
+    logging.debug(f'Input folder: {input_folder}')
+    logging.debug(f'Output folder: {output_folder}')
+    logging.debug(f'Metrics: {metrics_name}')

    # Get total number of files to process
    files = os.listdir(input_folder)
@@ -107,37 +116,37 @@ def evaluate(
    res_list = []
    with Progress(
        SpinnerColumn(),
-        TextColumn("[progress.description]{task.description}"),
+        TextColumn('[progress.description]{task.description}'),
        BarColumn(),
        TaskProgressColumn(),
        console=console,
-        transient=True
+        transient=True,
    ) as progress:
-        task = progress.add_task("Evaluating documents...", total=total_files)
-
+        task = progress.add_task('Evaluating documents...', total=total_files)
+
        for filename in files:
-            progress.update(task, description=f"Processing {filename}...")
+            progress.update(task, description=f'Processing {filename}...')

            # Read the parsing result
-            with open(os.path.join(input_folder, filename), "r") as f:
+            with open(os.path.join(input_folder, filename), 'r') as f:
                doc = Document(**json.loads(f.read()))

            # Read the ground truth
            try:
-                with open(os.path.join(golden_folder, filename), "r") as f:
+                with open(os.path.join(golden_folder, filename), 'r') as f:
                    golden_doc = Document(**json.loads(f.read()))
            except FileNotFoundError:
-                logging.error(f"File [{filename}] does not exist!")
+                logging.error(f'File [{filename}] does not exist!')
                progress.advance(task)
                continue

            base_data = {
-                "filename": filename,
-                "collection": golden_doc.source_data["collection"],
-                "doc_category": golden_doc.source_data["doc_category"],
-                "original_filename": golden_doc.source_data["original_filename"],
-                "page_no": golden_doc.source_data["page_no"],
-                "processing_time_seconds": doc.source_data["processing_time_seconds"],
+                'filename': filename,
+                'collection': golden_doc.source_data['collection'],
+                'doc_category': golden_doc.source_data['doc_category'],
+                'original_filename': golden_doc.source_data['original_filename'],
+                'page_no': golden_doc.source_data['page_no'],
+                'processing_time_seconds': doc.source_data['processing_time_seconds'],
            }

            # merge all metrics dicts into one
@@ -150,28 +159,31 @@ def evaluate(
            res_list.append(row)
            progress.advance(task)

-    timestamp_str = str(time.time()).replace(".", "")
+    timestamp_str = str(time.time()).replace('.', '')
    res_df = pd.DataFrame(res_list)
-    input_folder_name = input_folder.replace(os.sep, "/").replace("\\", "/")
-    input_folder_name = input_folder_name.split("/")[-1].replace(" ", "_").lower()
-    output_file = f"eval_{input_folder_name}_{timestamp_str}.csv"
+    input_folder_name = input_folder.replace(os.sep, '/').replace('\\', '/')
+    input_folder_name = input_folder_name.split('/')[-1].replace(' ', '_').lower()
+    output_file = f'eval_{input_folder_name}_{timestamp_str}.csv'
    output_path = os.path.join(output_folder, output_file)
    res_df.to_csv(output_path, index=False)

-    print(f"\n[green]✓[/green] Evaluation completed. Results saved to: [blue]{output_path}[/blue]")
-
+    print(
+        f'\n[green]✓[/green] Evaluation completed. Results saved to: [blue]{output_path}[/blue]'
+    )
+
    # Print evaluation statistics
    table = Table()
-    table.add_column("Metric")
-    table.add_column("Value", justify="right", style="green")
-
-    table.add_row("Documents processed", str(len(res_list)))
-    table.add_row("Average parsing time", f"{res_df['processing_time_seconds'].mean():.2f}s")
-
+    table.add_column('Metric')
+    table.add_column('Value', justify='right', style='green')
+
+    table.add_row('Documents processed', str(len(res_list)))
+    table.add_row(
+        'Average parsing time', f'{res_df["processing_time_seconds"].mean():.2f}s'
+    )
+
    for metric_column in metrics_name:
        if metric_column in res_df.columns and not res_df[metric_column].isna().all():
            if res_df[metric_column].dtype in ['float64', 'int64']:
-                table.add_row(metric_column, f"{res_df[metric_column].mean():.4f}")
-
-    console.print(table)
+                table.add_row(metric_column, f'{res_df[metric_column].mean():.4f}')

+    console.print(table)
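
A quick way to confirm the reformatted command still parses its arguments is typer's CliRunner. A minimal sketch, assuming the module in this diff is importable as evaluate_cli (a hypothetical name) and that the folder options elided from these hunks have usable defaults:

    # Minimal smoke test for the evaluate command (sketch, not the project's test suite).
    # Assumptions: `evaluate_cli` is a hypothetical import path for the module in this
    # diff; folder options not shown in these hunks are left at their defaults.
    from typer.testing import CliRunner

    from evaluate_cli import app

    runner = CliRunner()
    # 'pymupdf' fills the driver argument; -m/--metric may be repeated for several metrics.
    result = runner.invoke(app, ['pymupdf', '--metric', 'sequence_matcher'])
    print(result.exit_code)
    print(result.output)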