Commit 966c363

Code style
1 parent: cc9447e

12 files changed: 332 additions & 333 deletions

src/parxyval/cli/commands/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -1,4 +1,3 @@
-
 import typer
 
 from .download import app as download_dataset_app
@@ -9,4 +8,4 @@
 
 app.add_typer(download_dataset_app)
 app.add_typer(parse_app)
-app.add_typer(evaluation_app)
+app.add_typer(evaluation_app)
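
Note: the diff above mounts three sub-apps on one root Typer CLI. A minimal, self-contained sketch of the same `add_typer` pattern (the `greet` sub-command below is illustrative, not part of parxyval):

import typer

app = typer.Typer()
sub_app = typer.Typer()


@sub_app.command()
def greet(name: str = 'world'):
    """Illustrative sub-command; prints a greeting."""
    print(f'Hello, {name}!')


# Mounting the sub-app exposes its commands on the root CLI,
# just as the download/parse/evaluate apps are mounted above.
app.add_typer(sub_app)

if __name__ == '__main__':
    app()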

src/parxyval/cli/commands/download.py

Lines changed: 38 additions & 28 deletions
@@ -19,6 +19,7 @@
 
 app = typer.Typer()
 
+
 @app.command()
 def download(
     limit: Optional[int] = typer.Option(
@@ -58,32 +59,32 @@ def download(
 
     logging.basicConfig(
         level=logging.DEBUG if debug else logging.INFO,
-        format="%(asctime)s : %(levelname)s : %(name)s : %(message)s"
+        format='%(asctime)s : %(levelname)s : %(name)s : %(message)s',
     )
 
     # Set logging level to WARNING or ERROR
     datasets.logging.set_verbosity_warning()
     disable_progress_bars()
 
-    print(f"Creating ground truth from {Dataset.DOCLAYNETV2.value}...")
-
-
+    print(f'Creating ground truth from {Dataset.DOCLAYNETV2.value}...')
 
-    json_folder = f"{output_path}/json"
-    pdf_folder = f"{output_path}/pdf" if include_pdf else None
+    json_folder = f'{output_path}/json'
+    pdf_folder = f'{output_path}/pdf' if include_pdf else None
 
-    dataset_columns = ["metadata"]  # Columns to extract from the HF dataset
+    dataset_columns = ['metadata']  # Columns to extract from the HF dataset
 
     # Create directories if they don't exist
     if json_folder:
-        dataset_columns.extend(["pdf_cells", "category_id"])  # Columns needed to download JSON representation
-        logging.debug(f"Output folder (json): {json_folder}")
+        dataset_columns.extend(
+            ['pdf_cells', 'category_id']
+        )  # Columns needed to download JSON representation
+        logging.debug(f'Output folder (json): {json_folder}')
         if not os.path.isdir(json_folder):
             os.makedirs(json_folder)
 
     if pdf_folder:
-        dataset_columns.extend(["pdf"])  # Column needed to download PDF files
-        logging.debug(f"Output folder (pdf): {pdf_folder}")
+        dataset_columns.extend(['pdf'])  # Column needed to download PDF files
+        logging.debug(f'Output folder (pdf): {pdf_folder}')
         if not os.path.isdir(pdf_folder):
             os.makedirs(pdf_folder)
 
@@ -92,45 +93,54 @@ def download(
 
     ds_builder = load_dataset_builder(Dataset.DOCLAYNETV2.value)
 
-    print(f"Dataset: {Dataset.DOCLAYNETV2.value}")
-    print(f"Dataset: {ds_builder.info.dataset_name} ({ds_builder.info.builder_name})")
-    print(f"Split: Train ({ds_builder.info.splits['train'].num_examples} examples)")
+    print(f'Dataset: {Dataset.DOCLAYNETV2.value}')
+    print(f'Dataset: {ds_builder.info.dataset_name} ({ds_builder.info.builder_name})')
+    print(f'Split: Train ({ds_builder.info.splits["train"].num_examples} examples)')
 
-    print(f"Creating ground truth (limit={limit}, skip={skip})...")
-
-    data = load_dataset(Dataset.DOCLAYNETV2.value, split="train",
-                        streaming=True, columns=dataset_columns)
+    print(f'Creating ground truth (limit={limit}, skip={skip})...')
+
+    data = load_dataset(
+        Dataset.DOCLAYNETV2.value,
+        split='train',
+        streaming=True,
+        columns=dataset_columns,
+    )
 
     count_processed = 0
     count_skipped = 0
     for row in data:
-
         # Skip the first `skip` entries
         if skip is not None and count_skipped < skip:
             count_skipped += 1
             continue
 
-        logging.debug(f"Processing {row['metadata']['page_hash']}...")
+        logging.debug(f'Processing {row["metadata"]["page_hash"]}...')
 
         # Convert json to Parxy document
        if json_folder:
-            res = doclaynet_v12_to_parxy(row["pdf_cells"], row["metadata"], row["category_id"])
-            with open(os.path.join(json_folder, row['metadata']['page_hash'] + ".json"), "w") as json_file:
+            res = doclaynet_v12_to_parxy(
+                row['pdf_cells'], row['metadata'], row['category_id']
+            )
+            with open(
+                os.path.join(json_folder, row['metadata']['page_hash'] + '.json'), 'w'
+            ) as json_file:
                 json.dump(res.model_dump(), json_file)
 
         # Store PDF file
         if pdf_folder:
-            with open(os.path.join(pdf_folder, row['metadata']['page_hash'] + ".pdf"), "wb") as pdf_file:
-                pdf_file.write(row["pdf"])
+            with open(
+                os.path.join(pdf_folder, row['metadata']['page_hash'] + '.pdf'), 'wb'
+            ) as pdf_file:
+                pdf_file.write(row['pdf'])
 
         count_processed += 1
 
         # Terminate after `n_limit` processed entries
         if limit is not None and count_processed >= limit:
             break
 
-    logging.debug(f"Skipped {count_skipped} records")
-    logging.debug(f"Processed {count_processed} records")
+    logging.debug(f'Skipped {count_skipped} records')
+    logging.debug(f'Processed {count_processed} records')
 
-    print(f"Ground truth created in [green]{json_folder}[/green].")
-    print(f"Entries: {count_processed}")
+    print(f'Ground truth created in [green]{json_folder}[/green].')
+    print(f'Entries: {count_processed}')
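
Note: download() implements skip/limit by hand while iterating the streamed split. A standalone sketch of that pattern, assuming any Hugging Face dataset that supports streaming (the dataset name below is a placeholder; the real command uses Dataset.DOCLAYNETV2.value):

from datasets import load_dataset

DATASET_NAME = 'org/dataset'  # placeholder, not the real identifier
skip, limit = 100, 10

data = load_dataset(DATASET_NAME, split='train', streaming=True)

count_processed = 0
count_skipped = 0
for row in data:
    # Skip the first `skip` rows of the stream
    if skip is not None and count_skipped < skip:
        count_skipped += 1
        continue

    ...  # process `row` here (the real command writes JSON/PDF files)

    count_processed += 1
    # Stop once `limit` rows have been processed
    if limit is not None and count_processed >= limit:
        break

The same window can also be expressed declaratively on an iterable dataset with data.skip(skip).take(limit).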

src/parxyval/cli/commands/evaluate.py

Lines changed: 57 additions & 45 deletions
@@ -7,7 +7,13 @@
 
 import pandas as pd
 from rich.console import Console
-from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
+from rich.progress import (
+    Progress,
+    SpinnerColumn,
+    TextColumn,
+    BarColumn,
+    TaskProgressColumn,
+)
 from rich.table import Table
 from rich import print
 from dotenv import load_dotenv
@@ -21,14 +27,15 @@
 
 app = typer.Typer()
 
+
 @app.command()
 def evaluate(
     driver: Optional[str] = typer.Argument(
         default='pymupdf',
         help='The Parxy driver to evaluate. If omitted defaults to pymupdf.',
     ),
     metrics: Optional[List[str]] = typer.Option(
-        ["sequence_matcher"],
+        ['sequence_matcher'],
         '--metric',
         '-m',
         help='The metric to evaluate.',
@@ -67,38 +74,40 @@ def evaluate(
         dir_okay=True,
     ),
 ):
-
     logging.basicConfig(
         level=logging.WARNING,
-        format="%(asctime)s : %(levelname)s : %(name)s : %(message)s"
+        format='%(asctime)s : %(levelname)s : %(name)s : %(message)s',
     )
-
-    metrics_name = [metric.lower().strip().replace("-", "_").replace(" ", "_")
-                    for metric in metrics if get_metric(metric)]
-
+
+    metrics_name = [
+        metric.lower().strip().replace('-', '_').replace(' ', '_')
+        for metric in metrics
+        if get_metric(metric)
+    ]
+
     if all_metrics is True:
         metrics_name = get_metrics_name()
 
     if not os.path.exists(input_folder):
-        logging.debug(f"The specified input folder [{input_folder}] does not exist!")
+        logging.debug(f'The specified input folder [{input_folder}] does not exist!')
         raise typer.Exit(code=422)
     if not os.path.exists(golden_folder):
-        logging.debug(f"The specified golden folder [{golden_folder}] does not exist!")
+        logging.debug(f'The specified golden folder [{golden_folder}] does not exist!')
         raise typer.Exit(code=422)
     if not os.path.exists(output_folder):
         os.makedirs(output_folder)
 
     if len(metrics_name) == 0:
-        logging.debug(f"The specified metrics are not implemented!")
+        logging.debug(f'The specified metrics are not implemented!')
         raise typer.Exit(code=422)
-
+
     metrics_fn = list([get_metric(metric) for metric in metrics_name])
 
     console = Console()
-
-    logging.debug(f"Input folder: {input_folder}")
-    logging.debug(f"Output folder: {output_folder}")
-    logging.debug(f"Metrics: {metrics_name}")
+
+    logging.debug(f'Input folder: {input_folder}')
+    logging.debug(f'Output folder: {output_folder}')
+    logging.debug(f'Metrics: {metrics_name}')
 
     # Get total number of files to process
     files = os.listdir(input_folder)
@@ -107,37 +116,37 @@ def evaluate(
     res_list = []
     with Progress(
         SpinnerColumn(),
-        TextColumn("[progress.description]{task.description}"),
+        TextColumn('[progress.description]{task.description}'),
         BarColumn(),
         TaskProgressColumn(),
         console=console,
-        transient=True
+        transient=True,
     ) as progress:
-        task = progress.add_task("Evaluating documents...", total=total_files)
-
+        task = progress.add_task('Evaluating documents...', total=total_files)
+
         for filename in files:
-            progress.update(task, description=f"Processing {filename}...")
+            progress.update(task, description=f'Processing {filename}...')
 
             # Read the parsing result
-            with open(os.path.join(input_folder, filename), "r") as f:
+            with open(os.path.join(input_folder, filename), 'r') as f:
                 doc = Document(**json.loads(f.read()))
 
             # Read the ground truth
             try:
-                with open(os.path.join(golden_folder, filename), "r") as f:
+                with open(os.path.join(golden_folder, filename), 'r') as f:
                     golden_doc = Document(**json.loads(f.read()))
             except FileNotFoundError:
-                logging.error(f"File [{filename}] does not exist!")
+                logging.error(f'File [{filename}] does not exist!')
                 progress.advance(task)
                 continue
 
             base_data = {
-                "filename": filename,
-                "collection": golden_doc.source_data["collection"],
-                "doc_category": golden_doc.source_data["doc_category"],
-                "original_filename": golden_doc.source_data["original_filename"],
-                "page_no": golden_doc.source_data["page_no"],
-                "processing_time_seconds": doc.source_data["processing_time_seconds"],
+                'filename': filename,
+                'collection': golden_doc.source_data['collection'],
+                'doc_category': golden_doc.source_data['doc_category'],
+                'original_filename': golden_doc.source_data['original_filename'],
+                'page_no': golden_doc.source_data['page_no'],
+                'processing_time_seconds': doc.source_data['processing_time_seconds'],
             }
 
             # merge all metrics dicts into one
@@ -150,28 +159,31 @@ def evaluate(
             res_list.append(row)
             progress.advance(task)
 
-    timestamp_str = str(time.time()).replace(".", "")
+    timestamp_str = str(time.time()).replace('.', '')
     res_df = pd.DataFrame(res_list)
-    input_folder_name = input_folder.replace(os.sep, "/").replace("\\", "/")
-    input_folder_name = input_folder_name.split("/")[-1].replace(" ", "_").lower()
-    output_file = f"eval_{input_folder_name}_{timestamp_str}.csv"
+    input_folder_name = input_folder.replace(os.sep, '/').replace('\\', '/')
+    input_folder_name = input_folder_name.split('/')[-1].replace(' ', '_').lower()
+    output_file = f'eval_{input_folder_name}_{timestamp_str}.csv'
     output_path = os.path.join(output_folder, output_file)
     res_df.to_csv(output_path, index=False)
 
-    print(f"\n[green]✓[/green] Evaluation completed. Results saved to: [blue]{output_path}[/blue]")
-
+    print(
+        f'\n[green]✓[/green] Evaluation completed. Results saved to: [blue]{output_path}[/blue]'
+    )
+
     # Print evaluation statistics
     table = Table()
-    table.add_column("Metric")
-    table.add_column("Value", justify="right", style="green")
-
-    table.add_row("Documents processed", str(len(res_list)))
-    table.add_row("Average parsing time", f"{res_df['processing_time_seconds'].mean():.2f}s")
-
+    table.add_column('Metric')
+    table.add_column('Value', justify='right', style='green')
+
+    table.add_row('Documents processed', str(len(res_list)))
+    table.add_row(
+        'Average parsing time', f'{res_df["processing_time_seconds"].mean():.2f}s'
+    )
+
     for metric_column in metrics_name:
         if metric_column in res_df.columns and not res_df[metric_column].isna().all():
             if res_df[metric_column].dtype in ['float64', 'int64']:
-                table.add_row(metric_column, f"{res_df[metric_column].mean():.4f}")
-
-    console.print(table)
+                table.add_row(metric_column, f'{res_df[metric_column].mean():.4f}')
 
+    console.print(table)
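
Note: the transient progress bar configured in evaluate() is plain Rich API. A minimal runnable sketch of the same configuration (the file list and sleep are stand-ins for the real per-document evaluation):

import time

from rich.console import Console
from rich.progress import (
    Progress,
    SpinnerColumn,
    TextColumn,
    BarColumn,
    TaskProgressColumn,
)

console = Console()
files = [f'doc_{i}.json' for i in range(5)]  # placeholder work items

with Progress(
    SpinnerColumn(),
    TextColumn('[progress.description]{task.description}'),
    BarColumn(),
    TaskProgressColumn(),
    console=console,
    transient=True,  # clear the bar from the terminal when done
) as progress:
    task = progress.add_task('Evaluating documents...', total=len(files))
    for filename in files:
        progress.update(task, description=f'Processing {filename}...')
        time.sleep(0.2)  # stand-in for reading and scoring one document
        progress.advance(task)

console.print('[green]✓[/green] Done.')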
