-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
73 lines (60 loc) · 3.15 KB
/
main.py
File metadata and controls
73 lines (60 loc) · 3.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import click
from pathlib import Path
from src.pipeline import Pipeline, configs, preprocess_configs
# Root command group: every subcommand in this file attaches to it through
# ``@cli.command()``. click prints the docstring as the group's --help text,
# so its wording is user-facing output and is kept exactly as-is.
@click.group()
def cli():
    """Pipeline command line interface for processing PDF reports and questions."""
# Subcommand that announces the download and then delegates to the
# Pipeline class-level helper; no Pipeline instance is constructed here.
# The docstring is what click shows as this command's help text.
@cli.command()
def download_models():
    """Download required docling models."""
    status_line = "Downloading docling models..."
    click.echo(status_line)
    Pipeline.download_docling_models()
# Subcommand that parses the PDF reports found under the current working
# directory. The docstring is what click shows as this command's help text.
#
# Options:
#   --parallel/--sequential  boolean flag, defaults to parallel.
#   --chunk-size             integer >= 1, defaults to 2.
#   --max-workers            integer >= 1, defaults to 10.
#
# Both numeric options are range-checked by click, so zero or negative values
# are rejected with a usage error before any pipeline work starts, instead of
# being forwarded unchecked to Pipeline.parse_pdf_reports.
@cli.command()
@click.option(
    "--parallel/--sequential",
    default=True,
    help="Run parsing in parallel or sequential mode",
)
@click.option(
    "--chunk-size",
    type=click.IntRange(min=1),
    default=2,
    help="Number of PDFs to process in each worker",
)
@click.option(
    "--max-workers",
    type=click.IntRange(min=1),
    default=10,
    help="Number of parallel worker processes",
)
def parse_pdfs(parallel, chunk_size, max_workers):
    """Parse PDF reports with optional parallel processing."""
    # The pipeline is rooted at the directory the command is invoked from.
    root_path = Path.cwd()
    pipeline = Pipeline(root_path)
    click.echo(f"Parsing PDFs (parallel={parallel}, chunk_size={chunk_size}, max_workers={max_workers})")
    pipeline.parse_pdf_reports(
        parallel=parallel,
        chunk_size=chunk_size,
        max_workers=max_workers,
    )
# Subcommand that serializes tables in already-parsed reports, using the
# current working directory as the pipeline root. The docstring is what click
# shows as this command's help text.
#
# Options:
#   --max-workers  integer >= 1, defaults to 10.
#
# The worker count is range-checked by click, so zero or negative values are
# rejected with a usage error instead of being forwarded unchecked to
# Pipeline.serialize_tables.
@cli.command()
@click.option(
    "--max-workers",
    type=click.IntRange(min=1),
    default=10,
    help="Number of workers for table serialization",
)
def serialize_tables(max_workers):
    """Serialize tables in parsed reports using parallel threading."""
    # The pipeline is rooted at the directory the command is invoked from.
    root_path = Path.cwd()
    pipeline = Pipeline(root_path)
    click.echo(f"Serializing tables (max_workers={max_workers})...")
    pipeline.serialize_tables(max_workers=max_workers)
# Subcommand that runs the parsed reports through the pipeline using one of
# the named presets from ``preprocess_configs``. click restricts ``--config``
# to the listed choices before the function body runs. The docstring is what
# click shows as this command's help text.
@cli.command()
@click.option(
    "--config",
    type=click.Choice(["ser_tab", "no_ser_tab"]),
    default="no_ser_tab",
    help="Configuration preset to use",
)
def process_reports(config):
    """Process parsed reports through the pipeline stages."""
    # Root the pipeline at the invocation directory and hand it the preset
    # selected on the command line.
    runner = Pipeline(Path.cwd(), run_config=preprocess_configs[config])
    click.echo(f"Processing parsed reports (config={config})...")
    runner.process_parsed_reports()
# Variant of the report-processing subcommand that forwards an embedding
# model name to Pipeline.process_parsed_reports_free. ``--config`` selects a
# preset from ``preprocess_configs``; ``--model`` is passed through unchanged
# as ``model_name``. The docstring is what click shows as this command's help
# text.
@cli.command()
@click.option(
    "--config",
    type=click.Choice(["ser_tab", "no_ser_tab"]),
    default="no_ser_tab",
    help="Configuration preset to use",
)
@click.option(
    "--model",
    default="all-MiniLM-L6-v2",
    help="Free embedding model to use (all-MiniLM-L6-v2, all-mpnet-base-v2, etc.)",
)
def process_reports_free(config, model):
    """Process parsed reports using FREE Hugging Face embeddings instead of OpenAI."""
    # Root the pipeline at the invocation directory and hand it the preset
    # selected on the command line.
    runner = Pipeline(Path.cwd(), run_config=preprocess_configs[config])
    click.echo(f"🆓 Processing parsed reports with FREE embeddings (config={config}, model={model})...")
    runner.process_parsed_reports_free(model_name=model)
# Subcommand that runs question processing with one of the named presets from
# ``configs``. click restricts ``--config`` to the listed choices before the
# function body runs. The docstring is what click shows as this command's
# help text.
@cli.command()
@click.option(
    "--config",
    type=click.Choice(
        [
            "base",
            "pdr",
            "max",
            "max_no_ser_tab",
            "max_nst_o3m",
            "max_st_o3m",
            "ibm_llama70b",
            "ibm_llama8b",
            "gemini_thinking",
        ]
    ),
    default="base",
    help="Configuration preset to use",
)
def process_questions(config):
    """Process questions using the pipeline."""
    # Root the pipeline at the invocation directory and hand it the preset
    # selected on the command line.
    runner = Pipeline(Path.cwd(), run_config=configs[config])
    click.echo(f"Processing questions (config={config})...")
    runner.process_questions()
# Dispatch to the click group only when this file is run as a script, so that
# importing the module does not trigger command-line parsing.
if __name__ == "__main__":
    cli()