Skip to content

Commit 27f4baf

Browse files
committed
add log table summary to cli
1 parent ae30610 commit 27f4baf

5 files changed

Lines changed: 437 additions & 21 deletions

File tree

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
-- analyze_logs.sql
-- Summary report over the pipeline logs table.
-- Intended for the DuckDB CLI: it relies on shell dot-commands and on
-- querying a parquet file directly by path.

-- Dot-commands must each be on their own line.
.mode box

-- Single place to point the report at a logs parquet file.
CREATE OR REPLACE TEMP VIEW logs AS
SELECT
    *
FROM
    '/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/output_python/tables/table_logs.parquet';

.timer on

-- Summary Statistics
SELECT
    'Log Summary' as section;

SELECT
    COUNT(*) as total_logs,
    COUNT(DISTINCT file_name) as unique_trackers,
    MIN(timestamp) as earliest,
    MAX(timestamp) as latest
FROM
    logs;

-- Level Distribution
SELECT
    'Level Distribution' as section;

SELECT
    level,
    COUNT(*) as count
FROM
    logs
GROUP BY
    level
ORDER BY
    count DESC;

-- Top Errors
SELECT
    'Top 10 Files with Most Errors' as section;

SELECT
    file_name,
    COUNT(*) as issues
FROM
    logs
WHERE
    level = 'ERROR'
GROUP BY
    file_name
ORDER BY
    issues DESC
LIMIT
    10;

-- Top Warnings
SELECT
    'Top 10 Files with Most Warnings' as section;

SELECT
    file_name,
    COUNT(*) as issues
FROM
    logs
WHERE
    level = 'WARNING'
GROUP BY
    file_name
ORDER BY
    issues DESC
LIMIT
    10;

-- Exception Summary
SELECT
    'Exception Types' as section;

SELECT
    exception_type,
    COUNT(*) as count
FROM
    logs
WHERE
    has_exception = true
GROUP BY
    exception_type
ORDER BY
    count DESC;

a4d-python/src/a4d/cli.py

Lines changed: 126 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,77 @@
11
"""Command-line interface for A4D pipeline."""
22

33
from pathlib import Path
4+
from typing import Annotated
45

6+
import polars as pl
57
import typer
68
from rich.console import Console
79
from rich.table import Table
810

9-
from a4d.pipeline.patient import run_patient_pipeline
11+
from a4d.pipeline.patient import process_patient_tables, run_patient_pipeline
12+
from a4d.tables.logs import create_table_logs
1013

1114
app = typer.Typer(name="a4d", help="A4D medical tracker data processing pipeline", no_args_is_help=True)
1215

1316
console = Console()
1417

1518

19+
def _count_parquet_rows(path: Path) -> str:
    """Return the row count of a parquet file formatted with thousands separators.

    Args:
        path: Path to the parquet file

    Returns:
        The formatted row count, or "?" if the file cannot be read.
    """
    try:
        # Lazy scan so the table is not materialized in memory just to be counted.
        n_rows = pl.scan_parquet(path).select(pl.len()).collect().item()
    except Exception:
        # Best effort: the summary is informational and must not abort the command.
        return "?"
    return f"{n_rows:,}"


def _display_tables_summary(tables: dict[str, Path]) -> None:
    """Display summary table of created tables with record counts.

    Patient tables (static, monthly, annual) are listed first, then the logs
    table, then any remaining entries in their original order so that no
    created table is silently left out of the summary.

    Args:
        tables: Dictionary mapping table name to output path
    """
    if not tables:
        return

    console.print("\n[bold green]Created Tables:[/bold green]")
    tables_table = Table(title="Created Tables")
    tables_table.add_column("Table", style="cyan")
    tables_table.add_column("Path", style="green")
    tables_table.add_column("Records", justify="right", style="magenta")

    # Known tables in display order; anything else is appended afterwards.
    preferred_order = ["static", "monthly", "annual", "logs"]
    ordered_names = [name for name in preferred_order if name in tables]
    ordered_names += [name for name in tables if name not in preferred_order]

    for name in ordered_names:
        path = tables[name]
        tables_table.add_row(name, str(path.name), _count_parquet_rows(path))

    console.print(tables_table)
    console.print()
57+
58+
1659
@app.command("process-patient")
1760
def process_patient_cmd(
18-
file: Path | None = typer.Option(
19-
None, "--file", "-f", help="Process specific tracker file (if not set, processes all files in data_root)"
20-
),
21-
workers: int = typer.Option(1, "--workers", "-w", help="Number of parallel workers (1 = sequential)"),
22-
skip_tables: bool = typer.Option(False, "--skip-tables", help="Skip table creation (only extract + clean)"),
23-
force: bool = typer.Option(False, "--force", help="Force reprocessing (ignore existing outputs)"),
24-
output_root: Path | None = typer.Option(None, "--output", "-o", help="Output directory (default: from config)"),
61+
file: Annotated[
62+
Path | None,
63+
typer.Option(
64+
"--file", "-f", help="Process specific tracker file (if not set, processes all files in data_root)"
65+
),
66+
] = None,
67+
workers: Annotated[int, typer.Option("--workers", "-w", help="Number of parallel workers (1 = sequential)")] = 1,
68+
skip_tables: Annotated[
69+
bool, typer.Option("--skip-tables", help="Skip table creation (only extract + clean)")
70+
] = False,
71+
force: Annotated[bool, typer.Option("--force", help="Force reprocessing (ignore existing outputs)")] = False,
72+
output_root: Annotated[
73+
Path | None, typer.Option("--output", "-o", help="Output directory (default: from config)")
74+
] = None,
2575
):
2676
"""Process patient data pipeline.
2777
@@ -126,7 +176,7 @@ def process_patient_cmd(
126176
files_by_errors = sorted(
127177
[(tr.tracker_file.name, tr.cleaning_errors) for tr in result.tracker_results if tr.cleaning_errors > 0],
128178
key=lambda x: x[1],
129-
reverse=True
179+
reverse=True,
130180
)[:10]
131181

132182
errors_table = Table()
@@ -139,16 +189,7 @@ def process_patient_cmd(
139189
console.print(errors_table)
140190

141191
# Show created tables
142-
if result.tables:
143-
console.print("\n[bold green]Created Tables:[/bold green]")
144-
tables_table = Table()
145-
tables_table.add_column("Table", style="cyan")
146-
tables_table.add_column("Path", style="green")
147-
148-
for name, path in result.tables.items():
149-
tables_table.add_row(name, str(path))
150-
151-
console.print(tables_table)
192+
_display_tables_summary(result.tables)
152193

153194
# Exit status
154195
if result.success:
@@ -160,8 +201,74 @@ def process_patient_cmd(
160201

161202
except Exception as e:
162203
console.print(f"\n[bold red]Error: {e}[/bold red]\n")
204+
raise typer.Exit(1) from e
205+
206+
207+
@app.command("create-tables")
def create_tables_cmd(
    input_dir: Annotated[Path, typer.Option("--input", "-i", help="Directory containing cleaned parquet files")],
    output_dir: Annotated[
        Path | None,
        typer.Option("--output", "-o", help="Output directory for tables (default: 'tables' next to input_dir)"),
    ] = None,
):
    """Create final tables from existing cleaned parquet files.

    This command creates the patient tables (static, monthly, annual) and logs table
    from existing cleaned parquet files, without running the full pipeline.

    By default tables are written to a 'tables' directory next to the input
    directory, and logs are read from a 'logs' directory next to it.

    Useful for:
    - Re-creating tables after fixing table creation logic
    - Creating tables from manually cleaned data
    - Testing table creation independently

    \\b
    Examples:
        # Create tables from existing output
        uv run a4d create-tables --input output/patient_data_cleaned

        # Specify custom output directory
        uv run a4d create-tables --input output/patient_data_cleaned --output custom_tables
    """
    console.print("\n[bold blue]A4D Table Creation[/bold blue]\n")

    # Determine output directory: a sibling of input_dir, mirroring the
    # <output_root>/patient_data_cleaned + <output_root>/tables pipeline layout.
    if output_dir is None:
        output_dir = input_dir.parent / "tables"

    console.print(f"Input directory: {input_dir}")
    console.print(f"Output directory: {output_dir}\n")

    # Find cleaned parquet files; bail out early if there is nothing to process.
    cleaned_files = list(input_dir.glob("*_patient_cleaned.parquet"))
    if not cleaned_files:
        console.print(f"[bold red]Error: No cleaned parquet files found in {input_dir}[/bold red]\n")
        raise typer.Exit(1)

    console.print(f"Found {len(cleaned_files)} cleaned parquet files\n")

    try:
        console.print("[bold]Creating tables...[/bold]")

        # Create patient tables
        tables = process_patient_tables(input_dir, output_dir)

        # Create logs table separately (operational data); a missing logs
        # directory is only a warning, the patient tables are still reported.
        logs_dir = input_dir.parent / "logs"
        if logs_dir.exists():
            console.print("  • Creating logs table...")
            logs_table_path = create_table_logs(logs_dir, output_dir)
            tables["logs"] = logs_table_path
        else:
            console.print(f"  [yellow]Warning: Logs directory not found at {logs_dir}[/yellow]")

        # Display results
        console.print("\n[bold green]✓ Tables created successfully![/bold green]")
        _display_tables_summary(tables)

    except Exception as e:
        # Report the failure and exit non-zero, keeping the original cause chained.
        console.print(f"\n[bold red]Error creating tables: {e}[/bold red]\n")
        raise typer.Exit(1) from e
271+
165272

166273
@app.command("version")
167274
def version_cmd():

a4d-python/src/a4d/pipeline/patient.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from a4d.logging import setup_logging
1212
from a4d.pipeline.models import PipelineResult, TrackerResult
1313
from a4d.pipeline.tracker import process_tracker_patient
14+
from a4d.tables.logs import create_table_logs
1415
from a4d.tables.patient import (
1516
create_table_patient_data_annual,
1617
create_table_patient_data_monthly,
@@ -107,7 +108,7 @@ def process_patient_tables(
107108
annual_path = create_table_patient_data_annual(cleaned_files, output_dir)
108109
tables["annual"] = annual_path
109110

110-
logger.info(f"Created {len(tables)} final tables")
111+
logger.info(f"Created {len(tables)} patient tables")
111112
return tables
112113

113114

@@ -291,8 +292,19 @@ def run_patient_pipeline(
291292
try:
292293
cleaned_dir = output_root / "patient_data_cleaned"
293294
tables_dir = output_root / "tables"
295+
296+
# Create patient tables
294297
tables = process_patient_tables(cleaned_dir, tables_dir)
295-
logger.info(f"Created {len(tables)} final tables")
298+
299+
# Create logs table separately (operational data, not patient data)
300+
logs_dir = output_root / "logs"
301+
if logs_dir.exists():
302+
logger.info("Creating logs table from pipeline execution logs")
303+
logs_table_path = create_table_logs(logs_dir, tables_dir)
304+
tables["logs"] = logs_table_path
305+
logger.info(f"Logs table created: {logs_table_path}")
306+
307+
logger.info(f"Created {len(tables)} tables total")
296308
except Exception as e:
297309
logger.exception("Failed to create tables")
298310
# Don't fail entire pipeline if table creation fails

a4d-python/src/a4d/tables/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Table creation module for final output tables."""
22

3+
from a4d.tables.logs import create_table_logs, parse_log_file
34
from a4d.tables.patient import (
45
create_table_patient_data_annual,
56
create_table_patient_data_monthly,
@@ -12,4 +13,6 @@
1213
"create_table_patient_data_monthly",
1314
"create_table_patient_data_static",
1415
"read_cleaned_patient_data",
16+
"create_table_logs",
17+
"parse_log_file",
1518
]

0 commit comments

Comments
 (0)