nautic-optimizer/cli.py at main · MorganRff/nautic-optimizer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
"""Unified CLI for the nautic-optimizer pipeline.

Each subcommand maps to one pipeline stage with explicit, deterministic
default output paths that chain into the next step.

Pipeline order
--------------
  collect     →  data/{region}_leads.csv        (Google Places + website text)
  predict     →  data/{region}_predicted.csv   (ML filter → tag; is_target==1 only)
  enrich      →  data/{region}_enriched.csv    (email + phone for kept leads only)
  consolidate →  data/_locked/final_for_sql.csv (standardize + unique_key + upsert)
  snapshot    →  data/public_nautical_data.csv  (anonymize for GitHub)
  sync-db     →  nautical_leads.db              (SQLite copy of final_for_sql.csv)
  train       →  nautic_classifier_v1.joblib  +  reports/nautic_classifier_v1/
  mail        →  (reads data/_locked/final_for_sql.csv)
  status      →  data-quality dashboard

Legacy helpers (kept for backward compatibility)
-------------------------------------------------
  standardize      → standalone schema normalisation step
  consolidate-data → standalone website-repair step

Usage
-----
  python cli.py collect     --region occitanie
  python cli.py predict     --region occitanie
  python cli.py enrich      --region occitanie
  python cli.py consolidate --region occitanie
  python cli.py snapshot
  python cli.py sync-db
  python cli.py train
  python cli.py mail        [--dry-run]
  python cli.py status

All commands append structured logs to logs/pipeline.log.
"""

from __future__ import annotations

import argparse
import sys
from pathlib import Path

# Directory containing this file; every default data, model and report path
# in this module is built relative to it.
ROOT = Path(__file__).resolve().parent

from src.utils.logger import get_logger  # noqa: E402  (after ROOT is defined)

# Logger shared by all subcommands in this module.
LOGGER = get_logger("cli")


# ---------------------------------------------------------------------------
# Path helpers – default per-region input/output locations for each stage
# ---------------------------------------------------------------------------

def _leads_path(region: str) -> Path:
    """Return the default raw-leads CSV path, ``data/{region}_leads.csv`` under ROOT."""
    filename = f"{region}_leads.csv"
    return ROOT.joinpath("data", filename)

def _enriched_path(region: str) -> Path:
    """Return the default enriched CSV path, ``data/{region}_enriched.csv`` under ROOT."""
    filename = f"{region}_enriched.csv"
    return ROOT.joinpath("data", filename)

def _predicted_path(region: str) -> Path:
    """Return the default predicted CSV path, ``data/{region}_predicted.csv`` under ROOT."""
    filename = f"{region}_predicted.csv"
    return ROOT.joinpath("data", filename)

def _standardized_path(region: str) -> Path:
    """Return the default standardized CSV path, ``data/{region}_standardized.csv`` under ROOT."""
    filename = f"{region}_standardized.csv"
    return ROOT.joinpath("data", filename)

# Source-dictionary CSVs handed to ``fill_websites`` and used as the default
# ``--dicts`` value of the legacy ``consolidate-data`` command.
_DEFAULT_DICTS: list[Path] = [
    ROOT / "data" / "_archive" / "AS.csv",
    ROOT / "data" / "_archive" / "BR.csv",
]
# Internal master dataset: default ``--master`` for ``consolidate`` and
# ``consolidate-data``, and default ``--input`` for ``snapshot`` and ``sync-db``.
_MASTER_PATH: Path = ROOT / "data" / "_locked" / "final_for_sql.csv"
# Default ``--output`` of ``snapshot`` (the anonymized public copy).
_PUBLIC_PATH: Path = ROOT / "data" / "public_nautical_data.csv"
# Default ``--model`` of ``predict``.
_MODEL_PATH: Path = ROOT / "nautic_classifier_v1.joblib"


# ---------------------------------------------------------------------------
# collect
# ---------------------------------------------------------------------------

def cmd_collect(args: argparse.Namespace) -> None:
    """Run the Google Places grid collection for ``args.region``.

    Forwards the optional grid limit and bounding-box overrides from *args*
    to ``GooglePlacesGridCollector.collect_places``.  For the
    ``northern_france`` region a coastal point filter is passed along so that
    inland grid points are skipped.

    NOTE(review): this function only logs the default leads path; presumably
    the collector writes that file itself — confirm in ``src.engine``.
    """
    from src.engine import GooglePlacesGridCollector, filter_coastal_points
    from src.config import NORTHERN_FRANCE_COAST_COORDS, COASTAL_BUFFER_DEG

    region = args.region
    LOGGER.info("[collect] region=%s  max_grid_points=%s", region, args.max_grid_points)

    def _coastal_only(pts):
        # Keep only grid points inside the buffered coastal strip.
        return filter_coastal_points(
            pts,
            coast_coords=NORTHERN_FRANCE_COAST_COORDS,
            buffer_deg=COASTAL_BUFFER_DEG,
        )

    # Narrow the grid for northern_france only, to avoid spending API quota on
    # inland points with no nautical activity.
    if region == "northern_france":
        point_filter = _coastal_only
        LOGGER.info("[collect] coastal filter active (buffer=%.2f°)", COASTAL_BUFFER_DEG)
    else:
        point_filter = None

    collector = GooglePlacesGridCollector(region_name=region)
    collected = collector.collect_places(
        region_name=region,
        max_grid_points=args.max_grid_points,
        lat_min=args.lat_min,
        lat_max=args.lat_max,
        lng_min=args.lng_min,
        lng_max=args.lng_max,
        point_filter=point_filter,
    )
    LOGGER.info("[collect] done → %s  (%d rows)", _leads_path(region), len(collected))


def _add_collect_parser(sub: argparse._SubParsersAction) -> None:
    """Register the ``collect`` subcommand and its options on *sub*."""
    parser = sub.add_parser("collect", help="Scrape Google Places and write raw leads CSV.")
    parser.add_argument("--region", default="bretagne", help="Region key from src.config.REGIONS.")
    parser.add_argument("--max-grid-points", type=int, default=None, help="Limit grid points (testing).")
    # Optional bounding-box overrides; all default to None.
    for flag in ("--lat-min", "--lat-max", "--lng-min", "--lng-max"):
        parser.add_argument(flag, type=float, default=None)
    parser.set_defaults(func=cmd_collect)


# ---------------------------------------------------------------------------
# enrich
# ---------------------------------------------------------------------------

def cmd_enrich(args: argparse.Namespace) -> None:
    """Fill contact details for the predicted leads via ``ContactEnricher``.

    Input defaults to ``data/{region}_predicted.csv`` and output to
    ``data/{region}_enriched.csv``; both can be overridden with ``--input``
    and ``--output``.  Exits with a hint when the input file is missing.
    """
    from src.enricher import ContactEnricher

    region = args.region
    input_path = _predicted_path(region) if not args.input else Path(args.input)
    output_path = _enriched_path(region) if not args.output else Path(args.output)

    if not input_path.exists():
        message = (
            f"Input not found: {input_path}\n"
            f"  Run `python cli.py predict --region {region}` first,\n"
            f"  or pass --input explicitly."
        )
        sys.exit(message)

    LOGGER.info("[enrich] %s → %s", input_path.name, output_path.name)
    enricher_options = {
        "http_delay_seconds": args.http_delay,
        "requests_timeout_seconds": args.timeout,
        "overwrite_email": args.overwrite_email,
        "overwrite_phone": args.overwrite_phone,
    }
    ContactEnricher(**enricher_options).enrich_csv(
        input_path, output_path, max_rows=args.max_rows
    )
    LOGGER.info("[enrich] done → %s", output_path)


def _add_enrich_parser(sub: argparse._SubParsersAction) -> None:
    """Register the ``enrich`` subcommand and its options on *sub*."""
    parser = sub.add_parser("enrich", help="Crawl websites and fill email + phone.")
    add = parser.add_argument

    # Path selection.
    add("--region", default="bretagne", help="Used to infer default input/output paths.")
    add("--input", default=None, help="Override input CSV (default: {region}_predicted.csv).")
    add("--output", default=None, help="Override output CSV path.")

    # Crawl tuning.
    add("--http-delay", type=float, default=1.2)
    add("--timeout", type=float, default=12.0)
    add("--max-rows", type=int, default=None)

    # Overwrite switches.
    add("--overwrite-email", action="store_true")
    add("--overwrite-phone", action="store_true")

    parser.set_defaults(func=cmd_enrich)


# ---------------------------------------------------------------------------
# standardize
# ---------------------------------------------------------------------------

def cmd_standardize(args: argparse.Namespace) -> None:
    """(Legacy) Normalise the schema of a predicted CSV and fill websites.

    Reads ``--input`` (default ``data/{region}_predicted.csv``), drops the
    ``is_target`` column if present, runs ``LeadStandardizer.standardize``
    and ``fill_websites``, then writes ``--output`` (default
    ``data/{region}_standardized.csv``).  Exits when the input is missing or
    when the output would overwrite the input.

    Prefer the unified ``consolidate`` command for new workflows.
    """
    import pandas as pd
    from scripts.Standardizer import LeadStandardizer
    from src.consolidator import fill_websites

    region = args.region
    input_path = _predicted_path(region) if not args.input else Path(args.input)
    output_path = _standardized_path(region) if not args.output else Path(args.output)

    if not input_path.exists():
        message = (
            f"Input not found: {input_path}\n"
            f"  Run `python cli.py predict --region {region}` first."
        )
        sys.exit(message)
    if input_path.resolve() == output_path.resolve():
        sys.exit("Refusing to overwrite input. Choose a different --output.")

    LOGGER.info("[standardize] %s → %s", input_path.name, output_path.name)
    leads = pd.read_csv(input_path).drop(columns=["is_target"], errors="ignore")

    standardized = LeadStandardizer(leads).standardize(source=args.source, statut=args.statut)
    result = fill_websites(standardized, _DEFAULT_DICTS)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    result.to_csv(output_path, index=False)
    LOGGER.info("[standardize] done → %s  (%d rows)", output_path, len(result))


def _add_standardize_parser(sub: argparse._SubParsersAction) -> None:
    """Register the legacy ``standardize`` subcommand and its options on *sub*."""
    parser = sub.add_parser(
        "standardize",
        help="(Legacy) Standalone schema normalisation + website fill.",
    )
    add = parser.add_argument
    add("--region", default="bretagne", help="Used to infer default input/output paths.")
    add("--input", default=None, help="Override input CSV (default: {region}_predicted.csv).")
    add("--output", default=None, help="Override output CSV path.")
    add("--source", default="google_places")
    add("--statut", default="new")
    parser.set_defaults(func=cmd_standardize)


# ---------------------------------------------------------------------------
# predict
# ---------------------------------------------------------------------------

def cmd_predict(args: argparse.Namespace) -> None:
    """Run the trained classifier over collected leads via ``filter_leads``.

    Input defaults to ``data/{region}_leads.csv`` and output to
    ``data/{region}_predicted.csv``.  Exits with a hint when either the
    input CSV or the model file is missing.

    The input is expected to already carry a ``scraped_text`` column
    (populated by ``collect``).  If it is absent or empty, predictions still
    run but with reduced accuracy — re-run ``collect`` to fix this.
    """
    from src.filter_leads import filter_leads

    region = args.region
    input_path = _leads_path(region) if not args.input else Path(args.input)
    output_path = _predicted_path(region) if not args.output else Path(args.output)
    model_path = Path(args.model)

    if not input_path.exists():
        missing_input = (
            f"Input not found: {input_path}\n"
            f"  Run `python cli.py collect --region {region}` first."
        )
        sys.exit(missing_input)
    if not model_path.exists():
        missing_model = (
            f"Model not found: {model_path}\n"
            f"  Run `python cli.py train` to train the classifier."
        )
        sys.exit(missing_model)

    LOGGER.info("[predict] %s → %s  model=%s", input_path.name, output_path.name, model_path.name)
    kept = filter_leads(input_path, output_path, model_path)
    LOGGER.info("[predict] done → %s  (%d rows kept)", output_path, len(kept))


def _add_predict_parser(sub: argparse._SubParsersAction) -> None:
    """Register the ``predict`` subcommand and its options on *sub*."""
    parser = sub.add_parser(
        "predict",
        help="Run ML classifier on collected leads: keep is_target==1 rows, add activity tags.",
    )
    add = parser.add_argument
    add("--region", default="bretagne", help="Used to infer default input/output paths.")
    add("--input", default=None, help="Override input CSV (default: {region}_leads.csv).")
    add("--output", default=None, help="Override output CSV (default: {region}_predicted.csv).")
    add("--model", default=str(_MODEL_PATH), help="Path to trained joblib pipeline.")
    parser.set_defaults(func=cmd_predict)


# ---------------------------------------------------------------------------
# train
# ---------------------------------------------------------------------------

def cmd_train(args: argparse.Namespace) -> None:
    """Train the classifier by delegating to ``scripts.model_training.main``.

    The training entry point is called without arguments, so its options are
    handed over through ``sys.argv`` (presumably it parses them itself —
    confirm in ``scripts/model_training.py``).  The original ``sys.argv`` is
    restored afterwards, even if training raises, so the process-wide argument
    list is not left pointing at the forwarded values.

    Output locations (``--model-out``, ``--reports-dir``,
    ``--audit-errors-out``, ``--full-test-out``) are resolved against ROOT.
    Exits when the training data file does not exist.
    """
    import scripts.model_training as mt

    data_path = Path(args.data)
    if not data_path.exists():
        sys.exit(f"Training data not found: {data_path}")

    model_out = ROOT / args.model_out
    LOGGER.info("[train] data=%s  random_state=%d", data_path.name, args.random_state)
    forwarded_argv = [
        "model_training",
        "--data", str(data_path),
        "--model-out", str(model_out),
        "--reports-dir", str(ROOT / args.reports_dir),
        "--audit-errors-out", str(ROOT / args.audit_errors_out),
        "--full-test-out", str(ROOT / args.full_test_out),
        "--random-state", str(args.random_state),
    ]

    # Swap argv only for the duration of the delegated call.
    saved_argv = sys.argv
    sys.argv = forwarded_argv
    try:
        mt.main()
    finally:
        sys.argv = saved_argv
    LOGGER.info("[train] done → %s", model_out)


def _add_train_parser(sub: argparse._SubParsersAction) -> None:
    """Register the ``train`` subcommand and its options on *sub*."""
    parser = sub.add_parser("train", help="Train the multi-label TF-IDF classifier.")
    default_data = str(ROOT / "data" / "training_dataset_cleaned.csv")
    add = parser.add_argument
    add("--data", default=default_data)
    # The relative defaults below are joined onto ROOT by cmd_train.
    add("--model-out", default="nautic_classifier_v1.joblib")
    add("--reports-dir", default="reports/nautic_classifier_v1")
    add("--audit-errors-out", default="data/model_audit_errors.csv")
    add("--full-test-out", default="data/full_test_results.csv")
    add("--random-state", type=int, default=42)
    parser.set_defaults(func=cmd_train)


# ---------------------------------------------------------------------------
# mail
# ---------------------------------------------------------------------------

def cmd_mail(args: argparse.Namespace) -> None:
    """Run the mailing campaign by delegating to ``scripts.blitz_mailer.main``.

    The mailer entry point is called without arguments, so its options are
    handed over through ``sys.argv`` (presumably it parses them itself —
    confirm in ``scripts/blitz_mailer.py``).  The original ``sys.argv`` is
    restored afterwards, even if the mailer raises.

    The input CSV defaults to the master dataset (``_MASTER_PATH``) unless
    ``--input`` is given; ``--dry-run`` is forwarded only when set.
    """
    import scripts.blitz_mailer as bm

    input_path = Path(args.input) if args.input else _MASTER_PATH
    LOGGER.info("[mail] input=%s  dry_run=%s  tags_filter=%s", input_path.name, args.dry_run, args.tags_filter)

    forwarded_argv = [
        "blitz_mailer",
        "--input", str(input_path),
        "--tags-filter", args.tags_filter,
        "--delay-min", str(args.delay_min),
        "--delay-max", str(args.delay_max),
    ]
    if args.dry_run:
        forwarded_argv.append("--dry-run")

    # Swap argv only for the duration of the delegated call.
    saved_argv = sys.argv
    sys.argv = forwarded_argv
    try:
        bm.main()
    finally:
        sys.argv = saved_argv


def _add_mail_parser(sub: argparse._SubParsersAction) -> None:
    """Register the ``mail`` subcommand and its options on *sub*."""
    parser = sub.add_parser("mail", help="Send campaign emails to enriched leads.")
    add = parser.add_argument
    add("--input", default=None, help="Override input CSV path (default: data/_locked/final_for_sql.csv).")
    add("--tags-filter", default="nc")
    add("--dry-run", action="store_true", help="Preview without sending.")
    # Lower and upper delay bounds forwarded to the mailer.
    add("--delay-min", type=float, default=25.0)
    add("--delay-max", type=float, default=45.0)
    parser.set_defaults(func=cmd_mail)


# ---------------------------------------------------------------------------
# consolidate  (unified: standardize + website fill + upsert into master)
# ---------------------------------------------------------------------------

def cmd_consolidate(args: argparse.Namespace) -> None:
    """Standardize an enriched CSV and append its new rows to the master CSV.

    Steps:
      1. Read ``--input`` (default ``data/{region}_enriched.csv``) and drop
         the ``is_target`` column if present.
      2. Run ``LeadStandardizer.standardize`` and ``fill_websites``.
      3. If the master file exists, append only rows whose ``unique_key`` is
         not already in it; existing rows are kept as they are.  Otherwise
         the standardized frame becomes the master.

    This single step replaces the old ``standardize`` + ``consolidate-data``
    sequence.  Exits with a hint when the input file is missing.
    """
    import pandas as pd
    from scripts.Standardizer import LeadStandardizer
    from src.consolidator import fill_websites

    region = args.region
    input_path = _enriched_path(region) if not args.input else Path(args.input)
    master_path = Path(args.master)

    if not input_path.exists():
        message = (
            f"Input not found: {input_path}\n"
            f"  Run `python cli.py enrich --region {region}` first."
        )
        sys.exit(message)

    LOGGER.info("[consolidate] %s → %s", input_path.name, master_path.name)

    # The internal routing label may have leaked into the enriched file.
    leads = pd.read_csv(input_path).drop(columns=["is_target"], errors="ignore")

    # Normalise the schema (this also produces the unique keys used below).
    standardized = LeadStandardizer(leads).standardize(source=args.source, statut=args.statut)
    LOGGER.info("[consolidate] standardized: %d rows", len(standardized))

    # Repair missing website values from the source-dictionary CSVs.
    standardized = fill_websites(standardized, _DEFAULT_DICTS)

    master_path.parent.mkdir(parents=True, exist_ok=True)
    if not master_path.exists():
        merged = standardized
        LOGGER.info("[consolidate] creating master: %d rows", len(merged))
    else:
        existing = pd.read_csv(master_path)
        known_keys = set(existing["unique_key"].dropna().astype(str))
        already_known = standardized["unique_key"].astype(str).isin(known_keys)
        new_rows = standardized[~already_known]
        merged = pd.concat([existing, new_rows], ignore_index=True)
        LOGGER.info(
            "[consolidate] upsert: +%d new rows → master total=%d",
            len(new_rows), len(merged),
        )

    merged.to_csv(master_path, index=False)
    LOGGER.info("[consolidate] done → %s", master_path)


def _add_consolidate_command_parser(sub: argparse._SubParsersAction) -> None:
    """Register the unified ``consolidate`` subcommand and its options on *sub*."""
    parser = sub.add_parser(
        "consolidate",
        help="Standardize + website fill + upsert new leads into final_for_sql.csv.",
    )
    add = parser.add_argument
    add("--region", default="bretagne", help="Used to infer default input path.")
    add("--input", default=None, help="Override input CSV (default: {region}_enriched.csv).")
    add("--master", default=str(_MASTER_PATH), help="Path to master output CSV.")
    add("--source", default="google_places")
    add("--statut", default="new")
    parser.set_defaults(func=cmd_consolidate)


# ---------------------------------------------------------------------------
# consolidate-data  (legacy: website-repair only)
# ---------------------------------------------------------------------------

def cmd_consolidate_data(args: argparse.Namespace) -> None:
    """(Legacy) Repair missing website values in the master CSV.

    Splits ``--dicts`` on commas (blank entries are ignored) and passes the
    resulting paths, the master path and the output path to
    ``src.consolidator.consolidate``.  When ``--output`` is not given the
    master path is used as the output.

    Prefer ``consolidate`` for new workflows.
    """
    from src.consolidator import consolidate

    master = Path(args.master)
    stripped_entries = (entry.strip() for entry in args.dicts.split(","))
    dicts = [Path(entry) for entry in stripped_entries if entry]
    output = master if not args.output else Path(args.output)

    dict_names = [d.name for d in dicts]
    LOGGER.info("[consolidate-data] master=%s  dicts=%s", master.name, dict_names)
    consolidate(dicts, master, output, public_output_path=None)
    LOGGER.info("[consolidate-data] done → %s", output)


def _add_consolidate_data_parser(sub: argparse._SubParsersAction) -> None:
    """Register the legacy ``consolidate-data`` subcommand and its options on *sub*."""
    parser = sub.add_parser(
        "consolidate-data",
        help="(Legacy) Left-join website values from source dictionaries into the master CSV.",
    )
    # The default dictionaries are passed as one comma-separated string.
    dicts_default = ",".join(map(str, _DEFAULT_DICTS))
    add = parser.add_argument
    add("--master", default=str(_MASTER_PATH), help="Path to the master leads CSV.")
    add("--dicts", default=dicts_default, help="Comma-separated paths to source-dictionary CSVs.")
    add("--output", default=None, help="Output path (defaults to overwrite master).")
    parser.set_defaults(func=cmd_consolidate_data)


# ---------------------------------------------------------------------------
# snapshot
# ---------------------------------------------------------------------------

def cmd_snapshot(args: argparse.Namespace) -> None:
    """Anonymize the internal master CSV and write a public-safe snapshot.

    The ``email``, ``phone`` and ``phone_number`` columns (when present) are
    replaced by ``"FOUND"`` / ``"NOT FOUND"`` presence flags, and the
    internal-only columns ``unique_key``, ``scraped_text``, ``source``,
    ``statut`` and ``is_target`` are dropped.

    Exits when the input file is missing, or when ``--output`` resolves to
    the same file as ``--input``: writing the snapshot there would replace
    the real contact data in the master with presence flags.
    """
    import pandas as pd

    input_path = Path(args.input)
    output_path = Path(args.output)

    if not input_path.exists():
        sys.exit(f"Master dataset not found: {input_path}")
    # Same safeguard as the ``standardize`` command: never clobber the input.
    if output_path.resolve() == input_path.resolve():
        sys.exit("Refusing to overwrite input. Choose a different --output.")

    LOGGER.info("[snapshot] %s → %s", input_path.name, output_path.name)
    df = pd.read_csv(input_path)

    # Anonymize contact columns: presence flag only, no actual values.
    for col in ("email", "phone", "phone_number"):
        if col not in df.columns:
            continue
        as_text = df[col].astype(str)
        is_present = (
            df[col].notna()
            & (as_text.str.strip() != "")
            & (as_text.str.lower() != "nan")
        )
        df[col] = is_present.map({True: "FOUND", False: "NOT FOUND"})
        LOGGER.info("[snapshot] %s: %d FOUND, %d NOT FOUND", col, is_present.sum(), (~is_present).sum())

    # Drop internal-only columns that must not appear on GitHub.
    internal_cols = ["unique_key", "scraped_text", "source", "statut", "is_target"]
    df = df.drop(columns=internal_cols, errors="ignore")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    LOGGER.info("[snapshot] written → %s  (%d rows, %d columns)", output_path, len(df), len(df.columns))


def _add_snapshot_parser(sub: argparse._SubParsersAction) -> None:
    """Register the ``snapshot`` subcommand and its options on *sub*."""
    parser = sub.add_parser(
        "snapshot",
        help="Anonymize master CSV (email/phone → FOUND/NOT FOUND) for public GitHub push.",
    )
    master_default = str(_MASTER_PATH)
    public_default = str(_PUBLIC_PATH)
    parser.add_argument("--input", default=master_default, help="Path to master leads CSV.")
    parser.add_argument("--output", default=public_default, help="Destination for the public snapshot.")
    parser.set_defaults(func=cmd_snapshot)


# ---------------------------------------------------------------------------
# status
# ---------------------------------------------------------------------------

def cmd_status(args: argparse.Namespace) -> None:
    """Print a data-quality dashboard for the CSV at ``args.input``.

    Reports how many rows have a non-blank website, email, phone
    (``phone`` column, falling back to ``phone_number``) and lat/lng pair,
    how many rows have at least one ``is_*`` column equal to 1, and a
    per-category bar chart of the ``is_*`` column sums.  A one-line summary
    is also written to the log.

    A CSV with zero rows is handled: every percentage is reported as 0.0
    instead of dividing by zero.  Exits when the file does not exist.
    """
    import pandas as pd

    path = Path(args.input)
    if not path.exists():
        sys.exit(f"Dataset not found: {path}")

    LOGGER.info("[status] reading %s", path)
    df = pd.read_csv(path)
    total = len(df)

    def share(mask: "pd.Series") -> float:  # type: ignore[type-arg]
        """Return the percentage of rows selected by *mask* (0.0 when there are no rows)."""
        return 100 * int(mask.sum()) / total if total else 0.0

    def pct(mask: "pd.Series") -> str:  # type: ignore[type-arg]
        """Format *mask* as ``"<hits> / <total>  (<percent>%)"``."""
        return f"{int(mask.sum()):>5} / {total}  ({share(mask):.1f}%)"

    def filled_col(col: str) -> "pd.Series":  # type: ignore[type-arg]
        """Return a boolean mask of rows whose *col* holds a real value.

        Missing columns yield an all-False mask; blank strings and the
        literal texts "nan" / "none" (any case) count as empty.
        """
        if col not in df.columns:
            return pd.Series(False, index=df.index)
        text = df[col].fillna("").astype(str).str.strip().str.lower()
        return (text != "") & (text != "nan") & (text != "none")

    # Category flags: every column named ``is_*`` is treated as a 0/1 tag.
    is_cols = [c for c in df.columns if c.startswith("is_")]
    flags = df[is_cols].fillna(0).astype(float) if is_cols else None
    if flags is not None:
        tagged_mask = (flags == 1).any(axis=1)
    else:
        tagged_mask = pd.Series(False, index=df.index)

    phone_col = "phone" if "phone" in df.columns else "phone_number"

    sep = "-" * 52
    print()
    print("  NAUTIC-OPTIMIZER  —  Data Quality Dashboard")
    print(f"  Source : {path}")
    print(sep)
    print(f"  Total leads            : {total:>5}")
    print(sep)
    print(f"  Website filled         : {pct(filled_col('website'))}")
    print(f"  Email filled           : {pct(filled_col('email'))}")
    print(f"  Phone filled           : {pct(filled_col(phone_col))}")
    print(f"  Geo-coords available   : {pct(filled_col('lat') & filled_col('lng'))}")
    print(sep)
    print(f"  Tagged (≥1 category)   : {pct(tagged_mask)}")
    if flags is not None:
        print()
        print("  Category breakdown:")
        counts = flags.sum().sort_values(ascending=False)
        peak = counts.max()
        for col, cnt in counts.items():
            # Bars are scaled so the largest category spans 20 characters.
            bar = "█" * int(cnt / peak * 20) if cnt else ""
            print(f"    {col:<20} {int(cnt):>4}  {bar}")
    print(sep)
    print()
    LOGGER.info(
        "[status] total=%d  website=%.1f%%  email=%.1f%%  tagged=%.1f%%",
        total,
        share(filled_col("website")),
        share(filled_col("email")),
        share(tagged_mask),
    )


def _add_status_parser(sub: argparse._SubParsersAction) -> None:
    """Register the ``status`` subcommand and its ``--input`` option on *sub*."""
    parser = sub.add_parser("status", help="Print data-quality dashboard for the master dataset.")
    master_csv = ROOT.joinpath("data", "_locked", "final_for_sql.csv")
    parser.add_argument("--input", default=str(master_csv), help="Path to leads CSV to inspect.")
    parser.set_defaults(func=cmd_status)


# ---------------------------------------------------------------------------
# sync-db
# ---------------------------------------------------------------------------

def cmd_sync_db(args: argparse.Namespace) -> None:
    """Sync the master CSV into SQLite via ``sync_to_sqlite`` and report row counts.

    The mapping returned by ``sync_to_sqlite`` is printed as a small table
    (one line per entry); its ``leads``, ``leads_content`` and
    ``master_view`` entries are also written to the log.
    """
    from src.db_sync import sync_to_sqlite

    csv_path, db_path = Path(args.input), Path(args.db)

    LOGGER.info("[sync-db] %s  →  %s", csv_path.name, db_path.name)
    row_counts = sync_to_sqlite(csv_path, db_path)

    name_rule = "-" * 16
    rows_rule = "-" * 6
    print(f"\n[sync-db] Sync complete → {db_path}")
    print(f"  {'Table':<16}  {'Rows':>6}")
    print(f"  {name_rule}  {rows_rule}")
    for table, count in row_counts.items():
        print(f"  {table:<16}  {count:>6}")
    print()

    leads = row_counts["leads"]
    leads_content = row_counts["leads_content"]
    master_view = row_counts["master_view"]
    LOGGER.info(
        "[sync-db] done — leads=%d  leads_content=%d  master_view=%d",
        leads,
        leads_content,
        master_view,
    )


def _add_sync_db_parser(sub: argparse._SubParsersAction) -> None:
    """Register the ``sync-db`` subcommand and its options on *sub*."""
    csv_default = str(_MASTER_PATH)
    db_default = str(ROOT / "nautical_leads.db")
    parser = sub.add_parser(
        "sync-db",
        help="Push final_for_sql.csv into SQLite (leads + leads_content + master_view).",
    )
    # Each help text spells out the resolved default path.
    csv_help = f"Master CSV to read (default: {csv_default})."
    db_help = f"SQLite database path (default: {db_default})."
    parser.add_argument("--input", default=csv_default, help=csv_help)
    parser.add_argument("--db", default=db_default, help=db_help)
    parser.set_defaults(func=cmd_sync_db)


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

def build_parser() -> argparse.ArgumentParser:
    """Build the top-level parser with every subcommand registered.

    A subcommand is required.  The module docstring is shown verbatim as the
    help epilog.
    """
    parser = argparse.ArgumentParser(
        prog="cli",
        description="Nautic-Optimizer pipeline CLI",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    sub = parser.add_subparsers(dest="command", metavar="COMMAND")
    sub.required = True

    # Registration order is the order the commands appear in ``--help``.
    registrars = (
        _add_collect_parser,
        _add_predict_parser,
        _add_enrich_parser,
        _add_consolidate_command_parser,
        _add_snapshot_parser,
        _add_sync_db_parser,
        _add_train_parser,
        _add_mail_parser,
        _add_status_parser,
        # Legacy helpers kept for backward compatibility.
        _add_standardize_parser,
        _add_consolidate_data_parser,
    )
    for register in registrars:
        register(sub)

    return parser


def main() -> None:
    """Parse the command line, log the chosen command and run its handler."""
    args = build_parser().parse_args()
    LOGGER.info("=== nautic-optimizer cli | command: %s ===", args.command)
    # Each subparser stored its handler under ``func`` via ``set_defaults``.
    handler = args.func
    handler(args)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()