xposed/xposedRepo.py at main · sumanrox/xposed · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
#!/usr/bin/env python3
"""
repoXpose.py — parallel .git exposure scanner (camelCase + state resume + colored output)

Features:
 - camelCase naming
 - thread-safe .state file with timestamp for resuming progress (format: STATUS,STATUS_CODE_OR_MSG,URL)
 - prints only VULNERABLE / SUSPICIOUS lines (colorized)
 - simple remaining counter with print(..., end='\r', flush=True)
 - graceful handling of KeyboardInterrupt and thread exceptions
 - optional CSV report named: DD-Mmm-YYYY-RepoXpose.csv (uses 'Sept' for September)
"""

from __future__ import annotations
import argparse
import concurrent.futures
import gc
import re
import threading
import time
import sys
import os
import socket
import urllib.parse
from datetime import datetime
from typing import List, Optional, Tuple, Set, Deque
import collections
import itertools
import sqlite3
import requests # type: ignore
from requests.adapters import HTTPAdapter, Retry # type: ignore
try:
    import modules.dumper
except ImportError:
    # If running directly not as module, or path issues
    pass

# -------------------------
# Config / constants
# -------------------------
# Result labels. They are written as the first field of every state-file
# record and are also what the user sees for findings.
VULN = "VULNERABLE"
SUSPICIOUS = "SUSPICIOUS"
OK = "OK"
ERROR = "ERROR"

# Response-body signatures (all case-insensitive), used by checkGitExposure.
# INDEX_PAT: auto-index page title for the .git directory. The "." is an
# unescaped regex wildcard, so it matches any single character in that position.
INDEX_PAT = re.compile(r'Index of /.git', re.IGNORECASE)
# GIT_HEAD_PAT: "ref:" followed by whitespace and "refs/".
GIT_HEAD_PAT = re.compile(r'ref:\s+refs/', re.IGNORECASE)
# GIT_CONFIG_PAT: the literal "[core]" section header.
GIT_CONFIG_PAT = re.compile(r'\[core\]', re.IGNORECASE)
# SHA_PAT: any run of 4 to 40 hex digits; it is unanchored, so it can match
# inside longer text.
SHA_PAT = re.compile(r'[a-f0-9]{4,40}', re.IGNORECASE)

# ANSI escape sequences for the plain-print (non-rich) output path.
ANSI_RED = "\033[91m"  # Bright red
ANSI_GREEN = "\033[92m"
ANSI_RESET = "\033[0m"

# Defaults for the --threads and --timeout command-line options.
DEFAULT_THREADS = 50
DEFAULT_TIMEOUT = 5.0
DEFAULT_STATE_FILE = None  # Will be generated with timestamp

# month mapping with 'Sept' for September to match your example
# (used by writeFinalCsv to build the report file name)
MONTH_MAP = {
    1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr", 5: "May", 6: "Jun",
    7: "Jul", 8: "Aug", 9: "Sept", 10: "Oct", 11: "Nov", 12: "Dec"
}

# -------------------------
# Global runtime state
# -------------------------
# `remaining` and `lastChecked` are updated by worker() while holding
# remainingLock. NOTE(review): `totalTargets` is declared global in main();
# where it is assigned is outside the visible part of the file.
totalTargets = 0
remainingLock = threading.Lock()
remaining = 0
lastChecked = ""
stateLock = threading.Lock()  # serializes appends to the state file (see appendState)
progressLock = threading.Lock()  # rich.Progress is NOT thread-safe


# -------------------------
# Disk-backed dedup + rate limit (dnsx-inspired)
# -------------------------
class TokenBucket:
    """Thread-safe token bucket for rate limiting.

    The bucket holds at most ``rate`` tokens and refills continuously at
    ``rate`` tokens every ``per`` seconds. Each call to :meth:`take`
    consumes one token, sleeping first when none is available.

    Args:
        rate: Tokens granted per ``per`` seconds. Values below 1 are
            clamped to 1.
        per: Length of the refill window in seconds.
    """
    def __init__(self, rate: int, per: float = 1.0):
        self.rate = max(rate, 1)
        self.per = per
        # Start with a full bucket. Use the clamped rate so a zero or
        # negative argument cannot leave the bucket empty or in debt.
        self.tokens = float(self.rate)
        self.last = time.monotonic()
        self.lock = threading.Lock()

    def take(self):
        """Consume one token, blocking until one is available.

        The wait happens while the lock is held, so concurrent callers
        queue behind the sleeping thread instead of all sleeping at once.
        """
        with self.lock:
            now = time.monotonic()
            elapsed = now - self.last
            self.last = now
            # Refill for the time that passed, capped at the bucket capacity.
            self.tokens = min(self.tokens + elapsed * (self.rate / self.per), self.rate)
            if self.tokens < 1:
                # Sleep exactly long enough to earn the missing fraction.
                time.sleep((1 - self.tokens) * (self.per / self.rate))
                self.tokens = 0
                # The token earned during the sleep has just been spent.
                # Move the refill clock forward so the same interval is not
                # credited a second time on the next call.
                self.last = time.monotonic()
            else:
                self.tokens -= 1


class DedupStore:
    """Set of seen URLs persisted in SQLite, with writes buffered in memory.

    New URLs are collected in a pending list and written to the ``seen``
    table in one batch once the list reaches ``batch_size`` entries or more
    than ``flush_interval`` seconds have passed since the previous flush.
    The connection is opened with ``check_same_thread=False`` and the
    pending list is guarded by a lock.
    """
    def __init__(self, db_path: str, flush_interval: float = 2.0, batch_size: int = 500):
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path, check_same_thread=False)
        # WAL journal + NORMAL sync, then make sure the table exists.
        for statement in (
            "PRAGMA journal_mode=WAL",
            "PRAGMA synchronous=NORMAL",
            "CREATE TABLE IF NOT EXISTS seen(url TEXT PRIMARY KEY)",
        ):
            self.conn.execute(statement)
        self.conn.commit()
        self._pending: List[str] = []
        self._lock = threading.Lock()
        self._last_flush = time.monotonic()
        self._flush_interval = flush_interval
        self._batch_size = batch_size

    def _flush_is_due(self) -> bool:
        """Tell whether the buffer is full or stale. Caller must hold the lock."""
        if len(self._pending) >= self._batch_size:
            return True
        return (time.monotonic() - self._last_flush) > self._flush_interval

    def has(self, url: str) -> bool:
        """Return True if *url* is buffered or already stored in the database."""
        with self._lock:
            buffered = url in self._pending
        if buffered:
            return True
        self._maybe_flush()
        row = self.conn.execute("SELECT 1 FROM seen WHERE url=?", (url,)).fetchone()
        return row is not None

    def add(self, url: str):
        """Buffer *url*; write the buffer out when it is full or stale."""
        with self._lock:
            self._pending.append(url)
            due = self._flush_is_due()
        if due:
            self._flush()

    def add_many(self, urls: List[str]):
        """Buffer every URL in *urls* and flush immediately. Empty input is a no-op."""
        if urls:
            with self._lock:
                self._pending.extend(urls)
            self._flush()

    def _maybe_flush(self):
        """Flush only when the size or age threshold has been reached."""
        with self._lock:
            due = self._flush_is_due()
        if due:
            self._flush()

    def _flush(self):
        """Write the buffered URLs in one batch and reset the buffer.

        Database errors are ignored; the buffer is cleared and the flush
        timestamp refreshed whether or not the write succeeded.
        """
        with self._lock:
            if not self._pending:
                return
            try:
                rows = [(pendingUrl,) for pendingUrl in self._pending]
                self.conn.executemany("INSERT OR IGNORE INTO seen(url) VALUES (?)", rows)
                self.conn.commit()
            except Exception:
                pass
            self._pending.clear()
            self._last_flush = time.monotonic()

    def close(self):
        """Flush what is buffered, then close the connection (errors ignored)."""
        self._flush()
        try:
            self.conn.close()
        except Exception:
            pass


# -------------------------
# Networking helpers
# -------------------------
def makeSession(timeout: int = 5, maxRetries: int = 1, poolConnections: int = 100, poolMaxSize: int = 100) -> requests.Session:
    """
    Build a requests.Session with a retrying, pooled adapter on both schemes.

    maxRetries is the total retry budget; retries apply to GET/HEAD/OPTIONS
    on status 429/500/502/503/504 with a 0.25 backoff factor.
    poolConnections / poolMaxSize size the adapter's connection pools.
    The `timeout` parameter is accepted but not used in this function;
    callers pass a timeout on each request instead.
    """
    retryPolicy = Retry(
        total=maxRetries,
        backoff_factor=0.25,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(['GET', 'HEAD', 'OPTIONS']),
    )
    sharedAdapter = HTTPAdapter(
        pool_connections=poolConnections,
        pool_maxsize=poolMaxSize,
        max_retries=retryPolicy,
    )

    session = requests.Session()
    # One adapter instance serves both plain and TLS traffic.
    for scheme in ("http://", "https://"):
        session.mount(scheme, sharedAdapter)
    session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; repoXpose/1.0)"})
    return session


def normalizeUrl(u: str) -> Optional[str]:
    """
    Turn one line of target input into a base URL, or None if unusable.

    Surrounding whitespace is stripped, "https://" is prepended when the
    input has no http/https scheme, and trailing slashes are removed.
    Returns None for non-string input, blank input, and wildcard DNS
    patterns such as "*.example.com".
    """
    if not isinstance(u, str):
        return None
    u = u.strip()
    if not u:
        return None
    # reject wildcard DNS patterns (e.g., *.example.com)
    if u.startswith("*.") or "/*." in u:
        return None
    # URL schemes are case-insensitive: "HTTP://host" already has a scheme
    # and must not be turned into "https://HTTP://host".
    if not re.match(r'https?://', u, re.IGNORECASE):
        u = "https://" + u
    # remove trailing slash for consistency
    return u.rstrip('/')


# -------------------------
# State file handling
# -------------------------
def loadState(stateFile: str, dedup: DedupStore) -> None:
    """
    Seed the dedup store with every URL recorded in an existing state file.

    Each record is expected as STATUS,STATUS-CODE-OR-MSG,URL and is split on
    the first two commas; everything after the second comma is taken as the
    URL. Lines with fewer than three fields or an empty URL are skipped.
    A missing file is a no-op; a read failure prints a warning to stderr and
    leaves the store untouched.
    """
    if not os.path.exists(stateFile):
        return

    recordedUrls: List[str] = []
    try:
        with open(stateFile, "r", encoding="utf-8", errors="ignore") as fh:
            for rawLine in fh:
                # A blank line splits into a single field and is dropped here too.
                fields = rawLine.strip().split(",", 2)
                if len(fields) == 3:
                    recordedUrl = fields[2].strip()
                    if recordedUrl:
                        recordedUrls.append(recordedUrl)
    except Exception as e:
        print(f"[WARN] Failed to read state file {stateFile}: {e}", file=sys.stderr)
        return

    if recordedUrls:
        dedup.add_many(recordedUrls)


def appendState(stateFile: str, dedup: DedupStore, status: str, codeOrMsg: str, url: str) -> None:
    """
    Thread-safe append of one record to the state file, then to the dedup store.

    The record is written as a single line: STATUS,STATUS-CODE-OR-MSG,URL.
    Readers of this file split each line on its first two commas, so the
    message field must not contain a comma or a line break. Commas in
    codeOrMsg are therefore written as ';' and CR/LF as spaces. Without this,
    an exception text such as "host='x', port=443" would shift the field
    boundaries and the URL could not be recovered on resume.

    Never raises: any failure is reported on stderr.
    """
    try:
        safeMsg = str(codeOrMsg).replace(",", ";").replace("\r", " ").replace("\n", " ")
        with stateLock:
            with open(stateFile, "a", encoding="utf-8") as fh:
                fh.write(f"{status},{safeMsg},{url}\n")
        dedup.add(url)
    except Exception as e:
        # we must not crash on state write failure
        print(f"[ERROR] Failed to write to state file {stateFile}: {e}", file=sys.stderr)


# -------------------------
# Scanner logic
# -------------------------
MAX_PROBE_BYTES = 16384  # Only read first 16KB of any response


def _safeText(response: requests.Response, maxBytes: int = MAX_PROBE_BYTES) -> str:
    """Stream at most *maxBytes* from a response and close it to release the connection.
    Falls back to response.text for compatibility with test mocks."""
    try:
        raw = getattr(response, "raw", None)
        if raw is not None and hasattr(raw, "read"):
            chunk = raw.read(maxBytes)
            response.close()
            return chunk.decode("utf-8", errors="ignore")
        # Fallback for non-streaming responses / mocks
        text = response.text or ""
        response.close()
        return text[:maxBytes]
    except Exception:
        return getattr(response, "text", "")[:maxBytes]


def _contentTypeIsText(response: requests.Response) -> bool:
    ct = response.headers.get("Content-Type", "")
    return ct.startswith("text")


def checkGitExposure(session: requests.Session, baseUrl: str, timeout: float) -> Optional[Tuple[str, str, str, str]]:
    """
    Probe a target's well-known .git paths and classify what comes back.

    Probes run in a fixed order: /.git/, /.git/HEAD, /.git/config, then
    /.git/objects/info/packs and /.git/objects/pack/. The function returns
    at the first conclusive one. Every request is streamed, follows
    redirects, and has its body capped at MAX_PROBE_BYTES.

    Returns a 4-tuple (label, status code or error text, baseUrl, Server header):
      - VULNERABLE: 200 on /.git/ matching INDEX_PAT, on /.git/HEAD matching
        GIT_HEAD_PAT, or on /.git/config matching GIT_CONFIG_PAT.
      - SUSPICIOUS: 200 on /.git/HEAD with a non-empty body matching SHA_PAT,
        or 200 on either objects path whose body contains "pack-" or whose
        Content-Type starts with "text".
      - ERROR: any exception; the second field is the exception text and the
        fourth is the literal "Error".
      - OK: nothing matched; the second field is the status code of the
        first (/.git/) probe.

    The Server header is taken from the first probed response that carries one.
    """
    def probe(path):
        # All probes share the same request options.
        return session.get(baseUrl + path, timeout=timeout, allow_redirects=True, stream=True)

    serverHeader = "N/A"
    try:
        # 1) directory listing of /.git/
        dirResp = probe("/.git/")
        serverHeader = dirResp.headers.get("Server", "N/A")
        if dirResp.status_code != 200:
            dirResp.close()
        elif INDEX_PAT.search(_safeText(dirResp, MAX_PROBE_BYTES)):
            return (VULN, str(dirResp.status_code), baseUrl, serverHeader)

        # 2) /.git/HEAD
        headResp = probe("/.git/HEAD")
        if serverHeader == "N/A":
            serverHeader = headResp.headers.get("Server", "N/A")
        if headResp.status_code != 200:
            headResp.close()
        else:
            headBody = _safeText(headResp, MAX_PROBE_BYTES).strip()
            if GIT_HEAD_PAT.search(headBody):
                return (VULN, str(headResp.status_code), baseUrl, serverHeader)
            # 200 with SHA-like content but no symbolic ref: unconfirmed
            if headBody and SHA_PAT.search(headBody):
                return (SUSPICIOUS, str(headResp.status_code), baseUrl, serverHeader)

        # 3) /.git/config
        configResp = probe("/.git/config")
        if serverHeader == "N/A":
            serverHeader = configResp.headers.get("Server", "N/A")
        if configResp.status_code == 200 and GIT_CONFIG_PAT.search(_safeText(configResp, MAX_PROBE_BYTES)):
            return (VULN, str(configResp.status_code), baseUrl, serverHeader)
        configResp.close()

        # 4) pack index file, then pack directory
        for packPath in ("/.git/objects/info/packs", "/.git/objects/pack/"):
            packResp = probe(packPath)
            if serverHeader == "N/A":
                serverHeader = packResp.headers.get("Server", "N/A")
            if packResp.status_code != 200:
                packResp.close()
                continue
            packBody = _safeText(packResp, MAX_PROBE_BYTES)
            if "pack-" in packBody or _contentTypeIsText(packResp):
                return (SUSPICIOUS, str(packResp.status_code), baseUrl, serverHeader)

    except Exception as e:
        # network errors and anything else are recorded the same way
        return (ERROR, str(e), baseUrl, "Error")

    # not vulnerable / nothing conclusive
    return (OK, str(getattr(dirResp, "status_code", "N/A")), baseUrl, serverHeader)


def worker(taskUrl: str, session: requests.Session, timeout: float, stateFile: str, dedup: DedupStore, limiter: Optional[TokenBucket] = None, dumpingExecutor: Optional[concurrent.futures.ThreadPoolExecutor] = None, outputDirArg: Optional[str] = None, progress: Optional[object] = None, scanTaskID: Optional[object] = None, displayQueue: Optional[Deque] = None, queueLock: Optional[object] = None, resolve: bool = False) -> None:
    """
    Scan one target, record the outcome, and report/dump findings.

    Phase 1 (scan): skip the target if the dedup store already has it,
    otherwise wait on the rate limiter (if any) and run checkGitExposure.
    Phase 2 (report): append the result to the state file, then
      - VULNERABLE: queue a display row (or print), and when dumpingExecutor
        is given, submit a background dump of the repository;
      - SUSPICIOUS: queue a display row (or print);
      - other statuses: recorded in the state file only.
    Both phases finish by decrementing the global `remaining` counter,
    updating `lastChecked`, and advancing the scan progress task.

    Args:
        taskUrl: Base URL of the target.
        session: HTTP session handed to checkGitExposure.
        timeout: Per-request timeout handed to checkGitExposure.
        stateFile: Path of the state file results are appended to.
        dedup: Store of already-processed URLs.
        limiter: Optional rate limiter; take() is called before scanning.
        dumpingExecutor: Optional executor that runs repository dumps.
        outputDirArg: Optional dump directory; when set it is used as-is for
            every vulnerable target, otherwise a per-target name is generated.
        progress: Optional progress object; every call on it is made while
            holding progressLock. # presumably a rich Progress — confirm with caller
        scanTaskID: Task id of the overall scan within `progress`.
        displayQueue: Optional queue of 6-tuples
            (status, url, ip, server, time, action) for the display.
        queueLock: Lock guarding displayQueue; rows are queued only when both
            are provided, otherwise findings are printed.
        resolve: When True, resolve the finding's host to an IP address.

    A KeyboardInterrupt inside the worker ends the whole process via
    os._exit(1). Other exceptions never propagate out of the two phases.
    """
    global remaining, lastChecked
    # Late import to ensure it's available or assume top-level import
    # NOTE(review): Panel is not referenced anywhere in this function, and this
    # import sits outside both try blocks, so a missing `rich` raises here.
    from rich.panel import Panel

    result = None
    try:
        # If already processed (from state resume), skip
        if dedup.has(taskUrl):
            with remainingLock:
                remaining -= 1
                lastChecked = taskUrl
            if progress and scanTaskID is not None:
                with progressLock:
                    progress.advance(scanTaskID, 1)
            return

        if limiter:
            limiter.take()

        result = checkGitExposure(session, taskUrl, timeout)
    except KeyboardInterrupt:
        import os
        os._exit(1)  # Worker exit
    except Exception:
        # result stays None: nothing is written to the state file for this
        # target, but the counters below are still updated.
        pass

    try:
        if result:
            status, codeOrMsg, url, serverHeader = result
            appendState(stateFile, dedup, status, codeOrMsg, url)
            if status == VULN:
                timeStr = datetime.now().strftime("%H:%M:%S")

                # Resolve IP (optional — costs CPU and blocks threads)
                ip_addr = "N/A"
                if resolve:
                    try:
                        parsed = urllib.parse.urlparse(url)
                        domain = parsed.netloc.split(':')[0]
                        ip_addr = socket.gethostbyname(domain)
                    except Exception:
                        # resolution is best-effort; keep "N/A"
                        pass

                if dumpingExecutor:
                    # Add row to queue instead of table
                    if displayQueue is not None and queueLock is not None:
                        with queueLock:
                            displayQueue.append((
                                f"[bold red]{status}[/bold red]",
                                url,
                                f"[blue]{ip_addr}[/blue]",
                                f"[magenta]{serverHeader}[/magenta]",
                                f"[dim white]{timeStr}[/dim white]",
                                "[cyan]Dumping...[/cyan]"
                            ))
                    else:
                        print(f"[VULNERABLE] {url} [Server: {serverHeader}] [DUMPING]")

                    try:
                        # Determine output directory
                        parsed = urllib.parse.urlparse(url)
                        # ':' (host:port) is replaced so the name is usable as a directory
                        domain = parsed.netloc.replace(':', '_')
                        if not domain:
                            domain = "unknown_target"

                        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                        defaultName = f"{domain}-xposed-{timestamp}"
                        finalOutputDir = outputDirArg if outputDirArg else defaultName

                        # Point the dumper at the .git directory. A URL that
                        # already ends in "/.git" (no trailing slash) is left
                        # in that form after the slash is stripped.
                        dumpUrl = url
                        if not dumpUrl.endswith("/.git/"):
                            dumpUrl = dumpUrl.rstrip("/")
                            if not dumpUrl.endswith("/.git"):
                                dumpUrl += "/.git/"

                        # Define a wrapper to handle the dump
                        # (runs later on dumpingExecutor; it reports through the
                        # progress task only and never raises to the executor)
                        def runDump():
                            taskID = None
                            try:
                                if progress:
                                    with progressLock:
                                        # total=None: the total is not known yet
                                        taskID = progress.add_task(f"[cyan]Dumping {domain}...", total=None)

                                def progressCallback(completed, total, currentFile):
                                    # currentFile is accepted but not displayed
                                    if progress and taskID is not None:
                                        with progressLock:
                                            progress.update(taskID, completed=completed, total=total if total > 0 else None)

                                # The return value is treated as a commit count
                                # when it is a positive int.
                                commitCount = modules.dumper.dumpAndExtract(dumpUrl, finalOutputDir, progressCallback=progressCallback)

                                if progress and taskID is not None:
                                    # Keep it green on success
                                    if isinstance(commitCount, int) and commitCount > 0:
                                        statsMsg = f"({commitCount} commits)"
                                    else:
                                        statsMsg = ""
                                    with progressLock:
                                        progress.update(taskID, completed=100, total=100, description=f"[green]Dumped {domain} ✓ {statsMsg}")

                            except KeyboardInterrupt:
                                if progress and taskID is not None:
                                    with progressLock:
                                        progress.update(taskID, description=f"[yellow]Stopped {domain}")
                            except Exception as e:
                                if progress and taskID is not None:
                                    with progressLock:
                                        progress.update(taskID, description=f"[red]Failed {domain}: {str(e)}")

                        dumpingExecutor.submit(runDump)

                    except Exception as dumpErr:
                        # Failure while preparing or submitting the dump, not
                        # while running it.
                        err_msg = f"[red][ERROR] Failed to queue dump for {url}: {dumpErr}[/red]"
                        if progress:
                            with progressLock:
                                progress.console.print(err_msg)
                        else:
                            print(err_msg, file=sys.stderr)
                else:
                    # No dumping, just alert
                    if displayQueue is not None and queueLock is not None:
                        with queueLock:
                            displayQueue.append((
                                f"[bold red]{status}[/bold red]",
                                url,
                                f"[blue]{ip_addr}[/blue]",
                                f"[magenta]{serverHeader}[/magenta]",
                                f"[dim white]{timeStr}[/dim white]",
                                "[yellow]Logged[/yellow]"
                            ))
                    else:
                        print(f"{ANSI_RED}[{status}]{ANSI_RESET} {url}")

            elif status == SUSPICIOUS:
                # Same optional, best-effort IP resolution as for VULNERABLE.
                ip_addr = "N/A"
                if resolve:
                    try:
                        parsed = urllib.parse.urlparse(url)
                        domain = parsed.netloc.split(':')[0]
                        ip_addr = socket.gethostbyname(domain)
                    except Exception:
                        pass

                timeStr = datetime.now().strftime("%H:%M:%S")

                if displayQueue is not None and queueLock is not None:
                    with queueLock:
                        # Mapping: STATUS, URL, IP, SERVER, TIME, ACTION
                        displayQueue.append((
                            f"[bold yellow]{status}[/bold yellow]",
                            url,
                            f"[blue]{ip_addr}[/blue]",
                            f"[magenta]{serverHeader}[/magenta]",
                            f"[dim white]{timeStr}[/dim white]",
                            f"[grey50]Status: {codeOrMsg}[/grey50]"
                        ))
                else:
                    print(f"{ANSI_GREEN}[{status}]{ANSI_RESET} {url}")

        with remainingLock:
            remaining -= 1
            lastChecked = taskUrl

        # Update main scan progress
        if progress and scanTaskID is not None:
            with progressLock:
                progress.advance(scanTaskID, 1)

    except KeyboardInterrupt:
        import os
        os._exit(1)
    except Exception as e:
        # Error handling
        # NOTE(review): if the exception was raised after the counter update
        # above (e.g. by progress.advance), `remaining` is decremented a second
        # time here and the progress task is advanced again.
        appendState(stateFile, dedup, ERROR, str(e), taskUrl)
        with remainingLock:
            remaining -= 1
            lastChecked = taskUrl
        # Update main scan progress even on error
        if progress and scanTaskID is not None:
            with progressLock:
                progress.advance(scanTaskID, 1)


# -------------------------
# Utilities
# -------------------------
def streamTargetsFromFile(path: str):
    """Lazily yield one normalized target per usable line of *path*.

    Lines that normalizeUrl rejects are skipped. If the file cannot be
    opened or read, an error is printed to stderr and the stream ends.
    """
    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as source:
            for target in map(normalizeUrl, source):
                if not target:
                    continue
                yield target
    except Exception as e:
        print(f"[ERROR] Could not load input file {path}: {e}", file=sys.stderr)


def loadTargetsFromFile(path: str) -> List[str]:
    """Read *path* and return its normalized targets as a list, in file order.

    Lines that normalizeUrl rejects are left out. If the file cannot be
    opened or read, an error is printed to stderr and whatever was collected
    so far (possibly nothing) is returned.
    """
    targets: List[str] = []
    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as source:
            targets.extend(filter(None, map(normalizeUrl, source)))
    except Exception as e:
        print(f"[ERROR] Could not load input file {path}: {e}", file=sys.stderr)
    return targets


def boundedMap(func, iterable, maxWorkers=50, maxInflight=None):
    """
    Run *func* over *iterable* on a thread pool and yield each result as
    its call finishes (completion order, not input order).

    Only a limited number of calls is submitted at a time, so a huge input
    never turns into a huge backlog of futures: *maxInflight* calls are
    submitted up front (default: twice *maxWorkers*), and one new call is
    submitted for each one that completes. An exception raised by *func*
    is re-raised when its result is yielded.
    """
    from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait

    inflightCap = maxWorkers * 2 if maxInflight is None else maxInflight
    source = iter(iterable)
    pending = set()

    with ThreadPoolExecutor(max_workers=maxWorkers) as pool:

        def submitNext():
            """Submit the next input item; return False once the input is exhausted."""
            try:
                item = next(source)
            except StopIteration:
                return False
            pending.add(pool.submit(func, item))
            return True

        # Initial batch: keep submitting until the cap is reached or input ends.
        while submitNext() and len(pending) < inflightCap:
            pass

        while pending:
            finished, pending = wait(pending, return_when=FIRST_COMPLETED)
            for completed in finished:
                yield completed.result()
            # One replacement per completed call keeps the backlog bounded.
            for _ in finished:
                if not submitNext():
                    break


def writeFinalCsv(stateFile: str, outPrefix: str = "RepoXpose") -> None:
    """
    Copy every record of the .state file into a CSV report in the current
    directory, named like: DD-Mmm-YYYY-RepoXpose.csv

    State lines are split on their first two commas into
    (status, code_or_message, url); lines with fewer than three fields are
    skipped and file order is preserved. Rows are written with the csv
    module, so a field containing a comma or a double quote is quoted
    instead of producing extra columns. A missing state file yields a
    header-only report.

    Never raises: any failure is reported on stderr.
    """
    # Function-scope import keeps this block self-contained.
    import csv

    try:
        today = datetime.now()
        month = MONTH_MAP.get(today.month, today.strftime("%b"))
        filename = f"{today.day:02d}-{month}-{today.year}-{outPrefix}.csv"

        # Read the .state file to include all entries in the same order
        rows = []
        if os.path.exists(stateFile):
            with open(stateFile, "r", encoding="utf-8", errors="ignore") as fh:
                for line in fh:
                    parts = line.strip().split(",", 2)
                    if len(parts) == 3:
                        rows.append(tuple(part.strip() for part in parts))

        # newline="" hands line-ending control to the csv writer; "\n" keeps
        # the one-record-per-line layout the report has always had.
        with open(filename, "w", encoding="utf-8", newline="") as csvf:
            writer = csv.writer(csvf, lineterminator="\n")
            writer.writerow(("status", "code_or_message", "url"))
            writer.writerows(rows)
        print(f"\nWrote final CSV report: {filename}")
    except Exception as e:
        print(f"\n[ERROR] Failed to write final CSV: {e}", file=sys.stderr)


def count_lines(path: str) -> int:
    """Return the number of lines in the file at *path* (for progress bar totals).

    The file is opened in binary mode and iterated line by line, so no text
    decoding takes place. A final line without a trailing newline counts.
    """
    with open(path, "rb") as handle:
        return sum(1 for _ in handle)


def stream_targets(path: str, dedup: DedupStore):
    """Lazily yield the normalized targets of *path* that *dedup* has not seen.

    Lines that normalizeUrl rejects are skipped. Errors opening or reading
    the file propagate to the caller.
    """
    with open(path, "r", encoding="utf-8", errors="ignore") as source:
        for candidate in map(normalizeUrl, source):
            if not candidate or dedup.has(candidate):
                continue
            yield candidate


# -------------------------
# Main orchestration
# -------------------------
def main(argv: Optional[List[str]] = None) -> int:
    """Parse command-line options and run the scan with a live terminal UI.

    Targets come either from ``--url`` (a single target) or from ``--input``
    (streamed from a file, skipping entries already present in the dedup
    store). At most ``threads * 2`` worker futures are kept in flight; each
    completed future is replaced by the next pending target.

    Args:
        argv: Argument list to parse. ``None`` lets argparse fall back to
            ``sys.argv``.

    Returns:
        0 when the scan finishes (or there is nothing to scan), 2 when the
        ``--url`` value cannot be normalized. A keyboard interrupt or an
        unexpected error inside the scan loop terminates the process through
        ``os._exit`` (codes 1 and 2 respectively) instead of returning.
    """
    global totalTargets, remaining
    parser = argparse.ArgumentParser(description="repoXpose - parallel .git exposure scanner (resumable .state)")
    parser.add_argument("-i", "--input", help="File of targets (one per line). URLs can omit scheme", type=str)
    parser.add_argument("-u", "--url", help="Single target URL", type=str)
    parser.add_argument("-t", "--threads", help=f"Number of parallel workers (default: {DEFAULT_THREADS})", type=int, default=DEFAULT_THREADS)
    parser.add_argument("-T", "--timeout", help=f"Request timeout seconds (default: {DEFAULT_TIMEOUT})", type=float, default=DEFAULT_TIMEOUT)
    parser.add_argument("--rate-limit", help="Max requests per second (0 = unlimited)", type=int, default=0)
    parser.add_argument("--state-file", help="State file to resume from (default: auto-generated with timestamp)", type=str, default=None)
    parser.add_argument("--max", help="Max targets to process from input file (for testing)", type=int, default=0)
    parser.add_argument("--csv", help="Generate CSV report after scan completes", action="store_true")
    parser.add_argument("--dump", help="Automatically dump and recover artifacts from vulnerable targets", action="store_true")
    parser.add_argument("--output-dir", help="Output directory for dump (default: domain-xposed-datetime)", type=str)
    parser.add_argument("--resolve", help="Resolve IP addresses for findings (slower)", action="store_true")
    args = parser.parse_args(argv)

    if not args.input and not args.url:
        parser.error("Specify --input FILE or --url URL")
    # ThreadPoolExecutor rejects max_workers < 1; fail early with a clear
    # usage error instead of a traceback after the live display has started.
    if args.threads < 1:
        parser.error("--threads must be at least 1")

    # Generate state file name with timestamp if not provided
    if args.state_file is None:
        args.state_file = f".state_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    # Disk-backed dedup store (sqlite) + optional rate limiter
    dedup = DedupStore(db_path=f"{args.state_file}.db")
    limiter: Optional[TokenBucket] = None
    if args.rate_limit > 0:
        limiter = TokenBucket(rate=args.rate_limit)

    # Load previous state into the dedup store; a missing or unreadable state
    # file only produces a warning so a fresh scan can still proceed.
    try:
        loadState(args.state_file, dedup)
    except Exception as e:
        print(f"[WARN] Could not load state file: {e}", file=sys.stderr)

    # Build streaming target iterator and count for progress bar
    if args.url:
        u = normalizeUrl(args.url)
        if not u:
            print("[ERROR] Invalid URL provided via --url", file=sys.stderr)
            dedup.close()
            return 2
        targetIter = iter([u])
        totalLines = 1
    else:
        # Raw line count: an upper bound on the work, since blank lines and
        # already-seen targets are filtered out later by stream_targets.
        totalLines = count_lines(args.input)
        targetIter = stream_targets(args.input, dedup)
        if args.max and args.max > 0:
            targetIter = itertools.islice(targetIter, args.max)
            # Keep the progress total in step with the truncated iterator.
            totalLines = min(totalLines, args.max)

    totalTargets = totalLines
    remaining = totalLines

    if totalTargets == 0:
        print("No remaining targets to process (state file indicates all done).")
        if args.csv:
            writeFinalCsv(args.state_file)
        dedup.close()
        return 0

    session = makeSession(timeout=int(args.timeout), maxRetries=1, poolConnections=args.threads+10, poolMaxSize=args.threads+10)

    # ensure state file exists (touch); failure here is non-fatal
    try:
        with stateLock:
            open(args.state_file, "a").close()
    except Exception:
        pass

    # Rich is imported lazily so it is only required when a scan actually runs.
    from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeRemainingColumn, MofNCompleteColumn
    from rich.panel import Panel
    from rich.table import Table
    from rich.live import Live
    from rich.console import Group
    from rich.box import HEAVY_EDGE

    # Fixed-size queue for sliding window effect: only the ten most recent
    # rows are displayed. Workers receive the queue together with its lock.
    displayQueue: Deque[Tuple] = collections.deque(maxlen=10)
    queueLock = threading.Lock()

    def rebuildTable(queue):
        """Return a fresh findings table populated with the rows in *queue*."""
        newTable = Table(
            box=HEAVY_EDGE,
            show_header=True,
            header_style="bold white on blue",
            title="[bold reverse cyan] TARGET EXPOSURE SYSTEMS [/bold reverse cyan]",
            expand=True,
            border_style="bright_blue",
            row_styles=["", "dim"]
        )
        newTable.add_column("STATUS", justify="center", width=12, style="bold", no_wrap=True)
        newTable.add_column("TARGET URL", style="cyan", ratio=1, overflow="fold")
        newTable.add_column("IP ADDRESS", style="blue", width=15, justify="center")
        newTable.add_column("SERVER", style="magenta", width=20, overflow="ellipsis", no_wrap=True)
        newTable.add_column("TIME", style="dim white", width=10, justify="center", no_wrap=True)
        newTable.add_column("ACTION", style="grey70", width=15, justify="center", no_wrap=True)

        # Hold the lock while iterating in case *queue* is the shared deque.
        with queueLock:
            for row in queue:
                newTable.add_row(*row)
        return newTable

    progress = Progress(
        SpinnerColumn(spinner_name="dots12", style="bold cyan"),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(bar_width=None, complete_style="cyan", finished_style="green"),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        MofNCompleteColumn(),
        TimeRemainingColumn(),
        refresh_per_second=4
    )

    def renderUi(rows):
        """Compose the findings panel (built from *rows*) above the progress bar."""
        return Group(
            Panel(rebuildTable(rows), border_style="cyan"),
            progress
        )

    # Wrap execution in Live context; the initial view is an empty table with
    # the same columns as every later rebuild.
    with Live(renderUi(()), refresh_per_second=4, screen=False) as live:

        # Create the Overall Scan Progress bar
        scanTaskID = progress.add_task("[bold white]Scanning Targets[/bold white]", total=totalTargets)

        executor = concurrent.futures.ThreadPoolExecutor(max_workers=args.threads)

        dumpingExecutor = None
        if args.dump:
            # Cap dump workers to prevent OOM when many vulnerable targets are found
            dumpWorkers = min(5, args.threads)
            dumpingExecutor = concurrent.futures.ThreadPoolExecutor(max_workers=dumpWorkers)

        futures = set()
        try:
            def submitNext() -> bool:
                """Submit the next pending target; return False once exhausted."""
                try:
                    url = next(targetIter)
                except StopIteration:
                    return False
                futures.add(executor.submit(
                    worker,
                    url,
                    session,
                    args.timeout,
                    args.state_file,
                    dedup,
                    limiter,
                    dumpingExecutor,
                    args.output_dir,
                    progress,
                    scanTaskID,
                    displayQueue,
                    queueLock,
                    args.resolve
                ))
                return True

            maxInflight = args.threads * 2

            # Seed initial batch (bounded to prevent unbounded memory growth)
            for _ in range(maxInflight):
                if not submitNext():
                    break

            # Snapshot of the rows shown at the last rebuild. Comparing the
            # rows themselves (not just the queue length) is required because
            # the deque is capped at ten entries: once full its length never
            # changes even though its contents keep rotating.
            lastRows = ()
            lastTableRebuild = 0.0
            TABLE_REBUILD_INTERVAL = 1.0  # seconds

            # Monitor completion and replenish
            while futures:
                done, futures = concurrent.futures.wait(futures, timeout=0.5, return_when=concurrent.futures.FIRST_COMPLETED)

                for future in done:
                    try:
                        future.result()
                    except Exception:
                        # Best-effort: one failing target must not abort the
                        # scan. NOTE(review): presumably worker records its
                        # own failures in the state file - confirm.
                        pass

                # One replacement per completed future keeps the in-flight
                # count bounded by maxInflight.
                for _ in done:
                    if not submitNext():
                        break

                # Rebuild table only when rows changed AND throttled interval elapsed
                with queueLock:
                    currentRows = tuple(displayQueue)
                now = time.time()
                if currentRows != lastRows and (now - lastTableRebuild) >= TABLE_REBUILD_INTERVAL:
                    lastRows = currentRows
                    lastTableRebuild = now
                    live.update(renderUi(currentRows))
                    # Force GC to reclaim old Rich Table/Panel objects and prevent memory creep
                    gc.collect()

            # Clean shutdown (non-interrupt case)
            if dumpingExecutor is not None:
                dumpingExecutor.shutdown(wait=True)
            executor.shutdown(wait=True)

            # Final refresh so rows that arrived inside the last throttle
            # window are not lost from the displayed table.
            with queueLock:
                finalRows = tuple(displayQueue)
            if finalRows != lastRows:
                live.update(renderUi(finalRows))

        except KeyboardInterrupt:
            # Stopping the display is best-effort; BaseException is caught so
            # that a second Ctrl-C cannot prevent the hard exit below.
            try:
                live.stop()
            except BaseException:
                pass
            import os
            from rich import print as rprint
            rprint("\n[bold yellow]Keyboard Interrupt! Exiting immediately...[/bold yellow]")
            # os._exit skips interpreter shutdown, so still-running pool
            # threads cannot delay termination.
            os._exit(1)

        except Exception as e:
            try:
                live.stop()
            except BaseException:
                pass
            import os
            from rich import print as rprint
            rprint(f"[bold red]Fatal Error: {e}[/bold red]")
            os._exit(2)

    print("\nScan complete.")
    if args.csv:
        writeFinalCsv(args.state_file)
    dedup.close()
    return 0


if __name__ == "__main__":
    # Script entry point: the return value of main() becomes the process exit
    # status. Ctrl-C is swallowed quietly, SystemExit passes through untouched,
    # and any other exception is reported on stderr before being re-raised.
    try:
        exitCode = main()
        raise SystemExit(exitCode)
    except KeyboardInterrupt:
        pass
    except SystemExit:
        # allow normal exits
        raise
    except Exception as err:
        print(f"[FATAL] Unhandled exception: {err}", file=sys.stderr)
        raise