Excel-Risks-Checker---Prototype/models.py at main · chadwickkcc/Excel-Risks-Checker---Prototype · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
"""
models.py — Core data structures for the Excel Workbook Risk Diagnostic Tool.

All data classes are plain data containers with no business logic beyond
score computation and RAG classification. Apart from ``settings`` (severity
weights and RAG thresholds), this file imports nothing from other project
modules, which keeps it near the root of the dependency graph.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Optional

import settings


@dataclass
class Finding:
    """
    One risk item raised by a checker against a workbook.

    Attributes:
        check_id:    Spec check number (1–24) that produced this finding.
        name:        Short label displayed in tables and badges.
        severity:    Severity level: "High", "Medium", or "Low".
        category:    Checker category: "Formula", "Error", "Link",
                     "Structure", "VBA", or "Actuarial".
        description: Factual statement of what was detected (cell
                     references, counts, etc.).
        sheet_name:  Worksheet containing the issue, or "Workbook" when the
                     finding applies to the file as a whole.
        cell_ref:    Cell address such as "B14"; empty string when the
                     finding is sheet-level.
        explanation: Plain-English rationale for why this is a risk, worded
                     for inclusion in the PDF report.
    """

    check_id: int
    name: str
    severity: str
    category: str
    description: str
    sheet_name: str
    cell_ref: str
    explanation: str

    def score_contribution(self) -> int:
        """Look up this finding's risk-score points; unknown severities score 0."""
        weights = settings.SEVERITY_WEIGHTS
        return weights.get(self.severity, 0)


@dataclass
class WorkbookAnalysisResult:
    """
    Aggregated result of analysing a single Excel workbook.

    Attributes:
        filename:                Original filename of the uploaded workbook.
        file_path:               Path to the temporary file used during analysis.
        file_size_mb:            File size in megabytes.
        analysis_timestamp:      UTC datetime when analysis began.
        findings:                All findings detected, across all checkers.
        high_sensitivity_sheets: Sheet names identified as actuarially sensitive.
        total_score:             Weighted sum of all finding severities. Only
                                 updated when compute_score() is called.
        rag_rating:              "Green", "Amber", or "Red". Only updated when
                                 compute_rag() is called.
        error_message:           Non-empty string if the file could not be
                                 analysed (e.g. password-protected). When set,
                                 findings will be empty and the PDF will note
                                 the file was unanalysable.
        ai_commentary:           Optional AI-generated commentary attached to
                                 this result; None by default.
    """

    filename: str
    file_path: Path
    file_size_mb: float
    analysis_timestamp: datetime
    findings: list[Finding] = field(default_factory=list)
    high_sensitivity_sheets: list[str] = field(default_factory=list)
    total_score: int = 0
    rag_rating: str = "Green"
    error_message: str = ""
    ai_commentary: Optional[AICommentary] = None

    # ------------------------------------------------------------------
    # Computed helpers
    # ------------------------------------------------------------------

    def compute_score(self) -> int:
        """
        Calculate and store the weighted risk score from all findings.

        Sums score_contribution() over every finding and writes the result
        to total_score.

        Returns:
            The total integer score.
        """
        self.total_score = sum(f.score_contribution() for f in self.findings)
        return self.total_score

    def compute_rag(self) -> str:
        """
        Map the current total_score to a RAG (Red/Amber/Green) rating.

        Uses thresholds from settings.RAG_GREEN_MAX and settings.RAG_AMBER_MAX
        (both inclusive upper bounds). This reads the stored total_score and
        does not recompute it, so call compute_score() first if the findings
        have changed. The result is also stored in rag_rating.

        Returns:
            One of "Green", "Amber", or "Red".
        """
        if self.total_score <= settings.RAG_GREEN_MAX:
            self.rag_rating = "Green"
        elif self.total_score <= settings.RAG_AMBER_MAX:
            self.rag_rating = "Amber"
        else:
            self.rag_rating = "Red"
        return self.rag_rating

    def findings_by_severity(self, severity: str) -> list[Finding]:
        """Return all findings whose severity equals the given level exactly."""
        return [f for f in self.findings if f.severity == severity]

    def findings_by_category(self, category: str) -> list[Finding]:
        """Return all findings whose category equals the given string exactly."""
        return [f for f in self.findings if f.category == category]

    def top_findings(self, n: int = 5) -> list[Finding]:
        """
        Return the top-N findings sorted by severity (High first),
        then by check_id for deterministic ordering.

        Findings with an unrecognised severity sort after "Low".

        Args:
            n: Maximum number of findings to return. Zero or a negative
               value yields an empty list.

        Returns:
            A new list of at most n findings; self.findings is not modified.
        """
        # A negative n would otherwise slice as "all but the last |n|",
        # which is never what a "top N" caller wants.
        if n <= 0:
            return []
        severity_order = {"High": 0, "Medium": 1, "Low": 2}
        sorted_findings = sorted(
            self.findings,
            key=lambda f: (severity_order.get(f.severity, 3), f.check_id),
        )
        return sorted_findings[:n]

    def count_by_severity(self) -> dict[str, int]:
        """
        Return a dict of {severity: count} for all findings.

        The keys "High", "Medium", and "Low" are always present; findings
        with any other severity value are not counted.
        """
        counts: dict[str, int] = {"High": 0, "Medium": 0, "Low": 0}
        for f in self.findings:
            if f.severity in counts:
                counts[f.severity] += 1
        return counts


# ---------------------------------------------------------------------------
# AI Extension dataclasses
# ---------------------------------------------------------------------------

@dataclass
class SheetSummary:
    """
    Per-sheet extract used to build the AI prompt digest.

    Attributes:
        name:              Worksheet name.
        row_count:         Number of rows recorded for the sheet.
        col_count:         Number of columns recorded for the sheet.
        is_sensitive:      True if the sheet name matches actuarial keywords.
        headers:           First-row and first-column string values,
                           deduplicated.
        top_formulas:      Up to AI_TOP_FORMULAS_PER_SHEET formulas, sorted by
                           complexity score descending.
        named_ranges:      Names of named ranges scoped to this sheet.
        references_sheets: Other sheet names that this sheet's formulas
                           reference.
    """

    name: str
    row_count: int
    col_count: int
    is_sensitive: bool
    headers: list[str]
    top_formulas: list[str]
    named_ranges: list[str]
    references_sheets: list[str]


@dataclass
class WorkbookDigest:
    """
    Token-efficient structured summary of a workbook for AI analysis.

    Attributes:
        file_name:             Name of the workbook file.
        sheet_summaries:       One SheetSummary per summarised worksheet.
        workbook_named_ranges: Mapping of name -> formula or scalar value,
                               held as a string.
        vba_present:           Whether VBA is present in the workbook.
        vba_module_names:      Names of the VBA modules.
    """

    file_name: str
    sheet_summaries: list[SheetSummary]
    workbook_named_ranges: dict[str, str]
    vba_present: bool
    vba_module_names: list[str]


@dataclass
class FormulaExplanation:
    """
    AI-generated plain-English explanation of a single formula.

    Attributes:
        sheet_name:   Worksheet the formula belongs to.
        cell_address: Address of the cell holding the formula.
        formula:      The formula text being explained.
        explanation:  The plain-English explanation text.
    """

    sheet_name: str
    cell_address: str
    formula: str
    explanation: str


@dataclass
class SheetNarrative:
    """
    AI-generated narrative describing a single worksheet.

    Attributes:
        sheet_name: Worksheet the narrative is about.
        narrative:  The narrative text.
    """

    sheet_name: str
    narrative: str


@dataclass
class AICommentary:
    """
    Container for all AI-generated commentary features.

    Every field is optional, so an instance can be created empty and filled
    in piecemeal.

    Attributes:
        findings_narrative:    Narrative text about the findings, or None.
        workbook_purpose:      Text describing the workbook's purpose, or None.
        sheet_narratives:      Per-sheet narratives; a fresh empty list by
                               default.
        formula_explanations:  Per-formula explanations; a fresh empty list by
                               default.
        assumption_commentary: Commentary text on assumptions, or None.
        api_error:             Populated if any API call fails; partial
                               results are still used.
    """

    findings_narrative: Optional[str] = None
    workbook_purpose: Optional[str] = None
    sheet_narratives: list[SheetNarrative] = field(default_factory=list)
    formula_explanations: list[FormulaExplanation] = field(default_factory=list)
    assumption_commentary: Optional[str] = None
    api_error: Optional[str] = None