OWASP
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎pytest.ini‎
Lines changed: 10 additions & 0 deletions b/‎pytest.ini‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎requirements-dev.txt‎
Lines changed: 6 additions & 0 deletions b/‎requirements-dev.txt‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎scripts/Generate_CheatSheets_TOC.py‎
Lines changed: 80 additions & 21 deletions b/‎scripts/Generate_CheatSheets_TOC.py‎
Lines changed: 80 additions & 21 deletions
diff --git a/‎scripts/Generate_Technologies_JSON.py‎
Lines changed: 85 additions & 26 deletions b/‎scripts/Generate_Technologies_JSON.py‎
Lines changed: 85 additions & 26 deletions
@@ -18,3 +18,6 @@ venv
 .claude/settings.local.json
 .claude/settings.json
 .claude/worktrees/
+# Python bytecode
+__pycache__/
+*.pyc
@@ -0,0 +1,10 @@
+[pytest]
+testpaths = tests
+python_files = test_*.py
+python_classes = Test*
+python_functions = test_*
+addopts = -v --tb=short --strict-markers
+filterwarnings =
+    error
+    # The repo's scripts predate py3.10; do not fail the run on DeprecationWarning.
+    ignore::DeprecationWarning
@@ -0,0 +1,6 @@
+# Test-only dependencies. Install with:
+#   pip install -r requirements-dev.txt
+#
+# Kept separate from requirements.txt so the runtime image for mkdocs/feedgen
+# does not pull in pytest.
+pytest>=7.0
@@ -8,29 +8,88 @@
 same location that the script in order to be moved later by the caller script.
 """
 import os
+import sys
+from typing import Iterable, List
 
 # Define templates
 cs_md_link_template = "* [%s](cheatsheets/%s)"
 
-# Scan all CS files
-cheatsheets = [f.name for f in os.scandir("../cheatsheets") if f.is_file()]
-cheatsheets.sort()
-
-# Generate the summary file
-with open("TOC.md", "w") as index_file:
-    index_file.write("# Summary\n\n")
-    index_file.write("### Cheatsheets\n\n")
-    index_file.write(cs_md_link_template % ("Index Alphabetical", "Index.md"))
-    index_file.write("\n")
-    index_file.write(cs_md_link_template % ("Index ASVS", "IndexASVS.md"))
-    index_file.write("\n")
-    index_file.write(cs_md_link_template % ("Index ASVS", "IndexMASVS.md"))
-    index_file.write("\n")
-    index_file.write(cs_md_link_template % ("Index Proactive Controls", "IndexProactiveControls.md"))
-    index_file.write("\n")
-    for cheatsheet in cheatsheets:
-        if cheatsheet != "Index.md" and cheatsheet != "IndexASVS.md" and cheatsheet != "IndexMASVS.md" and cheatsheet != "IndexProactiveControls.md" and cheatsheet != "TOC.md":
-            cs_name = cheatsheet.replace("_"," ").replace(".md", "").replace("Cheat Sheet", "")
-            index_file.write(cs_md_link_template % (cs_name, cheatsheet))
+# Files that are not actual cheat sheets and must be excluded from the TOC
+# even if they happen to live in the cheatsheets/ directory.
+_EXCLUDED_FROM_TOC = frozenset({
+    "Index.md",
+    "IndexASVS.md",
+    "IndexMASVS.md",
+    "IndexProactiveControls.md",
+    "TOC.md",
+})
+
+
+def to_display_name(filename: str) -> str:
+    """Convert a cheatsheet filename to its human-readable display name.
+
+    Underscores become spaces, then every ".md" and every "Cheat Sheet"
+    occurrence (not only a trailing one) is removed. The result is
+    whitespace-stripped so trailing/leading spaces do not leak into
+    the rendered link text.
+
+    Examples:
+        >>> to_display_name("Authentication_Cheat_Sheet.md")
+        'Authentication'
+        >>> to_display_name("XSS_Prevention_Cheat_Sheet.md")
+        'XSS Prevention'
+    """
+    return (filename
+            .replace("_", " ")
+            .replace(".md", "")
+            .replace("Cheat Sheet", "")
+            .strip())
+
+
+def should_skip(filename: str) -> bool:
+    """Return True for files that should not appear in the generated TOC."""
+    return filename in _EXCLUDED_FROM_TOC
+
+
+def build_toc_lines(cheatsheets: Iterable[str]) -> List[str]:
+    """Return the list of fixed pre-defined index links for the TOC.
+
+    These four links are always emitted in this order; the ``cheatsheets``
+    argument is currently not consulted.
+    """
+    return [
+        cs_md_link_template % ("Index Alphabetical", "Index.md"),
+        cs_md_link_template % ("Index ASVS", "IndexASVS.md"),
+        cs_md_link_template % ("Index MASVS", "IndexMASVS.md"),
+        cs_md_link_template % ("Index Proactive Controls", "IndexProactiveControls.md"),
+    ]
+
+
+def main(cheatsheets_dir: str = "../cheatsheets", output_file: str = "TOC.md") -> int:
+    """Generate the summary markdown page.
+
+    Scans ``cheatsheets_dir`` for files, sorts them alphabetically, and
+    writes a UTF-8 SUMMARY-style markdown file at ``output_file``. Returns
+    0 on success.
+    """
+    cheatsheets = sorted(
+        f.name for f in os.scandir(cheatsheets_dir) if f.is_file()
+    )
+    with open(output_file, "w", encoding="utf-8") as index_file:
+        index_file.write("# Summary\n\n")
+        index_file.write("### Cheatsheets\n\n")
+        for link in build_toc_lines(cheatsheets):
+            index_file.write(link)
             index_file.write("\n")
-print("Summary markdown page generated.")
+        for cheatsheet in cheatsheets:
+            if not should_skip(cheatsheet):
+                index_file.write(
+                    cs_md_link_template % (to_display_name(cheatsheet), cheatsheet)
+                )
+                index_file.write("\n")
+    print("Summary markdown page generated.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -10,36 +10,95 @@
 
 Dependencies: pip install requests
 """
-import sys
-import requests
 import json
+import sys
 from collections import OrderedDict
+from typing import Dict, List, Optional, Tuple
+
+import requests
 
 # Define templates
 CS_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/%s.html"
+INDEX_URL = (
+    "https://raw.githubusercontent.com/OWASP/CheatSheetSeries/master/Index.md"
+)
+
+
+def parse_index_line(line: str) -> Optional[Tuple[str, List[str]]]:
+    """Parse a single line from ``Index.md``.
+
+    Index lines that reference technology icons have the shape::
+
+        [Cheatsheet Name](cheatsheets/Filename.md) ![Tech](assets/Index_Tech.svg) ...
+
+    This function returns a ``(cheatsheet_name, [technology_names])`` tuple
+    for any such line, or ``None`` for lines that do not reference
+    technology icons.
+
+    Returns:
+        A tuple of the cheatsheet display name and the list of
+        uppercased technology names, or ``None`` if the line has no
+        technology icon references.
+    """
+    if "(assets/Index_" not in line:
+        return None
+    work = line.strip()
+    cs_name = work[1:work.index("]")]
+    technologies = work.split("!")[1:]
+    tech_names = [tech[1:tech.index("]")].upper() for tech in technologies]
+    return cs_name, tech_names
 
-# Grab the index MD source from the GitHub repository
-response = requests.get(
-    "https://raw.githubusercontent.com/OWASP/CheatSheetSeries/master/Index.md")
-if response.status_code != 200:
-    print("Cannot load the INDEX content: HTTP %s received!" %
-          response.status_code)
-    sys.exit(1)
-else:
-    data = OrderedDict({})
-    for line in response.text.split("\n"):
-        if "(assets/Index_" in line:
-            work = line.strip()
-            # Extract the name of the CS
-            cs_name = work[1:work.index("]")]
-            # Extract technologies and map the CS to them
-            technologies = work.split("!")[1:]
-            for technology in technologies:
-                technology_name = technology[1:technology.index("]")].upper()
-                if technology_name not in data:
-                    data[technology_name] = []
-                data[technology_name].append(
-                    {"CS_NAME": cs_name, "CS_URL": CS_BASE_URL % cs_name.replace(" ", "_")})
-    # Display the built structure and formatted JSON
+
+def build_technologies_dict(
+    index_text: str,
+) -> "OrderedDict[str, List[Dict[str, str]]]":
+    """Build the technology -> [cheatsheet] mapping from ``Index.md`` text.
+
+    The returned dict preserves the order in which technologies first
+    appear in the index, matching the legacy behavior of the script.
+    """
+    data: "OrderedDict[str, List[Dict[str, str]]]" = OrderedDict()
+    for line in index_text.split("\n"):
+        parsed = parse_index_line(line)
+        if parsed is None:
+            continue
+        cs_name, tech_names = parsed
+        for tech in tech_names:
+            data.setdefault(tech, []).append(
+                {
+                    "CS_NAME": cs_name,
+                    "CS_URL": CS_BASE_URL % cs_name.replace(" ", "_"),
+                }
+            )
+    return data
+
+
+def fetch_index_text(url: str = INDEX_URL) -> Tuple[int, str]:
+    """Fetch the ``Index.md`` content from the given URL (30 s timeout).
+
+    Returns:
+        A ``(status_code, body)`` tuple. Callers are expected to check
+        the status code and emit a user-facing error if it is not 200.
+    """
+    response = requests.get(url, timeout=30)
+    return response.status_code, response.text
+
+
+def main() -> int:
+    """Fetch the index and print the technologies JSON to stdout.
+
+    Returns 0 on success and 1 if the upstream index cannot be fetched.
+    """
+    status, text = fetch_index_text()
+    if status != 200:
+        print(
+            "Cannot load the INDEX content: HTTP %s received!" % status
+        )
+        return 1
+    data = build_technologies_dict(text)
     print(json.dumps(data, sort_keys=True, indent=1))
-    sys.exit(0)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())