Merge pull request #380 from Sid0004/issue-300-tests

KingAkeem · web-flow · commit 6df64d0b1fc9 · 2025-10-29T15:53:35.000-04:00
tests: add comprehensive unit tests for linktree module (fixes #300)
diff --git a/run_tests.py b/run_tests.py
@@ -0,0 +1,16 @@
+r"""Helper to run pytest with the `src` layout on systems where the package isn't installed.
+
+Usage:
+  .venv\Scripts\python.exe run_tests.py -q
+"""
+import os
+import sys
+
+# Resolve `src` next to this script (not the CWD) so `import torbot` works from any directory
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "src"))
+
+import pytest  # noqa: E402 -- must come after the sys.path adjustment above
+
+if __name__ == "__main__":
+    args = sys.argv[1:] or ["-q"]  # fall back to quiet output when no arguments are given
+    raise SystemExit(pytest.main(args))
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,35 @@
+"""pytest configuration and test-time shims for TorBot tests.
+
+This module provides a lightweight stub for the NLP classifier so tests can
+run without installing the full scientific stack (numpy, scikit-learn, etc.).
+The stub is only active during test runs and does not affect production code.
+"""
+import sys
+import types
+
+
+def _install_nlp_stub():
+    """Register a fake ``torbot.modules.nlp.main`` module in ``sys.modules``.
+
+    Does nothing when a module is already registered under that name.
+    """
+    target = "torbot.modules.nlp.main"
+
+    def classify(data):
+        """Stand-in classifier: ignore ``data`` and return a fixed result.
+
+        Always yields ``["unknown", 0.0]`` so no ML libraries are needed.
+        """
+        del data  # intentionally unused by the stub
+        return ["unknown", 0.0]
+
+    # Leave any already-registered module untouched
+    if target not in sys.modules:
+        fake_module = types.ModuleType(target)
+        # Attach the stand-in as the module's `classify` attribute
+        fake_module.classify = classify
+        sys.modules[target] = fake_module
+
+
+# Runs when this conftest module is imported, i.e. before pytest imports the test modules
+_install_nlp_stub()
diff --git a/tests/test_linktree_extra.py b/tests/test_linktree_extra.py
@@ -0,0 +1,158 @@
+"""Additional edge-case tests for linktree parsing functions.
+
+These tests cover corner cases and error conditions for the parsing helpers.
+"""
+from bs4 import BeautifulSoup
+import pytest
+
+from torbot.modules.linktree import (
+    parse_hostname,
+    parse_links,
+    parse_emails,
+    parse_phone_numbers,
+)
+
+
+def test_parse_hostname_raises_on_invalid_url() -> None:
+    """parse_hostname must raise when the input has no hostname component."""
+    with pytest.raises(Exception, match="unable to parse hostname"):
+        _ = parse_hostname("not-a-valid-url")
+
+
+def test_parse_hostname_handles_various_schemes() -> None:
+    """parse_hostname returns the bare host for http/https URLs, dropping path and port."""
+    cases = {"https://www.example.com/path": "www.example.com", "http://test.onion": "test.onion",
+             "https://sub.domain.co.uk:8080/": "sub.domain.co.uk"}
+    assert {url: parse_hostname(url) for url in cases} == cases
+
+
+def test_parse_links_filters_only_valid_full_urls() -> None:
+    """parse_links keeps absolute http(s) hrefs in document order, repeats included."""
+    page = """
+    <html>
+      <a href="/relative/path">relative</a>
+      <a href="https://valid.example/path">valid</a>
+      <a href="http://also.valid.test/">valid2</a>
+      <a href="javascript:void(0)">js</a>
+      <a href="https://valid.example/path">valid-duplicate</a>
+    </html>
+    """
+    # the relative path and the javascript: href are dropped; the repeated URL stays
+    expected = [
+        "https://valid.example/path",
+        "http://also.valid.test/",
+        "https://valid.example/path",
+    ]
+
+    assert parse_links(page) == expected
+
+
+def test_parse_links_empty_html() -> None:
+    """A document without any anchor tags yields an empty list of links."""
+    anchorless_page = "<html><body><p>No links here</p></body></html>"
+    # must be exactly an empty list, not merely something falsy
+    assert parse_links(anchorless_page) == []
+
+
+def test_parse_links_anchor_without_href() -> None:
+    """Anchors lacking an href are skipped; only the one real link is returned."""
+    page = """
+    <html>
+      <a>No href</a>
+      <a name="anchor">Named anchor</a>
+      <a href="https://valid.com">Valid</a>
+    </html>
+    """
+    # the bare <a> and the name-only anchor contribute nothing
+    assert parse_links(page) == ["https://valid.com"]
+
+
+def test_parse_emails_ignores_invalid_and_returns_unique() -> None:
+    """Verify parse_emails rejects an invalid address and reports a repeated one once."""
+    doc = BeautifulSoup(
+        """
+        <html>
+          <a href="mailto:good@example.com">good</a>
+          <a href="mailto:good@example.com">good-dup</a>
+          <a href="mailto:bad-email@invalid@">bad</a>
+          <a href="mailto:withparams@example.com?subject=hi">withparams</a>
+          <a href="#">not-mailto</a>
+        </html>
+        """,
+        "html.parser",
+    )
+
+    emails = parse_emails(doc)
+    # The address with a stray second "@" must be rejected outright.
+    assert "bad-email@invalid@" not in emails
+    # The withparams entry is deliberately not asserted: whether the "?subject=" suffix
+    # survives depends on how the implementation splits the mailto: href.
+    assert emails.count("good@example.com") == 1  # present exactly once (deduplicated)
+
+
+def test_parse_emails_empty_page() -> None:
+    """A page with no mailto links produces an empty email list."""
+    markup = "<html><body><p>No emails</p></body></html>"
+    page = BeautifulSoup(markup, "html.parser")
+    assert parse_emails(page) == []
+
+
+def test_parse_emails_malformed_mailto() -> None:
+    """An empty or non-address mailto target is dropped; the valid one survives."""
+    markup = """
+        <html>
+          <a href="mailto:">empty</a>
+          <a href="mailto:not-an-email">invalid</a>
+          <a href="mailto:valid@test.com">valid</a>
+        </html>
+        """
+
+    page = BeautifulSoup(markup, "html.parser")
+    # "mailto:" and "mailto:not-an-email" carry no usable address
+    expected = ["valid@test.com"]
+    found = parse_emails(page)
+    assert found == expected
+
+
+def test_parse_phone_numbers_only_accepts_possible_international_numbers() -> None:
+    """Only tel: links holding "+"-prefixed international numbers are returned."""
+    markup = """
+        <html>
+          <a href="tel:+14155552671">us</a>
+          <a href="tel:4155552671">no-plus</a>
+          <a href="tel:+442071838750">uk</a>
+          <a href="tel:invalid_phone">invalid</a>
+        </html>
+        """
+    page = BeautifulSoup(markup, "html.parser")
+
+    # the number without "+" and the non-numeric value are expected to be rejected
+    accepted = ["+14155552671", "+442071838750"]
+    found = parse_phone_numbers(page)
+    # result order is not what is being checked here, hence the sort
+    assert sorted(found) == accepted
+
+
+def test_parse_phone_numbers_empty_page() -> None:
+    """A page with no tel: links produces an empty phone-number list."""
+    markup = "<html><body><p>No phones</p></body></html>"
+    page = BeautifulSoup(markup, "html.parser")
+    assert parse_phone_numbers(page) == []
+
+
+def test_parse_phone_numbers_removes_duplicates() -> None:
+    """A number repeated across tel: links is reported only once."""
+    markup = """
+        <html>
+          <a href="tel:+14155551234">call</a>
+          <a href="tel:+14155551234">call again</a>
+          <a href="tel:+14155559999">other</a>
+        </html>
+        """
+
+    page = BeautifulSoup(markup, "html.parser")
+    found = parse_phone_numbers(page)
+    # three links, two distinct numbers
+    distinct = {"+14155551234", "+14155559999"}
+    assert len(found) == 2
+    assert set(found) == distinct
diff --git a/tests/test_linktree_tree.py b/tests/test_linktree_tree.py