Skip to content

Commit 9cad2fc

Browse files
committed
add 6 Cloudflare/score/best_email tests to test_email_utils.py
1 parent 92bd745 commit 9cad2fc

1 file changed

Lines changed: 91 additions & 6 deletions

File tree

tests/test_email_utils.py

Lines changed: 91 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,37 @@
44

55
import unittest
66

7-
from core.email_utils import extract_emails_raw, extract_phones
7+
from core.email_utils import (
8+
best_email,
9+
decode_cloudflare_email,
10+
extract_emails_raw,
11+
extract_phones,
12+
score_email,
13+
)
14+
15+
# ---------------------------------------------------------------------------
16+
# Minimal config matching config.example.yaml defaults used by score_email /
17+
# best_email — enough to exercise all four score tiers without requiring a
18+
# live config.yaml on disk.
19+
# ---------------------------------------------------------------------------
20+
_CFG = {
21+
"skip_email_keywords": [
22+
"noreply", "no-reply", "donotreply", "unsubscribe",
23+
"postmaster", "webmaster", "bounce",
24+
],
25+
"generic_email_keywords": [
26+
"info", "admin", "hello", "contact", "enquiries", "enquiry",
27+
"office", "mail", "email", "team", "support", "help", "sales",
28+
"accounts", "finance", "general", "service", "post",
29+
],
30+
"junk_email_domains": ["mailinator.com", "yopmail.com", "example.com"],
31+
}
832

933

1034
class TestEmailUtils(unittest.TestCase):
1135

36+
# ── existing tests (unchanged) ──────────────────────────────────────────
37+
1238
def test_mailto_query_string_stripped(self):
1339
"""Verifies that query strings and URL fragments are stripped from email addresses."""
1440
html = 'Contact us at <a href="mailto:hello@example.co.uk?subject=test">hello@example.co.uk</a>'
@@ -34,7 +60,6 @@ def test_html_entity_phone_decoded(self):
3460
"""Verifies that HTML-entity-encoded phone numbers are decoded correctly."""
3561
html = "Call us: &#x2B;44 207 123 4567"
3662
phones = extract_phones(html)
37-
# Should decode to +44 207 123 4567
3863
self.assertTrue(any("+44 207 123 4567" in p for p in phones))
3964

4065
def test_decimal_phone_rejected(self):
@@ -43,14 +68,74 @@ def test_decimal_phone_rejected(self):
4368
phones = extract_phones(html)
4469
self.assertFalse(any("132.30" in p for p in phones))
4570
self.assertTrue(any("020 7123 4567" in p for p in phones))
46-
71+
4772
def test_zero_loop_phone_rejected(self):
    """Check that a number with a run of three zeros is dropped while a normal one is kept."""
    sample = "Fake: 020 7000 4567, Real: 020 7123 4567"
    found = extract_phones(sample)

    # The fake number collapses to 02070004567, which contains "000".
    fake_present = any("020 7000 4567" in number for number in found)
    self.assertFalse(fake_present)

    real_present = any("020 7123 4567" in number for number in found)
    self.assertTrue(real_present)
5478

55-
if __name__ == '__main__':
56-
unittest.main()
79+
# ── NEW: decode_cloudflare_email ────────────────────────────────────────
80+
81+
def test_decode_cloudflare_known_fixture(self):
    """Check that a pre-computed Cloudflare-obfuscated string decodes to its address.

    The fixture was built by XOR-ing every byte of ``hello@example.com``
    with the key 0x1a, prepending the key byte, and hex-encoding the
    result (18 bytes, 36 hex characters). No network or browser is needed.
    """
    expected = "hello@example.com"
    # First hex pair ("1a") is the key; the remaining pairs are the masked bytes.
    fixture = "1a727f7676755a7f627b776a767f34797577"
    decoded = decode_cloudflare_email(fixture)
    self.assertEqual(decoded, expected)
90+
91+
def test_decode_cloudflare_invalid_hex_returns_empty(self):
    """Check that non-hex input and empty input both decode to an empty string."""
    # Order matters only for which failure is reported first: malformed, then empty.
    for bad_input in ("not-valid-hex!!", ""):
        self.assertEqual(decode_cloudflare_email(bad_input), "")
95+
96+
# ── NEW: score_email — all four tiers ───────────────────────────────────
97+
98+
def test_score_email_tier_1_personal(self):
    """Check that a personal-name address with no generic keyword scores 1."""
    personal_address = "john.smith@realco.co.uk"
    score = score_email(personal_address, _CFG)
    self.assertEqual(score, 1)
101+
102+
def test_score_email_tiers_2_3_and_999(self):
    """Check the priority-generic (2), other-generic (3) and junk (999) scores.

    Each address is checked inside its own ``subTest`` so that one failing
    tier does not hide the results for the remaining addresses.
    """
    expectations = [
        # Tier 2: local part is one of the priority generic keywords.
        ("info@realco.co.uk", 2),
        ("contact@realco.co.uk", 2),
        # Tier 3: listed in generic_email_keywords but not a priority keyword.
        ("support@realco.co.uk", 3),
        ("accounts@realco.co.uk", 3),
        # Tier 999: skip keyword in the local part.
        ("noreply@realco.co.uk", 999),
        # Tier 999: junk domain, even though the local part is a tier-2 keyword.
        ("info@mailinator.com", 999),
    ]
    for address, expected_score in expectations:
        with self.subTest(address=address):
            self.assertEqual(score_email(address, _CFG), expected_score)
117+
118+
# ── NEW: best_email ─────────────────────────────────────────────────────
119+
120+
def test_best_email_returns_lowest_score(self):
    """Check that best_email returns the candidate expected to score lowest."""
    tier_3 = "support@realco.co.uk"
    tier_2 = "info@realco.co.uk"
    tier_1 = "james.hunt@realco.co.uk"  # expected winner

    # The winner is deliberately placed last in the candidate list.
    chosen = best_email([tier_3, tier_2, tier_1], _CFG)
    self.assertEqual(chosen, tier_1)
128+
129+
def test_best_email_discards_junk_score_999(self):
    """Check that junk candidates are never returned by best_email.

    With only junk candidates the expected result is an empty string; with
    one junk and one valid candidate the valid address is expected.
    """
    # Case 1: a skip-keyword address plus a junk-domain address.
    junk_only = ["noreply@realco.co.uk", "info@mailinator.com"]
    result_for_junk = best_email(junk_only, _CFG)
    self.assertEqual(result_for_junk, "")

    # Case 2: the junk entry comes first; the valid address must still be picked.
    junk_then_valid = ["noreply@realco.co.uk", "hello@realco.co.uk"]
    result_for_mixed = best_email(junk_then_valid, _CFG)
    self.assertEqual(result_for_mixed, "hello@realco.co.uk")
138+
139+
140+
# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)