44
55import unittest
66
7- from core .email_utils import extract_emails_raw , extract_phones
7+ from core .email_utils import (
8+ best_email ,
9+ decode_cloudflare_email ,
10+ extract_emails_raw ,
11+ extract_phones ,
12+ score_email ,
13+ )
14+
15+ # ---------------------------------------------------------------------------
16+ # Minimal config matching config.example.yaml defaults used by score_email /
17+ # best_email — enough to exercise all four score tiers without requiring a
18+ # live config.yaml on disk.
19+ # ---------------------------------------------------------------------------
_CFG = {
    # Local-part keywords the tests expect to score 999 (e.g. "noreply").
    "skip_email_keywords": [
        "noreply", "no-reply", "donotreply", "unsubscribe",
        "postmaster", "webmaster", "bounce",
    ],
    # Generic mailbox names; the tests expect these to land in tier 2 or 3.
    "generic_email_keywords": [
        "info", "admin", "hello", "contact", "enquiries", "enquiry",
        "office", "mail", "email", "team", "support", "help", "sales",
        "accounts", "finance", "general", "service", "post",
    ],
    # Domains the tests expect to score 999 whatever the local part is.
    "junk_email_domains": ["mailinator.com", "yopmail.com", "example.com"],
}
832
933
1034class TestEmailUtils (unittest .TestCase ):
1135
36+ # ── existing tests (unchanged) ──────────────────────────────────────────
37+
1238 def test_mailto_query_string_stripped (self ):
1339 """Verifies that query strings and URL fragments are stripped from email addresses."""
1440 html = 'Contact us at <a href="mailto:hello@example.co.uk?subject=test">hello@example.co.uk</a>'
@@ -34,7 +60,6 @@ def test_html_entity_phone_decoded(self):
3460 """Verifies that HTML-entity-encoded phone numbers are decoded correctly."""
3561 html = "Call us: +44 207 123 4567"
3662 phones = extract_phones (html )
37- # Should decode to +44 207 123 4567
3863 self .assertTrue (any ("+44 207 123 4567" in p for p in phones ))
3964
4065 def test_decimal_phone_rejected (self ):
@@ -43,14 +68,74 @@ def test_decimal_phone_rejected(self):
4368 phones = extract_phones (html )
4469 self .assertFalse (any ("132.30" in p for p in phones ))
4570 self .assertTrue (any ("020 7123 4567" in p for p in phones ))
46-
71+
def test_zero_loop_phone_rejected(self) -> None:
    """Verifies that numbers containing three or more consecutive zeros are rejected.

    The input holds one number with a "000" run and one without; only the
    latter is expected among the extracted phones.
    """
    html = "Fake: 020 7000 4567, Real: 020 7123 4567"
    phones = extract_phones(html)
    # The "Fake" number contains the digit run "000" and must not be reported.
    self.assertFalse(any("020 7000 4567" in p for p in phones))
    # The "Real" number has no such run and must still be found.
    self.assertTrue(any("020 7123 4567" in p for p in phones))
5478
55- if __name__ == '__main__' :
56- unittest .main ()
79+ # ── NEW: decode_cloudflare_email ────────────────────────────────────────
80+
def test_decode_cloudflare_known_fixture(self) -> None:
    """Decodes a pre-computed Cloudflare hex string to its original address.

    Fixture derived from the XOR scheme: key=0x1a, plaintext=hello@example.com.
    The key byte comes first, followed by each plaintext byte XORed with the
    key; all 18 bytes are hex-encoded, giving a 36-character string.
    This exercises the full XOR-decode path without any network or browser.
    """
    # "1a" is the key; "72" ^ 0x1a == 0x68 == "h", and so on for each byte.
    encoded = "1a727f7676755a7f627b776a767f34797577"
    self.assertEqual(decode_cloudflare_email(encoded), "hello@example.com")
90+
def test_decode_cloudflare_invalid_hex_returns_empty(self) -> None:
    """Returns an empty string when the input is not valid hex or is empty."""
    # subTest reports each input separately, so a failure on the first
    # input does not hide the result for the second.
    for encoded in ("not-valid-hex!!", ""):
        with self.subTest(encoded=encoded):
            self.assertEqual(decode_cloudflare_email(encoded), "")
95+
96+ # ── NEW: score_email — all four tiers ───────────────────────────────────
97+
def test_score_email_tier_1_personal(self) -> None:
    """A personal-name local part scores 1 — highest quality, no generic keywords."""
    # "john.smith" matches none of the keyword lists in _CFG and the domain
    # is not listed under junk_email_domains.
    self.assertEqual(score_email("john.smith@realco.co.uk", _CFG), 1)
101+
def test_score_email_tiers_2_3_and_999(self) -> None:
    """Covers priority-generic (2), other-generic (3), and junk (999) in one method.

    Each address runs in its own subTest so that one wrong score does not
    mask the results for the remaining addresses.
    """
    expected_scores = (
        # Tier 2: exact priority set {"info","hello","contact","enquiries","enquiry"}
        ("info@realco.co.uk", 2),
        ("contact@realco.co.uk", 2),
        # Tier 3: in generic_email_keywords but not in the tier-2 priority set
        ("support@realco.co.uk", 3),
        ("accounts@realco.co.uk", 3),
        # Tier 999 via skip keyword in local part
        ("noreply@realco.co.uk", 999),
        # Tier 999 via junk domain regardless of local part
        ("info@mailinator.com", 999),
    )
    for address, expected in expected_scores:
        with self.subTest(address=address):
            self.assertEqual(score_email(address, _CFG), expected)
117+
118+ # ── NEW: best_email ─────────────────────────────────────────────────────
119+
def test_best_email_returns_lowest_score(self) -> None:
    """best_email picks the candidate with the lowest score (highest quality)."""
    # The expected winner is listed last, so returning the first candidate
    # would fail this test.
    candidates = [
        "support@realco.co.uk",     # tier 3
        "info@realco.co.uk",        # tier 2
        "james.hunt@realco.co.uk",  # tier 1 — should win
    ]
    self.assertEqual(best_email(candidates, _CFG), "james.hunt@realco.co.uk")
128+
def test_best_email_discards_junk_score_999(self) -> None:
    """Junk-scored (999) emails are excluded; best_email returns '' when all are junk."""
    scenarios = (
        # All junk — must return empty string
        ("all_junk", ["noreply@realco.co.uk", "info@mailinator.com"], ""),
        # Mix of junk and valid — valid one must be returned
        ("mixed", ["noreply@realco.co.uk", "hello@realco.co.uk"], "hello@realco.co.uk"),
    )
    # Separate subTests keep the two scenarios independently reported.
    for label, candidates, expected in scenarios:
        with self.subTest(scenario=label):
            self.assertEqual(best_email(candidates, _CFG), expected)
138+
139+
# Run the suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
0 commit comments