|
| 1 | +"""Unit tests for text utilities.""" |
| 2 | + |
| 3 | +from extropy.simulation.text_utils import compute_trigram_jaccard |
| 4 | + |
| 5 | + |
class TestComputeTrigramJaccard:
    """Checks on the scores returned by ``compute_trigram_jaccard``."""

    def test_identical_texts_returns_1(self):
        """Comparing a sentence with itself must score exactly 1.0."""
        sentence = (
            "I am very worried about my job security and what this means for my family"
        )
        score = compute_trigram_jaccard(sentence, sentence)
        assert score == 1.0

    def test_completely_different_texts_returns_0(self):
        """Two unrelated sentences must score below 0.1."""
        score = compute_trigram_jaccard(
            "The quick brown fox jumps over the lazy dog",
            "A completely unrelated sentence with no overlap whatsoever here",
        )
        assert score < 0.1

    def test_similar_texts_high_similarity(self):
        """Sentences that differ by a single word must score above 0.6."""
        original = "I am worried about my job and what this means for my family"
        reworded = "I am worried about my job and what this means for our family"
        score = compute_trigram_jaccard(original, reworded)
        # Recorded expectation: swapping one word leaves roughly 0.69
        # (9 shared of 13 total trigrams, assuming word-level trigrams).
        assert score > 0.6

    def test_short_text_returns_0(self):
        """Inputs with fewer than three words score 0.0, even when identical."""
        too_short_pairs = (
            ("hello world", "hello world"),
            ("one", "two"),
        )
        for left, right in too_short_pairs:
            assert compute_trigram_jaccard(left, right) == 0.0

    def test_empty_text_returns_0(self):
        """An empty string on either or both sides scores 0.0."""
        empty_cases = (
            ("", ""),
            ("hello there friend", ""),
        )
        for left, right in empty_cases:
            assert compute_trigram_jaccard(left, right) == 0.0

    def test_case_insensitive(self):
        """Letter case must not affect the score."""
        mixed_case = "I Am Worried About My Job"
        lower_case = "i am worried about my job"
        score = compute_trigram_jaccard(mixed_case, lower_case)
        assert score == 1.0

    def test_partial_overlap(self):
        """A shared opening with different endings scores strictly between 0.2 and 0.8."""
        cut_costs = "I need to save money and cut expenses immediately"
        invest = "I need to save money but also invest for the future"
        score = compute_trigram_jaccard(cut_costs, invest)
        # Only the leading words are common, so neither extreme is expected.
        assert 0.2 < score < 0.8

    def test_repetitive_reasoning_detection(self):
        """Reasoning that largely repeats an earlier round must score above 0.3."""
        earlier = (
            "I'm terrified about losing my job. Need to cut spending and save money. "
            "Maybe look for backup work. Lisa and I need to talk about the budget."
        )
        later = (
            "Still terrified about losing my job. Need to cut spending and save money. "
            "Looking at gig apps for backup work. Lisa and I talked about the budget."
        )
        score = compute_trigram_jaccard(earlier, later)
        # Recorded expectation: about 0.43 -- the themes repeat but the wording
        # is partly paraphrased, so the score sits well above unrelated texts.
        assert score > 0.3

    def test_different_reasoning_low_similarity(self):
        """Reasoning on a different topic must score below 0.3."""
        anxious = (
            "I'm terrified about losing my job. Need to cut spending and save money. "
            "Maybe look for backup work."
        )
        hopeful = (
            "Actually feeling more optimistic now. The retraining program looks promising. "
            "I signed up for the AI course and it's going well."
        )
        score = compute_trigram_jaccard(anxious, hopeful)
        assert score < 0.3
0 commit comments