HashTrace/Rabin_karp.cpp at main · Praveenkumar76/HashTrace · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#include "Rabin_karp.h"
#include <cmath>
#include <algorithm>
#include <stdexcept>

double RabinKarp::computeSimilarity(const std::string &text1, const std::string &text2, int k)
{
    if (k <= 0) {
        throw std::invalid_argument("k-gram size must be positive");
    }

    if (text1.empty() && text2.empty()) return 1.0;
    if (text1.empty() || text2.empty()) return 0.0;

    // Use minimum length to prevent generating more hashes than necessary
    const size_t min_len = std::min(text1.length(), text2.length());
    if (static_cast<size_t>(k) > min_len) {
        k = static_cast<int>(min_len);
    }

    auto set1 = generateUniqueHashes(text1, k);
    auto set2 = generateUniqueHashes(text2, k);

    if (set1.empty() && set2.empty()) return 1.0;
    if (set1.empty() || set2.empty()) return 0.0;

    // Compute Jaccard similarity coefficient
    size_t intersection = 0;
    for (const auto &h : set1) {
        if (set2.count(h)) intersection++;
    }

    size_t union_size = set1.size() + set2.size() - intersection;
    return static_cast<double>(intersection) / union_size;
}

std::vector<std::pair<size_t, size_t>> RabinKarp::findMatches(const std::string &text1, const std::string &text2, int k) {
    std::vector<std::pair<size_t, size_t>> matches;
    if (k <= 0 || text1.empty() || text2.empty()) return matches;

    // Ensure k doesn't exceed text lengths
    const size_t min_len = std::min(text1.length(), text2.length());
    if (static_cast<size_t>(k) > min_len) {
        k = static_cast<int>(min_len);
    }

    auto hashes1 = generateHashes(text1, k);
    auto hashes2 = generateHashes(text2, k);

    if (hashes1.empty() || hashes2.empty()) return matches;

    // Create a map of hash values to their positions in text1
    std::unordered_map<long long, std::vector<size_t>> hashPositions;
    for (size_t i = 0; i < hashes1.size(); ++i) {
        hashPositions[hashes1[i]].push_back(i);
    }

    // Find matches in text2
    for (size_t j = 0; j < hashes2.size(); ++j) {
        auto it = hashPositions.find(hashes2[j]);
        if (it != hashPositions.end()) {
            // Add all positions where this hash occurs in text1
            for (size_t pos1 : it->second) {
                matches.emplace_back(pos1, j);
            }
        }
    }

    // Sort matches by position in first text
    std::sort(matches.begin(), matches.end());

    return matches;
}

std::vector<long long> RabinKarp::generateHashes(const std::string &text, int k) const
{
    std::vector<long long> hashes;
    const size_t text_len = text.length();

    if (text_len < static_cast<size_t>(k) || k <= 0) {
        return hashes;
    }

    long long hash = 0;
    long long power = 1;

    // Precompute power = BASE^(k-1) % MOD
    for (int i = 0; i < k - 1; ++i) {
        power = (power * BASE) % MOD;
    }

    // Compute initial window hash
    for (int i = 0; i < k; ++i) {
        hash = (hash * BASE + static_cast<unsigned char>(text[i])) % MOD;
    }
    hashes.push_back(hash);

    // Rolling hash for remaining windows
    for (size_t i = k; i < text_len; ++i) {
        // Remove leftmost character
        long long leftChar = static_cast<unsigned char>(text[i - k]);
        hash = (hash - (leftChar * power) % MOD + MOD) % MOD;

        // Add new character
        hash = (hash * BASE + static_cast<unsigned char>(text[i])) % MOD;
        hashes.push_back(hash);
    }

    return hashes;
}

std::unordered_set<long long> RabinKarp::generateUniqueHashes(const std::string &text, int k) const
{
    auto hashes = generateHashes(text, k);
    return {hashes.begin(), hashes.end()};
}