@@ -20,31 +20,64 @@ PYBIND11_MODULE(_fuzzybunny, m) {
2020
2121 m.def (" levenshtein" , [](const std::string& s1, const std::string& s2) {
2222 return levenshtein_ratio (utf8_to_u32 (s1), utf8_to_u32 (s2));
23- }, py::arg (" s1" ), py::arg (" s2" ), " Calculate Levenshtein ratio (0.0 - 1.0)" );
23+ }, py::arg (" s1" ), py::arg (" s2" ), R"pbdoc(
24+ Calculate the Levenshtein similarity ratio between two strings.
25+
26+ Returns a score between 0.0 and 1.0, where 1.0 is an exact match.
27+ The ratio is calculated as: 1 - (distance / max_length).
28+ )pbdoc" );
2429
2530 m.def (" partial_ratio" , [](const std::string& s1, const std::string& s2) {
2631 return partial_ratio (utf8_to_u32 (s1), utf8_to_u32 (s2));
27- }, py::arg (" s1" ), py::arg (" s2" ), " Calculate Partial Levenshtein ratio (0.0 - 1.0)" );
32+ }, py::arg (" s1" ), py::arg (" s2" ), R"pbdoc(
33+ Calculate the best substring similarity ratio.
34+
35+ If the shorter string has length k, this finds the best Levenshtein
36+ ratio between the shorter string and any substring of length k
37+ in the longer string.
38+ )pbdoc" );
2839
2940 m.def (" jaccard" , [](const std::string& s1, const std::string& s2) {
3041 return jaccard_similarity (utf8_to_u32 (s1), utf8_to_u32 (s2));
31- }, py::arg (" s1" ), py::arg (" s2" ), " Calculate Jaccard similarity (0.0 - 1.0)" );
42+ }, py::arg (" s1" ), py::arg (" s2" ), R"pbdoc(
43+ Calculate Jaccard similarity between token sets.
44+
45+ Tokenizes both strings and calculates the intersection over union
46+ of the unique tokens.
47+ )pbdoc" );
3248
3349 m.def (" token_sort" , [](const std::string& s1, const std::string& s2) {
3450 return token_sort_ratio (utf8_to_u32 (s1), utf8_to_u32 (s2));
35- }, py::arg (" s1" ), py::arg (" s2" ), " Calculate Token Sort ratio (0.0 - 1.0)" );
51+ }, py::arg (" s1" ), py::arg (" s2" ), R"pbdoc(
52+ Calculate similarity ratio after sorting tokens.
53+
54+ Tokenizes both strings, sorts the tokens alphabetically, joins them
55+ back with spaces, and then calculates the Levenshtein ratio.
56+ )pbdoc" );
3657
3758 m.def (" token_set" , [](const std::string& s1, const std::string& s2) {
3859 return token_set_ratio (utf8_to_u32 (s1), utf8_to_u32 (s2));
39- }, py::arg (" s1" ), py::arg (" s2" ), " Calculate Token Set ratio (0.0 - 1.0)" );
60+ }, py::arg (" s1" ), py::arg (" s2" ), R"pbdoc(
61+ Calculate similarity ratio while ignoring duplicates and token order.
62+
63+ Finds the intersection and differences between token sets and
64+ compares them to find the best possible match.
65+ )pbdoc" );
4066
4167 m.def (" qratio" , [](const std::string& s1, const std::string& s2) {
4268 return qratio (utf8_to_u32 (s1), utf8_to_u32 (s2));
43- }, py::arg (" s1" ), py::arg (" s2" ), " Calculate QRatio (0.0 - 1.0)" );
69+ }, py::arg (" s1" ), py::arg (" s2" ), R"pbdoc(
70+ A simple Levenshtein ratio matching the behavior of other fuzzy libs.
71+ )pbdoc" );
4472
4573 m.def (" wratio" , [](const std::string& s1, const std::string& s2) {
4674 return wratio (utf8_to_u32 (s1), utf8_to_u32 (s2));
47- }, py::arg (" s1" ), py::arg (" s2" ), " Calculate WRatio (0.0 - 1.0)" );
75+ }, py::arg (" s1" ), py::arg (" s2" ), R"pbdoc(
76+ Weighted similarity ratio (recommended for general use).
77+
78+ Combines Levenshtein, partial ratio, and token-based ratios using
79+ heuristics to provide the most 'intuitive' similarity score.
80+ )pbdoc" );
4881
4982 m.def (" rank" , &rank,
5083 py::arg (" query" ),
0 commit comments