-
-
Notifications
You must be signed in to change notification settings - Fork 583
feat(search): add unicode character removal for fuzzy matching #4360
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Changes from all commits
d2f8663
03164fb
f905530
17b2970
0588fa6
c270c9d
6dfb208
0059f3e
352fdc7
bb10b1f
99e7810
56973ed
b6028dc
a52a6af
839b978
c2df52e
99f2b3b
54a458d
2c53648
52bb34f
0559145
7732442
b7bd1cc
2228633
4cbe223
c945d08
e0f7722
459a369
0228f7a
7607844
2ab6405
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,30 +1,41 @@ | ||
| using CommunityToolkit.Mvvm.DependencyInjection; | ||
| using Flow.Launcher.Plugin.SharedModels; | ||
| using System; | ||
| using System; | ||
| using System.Buffers; | ||
| using System.Collections.Generic; | ||
| using System.Linq; | ||
| using CommunityToolkit.Mvvm.DependencyInjection; | ||
| using Flow.Launcher.Infrastructure.UserSettings; | ||
| using Flow.Launcher.Plugin.SharedModels; | ||
|
|
||
| namespace Flow.Launcher.Infrastructure | ||
| { | ||
| public class StringMatcher | ||
| { | ||
| private readonly MatchOption _defaultMatchOption = new(); | ||
|
|
||
| private readonly Settings _settings; | ||
| public SearchPrecisionScore UserSettingSearchPrecision { get; set; } | ||
|
|
||
| private readonly IAlphabet _alphabet; | ||
|
|
||
| public StringMatcher(IAlphabet alphabet, Settings settings) | ||
| { | ||
| _alphabet = alphabet; | ||
| UserSettingSearchPrecision = settings.QuerySearchPrecision; | ||
| _settings = settings; | ||
| UserSettingSearchPrecision = _settings.QuerySearchPrecision; | ||
|
|
||
| _settings.PropertyChanged += (sender, e) => | ||
| { | ||
| switch (e.PropertyName) | ||
| { | ||
| case nameof(Settings.QuerySearchPrecision): | ||
| UserSettingSearchPrecision = _settings.QuerySearchPrecision; | ||
| break; | ||
| } | ||
| }; | ||
| } | ||
|
|
||
| // This is a workaround to allow unit tests to set the instance | ||
| public StringMatcher(IAlphabet alphabet) | ||
| public StringMatcher(IAlphabet alphabet) : this(alphabet, new Settings()) | ||
| { | ||
| _alphabet = alphabet; | ||
| } | ||
|
|
||
| public static MatchResult FuzzySearch(string query, string stringToCompare) | ||
|
|
@@ -80,10 +91,22 @@ public MatchResult FuzzyMatch(string query, string stringToCompare, MatchOption | |
| int acronymsTotalCount = 0; | ||
| int acronymsMatched = 0; | ||
|
|
||
| var fullStringToCompareWithoutCase = opt.IgnoreCase ? stringToCompare.ToLower() : stringToCompare; | ||
| var queryWithoutCase = opt.IgnoreCase ? query.ToLower() : query; | ||
| var fullStringToCompare = stringToCompare; | ||
| var queryToCompare = query; | ||
|
|
||
| if (_settings.IgnoreAccents) | ||
| { | ||
| fullStringToCompare = Normalize(fullStringToCompare); | ||
| queryToCompare = Normalize(queryToCompare); | ||
| } | ||
|
|
||
| if (opt.IgnoreCase) | ||
| { | ||
| fullStringToCompare = fullStringToCompare.ToLower(); | ||
| queryToCompare = queryToCompare.ToLower(); | ||
| } | ||
|
|
||
| var querySubstrings = queryWithoutCase.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries); | ||
| var querySubstrings = queryToCompare.Split([' '], StringSplitOptions.RemoveEmptyEntries); | ||
| int currentQuerySubstringIndex = 0; | ||
| var currentQuerySubstring = querySubstrings[currentQuerySubstringIndex]; | ||
| var currentQuerySubstringCharacterIndex = 0; | ||
|
|
@@ -98,7 +121,9 @@ public MatchResult FuzzyMatch(string query, string stringToCompare, MatchOption | |
| var indexList = new List<int>(); | ||
| List<int> spaceIndices = new List<int>(); | ||
|
|
||
| for (var compareStringIndex = 0; compareStringIndex < fullStringToCompareWithoutCase.Length; compareStringIndex++) | ||
| for (var compareStringIndex = 0; | ||
| compareStringIndex < fullStringToCompare.Length; | ||
| compareStringIndex++) | ||
| { | ||
| // If acronyms matching successfully finished, this gets the remaining not matched acronyms for score calculation | ||
| if (currentAcronymQueryIndex >= query.Length && acronymsMatched == query.Length) | ||
|
|
@@ -114,14 +139,14 @@ public MatchResult FuzzyMatch(string query, string stringToCompare, MatchOption | |
|
|
||
| // To maintain a list of indices which correspond to spaces in the string to compare | ||
| // To populate the list only for the first query substring | ||
| if (fullStringToCompareWithoutCase[compareStringIndex] == ' ' && currentQuerySubstringIndex == 0) | ||
| if (fullStringToCompare[compareStringIndex] == ' ' && currentQuerySubstringIndex == 0) | ||
| spaceIndices.Add(compareStringIndex); | ||
|
|
||
| // Acronym Match | ||
| if (IsAcronym(stringToCompare, compareStringIndex)) | ||
| { | ||
| if (fullStringToCompareWithoutCase[compareStringIndex] == | ||
| queryWithoutCase[currentAcronymQueryIndex]) | ||
| if (fullStringToCompare[compareStringIndex] == | ||
| queryToCompare[currentAcronymQueryIndex]) | ||
| { | ||
| acronymMatchData.Add(compareStringIndex); | ||
| acronymsMatched++; | ||
|
|
@@ -133,7 +158,7 @@ public MatchResult FuzzyMatch(string query, string stringToCompare, MatchOption | |
| if (IsAcronymCount(stringToCompare, compareStringIndex)) | ||
| acronymsTotalCount++; | ||
|
|
||
| if (allQuerySubstringsMatched || fullStringToCompareWithoutCase[compareStringIndex] != | ||
| if (allQuerySubstringsMatched || fullStringToCompare[compareStringIndex] != | ||
| currentQuerySubstring[currentQuerySubstringCharacterIndex]) | ||
| { | ||
| matchFoundInPreviousLoop = false; | ||
|
|
@@ -160,7 +185,7 @@ public MatchResult FuzzyMatch(string query, string stringToCompare, MatchOption | |
| var startIndexToVerify = compareStringIndex - currentQuerySubstringCharacterIndex; | ||
|
|
||
| if (AllPreviousCharsMatched(startIndexToVerify, currentQuerySubstringCharacterIndex, | ||
| fullStringToCompareWithoutCase, currentQuerySubstring)) | ||
| fullStringToCompare, currentQuerySubstring)) | ||
| { | ||
| matchFoundInPreviousLoop = true; | ||
|
|
||
|
|
@@ -205,7 +230,8 @@ public MatchResult FuzzyMatch(string query, string stringToCompare, MatchOption | |
|
|
||
| if (acronymScore >= (int)UserSettingSearchPrecision) | ||
| { | ||
| acronymMatchData = acronymMatchData.Select(x => translationMapping?.MapToOriginalIndex(x) ?? x).Distinct().ToList(); | ||
| acronymMatchData = acronymMatchData.Select(x => translationMapping?.MapToOriginalIndex(x) ?? x) | ||
| .Distinct().ToList(); | ||
| return new MatchResult(true, UserSettingSearchPrecision, acronymMatchData, acronymScore); | ||
| } | ||
| } | ||
|
|
@@ -218,19 +244,134 @@ public MatchResult FuzzyMatch(string query, string stringToCompare, MatchOption | |
| // firstMatchIndex - nearestSpaceIndex - 1 is to set the firstIndex as the index of the first matched char | ||
| // preceded by a space e.g. 'world' matching 'hello world' firstIndex would be 0 not 6 | ||
| // giving more weight than 'we or donald' by allowing the distance calculation to treat the starting position at after the space. | ||
| var score = CalculateSearchScore(query, stringToCompare, firstMatchIndex - nearestSpaceIndex - 1, spaceIndices, | ||
| var score = CalculateSearchScore(query, stringToCompare, firstMatchIndex - nearestSpaceIndex - 1, | ||
| spaceIndices, | ||
| lastMatchIndex - firstMatchIndex, allSubstringsContainedInCompareString); | ||
|
|
||
| var resultList = indexList.Select(x => translationMapping?.MapToOriginalIndex(x) ?? x).Distinct().ToList(); | ||
| var resultList = indexList.Select(x => translationMapping?.MapToOriginalIndex(x) ?? x).Distinct() | ||
| .ToList(); | ||
| return new MatchResult(true, UserSettingSearchPrecision, resultList, score); | ||
| } | ||
|
|
||
| return new MatchResult(false, UserSettingSearchPrecision); | ||
| } | ||
|
|
||
|
|
||
| private static readonly Dictionary<char, char> AccentMap = new() | ||
| { | ||
| ['á'] = 'a', | ||
| ['à'] = 'a', | ||
| ['ã'] = 'a', | ||
| ['â'] = 'a', | ||
| ['ä'] = 'a', | ||
| ['å'] = 'a', | ||
| ['ā'] = 'a', | ||
| ['ă'] = 'a', | ||
| ['ą'] = 'a', | ||
| ['é'] = 'e', | ||
| ['è'] = 'e', | ||
| ['ê'] = 'e', | ||
| ['ë'] = 'e', | ||
| ['ē'] = 'e', | ||
| ['ĕ'] = 'e', | ||
| ['ė'] = 'e', | ||
| ['ę'] = 'e', | ||
| ['ě'] = 'e', | ||
| ['í'] = 'i', | ||
| ['ì'] = 'i', | ||
| ['î'] = 'i', | ||
| ['ï'] = 'i', | ||
| ['ī'] = 'i', | ||
| ['ĭ'] = 'i', | ||
| ['į'] = 'i', | ||
| ['ı'] = 'i', | ||
| ['ó'] = 'o', | ||
| ['ò'] = 'o', | ||
| ['õ'] = 'o', | ||
| ['ô'] = 'o', | ||
| ['ö'] = 'o', | ||
| ['ø'] = 'o', | ||
| ['ō'] = 'o', | ||
| ['ŏ'] = 'o', | ||
| ['ő'] = 'o', | ||
| ['ú'] = 'u', | ||
| ['ù'] = 'u', | ||
| ['û'] = 'u', | ||
| ['ü'] = 'u', | ||
| ['ū'] = 'u', | ||
| ['ŭ'] = 'u', | ||
| ['ů'] = 'u', | ||
| ['ű'] = 'u', | ||
| ['ų'] = 'u', | ||
| ['ç'] = 'c', | ||
| ['ć'] = 'c', | ||
| ['ĉ'] = 'c', | ||
| ['ċ'] = 'c', | ||
| ['č'] = 'c', | ||
| ['ñ'] = 'n', | ||
| ['ń'] = 'n', | ||
| ['ņ'] = 'n', | ||
| ['ň'] = 'n', | ||
| ['ŋ'] = 'n', | ||
| ['ý'] = 'y', | ||
| ['ÿ'] = 'y', | ||
| ['ŷ'] = 'y', | ||
| ['ś'] = 's', | ||
| ['ŝ'] = 's', | ||
| ['ş'] = 's', | ||
| ['š'] = 's', | ||
| ['ß'] = 's', | ||
| ['ź'] = 'z', | ||
| ['ż'] = 'z', | ||
| ['ž'] = 'z', | ||
| ['ł'] = 'l', | ||
| ['ď'] = 'd', | ||
| ['đ'] = 'd', | ||
| ['ĝ'] = 'g', | ||
| ['ğ'] = 'g', | ||
| ['ġ'] = 'g', | ||
| ['ģ'] = 'g', | ||
| ['ĥ'] = 'h', | ||
| ['ħ'] = 'h', | ||
| ['ĵ'] = 'j', | ||
| ['ķ'] = 'k', | ||
| ['ŕ'] = 'r', | ||
| ['ř'] = 'r', | ||
| ['ţ'] = 't', | ||
| ['ť'] = 't', | ||
| ['ŧ'] = 't', | ||
| ['æ'] = 'a', | ||
| ['œ'] = 'o' | ||
| }; | ||
|
|
||
| public static string Normalize(string value) | ||
| { | ||
| if (string.IsNullOrEmpty(value)) return value; | ||
| char[] arrayFromPool = null; | ||
| Span<char> buffer = value.Length <= 512 | ||
| ? stackalloc char[value.Length] | ||
| : (arrayFromPool = ArrayPool<char>.Shared.Rent(value.Length)); | ||
| try | ||
| { | ||
| for (int i = 0; i < value.Length; i++) | ||
| { | ||
| var c = char.ToLowerInvariant(value[i]); | ||
| buffer[i] = AccentMap.TryGetValue(c, out var mapped) ? mapped : c; | ||
| } | ||
|
Comment on lines
+356
to
+360
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a reason this can't be put inside the existing loop? Normalize is still called to loop through both compare strings before the main loop, which also loops through the new strings.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In a way, I placed this loop outside to keep it O(n). If I left this loop inside the existing loop, it would have to traverse the same string more than once, resulting in O(n²) complexity. This is my understanding—I could be wrong, but this is the conclusion I reached.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jjw24 I took a closer look to give a clearer answer: I didn’t put the normalization inside the loop because it needs to already be normalized before entering the loop to perform the processing.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, mostly true — the queryToCompare string will need to be normalized before entering because of the splitting into substrings, but I still think there are benefits to moving the stringToCompare normalization inside the main loop so it's not doubling up.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will create a PR against this one to show and review, leave it with me.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, do the changes in my PR work?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Sorry for the delay, I tested it now and it didn’t work.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It doesn't normalize the character? Can you show me how to reproduce please.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
First test I did was the usual one: I enabled
Second, I tried searching with the accent "câmera", and that didn’t work either:
When I disable it, the query works normally.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will take another look soon. |
||
|
|
||
| return new string(buffer.Slice(0, value.Length)); | ||
| } | ||
| finally | ||
| { | ||
| if (arrayFromPool != null) | ||
| ArrayPool<char>.Shared.Return(arrayFromPool); | ||
| } | ||
| } | ||
|
|
||
| private static bool IsAcronym(string stringToCompare, int compareStringIndex) | ||
| { | ||
| if (IsAcronymChar(stringToCompare, compareStringIndex) || IsAcronymNumber(stringToCompare, compareStringIndex)) | ||
| if (IsAcronymChar(stringToCompare, compareStringIndex) || | ||
| IsAcronymNumber(stringToCompare, compareStringIndex)) | ||
| return true; | ||
|
|
||
| return false; | ||
|
|
@@ -274,12 +415,12 @@ private static int CalculateClosestSpaceIndex(List<int> spaceIndices, int firstM | |
| } | ||
|
|
||
| private static bool AllPreviousCharsMatched(int startIndexToVerify, int currentQuerySubstringCharacterIndex, | ||
| string fullStringToCompareWithoutCase, string currentQuerySubstring) | ||
| string fullStringToCompare, string currentQuerySubstring) | ||
| { | ||
| var allMatch = true; | ||
| for (int indexToCheck = 0; indexToCheck < currentQuerySubstringCharacterIndex; indexToCheck++) | ||
| { | ||
| if (fullStringToCompareWithoutCase[startIndexToVerify + indexToCheck] != | ||
| if (fullStringToCompare[startIndexToVerify + indexToCheck] != | ||
| currentQuerySubstring[indexToCheck]) | ||
| { | ||
| allMatch = false; | ||
|
|
@@ -312,7 +453,8 @@ private static bool AllQuerySubstringsMatched(int currentQuerySubstringIndex, in | |
| return currentQuerySubstringIndex >= querySubstringsLength; | ||
| } | ||
|
|
||
| private static int CalculateSearchScore(string query, string stringToCompare, int firstIndex, List<int> spaceIndices, int matchLen, | ||
| private static int CalculateSearchScore(string query, string stringToCompare, int firstIndex, | ||
| List<int> spaceIndices, int matchLen, | ||
| bool allSubstringsContainedInCompareString) | ||
| { | ||
| // A match found near the beginning of a string is scored more than a match found near the end | ||
|
|
||


Uh oh!
There was an error while loading. Please reload this page.