class DisjointSetUnion:
    """Union-Find (Disjoint Set Union) with union by rank and path compression.

    Elements are the integers ``0 .. size - 1``. Besides the parent links the
    structure tracks how many disjoint sets currently exist.
    """

    def __init__(self, size: int):
        """Create ``size`` singleton sets, one per element.

        Raises:
            ValueError: If ``size`` is not a positive integer.
        """
        if size <= 0:
            raise ValueError("Size must be a positive integer.")
        self.root = list(range(size))
        self.rank = [1] * size  # Rank of each root, consulted by union()
        self.count = size  # Number of disjoint sets

    def find(self, i: int) -> int:
        """Return the representative (root) of the set containing element ``i``.

        Implemented iteratively so that a long parent chain can never exhaust
        the interpreter's recursion limit.
        """
        # Pass 1: walk up the parent links until the root is reached.
        representative = i
        while self.root[representative] != representative:
            representative = self.root[representative]

        # Pass 2: path compression, re-point every visited node at the root.
        while self.root[i] != representative:
            parent = self.root[i]
            self.root[i] = representative
            i = parent

        return representative

    def union(self, i: int, j: int) -> bool:
        """Merge the sets containing elements ``i`` and ``j``.

        Returns:
            True if a merge occurred, False if both elements were already in
            the same set.
        """
        root_i = self.find(i)
        root_j = self.find(j)

        if root_i == root_j:
            return False

        # Union by rank: hang the lower-ranked root under the higher-ranked
        # one; on a tie root_i wins and its rank grows by one.
        if self.rank[root_i] > self.rank[root_j]:
            self.root[root_j] = root_i
        elif self.rank[root_i] < self.rank[root_j]:
            self.root[root_i] = root_j
        else:
            self.root[root_j] = root_i
            self.rank[root_i] += 1

        self.count -= 1
        return True

    def get_count(self) -> int:
        """Return the current number of disjoint sets."""
        return self.count


class UnionFind:
    """A minimal Union-Find data structure with path compression."""

    def __init__(self, size: int):
        """Create ``size`` singleton sets, one per element.

        Raises:
            ValueError: If ``size`` is not a positive integer.
        """
        if size <= 0:
            raise ValueError("Size must be a positive integer.")
        self.parent = list(range(size))

    def find(self, x: int) -> int:
        """Return the representative (root) of the set containing element ``x``.

        union() applies no rank/size heuristic, so parent chains can become as
        long as the number of elements; the lookup is therefore iterative to
        avoid ``RecursionError`` on such chains.
        """
        # Pass 1: locate the root.
        representative = x
        while self.parent[representative] != representative:
            representative = self.parent[representative]

        # Pass 2: path compression.
        while self.parent[x] != representative:
            next_node = self.parent[x]
            self.parent[x] = representative
            x = next_node

        return representative

    def union(self, x: int, y: int) -> bool:
        """Merge the sets containing elements ``x`` and ``y``.

        The root of ``y``'s set is attached beneath the root of ``x``'s set.

        Returns:
            True if a merge occurred, False if already in the same set.
        """
        root_x = self.find(x)
        root_y = self.find(y)
        if root_x == root_y:
            return False
        self.parent[root_y] = root_x
        return True
For each pair at +indexes i and j, we check whether the two strings are similar — that is, either exactly the same or differ at exactly +two positions (meaning one swap can make them equal). If they are similar and currently belong to different groups +(i.e., their roots in the Union-Find structure are different), we perform a union operation to merge their groups. +Repeating this across all string pairs gradually reduces the number of distinct groups. Finally, we count the number of +unique roots in the Union-Find structure, which represents the number of similar string groups. + +Here’s the step-by-step explanation of the solution: + +1. Initialize n = len(strs). +2. Create a Union-Find (DSU) structure with n elements, where each element is its own parent. +3. Define a function areSimilar(s1, s2) that returns TRUE if both strings s1 and s2 are similar according to the given + condition: + - Initialize an empty list diff = [] to track differences. + - Loop through both strings in parallel using zip. + - If characters differ at any position, record the mismatch in diff. + - Early exit if more than 2 differences and return FALSE. + - After the loop is completed, evaluate the result: + - len(diff) == 0 means the strings are identical. + - len(diff) == 2 and diff[0] == diff[1][::-1] means there are exactly two differences and the character pairs are + mirror images of each other. + +4. Loop over all pairs (i, j) such that 0 ≤ i < j < n. +5. For each pair, use the areSimilar function to check if strs[i] and strs[j] are similar. +6. If similar, use find(i) and find(j) to get their root parents. +7. If the roots differ, merge them using union(i, j). +8. After processing all pairs, iterate over all indexes i from 0 to n - 1 and find their root parents using find(i). +9. Add each root to a set to track unique groups. +10. Return the size of the set as the number of similarity groups. 
+ +Let’s look at the following illustration to get a better understanding of the solution: + +![Solution 1](./images/similar_string_groups_solution_1.png) +![Solution 2](./images/similar_string_groups_solution_2.png) +![Solution 3](./images/similar_string_groups_solution_3.png) +![Solution 4](./images/similar_string_groups_solution_4.png) +![Solution 5](./images/similar_string_groups_solution_5.png) +![Solution 6](./images/similar_string_groups_solution_6.png) +![Solution 7](./images/similar_string_groups_solution_7.png) + +### Time Complexity +Let's break the time complexity down into two major components: + +#### **Comparing all pairs of strings** + +To check if two strings are similar, we compare them character by character, which takes _O(m)_ where m is the length +of each string. Given there are n strings and we compare all possible pairs of strings, there are O(n²) comparisons. +Therefore, the total time spent on comparisons is O(n²∗m). + +#### **Union-Find operations (find and union)** + +For each similar pair, we perform a find and possibly a union operation. With path compression combined with union by +rank, each operation takes amortized O(α(n)) time, where α(n) is the inverse Ackermann function and is nearly constant +in practice (path compression alone gives amortized O(log n), which is still dominated by the comparison step). Since +there are up to O(n²) similar pairs, the total time for Union-Find operations is O(n²∗α(n)). + +The comparison step dominates the time complexity, as m (the string length) is typically much larger than α(n), which +grows very slowly. Therefore, the overall time complexity is O(n²∗m). + +### Space Complexity + +The space complexity of the algorithm comes from the following components: + +#### **Union-Find parent array**: + +Requires O(n) space to store the parent of each node (one per string). + +#### **Temporary storage in areSimilar() function**: + +Uses O(1) space — a constant-sized list to track the positions where the two strings differ. Since the check exits as +soon as a third difference is found, space usage remains constant.
from typing import List
from datastructures import DisjointSetUnion, UnionFind


def num_similar_groups(strs: List[str]) -> int:
    """Count the similarity groups in ``strs`` using a rank-based DSU.

    Every string starts in its own group; each pair of similar strings is
    merged, and the number of sets left in the DSU is the answer. All pairs
    are compared, so the running time is O(n^2 * m) for n strings of
    length m.

    Args:
        strs: Words to group. Per the problem statement they all have the
            same length and are anagrams of each other.

    Returns:
        The number of similarity groups; 0 for an empty list.
    """
    if not strs:
        return 0

    total = len(strs)
    uf = DisjointSetUnion(total)

    for i in range(total):
        for j in range(i + 1, total):
            # union() only decrements the set count when the two strings
            # were still in different groups.
            if are_similar(strs[i], strs[j]):
                uf.union(i, j)

    # The DSU keeps a running count of the remaining disjoint sets.
    return uf.get_count()


def are_similar(s1: str, s2: str) -> bool:
    """Return True if ``s1`` and ``s2`` are identical or one swap apart.

    "One swap apart" means the strings differ at exactly two positions and
    the mismatching characters mirror each other, so swapping those two
    characters in ``s1`` yields ``s2``.
    """
    # zip() stops at the shorter input, so without this guard a string and
    # one of its proper prefixes would wrongly compare as similar.
    if len(s1) != len(s2):
        return False

    diff = []
    for a, b in zip(s1, s2):
        if a != b:
            diff.append((a, b))
            # More than two mismatches can never be repaired by one swap.
            if len(diff) > 2:
                return False

    if not diff:
        return True
    return len(diff) == 2 and diff[0] == diff[1][::-1]


def num_similar_groups_2(strs: List[str]) -> int:
    """Count the similarity groups in ``strs`` using the minimal UnionFind.

    Same pairwise strategy as ``num_similar_groups``, but the number of
    groups is obtained by collecting the distinct roots at the end instead
    of reading a running counter.

    Args:
        strs: Words to group. Per the problem statement they all have the
            same length and are anagrams of each other.

    Returns:
        The number of similarity groups; 0 for an empty list.
    """
    n = len(strs)
    if n == 0:
        return 0

    uf = UnionFind(n)

    for i in range(n):
        for j in range(i + 1, n):
            if are_similar(strs[i], strs[j]):
                uf.union(i, j)

    # Each distinct representative corresponds to exactly one group.
    roots = {uf.find(i) for i in range(n)}
    return len(roots)
a/pystrings/similar_string_groups/images/similar_string_groups_solution_2.png b/pystrings/similar_string_groups/images/similar_string_groups_solution_2.png new file mode 100644 index 00000000..41611cd0 Binary files /dev/null and b/pystrings/similar_string_groups/images/similar_string_groups_solution_2.png differ diff --git a/pystrings/similar_string_groups/images/similar_string_groups_solution_3.png b/pystrings/similar_string_groups/images/similar_string_groups_solution_3.png new file mode 100644 index 00000000..e0690552 Binary files /dev/null and b/pystrings/similar_string_groups/images/similar_string_groups_solution_3.png differ diff --git a/pystrings/similar_string_groups/images/similar_string_groups_solution_4.png b/pystrings/similar_string_groups/images/similar_string_groups_solution_4.png new file mode 100644 index 00000000..955374ae Binary files /dev/null and b/pystrings/similar_string_groups/images/similar_string_groups_solution_4.png differ diff --git a/pystrings/similar_string_groups/images/similar_string_groups_solution_5.png b/pystrings/similar_string_groups/images/similar_string_groups_solution_5.png new file mode 100644 index 00000000..948c602c Binary files /dev/null and b/pystrings/similar_string_groups/images/similar_string_groups_solution_5.png differ diff --git a/pystrings/similar_string_groups/images/similar_string_groups_solution_6.png b/pystrings/similar_string_groups/images/similar_string_groups_solution_6.png new file mode 100644 index 00000000..e6a44fc1 Binary files /dev/null and b/pystrings/similar_string_groups/images/similar_string_groups_solution_6.png differ diff --git a/pystrings/similar_string_groups/images/similar_string_groups_solution_7.png b/pystrings/similar_string_groups/images/similar_string_groups_solution_7.png new file mode 100644 index 00000000..1eac5f93 Binary files /dev/null and b/pystrings/similar_string_groups/images/similar_string_groups_solution_7.png differ diff --git 
import unittest
from . import num_similar_groups, num_similar_groups_2

# Inputs shared by both test classes. Each test hands the function under test
# its own copy so no list object is shared between tests.
FOUR_LETTER_WORDS = ["jhki", "kijh", "jkhi", "kihj", "ijhk"]
ABC_PERMUTATIONS = ["abc", "acb", "bac", "bca", "cab", "cba"]
ABCD_WORDS = ["abcd", "abdc", "acbd", "bdca"]
LONG_WORDS = [
    "fgtdvepeqcfajhlzkwlpuhrwfcueqfbs",
    "fgcdvppeqcfajhlzkwluehrwftuefqbs",
    "fgtdvepeqcfajhlzkwlpuhrwfcuefqbs",
    "fgcdvepeqcfajhlzkwluphrwftuefqbs",
    "fgldvepeqcfajhlzkwcuphrwftuefqbs",
    "fgtdvefeqcpajhlzkwlpuhrwfcuefqbs",
]


class SimilarStringGroupsTestCase(unittest.TestCase):
    """Exercises num_similar_groups (the DisjointSetUnion-based variant)."""

    def _assert_groups(self, words, expected):
        # Argument order mirrors assertEqual(expected, actual).
        self.assertEqual(expected, num_similar_groups(list(words)))

    def test_1(self):
        self._assert_groups(FOUR_LETTER_WORDS, 3)

    def test_2(self):
        self._assert_groups(ABC_PERMUTATIONS, 1)

    def test_3(self):
        self._assert_groups(ABCD_WORDS, 2)

    def test_4(self):
        self._assert_groups(LONG_WORDS, 2)


class SimilarStringGroups2TestCase(unittest.TestCase):
    """Exercises num_similar_groups_2 (the minimal UnionFind-based variant)."""

    def _assert_groups(self, words, expected):
        # Argument order mirrors assertEqual(expected, actual).
        self.assertEqual(expected, num_similar_groups_2(list(words)))

    def test_1(self):
        self._assert_groups(FOUR_LETTER_WORDS, 3)

    def test_2(self):
        self._assert_groups(ABC_PERMUTATIONS, 1)

    def test_3(self):
        self._assert_groups(ABCD_WORDS, 2)

    def test_4(self):
        self._assert_groups(LONG_WORDS, 2)


if __name__ == '__main__':
    unittest.main()