
Commit b914c4e

feat(algorithms, hash-table): duplicate in file system
1 parent 28655bf commit b914c4e

3 files changed

Lines changed: 202 additions & 0 deletions


Lines changed: 79 additions & 0 deletions
# Find Duplicate File in System

Given a list `paths` of directory info, including the directory path and all the files with contents in this directory,
return all the duplicate files in the file system in terms of their paths. You may return the answer in any order.

A group of duplicate files consists of at least two files that have the same content.

A single directory info string in the input list has the following format:

"root/d1/d2/.../dm f1.txt(f1_content) f2.txt(f2_content) ... fn.txt(fn_content)"

It means there are n files (f1.txt, f2.txt ... fn.txt) with content (f1_content, f2_content ... fn_content) respectively
in the directory "root/d1/d2/.../dm". Note that n >= 1 and m >= 0. If m = 0, it means the directory is just the root
directory.

The output is a list of groups of duplicate file paths. For each group, it contains all the file paths of the files that
have the same content. A file path is a string that has the following format:

"directory_path/file_name.txt"

## Examples

Example 1:

```text
Input: paths = ["root/a 1.txt(abcd) 2.txt(efgh)","root/c 3.txt(abcd)","root/c/d 4.txt(efgh)","root 4.txt(efgh)"]
Output: [["root/a/2.txt","root/c/d/4.txt","root/4.txt"],["root/a/1.txt","root/c/3.txt"]]
```

Example 2:

```text
Input: paths = ["root/a 1.txt(abcd) 2.txt(efgh)","root/c 3.txt(abcd)","root/c/d 4.txt(efgh)"]
Output: [["root/a/2.txt","root/c/d/4.txt"],["root/a/1.txt","root/c/3.txt"]]
```

## Constraints

- 1 <= paths.length <= 2 * 10^4
- 1 <= paths[i].length <= 3000
- 1 <= sum(paths[i].length) <= 5 * 10^5
- paths[i] consist of English letters, digits, '/', '.', '(', ')', and ' '.
- You may assume no files or directories share the same name in the same directory.
- You may assume each given directory info represents a unique directory. A single blank space separates the directory
  path and file info.

Follow up:

- Imagine you are given a real file system, how will you search files? DFS or BFS?
- If the file content is very large (GB level), how will you modify your solution?
- If you can only read the file by 1kb each time, how will you modify your solution?
- What is the time complexity of your modified solution? What is the most time-consuming part and memory-consuming part
  of it? How to optimize?
- How to make sure the duplicated files you find are not false positives?
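For the large-file follow-ups, one common answer (a sketch, not part of the committed solution) is to fingerprint file contents incrementally in fixed-size chunks, so only a small buffer is ever in memory; to rule out false positives from hash collisions, candidate duplicates can then be confirmed with a byte-by-byte comparison.

```python
import hashlib
from io import BytesIO

CHUNK_SIZE = 1024  # read 1 KB at a time, per the follow-up constraint


def content_fingerprint(stream) -> str:
    """Hash a file-like object incrementally so large files never fit in memory."""
    digest = hashlib.sha256()
    while True:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            break
        digest.update(chunk)
    return digest.hexdigest()


# Streams with identical bytes produce the same fingerprint; differing bytes do not.
a = content_fingerprint(BytesIO(b"x" * 5000))
b = content_fingerprint(BytesIO(b"x" * 5000))
c = content_fingerprint(BytesIO(b"y" * 5000))
```

The fingerprint then plays the role of the `content` key in the hash map, instead of the raw file contents.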

## Topics

- Array
- Hash Table
- String

## Solution

First, we split each string in the given paths list to obtain the directory path, the file names, and their contents
separately. To find the files with duplicate contents, we use a hash map that stores entries of the form
(content, list_of_file_paths_with_this_content). For every file's contents, we check whether the same content already
exists in the map. If so, we append the current file's path to the list of files corresponding to that content.
Otherwise, we create a new entry in the map, with the current contents as the key and a single-element list (the current
file's path) as the value.

At the end, we collect the contents for which at least two file paths exist. The resultant list res is a list of lists,
each containing the file paths that share the same contents.
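The parsing step can be illustrated on a single entry from Example 1 (a minimal sketch using only `str.split`):

```python
entry = "root/a 1.txt(abcd) 2.txt(efgh)"

parts = entry.split()   # first token is the directory, the rest are "name(content)" pairs
directory = parts[0]    # "root/a"

pairs = []
for token in parts[1:]:
    name, content = token.split("(")       # e.g. "1.txt" and "abcd)"
    pairs.append((f"{directory}/{name}", content[:-1]))  # drop the trailing ")"
```

Here `pairs` holds (file_path, content) tuples, which is exactly what gets folded into the hash map.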

### Complexity Analysis

- Time complexity: O(n*x), where n strings of average length x are parsed.
- Space complexity: O(n*x); the map and res grow up to size n*x.
Lines changed: 37 additions & 0 deletions
from collections import defaultdict
from typing import Dict, List


def find_duplicate(paths: List[str]) -> List[List[str]]:
    if not paths:
        return []

    # Dictionary mapping file content to the list of file paths with that content
    file_map: Dict[str, List[str]] = defaultdict(list)

    for path in paths:
        values = path.split()
        directory = values[0]
        # Start at 1 to skip the directory name; iterate through each file in this directory
        for i in range(1, len(values)):
            # Split into file name and content
            name_content = values[i].split("(")
            file_name = name_content[0]
            # Extract the content part, dropping the trailing ")"
            content = name_content[1][:-1]

            # Construct the full file path
            file_path = f"{directory}/{file_name}"

            # Group this path with the other paths that share the same content
            file_map[content].append(file_path)

    result = []
    # Keep only the contents shared by more than one file
    for file_path_value in file_map.values():
        if len(file_path_value) > 1:
            result.append(file_path_value)

    return result
Lines changed: 86 additions & 0 deletions
import unittest
from typing import List

from parameterized import parameterized

from algorithms.hash_table.duplicate_file_in_system import find_duplicate

FIND_DUPLICATE_FILE_IN_SYSTEM_TEST_CASES = [
    (
        [
            "root/a 1.txt(abcd) 2.txt(efgh)",
            "root/c 3.txt(abcd)",
            "root/c/d 4.txt(efgh)",
            "root 4.txt(efgh)",
        ],
        [
            ["root/a/2.txt", "root/c/d/4.txt", "root/4.txt"],
            ["root/a/1.txt", "root/c/3.txt"],
        ],
    ),
    (
        [
            "root/a 1.txt(abcd) 2.txt(efgh)",
            "root/c 3.txt(abcd)",
            "root/c/d 4.txt(efgh)",
        ],
        [["root/a/2.txt", "root/c/d/4.txt"], ["root/a/1.txt", "root/c/3.txt"]],
    ),
    (
        [
            "data/files 1.csv(data1)",
            "data/files/processed 2.csv(data2)",
            "data/files/backup 3.csv(data1)",
            "data/archives 4.csv(data3)",
            "data/archives/old 5.csv(data2)",
        ],
        [
            ["data/files/1.csv", "data/files/backup/3.csv"],
            ["data/files/processed/2.csv", "data/archives/old/5.csv"],
        ],
    ),
    (
        [
            "usr/local/bin 1.sh(scriptX) 2.sh(scriptY)",
            "usr/local/lib 3.sh(scriptZ)",
            "usr/local/share 4.sh(scriptX)",
            "usr/local/share/tools 5.sh(scriptY)",
        ],
        [
            ["usr/local/bin/1.sh", "usr/local/share/4.sh"],
            ["usr/local/bin/2.sh", "usr/local/share/tools/5.sh"],
        ],
    ),
    (
        [
            "documents/reports 1.pdf(report1) 2.pdf(report2)",
            "documents/presentations 3.pdf(presentation1)",
            "documents/reports/old 4.pdf(report1)",
            "documents/archive 5.pdf(archive1)",
        ],
        [["documents/reports/1.pdf", "documents/reports/old/4.pdf"]],
    ),
    (
        [
            "root/notes 1.md(note1) 2.md(note2)",
            "root/notes/meetings 3.md(note3)",
            "root/notes/summaries 4.md(note1)",
            "root/summaries 5.md(note4)",
            "root/summaries/meetings 6.md(note2)",
        ],
        [
            ["root/notes/1.md", "root/notes/summaries/4.md"],
            ["root/notes/2.md", "root/summaries/meetings/6.md"],
        ],
    ),
]


class FindDuplicateFileInSystemTestCase(unittest.TestCase):
    @parameterized.expand(FIND_DUPLICATE_FILE_IN_SYSTEM_TEST_CASES)
    def test_find_duplicate(self, paths: List[str], expected: List[List[str]]):
        actual = find_duplicate(paths)
        actual.sort()
        self.assertListEqual(sorted(expected), actual)


if __name__ == "__main__":
    unittest.main()
