From d0b030adc8b44418f9b6ddc2dc61f43253e7d2c4 Mon Sep 17 00:00:00 2001 From: supermario_leo Date: Tue, 28 Apr 2026 03:16:23 +0800 Subject: [PATCH 1/2] feat(pattern-matcher): support !-prefixed negation in excluded_patterns Enables gitignore-style exceptions within excluded_patterns so users can exclude broad categories while allowing specific paths through. A pattern beginning with ! un-excludes paths that would otherwise be excluded by preceding patterns. For example: PatternFilePathMatcher( excluded_patterns=[ "**/.*", # exclude all dot-entries "!**/.github/**", # but keep .github through ] ) Directory traversal is handled correctly: is_dir_included() uses a probe path (<dir>/__probe__) to detect whether any negation pattern could match files inside an otherwise-excluded directory, preventing premature pruning. Fixes #1778. --- python/cocoindex/resources/file.py | 15 ++- rust/ops_text/src/pattern_matcher.rs | 183 +++++++++++++++++++++++++-- rust/py/src/ops.rs | 4 + 3 files changed, 187 insertions(+), 15 deletions(-) diff --git a/python/cocoindex/resources/file.py b/python/cocoindex/resources/file.py index 76a3da480..595d4ffc3 100644 --- a/python/cocoindex/resources/file.py +++ b/python/cocoindex/resources/file.py @@ -235,6 +235,17 @@ class PatternFilePathMatcher(FilePathMatcher): - `*.py` — matches Python files only in the root directory - `**/.*` — matches dot-prefixed entries (hidden files/dirs) at any depth - `{*.md,*.txt}` — matches multiple extensions using alternation + + ``excluded_patterns`` supports gitignore-style ``!`` negation: a pattern + beginning with ``!`` un-excludes paths that would otherwise be excluded by a + preceding pattern. Example — exclude all dot-directories except ``.github``:: + + PatternFilePathMatcher( + excluded_patterns=[ + "**/.*", # exclude all dot-entries + "!**/.github/**", # but keep .github through + ] + ) """ def __init__( @@ -250,7 +261,9 @@ def __init__( to be included. Use ``**/*.ext`` to match at any depth. 
excluded_patterns: Glob patterns (globset syntax) matching full path of files and directories to be excluded. If a directory is excluded, all files and - subdirectories within it are also excluded. + subdirectories within it are also excluded. A pattern prefixed with ``!`` + negates the exclusion for matching paths, allowing gitignore-style exceptions + (e.g. ``"!**/.github/**"`` after ``"**/.*"``). Raises: ValueError: If any pattern is invalid. diff --git a/rust/ops_text/src/pattern_matcher.rs b/rust/ops_text/src/pattern_matcher.rs index 9cc8e3d80..f3da00913 100644 --- a/rust/ops_text/src/pattern_matcher.rs +++ b/rust/ops_text/src/pattern_matcher.rs @@ -10,47 +10,126 @@ fn build_glob_set(patterns: Vec) -> Result { Ok(builder.build()?) } -/// Pattern matcher that handles include and exclude patterns for files +/// Splits a list of patterns into regular exclusion patterns and negation (``!``-prefixed) +/// patterns, building a GlobSet for each group. +/// +/// A pattern like ``!**/.github/**`` means "do **not** exclude paths that match +/// ``**/.github/**``", allowing fine-grained exceptions to broad exclusion rules. +fn split_excluded_patterns( + patterns: Option>, +) -> Result<(Option, Option)> { + let Some(pats) = patterns else { + return Ok((None, None)); + }; + + let mut regular: Vec = Vec::new(); + let mut negation: Vec = Vec::new(); + + for p in pats { + if let Some(stripped) = p.strip_prefix('!') { + negation.push(stripped.to_string()); + } else { + regular.push(p); + } + } + + let regular_set = if regular.is_empty() { + None + } else { + Some(build_glob_set(regular)?) + }; + let negation_set = if negation.is_empty() { + None + } else { + Some(build_glob_set(negation)?) + }; + + Ok((regular_set, negation_set)) +} + +/// Pattern matcher that handles include and exclude patterns for files. +/// +/// Supports gitignore-style ``!``-prefixed negation in ``excluded_patterns``: a pattern +/// beginning with ``!`` un-excludes paths that would otherwise be excluded. 
For example, +/// combining ``"**/.*"`` with ``"!**/.github/**"`` excludes all dot-entries *except* +/// anything inside ``.github/``. #[derive(Debug)] pub struct PatternMatcher { /// Patterns matching full path of files to be included. included_glob_set: Option, - /// Patterns matching full path of files and directories to be excluded. - /// If a directory is excluded, all files and subdirectories within it are also excluded. + /// Regular (non-negated) exclusion patterns. excluded_glob_set: Option, + /// Negation patterns (``!``-prefixed in the original list, stored without the ``!``). + /// A path that matches one of these is *not* excluded even if it matches the regular + /// exclusion patterns. + negation_glob_set: Option, } impl PatternMatcher { - /// Create a new PatternMatcher from optional include and exclude pattern vectors + /// Create a new PatternMatcher from optional include and exclude pattern vectors. + /// + /// Patterns in `excluded_patterns` that start with ``!`` are treated as negations: + /// they un-exclude any path that would otherwise be excluded by the preceding patterns. pub fn new( included_patterns: Option>, excluded_patterns: Option>, ) -> Result { let included_glob_set = included_patterns.map(build_glob_set).transpose()?; - let excluded_glob_set = excluded_patterns.map(build_glob_set).transpose()?; + let (excluded_glob_set, negation_glob_set) = split_excluded_patterns(excluded_patterns)?; Ok(Self { included_glob_set, excluded_glob_set, + negation_glob_set, }) } - /// Check if a file or directory is excluded by the exclude patterns - /// Can be called on directories to prune traversal on excluded directories. + /// Check if a path is excluded after applying both exclusion and negation patterns. 
pub fn is_excluded(&self, path: &str) -> bool { - self.excluded_glob_set + if !self + .excluded_glob_set + .as_ref() + .is_some_and(|gs| gs.is_match(path)) + { + return false; + } + // The path matches an exclusion pattern; a negation pattern can un-exclude it. + !self + .negation_glob_set .as_ref() - .is_some_and(|glob_set| glob_set.is_match(path)) + .is_some_and(|gs| gs.is_match(path)) } - /// Check if a directory should be included (traversed) based on the exclude patterns. - /// Directories are included unless they match an exclude pattern. + /// Check if a directory should be traversed based on the exclude/negation patterns. + /// + /// A directory is included unless it matches an exclusion pattern *and* no negation + /// pattern applies to it or to any file that could live inside it. The latter check + /// uses a probe path (``/__probe__``) so that patterns like ``**/.github/**`` + /// correctly un-prune the ``.github`` directory. pub fn is_dir_included(&self, path: &str) -> bool { - !self.is_excluded(path) + if !self + .excluded_glob_set + .as_ref() + .is_some_and(|gs| gs.is_match(path)) + { + return true; + } + // Directory matches an exclusion. Check whether negation patterns could apply to + // the directory itself or to a file one level inside it. + if let Some(neg_gs) = &self.negation_glob_set { + if neg_gs.is_match(path) { + return true; + } + // Probe one level inside: catches patterns like `**/.github/**`. + let probe = format!("{}/__probe__", path); + if neg_gs.is_match(probe.as_str()) { + return true; + } + } + false } - /// Check if a file should be included based on both include and exclude patterns - /// Should be called for each file. + /// Check if a file should be included based on both include and exclude patterns. 
pub fn is_file_included(&self, path: &str) -> bool { self.included_glob_set .as_ref() @@ -129,4 +208,80 @@ mod tests { assert!(matcher.is_file_included("a/b/c/main.py")); assert!(!matcher.is_file_included("main.rs")); } + + // --- Negation (!) pattern tests --- + + #[test] + fn test_negation_un_excludes_file() { + // Exclude all dot-files, but un-exclude .env.example + let matcher = PatternMatcher::new( + None, + Some(vec!["**/.env*".to_string(), "!**/.env.example".to_string()]), + ) + .unwrap(); + assert!(!matcher.is_file_included(".env")); + assert!(!matcher.is_file_included("config/.env.local")); + assert!(matcher.is_file_included(".env.example")); + assert!(matcher.is_file_included("config/.env.example")); + } + + #[test] + fn test_negation_un_excludes_dotdir_files() { + // Exclude all dot-directories, but allow .github workflow files through. + let matcher = PatternMatcher::new( + None, + Some(vec!["**/.*".to_string(), "!**/.github/**".to_string()]), + ) + .unwrap(); + // Dot files/dirs that are NOT in .github remain excluded. + assert!(!matcher.is_file_included(".git/config")); + assert!(!matcher.is_file_included(".vscode/settings.json")); + // Files under .github are un-excluded. + assert!(matcher.is_file_included(".github/workflows/ci.yml")); + assert!(matcher.is_file_included("repo/.github/dependabot.yml")); + // Regular files are unaffected. + assert!(matcher.is_file_included("src/main.rs")); + } + + #[test] + fn test_negation_dir_traversal() { + // Exclude all dot-directories, but un-exclude .github subtree. + let matcher = PatternMatcher::new( + None, + Some(vec!["**/.*".to_string(), "!**/.github/**".to_string()]), + ) + .unwrap(); + // .git should be pruned. + assert!(!matcher.is_dir_included(".git")); + assert!(!matcher.is_dir_included("src/.vscode")); + // .github must NOT be pruned so its contents can be reached. 
+ assert!(matcher.is_dir_included(".github")); + assert!(matcher.is_dir_included("repo/.github")); + // Normal directories are always included. + assert!(matcher.is_dir_included("src")); + } + + #[test] + fn test_negation_only_no_regular_exclusion() { + // A negation without a corresponding exclusion is a no-op — nothing is excluded. + let matcher = + PatternMatcher::new(None, Some(vec!["!**/.github/**".to_string()])).unwrap(); + assert!(matcher.is_file_included(".git/config")); + assert!(matcher.is_file_included(".github/workflows/ci.yml")); + assert!(matcher.is_file_included("src/main.rs")); + } + + #[test] + fn test_negation_with_include_patterns() { + // Include only YAML files, exclude all dot-directories, but un-exclude .github. + let matcher = PatternMatcher::new( + Some(vec!["**/*.yml".to_string(), "**/*.yaml".to_string()]), + Some(vec!["**/.*".to_string(), "!**/.github/**".to_string()]), + ) + .unwrap(); + assert!(matcher.is_file_included(".github/workflows/ci.yml")); + assert!(!matcher.is_file_included(".github/workflows/ci.sh")); // not in included + assert!(!matcher.is_file_included(".git/config")); // excluded, not negated + assert!(matcher.is_file_included("src/config.yaml")); + } } diff --git a/rust/py/src/ops.rs b/rust/py/src/ops.rs index 8de04e5ed..ef297985d 100644 --- a/rust/py/src/ops.rs +++ b/rust/py/src/ops.rs @@ -236,6 +236,10 @@ impl PyPatternMatcher { /// Args: /// included_patterns: Glob patterns for files to include. If None, all files are included. /// excluded_patterns: Glob patterns for files/directories to exclude. + /// A pattern prefixed with ``!`` negates (un-excludes) paths that would otherwise be + /// excluded, enabling gitignore-style exceptions. For example, combining + /// ``"**/.*"`` with ``"!**/.github/**"`` excludes all dot-entries except anything + /// inside ``.github/``. 
#[new] #[pyo3(signature = (included_patterns=None, excluded_patterns=None))] fn new( From e5f5df27af7ed32322e545dbeaecf417797ff7da Mon Sep 17 00:00:00 2001 From: supermario_leo Date: Tue, 5 May 2026 13:35:08 +0800 Subject: [PATCH 2/2] fix(pattern-matcher): rename negation_glob_set and fix deep-path dir traversal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses two review comments from @georgeh0: 1. Rename `negation_glob_set` → `negation_excluded_glob_set` for clarity. 2. Fix `is_dir_included` for exact-path negations like `!dir1/dir2/dir3/a.yml` combined with a broad exclusion like `dir1/**`. Previously the one-level GlobSet probe was sufficient only for wildcard negations (e.g. `!**/.github/**`); an exact-path negation's ancestor directories were pruned before the exempt file could ever be reached. Fix: retain raw negation strings in `negation_patterns_raw` and add a raw-prefix check in `is_dir_included` — if any negation pattern starts with `<dir>/` the directory must be traversed. The existing GlobSet probe is kept as-is for wildcard-based negations. New test: `test_negation_exact_path_deep_dir_traversal` covers the case raised in the review (exclude `dir1/**`, un-exclude `dir1/dir2/dir3/a.yml`). Signed-off-by: Lei Yu --- rust/ops_text/src/pattern_matcher.rs | 94 +++++++++++++++++++++------- 1 file changed, 73 insertions(+), 21 deletions(-) diff --git a/rust/ops_text/src/pattern_matcher.rs b/rust/ops_text/src/pattern_matcher.rs index f3da00913..5f323adbc 100644 --- a/rust/ops_text/src/pattern_matcher.rs +++ b/rust/ops_text/src/pattern_matcher.rs @@ -11,15 +11,13 @@ fn build_glob_set(patterns: Vec) -> Result { } /// Splits a list of patterns into regular exclusion patterns and negation (``!``-prefixed) -/// patterns, building a GlobSet for each group. 
-/// -/// A pattern like ``!**/.github/**`` means "do **not** exclude paths that match -/// ``**/.github/**``", allowing fine-grained exceptions to broad exclusion rules. +/// patterns. Returns the compiled GlobSets together with the raw negation strings so that +/// callers can do prefix-based path ancestry checks without unpacking compiled globs. fn split_excluded_patterns( patterns: Option>, -) -> Result<(Option, Option)> { +) -> Result<(Option, Option, Vec)> { let Some(pats) = patterns else { - return Ok((None, None)); + return Ok((None, None, Vec::new())); }; let mut regular: Vec = Vec::new(); @@ -38,13 +36,14 @@ fn split_excluded_patterns( } else { Some(build_glob_set(regular)?) }; + let negation_raw = negation.clone(); let negation_set = if negation.is_empty() { None } else { Some(build_glob_set(negation)?) }; - Ok((regular_set, negation_set)) + Ok((regular_set, negation_set, negation_raw)) } /// Pattern matcher that handles include and exclude patterns for files. @@ -59,10 +58,15 @@ pub struct PatternMatcher { included_glob_set: Option, /// Regular (non-negated) exclusion patterns. excluded_glob_set: Option, - /// Negation patterns (``!``-prefixed in the original list, stored without the ``!``). - /// A path that matches one of these is *not* excluded even if it matches the regular - /// exclusion patterns. - negation_glob_set: Option, + /// Negation patterns compiled into a GlobSet (``!``-prefixed in the original list, + /// stored without the ``!``). A path that matches one of these is *not* excluded + /// even if it matches the regular exclusion patterns. + negation_excluded_glob_set: Option, + /// Raw (uncompiled) negation pattern strings, kept so that ``is_dir_included`` can + /// detect directories that lie on the path to a negation-exempt file even when the + /// directory itself would otherwise be pruned (e.g. ``!dir1/dir2/dir3/a.yml`` + /// combined with ``dir1/**``). 
+ negation_patterns_raw: Vec, } impl PatternMatcher { @@ -75,12 +79,14 @@ impl PatternMatcher { excluded_patterns: Option>, ) -> Result { let included_glob_set = included_patterns.map(build_glob_set).transpose()?; - let (excluded_glob_set, negation_glob_set) = split_excluded_patterns(excluded_patterns)?; + let (excluded_glob_set, negation_excluded_glob_set, negation_patterns_raw) = + split_excluded_patterns(excluded_patterns)?; Ok(Self { included_glob_set, excluded_glob_set, - negation_glob_set, + negation_excluded_glob_set, + negation_patterns_raw, }) } @@ -95,7 +101,7 @@ impl PatternMatcher { } // The path matches an exclusion pattern; a negation pattern can un-exclude it. !self - .negation_glob_set + .negation_excluded_glob_set .as_ref() .is_some_and(|gs| gs.is_match(path)) } @@ -103,9 +109,19 @@ impl PatternMatcher { /// Check if a directory should be traversed based on the exclude/negation patterns. /// /// A directory is included unless it matches an exclusion pattern *and* no negation - /// pattern applies to it or to any file that could live inside it. The latter check - /// uses a probe path (``/__probe__``) so that patterns like ``**/.github/**`` - /// correctly un-prune the ``.github`` directory. + /// pattern applies to it or to any file that could live inside it. + /// + /// Two complementary checks are used so that both glob-style and exact-path negations + /// work correctly: + /// + /// 1. **GlobSet probe** — matches ``<dir>/__probe__`` against the compiled negation + /// GlobSet. Catches wildcard negations such as ``!**/.github/**`` that use ``**`` + /// to span multiple directory levels. + /// + /// 2. **Raw-prefix check** — scans the raw (uncompiled) negation strings and returns + /// ``true`` if any of them starts with ``<dir>/``. Catches exact-path negations + /// such as ``!dir1/dir2/dir3/a.yml`` where the probe alone would not help because + /// the pattern contains no wildcards relative to the directory. 
pub fn is_dir_included(&self, path: &str) -> bool { if !self .excluded_glob_set @@ -114,18 +130,29 @@ impl PatternMatcher { { return true; } - // Directory matches an exclusion. Check whether negation patterns could apply to - // the directory itself or to a file one level inside it. - if let Some(neg_gs) = &self.negation_glob_set { + // Directory matches an exclusion pattern. Check whether a negation pattern + // could apply to the directory itself or to any descendant. + if let Some(neg_gs) = &self.negation_excluded_glob_set { if neg_gs.is_match(path) { return true; } - // Probe one level inside: catches patterns like `**/.github/**`. + // Probe one level inside: catches glob negations like `**/.github/**`. let probe = format!("{}/__probe__", path); if neg_gs.is_match(probe.as_str()) { return true; } } + // Raw-prefix check: catches exact-path negations like `!dir1/dir2/dir3/a.yml` + // where the parent directories would otherwise be pruned by the exclusion but + // need to be traversed to reach the negation-exempt file. + let dir_prefix = format!("{}/", path); + if self + .negation_patterns_raw + .iter() + .any(|p| p.starts_with(&dir_prefix)) + { + return true; + } false } @@ -261,6 +288,31 @@ mod tests { assert!(matcher.is_dir_included("src")); } + #[test] + fn test_negation_exact_path_deep_dir_traversal() { + // Exclude all of dir1, but un-exclude a specific deep file with an exact path. + // All ancestor directories of dir1/dir2/dir3/a.yml must be traversable. + let matcher = PatternMatcher::new( + None, + Some(vec![ + "dir1/**".to_string(), + "!dir1/dir2/dir3/a.yml".to_string(), + ]), + ) + .unwrap(); + // The negation-exempt file itself must be included. + assert!(matcher.is_file_included("dir1/dir2/dir3/a.yml")); + // Other files under dir1 remain excluded. + assert!(!matcher.is_file_included("dir1/other.txt")); + assert!(!matcher.is_file_included("dir1/dir2/other.txt")); + // Ancestor directories must be traversable to reach the exempt file. 
+ assert!(matcher.is_dir_included("dir1")); + assert!(matcher.is_dir_included("dir1/dir2")); + assert!(matcher.is_dir_included("dir1/dir2/dir3")); + // Sibling directories that contain no negation-exempt files are still pruned. + assert!(!matcher.is_dir_included("dir1/other")); + } + #[test] fn test_negation_only_no_regular_exclusion() { // A negation without a corresponding exclusion is a no-op — nothing is excluded.