diff --git a/python/cocoindex/resources/file.py b/python/cocoindex/resources/file.py index 76a3da480..595d4ffc3 100644 --- a/python/cocoindex/resources/file.py +++ b/python/cocoindex/resources/file.py @@ -235,6 +235,17 @@ class PatternFilePathMatcher(FilePathMatcher): - `*.py` — matches Python files only in the root directory - `**/.*` — matches dot-prefixed entries (hidden files/dirs) at any depth - `{*.md,*.txt}` — matches multiple extensions using alternation + + ``excluded_patterns`` supports gitignore-style ``!`` negation: a pattern + beginning with ``!`` un-excludes paths that would otherwise be excluded by any + other pattern, regardless of order. Example — exclude all dot-directories except ``.github``:: + + PatternFilePathMatcher( + excluded_patterns=[ + "**/.*", # exclude all dot-entries + "!**/.github/**", # but let .github contents through + ] + ) """ def __init__( @@ -250,7 +261,9 @@ def __init__( to be included. Use ``**/*.ext`` to match at any depth. excluded_patterns: Glob patterns (globset syntax) matching full path of files and directories to be excluded. If a directory is excluded, all files and - subdirectories within it are also excluded. + subdirectories within it are also excluded. A pattern prefixed with ``!`` + negates the exclusion for matching paths, allowing gitignore-style exceptions + (e.g. ``"!**/.github/**"`` together with ``"**/.*"``). Raises: ValueError: If any pattern is invalid. diff --git a/rust/ops_text/src/pattern_matcher.rs b/rust/ops_text/src/pattern_matcher.rs index 9cc8e3d80..5f323adbc 100644 --- a/rust/ops_text/src/pattern_matcher.rs +++ b/rust/ops_text/src/pattern_matcher.rs @@ -10,47 +10,153 @@ fn build_glob_set(patterns: Vec<String>) -> Result<GlobSet> { Ok(builder.build()?) } -/// Pattern matcher that handles include and exclude patterns for files +/// Splits a list of patterns into regular exclusion patterns and negation (``!``-prefixed) +/// patterns.
Returns the compiled GlobSets together with the raw negation strings so that +/// callers can do prefix-based path ancestry checks without unpacking compiled globs. +fn split_excluded_patterns( + patterns: Option<Vec<String>>, +) -> Result<(Option<GlobSet>, Option<GlobSet>, Vec<String>)> { + let Some(pats) = patterns else { + return Ok((None, None, Vec::new())); + }; + + let mut regular: Vec<String> = Vec::new(); + let mut negation: Vec<String> = Vec::new(); + + for p in pats { + if let Some(stripped) = p.strip_prefix('!') { + negation.push(stripped.to_string()); + } else { + regular.push(p); + } + } + + let regular_set = if regular.is_empty() { + None + } else { + Some(build_glob_set(regular)?) + }; + let negation_raw = negation.clone(); + let negation_set = if negation.is_empty() { + None + } else { + Some(build_glob_set(negation)?) + }; + + Ok((regular_set, negation_set, negation_raw)) +} + +/// Pattern matcher that handles include and exclude patterns for files. +/// +/// Supports gitignore-style ``!``-prefixed negation in ``excluded_patterns``: a pattern +/// beginning with ``!`` un-excludes paths that would otherwise be excluded. For example, +/// combining ``"**/.*"`` with ``"!**/.github/**"`` excludes all dot-entries *except* +/// anything inside ``.github/``. #[derive(Debug)] pub struct PatternMatcher { /// Patterns matching full path of files to be included. included_glob_set: Option<GlobSet>, - /// Patterns matching full path of files and directories to be excluded. - /// If a directory is excluded, all files and subdirectories within it are also excluded. + /// Regular (non-negated) exclusion patterns. excluded_glob_set: Option<GlobSet>, + /// Negation patterns compiled into a GlobSet (``!``-prefixed in the original list, + /// stored without the ``!``). A path that matches one of these is *not* excluded + /// even if it matches the regular exclusion patterns.
+ negation_excluded_glob_set: Option<GlobSet>, + /// Raw (uncompiled) negation pattern strings, kept so that ``is_dir_included`` can + /// detect directories that lie on the path to a negation-exempt file even when the + /// directory itself would otherwise be pruned (e.g. ``!dir1/dir2/dir3/a.yml`` + /// combined with ``dir1/**``). + negation_patterns_raw: Vec<String>, } impl PatternMatcher { - /// Create a new PatternMatcher from optional include and exclude pattern vectors + /// Create a new PatternMatcher from optional include and exclude pattern vectors. + /// + /// Patterns in `excluded_patterns` that start with ``!`` are treated as negations: + /// they un-exclude any path that would otherwise be excluded, wherever they appear in the list. pub fn new( included_patterns: Option<Vec<String>>, excluded_patterns: Option<Vec<String>>, ) -> Result<Self> { let included_glob_set = included_patterns.map(build_glob_set).transpose()?; - let excluded_glob_set = excluded_patterns.map(build_glob_set).transpose()?; + let (excluded_glob_set, negation_excluded_glob_set, negation_patterns_raw) = + split_excluded_patterns(excluded_patterns)?; Ok(Self { included_glob_set, excluded_glob_set, + negation_excluded_glob_set, + negation_patterns_raw, }) } - /// Check if a file or directory is excluded by the exclude patterns - /// Can be called on directories to prune traversal on excluded directories. + /// Check if a path is excluded after applying both exclusion and negation patterns. pub fn is_excluded(&self, path: &str) -> bool { - self.excluded_glob_set + if !self + .excluded_glob_set + .as_ref() + .is_some_and(|gs| gs.is_match(path)) + { + return false; + } + // The path matches an exclusion pattern; a negation pattern can un-exclude it. + !self + .negation_excluded_glob_set .as_ref() - .is_some_and(|glob_set| glob_set.is_match(path)) + .is_some_and(|gs| gs.is_match(path)) } - /// Check if a directory should be included (traversed) based on the exclude patterns. - /// Directories are included unless they match an exclude pattern.
+ /// Check if a directory should be traversed based on the exclude/negation patterns. + /// + /// A directory is included unless it matches an exclusion pattern *and* no negation + /// pattern applies to it or to any file that could live inside it. + /// + /// Two complementary checks are used so that both glob-style and exact-path negations + /// work correctly: + /// + /// 1. **GlobSet probe** — matches ``<path>/__probe__`` against the compiled negation + /// GlobSet. Catches wildcard negations such as ``!**/.github/**`` that use ``**`` + /// to span multiple directory levels. + /// + /// 2. **Raw-prefix check** — scans the raw (uncompiled) negation strings and returns + /// ``true`` if any of them starts with ``<path>/``. Catches exact-path negations + /// such as ``!dir1/dir2/dir3/a.yml``. The comparison is literal: a deep wildcard negation + /// (e.g. ``!**/.github/workflows/ci.yml``) is caught by neither check. pub fn is_dir_included(&self, path: &str) -> bool { - !self.is_excluded(path) + if !self + .excluded_glob_set + .as_ref() + .is_some_and(|gs| gs.is_match(path)) + { + return true; + } + // Directory matches an exclusion pattern. Check whether a negation pattern + // could apply to the directory itself or to any descendant. + if let Some(neg_gs) = &self.negation_excluded_glob_set { + if neg_gs.is_match(path) { + return true; + } + // Probe one level inside: catches glob negations like `**/.github/**`. + let probe = format!("{}/__probe__", path); + if neg_gs.is_match(probe.as_str()) { + return true; + } + } + // Raw-prefix check: catches exact-path negations like `!dir1/dir2/dir3/a.yml` + // where the parent directories would otherwise be pruned by the exclusion but + // need to be traversed to reach the negation-exempt file.
+ let dir_prefix = format!("{}/", path); + if self + .negation_patterns_raw + .iter() + .any(|p| p.starts_with(&dir_prefix)) + { + return true; + } + false } - /// Check if a file should be included based on both include and exclude patterns - /// Should be called for each file. + /// Check if a file should be included based on both include and exclude patterns. pub fn is_file_included(&self, path: &str) -> bool { self.included_glob_set .as_ref() @@ -129,4 +235,105 @@ mod tests { assert!(matcher.is_file_included("a/b/c/main.py")); assert!(!matcher.is_file_included("main.rs")); } + + // --- Negation (!) pattern tests --- + + #[test] + fn test_negation_un_excludes_file() { + // Exclude all .env* files, but un-exclude .env.example + let matcher = PatternMatcher::new( + None, + Some(vec!["**/.env*".to_string(), "!**/.env.example".to_string()]), + ) + .unwrap(); + assert!(!matcher.is_file_included(".env")); + assert!(!matcher.is_file_included("config/.env.local")); + assert!(matcher.is_file_included(".env.example")); + assert!(matcher.is_file_included("config/.env.example")); + } + + #[test] + fn test_negation_un_excludes_dotdir_files() { + // Exclude all dot-entries, but allow everything under .github through. + let matcher = PatternMatcher::new( + None, + Some(vec!["**/.*".to_string(), "!**/.github/**".to_string()]), + ) + .unwrap(); + // Dot files/dirs that are NOT in .github remain excluded. + assert!(!matcher.is_file_included(".git/config")); + assert!(!matcher.is_file_included(".vscode/settings.json")); + // Files under .github are un-excluded. + assert!(matcher.is_file_included(".github/workflows/ci.yml")); + assert!(matcher.is_file_included("repo/.github/dependabot.yml")); + // Regular files are unaffected. + assert!(matcher.is_file_included("src/main.rs")); + } + + #[test] + fn test_negation_dir_traversal() { + // Exclude all dot-directories, but un-exclude .github subtree.
+ let matcher = PatternMatcher::new( + None, + Some(vec!["**/.*".to_string(), "!**/.github/**".to_string()]), + ) + .unwrap(); + // .git should be pruned. + assert!(!matcher.is_dir_included(".git")); + assert!(!matcher.is_dir_included("src/.vscode")); + // .github must NOT be pruned so its contents can be reached. + assert!(matcher.is_dir_included(".github")); + assert!(matcher.is_dir_included("repo/.github")); + // Normal directories are always included. + assert!(matcher.is_dir_included("src")); + } + + #[test] + fn test_negation_exact_path_deep_dir_traversal() { + // Exclude all of dir1, but un-exclude a specific deep file with an exact path. + // All ancestor directories of dir1/dir2/dir3/a.yml must be traversable. + let matcher = PatternMatcher::new( + None, + Some(vec![ + "dir1/**".to_string(), + "!dir1/dir2/dir3/a.yml".to_string(), + ]), + ) + .unwrap(); + // The negation-exempt file itself must be included. + assert!(matcher.is_file_included("dir1/dir2/dir3/a.yml")); + // Other files under dir1 remain excluded. + assert!(!matcher.is_file_included("dir1/other.txt")); + assert!(!matcher.is_file_included("dir1/dir2/other.txt")); + // Ancestor directories must be traversable to reach the exempt file. + assert!(matcher.is_dir_included("dir1")); + assert!(matcher.is_dir_included("dir1/dir2")); + assert!(matcher.is_dir_included("dir1/dir2/dir3")); + // Sibling directories that contain no negation-exempt files are still pruned. + assert!(!matcher.is_dir_included("dir1/other")); + } + + #[test] + fn test_negation_only_no_regular_exclusion() { + // A negation without a corresponding exclusion is a no-op — nothing is excluded. 
+ let matcher = + PatternMatcher::new(None, Some(vec!["!**/.github/**".to_string()])).unwrap(); + assert!(matcher.is_file_included(".git/config")); + assert!(matcher.is_file_included(".github/workflows/ci.yml")); + assert!(matcher.is_file_included("src/main.rs")); + } + + #[test] + fn test_negation_with_include_patterns() { + // Include only YAML files, exclude all dot-directories, but un-exclude .github. + let matcher = PatternMatcher::new( + Some(vec!["**/*.yml".to_string(), "**/*.yaml".to_string()]), + Some(vec!["**/.*".to_string(), "!**/.github/**".to_string()]), + ) + .unwrap(); + assert!(matcher.is_file_included(".github/workflows/ci.yml")); + assert!(!matcher.is_file_included(".github/workflows/ci.sh")); // not in included + assert!(!matcher.is_file_included(".git/config")); // excluded, not negated + assert!(matcher.is_file_included("src/config.yaml")); + } } diff --git a/rust/py/src/ops.rs b/rust/py/src/ops.rs index 8de04e5ed..ef297985d 100644 --- a/rust/py/src/ops.rs +++ b/rust/py/src/ops.rs @@ -236,6 +236,10 @@ impl PyPatternMatcher { /// Args: /// included_patterns: Glob patterns for files to include. If None, all files are included. /// excluded_patterns: Glob patterns for files/directories to exclude. + /// A pattern prefixed with ``!`` negates (un-excludes) paths that would otherwise be + /// excluded, enabling gitignore-style exceptions. For example, combining + /// ``"**/.*"`` with ``"!**/.github/**"`` excludes all dot-entries except anything + /// inside ``.github/``. #[new] #[pyo3(signature = (included_patterns=None, excluded_patterns=None))] fn new(