Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion python/cocoindex/resources/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,17 @@ class PatternFilePathMatcher(FilePathMatcher):
- `*.py` — matches Python files only in the root directory
- `**/.*` — matches dot-prefixed entries (hidden files/dirs) at any depth
- `{*.md,*.txt}` — matches multiple extensions using alternation

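``excluded_patterns`` supports gitignore-style ``!`` negation: a pattern
beginning with ``!`` un-excludes paths that would otherwise be excluded by a
regular (non-``!``) pattern. Unlike gitignore, the position of a negation in
the list is not significant: a matching negation always takes precedence over
an exclusion. Example — exclude all dot-directories except ``.github``::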

PatternFilePathMatcher(
excluded_patterns=[
"**/.*", # exclude all dot-entries
"!**/.github/**", # but keep .github through
]
)
"""

def __init__(
Expand All @@ -250,7 +261,9 @@ def __init__(
to be included. Use ``**/*.ext`` to match at any depth.
excluded_patterns: Glob patterns (globset syntax) matching full path of files
and directories to be excluded. If a directory is excluded, all files and
subdirectories within it are also excluded.
subdirectories within it are also excluded. A pattern prefixed with ``!``
negates the exclusion for matching paths, allowing gitignore-style exceptions
(e.g. ``"!**/.github/**"`` after ``"**/.*"``).

Raises:
ValueError: If any pattern is invalid.
Expand Down
235 changes: 221 additions & 14 deletions rust/ops_text/src/pattern_matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,47 +10,153 @@ fn build_glob_set(patterns: Vec<String>) -> Result<GlobSet> {
Ok(builder.build()?)
}

/// Pattern matcher that handles include and exclude patterns for files
/// Partitions exclusion patterns into plain globs and ``!``-prefixed negations.
///
/// Returns `(regular_set, negation_set, negation_raw)`:
/// - `regular_set`: compiled globs for patterns without a leading ``!``, or `None`
///   when there are none.
/// - `negation_set`: compiled globs for the negation patterns (leading ``!`` removed),
///   or `None` when there are none.
/// - `negation_raw`: the negation pattern strings with the ``!`` removed, kept
///   uncompiled so callers can run plain string-prefix checks against them.
///
/// When `patterns` is `None`, both sets are `None` and the raw list is empty.
fn split_excluded_patterns(
    patterns: Option<Vec<String>>,
) -> Result<(Option<GlobSet>, Option<GlobSet>, Vec<String>)> {
    let Some(all_patterns) = patterns else {
        return Ok((None, None, Vec::new()));
    };

    let mut plain = Vec::new();
    let mut negation_raw = Vec::new();
    for pattern in all_patterns {
        match pattern.strip_prefix('!') {
            Some(rest) => negation_raw.push(rest.to_owned()),
            None => plain.push(pattern),
        }
    }

    // Compile the regular patterns first so that their errors surface first.
    let regular_set = (!plain.is_empty())
        .then(|| build_glob_set(plain))
        .transpose()?;
    // The raw strings are returned as well, so the compiler gets its own copy.
    let negation_set = (!negation_raw.is_empty())
        .then(|| build_glob_set(negation_raw.clone()))
        .transpose()?;

    Ok((regular_set, negation_set, negation_raw))
}

/// Pattern matcher that handles include and exclude patterns for files.
///
/// Supports gitignore-style ``!``-prefixed negation in ``excluded_patterns``: a pattern
/// beginning with ``!`` un-excludes paths that would otherwise be excluded. For example,
/// combining ``"**/.*"`` with ``"!**/.github/**"`` excludes all dot-entries *except*
/// anything inside ``.github/``.
#[derive(Debug)]
pub struct PatternMatcher {
/// Patterns matching full path of files to be included.
/// ``None`` when no include patterns were supplied.
included_glob_set: Option<GlobSet>,
/// Regular (non-negated) exclusion patterns, matched against the full path of files
/// and directories. ``None`` when the exclusion list is absent or holds only negations.
excluded_glob_set: Option<GlobSet>,
/// Negation patterns compiled into a GlobSet (``!``-prefixed in the original list,
/// stored without the ``!``). A path that matches one of these is *not* excluded
/// even if it matches the regular exclusion patterns.
/// ``None`` when the exclusion list contains no negation patterns.
negation_excluded_glob_set: Option<GlobSet>,
/// Raw (uncompiled) negation pattern strings, kept so that ``is_dir_included`` can
/// detect directories that lie on the path to a negation-exempt file even when the
/// directory itself would otherwise be pruned (e.g. ``!dir1/dir2/dir3/a.yml``
/// combined with ``dir1/**``). Stored without the leading ``!``.
negation_patterns_raw: Vec<String>,
}

impl PatternMatcher {
/// Create a new PatternMatcher from optional include and exclude pattern vectors.
///
/// Patterns in `excluded_patterns` that start with ``!`` are treated as negations:
/// they un-exclude any path that would otherwise be excluded by a regular pattern.
/// Negations are separated from the regular patterns before compilation, so their
/// position in the list is not significant.
///
/// # Errors
///
/// Returns an error if compiling the include, exclude, or negation patterns fails.
pub fn new(
    included_patterns: Option<Vec<String>>,
    excluded_patterns: Option<Vec<String>>,
) -> Result<Self> {
    let included_glob_set = included_patterns.map(build_glob_set).transpose()?;
    // `excluded_patterns` is consumed exactly once: the split yields the regular
    // GlobSet, the negation GlobSet, and the raw negation strings together.
    let (excluded_glob_set, negation_excluded_glob_set, negation_patterns_raw) =
        split_excluded_patterns(excluded_patterns)?;

    Ok(Self {
        included_glob_set,
        excluded_glob_set,
        negation_excluded_glob_set,
        negation_patterns_raw,
    })
}

/// Check if a file or directory is excluded by the exclude patterns
/// Can be called on directories to prune traversal on excluded directories.
/// Check if a path is excluded after applying both exclusion and negation patterns.
pub fn is_excluded(&self, path: &str) -> bool {
self.excluded_glob_set
if !self
.excluded_glob_set
.as_ref()
.is_some_and(|gs| gs.is_match(path))
{
return false;
}
// The path matches an exclusion pattern; a negation pattern can un-exclude it.
!self
.negation_excluded_glob_set
.as_ref()
.is_some_and(|glob_set| glob_set.is_match(path))
.is_some_and(|gs| gs.is_match(path))
}

/// Check if a directory should be included (traversed) based on the exclude patterns.
/// Directories are included unless they match an exclude pattern.
/// Check if a directory should be traversed based on the exclude/negation patterns.
///
/// A directory is included unless it matches an exclusion pattern *and* no negation
/// pattern applies to it or to any file that could live inside it.
///
/// Two complementary checks are used so that both glob-style and exact-path negations
/// work correctly:
///
/// 1. **GlobSet probe** — matches ``<dir>/__probe__`` against the compiled negation
/// GlobSet. Catches wildcard negations such as ``!**/.github/**`` that use ``**``
/// to span multiple directory levels.
///
/// 2. **Raw-prefix check** — scans the raw (uncompiled) negation strings and returns
/// ``true`` if any of them starts with ``<dir>/``. Catches exact-path negations
/// such as ``!dir1/dir2/dir3/a.yml`` where the probe alone would not help because
/// the pattern contains no wildcards relative to the directory.
pub fn is_dir_included(&self, path: &str) -> bool {
!self.is_excluded(path)
if !self
.excluded_glob_set
.as_ref()
.is_some_and(|gs| gs.is_match(path))
{
return true;
}
// Directory matches an exclusion pattern. Check whether a negation pattern
// could apply to the directory itself or to any descendant.
if let Some(neg_gs) = &self.negation_excluded_glob_set {
if neg_gs.is_match(path) {
return true;
}
// Probe one level inside: catches glob negations like `**/.github/**`.
let probe = format!("{}/__probe__", path);
if neg_gs.is_match(probe.as_str()) {
return true;
}
}
// Raw-prefix check: catches exact-path negations like `!dir1/dir2/dir3/a.yml`
// where the parent directories would otherwise be pruned by the exclusion but
// need to be traversed to reach the negation-exempt file.
let dir_prefix = format!("{}/", path);
if self
.negation_patterns_raw
.iter()
.any(|p| p.starts_with(&dir_prefix))
{
return true;
}
false
}

/// Check if a file should be included based on both include and exclude patterns
/// Should be called for each file.
/// Check if a file should be included based on both include and exclude patterns.
pub fn is_file_included(&self, path: &str) -> bool {
self.included_glob_set
.as_ref()
Expand Down Expand Up @@ -129,4 +235,105 @@ mod tests {
assert!(matcher.is_file_included("a/b/c/main.py"));
assert!(!matcher.is_file_included("main.rs"));
}

// --- Negation (!) pattern tests ---

#[test]
fn test_negation_un_excludes_file() {
    // Exclude every `.env*` entry, then carve `.env.example` back out via negation.
    let excluded: Vec<String> = ["**/.env*", "!**/.env.example"]
        .iter()
        .map(|p| p.to_string())
        .collect();
    let matcher = PatternMatcher::new(None, Some(excluded)).unwrap();

    let cases = [
        (".env", false),
        ("config/.env.local", false),
        (".env.example", true),
        ("config/.env.example", true),
    ];
    for (path, expected) in cases {
        assert_eq!(matcher.is_file_included(path), expected, "path: {path}");
    }
}

#[test]
fn test_negation_un_excludes_dotdir_files() {
    // Dot-entries are excluded everywhere, except for anything under `.github`.
    let excluded: Vec<String> = ["**/.*", "!**/.github/**"]
        .iter()
        .map(|p| p.to_string())
        .collect();
    let matcher = PatternMatcher::new(None, Some(excluded)).unwrap();

    let cases = [
        // Dot files/dirs outside `.github` stay excluded.
        (".git/config", false),
        (".vscode/settings.json", false),
        // Files below `.github` are let back in.
        (".github/workflows/ci.yml", true),
        ("repo/.github/dependabot.yml", true),
        // Ordinary files are untouched by either pattern.
        ("src/main.rs", true),
    ];
    for (path, expected) in cases {
        assert_eq!(matcher.is_file_included(path), expected, "path: {path}");
    }
}

#[test]
fn test_negation_dir_traversal() {
    // Dot-directories are excluded, but the `.github` subtree is negated back in.
    let excluded: Vec<String> = ["**/.*", "!**/.github/**"]
        .iter()
        .map(|p| p.to_string())
        .collect();
    let matcher = PatternMatcher::new(None, Some(excluded)).unwrap();

    let cases = [
        // Pruned: dot-directories with no negation covering them.
        (".git", false),
        ("src/.vscode", false),
        // Kept: `.github` must stay traversable so its contents can be reached.
        (".github", true),
        ("repo/.github", true),
        // Kept: ordinary directories never match the exclusion.
        ("src", true),
    ];
    for (dir, expected) in cases {
        assert_eq!(matcher.is_dir_included(dir), expected, "dir: {dir}");
    }
}

#[test]
fn test_negation_exact_path_deep_dir_traversal() {
    // Everything under dir1 is excluded except one deep file named by its exact path.
    // Every ancestor directory of that file has to remain traversable.
    let excluded: Vec<String> = ["dir1/**", "!dir1/dir2/dir3/a.yml"]
        .iter()
        .map(|p| p.to_string())
        .collect();
    let matcher = PatternMatcher::new(None, Some(excluded)).unwrap();

    let file_cases = [
        // The negation-exempt file itself is included.
        ("dir1/dir2/dir3/a.yml", true),
        // Its neighbours under dir1 stay excluded.
        ("dir1/other.txt", false),
        ("dir1/dir2/other.txt", false),
    ];
    for (path, expected) in file_cases {
        assert_eq!(matcher.is_file_included(path), expected, "file: {path}");
    }

    let dir_cases = [
        // Ancestors of the exempt file must be walkable.
        ("dir1", true),
        ("dir1/dir2", true),
        ("dir1/dir2/dir3", true),
        // A sibling directory with no exempt descendants is still pruned.
        ("dir1/other", false),
    ];
    for (dir, expected) in dir_cases {
        assert_eq!(matcher.is_dir_included(dir), expected, "dir: {dir}");
    }
}

#[test]
fn test_negation_only_no_regular_exclusion() {
    // With no regular exclusion to counteract, a lone negation excludes nothing.
    let excluded = vec![String::from("!**/.github/**")];
    let matcher = PatternMatcher::new(None, Some(excluded)).unwrap();

    for path in [".git/config", ".github/workflows/ci.yml", "src/main.rs"] {
        assert!(matcher.is_file_included(path), "path: {path}");
    }
}

#[test]
fn test_negation_with_include_patterns() {
    // Only YAML files are included; dot-directories are excluded except `.github`.
    let included: Vec<String> = ["**/*.yml", "**/*.yaml"]
        .iter()
        .map(|p| p.to_string())
        .collect();
    let excluded: Vec<String> = ["**/.*", "!**/.github/**"]
        .iter()
        .map(|p| p.to_string())
        .collect();
    let matcher = PatternMatcher::new(Some(included), Some(excluded)).unwrap();

    let cases = [
        (".github/workflows/ci.yml", true),
        // Negated back in, but not matched by any include pattern.
        (".github/workflows/ci.sh", false),
        // Excluded and not covered by the negation.
        (".git/config", false),
        ("src/config.yaml", true),
    ];
    for (path, expected) in cases {
        assert_eq!(matcher.is_file_included(path), expected, "path: {path}");
    }
}
}
4 changes: 4 additions & 0 deletions rust/py/src/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,10 @@ impl PyPatternMatcher {
/// Args:
/// included_patterns: Glob patterns for files to include. If None, all files are included.
/// excluded_patterns: Glob patterns for files/directories to exclude.
/// A pattern prefixed with ``!`` negates (un-excludes) paths that would otherwise be
/// excluded, enabling gitignore-style exceptions. For example, combining
/// ``"**/.*"`` with ``"!**/.github/**"`` excludes all dot-entries except anything
/// inside ``.github/``.
#[new]
#[pyo3(signature = (included_patterns=None, excluded_patterns=None))]
fn new(
Expand Down