Skip to content

Commit acb0a12

Browse files
fix: must compile the include and exclude
1 parent bfd4dd0 commit acb0a12

1 file changed

Lines changed: 56 additions & 8 deletions

File tree

DocToolsLLM/utils/batch_file_loader.py

Lines changed: 56 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -347,7 +347,13 @@ def parse_recursive_paths(load_kwargs: dict) -> List[dict]:
347347
if "include" in load_kwargs:
348348
for i, d in enumerate(doclist):
349349
keep = True
350-
for inc in load_kwargs["include"]:
350+
for iinc, inc in enumerate(load_kwargs["include"]):
351+
if isinstance(inc, str):
352+
if inc == inc.lower():
353+
inc = re.compile(inc, flags=re.IGNORECASE)
354+
else:
355+
inc = re.compile(inc)
356+
load_kwargs["include"][iinc] = inc
351357
if not inc.search(d):
352358
keep = False
353359
if not keep:
@@ -356,7 +362,13 @@ def parse_recursive_paths(load_kwargs: dict) -> List[dict]:
356362
del load_kwargs["include"]
357363

358364
if "exclude" in load_kwargs:
359-
for exc in load_kwargs["exclude"]:
365+
for iexc, exc in enumerate(load_kwargs["exclude"]):
366+
if isinstance(exc, str):
367+
if exc == exc.lower():
368+
exc = re.compile(exc, flags=re.IGNORECASE)
369+
else:
370+
exc = re.compile(exc)
371+
load_kwargs["exclude"][iexc] = exc
360372
doclist = [d for d in doclist if not exc.search(d)]
361373
del load_kwargs["exclude"]
362374

@@ -387,7 +399,13 @@ def parse_json_entries(load_kwargs: dict) -> List[dict]:
387399
if "include" in load_kwargs:
388400
for i, d in enumerate(doclist):
389401
keep = True
390-
for inc in load_kwargs["include"]:
402+
for iinc, inc in enumerate(load_kwargs["include"]):
403+
if isinstance(inc, str):
404+
if inc == inc.lower():
405+
inc = re.compile(inc, flags=re.IGNORECASE)
406+
else:
407+
inc = re.compile(inc)
408+
load_kwargs["include"][iinc] = inc
391409
if not inc.search(d):
392410
keep = False
393411
if not keep:
@@ -396,7 +414,13 @@ def parse_json_entries(load_kwargs: dict) -> List[dict]:
396414
del load_kwargs["include"]
397415

398416
if "exclude" in load_kwargs:
399-
for exc in load_kwargs["exclude"]:
417+
for iexc, exc in enumerate(load_kwargs["exclude"]):
418+
if isinstance(exc, str):
419+
if exc == exc.lower():
420+
exc = re.compile(exc, flags=re.IGNORECASE)
421+
else:
422+
exc = re.compile(exc)
423+
load_kwargs["exclude"][iexc] = exc
400424
doclist = [d for d in doclist if not exc.search(d)]
401425
del load_kwargs["exclude"]
402426

@@ -443,7 +467,13 @@ def parse_link_file(load_kwargs: dict, task: str) -> List[dict]:
443467
if "include" in load_kwargs:
444468
for i, d in enumerate(doclist):
445469
keep = True
446-
for inc in load_kwargs["include"]:
470+
for iinc, inc in enumerate(load_kwargs["include"]):
471+
if isinstance(inc, str):
472+
if inc == inc.lower():
473+
inc = re.compile(inc, flags=re.IGNORECASE)
474+
else:
475+
inc = re.compile(inc)
476+
load_kwargs["include"][iinc] = inc
447477
if not inc.search(d):
448478
keep = False
449479
if not keep:
@@ -452,7 +482,13 @@ def parse_link_file(load_kwargs: dict, task: str) -> List[dict]:
452482
del load_kwargs["include"]
453483

454484
if "exclude" in load_kwargs:
455-
for exc in load_kwargs["exclude"]:
485+
for iexc, exc in enumerate(load_kwargs["exclude"]):
486+
if isinstance(exc, str):
487+
if exc == exc.lower():
488+
exc = re.compile(exc, flags=re.IGNORECASE)
489+
else:
490+
exc = re.compile(exc)
491+
load_kwargs["exclude"][iexc] = exc
456492
doclist = [d for d in doclist if not exc.search(d)]
457493
del load_kwargs["exclude"]
458494

@@ -485,7 +521,13 @@ def parse_youtube_playlist(load_kwargs: dict) -> List[dict]:
485521
if "include" in load_kwargs:
486522
for i, d in enumerate(doclist):
487523
keep = True
488-
for inc in load_kwargs["include"]:
524+
for iinc, inc in enumerate(load_kwargs["include"]):
525+
if isinstance(inc, str):
526+
if inc == inc.lower():
527+
inc = re.compile(inc, flags=re.IGNORECASE)
528+
else:
529+
inc = re.compile(inc)
530+
load_kwargs["include"][iinc] = inc
489531
if not inc.search(d):
490532
keep = False
491533
if not keep:
@@ -494,7 +536,13 @@ def parse_youtube_playlist(load_kwargs: dict) -> List[dict]:
494536
del load_kwargs["include"]
495537

496538
if "exclude" in load_kwargs:
497-
for exc in load_kwargs["exclude"]:
539+
for iexc, exc in enumerate(load_kwargs["exclude"]):
540+
if isinstance(exc, str):
541+
if exc == exc.lower():
542+
exc = re.compile(exc, flags=re.IGNORECASE)
543+
else:
544+
exc = re.compile(exc)
545+
load_kwargs["exclude"][iexc] = exc
498546
doclist = [d for d in doclist if not exc.search(d)]
499547
del load_kwargs["exclude"]
500548

0 commit comments

Comments (0)