Skip to content

Commit 26bfad6

Browse files
fix: remove images from jina reader output as they can take a lot of tokens and are not captioned
1 parent 87ef0b1 commit 26bfad6

1 file changed

Lines changed: 2 additions & 0 deletions

File tree

DocToolsLLM/utils/loaders.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@
9090
clozeregex = re.compile(r"{{c\d+::|}}") # for removing clozes in anki
9191
markdownlink_regex = re.compile(r"\[.*?\]\((.*?)\)") # to find markdown links
9292
markdownlinkparser_regex = re.compile(r'\[([^\]]+)\]\(http[s]?://[^)]+\)') # to replace markdown links by their text
93+
markdownimage_regex = re.compile(r'!\[([^\]]*)\]\s*(\([^\)]+\)|\[[^\]]+\])') # matches markdown images, inline `![alt](target)` or reference-style `![alt][ref]`; used to strip images from jina reader output, which take a lot of tokens but are not used yet
9394
# to check that a youtube link is valid
9495
yt_link_regex = re.compile("youtube.*watch")
9596
emptyline_regex = re.compile(r"^\s*$", re.MULTILINE)
@@ -1220,6 +1221,7 @@ def load_url(path: str, title=None) -> List[Document]:
12201221
title = text.splitlines()[0].replace("Title: ", "", 1)
12211222
text = text.split("Markdown Content:", 1)[1]
12221223
text = markdownlinkparser_regex.sub(r'\1', text) # remove links
1224+
# Remove markdown images for now: captioning is disabled, so they only cost tokens (base64 or similar).
# A compiled pattern's .sub() accepts no `flags` argument (passing one raises TypeError); flags belong in re.compile().
text = markdownimage_regex.sub(" ", text)
12231225
docs = [
12241226
Document(
12251227
page_content=text,

0 commit comments

Comments
 (0)