Skip to content

Commit 20a37a2

Browse files
fix: better md image replacer
1 parent f729c19 commit 20a37a2

1 file changed

Lines changed: 9 additions & 2 deletions

File tree

DocToolsLLM/utils/loaders.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -90,7 +90,14 @@
9090
clozeregex = re.compile(r"{{c\d+::|}}") # for removing clozes in anki
9191
markdownlink_regex = re.compile(r"\[.*?\]\((.*?)\)") # to find markdown links
9292
markdownlinkparser_regex = re.compile(r'\[([^\]]+)\]\(http[s]?://[^)]+\)') # to replace markdown links by their text
93-
markdownimage_regex = re.compile(r'!\[([^\]]*)\]\s*(\([^\)]+\)|\[[^\]]+\])') # to remove image from jina reader that take a lot of tokens but are not yet used
93+
markdownimage_regex = re.compile(r'!\[([^\]]*)\]\s*(\([^\)]+\)|\[[^\]]+\])', flags=re.MULTILINE) # to remove image from jina reader that take a lot of tokens but are not yet used
94+
def md_shorten_image_name(md_image: "re.Match[str]", keep: int = 8) -> str:
    """Turn a markdown image link into just its (possibly shortened) name.

    Intended to be used as the replacement callable of a regex ``sub`` call
    whose first capture group is the image's alt text.

    Args:
        md_image: Regex match object; ``group(1)`` is taken as the image name.
        keep: Number of characters kept at each end of a long name. Names
            of at most ``2 * keep`` characters are returned unchanged.

    Returns:
        The name itself when it is short enough, otherwise its first and
        last ``keep`` characters joined by an ellipsis character.

    Raises:
        ValueError: If ``keep`` is negative.
    """
    if keep < 0:
        raise ValueError("keep must be non-negative")
    name = md_image.group(1)
    if len(name) <= 2 * keep:
        return name
    # Slice the tail from an explicit positive index: ``name[-keep:]`` would
    # return the whole string when ``keep`` is 0.
    return name[:keep] + "…" + name[len(name) - keep:]
94101
# to check that a youtube link is valid
95102
yt_link_regex = re.compile("youtube.*watch")
96103
emptyline_regex = re.compile(r"^\s*$", re.MULTILINE)
@@ -1221,7 +1228,7 @@ def load_url(path: str, title=None) -> List[Document]:
12211228
title = text.splitlines()[0].replace("Title: ", "", 1)
12221229
text = text.split("Markdown Content:", 1)[1]
12231230
text = markdownlinkparser_regex.sub(r'\1', text) # remove links
1224-
text = markdownimage_regex.sub(" ", text, flags=re.MULTILINE) # remove markdown images for now as caption is disabled so it's just base64 or something like that
1231+
text = markdownimage_regex.sub(md_shorten_image_name, text) # remove markdown images for now as caption is disabled so it's just base64 or something like that, keep only a shorten image name
12251232
docs = [
12261233
Document(
12271234
page_content=text,

0 commit comments

Comments
 (0)