fix: better md image replacer

thiswillbeyourgithub · thiswillbeyourgithub · commit 20a37a2080bd · 2024-06-28T11:15:15.000+02:00
diff --git a/DocToolsLLM/utils/loaders.py b/DocToolsLLM/utils/loaders.py
@@ -90,7 +90,14 @@
 clozeregex = re.compile(r"{{c\d+::|}}")  # for removing clozes in anki
 markdownlink_regex = re.compile(r"\[.*?\]\((.*?)\)")  # to find markdown links
 markdownlinkparser_regex = re.compile(r'\[([^\]]+)\]\(http[s]?://[^)]+\)')  # to replace markdown links by their text
-markdownimage_regex = re.compile(r'!\[([^\]]*)\]\s*(\([^\)]+\)|\[[^\]]+\])')  # to remove image from jina reader that take a lot of tokens but are not yet used
+markdownimage_regex = re.compile(r'!\[([^\]]*)\]\s*(\([^\)]+\)|\[[^\]]+\])', flags=re.MULTILINE)  # to remove image from jina reader that take a lot of tokens but are not yet used
+def md_shorten_image_name(md_image):
+    """Replacement callback for markdownimage_regex.sub: return the name captured in group 1 of the markdown image match, keeping only its first 8 and last 8 characters around an ellipsis when it is longer than 16 characters."""
+    name = md_image.group(1)  # group 1 is the text between "![" and "]"
+    if len(name) <= 16:
+        return name
+    else:
+        return name[:8] + "…" + name[-8:]  # first 8 chars + ellipsis + last 8 chars
 # to check that a youtube link is valid
 yt_link_regex = re.compile("youtube.*watch")
 emptyline_regex = re.compile(r"^\s*$", re.MULTILINE)
@@ -1221,7 +1228,7 @@ def load_url(path: str, title=None) -> List[Document]:
                     title = text.splitlines()[0].replace("Title: ", "", 1)
             text = text.split("Markdown Content:", 1)[1]
             text = markdownlinkparser_regex.sub(r'\1', text)  # remove links
-            text = markdownimage_regex.sub(" ", text, flags=re.MULTILINE)  # remove markdown images for now as caption is disabled so it's just base64 or something like that
+            text = markdownimage_regex.sub(md_shorten_image_name, text)  # replace markdown images by a shortened version of their name: captions are disabled for now, so the image target is just base64 or something like that
             docs = [
                 Document(
                     page_content=text,