|
90 | 90 | clozeregex = re.compile(r"{{c\d+::|}}") # for removing clozes in anki |
91 | 91 | markdownlink_regex = re.compile(r"\[.*?\]\((.*?)\)") # to find markdown links |
92 | 92 | markdownlinkparser_regex = re.compile(r'\[([^\]]+)\]\(http[s]?://[^)]+\)') # to replace markdown links by their text |
93 | | -markdownimage_regex = re.compile(r'!\[([^\]]*)\]\s*(\([^\)]+\)|\[[^\]]+\])') # to remove image from jina reader that take a lot of tokens but are not yet used |
| 93 | +markdownimage_regex = re.compile(r'!\[([^\]]*)\]\s*(\([^\)]+\)|\[[^\]]+\])', flags=re.MULTILINE) # to remove image from jina reader that take a lot of tokens but are not yet used |
def md_shorten_image_name(md_image: "re.Match", max_len: int = 16) -> str:
    """Return the alt text of a matched markdown image, shortened if too long.

    Intended as the replacement callback for ``markdownimage_regex.sub``:
    the whole ``![alt](target)`` / ``![alt][ref]`` match is replaced by the
    returned string, so the (potentially huge) image target is dropped and
    only a short label remains.

    Args:
        md_image: Match object whose group 1 is the image alt text.
        max_len: Longest alt text that is returned unchanged. Longer texts
            are cut down to their first and last ``max_len // 2`` characters
            joined by a single "…". Negative values are treated as 0.

    Returns:
        The alt text itself when it has at most ``max_len`` characters,
        otherwise the elided form (for the default ``max_len`` of 16 this is
        8 + 1 + 8 = 17 characters).
    """
    name = md_image.group(1)
    if len(name) <= max_len:
        return name

    keep = max(max_len, 0) // 2
    head = name[:keep]
    # name[-0:] would return the whole string, so handle keep == 0 explicitly.
    tail = name[-keep:] if keep else ""
    return head + "…" + tail
94 | 101 | # to check that a youtube link is valid |
95 | 102 | yt_link_regex = re.compile("youtube.*watch") |
96 | 103 | emptyline_regex = re.compile(r"^\s*$", re.MULTILINE) |
@@ -1221,7 +1228,7 @@ def load_url(path: str, title=None) -> List[Document]: |
1221 | 1228 | title = text.splitlines()[0].replace("Title: ", "", 1) |
1222 | 1229 | text = text.split("Markdown Content:", 1)[1] |
1223 | 1230 | text = markdownlinkparser_regex.sub(r'\1', text) # remove links |
1224 | | - text = markdownimage_regex.sub(" ", text, flags=re.MULTILINE) # remove markdown images for now as caption is disabled so it's just base64 or something like that |
| 1231 | + text = markdownimage_regex.sub(md_shorten_image_name, text) # remove markdown images for now as caption is disabled so it's just base64 or something like that, keep only a shorten image name |
1225 | 1232 | docs = [ |
1226 | 1233 | Document( |
1227 | 1234 | page_content=text, |
|
0 commit comments