Merge branch 'dev'

thiswillbeyourgithub · thiswillbeyourgithub · commit e21bbff28004 · 2024-07-20T19:08:58.000+02:00
diff --git a/README.md b/README.md
@@ -1,6 +1,4 @@
-<p align="center">
-    <img src="images/icon.png" width="256">
-</p>
+<p align="center"><img src="https://github.com/thiswillbeyourgithub/WDoc/blob/main/images/icon.png?raw=true" width="256"></p>
 
 # WDoc
 
@@ -144,7 +142,7 @@
 8. If you want to make sure your data remains private here's an example with ollama: `wdoc --private --llms_api_bases='{"model": "http://localhost:11434", "query_eval_model": "http://localhost:11434"}' --modelname="ollama_chat/gemma:2b" --query_eval_modelname="ollama_chat/gemma:2b" --embed_model="BAAI/bge-m3" my_task`
 9. Now say you just want to summarize a webpage: `wdoc summary --path="https://arstechnica.com/science/2024/06/to-pee-or-not-to-pee-that-is-a-question-for-the-bladder-and-the-brain/"`.
 
-![](images/summary.png)
+<p align="center"><img src="https://github.com/thiswillbeyourgithub/WDoc/blob/main/images/summary.png?raw=true" width="256"></p>
 
 ## Getting started
 *Tested on python 3.10 and 3.11.7*
diff --git a/WDoc/WDoc.py b/WDoc/WDoc.py
@@ -79,7 +79,7 @@
 class WDoc:
     "This docstring is dynamically replaced by the content of WDoc/docs/USAGE.md"
 
-    VERSION: str = "1.1.8"
+    VERSION: str = "1.1.9"
     allowed_extra_args = extra_args_keys
     md_printer = md_printer
 
diff --git a/WDoc/docs/toml_entries_example.toml b/WDoc/docs/toml_entries_example.toml
@@ -1,4 +1,5 @@
-[[this_will_fetch_all_the_pdf_recursively_inside_a_dir_according_to_some_parameters]]
+# note: don't use a '.' in the dict keys (the line below), otherwise parsing will fail
+[[this_will_fetch_all_the_pdf_recursively_inside_a_dir_according_to_some_parameters]]  # <- don't put anything other than letters and _ here, and especially no '.', because it creates subdicts that fail to parse
 path = '../some/path/to/parent/'
 filetype = 'recursive_paths'
 recursed_filetype = 'pdf'
diff --git a/WDoc/utils/loaders.py b/WDoc/utils/loaders.py
@@ -573,7 +573,16 @@ def load_youtube_video(
 
 @optional_typecheck
 @doc_loaders_cache.cache
-def load_online_pdf(debug: bool, task: str, path: str, **kwargs) -> List[Document]:
+def load_online_pdf(
+    debug: bool,
+    task: str,
+    path: str,
+    doccheck_min_lang_prob: float = min_lang_prob,
+    doccheck_min_token: int = min_token,
+    doccheck_max_token: int = max_token,
+    doccheck_max_lines: int = max_lines,
+    **kwargs,
+    ) -> List[Document]:
     whi(f"Loading online pdf: '{path}'")
 
     try:
@@ -1785,6 +1794,10 @@ def load_pdf(
     text_splitter: TextSplitter,
     debug: bool,
     file_hash: str,
+    doccheck_min_lang_prob: float = min_lang_prob,
+    doccheck_min_token: int = min_token,
+    doccheck_max_token: int = max_token,
+    doccheck_max_lines: int = max_lines,
 ) -> List[Document]:
     whi(f"Loading pdf: '{path}'")
     assert Path(path).exists(), f"file not found: '{path}'"
@@ -1826,6 +1839,10 @@ def timeout_handler(signum, frame):
                 docs=docs,
                 identifier=path,
                 check_language=True,
+                min_lang_prob=doccheck_min_lang_prob,
+                min_token=doccheck_min_token,
+                max_token=doccheck_max_token,
+                max_lines=doccheck_max_lines,
             )
 
             if prob >= 0.5:
diff --git a/bumpver.toml b/bumpver.toml
@@ -1,5 +1,5 @@
 [bumpver]
-current_version = "1.1.8"
+current_version = "1.1.9"
 version_pattern = "MAJOR.MINOR.PATCH"
 commit_message = "bump version {old_version} -> {new_version}"
 tag_message = "{new_version}"
diff --git a/setup.py b/setup.py
@@ -27,10 +27,19 @@ def run(self):
 
 with open("README.md", "r") as readme:
     long_description = readme.read()
+    long_description = long_description.replace(
+        '<p align="center"><img src="https://github.com/thiswillbeyourgithub/WDoc/blob/main/images/icon.png?raw=true" width="256"></p>',
+        '![icon](https://github.com/thiswillbeyourgithub/WDoc/blob/main/images/icon.png?raw=true)',
+    )
+    long_description = long_description.replace(
+        '<p align="center"><img src="https://github.com/thiswillbeyourgithub/WDoc/blob/main/images/summary.png?raw=true" width="256"></p>',
+        '![example](https://github.com/thiswillbeyourgithub/WDoc/blob/main/images/summary.png?raw=true)',
+    )
+    assert 'align="center"' not in long_description
 
 setup(
     name="wdoc",
-    version="1.1.8",
+    version="1.1.9",
     description="A perfect AI powered RAG for document query and summary. Supports ~all LLM and ~all filetypes (url, pdf, epub, youtube (incl playlist), audio, anki, md, docx, pptx, oe any combination!)",
     long_description=long_description,
     long_description_content_type="text/markdown",