Skip to content

Commit e21bbff

Browse files
Merge branch 'dev'
2 parents ce67615 + d8fc24b commit e21bbff

6 files changed

Lines changed: 34 additions & 9 deletions

File tree

README.md

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
<p align="center">
2-
<img src="images/icon.png" width="256">
3-
</p>
1+
<p align="center"><img src="https://github.com/thiswillbeyourgithub/WDoc/blob/main/images/icon.png?raw=true" width="256"></p>
42

53
# WDoc
64

@@ -144,7 +142,7 @@
144142
8. If you want to make sure your data remains private, here's an example with ollama: `wdoc --private --llms_api_bases='{"model": "http://localhost:11434", "query_eval_model": "http://localhost:11434"}' --modelname="ollama_chat/gemma:2b" --query_eval_modelname="ollama_chat/gemma:2b" --embed_model="BAAI/bge-m3" my_task`
145143
9. Now say you just want to summarize a webpage: `wdoc summary --path="https://arstechnica.com/science/2024/06/to-pee-or-not-to-pee-that-is-a-question-for-the-bladder-and-the-brain/"`.
146144

147-
![](images/summary.png)
145+
<p align="center"><img src="https://github.com/thiswillbeyourgithub/WDoc/blob/main/images/summary.png?raw=true" width="256"></p>
148146

149147
## Getting started
150148
*Tested on python 3.10 and 3.11.7*

WDoc/WDoc.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@
7979
class WDoc:
8080
"This docstring is dynamically replaced by the content of WDoc/docs/USAGE.md"
8181

82-
VERSION: str = "1.1.8"
82+
VERSION: str = "1.1.9"
8383
allowed_extra_args = extra_args_keys
8484
md_printer = md_printer
8585

WDoc/docs/toml_entries_example.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
[[this_will_fetch_all_the_pdf_recursively_inside_a_dir_according_to_some_parameters]]
1+
# note: don't use a '.' in the dict keys (the line below), otherwise the parsing will fail
2+
[[this_will_fetch_all_the_pdf_recursively_inside_a_dir_according_to_some_parameters]] # <- don't put anything other than letters and _ here, especially no '.' because it will make subdicts that will fail to parse
23
path = '../some/path/to/parent/'
34
filetype = 'recursive_paths'
45
recursed_filetype = 'pdf'

WDoc/utils/loaders.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -573,7 +573,16 @@ def load_youtube_video(
573573

574574
@optional_typecheck
575575
@doc_loaders_cache.cache
576-
def load_online_pdf(debug: bool, task: str, path: str, **kwargs) -> List[Document]:
576+
def load_online_pdf(
577+
debug: bool,
578+
task: str,
579+
path: str,
580+
doccheck_min_lang_prob: float = min_lang_prob,
581+
doccheck_min_token: int = min_token,
582+
doccheck_max_token: int = max_token,
583+
doccheck_max_lines: int = max_lines,
584+
**kwargs,
585+
) -> List[Document]:
577586
whi(f"Loading online pdf: '{path}'")
578587

579588
try:
@@ -1785,6 +1794,10 @@ def load_pdf(
17851794
text_splitter: TextSplitter,
17861795
debug: bool,
17871796
file_hash: str,
1797+
doccheck_min_lang_prob: float = min_lang_prob,
1798+
doccheck_min_token: int = min_token,
1799+
doccheck_max_token: int = max_token,
1800+
doccheck_max_lines: int = max_lines,
17881801
) -> List[Document]:
17891802
whi(f"Loading pdf: '{path}'")
17901803
assert Path(path).exists(), f"file not found: '{path}'"
@@ -1826,6 +1839,10 @@ def timeout_handler(signum, frame):
18261839
docs=docs,
18271840
identifier=path,
18281841
check_language=True,
1842+
min_lang_prob=doccheck_min_lang_prob,
1843+
min_token=doccheck_min_token,
1844+
max_token=doccheck_max_token,
1845+
max_lines=doccheck_max_lines,
18291846
)
18301847

18311848
if prob >= 0.5:

bumpver.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpver]
2-
current_version = "1.1.8"
2+
current_version = "1.1.9"
33
version_pattern = "MAJOR.MINOR.PATCH"
44
commit_message = "bump version {old_version} -> {new_version}"
55
tag_message = "{new_version}"

setup.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,19 @@ def run(self):
2727

2828
with open("README.md", "r") as readme:
2929
long_description = readme.read()
30+
long_description = long_description.replace(
31+
'<p align="center"><img src="https://github.com/thiswillbeyourgithub/WDoc/blob/main/images/icon.png?raw=true" width="256"></p>',
32+
'![icon](https://github.com/thiswillbeyourgithub/WDoc/blob/main/images/icon.png?raw=true)',
33+
)
34+
long_description = long_description.replace(
35+
'<p align="center"><img src="https://github.com/thiswillbeyourgithub/WDoc/blob/main/images/summary.png?raw=true" width="256"></p>',
36+
'![example](https://github.com/thiswillbeyourgithub/WDoc/blob/main/images/summary.png?raw=true)',
37+
)
38+
assert 'align="center"' not in long_description
3039

3140
setup(
3241
name="wdoc",
33-
version="1.1.8",
42+
version="1.1.9",
3443
description="A perfect AI powered RAG for document query and summary. Supports ~all LLM and ~all filetypes (url, pdf, epub, youtube (incl playlist), audio, anki, md, docx, pptx, oe any combination!)",
3544
long_description=long_description,
3645
long_description_content_type="text/markdown",

0 commit comments

Comments
 (0)