-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmark_extracted_text.py
More file actions
31 lines (25 loc) · 1005 Bytes
/
mark_extracted_text.py
File metadata and controls
31 lines (25 loc) · 1005 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import os
import sys

import pymupdf
def mark_word(page, text):
    """Underline every word on ``page`` whose string contains ``text``.

    Args:
        page: The page to scan. Matching words get an underline annotation
            added directly to this page.
        text: Substring to look for inside each extracted word, using a
            plain ``in`` test.

    Returns:
        The number of words that were underlined (0 when nothing matched
        or when ``text`` is empty).
    """
    # An empty needle is a substring of every word and would underline the
    # whole page, so treat it as "nothing to search for".
    if not text:
        return 0

    found = 0
    # In each word entry the first four items are the word's bounding box
    # and item 4 is the word's string.
    for w in page.get_text("words", delimiters=None):
        if text in w[4]:
            found += 1
            page.add_underline_annot(pymupdf.Rect(w[:4]))
    return found
# Script entry: underline every word containing the search string in a PDF
# and, if anything matched, save an annotated copy with a "marked-" prefix.
# Usage: <script> <pdf-file> <search-string>
if len(sys.argv) < 3:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit("usage: %s <pdf-file> <search-string>" % sys.argv[0])

fname = sys.argv[1]  # filename
text = sys.argv[2]  # search string
doc = pymupdf.open(fname)

# Report the actual search string rather than a hard-coded word.
print("underlining words containing '%s' in document '%s'" % (text, doc.name))

new_doc = False  # indicator if anything found at all

for page in doc:  # scan through the pages
    found = mark_word(page, text)  # mark the page's words
    if found:  # if anything found ...
        new_doc = True
        print("found '%s' %i times on page %i" % (text, found, page.number + 1))

if new_doc:
    # Prefix only the file name, not the whole path, so an input such as
    # "dir/a.pdf" is saved as "dir/marked-a.pdf" instead of "marked-dir/a.pdf".
    folder, base = os.path.split(doc.name)
    doc.save(os.path.join(folder, "marked-" + base))