1+ #!/usr/bin/env python3
2+ """
3+ PDF Hyperlink Adder
4+
5+ This script adds hyperlinks and visual cues to a PDF based on a word list.
6+ It processes a large PDF (e.g., 350 pages) and adds blue underlined text with tooltips
7+ for each matching word from the provided list.
8+
9+ Requirements:
    pip install PyMuPDF
11+
12+ Usage:
13+ python pdf_hyperlink_adder.py input.pdf word_list.csv output.pdf
14+ """
15+
16+ import fitz # PyMuPDF
17+ import csv
18+ import re
19+ import sys
20+ from pathlib import Path
21+ from typing import Dict , List , Tuple , Optional
22+
class PDFHyperlinkAdder:
    """Link every occurrence of listed words in a PDF to a URL.

    The word list is a two-column CSV (word, hyperlink). Each
    case-insensitive whole-word occurrence that lies inside a single text
    span gets a URI link placed over it and a blue underline drawn beneath
    it. Phrases that are split across spans or lines are not detected.

    Attributes:
        input_pdf: Path of the PDF to read.
        word_list_file: Path of the CSV word list.
        output_pdf: Path the modified PDF is saved to.
        word_links: Lower-cased word -> URL, filled by ``load_word_list``.
        processed_words: Number of occurrences decorated so far.
        total_matches: Number of occurrences found so far.
    """

    # Second-column labels that mark the first CSV row as a header when the
    # first column is "word" (compared case-insensitively).
    _HEADER_LINK_LABELS = frozenset({"hyperlink", "link", "url"})

    def __init__(self, input_pdf: str, word_list_file: str, output_pdf: str):
        self.input_pdf = input_pdf
        self.word_list_file = word_list_file
        self.output_pdf = output_pdf
        self.word_links: Dict[str, str] = {}
        self.processed_words = 0
        self.total_matches = 0

    def load_word_list(self) -> None:
        """Read ``word_list_file`` into ``word_links``.

        Rows with fewer than two columns, or with an empty word or link, are
        skipped. A leading header row such as ``word,hyperlink`` is skipped
        too, so the file written by ``create_sample_word_list`` can be used
        as-is. Later rows override earlier rows for the same word.
        """
        print(f"📖 Loading word list from {self.word_list_file} ...")

        # "utf-8-sig" drops a leading BOM (common in spreadsheet exports) that
        # would otherwise become part of the first word; newline="" is what
        # the csv module asks for so quoted line breaks survive.
        with open(self.word_list_file, "r", encoding="utf-8-sig", newline="") as f:
            seen_row = False
            for row in csv.reader(f):
                if len(row) < 2:
                    continue
                word = row[0].strip().lower()
                link = row[1].strip()
                if not seen_row:
                    seen_row = True
                    if word == "word" and link.lower() in self._HEADER_LINK_LABELS:
                        continue  # header row, not a real entry
                if not word or not link:
                    # An empty word would compile to a pattern that matches
                    # at every word boundary; an empty link is useless.
                    continue
                self.word_links[word] = link

        print(f"✅ Loaded {len(self.word_links)} words with hyperlinks")

    def _compile_patterns(self) -> List[Tuple[str, str, "re.Pattern"]]:
        """Return ``(word, link, compiled_pattern)`` for every list entry.

        Compiled once per search instead of once per span. The lookarounds
        behave like ``\\b`` for words that start and end with a word
        character, and additionally let entries such as "c++" or ".net"
        match, which ``\\b`` cannot anchor.
        """
        return [
            (
                word,
                link,
                re.compile(r"(?<!\w)" + re.escape(word) + r"(?!\w)", re.IGNORECASE),
            )
            for word, link in self.word_links.items()
        ]

    @staticmethod
    def _char_span_bounds(chars, start: int, end: int) -> Tuple[float, float, float, float]:
        """Return the ``(x0, y0, x1, y1)`` box enclosing ``chars[start:end]``.

        Each item of ``chars`` must provide a 4-number ``"bbox"``.
        """
        boxes = [ch["bbox"] for ch in chars[start:end]]
        return (
            min(b[0] for b in boxes),
            min(b[1] for b in boxes),
            max(b[2] for b in boxes),
            max(b[3] for b in boxes),
        )

    @staticmethod
    def _match_span(chars, patterns) -> List[Tuple[str, str, Tuple[float, float, float, float]]]:
        """Find every pattern hit inside one span.

        Args:
            chars: Per-character dicts of a span, each with ``"c"`` (the
                character) and ``"bbox"``.
            patterns: Output of ``_compile_patterns``.

        Returns:
            ``(word, link, bounds)`` tuples, grouped by word in list order and
            by position within each word.
        """
        text = "".join(ch["c"] for ch in chars)
        hits = []
        if not text:
            return hits
        for word, link, pattern in patterns:
            for match in pattern.finditer(text):
                if match.end() == match.start():
                    continue  # nothing to enclose
                bounds = PDFHyperlinkAdder._char_span_bounds(
                    chars, match.start(), match.end()
                )
                hits.append((word, link, bounds))
        return hits

    def find_word_instances(self, doc: "fitz.Document") -> List[Tuple[int, str, "fitz.Rect", str]]:
        """Locate every listed word in ``doc``.

        Args:
            doc: The opened PDF.

        Returns:
            ``(page_number, word, rectangle, link)`` for each occurrence.
            ``total_matches`` is increased by the number of occurrences.
        """
        print("🔍 Searching for word instances...")

        patterns = self._compile_patterns()
        word_instances = []

        for page_num, page in enumerate(doc):
            # "rawdict" exposes a bounding box per character, so the word
            # rectangle is exact even with proportional fonts (dividing the
            # span width by the character count is only right for monospace).
            text_dict = page.get_text("rawdict")

            for block in text_dict["blocks"]:
                # Image blocks carry no "lines" entry.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        for word, link, bounds in self._match_span(span["chars"], patterns):
                            word_instances.append((page_num, word, fitz.Rect(bounds), link))
                            self.total_matches += 1

        print(f"✅ Found {self.total_matches} word instances across {len(doc)} pages")
        return word_instances

    def add_hyperlinks_and_styling(self, doc: "fitz.Document", word_instances: List[Tuple[int, str, "fitz.Rect", str]]) -> None:
        """Put a URI link and a blue underline on every found occurrence.

        Args:
            doc: The opened PDF; it is modified in place.
            word_instances: Output of ``find_word_instances``.
        """
        print("🎨 Adding hyperlinks and styling...")

        for page_num, _word, bbox, link in word_instances:
            page = doc[page_num]

            # PyMuPDF creates links through insert_link() with a link dict;
            # there is no Page.add_link_annot(). insert_link() returns nothing,
            # so no per-link tooltip title can be attached here (viewers
            # typically show the target URI on hover instead).
            page.insert_link({"kind": fitz.LINK_URI, "from": bbox, "uri": link})

            # Blue underline: a 1pt stroke covering the bottom 1pt of the
            # word rectangle.
            underline_y = bbox.y1 - 0.5
            page.draw_line(
                fitz.Point(bbox.x0, underline_y),
                fitz.Point(bbox.x1, underline_y),
                color=(0, 0, 1),
                width=1,
            )

            self.processed_words += 1

            if self.processed_words % 100 == 0:
                print(f" Processed {self.processed_words} /{self.total_matches} words...")

    def process_pdf(self) -> None:
        """Run the whole pipeline: load the list, search, decorate, save.

        Nothing is saved when no listed word occurs in the PDF. The document
        is closed on every exit path, including errors.
        """
        print(f"📄 Processing PDF: {self.input_pdf} ")
        print(f"📝 Word list: {self.word_list_file} ")
        print(f"💾 Output: {self.output_pdf} ")
        print("-" * 50)

        self.load_word_list()

        # The context manager closes the document on the early return below
        # and when searching, decorating or saving raises.
        with fitz.open(self.input_pdf) as doc:
            print(f"📖 PDF opened: {len(doc)} pages")

            word_instances = self.find_word_instances(doc)

            if not word_instances:
                print("❌ No matching words found in the PDF")
                return

            self.add_hyperlinks_and_styling(doc, word_instances)
            doc.save(self.output_pdf)

        print("-" * 50)
        print("✅ Processing complete!")
        print("📊 Statistics:")
        print(f" Total words processed: {self.processed_words} ")
        print(f" Total matches found: {self.total_matches} ")
        print(f" Output saved to: {self.output_pdf} ")
149+
def create_sample_word_list(filename: str = "word_list.csv") -> None:
    """Write a small demo word list (header row plus ten entries) as CSV.

    Args:
        filename: Destination path; an existing file is overwritten.
    """
    wiki = "https://en.wikipedia.org/wiki/"
    header = ("word", "hyperlink")
    entries = (
        ("python", "https://python.org"),
        ("programming", wiki + "Programming"),
        ("algorithm", wiki + "Algorithm"),
        ("database", wiki + "Database"),
        ("machine learning", wiki + "Machine_learning"),
        ("artificial intelligence", wiki + "Artificial_intelligence"),
        ("data science", wiki + "Data_science"),
        ("web development", wiki + "Web_development"),
        ("cloud computing", wiki + "Cloud_computing"),
        ("cybersecurity", wiki + "Computer_security"),
    )

    # newline='' leaves line endings to the csv writer.
    with open(filename, 'w', newline='', encoding='utf-8') as out:
        csv.writer(out).writerows((header, *entries))

    print(f"📝 Created sample word list: {filename} ")
171+
def main() -> int:
    """Command-line entry point.

    Reads ``sys.argv``. ``--create-sample`` writes a sample word list;
    otherwise exactly three arguments are expected: input PDF, word-list
    CSV and output PDF.

    Returns:
        Process exit status: 0 on success, 1 when an input file is missing,
        2 on wrong usage.
    """
    args = sys.argv[1:]

    # Must be tested before the argument count: the documented invocation
    # passes only this flag, so a count check placed first made the branch
    # unreachable.
    if args and args[0] == "--create-sample":
        create_sample_word_list()
        return 0

    if len(args) != 3:
        print("Usage: python pdf_hyperlink_adder.py input.pdf word_list.csv output.pdf")
        print("\n Example:")
        print(" python pdf_hyperlink_adder.py document.pdf word_list.csv document_with_links.pdf")
        print("\n To create a sample word list:")
        print(" python pdf_hyperlink_adder.py --create-sample")
        return 2

    input_pdf, word_list_file, output_pdf = args

    # Fail early with a readable message instead of a library traceback.
    if not Path(input_pdf).exists():
        print(f"❌ Input PDF not found: {input_pdf} ")
        return 1

    if not Path(word_list_file).exists():
        print(f"❌ Word list file not found: {word_list_file} ")
        return 1

    adder = PDFHyperlinkAdder(input_pdf, word_list_file, output_pdf)
    adder.process_pdf()
    return 0


if __name__ == "__main__":
    # Hand the status to the shell so callers can detect failures.
    sys.exit(main())
0 commit comments