@@ -3,74 +3,6 @@ use crate::core::{BrowserManager, CdpPage};
33use crate :: core:: { Error , Result } ;
44use async_trait:: async_trait;
55
6- // API response types for Google Patents /xhr/query endpoint
// Top-level shape of the JSON payload returned by the Google Patents
// /xhr/query endpoint; everything of interest sits under `results`.
7- #[ derive( serde:: Deserialize ) ]
8- struct ApiResponse {
9- results : ApiResults ,
10- }
11-
// Body of the `results` object: the reported total hit count plus the
// list of clusters that contain the individual patent entries.
12- #[ derive( serde:: Deserialize ) ]
13- struct ApiResults {
14- total_num_results : u64 ,
15- cluster : Vec < ApiCluster > ,
16- }
17-
// One cluster of the response; it only carries a list of patent entries.
18- #[ derive( serde:: Deserialize ) ]
19- struct ApiCluster {
20- result : Vec < ApiPatentEntry > ,
21- }
22-
// Wrapper object around a single patent record inside a cluster's `result` list.
23- #[ derive( serde:: Deserialize ) ]
24- struct ApiPatentEntry {
25- patent : ApiPatent ,
26- }
27-
// Per-patent fields read from the API response. Every field is optional,
// so a missing key deserializes to `None` instead of failing the parse.
28- #[ derive( serde:: Deserialize ) ]
29- struct ApiPatent {
30- title : Option < String > ,
31- snippet : Option < String > ,
32- filing_date : Option < String > ,
33- assignee : Option < String > ,
34- publication_number : Option < String > ,
35- }
36-
// Converts a deserialized /xhr/query response into the crate's `SearchResult`.
//
// All entries of all clusters are flattened, in order, into one `Patent` list.
// Only id, title, snippet, filing_date, assignee and url are populated from the
// API data; every other `Patent` field is set to `None`. The summary fields
// `top_assignees` and `top_cpcs` are always `None` in the returned value.
37- fn convert_api_response ( api : ApiResponse ) -> SearchResult {
38- let patents = api
39- . results
40- . cluster
41- . iter ( )
// Flatten cluster -> entries so the map below sees one entry at a time.
42- . flat_map ( |cluster| cluster. result . iter ( ) )
43- . map ( |entry| {
44- let p = & entry. patent ;
// A missing publication number becomes an empty id (and therefore a URL
// that ends right after "/patent/").
45- let id = p. publication_number . clone ( ) . unwrap_or_default ( ) ;
46- Patent {
47- id : id. clone ( ) ,
// A missing title falls back to the empty string.
48- title : p. title . clone ( ) . unwrap_or_default ( ) ,
49- abstract_text : None ,
50- description_paragraphs : None ,
51- claims : None ,
52- images : None ,
53- snippet : p. snippet . clone ( ) ,
54- description : None ,
55- filing_date : p. filing_date . clone ( ) ,
56- assignee : p. assignee . clone ( ) ,
57- related_application : None ,
58- claiming_priority : None ,
59- family_applications : None ,
60- legal_status : None ,
// Detail-page URL derived from the publication number.
61- url : format ! ( "https://patents.google.com/patent/{}" , id) ,
62- }
63- } )
64- . collect ( ) ;
65-
66- SearchResult {
// The numeric total is stored in its decimal string form.
67- total_results : api. results . total_num_results . to_string ( ) ,
68- top_assignees : None ,
69- top_cpcs : None ,
70- patents,
71- }
72- }
73-
746#[ async_trait]
757pub trait PatentSearch : Send + Sync {
768 async fn search ( & self , options : & SearchOptions ) -> Result < SearchResult > ;
@@ -174,129 +106,56 @@ impl PatentSearcher {
174106 patents,
175107 } )
176108 } else {
177- // Search results page - fetch via /xhr/query API
178- let mut all_patents: Vec < Patent > = Vec :: new ( ) ;
109+ // Search results page - scrape from DOM
179110 let limit = options. limit . unwrap_or ( 10 ) ;
180- let mut total_results_str = "Unknown" . to_string ( ) ;
181- let mut top_assignees: Option < Vec < crate :: core:: models:: SummaryItem > > = None ;
182- let mut top_cpcs: Option < Vec < crate :: core:: models:: SummaryItem > > = None ;
183111
184112 if self . verbose {
185113 eprintln ! ( "Fetching search results (limit: {})..." , limit) ;
186114 }
187115
188- // Append num=100 to base_url to fetch more results per page if needed
189- let base_url = if limit > 10 { format ! ( "{}&num=100" , base_url) } else { base_url } ;
190-
191- // Calculate pagination
192- let results_per_page = if limit > 10 { 100 } else { 10 } ;
193- let pages_needed = limit. div_ceil ( results_per_page) ;
194-
195- for page_num in 0 ..pages_needed {
196- let page_url = if page_num == 0 {
197- base_url. clone ( )
198- } else {
199- format ! ( "{}&page={}" , base_url, page_num)
200- } ;
201-
202- if self . verbose {
203- eprintln ! ( "Loading page {} of {}..." , page_num + 1 , pages_needed) ;
204- eprintln ! ( "URL: {}" , page_url) ;
205- }
206-
207- page. goto ( & page_url) . await ?;
208-
209- // Check for bot detection / rate limiting page
210- let title = page
211- . evaluate ( "document.title" )
212- . await
213- . ok ( )
214- . and_then ( |v| v. as_str ( ) . map ( String :: from) )
215- . unwrap_or_default ( ) ;
216- if title == "Sorry..." {
217- let _ = page. close ( ) . await ;
218- return Err ( Error :: Search (
219- "Google blocked this request (bot detection / rate limiting). \
220- The IP address may be temporarily blocked. Try again later."
221- . to_string ( ) ,
222- ) ) ;
223- }
224-
225- // Build API URL from the search URL
226- let api_path =
227- base_url. strip_prefix ( "https://patents.google.com/" ) . unwrap_or ( & base_url) ;
228- let api_url = format ! ( "/xhr/query?url={}" , api_path) ;
229- let fetch_script = format ! (
230- r#"(async () => {{
231- try {{
232- const resp = await fetch("{}");
233- if (!resp.ok) return {{ error: "HTTP " + resp.status }};
234- return await resp.json();
235- }} catch(e) {{
236- return {{ error: e.message }};
237- }}
238- }})()"# ,
239- api_url
240- ) ;
241-
242- let api_result = page. evaluate ( & fetch_script) . await ?;
243-
244- if self . verbose {
245- if let Some ( err) = api_result. get ( "error" ) {
246- eprintln ! ( "API error: {}" , err) ;
247- } else {
248- eprintln ! ( "API response received" ) ;
249- }
250- }
251-
252- let sr = serde_json:: from_value :: < ApiResponse > ( api_result)
253- . map_err ( |e| Error :: Search ( format ! ( "Failed to parse API response: {}" , e) ) )
254- . map ( convert_api_response) ?;
255-
256- if page_num == 0 {
257- total_results_str = sr. total_results . clone ( ) ;
258- if self . verbose {
259- eprintln ! ( "Total results found: {}" , total_results_str) ;
260- }
261- top_assignees = sr. top_assignees ;
262- top_cpcs = sr. top_cpcs ;
263- }
264-
265- let page_patents = sr. patents ;
266-
267- if self . verbose {
268- eprintln ! ( "Found {} patents on this page" , page_patents. len( ) ) ;
269- }
270-
271- if page_patents. is_empty ( ) {
272- break ;
273- }
274-
275- all_patents. extend ( page_patents) ;
276-
277- if all_patents. len ( ) >= limit {
278- break ;
279- }
116+ page. goto ( & base_url) . await ?;
117+
118+ // Check for bot detection / rate limiting page
119+ let title = page
120+ . evaluate ( "document.title" )
121+ . await
122+ . ok ( )
123+ . and_then ( |v| v. as_str ( ) . map ( String :: from) )
124+ . unwrap_or_default ( ) ;
125+ if title == "Sorry..." {
126+ let _ = page. close ( ) . await ;
127+ return Err ( Error :: Search (
128+ "Google blocked this request (bot detection / rate limiting). \
129+ The IP address may be temporarily blocked. Try again later."
130+ . to_string ( ) ,
131+ ) ) ;
280132 }
133+
134+ if self . verbose {
135+ eprintln ! ( "Waiting for search results to load..." ) ;
136+ }
137+ // Wait for search results to render
138+ tokio:: time:: sleep ( std:: time:: Duration :: from_secs ( 3 ) ) . await ;
139+
140+ if self . verbose {
141+ eprintln ! ( "Extracting search results from DOM..." ) ;
142+ }
143+ let result = page. evaluate ( include_str ! ( "scripts/extract_search_results.js" ) ) . await ?;
144+ let mut sr: SearchResult = serde_json:: from_value ( result)
145+ . map_err ( |e| Error :: Search ( format ! ( "Failed to parse search results: {}" , e) ) ) ?;
146+
281147 let _ = page. close ( ) . await ;
282148
283149 if self . verbose {
284- eprintln ! ( "Total patents collected: {}" , all_patents. len( ) ) ;
150+ eprintln ! ( "Total results found: {}" , sr. total_results) ;
151+ eprintln ! ( "Patents on page: {}" , sr. patents. len( ) ) ;
285152 }
286153
287- if all_patents. len ( ) > limit {
288- if self . verbose {
289- eprintln ! ( "Truncating to limit: {}" , limit) ;
290- }
291- all_patents. truncate ( limit) ;
154+ if sr. patents . len ( ) > limit {
155+ sr. patents . truncate ( limit) ;
292156 }
293157
294- Ok ( SearchResult {
295- total_results : total_results_str,
296- top_assignees,
297- top_cpcs,
298- patents : all_patents,
299- } )
158+ Ok ( sr)
300159 }
301160 }
302161}
0 commit comments