Skip to content

Commit 16ae9d0

Browse files
sonesuke and claude authored
fix: use DOM scraping for search results and fix assignee URL encoding (#85)
* feat: add lib.rs and fix rust-cache in CI workflows

  Split into lib + bin crate structure so core, cli, and mcp modules are available as a library for external crates. Also remove the incorrect `workspaces` parameter from Swatinem/rust-cache since this is a single crate, not a Cargo workspace.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: use DOM scraping for search results and fix assignee URL encoding

  - Replace /xhr/query API calls with DOM scraping via extract_search_results.js, so search filters (assignee, country, etc.) are correctly applied
  - Fix assignee parameter: remove unnecessary quotes and use url::Url serializer instead of manual URL construction
  - Split mise run test into unit tests and e2e tests

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* ci: run unit tests only in CI workflow

  Exclude e2e tests which depend on browser/network and are flaky in CI.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: wait for page render instead of relying on element selector

  Use a fixed delay before DOM scraping to let search results render, instead of wait_for_element which may not find shadow DOM elements.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 31071df commit 16ae9d0

4 files changed

Lines changed: 62 additions & 225 deletions

File tree

.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -32,8 +32,8 @@ jobs:
3232
- name: Build and lint
3333
run: cargo clippy --all-targets -- -D warnings
3434

35-
- name: Run tests
36-
run: cargo test
35+
- name: Run unit tests
36+
run: cargo test --lib --all
3737

3838
- name: Check build
3939
run: cargo check --release

mise.toml

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -27,11 +27,15 @@ description = "Lint with cargo clippy"
2727
run = "cargo clippy --all-targets -- -D warnings"
2828

2929
[tasks.test]
30-
description = "Run tests with cargo test"
31-
run = "RUSTFLAGS=\"-D warnings\" cargo test --all-targets"
30+
description = "Run unit tests"
31+
run = "RUSTFLAGS=\"-D warnings\" cargo test --lib --all"
32+
33+
[tasks.test-e2e]
34+
description = "Run e2e tests"
35+
run = "cargo test --test e2e_cli --test e2e_mcp"
3236

3337
[tasks.pre-commit]
34-
description = "Run all of the above"
38+
description = "Run fmt, clippy, and unit tests"
3539
depends = ["fmt", "clippy", "test"]
3640

3741
[tasks.skill-test]

src/core/models.rs

Lines changed: 17 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -126,12 +126,19 @@ impl SearchOptions {
126126
q_parts.push(query.clone());
127127
}
128128

129-
// Assignee is handled manually later to support comma separation
130-
131129
if !q_parts.is_empty() {
132130
serializer.append_pair("q", &q_parts.join(" "));
133131
}
134132

133+
// Add assignee as separate parameter (no quotes)
134+
if let Some(assignees) = &self.assignee
135+
&& !assignees.is_empty()
136+
{
137+
for a in assignees {
138+
serializer.append_pair("assignee", a);
139+
}
140+
}
141+
135142
if let Some(country) = &self.country {
136143
serializer.append_pair("country", country);
137144
match country.to_uppercase().as_str() {
@@ -176,31 +183,7 @@ impl SearchOptions {
176183
}
177184
}
178185

179-
let mut url_str = url.to_string();
180-
181-
// Manually append assignee parameter if present
182-
if let Some(assignees) = &self.assignee
183-
&& !assignees.is_empty()
184-
{
185-
let encoded_assignees: Vec<String> = assignees
186-
.iter()
187-
.map(|a| {
188-
// Encode each assignee value, including quotes, using form_urlencoded logic
189-
let quoted = format!("\"{}\"", a);
190-
url::form_urlencoded::byte_serialize(quoted.as_bytes()).collect::<String>()
191-
})
192-
.collect();
193-
194-
// Determine if we need to add '?' or '&'
195-
let separator = if !url_str.contains('?') {
196-
"?"
197-
} else if url_str.ends_with('?') {
198-
""
199-
} else {
200-
"&"
201-
};
202-
url_str.push_str(&format!("{}assignee={}", separator, encoded_assignees.join(",")));
203-
}
186+
let url_str = url.to_string();
204187

205188
// Manual check for empty params (after constructing)
206189
// Check if url string ends with / or /? and has no params
@@ -260,33 +243,26 @@ mod tests {
260243
// Test assignee only (single assignee)
261244
let options =
262245
SearchOptions { assignee: Some(vec!["Google LLC".to_string()]), ..Default::default() };
263-
// assignee="Google LLC" -> encoded %22Google%20LLC%22
264246
let url = options.to_url().unwrap();
265-
266-
// Since no other params, it should start with ?assignee=
267-
// form_urlencoded::byte_serialize uses + for spaces in query values
268-
assert!(url.contains("?assignee=%22Google+LLC%22"));
247+
assert_eq!(url, "https://patents.google.com/?assignee=Google+LLC");
269248

270249
// Test assignee (multiple assignees)
271250
let options = SearchOptions {
272251
assignee: Some(vec!["Google LLC".to_string(), "Microsoft Corp".to_string()]),
273252
..Default::default()
274253
};
275-
// assignee="Google LLC","Microsoft Corp"
276-
// Encoded individual values, joined by comma
277254
let url = options.to_url().unwrap();
278-
assert!(url.contains("?assignee=%22Google+LLC%22,%22Microsoft+Corp%22"));
255+
assert!(url.contains("assignee=Google+LLC"));
256+
assert!(url.contains("assignee=Microsoft+Corp"));
279257

280-
// Test assignee (comma handling)
258+
// Test assignee with comma in name
281259
let options = SearchOptions {
282260
assignee: Some(vec!["Salesforce.com, inc.".to_string()]),
283261
..Default::default()
284262
};
285263
let url = options.to_url().unwrap();
286-
// assignee="Salesforce.com, inc."
287-
// comma inside quotes encoded as %2C. space as %20.
288-
// %22Salesforce.com%2C%20inc.%22
289-
assert!(url.contains("?assignee=%22Salesforce.com%2C+inc.%22"));
264+
// comma encoded as %2C
265+
assert!(url.contains("assignee=Salesforce.com%2C+inc."));
290266

291267
// Test query with assignee
292268
let options = SearchOptions {
@@ -295,11 +271,9 @@ mod tests {
295271
country: None,
296272
..Default::default()
297273
};
298-
// q=foo&assignee="Google LLC"
299-
// q is added via serializer (foo). assignee appended manually (&assignee=...)
300274
let url = options.to_url().unwrap();
301275
assert!(url.contains("q=foo"));
302-
assert!(url.contains("&assignee=%22Google+LLC%22"));
276+
assert!(url.contains("assignee=Google+LLC"));
303277

304278
// Test query with country (JP should add language=JAPANESE)
305279
let options = SearchOptions {

src/core/patent_search.rs

Lines changed: 36 additions & 177 deletions
Original file line number | Diff line number | Diff line change
@@ -3,74 +3,6 @@ use crate::core::{BrowserManager, CdpPage};
33
use crate::core::{Error, Result};
44
use async_trait::async_trait;
55

6-
// API response types for Google Patents /xhr/query endpoint
7-
#[derive(serde::Deserialize)]
8-
struct ApiResponse {
9-
results: ApiResults,
10-
}
11-
12-
#[derive(serde::Deserialize)]
13-
struct ApiResults {
14-
total_num_results: u64,
15-
cluster: Vec<ApiCluster>,
16-
}
17-
18-
#[derive(serde::Deserialize)]
19-
struct ApiCluster {
20-
result: Vec<ApiPatentEntry>,
21-
}
22-
23-
#[derive(serde::Deserialize)]
24-
struct ApiPatentEntry {
25-
patent: ApiPatent,
26-
}
27-
28-
#[derive(serde::Deserialize)]
29-
struct ApiPatent {
30-
title: Option<String>,
31-
snippet: Option<String>,
32-
filing_date: Option<String>,
33-
assignee: Option<String>,
34-
publication_number: Option<String>,
35-
}
36-
37-
fn convert_api_response(api: ApiResponse) -> SearchResult {
38-
let patents = api
39-
.results
40-
.cluster
41-
.iter()
42-
.flat_map(|cluster| cluster.result.iter())
43-
.map(|entry| {
44-
let p = &entry.patent;
45-
let id = p.publication_number.clone().unwrap_or_default();
46-
Patent {
47-
id: id.clone(),
48-
title: p.title.clone().unwrap_or_default(),
49-
abstract_text: None,
50-
description_paragraphs: None,
51-
claims: None,
52-
images: None,
53-
snippet: p.snippet.clone(),
54-
description: None,
55-
filing_date: p.filing_date.clone(),
56-
assignee: p.assignee.clone(),
57-
related_application: None,
58-
claiming_priority: None,
59-
family_applications: None,
60-
legal_status: None,
61-
url: format!("https://patents.google.com/patent/{}", id),
62-
}
63-
})
64-
.collect();
65-
66-
SearchResult {
67-
total_results: api.results.total_num_results.to_string(),
68-
top_assignees: None,
69-
top_cpcs: None,
70-
patents,
71-
}
72-
}
73-
746
#[async_trait]
757
pub trait PatentSearch: Send + Sync {
768
async fn search(&self, options: &SearchOptions) -> Result<SearchResult>;
@@ -174,129 +106,56 @@ impl PatentSearcher {
174106
patents,
175107
})
176108
} else {
177-
// Search results page - fetch via /xhr/query API
178-
let mut all_patents: Vec<Patent> = Vec::new();
109+
// Search results page - scrape from DOM
179110
let limit = options.limit.unwrap_or(10);
180-
let mut total_results_str = "Unknown".to_string();
181-
let mut top_assignees: Option<Vec<crate::core::models::SummaryItem>> = None;
182-
let mut top_cpcs: Option<Vec<crate::core::models::SummaryItem>> = None;
183111

184112
if self.verbose {
185113
eprintln!("Fetching search results (limit: {})...", limit);
186114
}
187115

188-
// Append num=100 to base_url to fetch more results per page if needed
189-
let base_url = if limit > 10 { format!("{}&num=100", base_url) } else { base_url };
190-
191-
// Calculate pagination
192-
let results_per_page = if limit > 10 { 100 } else { 10 };
193-
let pages_needed = limit.div_ceil(results_per_page);
194-
195-
for page_num in 0..pages_needed {
196-
let page_url = if page_num == 0 {
197-
base_url.clone()
198-
} else {
199-
format!("{}&page={}", base_url, page_num)
200-
};
201-
202-
if self.verbose {
203-
eprintln!("Loading page {} of {}...", page_num + 1, pages_needed);
204-
eprintln!("URL: {}", page_url);
205-
}
206-
207-
page.goto(&page_url).await?;
208-
209-
// Check for bot detection / rate limiting page
210-
let title = page
211-
.evaluate("document.title")
212-
.await
213-
.ok()
214-
.and_then(|v| v.as_str().map(String::from))
215-
.unwrap_or_default();
216-
if title == "Sorry..." {
217-
let _ = page.close().await;
218-
return Err(Error::Search(
219-
"Google blocked this request (bot detection / rate limiting). \
220-
The IP address may be temporarily blocked. Try again later."
221-
.to_string(),
222-
));
223-
}
224-
225-
// Build API URL from the search URL
226-
let api_path =
227-
base_url.strip_prefix("https://patents.google.com/").unwrap_or(&base_url);
228-
let api_url = format!("/xhr/query?url={}", api_path);
229-
let fetch_script = format!(
230-
r#"(async () => {{
231-
try {{
232-
const resp = await fetch("{}");
233-
if (!resp.ok) return {{ error: "HTTP " + resp.status }};
234-
return await resp.json();
235-
}} catch(e) {{
236-
return {{ error: e.message }};
237-
}}
238-
}})()"#,
239-
api_url
240-
);
241-
242-
let api_result = page.evaluate(&fetch_script).await?;
243-
244-
if self.verbose {
245-
if let Some(err) = api_result.get("error") {
246-
eprintln!("API error: {}", err);
247-
} else {
248-
eprintln!("API response received");
249-
}
250-
}
251-
252-
let sr = serde_json::from_value::<ApiResponse>(api_result)
253-
.map_err(|e| Error::Search(format!("Failed to parse API response: {}", e)))
254-
.map(convert_api_response)?;
255-
256-
if page_num == 0 {
257-
total_results_str = sr.total_results.clone();
258-
if self.verbose {
259-
eprintln!("Total results found: {}", total_results_str);
260-
}
261-
top_assignees = sr.top_assignees;
262-
top_cpcs = sr.top_cpcs;
263-
}
264-
265-
let page_patents = sr.patents;
266-
267-
if self.verbose {
268-
eprintln!("Found {} patents on this page", page_patents.len());
269-
}
270-
271-
if page_patents.is_empty() {
272-
break;
273-
}
274-
275-
all_patents.extend(page_patents);
276-
277-
if all_patents.len() >= limit {
278-
break;
279-
}
116+
page.goto(&base_url).await?;
117+
118+
// Check for bot detection / rate limiting page
119+
let title = page
120+
.evaluate("document.title")
121+
.await
122+
.ok()
123+
.and_then(|v| v.as_str().map(String::from))
124+
.unwrap_or_default();
125+
if title == "Sorry..." {
126+
let _ = page.close().await;
127+
return Err(Error::Search(
128+
"Google blocked this request (bot detection / rate limiting). \
129+
The IP address may be temporarily blocked. Try again later."
130+
.to_string(),
131+
));
280132
}
133+
134+
if self.verbose {
135+
eprintln!("Waiting for search results to load...");
136+
}
137+
// Wait for search results to render
138+
tokio::time::sleep(std::time::Duration::from_secs(3)).await;
139+
140+
if self.verbose {
141+
eprintln!("Extracting search results from DOM...");
142+
}
143+
let result = page.evaluate(include_str!("scripts/extract_search_results.js")).await?;
144+
let mut sr: SearchResult = serde_json::from_value(result)
145+
.map_err(|e| Error::Search(format!("Failed to parse search results: {}", e)))?;
146+
281147
let _ = page.close().await;
282148

283149
if self.verbose {
284-
eprintln!("Total patents collected: {}", all_patents.len());
150+
eprintln!("Total results found: {}", sr.total_results);
151+
eprintln!("Patents on page: {}", sr.patents.len());
285152
}
286153

287-
if all_patents.len() > limit {
288-
if self.verbose {
289-
eprintln!("Truncating to limit: {}", limit);
290-
}
291-
all_patents.truncate(limit);
154+
if sr.patents.len() > limit {
155+
sr.patents.truncate(limit);
292156
}
293157

294-
Ok(SearchResult {
295-
total_results: total_results_str,
296-
top_assignees,
297-
top_cpcs,
298-
patents: all_patents,
299-
})
158+
Ok(sr)
300159
}
301160
}
302161
}

0 commit comments

Comments
 (0)