Skip to content

Commit 3943b1e

Browse files
DavidTeju and claude committed
Add Startpage as search backend and make it the default
Google and DuckDuckGo now require JavaScript execution for search results, making their scraper backends non-functional. Startpage proxies Google results as static HTML that can be scraped without JS.

- Add Startpage scraper with `a.result-title` CSS selector
- Set Startpage as the default search engine for new installs
- Add test fixture and parser test for Startpage results
- Existing user configs are preserved (no forced migration)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent b5d5a8e commit 3943b1e

5 files changed

Lines changed: 884 additions & 3 deletions

File tree

src/cli.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ where
117117
.num_args(1)
118118
.default_value(&engine)
119119
.value_name("engine")
120-
.value_parser(["duckduckgo", "google", "stackexchange"])
120+
.value_parser(["duckduckgo", "google", "startpage", "stackexchange"])
121121
.help("Use specified search engine")
122122
.next_line_help(true),
123123
);

src/config.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@ use crate::utils;
1515
#[derive(Default)]
1616
pub enum SearchEngine {
1717
DuckDuckGo,
18-
#[default]
1918
Google,
19+
#[default]
20+
Startpage,
2021
StackExchange,
2122
}
2223

@@ -36,6 +37,7 @@ impl fmt::Display for SearchEngine {
3637
let s = match &self {
3738
SearchEngine::DuckDuckGo => "duckduckgo",
3839
SearchEngine::Google => "google",
40+
SearchEngine::Startpage => "startpage",
3941
SearchEngine::StackExchange => "stackexchange",
4042
};
4143
write!(f, "{s}")

src/stackexchange/scraper.rs

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use crate::error::{Error, Result};
1010
/// DuckDuckGo URL
1111
const DUCKDUCKGO_URL: &str = "https://duckduckgo.com";
1212
const GOOGLE_URL: &str = "https://google.com/search";
13+
const STARTPAGE_URL: &str = "https://www.startpage.com/do/search";
1314

1415
// Is question_id unique across all sites? If not, then this edge case is
1516
// unaccounted for when sorting.
@@ -100,6 +101,31 @@ impl Scraper for Google {
100101
}
101102
}
102103

104+
pub struct Startpage;
105+
106+
impl Scraper for Startpage {
107+
/// Parse SE data out of Startpage search results html.
108+
/// Startpage proxies Google results and serves them as static HTML.
109+
fn parse(
110+
&self,
111+
html: &str,
112+
sites: &HashMap<String, String>,
113+
limit: u16,
114+
) -> Result<ScrapedData> {
115+
let anchors = Selector::parse("a.result-title").unwrap();
116+
parse_with_selector(anchors, html, sites, limit)
117+
}
118+
119+
/// Creates Startpage search url given sites and query
120+
fn get_url<'a, I>(&self, query: &str, sites: I) -> Url
121+
where
122+
I: IntoIterator<Item = &'a String>,
123+
{
124+
let q = make_query_arg(query, sites);
125+
Url::parse_with_params(STARTPAGE_URL, &[("q", q.as_str())]).unwrap()
126+
}
127+
}
128+
103129
fn make_query_arg<'a, I>(query: &str, sites: I) -> String
104130
where
105131
I: IntoIterator<Item = &'a String>,
@@ -330,6 +356,36 @@ mod tests {
330356
}
331357
}
332358

359+
#[test]
360+
fn test_startpage_url() {
361+
let q = "how do I exit vim?";
362+
let sites = vec![
363+
String::from("stackoverflow.com"),
364+
String::from("unix.stackexchange.com"),
365+
];
366+
assert_eq!(
367+
Startpage.get_url(q, &sites).as_str(),
368+
String::from(
369+
"https://www.startpage.com/do/search\
370+
?q=%28site%3Astackoverflow.com+OR+site%3Aunix.stackexchange.com%29\
371+
+how+do+I+exit+vim"
372+
)
373+
)
374+
}
375+
376+
#[test]
377+
fn test_startpage_parser() {
378+
let html = include_str!("../../test/startpage/exit-vim.html");
379+
let mut sites = HashMap::new();
380+
sites.insert(
381+
String::from("stackoverflow"),
382+
String::from("stackoverflow.com"),
383+
);
384+
let data = Startpage.parse(html, &sites, 3).unwrap();
385+
assert_eq!(data.question_ids["stackoverflow"].len(), 3);
386+
assert_eq!(data.question_ids["stackoverflow"][0], "11828270");
387+
}
388+
333389
#[test]
334390
fn test_question_url_to_id() {
335391
// Happy path

src/stackexchange/search.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use crate::tui::markdown::Markdown;
99

1010
use super::api::{Answer, Api, Question};
1111
use super::local_storage::SiteMap;
12-
use super::scraper::{DuckDuckGo, Google, ScrapedData, Scraper};
12+
use super::scraper::{DuckDuckGo, Google, ScrapedData, Scraper, Startpage};
1313

1414
/// Limit on concurrent requests (gets passed to `buffer_unordered`)
1515
const CONCURRENT_REQUESTS_LIMIT: usize = 8;
@@ -82,6 +82,7 @@ impl Search {
8282
match self.config.search_engine {
8383
SearchEngine::DuckDuckGo => self.search_by_scraper(DuckDuckGo).await,
8484
SearchEngine::Google => self.search_by_scraper(Google).await,
85+
SearchEngine::Startpage => self.search_by_scraper(Startpage).await,
8586
SearchEngine::StackExchange => self.parallel_search_advanced().await,
8687
}
8788
.and_then(|qs| {

0 commit comments

Comments
 (0)