diff --git a/README.md b/README.md index bd2448a..be0e0b4 100644 --- a/README.md +++ b/README.md @@ -34,8 +34,8 @@ $ so how do i reverse a list in python # search for a latex solution $ so --site tex how to put tilde over character -# use google to search stackoverflow.com, askubuntu.com, and unix.stackexchange.com -$ so -e google -s askubuntu -s stackoverflow -s unix how do i install linux +# search stackoverflow.com, askubuntu.com, and unix.stackexchange.com via startpage +$ so -e startpage -s askubuntu -s stackoverflow -s unix how do i install linux ``` ## installation @@ -169,15 +169,17 @@ StackExchange API with no key up to 300 times per day per IP, which I imagine is fine for most users. ### search engines -The available search engines are StackExchange, DuckDuckGo, and Google. +The available search engines are StackExchange, Startpage, DuckDuckGo, and Google. StackExchange will always be the fastest to search because it doesn't require an additional request or any HTML parsing; however, it is also very primitive. -~~DuckDuckGo is in second place for speed, as its response HTML is much smaller -than Google's. I've found that it performs well for my queries, so it is the -default search engine.~~ -DuckDuckGo [sometimes blocks requests](https://github.com/samtay/so/issues/16), so -it is no longer the default. +**Startpage** is the default search engine. It proxies Google search results and +serves them as static HTML, providing high quality results without requiring +JavaScript. + +Google and DuckDuckGo now require JavaScript execution for search results, making +them unreliable from a terminal client. They are still available via `-e google` +or `-e duckduckgo` but may not return results. ### multi-site searching As stated in the [docs](https://api.stackexchange.com/docs/throttle), diff --git a/benches/html_parsing.rs b/benches/html_parsing.rs index 6ab6c39..311d309 100644 --- a/benches/html_parsing.rs +++ b/benches/html_parsing.rs @@ -1,5 +1,5 @@ use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; -use so::stackexchange::scraper::{DuckDuckGo, Google, Scraper}; +use so::stackexchange::scraper::{DuckDuckGo, Google, Scraper, Startpage}; use std::collections::HashMap; use std::time::Duration; @@ -46,6 +46,12 @@ fn bench_html_parsers(c: &mut Criterion) { |b, html| b.iter(|| DuckDuckGo.parse(html, &sites, limit)), ); + group.bench_with_input( + BenchmarkId::new("Startpage.parse", "exit-vim"), + include_str!("../test/startpage/exit-vim.html"), + |b, html| b.iter(|| Startpage.parse(html, &sites, limit)), + ); + let mut sites = HashMap::new(); sites.insert( String::from("stackoverflow"), diff --git a/src/cli.rs b/src/cli.rs index 39b30e5..136b739 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -117,7 +117,7 @@ where .num_args(1) .default_value(&engine) .value_name("engine") - .value_parser(["duckduckgo", "google", "stackexchange"]) + .value_parser(["duckduckgo", "google", "startpage", "stackexchange"]) .help("Use specified search engine") .next_line_help(true), ); diff --git a/src/config.rs b/src/config.rs index e11ebee..086bbde 100644 --- a/src/config.rs +++ b/src/config.rs @@ -15,8 +15,9 @@ use crate::utils; #[derive(Default)] pub enum SearchEngine { DuckDuckGo, - #[default] Google, + #[default] + Startpage, StackExchange, } @@ -36,6 +37,7 @@ impl fmt::Display for SearchEngine { let s = match &self { SearchEngine::DuckDuckGo => "duckduckgo", SearchEngine::Google => "google", + SearchEngine::Startpage => "startpage", SearchEngine::StackExchange => "stackexchange", }; write!(f, "{s}") diff --git a/src/stackexchange/mod.rs b/src/stackexchange/mod.rs index 1a66cdf..1a3307c 100644 --- a/src/stackexchange/mod.rs +++ b/src/stackexchange/mod.rs @@ -8,6 +8,163 @@ pub use api::{Answer, Id, Question}; pub use local_storage::{LocalStorage, SiteMap}; pub use search::Search; -/// Mock user agent +use reqwest::header::{self, HeaderMap, HeaderName, HeaderValue}; +use reqwest::Client; + +/// Mock user agent (kept for API client which doesn't need browser spoofing) const USER_AGENT: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0"; + +/// Pool of modern browser User-Agent strings for scraper requests. +/// Covers Chrome, Firefox, Safari, and Edge across Windows, macOS, and Linux. +const SCRAPER_USER_AGENTS: &[&str] = &[ + // Chrome 131 - Windows + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + // Chrome 131 - macOS + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + // Firefox 133 - Windows + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0", + // Firefox 133 - Linux + "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0", + // Safari 17.5 - macOS + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15", + // Chrome 131 - Linux + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + // Edge 131 - Windows + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0", + // Firefox 133 - macOS + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:133.0) Gecko/20100101 Firefox/133.0", +]; + +/// Select a User-Agent from the pool using the process ID for variation +/// across invocations without requiring the `rand` crate. +fn select_scraper_user_agent() -> &'static str { + let index = (std::process::id() as usize) % SCRAPER_USER_AGENTS.len(); + SCRAPER_USER_AGENTS[index] +} + +/// Build browser-like default headers matched to the selected User-Agent. +fn scraper_headers() -> HeaderMap { + let ua = select_scraper_user_agent(); + let mut headers = HeaderMap::new(); + + headers.insert(header::USER_AGENT, HeaderValue::from_static(ua)); + headers.insert( + header::ACCEPT_LANGUAGE, + HeaderValue::from_static("en-US,en;q=0.9"), + ); + headers.insert( + HeaderName::from_static("upgrade-insecure-requests"), + HeaderValue::from_static("1"), + ); + + // Match Accept header to browser family + if ua.contains("Firefox") { + headers.insert( + header::ACCEPT, + HeaderValue::from_static( + "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + ), + ); + } else { + headers.insert( + header::ACCEPT, + HeaderValue::from_static( + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8", + ), + ); + } + + // Chrome and Edge send Sec-Fetch-* headers + if ua.contains("Chrome") || ua.contains("Edg/") { + headers.insert( + HeaderName::from_static("sec-fetch-dest"), + HeaderValue::from_static("document"), + ); + headers.insert( + HeaderName::from_static("sec-fetch-mode"), + HeaderValue::from_static("navigate"), + ); + headers.insert( + HeaderName::from_static("sec-fetch-site"), + HeaderValue::from_static("none"), + ); + headers.insert( + HeaderName::from_static("sec-fetch-user"), + HeaderValue::from_static("?1"), + ); + } + + headers +} + +/// Build an HTTP client with browser-like headers for scraping search engines. +pub(crate) fn scraper_client() -> Client { + Client::builder() + .default_headers(scraper_headers()) + .build() + .expect("Failed to build scraper HTTP client") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_user_agent_pool_contains_only_modern_agents() { + for ua in SCRAPER_USER_AGENTS { + assert!(!ua.contains("Firefox/11.0"), "{}", ua); + assert!(!ua.contains("Mac OS X 10.7"), "{}", ua); + } + } + + #[test] + fn test_select_scraper_user_agent_returns_from_pool() { + let ua = select_scraper_user_agent(); + assert!(SCRAPER_USER_AGENTS.contains(&ua)); + } + + #[test] + fn test_select_scraper_user_agent_is_deterministic() { + let ua1 = select_scraper_user_agent(); + let ua2 = select_scraper_user_agent(); + assert_eq!(ua1, ua2, "Same process should always select the same UA"); + } + + #[test] + fn test_scraper_headers_include_required_fields() { + let headers = scraper_headers(); + assert!(headers.contains_key(header::USER_AGENT)); + assert!(headers.contains_key(header::ACCEPT)); + assert!(headers.contains_key(header::ACCEPT_LANGUAGE)); + assert!(headers.contains_key("upgrade-insecure-requests")); + } + + #[test] + fn test_scraper_headers_match_browser_family() { + let headers = scraper_headers(); + let ua = headers.get(header::USER_AGENT).unwrap().to_str().unwrap(); + let accept = headers.get(header::ACCEPT).unwrap().to_str().unwrap(); + + if ua.contains("Firefox") { + // Firefox uses a shorter Accept without image types + assert!( + !accept.contains("image/avif"), + "Firefox UA should not have Chrome-style Accept" + ); + } + + if ua.contains("Chrome") || ua.contains("Edg/") { + // Chrome/Edge should have Sec-Fetch headers + assert!( + headers.contains_key("sec-fetch-dest"), + "Chrome/Edge UA should have sec-fetch-dest" + ); + } + } + + #[test] + fn test_scraper_client_builds_successfully() { + let _client = scraper_client(); + } +} diff --git a/src/stackexchange/scraper.rs b/src/stackexchange/scraper.rs index 3cc0218..bf2b806 100644 --- a/src/stackexchange/scraper.rs +++ b/src/stackexchange/scraper.rs @@ -10,6 +10,7 @@ use crate::error::{Error, Result}; /// DuckDuckGo URL const DUCKDUCKGO_URL: &str = "https://duckduckgo.com"; const GOOGLE_URL: &str = "https://google.com/search"; +const STARTPAGE_URL: &str = "https://www.startpage.com/do/search"; // Is question_id unique across all sites? If not, then this edge case is // unaccounted for when sorting. @@ -89,8 +90,7 @@ impl Scraper for Google { parse_with_selector(anchors, html, sites, limit) } - /// Creates duckduckgo search url given sites and query - /// See https://duckduckgo.com/params for more info + /// Creates google search url given sites and query fn get_url<'a, I>(&self, query: &str, sites: I) -> Url where I: IntoIterator, @@ -100,6 +100,31 @@ impl Scraper for Google { } } +pub struct Startpage; + +impl Scraper for Startpage { + /// Parse SE data out of Startpage search results html. + /// Startpage proxies Google results and serves them as static HTML. + fn parse( + &self, + html: &str, + sites: &HashMap, + limit: u16, + ) -> Result { + let anchors = Selector::parse("a.result-title").unwrap(); + parse_with_selector(anchors, html, sites, limit) + } + + /// Creates Startpage search url given sites and query + fn get_url<'a, I>(&self, query: &str, sites: I) -> Url + where + I: IntoIterator, + { + let q = make_query_arg(query, sites); + Url::parse_with_params(STARTPAGE_URL, &[("q", q.as_str())]).unwrap() + } +} + fn make_query_arg<'a, I>(query: &str, sites: I) -> String where I: IntoIterator, @@ -330,6 +355,36 @@ mod tests { } } + #[test] + fn test_startpage_url() { + let q = "how do I exit vim?"; + let sites = vec![ + String::from("stackoverflow.com"), + String::from("unix.stackexchange.com"), + ]; + assert_eq!( + Startpage.get_url(q, &sites).as_str(), + String::from( + "https://www.startpage.com/do/search\ + ?q=%28site%3Astackoverflow.com+OR+site%3Aunix.stackexchange.com%29\ + +how+do+I+exit+vim" + ) + ) + } + + #[test] + fn test_startpage_parser() { + let html = include_str!("../../test/startpage/exit-vim.html"); + let mut sites = HashMap::new(); + sites.insert( + String::from("stackoverflow"), + String::from("stackoverflow.com"), + ); + let data = Startpage.parse(html, &sites, 3).unwrap(); + assert_eq!(data.question_ids["stackoverflow"].len(), 3); + assert_eq!(data.question_ids["stackoverflow"][0], "11828270"); + } + #[test] fn test_question_url_to_id() { // Happy path diff --git a/src/stackexchange/search.rs b/src/stackexchange/search.rs index 8b40615..2ac2d71 100644 --- a/src/stackexchange/search.rs +++ b/src/stackexchange/search.rs @@ -1,7 +1,5 @@ use futures::stream::StreamExt; use rayon::prelude::*; -use reqwest::header; -use reqwest::Client; use std::sync::Arc; use crate::config::{Config, SearchEngine}; @@ -11,7 +9,7 @@ use crate::tui::markdown::Markdown; use super::api::{Answer, Api, Question}; use super::local_storage::SiteMap; -use super::scraper::{DuckDuckGo, Google, ScrapedData, Scraper}; +use super::scraper::{DuckDuckGo, Google, ScrapedData, Scraper, Startpage}; /// Limit on concurrent requests (gets passed to `buffer_unordered`) const CONCURRENT_REQUESTS_LIMIT: usize = 8; @@ -84,6 +82,7 @@ impl Search { match self.config.search_engine { SearchEngine::DuckDuckGo => self.search_by_scraper(DuckDuckGo).await, SearchEngine::Google => self.search_by_scraper(Google).await, + SearchEngine::Startpage => self.search_by_scraper(Startpage).await, SearchEngine::StackExchange => self.parallel_search_advanced().await, } .and_then(|qs| { @@ -98,9 +97,8 @@ impl Search { /// Search query at duckduckgo and then fetch the resulting questions from SE. async fn search_by_scraper(&self, scraper: impl Scraper) -> Result>> { let url = scraper.get_url(&self.query, self.site_map.values()); - let html = Client::new() + let html = super::scraper_client() .get(url) - .header(header::USER_AGENT, super::USER_AGENT) .send() .await? .text() diff --git a/test/startpage/exit-vim.html b/test/startpage/exit-vim.html new file mode 100644 index 0000000..4e50ad1 --- /dev/null +++ b/test/startpage/exit-vim.html @@ -0,0 +1,822 @@ + + + + + Startpage Search Results + + + + + + + + + + + + + +
skip to main content
+
+
Web results
favicon

How do I exit Vim? - Stack Overflow

Aug 6, 2012 ... 218. Are you just trying to quit VIM ? · 80. Don't forget the colon! · 164. It's really easy to learn the basics of vim, and it's built right into ...

+
+
You're all set! Your searches are now private with Startpage.
No Search History
No Ad Targeting
No Digital Footprint
+ + + + + + + +
+ + +