Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ $ so how do i reverse a list in python
# search for a latex solution
$ so --site tex how to put tilde over character

# use google to search stackoverflow.com, askubuntu.com, and unix.stackexchange.com
$ so -e google -s askubuntu -s stackoverflow -s unix how do i install linux
# search stackoverflow.com, askubuntu.com, and unix.stackexchange.com via startpage
$ so -e startpage -s askubuntu -s stackoverflow -s unix how do i install linux
```

## installation
Expand Down Expand Up @@ -169,15 +169,17 @@ StackExchange API with no key up to 300 times per day per IP, which I imagine is
fine for most users.

### search engines
The available search engines are StackExchange, DuckDuckGo, and Google.
The available search engines are StackExchange, Startpage, DuckDuckGo, and Google.
StackExchange will always be the fastest to search because it doesn't require an
additional request or any HTML parsing; however, it is also very primitive.
~~DuckDuckGo is in second place for speed, as its response HTML is much smaller
than Google's. I've found that it performs well for my queries, so it is the
default search engine.~~

DuckDuckGo [sometimes blocks requests](https://github.com/samtay/so/issues/16), so
it is no longer the default.
**Startpage** is the default search engine. It proxies Google search results and
serves them as static HTML, providing high-quality results without requiring
JavaScript.

Google and DuckDuckGo now require JavaScript execution for search results, making
them unreliable from a terminal client. They are still available via `-e google`
or `-e duckduckgo` but may not return results.

### multi-site searching
As stated in the [docs](https://api.stackexchange.com/docs/throttle),
Expand Down
8 changes: 7 additions & 1 deletion benches/html_parsing.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use so::stackexchange::scraper::{DuckDuckGo, Google, Scraper};
use so::stackexchange::scraper::{DuckDuckGo, Google, Scraper, Startpage};
use std::collections::HashMap;
use std::time::Duration;

Expand Down Expand Up @@ -46,6 +46,12 @@ fn bench_html_parsers(c: &mut Criterion) {
|b, html| b.iter(|| DuckDuckGo.parse(html, &sites, limit)),
);

group.bench_with_input(
BenchmarkId::new("Startpage.parse", "exit-vim"),
include_str!("../test/startpage/exit-vim.html"),
|b, html| b.iter(|| Startpage.parse(html, &sites, limit)),
);

let mut sites = HashMap::new();
sites.insert(
String::from("stackoverflow"),
Expand Down
2 changes: 1 addition & 1 deletion src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ where
.num_args(1)
.default_value(&engine)
.value_name("engine")
.value_parser(["duckduckgo", "google", "stackexchange"])
.value_parser(["duckduckgo", "google", "startpage", "stackexchange"])
.help("Use specified search engine")
.next_line_help(true),
);
Expand Down
4 changes: 3 additions & 1 deletion src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@ use crate::utils;
#[derive(Default)]
pub enum SearchEngine {
DuckDuckGo,
#[default]
Google,
#[default]
Startpage,
StackExchange,
}

Expand All @@ -36,6 +37,7 @@ impl fmt::Display for SearchEngine {
let s = match &self {
SearchEngine::DuckDuckGo => "duckduckgo",
SearchEngine::Google => "google",
SearchEngine::Startpage => "startpage",
SearchEngine::StackExchange => "stackexchange",
};
write!(f, "{s}")
Expand Down
159 changes: 158 additions & 1 deletion src/stackexchange/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,163 @@ pub use api::{Answer, Id, Question};
pub use local_storage::{LocalStorage, SiteMap};
pub use search::Search;

/// Mock user agent
use reqwest::header::{self, HeaderMap, HeaderName, HeaderValue};
use reqwest::Client;

/// Mock user agent: a legacy Firefox 11 / Mac OS X 10.7 identification string.
///
/// Kept for the API client, which doesn't need browser spoofing.
/// NOTE(review): the API client call site is not visible from this module —
/// confirm the constant is still referenced there.
const USER_AGENT: &str =
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0";

/// Modern desktop browser User-Agent strings that scraper requests choose from.
/// The pool spans Chrome, Firefox, Safari, and Edge on Windows, macOS, and Linux.
const SCRAPER_USER_AGENTS: &[&str] = &[
    // Windows — Chrome 131
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // macOS — Chrome 131
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Windows — Firefox 133
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    // Linux — Firefox 133
    "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0",
    // macOS — Safari 17.5
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15",
    // Linux — Chrome 131
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Windows — Edge 131
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0",
    // macOS — Firefox 133
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:133.0) Gecko/20100101 Firefox/133.0",
];

/// Pick one entry of [`SCRAPER_USER_AGENTS`], keyed by the current process ID.
///
/// The process ID is reduced modulo the pool length, so every call made by the
/// same process yields the same string, while separate invocations can land
/// on different entries — all without pulling in the `rand` crate.
fn select_scraper_user_agent() -> &'static str {
    let pid = std::process::id() as usize;
    let slot = pid % SCRAPER_USER_AGENTS.len();
    SCRAPER_USER_AGENTS[slot]
}

/// Assemble the default request headers that accompany the selected scraper
/// User-Agent.
///
/// Always sets `User-Agent`, `Accept-Language` and `Upgrade-Insecure-Requests`.
/// The `Accept` value depends on whether the agent string mentions Firefox,
/// and the four `Sec-Fetch-*` headers are added only for agents that mention
/// Chrome or Edge.
fn scraper_headers() -> HeaderMap {
    // `Accept` value used when the agent string contains "Firefox".
    const ACCEPT_FIREFOX: &str =
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    // `Accept` value used for every other agent in the pool.
    const ACCEPT_OTHER: &str =
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8";
    // Fetch-metadata headers, listed in the order they are inserted.
    const SEC_FETCH: [(&str, &str); 4] = [
        ("sec-fetch-dest", "document"),
        ("sec-fetch-mode", "navigate"),
        ("sec-fetch-site", "none"),
        ("sec-fetch-user", "?1"),
    ];

    let agent = select_scraper_user_agent();
    let is_firefox = agent.contains("Firefox");
    let sends_sec_fetch = agent.contains("Chrome") || agent.contains("Edg/");

    let mut map = HeaderMap::new();
    map.insert(header::USER_AGENT, HeaderValue::from_static(agent));
    map.insert(
        header::ACCEPT_LANGUAGE,
        HeaderValue::from_static("en-US,en;q=0.9"),
    );
    map.insert(
        HeaderName::from_static("upgrade-insecure-requests"),
        HeaderValue::from_static("1"),
    );

    // Accept is chosen per browser family.
    let accept = if is_firefox { ACCEPT_FIREFOX } else { ACCEPT_OTHER };
    map.insert(header::ACCEPT, HeaderValue::from_static(accept));

    // Only the Chrome and Edge agents get the Sec-Fetch-* set.
    if sends_sec_fetch {
        for &(name, value) in SEC_FETCH.iter() {
            map.insert(
                HeaderName::from_static(name),
                HeaderValue::from_static(value),
            );
        }
    }

    map
}

/// Create the HTTP client used for scraping search engines, pre-loaded with
/// the browser-like default headers from `scraper_headers`.
///
/// # Panics
///
/// Panics with "Failed to build scraper HTTP client" if the underlying
/// builder returns an error.
pub(crate) fn scraper_client() -> Client {
    let default_headers = scraper_headers();
    let builder = Client::builder().default_headers(default_headers);
    builder
        .build()
        .expect("Failed to build scraper HTTP client")
}

#[cfg(test)]
mod tests {
    use super::*;

    /// No pooled agent may carry the outdated Firefox 11 / Mac OS X 10.7 tokens.
    #[test]
    fn test_user_agent_pool_contains_only_modern_agents() {
        SCRAPER_USER_AGENTS.iter().for_each(|agent| {
            assert!(!agent.contains("Firefox/11.0"), "{}", agent);
            assert!(!agent.contains("Mac OS X 10.7"), "{}", agent);
        });
    }

    /// Whatever gets selected has to be a member of the pool.
    #[test]
    fn test_select_scraper_user_agent_returns_from_pool() {
        let chosen = select_scraper_user_agent();
        let in_pool = SCRAPER_USER_AGENTS
            .iter()
            .any(|candidate| *candidate == chosen);
        assert!(in_pool);
    }

    /// Two selections inside one process must agree.
    #[test]
    fn test_select_scraper_user_agent_is_deterministic() {
        let first = select_scraper_user_agent();
        let second = select_scraper_user_agent();
        assert_eq!(
            first, second,
            "Same process should always select the same UA"
        );
    }

    /// The headers every agent gets, regardless of browser family.
    #[test]
    fn test_scraper_headers_include_required_fields() {
        let map = scraper_headers();
        assert!(map.get(header::USER_AGENT).is_some());
        assert!(map.get(header::ACCEPT).is_some());
        assert!(map.get(header::ACCEPT_LANGUAGE).is_some());
        assert!(map.get("upgrade-insecure-requests").is_some());
    }

    /// Family-specific headers must agree with the agent that was selected.
    #[test]
    fn test_scraper_headers_match_browser_family() {
        let map = scraper_headers();
        let agent = map.get(header::USER_AGENT).unwrap().to_str().unwrap();
        let accept = map.get(header::ACCEPT).unwrap().to_str().unwrap();

        let is_firefox = agent.contains("Firefox");
        let is_chromium = agent.contains("Chrome") || agent.contains("Edg/");

        // Firefox uses a shorter Accept without image types
        assert!(
            !is_firefox || !accept.contains("image/avif"),
            "Firefox UA should not have Chrome-style Accept"
        );

        // Chrome/Edge should have Sec-Fetch headers
        assert!(
            !is_chromium || map.contains_key("sec-fetch-dest"),
            "Chrome/Edge UA should have sec-fetch-dest"
        );
    }

    /// Building the client must not panic.
    #[test]
    fn test_scraper_client_builds_successfully() {
        let _ = scraper_client();
    }
}
59 changes: 57 additions & 2 deletions src/stackexchange/scraper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use crate::error::{Error, Result};
/// DuckDuckGo URL
const DUCKDUCKGO_URL: &str = "https://duckduckgo.com";
const GOOGLE_URL: &str = "https://google.com/search";
const STARTPAGE_URL: &str = "https://www.startpage.com/do/search";

// Is question_id unique across all sites? If not, then this edge case is
// unaccounted for when sorting.
Expand Down Expand Up @@ -89,8 +90,7 @@ impl Scraper for Google {
parse_with_selector(anchors, html, sites, limit)
}

/// Creates duckduckgo search url given sites and query
/// See https://duckduckgo.com/params for more info
/// Creates google search url given sites and query
fn get_url<'a, I>(&self, query: &str, sites: I) -> Url
where
I: IntoIterator<Item = &'a String>,
Expand All @@ -100,6 +100,31 @@ impl Scraper for Google {
}
}

/// Scraper backed by Startpage search.
pub struct Startpage;

impl Scraper for Startpage {
    /// Extract SE data from a Startpage results page.
    ///
    /// Result links are located with the `a.result-title` CSS selector and
    /// handed to `parse_with_selector` together with `sites` and `limit`.
    /// Startpage proxies Google results and serves them as static HTML.
    fn parse(
        &self,
        html: &str,
        sites: &HashMap<String, String>,
        limit: u16,
    ) -> Result<ScrapedData> {
        let result_links = Selector::parse("a.result-title").unwrap();
        parse_with_selector(result_links, html, sites, limit)
    }

    /// Build the Startpage search URL for `query` restricted to `sites`.
    ///
    /// The combined search string from `make_query_arg` is sent as the single
    /// `q` parameter of `STARTPAGE_URL`.
    fn get_url<'a, I>(&self, query: &str, sites: I) -> Url
    where
        I: IntoIterator<Item = &'a String>,
    {
        let search_terms = make_query_arg(query, sites);
        let params = [("q", search_terms.as_str())];
        Url::parse_with_params(STARTPAGE_URL, &params).unwrap()
    }
}

fn make_query_arg<'a, I>(query: &str, sites: I) -> String
where
I: IntoIterator<Item = &'a String>,
Expand Down Expand Up @@ -330,6 +355,36 @@ mod tests {
}
}

/// The Startpage URL carries the site filter and the query in one `q` parameter.
#[test]
fn test_startpage_url() {
    let sites = vec![
        String::from("stackoverflow.com"),
        String::from("unix.stackexchange.com"),
    ];
    let url = Startpage.get_url("how do I exit vim?", &sites);
    let expected = concat!(
        "https://www.startpage.com/do/search",
        "?q=%28site%3Astackoverflow.com+OR+site%3Aunix.stackexchange.com%29",
        "+how+do+I+exit+vim",
    );
    assert_eq!(url.as_str(), expected);
}

/// Parsing the saved Startpage fixture yields three stackoverflow question ids,
/// the first of which is 11828270.
#[test]
fn test_startpage_parser() {
    let sites: HashMap<String, String> = std::iter::once((
        String::from("stackoverflow"),
        String::from("stackoverflow.com"),
    ))
    .collect();
    let fixture = include_str!("../../test/startpage/exit-vim.html");

    let scraped = Startpage.parse(fixture, &sites, 3).unwrap();
    let ids = &scraped.question_ids["stackoverflow"];

    assert_eq!(ids.len(), 3);
    assert_eq!(ids[0], "11828270");
}

#[test]
fn test_question_url_to_id() {
// Happy path
Expand Down
8 changes: 3 additions & 5 deletions src/stackexchange/search.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
use futures::stream::StreamExt;
use rayon::prelude::*;
use reqwest::header;
use reqwest::Client;
use std::sync::Arc;

use crate::config::{Config, SearchEngine};
Expand All @@ -11,7 +9,7 @@ use crate::tui::markdown::Markdown;

use super::api::{Answer, Api, Question};
use super::local_storage::SiteMap;
use super::scraper::{DuckDuckGo, Google, ScrapedData, Scraper};
use super::scraper::{DuckDuckGo, Google, ScrapedData, Scraper, Startpage};

/// Limit on concurrent requests (gets passed to `buffer_unordered`)
const CONCURRENT_REQUESTS_LIMIT: usize = 8;
Expand Down Expand Up @@ -84,6 +82,7 @@ impl Search {
match self.config.search_engine {
SearchEngine::DuckDuckGo => self.search_by_scraper(DuckDuckGo).await,
SearchEngine::Google => self.search_by_scraper(Google).await,
SearchEngine::Startpage => self.search_by_scraper(Startpage).await,
SearchEngine::StackExchange => self.parallel_search_advanced().await,
}
.and_then(|qs| {
Expand All @@ -98,9 +97,8 @@ impl Search {
/// Search query with the given scraper's search engine and then fetch the resulting questions from SE.
async fn search_by_scraper(&self, scraper: impl Scraper) -> Result<Vec<Question<String>>> {
let url = scraper.get_url(&self.query, self.site_map.values());
let html = Client::new()
let html = super::scraper_client()
.get(url)
.header(header::USER_AGENT, super::USER_AGENT)
.send()
.await?
.text()
Expand Down
Loading