-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmain.rs
More file actions
103 lines (89 loc) · 2.69 KB
/
main.rs
File metadata and controls
103 lines (89 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
mod io;
mod logger;
mod parser;
mod types;
mod urls;
use lazy_static::lazy_static;
use std::time::Duration;
use tokio::{fs, io::AsyncWriteExt};
use anyhow::Context;
use tokio::time::sleep;
use types::*;
use scraper::{Html, Selector};
use urls::URLS;
/// Path of the JSON file that receives the scraped results.
const RESULT_FILE: &str = "./data.json";

/// Directory used for caching downloaded pages.
fn cache_dir() -> String {
    String::from("./.cache")
}
#[tokio::main(flavor = "multi_thread")]
async fn main() {
    println!("[log] starting...");
    // Best effort: the cache directory may already exist from a previous
    // run, so the error is deliberately ignored.
    let _ = fs::DirBuilder::new().create(cache_dir()).await;
    let mut file = fs::File::create(RESULT_FILE)
        .await
        .expect("Failed to create file");
    file.write_all("[".as_bytes()).await.unwrap();
    let total = URLS.len();
    let mut logger = logger::Logger::new(total);
    for (i, &(faculty_name, base_url)) in URLS.iter().enumerate() {
        let courses = get_courses_of(base_url).await;
        logger.done(faculty_name);
        let result = Entry {
            name: faculty_name.to_owned(),
            courses,
        };
        // BUG FIX: write the separator *before* every entry except the
        // first. The previous code appended a comma after each entry,
        // leaving a trailing ",]" that makes the output invalid JSON
        // (RFC 8259 forbids trailing commas in arrays).
        if i > 0 {
            file.write_all(",".as_bytes()).await.unwrap();
        }
        io::write_to(&mut file, result).await.unwrap();
    }
    file.write_all("]".as_bytes()).await.unwrap();
    logger.close().unwrap();
}
/// Fetch and parse every course reachable from the faculty index at
/// `base_url`.
///
/// All detail pages are requested concurrently via `join_all`, and the
/// parsed courses of every page are flattened into one vector. A page
/// that fails to parse aborts the program (`unwrap` with the offending
/// URL attached as context).
async fn get_courses_of(base_url: &str) -> Vec<Course> {
    let detail_urls = page_index_pages(base_url).await;

    let mut fetches = Vec::with_capacity(detail_urls.len());
    for url in detail_urls {
        fetches.push(async move {
            let document = scrape(&url).await;
            parser::parse_course_info(document).context(url).unwrap()
        });
    }

    let mut all_courses = Vec::new();
    for parsed in futures::future::join_all(fetches).await {
        all_courses.extend(parsed);
    }
    all_courses
}
// CSS selector matching the "detail" link on each course card of a
// catalog search-result page; compiled once and reused for every page.
lazy_static! {
    static ref DETAIL_BUTTONS: Selector =
        Selector::parse(".catalog-search-result-card-header-detail-link")
            .expect("invalid selector");
}
// Site origin used to turn the relative `href` values found on the
// result cards into absolute catalog URLs.
const BASE_URL: &str = "https://catalog.he.u-tokyo.ac.jp/";
/// Walk the paginated search results rooted at `base_url` and collect
/// the absolute URL of every course-detail page.
///
/// Pages are fetched as `base_url` followed by a page number starting
/// at 0; the walk stops at the first page that contains no detail
/// links.
async fn page_index_pages(base_url: &str) -> Vec<String> {
    let mut detail_urls: Vec<String> = Vec::new();
    for page in 0.. {
        let html = scrape(&format!("{base_url}{page}")).await;
        // Relative hrefs on the cards are resolved against BASE_URL.
        let links: Vec<String> = html
            .select(&DETAIL_BUTTONS)
            .map(|card| format!("{BASE_URL}{}", card.attr("href").unwrap()))
            .collect();
        if links.is_empty() {
            break;
        }
        detail_urls.extend(links);
    }
    detail_urls
}
/// Download `url` and parse the response body as an HTML document.
///
/// Retries up to 10 times, logging each failure and pausing 200 ms
/// between attempts; panics once every attempt has failed.
async fn scrape(url: &str) -> Html {
    let mut attempt: u32 = 0;
    while attempt < 10 {
        attempt += 1;
        match io::request(url).await {
            Ok(body) => return Html::parse_document(&body),
            Err(err) => {
                eprintln!("request error: {err} for {} times", attempt);
                sleep(Duration::from_millis(200)).await;
            }
        }
    }
    panic!("Request failed too many times");
}