-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmain.js
More file actions
96 lines (75 loc) · 3.27 KB
/
main.js
File metadata and controls
96 lines (75 loc) · 3.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
// HTML parser used to query the fetched lesson pages.
const cheerio = require("cheerio");
// Load environment variables from the .env file located next to this script.
require('dotenv').config({ path: __dirname + '/.env' })
// Project-local page fetcher; it is used below as `fetch(url).then(...)`.
const {fetch} = require(__dirname + "/scraper/fetch.js");
// Project-local MD5 helper; used to key already-seen video URLs.
const {MD5} = require(__dirname + "/lib/md5");
// Scraper settings: this file reads `scrapper.parser` (patterns, flags) and
// `scrapper.downloader` (cmdTemplate, outVideo.extension) from it.
let config = require(__dirname + "/config/config.json")
// List of trainings to process; each entry is read for `title` and `paths`.
// The directory is selected by the SCRAPPER_TRAINING_CONFIG_DIR environment variable.
let trainings = require(__dirname + `/config/trainings/${process.env.SCRAPPER_TRAINING_CONFIG_DIR}/trainings.json`);
// Prefix prepended to every training path to build the page URL.
const baseURL = process.env.SCRAPPER_BASE_URL;
/**
 * Failure handler for a lesson page fetch: logs the error and retries the
 * same URL.
 *
 * Must be invoked with `this` bound to the request context
 * (`{url, outVideoDir}`); the retry counter is carried on that context as
 * `retries`, so callers do not have to supply it.
 *
 * The retry chain is returned so that whoever awaits the original request
 * (e.g. a surrounding `Promise.all`) also waits for the retries. Retries are
 * capped so that a permanently unreachable URL cannot loop forever; once the
 * cap is hit the handler logs the failure and settles with `undefined`.
 *
 * @this {{url: string, outVideoDir: string, retries?: number}}
 * @param {*} error - The rejection reason of the failed fetch.
 * @returns {Promise<*>|undefined} The retry chain, or `undefined` after giving up.
 */
let reject = function (error) {
    const maxRetries = 5;
    const retriesDone = this.retries || 0;

    console.log(error);

    if (retriesDone >= maxRetries) {
        console.log(`Giving up on ${this.url} after ${maxRetries} retries`);
        return undefined;
    }

    console.log('Retry for ' + this.url);
    const context = {
        url: this.url,
        outVideoDir: this.outVideoDir,
        retries: retriesDone + 1,
    };
    // Return the chain: without it the caller considers the request finished
    // while the retry is still in flight.
    return fetch(this.url).then(resolve.bind(context), reject.bind(context));
};
/**
 * Success handler for a lesson page fetch: extracts video URLs from the page
 * and prints one download command per unique URL.
 *
 * Must be invoked with `this` bound to the request context
 * (`{url, outVideoDir}`).
 *
 * For every pattern in `config.scrapper.parser.patterns` the page body is
 * scanned; each match that exposes a named capture group `url` yields a
 * command built from `config.scrapper.downloader.cmdTemplate` by substituting
 * `{videoUrl}` and `{outVideoPath}`. The output file is named after the first
 * `<h2>` on the page (trailing dot removed); the second and later videos found
 * by the same pattern get a `-N` suffix. Duplicate URLs (by MD5) are skipped
 * within one pattern. If nothing was found, a "No video" line is logged.
 *
 * NOTE(review): the command is only printed here; neither the URL nor the
 * path is shell-escaped, so the template is presumably expected to quote
 * them — confirm before piping this output into a shell.
 *
 * @this {{url: string, outVideoDir: string}}
 * @param {string} html - Raw HTML of the fetched lesson page.
 * @returns {void}
 */
let resolve = function (html) {
    const url = this.url;
    const outVideoDir = this.outVideoDir;
    const $ = cheerio.load(html);
    const lessonTitle = $("h2").first().text();
    const parserConfig = config.scrapper.parser;
    const downloaderConfig = config.scrapper.downloader;

    // Loop-invariant values, computed once per page.
    const bodyHtml = $('body').html() || '';
    const baseName = lessonTitle.trim().replace(/\.$/, '');
    const videoExtension = downloaderConfig.outVideo.extension;
    let rootPath = process.env.SCRAPPER_DOWNLOAD_PATH || '/home/webscraper/app/download';
    rootPath += rootPath.endsWith("/") ? "" : "/";
    let relativePath = outVideoDir;
    relativePath += relativePath.endsWith("/") ? "" : "/";

    // Repeated exec() only advances through the text when the regex is
    // global; without "g" every call returns the first match again and the
    // scan would never finish.
    const configuredFlags = parserConfig.flags || '';
    const flags = configuredFlags.includes('g') ? configuredFlags : configuredFlags + 'g';

    let videoTotal = 0;
    parserConfig.patterns.forEach(pattern => {
        const regex = new RegExp(pattern, flags);
        const seenHashes = new Set();
        let videoNumber = 1;
        let matched;
        while ((matched = regex.exec(bodyHtml)) !== null) {
            if (matched[0] === '') {
                // A zero-length match does not move lastIndex; step over it.
                regex.lastIndex++;
            }
            const rawUrl = matched.groups === undefined ? undefined : matched.groups.url;
            if (rawUrl === undefined) {
                continue;
            }
            // The body is serialized HTML, so "&" inside attribute values
            // shows up as "&amp;" and has to be decoded to get the real URL.
            const videoUrl = rawUrl.replace(/&amp;/g, "&");
            const videoUrlHash = MD5(videoUrl);
            if (seenHashes.has(videoUrlHash)) {
                continue;
            }
            const videoName = baseName + (videoNumber > 1 ? `-${videoNumber}` : '');
            const outVideoPath = `${rootPath}${relativePath}${videoName}.${videoExtension}`;
            // Function replacers insert the value verbatim; a plain string
            // replacement would expand "$&", "$$" etc. found in a URL or path.
            const cmd = downloaderConfig.cmdTemplate
                .replace('{videoUrl}', () => videoUrl)
                .replace('{outVideoPath}', () => outVideoPath);
            console.log(cmd);
            seenHashes.add(videoUrlHash);
            videoNumber++;
            videoTotal++;
        }
    });

    if (videoTotal === 0) {
        console.log(`No video on ${url}`);
    }
};
// Index (into `trainings`) of the training currently being processed.
let trainingIndex = 0;

/**
 * Fetches every page of one training in parallel, then moves on to the next
 * entry of `trainings` until the list is exhausted.
 *
 * The output directory for a training is derived from its title: every
 * character outside Latin/Cyrillic letters, digits and `-_.,` becomes a
 * space, whitespace runs are collapsed, and each ". " turns into a path
 * separator (so "A. B" becomes "A/B").
 *
 * Each page is requested with `fetch(baseURL + path)` and handed to
 * `resolve` / `reject`, both bound to a `{url, outVideoDir}` context.
 * An error escaping those handlers is logged and does not stop the
 * remaining trainings.
 *
 * @param {{title: string, paths: string[]}|undefined} training - Training to
 *     process; a missing value ends the chain.
 * @returns {Promise<void>} Settles once this and all following trainings are done.
 */
let fetchAll = function (training) {
    if (!training) {
        // Nothing (left) to process, e.g. an empty trainings list.
        return Promise.resolve();
    }

    console.log(`Training ${trainingIndex + 1} from ${trainings.length}: "${training.title}"`);

    const outVideoDir = training.title
        .replace(/[^a-zA-Zа-яА-Я0-9-_.,]/g, ' ')
        .replace(/\s+/g, ' ')
        .replace(/\.\s+/g, '/');

    const requests = (training.paths || []).map(path => {
        const url = baseURL + path;
        const context = {url: url, outVideoDir: outVideoDir};
        return fetch(url).then(resolve.bind(context), reject.bind(context));
    });

    return Promise.all(requests)
        .catch(error => {
            // A handler threw for one of the pages: report it and carry on
            // instead of dying with an unhandled rejection.
            console.log(error);
        })
        .then(() => fetchAll(trainings[++trainingIndex]));
};
// Entry point: start with the first training; fetchAll advances through the rest of the list itself.
fetchAll(trainings[0])