-
Notifications
You must be signed in to change notification settings - Fork 61
Expand file tree
/
Copy path1236-web-crawler.js
More file actions
69 lines (61 loc) · 2.25 KB
/
1236-web-crawler.js
File metadata and controls
69 lines (61 loc) · 2.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
/**
* 1236. Web Crawler
* https://leetcode.com/problems/web-crawler/
* Difficulty: Medium
*
* Given a url startUrl and an interface HtmlParser, implement a web crawler to crawl all
* links that are under the same hostname as startUrl.
*
* Return all urls obtained by your web crawler in any order.
*
* Your crawler should:
* - Start from the page: startUrl
* - Call HtmlParser.getUrls(url) to get all urls from a webpage of given url.
* - Do not crawl the same link twice.
* - Explore only the links that are under the same hostname as startUrl.
*
* As shown in the example url above, the hostname is example.org. For simplicity sake, you may
* assume all urls use http protocol without any port specified. For example, the urls
* http://leetcode.com/problems and http://leetcode.com/contest are under the same hostname,
* while urls http://example.org/test and http://example.com/abc are not under the same hostname.
*
* The HtmlParser interface is defined as such:
* interface HtmlParser {
* // Return a list of all urls from a webpage of given url.
* public List<String> getUrls(String url);
* }
*
* Below are two examples explaining the functionality of the problem, for custom testing purposes
* you'll have three variables urls, edges and startUrl. Notice that you will only have access to
* startUrl in your code, while urls and edges are not directly accessible to you in code.
*
* Note: Consider the same URL with the trailing slash "/" as a different URL. For example,
* "http://news.yahoo.com", and "http://news.yahoo.com/" are different urls.
*/
/**
 * Crawls every page reachable from startUrl that shares its hostname,
 * using an iterative BFS. Each page is fetched via htmlParser at most once.
 *
 * @param {string} startUrl - The page to begin crawling from.
 * @param {HtmlParser} htmlParser - Exposes getUrls(url) => string[] of links on that page.
 * @return {string[]} All crawled urls under the same hostname, in discovery order.
 */
var crawl = function(startUrl, htmlParser) {
  // Hostname is the third '/'-separated segment of "http://hostname/path...".
  // The problem guarantees http protocol with no port, so this is safe.
  const getHostname = url => url.split('/')[2];

  const targetHostname = getHostname(startUrl);

  // Mark urls as visited at enqueue time (not dequeue time) so the same
  // url is never pushed onto the queue twice, even when many pages link to it.
  const visited = new Set([startUrl]);
  const queue = [startUrl];

  // Advance a head index instead of calling Array#shift(), which is O(n)
  // per dequeue and would make the whole crawl O(n^2).
  for (let head = 0; head < queue.length; head++) {
    const currentUrl = queue[head];
    for (const url of htmlParser.getUrls(currentUrl)) {
      if (!visited.has(url) && getHostname(url) === targetHostname) {
        visited.add(url);
        queue.push(url);
      }
    }
  }

  // Every enqueued url was crawled exactly once, so the queue doubles
  // as the result list in BFS discovery order.
  return queue;
};