-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.R
More file actions
executable file
·58 lines (52 loc) · 2.5 KB
/
scrape.R
File metadata and controls
executable file
·58 lines (52 loc) · 2.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
library(magrittr) # to use %>% operator
library(rvest) # for web scraping
###
# scrape is a function that creates a data frame corresponding to
# the table on a webpage. Has been tested only with pages from teamrankings.com
# which have very regular tables (i.e. always 8 columns, one row per team)
# "td" = table data in HTML parlance
# "th" = table heading in HTML parlance
# Different seasons have different numbers of teams, so the number of rows is not
# known in advance. With 8 columns and one row per team, the table has length(data)/8 rows.
# @param url a web address containing a table
# @return M data frame containing the information from table on url
# Credit: http://r-exercises.com/2016/12/20/web-scraping-solutions/
###
scrape <- function(url, n_cols = 8) {
  # Build a data frame from the <td> cells of the page at `url`, using the
  # <th> cells as column names.
  #   url    - web address of a page containing a table
  #   n_cols - number of columns in that table; defaults to 8, the layout the
  #            original script was written for
  # Returns a data frame with one row per group of `n_cols` cells.

  # Download and parse the page once; both selectors reuse the same document
  # instead of issuing a second HTTP request for the same URL.
  page <- read_html(url)
  cells <- page %>% html_nodes("td") %>% html_text()
  headers <- page %>% html_nodes("th") %>% html_text()

  # A cell count that is not a whole number of rows would make matrix()
  # truncate or recycle values, silently shifting data between columns.
  if (length(cells) %% n_cols != 0) {
    stop(
      "Found ", length(cells), " table cells at ", url,
      ", which is not a multiple of ", n_cols, " columns",
      call. = FALSE
    )
  }

  # Cells arrive in document order (row by row), hence byrow = TRUE.
  M <- data.frame(
    matrix(cells, ncol = n_cols, nrow = length(cells) / n_cols, byrow = TRUE)
  )
  colnames(M) <- headers
  M
}
# read from the "home page" of Stats on this website, and get the urls of all
# pages pertaining to a certain stat
home <- "https://www.teamrankings.com/ncb/stats/"
# Take only the href of each anchor: other attributes (class, title, ...) are
# not link targets and must not be mistaken for stat pages.
links <- read_html(home) %>% html_nodes("a") %>% html_attr("href")
# Anchors without an href come back as NA, which grepl() reports as FALSE.
# unique() keeps a stat page that is linked several times from being
# downloaded several times.
links <- unique(links[grepl("ncaa-basketball/stat", links, fixed = TRUE)])
if (length(links) == 0) {
  # paste0() below would otherwise turn an empty vector into the bare host name.
  stop("No stat links found on ", home, call. = FALSE)
}
links <- paste0("https://www.teamrankings.com", links)
years <- c("2017", "2016", "2015", "2014", "2013")
# Every (stat page, season) pair, joined as "<link>?date=<year>".
combinations <- as.vector(outer(links, years, paste, sep = "?date="))
urls <- paste0(combinations, "-05-01") # May 1 is after season end, so gets full season stats
vnames <- gsub("-", "_", combinations) # replace all '-' in combinations with '_'
# This function gives a readable, understandable name for an output data file given
# some url.
# Ex: url "https://www.teamrankings.com/ncaa-basketball/stat/points-per-game?date=2017-05-01"
# will give filename "points_per_game_2017.csv"
get.filename.to.write <- function(url) {
  # Derive the output file name "<stat>_<year>.csv" from a stat-page url of the
  # form ".../stat/<stat-name>?date=<yyyy>-05-01".
  #   url - character vector of urls (normally a single url)
  # Returns a character vector of file names, one per url.
  # Stops with an error if a url has no "/stat/" segment or no "?".
  stat.marker <- "/stat/"
  date.prefix <- "?date="

  # regexpr() reports only the FIRST match in each url, so a later "?" or
  # "/stat/" in the url cannot produce additional, bogus file names.
  stat.at <- as.integer(regexpr(stat.marker, url, fixed = TRUE))
  question.mark <- as.integer(regexpr("?", url, fixed = TRUE))

  # -1 means "not found"; slicing from there would yield a garbage name.
  malformed <- is.na(url) | stat.at < 0 | question.mark < 0
  if (any(malformed)) {
    stop(
      "Cannot derive a file name from url: ",
      paste(url[malformed], collapse = ", "),
      call. = FALSE
    )
  }

  # The stat name sits between the end of "/stat/" and the "?".
  start <- stat.at + nchar(stat.marker)
  stat <- substr(url, start, question.mark - 1)
  stat <- gsub("-", "_", stat, fixed = TRUE) # replace '-' with '_'

  # The four-digit year follows immediately after "?date=".
  year.at <- question.mark + nchar(date.prefix)
  year <- substr(url, year.at, year.at + 3)

  paste0(stat, "_", year, ".csv")
}
# For each url that we have discovered, get the appropriate file name, scrape the webpage,
# and print the scrape results to the file
# seq_along() yields an empty sequence when `urls` is empty, whereas
# 1:length(urls) would count 1, 0 and index past the end of the vector.
for (i in seq_along(urls)) {
  file.name <- get.filename.to.write(urls[i])
  print(file.name) # progress indicator: one line per page scraped
  M <- scrape(urls[i])
  write.csv(M, file.name)
}