diff --git a/docs/appearances_bref.md b/docs/appearances_bref.md new file mode 100644 index 00000000..30dccd5c --- /dev/null +++ b/docs/appearances_bref.md @@ -0,0 +1,21 @@ +# Appearances Bref + +`appearances_bref(season)` + +Get defensive appearances for a given season. + +## Arguments +`season:` Integer. Defaults to the current calendar year if no value is provided. + +## Examples of valid queries + +```python +from pybaseball import appearances_bref + +# get the current season's up-to-date appearances +data = appearances_bref() + +# get the end-of-season appearances for the 1960 season +data = appearances_bref(1960) + +``` diff --git a/pybaseball/__init__.py b/pybaseball/__init__.py index cc223855..cefca214 100644 --- a/pybaseball/__init__.py +++ b/pybaseball/__init__.py @@ -34,6 +34,7 @@ statcast_fielding_run_value ) from .league_batting_stats import batting_stats_bref +from .appearances_bref import appearances_bref from .league_batting_stats import batting_stats_range from .league_batting_stats import bwar_bat from .league_pitching_stats import pitching_stats_bref diff --git a/pybaseball/appearances_bref.py b/pybaseball/appearances_bref.py new file mode 100644 index 00000000..a0840d59 --- /dev/null +++ b/pybaseball/appearances_bref.py @@ -0,0 +1,70 @@ +from typing import Optional + +import pandas as pd +from bs4 import BeautifulSoup + +from . 
# ---- pybaseball/appearances_bref.py (new file; its first imports precede this span) ----
from . import cache
from .utils import most_recent_season, get_bref_id_from_player_link
from .datasources.bref import BRefSession

# Module-level session object reused by get_soup for every request.
session = BRefSession()


def get_soup(year: int) -> BeautifulSoup:
    """
    Fetch the appearances/fielding page for ``year`` and parse it with the lxml parser.

    ARGUMENTS
        year (int): season whose page is requested
    """
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-appearances-fielding.shtml'
    s = session.get(url).content
    return BeautifulSoup(s, "lxml")


def get_tables(soup: BeautifulSoup, season: int) -> pd.DataFrame:
    """
    Build a DataFrame from the element with id 'appearances' in ``soup``.

    ARGUMENTS
        soup (BeautifulSoup): parsed page, as returned by get_soup
        season (int): the year the page belongs to; only used in the error message

    RETURNS
        A DataFrame with one row per body row that contains a link. Columns are the
        header cells of the table's first row minus the leading one, plus 'player_ID'.
        Cell values are the stripped cell text, so they are strings.

    RAISES
        ValueError: if the page has no element with id 'appearances'
    """
    # get player appearances table
    table = soup.find(id='appearances')
    if table is None:
        # fail with a readable message instead of an AttributeError on None
        raise ValueError(f"Could not find the appearances table for the {season} season.")

    # drop the leading Rk header (it's unnecessary) and add the ID column name
    headings = [th.get_text() for th in table.find("tr").find_all("th")][1:]
    headings.append('player_ID')

    # pull in data rows
    data = []
    for row in table.find('tbody').find_all('tr'):
        player_link = row.find('a')
        if not player_link:
            # rows without any link carry no player to identify
            # (presumably repeated header rows -- TODO confirm)
            continue
        cols = [ele.text.strip() for ele in row.find_all('td')]

        # find bref ID in player link and add to data
        cols.append(get_bref_id_from_player_link(player_link))
        data.append(cols)

    # use headings for column names
    return pd.DataFrame(data, columns=headings)


@cache.df_cache()
def appearances_bref(season: Optional[int] = None) -> pd.DataFrame:
    """
    Returns a pandas DataFrame of the defensive appearances for a given MLB season, or
    appearances for the current / most recent season if the season is not specified.

    ARGUMENTS
        season (int): the year of the season; defaults to most_recent_season()

    RAISES
        ValueError: if season is earlier than 1871
    """
    # get most recent season if none was specified
    if season is None:
        season = most_recent_season()
    if season < 1871:
        raise ValueError(
            "This query currently only returns appearances from the 1871 season onward. "
            "Try looking at years from 1871 to present."
        )

    # retrieve html from baseball reference
    soup = get_soup(season)
    return get_tables(soup, season)


# ---- pybaseball/utils.py (additions: `import re`, `from bs4 import Tag`, and the helper below) ----
import re

from bs4 import Tag

# Matches links such as '/players/s/slapncy01.shtml' and captures the file stem.
# The stem is matched as "anything but a slash" rather than [a-z0-9]+ so that IDs
# containing other characters (e.g. an apostrophe) are still extracted.
_BREF_PLAYER_LINK = re.compile(r"players/[a-z]/([^/]+?)\.shtml")


# pull out bref ID from player page link using a regex
def get_bref_id_from_player_link(player_link: Tag) -> str:
    """
    Return the player ID embedded in the href of ``player_link``.

    ARGUMENTS
        player_link (Tag): anchor tag whose href points at a player page

    RAISES
        ValueError: if the tag has no href or the href is not a player page link
    """
    href = player_link.attrs.get('href')
    match = _BREF_PLAYER_LINK.search(href) if href else None
    if match is None:
        raise ValueError(f"Could not find a Baseball-Reference player ID in link: {href!r}")

    return match.group(1)


# ---- tests/pybaseball/test_appearances_bref.py (new file) ----
import unittest
from pybaseball.appearances_bref import appearances_bref


class TestAppearancesBref(unittest.TestCase):
    """Tests for appearances_bref; they call the function with real seasons."""

    def test_wrong_season_error(self):
        # ensure error raised for season before 1871
        self.assertRaises(ValueError, appearances_bref, 1870)

    def test_year_with_no_awards(self):
        # make sure results are retrieved with no error for a year where the awards column is empty / excluded
        appearances_bref_result = appearances_bref(1871)

        # test specific value in results
        player_rows = appearances_bref_result[appearances_bref_result["Player"] == "Dave Eggler"]
        self.assertEqual(player_rows["CF"].values[0], "33")

    def test_year_with_awards(self):
        appearances_bref_result = appearances_bref(1913)

        # test awards column
        player_rows = appearances_bref_result[appearances_bref_result["Player"] == "Walter Johnson"]
        self.assertEqual(player_rows["Awards"].values[0], "MVP-1")
# ---- tests/pybaseball/test_utils.py (additions: two imports and one new test) ----
# The other tests in this file appear in the patch only as unchanged context.
from datetime import date, datetime, timedelta

import pytest
from bs4 import Tag

from pybaseball.utils import DATE_FORMAT, sanitize_date_range, get_bref_id_from_player_link


def test_get_bref_id_from_player_link() -> None:
    """A player page anchor yields the ID embedded in its href."""
    anchor = Tag(name='a', attrs={'href': '/players/s/slapncy01.shtml'})
    extracted_id = get_bref_id_from_player_link(anchor)

    assert extracted_id == 'slapncy01'