Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions docs/appearances_bref.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Appearances Bref

`appearances_bref(season)`

Get defensive appearances for a given season.

## Arguments
`season:` Integer. The year of the season to retrieve; must be 1871 or later. Defaults to the most recent season if no value is provided.

## Examples of valid queries

```python
from pybaseball import appearances_bref

# get the current season's up-to-date appearances
data = appearances_bref()

# get the end-of-season appearances for the 1960 season
data = appearances_bref(1960)

```
1 change: 1 addition & 0 deletions pybaseball/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
statcast_fielding_run_value
)
from .league_batting_stats import batting_stats_bref
from .appearances_bref import appearances_bref
from .league_batting_stats import batting_stats_range
from .league_batting_stats import bwar_bat
from .league_pitching_stats import pitching_stats_bref
Expand Down
70 changes: 70 additions & 0 deletions pybaseball/appearances_bref.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from typing import Optional

import pandas as pd
from bs4 import BeautifulSoup

from . import cache
from .utils import most_recent_season, get_bref_id_from_player_link
from .datasources.bref import BRefSession

# Module-level BRefSession instance, created once at import time; get_soup()
# issues its page request through this object.
session = BRefSession()

def get_soup(year: int) -> BeautifulSoup:
    """
    Fetch the Baseball Reference appearances page for ``year`` through the
    module-level session and return it parsed with the lxml parser.
    """
    response = session.get(
        f'https://www.baseball-reference.com/leagues/majors/{year}-appearances-fielding.shtml'
    )
    return BeautifulSoup(response.content, "lxml")

def get_tables(soup: BeautifulSoup, season: int) -> pd.DataFrame:
    """
    Build a DataFrame from the element with id ``appearances`` in ``soup``.

    Column names come from the ``th`` cells of the table's first row, minus the
    leading rank header, plus a trailing ``player_ID`` column. Only body rows
    that contain a link are kept; ``season`` is accepted but not used here.
    """
    appearances_table = soup.find(id='appearances')

    # column names: header cells without the leading Rk entry, plus the ID column
    column_names = [cell.get_text() for cell in appearances_table.find("tr").find_all("th")]
    del column_names[0]
    column_names.append('player_ID')

    records = []
    for table_row in appearances_table.find('tbody').find_all('tr'):
        anchor = table_row.find('a')
        if anchor:
            # rows with a player link carry data; others are skipped
            values = [cell.text.strip() for cell in table_row.find_all('td')]
            # the bref ID is parsed out of the player link and stored last
            values.append(get_bref_id_from_player_link(anchor))
            records.append(values)

    return pd.DataFrame(records, columns=column_names)


@cache.df_cache()
def appearances_bref(season: Optional[int] = None) -> pd.DataFrame:
    """
    Returns a pandas DataFrame of the defensive appearances for a given MLB season, or
    appearances for the most recent season if the season is not specified.

    ARGUMENTS
        season (int): the year of the season; must be 1871 or later

    RAISES
        ValueError: if season is earlier than 1871
    """
    # fall back to the most recent season if none was specified
    if season is None:
        season = most_recent_season()
    if season < 1871:
        raise ValueError(
            "This query currently only returns appearances from the 1871 season onward. "
            "Try looking at years from 1871 to present."
        )

    # retrieve html from baseball reference and parse the appearances table
    soup = get_soup(season)
    return get_tables(soup, season)
8 changes: 8 additions & 0 deletions pybaseball/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from collections import namedtuple
from datetime import date, datetime, timedelta
import functools
Expand All @@ -7,6 +8,7 @@

import pandas as pd
import requests
from bs4 import Tag

from . import cache

Expand Down Expand Up @@ -385,3 +387,9 @@ def norm_positions(pos: Union[int, str], to_word: bool = False, to_number: bool
# lower() ok due to positional numbers being cast as strings when created
return normed.lower()

# Captures the ID segment of a Baseball Reference player page path, e.g.
# "/players/s/slapncy01.shtml" -> "slapncy01". The ID is matched as "anything
# but a slash" rather than [a-z0-9]+ so that IDs containing other characters
# (for example an apostrophe) are not rejected.
_BREF_PLAYER_LINK_PATTERN = re.compile(r"players/[a-z]/([^/]+)\.shtml")


def get_bref_id_from_player_link(player_link: Tag) -> str:
    """
    Pull the Baseball Reference player ID out of a player page link.

    ARGUMENTS
        player_link (Tag): a link element whose ``href`` attribute points at a
            player page of the form ``.../players/<letter>/<id>.shtml``

    RETURNS
        The ``<id>`` portion of the link.

    RAISES
        ValueError: if the element has no ``href`` or the ``href`` does not look
            like a player page link
    """
    href = player_link.attrs.get('href')

    # guard both failure modes explicitly instead of letting a None slip through
    # to re.search() / .group() and surface as an opaque TypeError/AttributeError
    match = _BREF_PLAYER_LINK_PATTERN.search(href) if href else None
    if match is None:
        raise ValueError(
            f"Could not find a Baseball Reference player ID in link: {href!r}"
        )

    return match.group(1)

22 changes: 22 additions & 0 deletions tests/pybaseball/test_appearances_bref.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import unittest
from pybaseball.appearances_bref import appearances_bref

class TestAppearancesBref(unittest.TestCase):
    """Checks for appearances_bref: season validation and known table values."""

    def test_wrong_season_error(self):
        # seasons earlier than 1871 must be rejected with a ValueError
        with self.assertRaises(ValueError):
            appearances_bref(1870)

    def test_year_with_no_awards(self):
        # results should load without error for a year where the awards column is empty / excluded
        result = appearances_bref(1871)

        # spot-check one known value in the results
        eggler_cf = result.loc[result["Player"] == "Dave Eggler", "CF"]
        self.assertEqual(eggler_cf.values[0], "33")

    def test_year_with_awards(self):
        result = appearances_bref(1913)

        # spot-check the awards column
        johnson_awards = result.loc[result["Player"] == "Walter Johnson", "Awards"]
        self.assertEqual(johnson_awards.values[0], "MVP-1")
9 changes: 8 additions & 1 deletion tests/pybaseball/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from datetime import date, datetime, timedelta

import pytest
from bs4 import Tag

from pybaseball.utils import DATE_FORMAT, sanitize_date_range
from pybaseball.utils import DATE_FORMAT, sanitize_date_range, get_bref_id_from_player_link


def test_sanitize_date_range_nones() -> None:
Expand Down Expand Up @@ -52,3 +53,9 @@ def test_sanitize_date_range_start_dt_gt_end_dt() -> None:
assert start_dt_date < end_dt_date
assert str(start_dt_date) == end_dt
assert str(end_dt_date) == start_dt


def test_get_bref_id_from_player_link() -> None:
    """The ID segment of a player page href is returned on its own."""
    player_href = '/players/s/slapncy01.shtml'
    anchor = Tag(name='a', attrs={'href': player_href})

    extracted_id = get_bref_id_from_player_link(anchor)

    assert extracted_id == 'slapncy01'