Skip to content

Commit 2f7ce42

Browse files
authored
Fix empty values (#79)
Closes #78. Adds helper functions that convert string values to `int` and `float`, to handle cases where Basketball Reference has an empty or blank value for a numerical field.
1 parent 12a8f54 commit 2f7ce42

File tree

8 files changed

+1485
-55
lines changed

8 files changed

+1485
-55
lines changed

basketball_reference_web_scraper/parsers/box_scores/players.py

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from lxml import html
22

33
from basketball_reference_web_scraper.data import Location, Outcome, TEAM_ABBREVIATIONS_TO_TEAM
4+
from basketball_reference_web_scraper.utilities import str_to_int, str_to_float
45

56

67
def parse_location(symbol):
@@ -39,25 +40,25 @@ def parse_player_box_score(row):
3940
return {
4041
"slug": str(row[1].get("data-append-csv")),
4142
"name": str(row[1].text_content()),
42-
"team": TEAM_ABBREVIATIONS_TO_TEAM[row[2].text_content()],
43+
"team": TEAM_ABBREVIATIONS_TO_TEAM.get(row[2].text_content()),
4344
"location": parse_location(row[3].text_content()),
44-
"opponent": TEAM_ABBREVIATIONS_TO_TEAM[row[4].text_content()],
45+
"opponent": TEAM_ABBREVIATIONS_TO_TEAM.get(row[4].text_content()),
4546
"outcome": parse_outcome(row[5].text_content()),
46-
"seconds_played": int(parse_seconds_played(row[6].text_content())),
47-
"made_field_goals": int(row[7].text_content()),
48-
"attempted_field_goals": int(row[8].text_content()),
49-
"made_three_point_field_goals": int(row[10].text_content()),
50-
"attempted_three_point_field_goals": int(row[11].text_content()),
51-
"made_free_throws": int(row[13].text_content()),
52-
"attempted_free_throws": int(row[14].text_content()),
53-
"offensive_rebounds": int(row[16].text_content()),
54-
"defensive_rebounds": int(row[17].text_content()),
55-
"assists": int(row[19].text_content()),
56-
"steals": int(row[20].text_content()),
57-
"blocks": int(row[21].text_content()),
58-
"turnovers": int(row[22].text_content()),
59-
"personal_fouls": int(row[23].text_content()),
60-
"game_score": float(row[25].text_content()),
47+
"seconds_played": parse_seconds_played(row[6].text_content()),
48+
"made_field_goals": str_to_int(row[7].text_content()),
49+
"attempted_field_goals": str_to_int(row[8].text_content()),
50+
"made_three_point_field_goals": str_to_int(row[10].text_content()),
51+
"attempted_three_point_field_goals": str_to_int(row[11].text_content()),
52+
"made_free_throws": str_to_int(row[13].text_content()),
53+
"attempted_free_throws": str_to_int(row[14].text_content()),
54+
"offensive_rebounds": str_to_int(row[16].text_content()),
55+
"defensive_rebounds": str_to_int(row[17].text_content()),
56+
"assists": str_to_int(row[19].text_content()),
57+
"steals": str_to_int(row[20].text_content()),
58+
"blocks": str_to_int(row[21].text_content()),
59+
"turnovers": str_to_int(row[22].text_content()),
60+
"personal_fouls": str_to_int(row[23].text_content()),
61+
"game_score": str_to_float(row[25].text_content()),
6162
}
6263

6364

basketball_reference_web_scraper/parsers/box_scores/teams.py

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,27 @@
11
from lxml import html
22

33
from basketball_reference_web_scraper.data import TEAM_NAME_TO_TEAM
4+
from basketball_reference_web_scraper.utilities import str_to_int
45

56

67
def parse_team_total(footer, team):
78
cells = footer.xpath('tr/td')
89
return {
910
"team": team,
10-
"minutes_played": int(cells[0].text_content()),
11-
"made_field_goals": int(cells[1].text_content()),
12-
"attempted_field_goals": int(cells[2].text_content()),
13-
"made_three_point_field_goals": int(cells[4].text_content()),
14-
"attempted_three_point_field_goals": int(cells[5].text_content()),
15-
"made_free_throws": int(cells[7].text_content()),
16-
"attempted_free_throws": int(cells[8].text_content()),
17-
"offensive_rebounds": int(cells[10].text_content()),
18-
"defensive_rebounds": int(cells[11].text_content()),
19-
"assists": int(cells[13].text_content()),
20-
"steals": int(cells[14].text_content()),
21-
"blocks": int(cells[15].text_content()),
22-
"turnovers": int(cells[16].text_content()),
23-
"personal_fouls": int(cells[17].text_content()),
11+
"minutes_played": str_to_int(cells[0].text_content()),
12+
"made_field_goals": str_to_int(cells[1].text_content()),
13+
"attempted_field_goals": str_to_int(cells[2].text_content()),
14+
"made_three_point_field_goals": str_to_int(cells[4].text_content()),
15+
"attempted_three_point_field_goals": str_to_int(cells[5].text_content()),
16+
"made_free_throws": str_to_int(cells[7].text_content()),
17+
"attempted_free_throws": str_to_int(cells[8].text_content()),
18+
"offensive_rebounds": str_to_int(cells[10].text_content()),
19+
"defensive_rebounds": str_to_int(cells[11].text_content()),
20+
"assists": str_to_int(cells[13].text_content()),
21+
"steals": str_to_int(cells[14].text_content()),
22+
"blocks": str_to_int(cells[15].text_content()),
23+
"turnovers": str_to_int(cells[16].text_content()),
24+
"personal_fouls": str_to_int(cells[17].text_content()),
2425
}
2526

2627

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,32 @@
11
from lxml import html
22

33
from basketball_reference_web_scraper.data import TEAM_ABBREVIATIONS_TO_TEAM, POSITION_ABBREVIATIONS_TO_POSITION
4+
from basketball_reference_web_scraper.utilities import str_to_int
45

56

67
def parse_player_season_totals(row):
78
return {
89
"slug": str(row[1].get("data-append-csv")),
910
"name": str(row[1].text_content()),
1011
"positions": parse_positions(row[2].text_content()),
11-
"age": int(row[3].text_content()) if row[3].text_content() else None,
12-
"team": TEAM_ABBREVIATIONS_TO_TEAM[row[4].text_content()],
13-
"games_played": int(row[5].text_content()),
14-
"games_started": int(row[6].text_content()),
15-
"minutes_played": int(row[7].text_content()),
16-
"made_field_goals": int(row[8].text_content()),
17-
"attempted_field_goals": int(row[9].text_content()),
18-
"made_three_point_field_goals": int(row[11].text_content()),
19-
"attempted_three_point_field_goals": int(row[12].text_content()),
20-
"made_free_throws": int(row[18].text_content()),
21-
"attempted_free_throws": int(row[19].text_content()),
22-
"offensive_rebounds": int(row[21].text_content()),
23-
"defensive_rebounds": int(row[22].text_content()),
24-
"assists": int(row[24].text_content()),
25-
"steals": int(row[25].text_content()),
26-
"blocks": int(row[26].text_content()),
27-
"turnovers": int(row[27].text_content()),
28-
"personal_fouls": int(row[28].text_content()),
12+
"age": str_to_int(row[3].text_content(), default=None),
13+
"team": TEAM_ABBREVIATIONS_TO_TEAM.get(row[4].text_content()),
14+
"games_played": str_to_int(row[5].text_content()),
15+
"games_started": str_to_int(row[6].text_content()),
16+
"minutes_played": str_to_int(row[7].text_content()),
17+
"made_field_goals": str_to_int(row[8].text_content()),
18+
"attempted_field_goals": str_to_int(row[9].text_content()),
19+
"made_three_point_field_goals": str_to_int(row[11].text_content()),
20+
"attempted_three_point_field_goals": str_to_int(row[12].text_content()),
21+
"made_free_throws": str_to_int(row[18].text_content()),
22+
"attempted_free_throws": str_to_int(row[19].text_content()),
23+
"offensive_rebounds": str_to_int(row[21].text_content()),
24+
"defensive_rebounds": str_to_int(row[22].text_content()),
25+
"assists": str_to_int(row[24].text_content()),
26+
"steals": str_to_int(row[25].text_content()),
27+
"blocks": str_to_int(row[26].text_content()),
28+
"turnovers": str_to_int(row[27].text_content()),
29+
"personal_fouls": str_to_int(row[28].text_content()),
2930
}
3031

3132

@@ -46,5 +47,10 @@ def parse_players_season_totals(page):
4647

4748

4849
def parse_positions(positions_content):
49-
return list(map(lambda position_abbreviation: POSITION_ABBREVIATIONS_TO_POSITION[position_abbreviation],
50-
positions_content.split("-")))
50+
parsed_positions = list(
51+
map(
52+
lambda position_abbreviation: POSITION_ABBREVIATIONS_TO_POSITION.get(position_abbreviation),
53+
positions_content.split("-")
54+
)
55+
)
56+
return [position for position in parsed_positions if position is not None]

basketball_reference_web_scraper/parsers/schedule.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import pytz
44

55
from basketball_reference_web_scraper.data import Team
6+
from basketball_reference_web_scraper.utilities import str_to_int
67

78
TEAM_NAME_TO_TEAM = {
89
member.value: member
@@ -45,10 +46,10 @@ def parse_game(row):
4546
start_time = parse_start_time(formatted_date=row[0].text_content(), formatted_time_of_day=row[1].text_content())
4647
return {
4748
"start_time": start_time,
48-
"away_team": TEAM_NAME_TO_TEAM[row[2].text_content().upper()],
49-
"home_team": TEAM_NAME_TO_TEAM[row[4].text_content().upper()],
50-
"away_team_score": int(row[3].text_content()) if row[3].text_content() else None,
51-
"home_team_score": int(row[5].text_content()) if row[5].text_content() else None,
49+
"away_team": TEAM_NAME_TO_TEAM.get(row[2].text_content().upper()),
50+
"home_team": TEAM_NAME_TO_TEAM.get(row[4].text_content().upper()),
51+
"away_team_score": str_to_int(row[3].text_content(), default=None),
52+
"home_team_score": str_to_int(row[5].text_content(), default=None),
5253
}
5354

5455

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
def str_to_int(value, default=int(0)):
2+
stripped_value = value.strip()
3+
try:
4+
return int(stripped_value)
5+
except ValueError:
6+
return default
7+
8+
9+
def str_to_float(value, default=float(0)):
10+
stripped_value = value.strip()
11+
try:
12+
return float(stripped_value)
13+
except ValueError:
14+
return default

tests/12_12_2017_daily_leaders.html

Lines changed: 1352 additions & 0 deletions
Large diffs are not rendered by default.

tests/test_integration_parse_player_box_scores.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
november_03_2003_daily_leaders_html = os.path.join(os.path.dirname(__file__), './11_03_2003_daily_leaders.html')
88
november_01_2006_daily_leaders_html = os.path.join(os.path.dirname(__file__), './11_01_2006_daily_leaders.html')
99
december_18_2015_daily_leaders_html = os.path.join(os.path.dirname(__file__), './12_18_2015_daily_leaders.html')
10+
december_12_2017_daily_leaders_html = os.path.join(os.path.dirname(__file__), './12_12_2017_daily_leaders.html')
1011
january_01_2017_daily_leaders_html = os.path.join(os.path.dirname(__file__), './01_29_2017_daily_leaders.html')
1112

1213

@@ -15,12 +16,23 @@ def setUp(self):
1516
self.november_01_2006_daily_leaders = open(november_01_2006_daily_leaders_html).read()
1617
self.december_18_2015_daily_leaders = open(december_18_2015_daily_leaders_html).read()
1718
self.november_03_2003_daily_leaders = open(november_03_2003_daily_leaders_html).read()
19+
self.december_12_2017_daily_leaders = open(december_12_2017_daily_leaders_html).read()
1820
self.january_01_2017_daily_leaders = open(january_01_2017_daily_leaders_html).read()
1921

2022
def test_box_scores_for_12_18_2015(self):
2123
parsed_box_score = players.parse_player_box_scores(self.december_18_2015_daily_leaders)
2224
self.assertEqual(len(parsed_box_score), 250)
2325

26+
def test_box_scores_for_12_12_2017(self):
27+
parsed_box_score = players.parse_player_box_scores(self.december_12_2017_daily_leaders)
28+
self.assertEqual(len(parsed_box_score), 149)
29+
30+
def test_parses_blank_value_for_andrew_bogut_on_12_12_2017(self):
31+
parsed_box_score = players.parse_player_box_scores(self.december_12_2017_daily_leaders)
32+
andrew_bogut = parsed_box_score[128]
33+
self.assertEqual(andrew_bogut["made_three_point_field_goals"], 0)
34+
self.assertEqual(andrew_bogut["attempted_three_point_field_goals"], 0)
35+
2436
# Test for minutes played greater than or equal to 60 minutes
2537
def test_box_scores_for_01_01_2017(self):
2638
parsed_box_score = players.parse_player_box_scores(self.january_01_2017_daily_leaders)

tests/test_utilties.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
from unittest import TestCase
2+
3+
from basketball_reference_web_scraper.utilities import str_to_int, str_to_float
4+
5+
6+
class TestStrToInt(TestCase):
7+
def test_empty_string_is_zero(self):
8+
self.assertEqual(str_to_int(""), 0)
9+
10+
def test_whitespace_is_zero(self):
11+
self.assertEqual(str_to_int(" "), 0)
12+
13+
def test_stringified_number_is_converted(self):
14+
self.assertEqual(str_to_int("10"), 10)
15+
16+
def test_stringified_number_with_leading_whitespace_is_converted(self):
17+
self.assertEqual(str_to_int(" 10"), 10)
18+
19+
def test_stringified_number_with_trailing_whitespace_is_converted(self):
20+
self.assertEqual(str_to_int("10 "), 10)
21+
22+
def test_with_default(self):
23+
self.assertIsNone(str_to_int("", default=None))
24+
25+
26+
class TestStrToFloat(TestCase):
27+
def test_empty_string_is_zero(self):
28+
self.assertEqual(str_to_float(""), 0.0)
29+
30+
def test_whitespace_is_zero(self):
31+
self.assertEqual(str_to_float(" "), 0.0)
32+
33+
def test_stringified_number_is_converted(self):
34+
self.assertEqual(str_to_float("1.234"), 1.234)
35+
36+
def test_stringified_number_with_leading_whitespace_is_converted(self):
37+
self.assertEqual(str_to_float(" 1.234"), 1.234)
38+
39+
def test_stringified_number_with_trailing_whitespace_is_converted(self):
40+
self.assertEqual(str_to_float("1.234 "), 1.234)
41+
42+
def test_with_default(self):
43+
self.assertIsNone(str_to_float("", default=None))

0 commit comments

Comments
 (0)