Spaces:
Running
Running
Commit
·
221cc42
1
Parent(s):
65cf9b7
Fix duplicate name translations
Browse files
- data.py +20 -2
- plotting.py +1 -0
- stats.py +36 -73
data.py
CHANGED
|
@@ -212,7 +212,9 @@ data_df = (
|
|
| 212 |
.with_columns((pl.col('x').is_between(-80, 80) & pl.col('y').is_between(25, 25+200) & ~pl.col('heart')).alias('shadow'))
|
| 213 |
.with_columns((pl.col('x').is_between(-100, 101) & pl.col('y').is_between(0, 0+251) & ~pl.col('heart') & ~pl.col('shadow')).alias('chase'))
|
| 214 |
.filter(pl.col('ballKind_code') != '-')
|
|
|
|
| 215 |
)
|
|
|
|
| 216 |
|
| 217 |
|
| 218 |
def select_name(names):
|
|
@@ -328,9 +330,25 @@ players_df = (
|
|
| 328 |
)
|
| 329 |
)
|
| 330 |
print(players_df.filter(pl.len().over('playerId', 'team', 'season') > 1))
|
| 331 |
-
players_df = pl.concat((players_df.group_by('playerId').agg(pl.first('name_en')), manual_translated_df[['playerId', 'name_en']]))
|
|
|
|
|
|
|
| 332 |
|
| 333 |
# join players to data
|
| 334 |
-
data_df =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
if __name__ == '__main__':
|
| 336 |
breakpoint()
|
|
|
|
| 212 |
.with_columns((pl.col('x').is_between(-80, 80) & pl.col('y').is_between(25, 25+200) & ~pl.col('heart')).alias('shadow'))
|
| 213 |
.with_columns((pl.col('x').is_between(-100, 101) & pl.col('y').is_between(0, 0+251) & ~pl.col('heart') & ~pl.col('shadow')).alias('chase'))
|
| 214 |
.filter(pl.col('ballKind_code') != '-')
|
| 215 |
+
.unique()
|
| 216 |
)
|
| 217 |
+
bar = data_df
|
| 218 |
|
| 219 |
|
| 220 |
def select_name(names):
|
|
|
|
| 330 |
)
|
| 331 |
)
|
| 332 |
print(players_df.filter(pl.len().over('playerId', 'team', 'season') > 1))
|
| 333 |
+
players_df = pl.concat((players_df.group_by('playerId').agg(pl.first('name_en')), manual_translated_df[['playerId', 'name_en']])).unique()
|
| 334 |
+
print(players_df.filter(pl.len().over('playerId') > 1).sort('playerId'))
|
| 335 |
+
players_df = players_df.group_by('playerId').agg(pl.first('name_en'))
|
| 336 |
|
| 337 |
# join players to data
|
| 338 |
+
data_df = (
|
| 339 |
+
data_df
|
| 340 |
+
.join(
|
| 341 |
+
players_df.rename({'name_en': 'pitcher_name'})[['playerId', 'pitcher_name']],
|
| 342 |
+
left_on='pitId',
|
| 343 |
+
right_on='playerId',
|
| 344 |
+
how='left'
|
| 345 |
+
)
|
| 346 |
+
.join(
|
| 347 |
+
players_df.rename({'name_en': 'batter_name'})[['playerId', 'batter_name']],
|
| 348 |
+
left_on='batId',
|
| 349 |
+
right_on='playerId',
|
| 350 |
+
how='left'
|
| 351 |
+
)
|
| 352 |
+
)
|
| 353 |
if __name__ == '__main__':
|
| 354 |
breakpoint()
|
plotting.py
CHANGED
|
@@ -169,6 +169,7 @@ font = load_google_font('Saira Extra Condensed', weight='medium')
|
|
| 169 |
|
| 170 |
def create_pitcher_overview_card(id, season, dpi=300):
|
| 171 |
data = get_card_data(id, start_date=date(season, 1, 1), end_date=date(season, 12, 31), game_kind='Regular Season', min_pitches=100, pitch_class_type='general')
|
|
|
|
| 172 |
|
| 173 |
fig = plt.figure(figsize=(1080/300, 1350/300), dpi=dpi)
|
| 174 |
gs = fig.add_gridspec(8, 6, height_ratios=[1, 1, 1.5, 6, 1, 3, 1, 0.5])
|
|
|
|
| 169 |
|
| 170 |
def create_pitcher_overview_card(id, season, dpi=300):
|
| 171 |
data = get_card_data(id, start_date=date(season, 1, 1), end_date=date(season, 12, 31), game_kind='Regular Season', min_pitches=100, pitch_class_type='general')
|
| 172 |
+
print(data)
|
| 173 |
|
| 174 |
fig = plt.figure(figsize=(1080/300, 1350/300), dpi=dpi)
|
| 175 |
gs = fig.add_gridspec(8, 6, height_ratios=[1, 1, 1.5, 6, 1, 3, 1, 0.5])
|
stats.py
CHANGED
|
@@ -67,13 +67,14 @@ def compute_pitch_stats(data, player_type, pitch_class_type, min_pitches=1):
|
|
| 67 |
assert player_type in ('pitcher', 'batter')
|
| 68 |
assert pitch_class_type in ('general', 'specific')
|
| 69 |
id_col = 'pitId' if player_type == 'pitcher' else 'batId'
|
|
|
|
| 70 |
pitch_col = 'ballKind_code' if pitch_class_type == 'specific' else 'general_ballKind_code'
|
| 71 |
pitch_name_col = 'ballKind' if pitch_class_type == 'specific' else 'general_ballKind'
|
| 72 |
pitch_stats = (
|
| 73 |
data
|
| 74 |
.group_by(id_col, pitch_col, 'pitcher_team_name_short')
|
| 75 |
.agg(
|
| 76 |
-
pl.first(
|
| 77 |
*([pl.first('general_ballKind')] if pitch_class_type == 'specific' else []),
|
| 78 |
pl.first(pitch_name_col),
|
| 79 |
pl.len().alias('count'),
|
|
@@ -116,15 +117,14 @@ def compute_pitch_stats(data, player_type, pitch_class_type, min_pitches=1):
|
|
| 116 |
.sort(id_col, 'count', descending=[False, True])
|
| 117 |
)
|
| 118 |
return pitch_stats
|
|
|
|
| 119 |
|
| 120 |
-
|
| 121 |
-
def compute_pitcher_stats(data, min_ip='qualified'):
|
| 122 |
-
data = data.filter(pl.col('ballKind') != '-')
|
| 123 |
data = (
|
| 124 |
compute_team_games(data)
|
| 125 |
.with_columns(
|
| 126 |
pl.when(pl.col('half_inning').str.ends_with('1')).then('home_games').otherwise('visitor_games').first().over('pitId').alias('games'),
|
| 127 |
-
pl.col('inning_code').unique().len().over('pitId').alias('IP')
|
| 128 |
)
|
| 129 |
)
|
| 130 |
|
|
@@ -133,35 +133,37 @@ def compute_pitcher_stats(data, min_ip='qualified'):
|
|
| 133 |
else:
|
| 134 |
data = data.with_columns((pl.col('IP') >= min_ip).alias('qualified'))
|
| 135 |
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
|
|
|
|
|
|
| 162 |
)
|
| 163 |
-
return
|
| 164 |
-
|
| 165 |
|
| 166 |
def get_pitcher_stats(id, lr=None, game_kind=None, start_date=None, end_date=None, min_ip=1, min_pitches=1, pitch_class_type='specific'):
|
| 167 |
# source_data = data_df.filter(pl.col('ballKind_code') != '-')
|
|
@@ -176,19 +178,6 @@ def get_pitcher_stats(id, lr=None, game_kind=None, start_date=None, end_date=Non
|
|
| 176 |
source_data = data_df
|
| 177 |
source_data = filter_data_by_date_and_game_kind(source_data, start_date=start_date, end_date=end_date, game_kind=game_kind)
|
| 178 |
|
| 179 |
-
source_data = (
|
| 180 |
-
compute_team_games(source_data)
|
| 181 |
-
.with_columns(
|
| 182 |
-
pl.when(pl.col('half_inning').str.ends_with('1')).then('home_games').otherwise('visitor_games').first().over('pitId').alias('games'),
|
| 183 |
-
pl.col('inning_code').unique().len().over('pitId').alias('IP')
|
| 184 |
-
)
|
| 185 |
-
)
|
| 186 |
-
|
| 187 |
-
if min_ip == 'qualified':
|
| 188 |
-
source_data = source_data.with_columns((pl.col('IP') >= pl.col('games')).alias('qualified'))
|
| 189 |
-
else:
|
| 190 |
-
source_data = source_data.with_columns((pl.col('IP') >= min_ip).alias('qualified'))
|
| 191 |
-
|
| 192 |
if lr is not None:
|
| 193 |
source_data = source_data.filter(pl.col('batLR') == lr)
|
| 194 |
|
|
@@ -206,32 +195,6 @@ def get_pitcher_stats(id, lr=None, game_kind=None, start_date=None, end_date=Non
|
|
| 206 |
.with_columns((pl.col('ballSpeed')/1.609).alias('ballSpeed_mph'))
|
| 207 |
)
|
| 208 |
|
| 209 |
-
pitcher_stats = (
|
| 210 |
-
source_data
|
| 211 |
-
.group_by('pitId')
|
| 212 |
-
.agg(
|
| 213 |
-
pl.col('pitcher_name').first(),
|
| 214 |
-
(pl.when(pl.col('presult').str.contains('strikeout')).then(1).otherwise(0).sum() / pl.col('pa_code').unique().len()).alias('K%'),
|
| 215 |
-
(pl.when(pl.col('presult') == 'Walk').then(1).otherwise(0).sum() / pl.col('pa_code').unique().len()).alias('BB%'),
|
| 216 |
-
(pl.col('csw').sum() / pl.col('pitch').sum()).alias('CSW%'),
|
| 217 |
-
pl.col('aux_bresult').struct.field('batType').drop_nulls().value_counts(normalize=True),
|
| 218 |
-
pl.first('qualified')
|
| 219 |
-
)
|
| 220 |
-
.explode('batType')
|
| 221 |
-
.unnest('batType')
|
| 222 |
-
.pivot(on='batType', values='proportion')
|
| 223 |
-
.fill_null(0)
|
| 224 |
-
.with_columns(
|
| 225 |
-
(pl.col('G') + pl.col('B')).alias('GB%'),
|
| 226 |
-
(pl.col('F') + pl.col('P')).alias('FB%'),
|
| 227 |
-
pl.col('L').alias('LD%'),
|
| 228 |
-
)
|
| 229 |
-
.drop('G', 'F', 'B', 'P', 'L')
|
| 230 |
-
.with_columns(
|
| 231 |
-
(pl.when(pl.col('qualified')).then(pl.col(stat)).rank(descending=(stat == 'BB%'))/pl.when(pl.col('qualified')).then(pl.col(stat)).count()).alias(f'{stat}_pctl')
|
| 232 |
-
for stat in ['CSW%', 'K%', 'BB%', 'GB%']
|
| 233 |
-
)
|
| 234 |
-
.filter(pl.col('pitId') == id)
|
| 235 |
-
)
|
| 236 |
|
| 237 |
return SimpleNamespace(pitcher_stats=pitcher_stats, pitch_stats=pitch_stats, pitch_shapes=pitch_shapes)
|
|
|
|
| 67 |
assert player_type in ('pitcher', 'batter')
|
| 68 |
assert pitch_class_type in ('general', 'specific')
|
| 69 |
id_col = 'pitId' if player_type == 'pitcher' else 'batId'
|
| 70 |
+
name_col = 'pitcher_name' if player_type == 'pitcher' else 'batter_name'
|
| 71 |
pitch_col = 'ballKind_code' if pitch_class_type == 'specific' else 'general_ballKind_code'
|
| 72 |
pitch_name_col = 'ballKind' if pitch_class_type == 'specific' else 'general_ballKind'
|
| 73 |
pitch_stats = (
|
| 74 |
data
|
| 75 |
.group_by(id_col, pitch_col, 'pitcher_team_name_short')
|
| 76 |
.agg(
|
| 77 |
+
pl.first(name_col),
|
| 78 |
*([pl.first('general_ballKind')] if pitch_class_type == 'specific' else []),
|
| 79 |
pl.first(pitch_name_col),
|
| 80 |
pl.len().alias('count'),
|
|
|
|
| 117 |
.sort(id_col, 'count', descending=[False, True])
|
| 118 |
)
|
| 119 |
return pitch_stats
|
| 120 |
+
|
| 121 |
|
| 122 |
+
def compute_player_stats(data, player_type, min_ip='qualified'):
|
|
|
|
|
|
|
| 123 |
data = (
|
| 124 |
compute_team_games(data)
|
| 125 |
.with_columns(
|
| 126 |
pl.when(pl.col('half_inning').str.ends_with('1')).then('home_games').otherwise('visitor_games').first().over('pitId').alias('games'),
|
| 127 |
+
pl.col('inning_code').unique().len().over('pitId').alias('IP')
|
| 128 |
)
|
| 129 |
)
|
| 130 |
|
|
|
|
| 133 |
else:
|
| 134 |
data = data.with_columns((pl.col('IP') >= min_ip).alias('qualified'))
|
| 135 |
|
| 136 |
+
assert player_type in ('pitcher', 'batter')
|
| 137 |
+
id_col = 'pitId' if player_type == 'pitcher' else 'batId'
|
| 138 |
+
name_col = 'pitcher_name' if player_type == 'pitcher' else 'batter_name'
|
| 139 |
+
player_stats = (
|
| 140 |
+
data
|
| 141 |
+
.group_by(id_col)
|
| 142 |
+
.agg(
|
| 143 |
+
pl.col(name_col).first(),
|
| 144 |
+
(pl.when(pl.col('presult').str.contains('strikeout')).then(1).otherwise(0).sum() / pl.col('pa_code').unique().len()).alias('K%'),
|
| 145 |
+
(pl.when(pl.col('presult') == 'Walk').then(1).otherwise(0).sum() / pl.col('pa_code').unique().len()).alias('BB%'),
|
| 146 |
+
(pl.col('csw').sum() / pl.col('pitch').sum()).alias('CSW%'),
|
| 147 |
+
pl.col('aux_bresult').struct.field('batType').drop_nulls().value_counts(normalize=True),
|
| 148 |
+
pl.first('qualified')
|
| 149 |
+
)
|
| 150 |
+
.explode('batType')
|
| 151 |
+
.unnest('batType')
|
| 152 |
+
.pivot(on='batType', values='proportion')
|
| 153 |
+
.fill_null(0)
|
| 154 |
+
.with_columns(
|
| 155 |
+
(pl.col('G') + pl.col('B')).alias('GB%'),
|
| 156 |
+
(pl.col('F') + pl.col('P')).alias('FB%'),
|
| 157 |
+
pl.col('L').alias('LD%'),
|
| 158 |
+
)
|
| 159 |
+
.drop('G', 'F', 'B', 'P', 'L')
|
| 160 |
+
.with_columns(
|
| 161 |
+
(pl.when(pl.col('qualified')).then(pl.col(stat)).rank(descending=(stat == 'BB%'))/pl.when(pl.col('qualified')).then(pl.col(stat)).count()).alias(f'{stat}_pctl')
|
| 162 |
+
for stat in ['CSW%', 'K%', 'BB%', 'GB%']
|
| 163 |
+
)
|
| 164 |
)
|
| 165 |
+
return player_stats
|
| 166 |
+
|
| 167 |
|
| 168 |
def get_pitcher_stats(id, lr=None, game_kind=None, start_date=None, end_date=None, min_ip=1, min_pitches=1, pitch_class_type='specific'):
|
| 169 |
# source_data = data_df.filter(pl.col('ballKind_code') != '-')
|
|
|
|
| 178 |
source_data = data_df
|
| 179 |
source_data = filter_data_by_date_and_game_kind(source_data, start_date=start_date, end_date=end_date, game_kind=game_kind)
|
| 180 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
if lr is not None:
|
| 182 |
source_data = source_data.filter(pl.col('batLR') == lr)
|
| 183 |
|
|
|
|
| 195 |
.with_columns((pl.col('ballSpeed')/1.609).alias('ballSpeed_mph'))
|
| 196 |
)
|
| 197 |
|
| 198 |
+
pitcher_stats = compute_player_stats(source_data, player_type='pitcher', min_ip=min_ip).filter(pl.col('pitId') == id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
return SimpleNamespace(pitcher_stats=pitcher_stats, pitch_stats=pitch_stats, pitch_shapes=pitch_shapes)
|