patrickramos committed on
Commit
221cc42
·
1 Parent(s): 65cf9b7

Fix duplicate name translations

Browse files
Files changed (3) hide show
  1. data.py +20 -2
  2. plotting.py +1 -0
  3. stats.py +36 -73
data.py CHANGED
@@ -212,7 +212,9 @@ data_df = (
212
  .with_columns((pl.col('x').is_between(-80, 80) & pl.col('y').is_between(25, 25+200) & ~pl.col('heart')).alias('shadow'))
213
  .with_columns((pl.col('x').is_between(-100, 101) & pl.col('y').is_between(0, 0+251) & ~pl.col('heart') & ~pl.col('shadow')).alias('chase'))
214
  .filter(pl.col('ballKind_code') != '-')
 
215
  )
 
216
 
217
 
218
  def select_name(names):
@@ -328,9 +330,25 @@ players_df = (
328
  )
329
  )
330
  print(players_df.filter(pl.len().over('playerId', 'team', 'season') > 1))
331
- players_df = pl.concat((players_df.group_by('playerId').agg(pl.first('name_en')), manual_translated_df[['playerId', 'name_en']]))
 
 
332
 
333
  # join players to data
334
- data_df = data_df.join(players_df.rename({'name_en': 'pitcher_name'}), left_on='pitId', right_on='playerId', how='left')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  if __name__ == '__main__':
336
  breakpoint()
 
212
  .with_columns((pl.col('x').is_between(-80, 80) & pl.col('y').is_between(25, 25+200) & ~pl.col('heart')).alias('shadow'))
213
  .with_columns((pl.col('x').is_between(-100, 101) & pl.col('y').is_between(0, 0+251) & ~pl.col('heart') & ~pl.col('shadow')).alias('chase'))
214
  .filter(pl.col('ballKind_code') != '-')
215
+ .unique()
216
  )
217
+ bar = data_df
218
 
219
 
220
  def select_name(names):
 
330
  )
331
  )
332
  print(players_df.filter(pl.len().over('playerId', 'team', 'season') > 1))
333
+ players_df = pl.concat((players_df.group_by('playerId').agg(pl.first('name_en')), manual_translated_df[['playerId', 'name_en']])).unique()
334
+ print(players_df.filter(pl.len().over('playerId') > 1).sort('playerId'))
335
+ players_df = players_df.group_by('playerId').agg(pl.first('name_en'))
336
 
337
  # join players to data
338
+ data_df = (
339
+ data_df
340
+ .join(
341
+ players_df.rename({'name_en': 'pitcher_name'})[['playerId', 'pitcher_name']],
342
+ left_on='pitId',
343
+ right_on='playerId',
344
+ how='left'
345
+ )
346
+ .join(
347
+ players_df.rename({'name_en': 'batter_name'})[['playerId', 'batter_name']],
348
+ left_on='batId',
349
+ right_on='playerId',
350
+ how='left'
351
+ )
352
+ )
353
  if __name__ == '__main__':
354
  breakpoint()
plotting.py CHANGED
@@ -169,6 +169,7 @@ font = load_google_font('Saira Extra Condensed', weight='medium')
169
 
170
  def create_pitcher_overview_card(id, season, dpi=300):
171
  data = get_card_data(id, start_date=date(season, 1, 1), end_date=date(season, 12, 31), game_kind='Regular Season', min_pitches=100, pitch_class_type='general')
 
172
 
173
  fig = plt.figure(figsize=(1080/300, 1350/300), dpi=dpi)
174
  gs = fig.add_gridspec(8, 6, height_ratios=[1, 1, 1.5, 6, 1, 3, 1, 0.5])
 
169
 
170
  def create_pitcher_overview_card(id, season, dpi=300):
171
  data = get_card_data(id, start_date=date(season, 1, 1), end_date=date(season, 12, 31), game_kind='Regular Season', min_pitches=100, pitch_class_type='general')
172
+ print(data)
173
 
174
  fig = plt.figure(figsize=(1080/300, 1350/300), dpi=dpi)
175
  gs = fig.add_gridspec(8, 6, height_ratios=[1, 1, 1.5, 6, 1, 3, 1, 0.5])
stats.py CHANGED
@@ -67,13 +67,14 @@ def compute_pitch_stats(data, player_type, pitch_class_type, min_pitches=1):
67
  assert player_type in ('pitcher', 'batter')
68
  assert pitch_class_type in ('general', 'specific')
69
  id_col = 'pitId' if player_type == 'pitcher' else 'batId'
 
70
  pitch_col = 'ballKind_code' if pitch_class_type == 'specific' else 'general_ballKind_code'
71
  pitch_name_col = 'ballKind' if pitch_class_type == 'specific' else 'general_ballKind'
72
  pitch_stats = (
73
  data
74
  .group_by(id_col, pitch_col, 'pitcher_team_name_short')
75
  .agg(
76
- pl.first('pitcher_name'),
77
  *([pl.first('general_ballKind')] if pitch_class_type == 'specific' else []),
78
  pl.first(pitch_name_col),
79
  pl.len().alias('count'),
@@ -116,15 +117,14 @@ def compute_pitch_stats(data, player_type, pitch_class_type, min_pitches=1):
116
  .sort(id_col, 'count', descending=[False, True])
117
  )
118
  return pitch_stats
 
119
 
120
-
121
- def compute_pitcher_stats(data, min_ip='qualified'):
122
- data = data.filter(pl.col('ballKind') != '-')
123
  data = (
124
  compute_team_games(data)
125
  .with_columns(
126
  pl.when(pl.col('half_inning').str.ends_with('1')).then('home_games').otherwise('visitor_games').first().over('pitId').alias('games'),
127
- pl.col('inning_code').unique().len().over('pitId').alias('IP') # inaccurate
128
  )
129
  )
130
 
@@ -133,35 +133,37 @@ def compute_pitcher_stats(data, min_ip='qualified'):
133
  else:
134
  data = data.with_columns((pl.col('IP') >= min_ip).alias('qualified'))
135
 
136
- pitcher_stats = (
137
- data
138
- .group_by('pitId', 'pitcher_team_name_short')
139
- .agg(
140
- pl.col('pitcher_name').first(),
141
- (pl.when(pl.col('presult').str.contains('strikeout')).then(1).otherwise(0).sum() / pl.col('pa_code').unique().len()).alias('K%'),
142
- (pl.when(pl.col('presult') == 'Walk').then(1).otherwise(0).sum() / pl.col('pa_code').unique().len()).alias('BB%'),
143
- (pl.col('csw').sum() / pl.col('pitch').sum()).alias('CSW%'),
144
- pl.col('whiff').sum().alias('Whiffs'),
145
- pl.col('aux_bresult').struct.field('batType').drop_nulls().value_counts(normalize=True),
146
- pl.first('qualified')
147
- )
148
- .explode('batType')
149
- .unnest('batType')
150
- .pivot(on='batType', values='proportion')
151
- .fill_null(0)
152
- .with_columns(
153
- (pl.col('G') + pl.col('B')).alias('GB%'),
154
- (pl.col('F') + pl.col('P')).alias('FB%'),
155
- pl.col('L').alias('LD%'),
156
- )
157
- .drop('G', 'F', 'B', 'P', 'L', 'null')
158
- .with_columns(
159
- (pl.when(pl.col('qualified')).then(pl.col(stat)).rank(descending=(stat == 'BB%'))/pl.when(pl.col('qualified')).then(pl.col(stat)).count()).alias(f'{stat}_pctl')
160
- for stat in ['CSW%', 'K%', 'BB%', 'GB%']
161
- )
 
 
162
  )
163
- return pitcher_stats
164
-
165
 
166
  def get_pitcher_stats(id, lr=None, game_kind=None, start_date=None, end_date=None, min_ip=1, min_pitches=1, pitch_class_type='specific'):
167
  # source_data = data_df.filter(pl.col('ballKind_code') != '-')
@@ -176,19 +178,6 @@ def get_pitcher_stats(id, lr=None, game_kind=None, start_date=None, end_date=Non
176
  source_data = data_df
177
  source_data = filter_data_by_date_and_game_kind(source_data, start_date=start_date, end_date=end_date, game_kind=game_kind)
178
 
179
- source_data = (
180
- compute_team_games(source_data)
181
- .with_columns(
182
- pl.when(pl.col('half_inning').str.ends_with('1')).then('home_games').otherwise('visitor_games').first().over('pitId').alias('games'),
183
- pl.col('inning_code').unique().len().over('pitId').alias('IP')
184
- )
185
- )
186
-
187
- if min_ip == 'qualified':
188
- source_data = source_data.with_columns((pl.col('IP') >= pl.col('games')).alias('qualified'))
189
- else:
190
- source_data = source_data.with_columns((pl.col('IP') >= min_ip).alias('qualified'))
191
-
192
  if lr is not None:
193
  source_data = source_data.filter(pl.col('batLR') == lr)
194
 
@@ -206,32 +195,6 @@ def get_pitcher_stats(id, lr=None, game_kind=None, start_date=None, end_date=Non
206
  .with_columns((pl.col('ballSpeed')/1.609).alias('ballSpeed_mph'))
207
  )
208
 
209
- pitcher_stats = (
210
- source_data
211
- .group_by('pitId')
212
- .agg(
213
- pl.col('pitcher_name').first(),
214
- (pl.when(pl.col('presult').str.contains('strikeout')).then(1).otherwise(0).sum() / pl.col('pa_code').unique().len()).alias('K%'),
215
- (pl.when(pl.col('presult') == 'Walk').then(1).otherwise(0).sum() / pl.col('pa_code').unique().len()).alias('BB%'),
216
- (pl.col('csw').sum() / pl.col('pitch').sum()).alias('CSW%'),
217
- pl.col('aux_bresult').struct.field('batType').drop_nulls().value_counts(normalize=True),
218
- pl.first('qualified')
219
- )
220
- .explode('batType')
221
- .unnest('batType')
222
- .pivot(on='batType', values='proportion')
223
- .fill_null(0)
224
- .with_columns(
225
- (pl.col('G') + pl.col('B')).alias('GB%'),
226
- (pl.col('F') + pl.col('P')).alias('FB%'),
227
- pl.col('L').alias('LD%'),
228
- )
229
- .drop('G', 'F', 'B', 'P', 'L')
230
- .with_columns(
231
- (pl.when(pl.col('qualified')).then(pl.col(stat)).rank(descending=(stat == 'BB%'))/pl.when(pl.col('qualified')).then(pl.col(stat)).count()).alias(f'{stat}_pctl')
232
- for stat in ['CSW%', 'K%', 'BB%', 'GB%']
233
- )
234
- .filter(pl.col('pitId') == id)
235
- )
236
 
237
  return SimpleNamespace(pitcher_stats=pitcher_stats, pitch_stats=pitch_stats, pitch_shapes=pitch_shapes)
 
67
  assert player_type in ('pitcher', 'batter')
68
  assert pitch_class_type in ('general', 'specific')
69
  id_col = 'pitId' if player_type == 'pitcher' else 'batId'
70
+ name_col = 'pitcher_name' if player_type == 'pitcher' else 'batter_name'
71
  pitch_col = 'ballKind_code' if pitch_class_type == 'specific' else 'general_ballKind_code'
72
  pitch_name_col = 'ballKind' if pitch_class_type == 'specific' else 'general_ballKind'
73
  pitch_stats = (
74
  data
75
  .group_by(id_col, pitch_col, 'pitcher_team_name_short')
76
  .agg(
77
+ pl.first(name_col),
78
  *([pl.first('general_ballKind')] if pitch_class_type == 'specific' else []),
79
  pl.first(pitch_name_col),
80
  pl.len().alias('count'),
 
117
  .sort(id_col, 'count', descending=[False, True])
118
  )
119
  return pitch_stats
120
+
121
 
122
+ def compute_player_stats(data, player_type, min_ip='qualified'):
 
 
123
  data = (
124
  compute_team_games(data)
125
  .with_columns(
126
  pl.when(pl.col('half_inning').str.ends_with('1')).then('home_games').otherwise('visitor_games').first().over('pitId').alias('games'),
127
+ pl.col('inning_code').unique().len().over('pitId').alias('IP')
128
  )
129
  )
130
 
 
133
  else:
134
  data = data.with_columns((pl.col('IP') >= min_ip).alias('qualified'))
135
 
136
+ assert player_type in ('pitcher', 'batter')
137
+ id_col = 'pitId' if player_type == 'pitcher' else 'batId'
138
+ name_col = 'pitcher_name' if player_type == 'pitcher' else 'batter_name'
139
+ player_stats = (
140
+ data
141
+ .group_by(id_col)
142
+ .agg(
143
+ pl.col(name_col).first(),
144
+ (pl.when(pl.col('presult').str.contains('strikeout')).then(1).otherwise(0).sum() / pl.col('pa_code').unique().len()).alias('K%'),
145
+ (pl.when(pl.col('presult') == 'Walk').then(1).otherwise(0).sum() / pl.col('pa_code').unique().len()).alias('BB%'),
146
+ (pl.col('csw').sum() / pl.col('pitch').sum()).alias('CSW%'),
147
+ pl.col('aux_bresult').struct.field('batType').drop_nulls().value_counts(normalize=True),
148
+ pl.first('qualified')
149
+ )
150
+ .explode('batType')
151
+ .unnest('batType')
152
+ .pivot(on='batType', values='proportion')
153
+ .fill_null(0)
154
+ .with_columns(
155
+ (pl.col('G') + pl.col('B')).alias('GB%'),
156
+ (pl.col('F') + pl.col('P')).alias('FB%'),
157
+ pl.col('L').alias('LD%'),
158
+ )
159
+ .drop('G', 'F', 'B', 'P', 'L')
160
+ .with_columns(
161
+ (pl.when(pl.col('qualified')).then(pl.col(stat)).rank(descending=(stat == 'BB%'))/pl.when(pl.col('qualified')).then(pl.col(stat)).count()).alias(f'{stat}_pctl')
162
+ for stat in ['CSW%', 'K%', 'BB%', 'GB%']
163
+ )
164
  )
165
+ return player_stats
166
+
167
 
168
  def get_pitcher_stats(id, lr=None, game_kind=None, start_date=None, end_date=None, min_ip=1, min_pitches=1, pitch_class_type='specific'):
169
  # source_data = data_df.filter(pl.col('ballKind_code') != '-')
 
178
  source_data = data_df
179
  source_data = filter_data_by_date_and_game_kind(source_data, start_date=start_date, end_date=end_date, game_kind=game_kind)
180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  if lr is not None:
182
  source_data = source_data.filter(pl.col('batLR') == lr)
183
 
 
195
  .with_columns((pl.col('ballSpeed')/1.609).alias('ballSpeed_mph'))
196
  )
197
 
198
+ pitcher_stats = compute_player_stats(source_data, player_type='pitcher', min_ip=min_ip).filter(pl.col('pitId') == id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
  return SimpleNamespace(pitcher_stats=pitcher_stats, pitch_stats=pitch_stats, pitch_shapes=pitch_shapes)