patrickramos committed on
Commit
0b50ce4
·
1 Parent(s): 48dfbf7

Add pitcher leaderboard

Browse files
Files changed (6) hide show
  1. app.py +4 -0
  2. convert.py +5 -0
  3. pitch_leaderboard.py +6 -25
  4. pitcher_leaderboard.py +158 -0
  5. plotting.py +1 -1
  6. stats.py +49 -13
app.py CHANGED
@@ -3,6 +3,7 @@ import matplotlib as mpl
3
 
4
  from data import data_df
5
  from pitcher_overview import create_pitcher_overview
 
6
  from pitch_leaderboard import create_pitch_leaderboard
7
  from daily_weekly_leaderboard import create_daily_weekly_leaderboard_app
8
  from css import css
@@ -15,6 +16,7 @@ with open('updated.txt') as f:
15
 
16
  limitations = '''**General Limitations**
17
  - As new players make their debut, some names may be missing or not translated/transliterated correctly.
 
18
  '''
19
 
20
  with open('acknowledgements.md', 'r') as f:
@@ -24,6 +26,8 @@ if __name__ == '__main__':
24
  with gr.Blocks(css=css) as app:
25
  with gr.Tab('Pitcher Overview'):
26
  create_pitcher_overview(data_df)
 
 
27
  with gr.Tab('Pitch Leaderboard'):
28
  create_pitch_leaderboard()
29
  with gr.Tab('Daily/Weekly Leaderboard'):
 
3
 
4
  from data import data_df
5
  from pitcher_overview import create_pitcher_overview
6
+ from pitcher_leaderboard import create_pitcher_leaderboard
7
  from pitch_leaderboard import create_pitch_leaderboard
8
  from daily_weekly_leaderboard import create_daily_weekly_leaderboard_app
9
  from css import css
 
16
 
17
  limitations = '''**General Limitations**
18
  - As new players make their debut, some names may be missing or not translated/transliterated correctly.
19
+ - IP is overestimated
20
  '''
21
 
22
  with open('acknowledgements.md', 'r') as f:
 
26
  with gr.Blocks(css=css) as app:
27
  with gr.Tab('Pitcher Overview'):
28
  create_pitcher_overview(data_df)
29
+ with gr.Tab('Pitcher Leaderboard'):
30
+ create_pitcher_leaderboard()
31
  with gr.Tab('Pitch Leaderboard'):
32
  create_pitch_leaderboard()
33
  with gr.Tab('Daily/Weekly Leaderboard'):
convert.py CHANGED
@@ -161,6 +161,11 @@ presult = {
161
  141: 'Unknown'
162
  }
163
 
 
 
 
 
 
164
  bresult = {
165
  0: '空振り三振',
166
  1: '単打', # 1b gb p
 
161
  141: 'Unknown'
162
  }
163
 
164
def verify_and_return_presult(presults):
    """Check that every entry of *presults* is a known pitch-result label.

    Args:
        presults: Iterable of pitch-result labels to validate against the
            values of the module-level ``presult`` mapping.

    Returns:
        The same ``presults`` object, unchanged, so the call can be used
        inline where the labels are consumed.

    Raises:
        AssertionError: If any label is not a value of ``presult``.
    """
    # Build the lookup once instead of scanning ``presult.values()`` for
    # every label.
    known_labels = set(presult.values())
    for _presult in presults:
        if _presult not in known_labels:
            # Raised explicitly (rather than via ``assert``) so the check
            # still runs when Python is started with -O.
            raise AssertionError(f'{_presult} is invalid')
    return presults
168
+
169
  bresult = {
170
  0: '空振り三振',
171
  1: '単打', # 1b gb p
pitch_leaderboard.py CHANGED
@@ -32,6 +32,9 @@ TEAMS = [
32
  ]
33
  notes = '''**Limitations**
34
  - [Column widths get messed up when filtering](https://github.com/gradio-app/gradio/issues/11564)
 
 
 
35
  '''
36
 
37
 
@@ -44,29 +47,8 @@ def gr_create_pitch_leaderboard(start_date, end_date, min_pitches, pitcher_lr='B
44
  if pitcher_lr != 'Both':
45
  data = data.filter(pl.col('batLR') == pitcher_lr[0].lower())
46
 
47
- # both, left, right = [
48
- # (
49
- # compute_pitch_stats(df, player_type='pitcher', min_pitches=min_pitches, pitch_class_type='specific')
50
- # .filter(pl.col('qualified') & (pl.col('ballKind').is_in(include_pitches)))
51
- # .drop('qualified')
52
- # .rename({'pitcher_name': 'Pitcher', 'count': 'Count', 'usage': 'Usage', 'ballKind': 'Pitch', 'general_ballKind': 'Pitch (General)'} | {f'{stat}_pctl': f'{stat} (Pctl)' for stat in STATS_WITH_PCTLS})
53
- # .with_columns(
54
- # pl.col(stat).mul(100).round(1)
55
- # for stat in PCT_STATS + [f'{stat} (Pctl)' for stat in STATS_WITH_PCTLS]
56
- # )
57
- # [['pitId', 'ballKind_code', 'Pitcher', 'Pitch', 'Pitch (General)', 'Count', 'Usage'] + STATS_WITH_PCTLS]
58
- # )
59
- # for df
60
- # in [data, data.filter(pl.col('batLR') == 'l'), data.filter(pl.col('batLR') == 'r')]
61
- # ]
62
- # pitch_stats = (
63
- # both
64
- # .join(left, on=['pitId', 'ballKind_code'], suffix=' (LHH)', how='full')
65
- # .join(right, on=['pitId', 'ballKind_code'], suffix=' (RHH)', how='full')
66
- # .drop('pitId', 'ballKind_code', *list(chain.from_iterable([[f'{col} ({handedness}HH)' for col in ['pitId', 'ballKind_code', 'Pitcher', 'Pitch', 'Pitch (General)']] for handedness in ('L', 'R')])))
67
- # )
68
  pitch_stats = (
69
- compute_pitch_stats(data, player_type='pitcher', min_pitches=min_pitches, pitch_class_type='specific')
70
  .filter(pl.col('qualified') & (pl.col('ballKind').is_in(include_pitches)))
71
  .drop('pitId', 'ballKind_code', 'qualified')
72
  .rename({
@@ -85,7 +67,6 @@ def gr_create_pitch_leaderboard(start_date, end_date, min_pitches, pitcher_lr='B
85
  pl.col(stat).mul(100)
86
  for stat in PCT_STATS
87
  )
88
- # [['Pitcher', 'Team', 'Pitch', 'Pitch (General)'] + STATS + [f'{stat}_pctl' for stat in STATS_WITH_PCTLS]]
89
  )
90
 
91
  if include_teams is not None:
@@ -146,7 +127,7 @@ def create_pitch_leaderboard():
146
  with gr.Column(scale=1):
147
  all_pitches = gr.Button('Select/Deselect all pitches')
148
  min_pitches = gr.Number(100, label='Min. Pitches', precision=0, minimum=0)
149
- pitcher_lr = gr.Radio(['Both', 'Left', 'Right'], value='Both', label='Batter handedness')
150
  with gr.Row():
151
  include_teams = gr.CheckboxGroup(TEAMS, value=TEAMS, label='Teams', scale=3)
152
  all_teams = gr.Button('Select/Deselect all teams')
@@ -164,7 +145,7 @@ def create_pitch_leaderboard():
164
 
165
  gr.Markdown(notes)
166
 
167
- search.click(gr_create_pitch_leaderboard, inputs=[start_date, end_date, min_pitches, pitcher_lr, include_pitches, include_teams], outputs=leaderboard)
168
  all_pitches.click(lambda _pitch_types : [] if _pitch_types == PITCH_TYPES else PITCH_TYPES, inputs=include_pitches, outputs=include_pitches)
169
  all_teams.click(lambda _teams : [] if _teams == TEAMS else TEAMS, inputs=include_teams, outputs=include_teams)
170
  # pin_columns.input(
 
32
  ]
33
  notes = '''**Limitations**
34
  - [Column widths get messed up when filtering](https://github.com/gradio-app/gradio/issues/11564)
35
+
36
+ **To-do**
37
+ - Add post-season
38
  '''
39
 
40
 
 
47
  if pitcher_lr != 'Both':
48
  data = data.filter(pl.col('batLR') == pitcher_lr[0].lower())
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  pitch_stats = (
51
+ compute_pitch_stats(data, player_type='pitcher', min_pitches=min_pitches, pitch_class_type='specific', group_by_team=True)
52
  .filter(pl.col('qualified') & (pl.col('ballKind').is_in(include_pitches)))
53
  .drop('pitId', 'ballKind_code', 'qualified')
54
  .rename({
 
67
  pl.col(stat).mul(100)
68
  for stat in PCT_STATS
69
  )
 
70
  )
71
 
72
  if include_teams is not None:
 
127
  with gr.Column(scale=1):
128
  all_pitches = gr.Button('Select/Deselect all pitches')
129
  min_pitches = gr.Number(100, label='Min. Pitches', precision=0, minimum=0)
130
+ batter_lr = gr.Radio(['Both', 'Left', 'Right'], value='Both', label='Batter handedness')
131
  with gr.Row():
132
  include_teams = gr.CheckboxGroup(TEAMS, value=TEAMS, label='Teams', scale=3)
133
  all_teams = gr.Button('Select/Deselect all teams')
 
145
 
146
  gr.Markdown(notes)
147
 
148
+ search.click(gr_create_pitch_leaderboard, inputs=[start_date, end_date, min_pitches, batter_lr, include_pitches, include_teams], outputs=leaderboard)
149
  all_pitches.click(lambda _pitch_types : [] if _pitch_types == PITCH_TYPES else PITCH_TYPES, inputs=include_pitches, outputs=include_pitches)
150
  all_teams.click(lambda _teams : [] if _teams == TEAMS else TEAMS, inputs=include_teams, outputs=include_teams)
151
  # pin_columns.input(
pitcher_leaderboard.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import polars as pl
3
+ import numpy as np
4
+
5
+ from datetime import datetime
6
+ # from itertools import chain
7
+
8
+ from data import data_df
9
+ from stats import compute_player_stats, filter_data_by_date_and_game_kind
10
+ from convert import team_names_short_to_color, get_text_color_from_team
11
+ from plotting import stat_cmap
12
+
13
# Every stat column shown on the pitcher leaderboard, in display order.
STATS = [
    'FB Velo',
    'K%', 'BB%',
    'Swing%', 'Z-Swing%', 'Chase%',
    'Contact%', 'Z-Contact%', 'O-Contact%',
    'SwStr%', 'Whiff%', 'CSW%',
    'GB%', 'FB%', 'LD%',
    'Zone%', 'Arm%', 'Glove%', 'High%', 'Low%', 'MM%',
]
# Ratio stats: multiplied by 100 and rendered with a trailing '%'.
# Derived from STATS so the two lists cannot drift apart.
PCT_STATS = [stat for stat in STATS if stat.endswith('%')]
# Stats that are displayed without percentile-based background coloring.
_STATS_WITHOUT_PCTLS = ('Arm%', 'Glove%', 'High%', 'Low%', 'MM%')
STATS_WITH_PCTLS = [stat for stat in STATS if stat not in _STATS_WITHOUT_PCTLS]
COLUMNS = ['Pitcher', 'Team', 'IP', 'TBF'] + STATS

# Short team names offered in the team filter.
TEAMS = [
    'Yomiuri',
    'Yakult',
    'DeNA',
    'Chunichi',
    'Hanshin',
    'Hiroshima',
    'Nipponham',
    'Rakuten',
    'Seibu',
    'Lotte',
    'ORIX',
    'SoftBank',
]
# Markdown rendered below the leaderboard table.
notes = '''**Limitations**
- [Column widths get messed up when filtering](https://github.com/gradio-app/gradio/issues/11564)

**To-do**
- Fix IP calculation
- Add post-season
'''
39
+
40
+
41
def _style_cell(col, item, pctl):
    """Return the CSS string for one leaderboard cell."""
    if col in STATS_WITH_PCTLS:
        if pctl is None:
            # A null percentile has no meaningful color; leave the cell unstyled.
            return ''
        r, g, b = (stat_cmap([pctl])[0, :3]*255).astype(np.uint8)
        return f'background-color: rgba({r}, {g}, {b})'
    if col == 'Team':
        return f'color: {get_text_color_from_team(item)}; background-color: {team_names_short_to_color[item]}'
    return ''


def _format_cell(col, item):
    """Return the value displayed for one leaderboard cell."""
    if col in PCT_STATS:
        # A null stat would raise on the float format spec, so show it blank.
        return '' if item is None else f'{item:.1f}%'
    if isinstance(item, float):
        return f'{item:.1f}'
    return item


def gr_create_pitcher_leaderboard(start_date, end_date, min_ip, pitcher_lr='Both', include_teams=None):
    """Build the value for the pitcher leaderboard ``gr.DataFrame``.

    Args:
        start_date: Lower date bound passed to ``filter_data_by_date_and_game_kind``.
        end_date: Upper date bound passed to ``filter_data_by_date_and_game_kind``.
        min_ip: Qualification threshold forwarded to ``compute_player_stats``
            as ``min_ip``.
        pitcher_lr: One of ``'Both'``, ``'Left'`` or ``'Right'``. Despite the
            name, this filters on the ``batLR`` column, i.e. the handedness
            of the batters faced.
        include_teams: Team short names to keep, or ``None`` to keep all.

    Returns:
        A dict with ``data`` (row tuples), ``headers`` (``COLUMNS``) and
        ``metadata`` holding per-cell ``styling`` and ``display_value`` grids.
    """
    assert pitcher_lr in ['Both', 'Left', 'Right'], f'{pitcher_lr} is invalid'

    data = data_df.filter(pl.col('ballKind_code') != '-')

    data = filter_data_by_date_and_game_kind(data, start_date=start_date, end_date=end_date, game_kind='Regular Season')
    if pitcher_lr != 'Both':
        # 'Left' -> 'l', 'Right' -> 'r', matching the batLR encoding.
        data = data.filter(pl.col('batLR') == pitcher_lr[0].lower())

    pitcher_stats = (
        compute_player_stats(data, player_type='pitcher', min_ip=min_ip, group_by_team=True)
        .filter(pl.col('qualified'))
        .drop('pitId', 'qualified')
        .rename({
            'pitcher_name': 'Pitcher',
            'pitcher_team_name_short': 'Team',
            'PA': 'TBF'
        })
        .with_columns(
            pl.col(stat).mul(100)
            for stat in PCT_STATS
        )
    )

    if include_teams is not None:
        pitcher_stats = pitcher_stats.filter(pl.col('Team').is_in(include_teams))

    # Materialize the displayed rows and the percentile columns once instead
    # of re-slicing the frame and indexing a Series for every cell.
    rows = pitcher_stats[COLUMNS].rows()
    pctls = {stat: pitcher_stats[f'{stat}_pctl'].to_list() for stat in STATS_WITH_PCTLS}

    styling = [
        [
            _style_cell(col, item, pctls[col][i] if col in pctls else None)
            for col, item in zip(COLUMNS, row)
        ]
        for i, row in enumerate(rows)
    ]
    display_value = [
        [_format_cell(col, item) for col, item in zip(COLUMNS, row)]
        for row in rows
    ]

    return {
        'data': rows,
        'headers': COLUMNS,
        'metadata': {
            'styling': styling,
            'display_value': display_value,
        }
    }
104
+
105
+
106
def create_pitcher_leaderboard():
    """Build the Gradio Blocks UI for the pitcher leaderboard tab.

    Returns:
        The ``gr.Blocks`` app containing the filters, the leaderboard table
        and their event wiring.
    """
    now = datetime.now()
    # Default range: January 1st of the current year through today.
    start_datetime_init = datetime(now.year, 1, 1)
    end_datetime_init = now

    def _sync_min_ip(min_ip_value, is_qualified):
        # Keep the 'qualified' sentinel (the initial state value and the
        # default of compute_player_stats) when the box is ticked; storing
        # the checkbox's bare True here would be read as a numeric threshold.
        threshold = 'qualified' if is_qualified else min_ip_value
        # The numeric input is only editable while "Qualified" is unticked.
        return threshold, gr.Number(interactive=not is_qualified)

    with gr.Blocks() as app:
        gr.Markdown('# Pitcher Leaderboard')
        with gr.Row():
            start_date = gr.DateTime(start_datetime_init, include_time=False, type='datetime', label='Start')
            end_date = gr.DateTime(end_datetime_init, include_time=False, type='datetime', label='End')
        with gr.Row():
            with gr.Group():
                min_ip_state = gr.State('qualified')
                min_ip = gr.Number(100, label='Min. IP', precision=0, minimum=0, interactive=False)
                qualified = gr.Checkbox(True, label='Qualified')
            batter_lr = gr.Radio(['Both', 'Left', 'Right'], value='Both', label='Batter handedness')
        with gr.Row():
            include_teams = gr.CheckboxGroup(TEAMS, value=TEAMS, label='Teams', scale=3)
            all_teams = gr.Button('Select/Deselect all teams')

        search = gr.Button('Search')
        pin_columns = gr.Button('Pin columns')
        leaderboard = gr.DataFrame(
            # Empty placeholder with the real headers so it lines up with
            # column_widths (one width per entry of COLUMNS).
            pl.DataFrame({col: [] for col in COLUMNS}),
            column_widths=[125, 75, 50, 50] + [max(50, 10*len(stat)) for stat in STATS],
            show_copy_button=True,
            show_search='filter',
            pinned_columns=2,
            elem_id='leaderboard'
        )

        gr.Markdown(notes)

        search.click(gr_create_pitcher_leaderboard, inputs=[start_date, end_date, min_ip_state, batter_lr, include_teams], outputs=leaderboard)
        # Toggle between "all teams selected" and "none selected".
        all_teams.click(lambda _teams : [] if _teams == TEAMS else TEAMS, inputs=include_teams, outputs=include_teams)
        min_ip_state_kwargs = dict(fn=_sync_min_ip, inputs=[min_ip, qualified], outputs=[min_ip_state, min_ip])
        min_ip.change(**min_ip_state_kwargs)
        qualified.change(**min_ip_state_kwargs)
        # Clicking the button clears the table's pinned columns.
        pin_columns.click(
            lambda : gr.update(pinned_columns=None),
            outputs=leaderboard
        )

    return app
155
+
156
+ if __name__ == '__main__':
157
+ app = create_pitcher_leaderboard()
158
+ app.launch()
plotting.py CHANGED
@@ -92,7 +92,7 @@ def plot_loc(ax, locs):
92
 
93
  def plot_velo(ax, velos):
94
  trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
95
- for (pitch,), _velos in velos.group_by('general_ballKind_code'):
96
  _velos = _velos.filter(((pl.col('ballSpeed_mph') - pl.col('ballSpeed_mph').mean())/ pl.col('ballSpeed_mph').std()).abs() < 3)
97
 
98
  if len(_velos) <= 1:
 
92
 
93
  def plot_velo(ax, velos):
94
  trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
95
+ for (pitch,), _velos in velos.sort(pl.len().over('general_ballKind_code'), descending=True).group_by('general_ballKind_code', maintain_order=True):
96
  _velos = _velos.filter(((pl.col('ballSpeed_mph') - pl.col('ballSpeed_mph').mean())/ pl.col('ballSpeed_mph').std()).abs() < 3)
97
 
98
  if len(_velos) <= 1:
stats.py CHANGED
@@ -3,6 +3,12 @@ from data import data_df
3
 
4
  from types import SimpleNamespace
5
 
 
 
 
 
 
 
6
  def filter_data_by_date_and_game_kind(data, start_date=None, end_date=None, game_kind=None):
7
  if start_date is not None:
8
  data = data.filter(pl.col('date') >= start_date)
@@ -63,17 +69,19 @@ def compute_team_games(data):
63
  )
64
 
65
 
66
- def compute_pitch_stats(data, player_type, pitch_class_type, min_pitches=1):
67
  assert player_type in ('pitcher', 'batter')
68
  assert pitch_class_type in ('general', 'specific')
69
- id_col = 'pitId' if player_type == 'pitcher' else 'batId'
 
 
70
  name_col = 'pitcher_name' if player_type == 'pitcher' else 'batter_name'
71
  pitch_col = 'ballKind_code' if pitch_class_type == 'specific' else 'general_ballKind_code'
72
  pitch_name_col = 'ballKind' if pitch_class_type == 'specific' else 'general_ballKind'
73
  pitch_stats = (
74
  data
75
  .with_columns((pl.col('ballSpeed') / 1.609).round(1).alias('mph'))
76
- .group_by(id_col, pitch_col, 'pitcher_team_name_short')
77
  .agg(
78
  pl.first(name_col),
79
  *([pl.first('general_ballKind')] if pitch_class_type == 'specific' else []),
@@ -119,17 +127,23 @@ def compute_pitch_stats(data, player_type, pitch_class_type, min_pitches=1):
119
  for stat in ['Avg KPH', 'Max KPH', 'Avg MPH', 'Max MPH', 'Swing%', 'Z-Swing%', 'Chase%', 'Contact%', 'Z-Contact%', 'O-Contact%', 'SwStr%', 'Whiff%', 'CSW%', 'GB%', 'FB%', 'LD%', 'Zone%']
120
  )
121
  .rename({pitch_col: 'ballKind_code', pitch_name_col: 'ballKind'} if pitch_class_type == 'general' else {})
122
- .sort(id_col, 'count', descending=[False, True])
123
  )
124
  return pitch_stats
125
-
126
 
127
- def compute_player_stats(data, player_type, min_ip='qualified'):
128
  data = (
129
  compute_team_games(data)
130
  .with_columns(
131
  pl.when(pl.col('half_inning').str.ends_with('1')).then('home_games').otherwise('visitor_games').first().over('pitId').alias('games'),
132
  pl.col('inning_code').unique().len().over('pitId').alias('IP')
 
 
 
 
 
 
 
133
  )
134
  )
135
 
@@ -139,17 +153,38 @@ def compute_player_stats(data, player_type, min_ip='qualified'):
139
  data = data.with_columns((pl.col('IP') >= min_ip).alias('qualified'))
140
 
141
  assert player_type in ('pitcher', 'batter')
142
- id_col = 'pitId' if player_type == 'pitcher' else 'batId'
 
 
143
  name_col = 'pitcher_name' if player_type == 'pitcher' else 'batter_name'
144
  player_stats = (
145
  data
146
- .group_by(id_col)
 
147
  .agg(
148
  pl.col(name_col).first(),
 
 
 
 
149
  (pl.when(pl.col('presult').str.contains('strikeout')).then(1).otherwise(0).sum() / pl.col('pa_code').unique().len()).alias('K%'),
150
  (pl.when(pl.col('presult') == 'Walk').then(1).otherwise(0).sum() / pl.col('pa_code').unique().len()).alias('BB%'),
151
- (pl.col('csw').sum() / pl.col('pitch').sum()).alias('CSW%'),
152
  pl.col('aux_bresult').struct.field('batType').drop_nulls().value_counts(normalize=True),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  pl.first('qualified')
154
  )
155
  .explode('batType')
@@ -163,9 +198,10 @@ def compute_player_stats(data, player_type, min_ip='qualified'):
163
  )
164
  .drop('G', 'F', 'B', 'P', 'L')
165
  .with_columns(
166
- (pl.when(pl.col('qualified')).then(pl.col(stat)).rank(descending=(stat == 'BB%'))/pl.when(pl.col('qualified')).then(pl.col(stat)).count()).alias(f'{stat}_pctl')
167
- for stat in ['CSW%', 'K%', 'BB%', 'GB%']
168
  )
 
169
  )
170
  return player_stats
171
 
@@ -186,7 +222,7 @@ def get_pitcher_stats(id, lr=None, game_kind=None, start_date=None, end_date=Non
186
  if lr is not None:
187
  source_data = source_data.filter(pl.col('batLR') == lr)
188
 
189
- pitch_stats = compute_pitch_stats(source_data, player_type='pitcher', pitch_class_type=pitch_class_type, min_pitches=min_pitches).filter(pl.col('pitId') == id)
190
 
191
  pitch_shapes = (
192
  source_data
@@ -200,6 +236,6 @@ def get_pitcher_stats(id, lr=None, game_kind=None, start_date=None, end_date=Non
200
  .with_columns((pl.col('ballSpeed')/1.609).alias('ballSpeed_mph'))
201
  )
202
 
203
- pitcher_stats = compute_player_stats(source_data, player_type='pitcher', min_ip=min_ip).filter(pl.col('pitId') == id)
204
 
205
  return SimpleNamespace(pitcher_stats=pitcher_stats, pitch_stats=pitch_stats, pitch_shapes=pitch_shapes)
 
3
 
4
  from types import SimpleNamespace
5
 
6
+ from convert import verify_and_return_presult
7
+
8
+
9
+ valid_pitch = pl.col('x').is_not_null() & pl.col('y').is_not_null() & (pl.col('ballSpeed') > 0)
10
+
11
+
12
  def filter_data_by_date_and_game_kind(data, start_date=None, end_date=None, game_kind=None):
13
  if start_date is not None:
14
  data = data.filter(pl.col('date') >= start_date)
 
69
  )
70
 
71
 
72
+ def compute_pitch_stats(data, player_type, pitch_class_type, min_pitches=1, group_by_team=False):
73
  assert player_type in ('pitcher', 'batter')
74
  assert pitch_class_type in ('general', 'specific')
75
+ id_cols = ['pitId' if player_type == 'pitcher' else 'batId']
76
+ if group_by_team:
77
+ id_cols.append('pitcher_team_name_short')
78
  name_col = 'pitcher_name' if player_type == 'pitcher' else 'batter_name'
79
  pitch_col = 'ballKind_code' if pitch_class_type == 'specific' else 'general_ballKind_code'
80
  pitch_name_col = 'ballKind' if pitch_class_type == 'specific' else 'general_ballKind'
81
  pitch_stats = (
82
  data
83
  .with_columns((pl.col('ballSpeed') / 1.609).round(1).alias('mph'))
84
+ .group_by(*id_cols, pitch_col)
85
  .agg(
86
  pl.first(name_col),
87
  *([pl.first('general_ballKind')] if pitch_class_type == 'specific' else []),
 
127
  for stat in ['Avg KPH', 'Max KPH', 'Avg MPH', 'Max MPH', 'Swing%', 'Z-Swing%', 'Chase%', 'Contact%', 'Z-Contact%', 'O-Contact%', 'SwStr%', 'Whiff%', 'CSW%', 'GB%', 'FB%', 'LD%', 'Zone%']
128
  )
129
  .rename({pitch_col: 'ballKind_code', pitch_name_col: 'ballKind'} if pitch_class_type == 'general' else {})
130
+ .sort(id_cols[0], 'count', descending=[False, True])
131
  )
132
  return pitch_stats
 
133
 
134
+ def compute_player_stats(data, player_type, min_ip='qualified', group_by_team=False):
135
  data = (
136
  compute_team_games(data)
137
  .with_columns(
138
  pl.when(pl.col('half_inning').str.ends_with('1')).then('home_games').otherwise('visitor_games').first().over('pitId').alias('games'),
139
  pl.col('inning_code').unique().len().over('pitId').alias('IP')
140
+ # pl.col('presult').is_in(verify_and_return_presult([
141
+ # 'Groundout', 'Flyout', 'Lineout', 'Groundout (Double play)',
142
+ # 'Foul fly', 'Foul line (?)',
143
+ # 'Sacrifice bunt', 'Sacrifice fly',
144
+ # "Fielder's choice", "Sacrifice fielder's choice",
145
+ # 'Bunt strikeout', 'Swinging strikeout', 'Looking strikeout'
146
+ # ])).sum().over('pitId').mul(1/3).alias('IP')
147
  )
148
  )
149
 
 
153
  data = data.with_columns((pl.col('IP') >= min_ip).alias('qualified'))
154
 
155
  assert player_type in ('pitcher', 'batter')
156
+ id_cols = ['pitId' if player_type == 'pitcher' else 'batId']
157
+ if group_by_team:
158
+ id_cols.append('pitcher_team_name_short')
159
  name_col = 'pitcher_name' if player_type == 'pitcher' else 'batter_name'
160
  player_stats = (
161
  data
162
+ .with_columns(pl.when(pl.col('general_ballKind_code').is_in(['4S', 'FC', 'SI'])).then(pl.when(valid_pitch).then('ballSpeed').mean().over('pitId', 'general_ballKind_code')).mul(1/1.609).round(1).alias('FB Velo'))
163
+ .group_by(id_cols)
164
  .agg(
165
  pl.col(name_col).first(),
166
+ *([] if group_by_team else [pl.col('pitcher_team_name_short').last()]),
167
+ pl.col('IP').first(),
168
+ pl.col('pa_code').unique().len().alias('PA'),
169
+ pl.col('FB Velo').max(),
170
  (pl.when(pl.col('presult').str.contains('strikeout')).then(1).otherwise(0).sum() / pl.col('pa_code').unique().len()).alias('K%'),
171
  (pl.when(pl.col('presult') == 'Walk').then(1).otherwise(0).sum() / pl.col('pa_code').unique().len()).alias('BB%'),
 
172
  pl.col('aux_bresult').struct.field('batType').drop_nulls().value_counts(normalize=True),
173
+ (pl.col('swing').sum() / pl.col('pitch').sum()).alias('Swing%'),
174
+ ((pl.col('swing') & pl.col('zone')).sum() / pl.col('pitch').sum()).alias('Z-Swing%'),
175
+ ((pl.col('swing') & ~pl.col('zone')).sum() / pl.col('pitch').sum()).alias('Chase%'),
176
+ ((pl.col('swing') & ~pl.col('whiff')).sum()/pl.col('swing').sum()).alias('Contact%'),
177
+ ((pl.col('zone') & pl.col('swing') & ~pl.col('whiff')).sum()/(pl.col('zone') & pl.col('swing')).sum()).alias('Z-Contact%'),
178
+ ((~pl.col('zone') & pl.col('swing') & ~pl.col('whiff')).sum()/(~pl.col('zone') & pl.col('swing')).sum()).alias('O-Contact%'),
179
+ (pl.col('whiff').sum() / pl.col('swing').sum()).alias('Whiff%'),
180
+ (pl.col('whiff').sum() / pl.col('pitch').sum()).alias('SwStr%'),
181
+ (pl.col('csw').sum() / pl.col('pitch').sum()).alias('CSW%'),
182
+ (pl.col('zone').sum() / pl.col('pitch').sum()).alias('Zone%'),
183
+ (pl.when(pl.col('pitLR') == 'r').then(pl.col('x') < 0).otherwise(pl.col('x') > 0)).mean().alias('Glove%'),
184
+ (pl.when(pl.col('pitLR') == 'r').then(pl.col('x') >= 0).otherwise(pl.col('x') <= 0)).mean().alias('Arm%'),
185
+ (pl.col('y') > 125).mean().alias('High%'),
186
+ (pl.col('y') <= 125).mean().alias('Low%'),
187
+ (pl.col('x').is_between(-20, 20) & pl.col('y').is_between(100, 100+50)).mean().alias('MM%'),
188
  pl.first('qualified')
189
  )
190
  .explode('batType')
 
198
  )
199
  .drop('G', 'F', 'B', 'P', 'L')
200
  .with_columns(
201
+ (pl.when(pl.col('qualified')).then(pl.col(stat)).rank(descending=stat in ['BB%', 'FB%', 'LD%'] or 'Contact%' in stat)/pl.when(pl.col('qualified')).then(pl.col(stat)).count()).alias(f'{stat}_pctl')
202
+ for stat in ['FB Velo', 'K%', 'BB%', 'Swing%', 'Z-Swing%', 'Chase%', 'Contact%', 'Z-Contact%', 'O-Contact%', 'SwStr%', 'Whiff%', 'CSW%', 'GB%', 'FB%', 'LD%', 'Zone%']
203
  )
204
+ .sort('IP', descending=True)
205
  )
206
  return player_stats
207
 
 
222
  if lr is not None:
223
  source_data = source_data.filter(pl.col('batLR') == lr)
224
 
225
+ pitch_stats = compute_pitch_stats(source_data, player_type='pitcher', pitch_class_type=pitch_class_type, min_pitches=min_pitches, group_by_team=False).filter(pl.col('pitId') == id)
226
 
227
  pitch_shapes = (
228
  source_data
 
236
  .with_columns((pl.col('ballSpeed')/1.609).alias('ballSpeed_mph'))
237
  )
238
 
239
+ pitcher_stats = compute_player_stats(source_data, player_type='pitcher', min_ip=min_ip, group_by_team=False).filter(pl.col('pitId') == id)
240
 
241
  return SimpleNamespace(pitcher_stats=pitcher_stats, pitch_stats=pitch_stats, pitch_shapes=pitch_shapes)