import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load every tournament matchup; each row holds both teams' name, seed and score
allgames = pd.read_csv('Big_Dance_CSV.csv')
display(allgames.head())


# Decide each game's winner column-wise instead of row by row.
# The first-listed team wins only on a strictly higher score; any other
# outcome (including equal or missing scores) credits the second-listed team,
# matching the original if/else.
first_team_won = allgames['Score'] > allgames['Score.1']
allgames['Winner'] = np.where(first_team_won, allgames['Team'], allgames['Team.1'])
allgames['Winning Seed'] = np.where(first_team_won, allgames['Seed'], allgames['Seed.1'])
# Keep the explicit cast so 'Winning Seed' is an integer column
allgames = allgames.astype({'Winning Seed':'int'})
display(allgames)


# Reverse dictionary lookup: find the key that maps to a given value
def get_key(val, my_dict):
    """Return the first key of ``my_dict`` whose value equals ``val``.

    Keys are checked in the dictionary's iteration order. When no value
    matches, the string "There is no such Key" is returned instead.
    """
    matching_keys = (key for key, value in my_dict.items() if val == value)
    return next(matching_keys, "There is no such Key")
# Maps each round's display name to the value stored in the 'Round' column,
# which is also the number of wins a team has after winning that round.
rounds = {'Round of 64': 1, 'Round of 32': 2, 'Sweet 16': 3, 'Elite Eight': 4, 'Final Four': 5, 'Championship': 6}
# For every round, count how often each seed won and draw one bar chart per round.
# Iterating over items() gives the round name directly, so no reverse lookup is needed.
for round_name, rd in rounds.items():
    allgames_round = allgames.loc[allgames['Round'] == rd]
    winner_freq = allgames_round['Winning Seed'].value_counts()
    plt.figure(figsize=(6,4))
    plt.bar(winner_freq.index.to_list(), winner_freq.values)
    title = round_name + ' Winning Seed Frequency'
    plt.title(title)
    plt.xlabel('Winning Seed')
    plt.ylabel('Number of times Seed Won')


# List of all Blue Bloods
bluebloods = ['Duke', 'Indiana', 'Kansas', 'Kentucky', 'North Carolina', 'UCLA', 'Villanova']
# Not needed by the grouped computation below; kept so the module-level name
# still exists for any code that relies on it.
years = np.unique(allgames['Year'])

# Plot each Blue Blood's number of tournament wins per year
for team in bluebloods:
    # Matchups this team won
    df_bb = allgames.loc[allgames['Winner'] == team]
    # The deepest round a team won in a year is used as its win count for that
    # year (the 'Round' value of its last win). Taking the per-year maximum
    # does not depend on row order and yields an empty frame, not a KeyError,
    # for a team with no wins.
    df_numwins = df_bb.groupby('Year', as_index=False)['Round'].max()
    # Plot number of wins per year for each team
    plt.figure(figsize=(6,4))
    title = team + "'s" + " Number of Wins Each Year"
    plt.title(title, fontsize=14)
    plt.bar(df_numwins['Year'],df_numwins['Round'])
    plt.xlabel('Year')
    plt.ylabel('Number of Wins')


# Read the per-team season statistics and show the raw table
stats_df = pd.read_csv('cbb.csv')
display(stats_df)


# Remove every row containing a null value (this also removes teams that did
# not make the tournament), then order the remaining rows by season.
stats_df = stats_df.dropna().sort_values(by='YEAR')


# Exclude rows whose postseason result is 'R68' and renumber the index from 0
stats_df = stats_df.loc[stats_df['POSTSEASON'] != 'R68'].reset_index(drop=True)


# Add a win-percentage column (wins / games * 100) at column position 4.
# The whole column is computed in one vectorized expression rather than
# being pre-filled with NaN and overwritten row by row.
stats_df.insert(4, 'W_PCT', stats_df['W'] / stats_df['G'] * 100)
display(stats_df)


import seaborn as sns
years = np.unique(stats_df['YEAR'])
# Maps each POSTSEASON label to the number of tournament wins it represents
rounds = {'R64': 0, 'R32': 1, 'S16': 2, 'E8': 3, 'F4': 4, '2ND': 5, 'Champions': 6}
# Translate the whole POSTSEASON column in one pass; labels missing from the
# dict become NaN. The float cast keeps POST_ROUND a float column, as the
# row-by-row assignment produced.
stats_df['POST_ROUND'] = stats_df['POSTSEASON'].map(rounds).astype(float)
display(stats_df.head())


# Order rows from the most postseason wins to the fewest so the rounds appear
# in a sensible order on the scatter plot
stats_df = stats_df.sort_values(by='POST_ROUND', ascending=False)
# Figure dimensions
sns.set(rc={"figure.figsize": (10, 8)})
# Win percentage against finishing round, coloured by season
ax = sns.scatterplot(x='W_PCT', y='POSTSEASON', hue='YEAR', palette='deep', data=stats_df)
ax.set_xlabel('Regular Season Win Percentage')
ax.set_ylabel('Round')
ax.set_title('Tournament Outcome vs. Regular Season Win Percentage')
plt.show()


print("Average Regular Season Win Percentage for teams ending as: ")
# For each finishing round, average W_PCT over the teams that ended there
for rd in rounds:
    ended_here = stats_df['POSTSEASON'] == rd
    avg_wpct = round(stats_df.loc[ended_here, 'W_PCT'].mean(), 2)
    print(f"{rd} is {avg_wpct}%")

Average Regular Season Win Percentage for teams ending as: 
R64 is 68.87%
R32 is 74.13%
S16 is 74.61%
E8 is 78.45%
F4 is 79.6%
2ND is 83.27%
Champions is 87.35%


# Offensive efficiency (ADJOE) against finishing round, coloured by season
sns.set(rc={"figure.figsize": (8, 6)})
ax1 = sns.scatterplot(x='ADJOE', y='POSTSEASON', hue='YEAR', palette='deep', data=stats_df)
ax1.set_xlabel('Regular Season Offensive Efficiency')
ax1.set_ylabel('Round')
ax1.set_title('Tournament Outcome vs. Offensive Efficiency')
plt.show()
# Defensive efficiency (ADJDE) against finishing round, same layout
sns.set(rc={"figure.figsize": (8, 6)})
ax2 = sns.scatterplot(x='ADJDE', y='POSTSEASON', hue='YEAR', palette='deep', data=stats_df)
ax2.set_xlabel('Regular Season Defensive Efficiency')
ax2.set_ylabel('Round')
ax2.set_title('Tournament Outcome vs. Defensive Efficiency')
plt.show()

# Print the mean ADJOE, then the mean ADJDE, of the teams in each finishing round
efficiency_reports = (
    ('Average Offensive Efficiency for teams ending as: ', 'ADJOE'),
    ('\nAverage Defensive Efficiency for teams ending as: ', 'ADJDE'),
)
for heading, stat_col in efficiency_reports:
    print(heading)
    for rd in rounds:
        ended_here = stats_df['POSTSEASON'] == rd
        avg_stat = round(stats_df.loc[ended_here, stat_col].mean(), 2)
        print(f"{rd} is {avg_stat}")

Average Offensive Efficiency for teams ending as: 
R64 is 108.72
R32 is 112.83
S16 is 115.51
E8 is 117.74
F4 is 116.37
2ND is 119.79
Champions is 121.3

Average Defensive Efficiency for teams ending as: 
R64 is 98.54
R32 is 95.06
S16 is 93.75
E8 is 92.81
F4 is 91.96
2ND is 91.47
Champions is 90.4


import statsmodels.formula.api as statsmodels

# Keep only teams whose postseason ended in the Final Four or later
stats_finalfour = stats_df.loc[stats_df['POSTSEASON'].isin(['F4', '2ND', 'Champions'])]
# Create subplots
fig, axes = plt.subplots(1,3, figsize=(20,6))
# One panel per statistic: (column, line colour, human-readable name)
regression_panels = [
    ('W_PCT', 'b', 'Win Percentage'),
    ('ADJOE', 'r', 'Offensive Efficiency'),
    ('ADJDE', 'g', 'Defensive Efficiency'),
]
for panel_ax, (stat_col, line_color, stat_name) in zip(axes, regression_panels):
    # Scatter plus fitted regression line of postseason wins on the statistic
    sns.regplot(ax=panel_ax, y='POST_ROUND', x=stat_col, data = stats_finalfour, color=line_color)
    # Every panel has postseason wins on the y-axis and its own statistic on
    # the x-axis, so the title names that statistic (two panels were
    # previously titled "... vs. Win Percentage" by mistake).
    panel_ax.set_title('Number of Wins vs. ' + stat_name)
    panel_ax.set_xlabel(stat_name)
    panel_ax.set_ylabel('Number of Postseason Wins')


# Fit one single-predictor OLS model of POST_ROUND per statistic, in the order
# win percentage, offensive efficiency, defensive efficiency
model_formulas = ('POST_ROUND ~ W_PCT', 'POST_ROUND ~ ADJOE', 'POST_ROUND ~ ADJDE')
wpct_model, adjoe_model, adjde_model = (
    statsmodels.ols(formula=model_formula, data=stats_finalfour).fit()
    for model_formula in model_formulas
)
# Report the R^2 of each fit
print(f'Win Percentage R^2: {wpct_model.rsquared}')
print(f'Offensive Efficiency R^2: {adjoe_model.rsquared}')
print(f'Defensive Efficiency R^2: {adjde_model.rsquared}')

Win Percentage R^2: 0.15992385596478398
Offensive Efficiency R^2: 0.16486594610489758
Defensive Efficiency R^2: 0.030826302249686188

	Year	Round	Region Number	Region Name	Seed	Score	Team	Team.1	Score.1	Seed.1
0	1985	1	1	West	1	83	St Johns	Southern	59	16
1	1985	1	1	West	2	81	VCU	Marshall	65	15
2	1985	1	1	West	3	65	NC State	Nevada	56	14
3	1985	1	1	West	4	85	UNLV	San Diego St	80	13
4	1985	1	1	West	5	58	Washington	Kentucky	65	12

	Year	Round	Region Number	Region Name	Seed	Score	Team	Team.1	Score.1	Seed.1	Winner	Winning Seed
0	1985	1	1	West	1	83	St Johns	Southern	59	16	St Johns	1
1	1985	1	1	West	2	81	VCU	Marshall	65	15	VCU	2
2	1985	1	1	West	3	65	NC State	Nevada	56	14	NC State	3
3	1985	1	1	West	4	85	UNLV	San Diego St	80	13	UNLV	4
4	1985	1	1	West	5	58	Washington	Kentucky	65	12	Kentucky	12
...	...	...	...	...	...	...	...	...	...	...	...	...
2200	2019	4	3	East	1	80	Virginia	Purdue	75	3	Virginia	1
2201	2019	4	4	Midwest	5	77	Auburn	Kentucky	71	2	Auburn	5
2202	2019	5	1	Final Four	2	51	Michigan St	Texas Tech	61	3	Texas Tech	3
2203	2019	5	2	Final Four	1	63	Virginia	Auburn	62	5	Virginia	1
2204	2019	6	1	Championship	3	77	Texas Tech	Virginia	85	1	Virginia	1

	TEAM	CONF	G	W	ADJOE	ADJDE	BARTHAG	EFG_O	EFG_D	TOR	...	FTRD	2P_O	2P_D	3P_O	3P_D	ADJ_T	WAB	POSTSEASON	SEED	YEAR
0	North Carolina	ACC	40	33	123.3	94.9	0.9531	52.6	48.1	15.4	...	30.4	53.9	44.6	32.7	36.2	71.7	8.6	2ND	1.0	2016
1	Wisconsin	B10	40	36	129.1	93.6	0.9758	54.8	47.7	12.4	...	22.4	54.8	44.7	36.5	37.5	59.3	11.3	2ND	1.0	2015
2	Michigan	B10	40	33	114.4	90.4	0.9375	53.9	47.7	14.0	...	30.0	54.7	46.8	35.2	33.2	65.9	6.9	2ND	3.0	2018
3	Texas Tech	B12	38	31	115.2	85.2	0.9696	53.5	43.0	17.7	...	36.6	52.8	41.9	36.5	29.7	67.5	7.0	2ND	3.0	2019
4	Gonzaga	WCC	39	37	117.8	86.3	0.9728	56.6	41.1	16.2	...	26.9	56.3	40.0	38.2	29.0	71.5	7.7	2ND	1.0	2017
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2450	Michigan St.	B10	35	26	111.4	87.8	0.9392	50.6	44.5	20.8	...	32.4	50.4	44.3	34.1	30.1	64.4	6.7	S16	3.0	2013
2451	Arizona	P12	35	27	114.4	92.2	0.9229	52.5	46.6	19.5	...	32.9	50.6	43.4	37.1	35.8	66.8	4.6	S16	6.0	2013
2452	Oregon	P12	37	28	104.8	88.6	0.8728	49.3	46.4	21.4	...	33.3	49.1	44.9	33.3	33.4	69.2	2.9	S16	12.0	2013
2453	La Salle	A10	34	24	112.0	96.2	0.8516	51.9	49.3	17.1	...	28.5	49.3	50.6	37.7	30.2	66.0	0.3	S16	13.0	2013
2454	Florida Gulf Coast	ASun	35	24	103.4	96.3	0.6952	51.6	46.9	21.0	...	32.7	52.3	46.9	33.4	31.3	69.1	-4.0	S16	15.0	2013

	TEAM	CONF	G	W	W_PCT	ADJOE	ADJDE	BARTHAG	EFG_O	EFG_D	...	FTRD	2P_O	2P_D	3P_O	3P_D	ADJ_T	WAB	POSTSEASON	SEED	YEAR
0	Florida Gulf Coast	ASun	35	24	68.571429	103.4	96.3	0.6952	51.6	46.9	...	32.7	52.3	46.9	33.4	31.3	69.1	-4.0	S16	15.0	2013
1	South Dakota St.	Sum	32	22	68.750000	109.7	103.7	0.6573	53.3	51.2	...	24.4	50.4	49.8	38.9	35.8	64.0	-3.2	R64	13.0	2013
2	Montana	BSky	30	23	76.666667	101.6	101.8	0.4929	52.9	48.6	...	35.9	52.0	48.0	36.4	33.3	63.9	-1.4	R64	13.0	2013
3	New Mexico St.	WAC	34	23	67.647059	102.5	95.3	0.6992	49.3	45.4	...	30.7	49.8	43.2	31.8	33.9	65.1	-3.1	R64	13.0	2013
4	Northwestern St.	Slnd	28	19	67.857143	104.8	102.1	0.5730	49.0	47.8	...	40.5	50.0	46.8	31.0	33.9	73.0	-3.2	R64	14.0	2013
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
443	Yale	Ivy	29	22	75.862069	110.5	102.5	0.7034	55.6	47.3	...	31.1	56.0	47.8	36.5	31.0	72.2	-0.9	R64	14.0	2019
444	Iona	MAAC	33	17	51.515152	105.2	108.8	0.4044	53.0	52.3	...	28.5	53.2	50.3	35.1	36.6	71.9	-10.4	R64	16.0	2019
445	Iowa	B10	35	23	65.714286	116.1	100.6	0.8385	52.7	51.5	...	28.8	51.4	53.5	36.4	32.4	70.9	3.2	R32	10.0	2019
446	Minnesota	B10	36	22	61.111111	110.4	96.3	0.8284	48.2	49.5	...	28.2	48.5	48.7	31.7	34.0	68.8	2.7	R32	10.0	2019
447	New Mexico St.	WAC	33	30	90.909091	111.8	98.2	0.8155	54.1	48.9	...	32.5	56.7	48.0	34.0	33.5	67.1	1.4	R64	12.0	2019

	TEAM	CONF	G	W	W_PCT	ADJOE	ADJDE	BARTHAG	EFG_O	EFG_D	...	2P_O	2P_D	3P_O	3P_D	ADJ_T	WAB	POSTSEASON	SEED	YEAR	POST_ROUND
0	Florida Gulf Coast	ASun	35	24	68.571429	103.4	96.3	0.6952	51.6	46.9	...	52.3	46.9	33.4	31.3	69.1	-4.0	S16	15.0	2013	2.0
1	South Dakota St.	Sum	32	22	68.750000	109.7	103.7	0.6573	53.3	51.2	...	50.4	49.8	38.9	35.8	64.0	-3.2	R64	13.0	2013	0.0
2	Montana	BSky	30	23	76.666667	101.6	101.8	0.4929	52.9	48.6	...	52.0	48.0	36.4	33.3	63.9	-1.4	R64	13.0	2013	0.0
3	New Mexico St.	WAC	34	23	67.647059	102.5	95.3	0.6992	49.3	45.4	...	49.8	43.2	31.8	33.9	65.1	-3.1	R64	13.0	2013	0.0
4	Northwestern St.	Slnd	28	19	67.857143	104.8	102.1	0.5730	49.0	47.8	...	50.0	46.8	31.0	33.9	73.0	-3.2	R64	14.0	2013	0.0

The Madness Behind March Madness

An Analysis by: Rahul Shah

Technologies to use

Raw Matchups

Winning Seed Frequency

Blue Bloods

Addressing Controversy

Analyzing Regular Season Statistics

Predicting Outcomes

Conclusion