## Jupyter Snippet CB2nd 04_correlation

Jupyter Snippet CB2nd 04_correlation

# 7.4. Estimating the correlation between two variables with a contingency table and a chi-squared test

``````import numpy as np
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
%matplotlib inline
``````
``````player = 'Roger Federer'
# Load the match statistics from the cookbook data repository.
# The 'start date' column is parsed as a date, reading the day first.
df = pd.read_csv('https://github.com/ipython-books/'
                 'cookbook-2nd-data/blob/master/'
                 'federer.csv?raw=true',
                 parse_dates=['start date'],
                 dayfirst=True)
``````
``````print(f"Number of columns: {len(df.columns)}")
# The frame is wide (70 columns in the recorded output), so preview only
# the first four columns of the last rows.
df[df.columns[:4]].tail()
``````
``````Number of columns: 70
``````

``````npoints = df['player1 total points total']
# Divide by the total number of points so that both quantities are
# per-row ratios rather than raw counts.
points = df['player1 total points won'] / npoints
aces = df['player1 aces'] / npoints
``````
``````fig, ax = plt.subplots(1, 1)
# Scatter plot with one dot per row: ratio of points won (x) against
# ratio of aces (y).
ax.plot(points, aces, '.')
# NOTE(review): the plotted values are ratios (the x range is fixed to
# 0-1 below), although the axis labels say '%'.
ax.set_xlabel('% of points won')
ax.set_ylabel('% of aces')
ax.set_xlim(0., 1.)
# A single argument fixes only the lower y limit; the upper limit is left
# to matplotlib.
ax.set_ylim(0.)
``````

``````df_bis = pd.DataFrame({'points': points,
'aces': aces}).dropna()
# The two ratios are gathered in one frame; dropna() discards the rows in
# which either of them is missing.
df_bis.tail()
``````

``````df_bis.corr()
# Pairwise correlation between the two columns (pandas' default method is
# Pearson).
``````

``````df_bis['result'] = (df_bis['points'] >
df_bis['points'].median())
# Both new columns are boolean flags: True where the value is strictly
# above the median of its own column.
df_bis['manyaces'] = (df_bis['aces'] >
df_bis['aces'].median())
``````
``````pd.crosstab(df_bis['result'], df_bis['manyaces'])
# Contingency table: number of rows for each combination of the two
# boolean flags.
``````

``````st.chi2_contingency(_)
``````
``````(2.780e+01, 1.338e-07, 1,
array([[ 257.250,  256.749],
[ 256.749,  256.250]]))
``````