mtcars Data Analysis

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to every matplotlib figure created below.
sns.set_theme()

#Models
from sklearn.linear_model import LinearRegression
In [2]:
# Source CSV; the URL contains a fixed revision hash, so repeated runs read the same file.
filename = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv"
# Read the remote CSV into the DataFrame used by every later cell.
mtcars_data = pd.read_csv(filepath_or_buffer=filename)
In [3]:
# Preview the first five rows to check that the columns parsed as expected.
mtcars_data.head(n=5)
Out[3]:
model mpg cyl disp hp drat wt qsec vs am gear carb
0 Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
1 Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
2 Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
3 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
4 Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2

Exploratory Data Analysis (EDA)

In [4]:
# Side-by-side box plots of disp and wt, each grouped by cyl.
fig, ax = plt.subplots(figsize=(10, 6), ncols=2)

sns.boxplot(data=mtcars_data, x="cyl", y="disp", ax=ax[0])
ax[0].set_title("disp by cyl")

sns.boxplot(data=mtcars_data, x="cyl", y="wt", ax=ax[1])
ax[1].set_title("wt by cyl")

# Titles let each panel be read on its own; tight_layout keeps the
# neighbouring axis labels from overlapping.
fig.tight_layout()
plt.show()
In [10]:
# Compare mpg against wt on the raw scale (left) and the log-log scale (right).
fig2, ax2 = plt.subplots(ncols=2, figsize=(12, 6))

sns.scatterplot(x=mtcars_data.wt, y=mtcars_data.mpg, ax=ax2[0])
ax2[0].set_title("Original Data")

sns.scatterplot(x=np.log(mtcars_data.wt), y=np.log(mtcars_data.mpg), ax=ax2[1])
ax2[1].set_title("Log-transformed Data")
# np.log returns Series that keep the names "wt" and "mpg", so seaborn would
# label the log axes with the raw column names; relabel them explicitly.
ax2[1].set(xlabel="log(wt)", ylabel="log(mpg)")

plt.show()

Regression analysis of MPG on WT: original data and log-transformed data

In [6]:
# Reshape wt into a single-column 2-D array so it can serve as the feature matrix.
wt_column = mtcars_data.wt.values.reshape(-1, 1)

# mpg regressed on wt, on the original scale.
normal_model = LinearRegression().fit(wt_column, mtcars_data.mpg)

# log(mpg) regressed on log(wt).
log_model = LinearRegression().fit(np.log(wt_column), np.log(mtcars_data.mpg))
log_model
Out[6]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [12]:
# Overlay each fitted regression line on the scatter plot it was trained on.
fig2, ax2 = plt.subplots(ncols=2, figsize=(12, 6))

# Evaluate the fits on sorted weights so each line is drawn left to right once.
wt_sorted = np.sort(mtcars_data.wt.values)
log_wt_sorted = np.log(wt_sorted)

# Left panel: original scale.
sns.scatterplot(x=mtcars_data.wt, y=mtcars_data.mpg, ax=ax2[0])
ax2[0].plot(
    wt_sorted,
    normal_model.predict(wt_sorted.reshape(-1, 1)),
    color="C1",  # contrasting colour so the fit stands out from the markers
    label="Linear fit",
)
ax2[0].set_title("Original Data")
ax2[0].legend()

# Right panel: log-log scale.
sns.scatterplot(x=np.log(mtcars_data.wt), y=np.log(mtcars_data.mpg), ax=ax2[1])
ax2[1].plot(
    log_wt_sorted,
    log_model.predict(log_wt_sorted.reshape(-1, 1)),
    color="C1",
    label="Linear fit",
)
ax2[1].set_title("Log-transformed Data")
# np.log returns Series that keep the names "wt" and "mpg", so seaborn would
# label the log axes with the raw column names; relabel them explicitly.
ax2[1].set(xlabel="log(wt)", ylabel="log(mpg)")
ax2[1].legend()

plt.show()

Contingency Table: CYL and AM, Chi-square Test of Independence

In [8]:
# Frequency table of am (rows) against cyl (columns).
# With no `values`/`aggfunc`, pd.crosstab counts occurrences and fills
# combinations that never occur with 0 rather than NaN, so the table stays
# integer-valued and safe to pass to a chi-square test.
table = pd.crosstab(index=mtcars_data.am, columns=mtcars_data.cyl)
table
Out[8]:
cyl 4 6 8
am
0 3 4 12
1 8 3 2
In [9]:
# Chi-square test on the am x cyl contingency table `table`; the result object
# (statistic, p-value, degrees of freedom, expected frequencies) is the cell output.
# NOTE(review): several expected frequencies in the displayed output are below 5,
# so the chi-square approximation may be unreliable here - interpret the p-value with care.
stats.chi2_contingency(table)
Out[9]:
Chi2ContingencyResult(statistic=8.740732951259268, pvalue=0.012646605046107276, dof=2, expected_freq=array([[6.53125, 4.15625, 8.3125 ],
       [4.46875, 2.84375, 5.6875 ]]))
In [ ]: