import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
#Models
from sklearn.linear_model import LinearRegression
# Location of the mtcars CSV (a raw GitHub gist; the URL embeds a revision hash).
filename = (
    "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/"
    "raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv"
)
# Read the CSV directly from the URL into a DataFrame.
mtcars_data = pd.read_csv(filepath_or_buffer=filename)
# Trailing expression: the notebook renders a preview of the first rows.
mtcars_data.head()
| | model | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Mazda RX4 | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | 4 |
1 | Mazda RX4 Wag | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | 4 |
2 | Datsun 710 | 22.8 | 4 | 108.0 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 4 | 1 |
3 | Hornet 4 Drive | 21.4 | 6 | 258.0 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | 1 |
4 | Hornet Sportabout | 18.7 | 8 | 360.0 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | 2 |
# Two side-by-side box plots, both grouped by `cyl`:
# `disp` on the left axes and `wt` on the right axes.
fig, ax = plt.subplots(ncols=2, figsize=(10, 6))
sns.boxplot(data=mtcars_data, x="cyl", y="disp", ax=ax[0])
sns.boxplot(data=mtcars_data, x="cyl", y="wt", ax=ax[1])
plt.show()
# Compare the wt/mpg relationship on the original scale and after taking
# natural logs of both variables.
fig2, ax2 = plt.subplots(figsize=(12, 6), ncols=2)

# Left panel: untransformed values.
sns.scatterplot(x=mtcars_data["wt"], y=mtcars_data["mpg"], ax=ax2[0])
ax2[0].set_title("Original Data")

# Right panel: log(wt) against log(mpg).
sns.scatterplot(
    x=np.log(mtcars_data["wt"]),
    y=np.log(mtcars_data["mpg"]),
    ax=ax2[1],
)
ax2[1].set_title("Log-transformed Data")

plt.show()
# Fit mpg ~ wt on the original scale. The single predictor is reshaped to an
# (n, 1) column because the estimator takes a 2-D feature matrix; `fit`
# returns the estimator itself, so construction and fitting can be chained.
normal_model = LinearRegression().fit(
    mtcars_data["wt"].to_numpy().reshape(-1, 1),
    mtcars_data["mpg"],
)

# Fit log(mpg) ~ log(wt). The final bare `fit` expression is kept so the
# notebook still echoes the fitted estimator as the cell output.
log_model = LinearRegression()
log_model.fit(
    np.log(mtcars_data["wt"].to_numpy()).reshape(-1, 1),
    np.log(mtcars_data["mpg"]),
)
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
# Scatter plots on the original and log scales, each overlaid with the
# predictions of the corresponding fitted linear model.
fig2, ax2 = plt.subplots(figsize=(12, 6), ncols=2)

# Left panel: raw data plus the line predicted by `normal_model`.
sns.scatterplot(x=mtcars_data["wt"], y=mtcars_data["mpg"], ax=ax2[0])
ax2[0].plot(
    mtcars_data["wt"],
    normal_model.predict(mtcars_data["wt"].values.reshape(-1, 1)),
)
ax2[0].set_title("Original Data")

# Right panel: log-log data plus the line predicted by `log_model`.
sns.scatterplot(
    x=np.log(mtcars_data["wt"]),
    y=np.log(mtcars_data["mpg"]),
    ax=ax2[1],
)
ax2[1].plot(
    np.log(mtcars_data["wt"].values),
    log_model.predict(np.log(mtcars_data["wt"].values).reshape(-1, 1)),
)
ax2[1].set_title("Log-transformed Data")

plt.show()
# Contingency table of observation counts: rows are `am`, columns are `cyl`.
# Called without `values`/`aggfunc`, crosstab computes the frequency table
# itself and puts 0 in any am/cyl combination that never occurs. The previous
# `values=..., aggfunc="count"` form left NaN in such cells, which would then
# propagate into the chi-square test run on this table.
table = pd.crosstab(index=mtcars_data["am"], columns=mtcars_data["cyl"])
# Trailing expression: the notebook renders the table.
table
cyl | 4 | 6 | 8 |
---|---|---|---|
am | |||
0 | 3 | 4 | 12 |
1 | 8 | 3 | 2 |
# Chi-square test of independence on the am-by-cyl contingency table; the
# result (statistic, p-value, dof, expected frequencies) is echoed by the cell.
stats.chi2_contingency(observed=table)
Chi2ContingencyResult(statistic=8.740732951259268, pvalue=0.012646605046107276, dof=2, expected_freq=array([[6.53125, 4.15625, 8.3125 ], [4.46875, 2.84375, 5.6875 ]]))