import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as st
from sklearn import ensemble, tree, linear_model
import missingno as msno


# Load the Auto MPG data set; the relative path means the CSV must sit in
# the current working directory.
mpg = pd.read_csv("auto-mpg.csv")


# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
mpg.describe()


# Peek at the first rows of the frame.
mpg.head()


# Peek at the last rows of the frame.
mpg.tail()


# (number of rows, number of columns).
mpg.shape

(392, 9)


# Keep only the numeric columns; the correlation analysis below works on these.
numeric_features = mpg.select_dtypes(include="number")

numeric_features.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year'],
      dtype='object')


# Object-dtype columns are treated as the categorical features.
categorical_features = mpg.select_dtypes(include="object")
categorical_features.columns

Index(['origin', 'car name'], dtype='object')


# Skewness of every numeric column. `numeric_only=True` is passed explicitly
# because relying on the implicit default is deprecated and the frame also
# holds non-numeric columns.
mpg.skew(numeric_only=True)

C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\3763326539.py:1: FutureWarning: The default value of numeric_only in DataFrame.skew is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
  mpg.skew()

mpg             0.457092
cylinders       0.508109
displacement    0.701669
horsepower      1.087326
weight          0.519586
acceleration    0.291587
model year      0.019688
dtype: float64


# Kurtosis of every numeric column. `numeric_only=True` is passed explicitly
# because relying on the implicit default is deprecated and the frame also
# holds non-numeric columns.
mpg.kurt(numeric_only=True)

C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\2295031750.py:1: FutureWarning: The default value of numeric_only in DataFrame.kurt is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
  mpg.kurt()

mpg            -0.515993
cylinders      -1.398199
displacement   -0.778317
horsepower      0.696947
weight         -0.809259
acceleration    0.444234
model year     -1.167446
dtype: float64


# Compare three candidate distributions for the target: one figure per
# candidate, each showing a density-scaled histogram of mpg with the fitted
# PDF drawn on top.
# `sns.distplot(..., fit=...)` is deprecated, so the histogram comes from
# `sns.histplot` and the fitted curve is plotted explicitly.
y = mpg['mpg']
pdf_grid = np.linspace(y.min(), y.max(), 200)
candidates = [
    ('Johnson SU', st.johnsonsu),
    ('Normal', st.norm),
    ('Log Normal', st.lognorm),
]
for fig_num, (title, dist) in enumerate(candidates, start=1):
    plt.figure(fig_num)
    plt.title(title)
    # Density scale so the histogram and the PDF share the same y-axis units.
    fit_ax = sns.histplot(y, stat='density')
    fit_params = dist.fit(y)
    fit_ax.plot(pdf_grid, dist.pdf(pdf_grid, *fit_params), color='#282828')

C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\1571773139.py:3: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(y, kde=False, fit=st.johnsonsu)
C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\1571773139.py:5: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(y, kde=False, fit=st.norm)
C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\1571773139.py:7: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(y, kde=False, fit=st.lognorm)

<Axes: title={'center': 'Log Normal'}, xlabel='mpg'>


# Distribution of the per-column skewness values (density histogram + KDE).
# Uses `histplot` instead of the deprecated `distplot`, and passes
# `numeric_only=True` explicitly to avoid the deprecated implicit default.
skew_ax = sns.histplot(mpg.skew(numeric_only=True), color='blue', kde=True, stat='density')
skew_ax.set_xlabel('Skewness')

C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\916493348.py:1: FutureWarning: The default value of numeric_only in DataFrame.skew is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
  sns.distplot(mpg.skew(),color='blue',axlabel ='Skewness')
C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\916493348.py:1: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(mpg.skew(),color='blue',axlabel ='Skewness')

<Axes: xlabel='Skewness', ylabel='Density'>


# Distribution of the per-column kurtosis values.
# The kurtosis Series is computed once (with an explicit `numeric_only=True`)
# and reused for both layers; `histplot` replaces the deprecated `distplot`.
kurtosis = mpg.kurt(numeric_only=True)
plt.figure(figsize=(12, 8))
kurt_ax = sns.histplot(kurtosis, color='r', kde=True, stat='density')
kurt_ax.set_xlabel('Kurtosis')
# Plain matplotlib histogram of the same values, drawn on the same axes.
plt.hist(kurtosis, orientation='vertical', histtype='bar', label='Kurtosis', color='blue')
plt.show()

C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\755977596.py:2: FutureWarning: The default value of numeric_only in DataFrame.kurt is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
  sns.distplot(mpg.kurt(),color='r',axlabel ='Kurtosis',norm_hist= False, kde = True,rug = False)
C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\755977596.py:2: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(mpg.kurt(),color='r',axlabel ='Kurtosis',norm_hist= False, kde = True,rug = False)
C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\755977596.py:3: FutureWarning: The default value of numeric_only in DataFrame.kurt is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
  plt.hist(mpg.kurt(),orientation = 'vertical',histtype = 'bar',label ='Kurtosis', color ='blue')


# Histogram of the raw target values.
mpg_values = mpg['mpg']
plt.hist(mpg_values, color='blue', histtype='bar', orientation='vertical')
plt.show()


# Log-transform the target and inspect how the transform changes its shape.
target = np.log(mpg['mpg'])
# A bare expression in the middle of a cell is evaluated and discarded, so
# the skewness has to be printed to actually be shown.
print(target.skew())
plt.hist(target, color='blue')

(array([ 3., 10., 40., 46., 66., 57., 66., 58., 38.,  8.]),
 array([2.19722458, 2.36166217, 2.52609977, 2.69053737, 2.85497496,
        3.01941256, 3.18385016, 3.34828775, 3.51272535, 3.67716294,
        3.84160054]),
 <BarContainer object of 10 artists>)


# Pairwise correlations between the numeric features, then the correlations
# with mpg ranked from highest to lowest.
correlation = numeric_features.corr()
mpg_corr_ranked = correlation['mpg'].sort_values(ascending=False)
print(mpg_corr_ranked, '\n')

mpg             1.000000
model year      0.580541
acceleration    0.423329
cylinders      -0.777618
horsepower     -0.778427
displacement   -0.805127
weight         -0.832244
Name: mpg, dtype: float64


# Heat map of the full numeric correlation matrix, drawn on a dedicated axes.
f, ax = plt.subplots(figsize=(14, 12))
ax.set_title('Correlation of Numeric Features with MPG', y=1, size=16)
sns.heatmap(correlation, square=True, ax=ax)

<Axes: title={'center': 'Correlation of Numeric Features with MPG'}>


# Annotated heat map of the (up to) k columns most correlated with mpg.
# nlargest returns at most as many rows as the correlation matrix has.
k = 11
cols = correlation.nlargest(k, 'mpg')['mpg'].index
print(cols)
# rowvar=False: each column of the value matrix is one variable.
cm = np.corrcoef(mpg[cols].values, rowvar=False)
f, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    cm,
    vmax=.8,
    linewidths=0.01,
    linecolor="white",
    square=True,
    annot=True,
    annot_kws={'size': 12},
    cmap='viridis',
    xticklabels=cols.values,
    yticklabels=cols.values,
)

Index(['mpg', 'model year', 'acceleration', 'cylinders', 'horsepower',
       'displacement', 'weight'],
      dtype='object')

<Axes: >


# Pair plot of mpg against the other numeric columns, with KDEs on the diagonal.
sns.set()
columns = ['mpg', 'cylinders', 'horsepower', 'displacement', 'weight', 'model year', 'acceleration']
# `height` is the current name of the per-facet size parameter; the old
# `size` spelling triggers a rename warning.
sns.pairplot(mpg[columns], height=2, kind='scatter', diag_kind='kde')
plt.show()

C:\Users\leigh\anaconda3\lib\site-packages\seaborn\axisgrid.py:2095: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)


# 3x2 grid of scatter plots with a fitted regression line: mpg against each
# of the other numeric columns, one feature per axes.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(nrows=3, ncols=2, figsize=(14, 10))

# Two-column frames (mpg + one feature) used as the data source of each panel.
cylinders_scatter_plot = mpg[['mpg', 'cylinders']]
horsepower_scatter_plot = mpg[['mpg', 'horsepower']]
displacement_scatter_plot = mpg[['mpg', 'displacement']]
weight_scatter_plot = mpg[['mpg', 'weight']]
acceleration_scatter_plot = mpg[['mpg', 'acceleration']]
model_year_scatter_plot = mpg[['mpg', 'model year']]

sns.regplot(data=cylinders_scatter_plot, x='cylinders', y='mpg', scatter=True, fit_reg=True, ax=ax1)
sns.regplot(data=horsepower_scatter_plot, x='horsepower', y='mpg', scatter=True, fit_reg=True, ax=ax2)
sns.regplot(data=displacement_scatter_plot, x='displacement', y='mpg', scatter=True, fit_reg=True, ax=ax3)
sns.regplot(data=weight_scatter_plot, x='weight', y='mpg', scatter=True, fit_reg=True, ax=ax4)
sns.regplot(data=acceleration_scatter_plot, x='acceleration', y='mpg', scatter=True, fit_reg=True, ax=ax5)
sns.regplot(data=model_year_scatter_plot, x='model year', y='mpg', scatter=True, fit_reg=True, ax=ax6)

<Axes: xlabel='model year', ylabel='mpg'>


# Median mpg per model year, shown as a bar chart.
# The aggregation is named by its string alias: passing the NumPy callable
# is deprecated in newer pandas releases, while the alias gives the same
# result on every version.
mpg_model_year = mpg.pivot_table(index='model year', values='mpg', aggfunc='median')
mpg_model_year.plot(kind='bar', color='blue')
plt.xlabel('Model Year')
plt.ylabel('Median MPG')
plt.show()


# Box plot of mpg for each model year, with a fixed 0-50 y-range.
var = 'model year'
data = mpg[['mpg', var]]
f, ax = plt.subplots(figsize=(12, 8))
# NOTE: despite its name, `fig` holds the Axes returned by seaborn.
fig = sns.boxplot(data=data, x=var, y="mpg")
fig.axis(ymin=0, ymax=50);

# Box plot of mpg for each cylinder count, with a fixed 0-50 y-range.
var = 'cylinders'
data = mpg[['mpg', var]]
f, ax = plt.subplots(figsize=(12, 8))
# NOTE: despite its name, `fig` holds the Axes returned by seaborn.
fig = sns.boxplot(data=data, x=var, y="mpg")
fig.axis(ymin=0, ymax=50);


# Box plot of mpg for each region of origin, with a fixed 0-50 y-range and
# rotated tick labels.
var = 'origin'
data = mpg[['mpg', var]]
f, ax = plt.subplots(figsize=(16, 10))
# NOTE: despite its name, `fig` holds the Axes returned by seaborn.
fig = sns.boxplot(data=data, x=var, y="mpg")
fig.axis(ymin=0, ymax=50);
xt = plt.xticks(rotation=45)

	mpg	cylinders	displacement	horsepower	weight	acceleration	model year
count	392.000000	392.000000	392.000000	392.000000	392.000000	392.000000	392.000000
mean	23.445918	5.471939	194.411990	104.469388	2977.584184	15.541327	75.979592
std	7.805007	1.705783	104.644004	38.491160	849.402560	2.758864	3.683737
min	9.000000	3.000000	68.000000	46.000000	1613.000000	8.000000	70.000000
25%	17.000000	4.000000	105.000000	75.000000	2225.250000	13.775000	73.000000
50%	22.750000	4.000000	151.000000	93.500000	2803.500000	15.500000	76.000000
75%	29.000000	8.000000	275.750000	126.000000	3614.750000	17.025000	79.000000
max	46.600000	8.000000	455.000000	230.000000	5140.000000	24.800000	82.000000

	mpg	cylinders	displacement	horsepower	weight	acceleration	model year	origin	car name
0	18.0	8	307.0	130	3504	12.0	70	America	chevrolet chevelle malibu
1	15.0	8	350.0	165	3693	11.5	70	America	buick skylark 320
2	18.0	8	318.0	150	3436	11.0	70	America	plymouth satellite
3	16.0	8	304.0	150	3433	12.0	70	America	amc rebel sst
4	17.0	8	302.0	140	3449	10.5	70	America	ford torino

	mpg	cylinders	displacement	horsepower	weight	acceleration	model year	origin	car name
387	27.0	4	140.0	86	2790	15.6	82	America	ford mustang gl
388	44.0	4	97.0	52	2130	24.6	82	Europe	vw pickup
389	32.0	4	135.0	84	2295	11.6	82	America	dodge rampage
390	28.0	4	120.0	79	2625	18.6	82	America	ford ranger
391	31.0	4	119.0	82	2720	19.4	82	America	chevy s-10

Exploratory Data Analysis on auto-mpg dataset¶

Import Libraries¶

Correlation Heat Map¶

Zoomed HeatMap¶

MPG Correlation matrix¶

Pair Plot¶

Pair Plot between 'mpg' and correlated variables¶

Scatter Plot¶

Scatter plots between the most correlated variables with MPG¶