import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as st
from sklearn import ensemble, tree, linear_model
import missingno as msno
Read the auto-mpg.csv file
Note:
# Load the Auto MPG data set from the working directory into a DataFrame.
DATA_PATH = "auto-mpg.csv"
mpg = pd.read_csv(DATA_PATH)
Describe the dataset
# Summary statistics (count, mean, std, min, quartiles, max) per column.
mpg.describe()
mpg | cylinders | displacement | horsepower | weight | acceleration | model year | |
---|---|---|---|---|---|---|---|
count | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 |
mean | 23.445918 | 5.471939 | 194.411990 | 104.469388 | 2977.584184 | 15.541327 | 75.979592 |
std | 7.805007 | 1.705783 | 104.644004 | 38.491160 | 849.402560 | 2.758864 | 3.683737 |
min | 9.000000 | 3.000000 | 68.000000 | 46.000000 | 1613.000000 | 8.000000 | 70.000000 |
25% | 17.000000 | 4.000000 | 105.000000 | 75.000000 | 2225.250000 | 13.775000 | 73.000000 |
50% | 22.750000 | 4.000000 | 151.000000 | 93.500000 | 2803.500000 | 15.500000 | 76.000000 |
75% | 29.000000 | 8.000000 | 275.750000 | 126.000000 | 3614.750000 | 17.025000 | 79.000000 |
max | 46.600000 | 8.000000 | 455.000000 | 230.000000 | 5140.000000 | 24.800000 | 82.000000 |
Look at the top and bottom 5 rows of data and look at the shape of the data frame (number of rows and columns)
# Preview the first rows of the frame (no count given, so pandas' default applies).
mpg.head()
mpg | cylinders | displacement | horsepower | weight | acceleration | model year | origin | car name | |
---|---|---|---|---|---|---|---|---|---|
0 | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | America | chevrolet chevelle malibu |
1 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | America | buick skylark 320 |
2 | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | America | plymouth satellite |
3 | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | America | amc rebel sst |
4 | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | America | ford torino |
# Preview the last rows of the frame (no count given, so pandas' default applies).
mpg.tail()
mpg | cylinders | displacement | horsepower | weight | acceleration | model year | origin | car name | |
---|---|---|---|---|---|---|---|---|---|
387 | 27.0 | 4 | 140.0 | 86 | 2790 | 15.6 | 82 | America | ford mustang gl |
388 | 44.0 | 4 | 97.0 | 52 | 2130 | 24.6 | 82 | Europe | vw pickup |
389 | 32.0 | 4 | 135.0 | 84 | 2295 | 11.6 | 82 | America | dodge rampage |
390 | 28.0 | 4 | 120.0 | 79 | 2625 | 18.6 | 82 | America | ford ranger |
391 | 31.0 | 4 | 119.0 | 82 | 2720 | 19.4 | 82 | America | chevy s-10 |
# (number of rows, number of columns) of the data frame.
mpg.shape
(392, 9)
Examine numerical features in the dataset
# Keep only the columns with a numeric dtype, then list their names.
numeric_features = mpg.select_dtypes(include=np.number)
numeric_features.columns
Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year'], dtype='object')
Examine categorical features in the dataset
# Keep only the object-dtype (string) columns, then list their names.
categorical_features = mpg.select_dtypes(include=object)
categorical_features.columns
Index(['origin', 'car name'], dtype='object')
Estimate Skewness and Kurtosis
# Skewness of every numeric column. numeric_only=True excludes the string
# columns explicitly instead of relying on the deprecated implicit default,
# which newer pandas versions no longer apply.
mpg.skew(numeric_only=True)
C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\3763326539.py:1: FutureWarning: The default value of numeric_only in DataFrame.skew is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. mpg.skew()
mpg 0.457092 cylinders 0.508109 displacement 0.701669 horsepower 1.087326 weight 0.519586 acceleration 0.291587 model year 0.019688 dtype: float64
# Kurtosis of every numeric column. numeric_only=True excludes the string
# columns explicitly instead of relying on the deprecated implicit default,
# which newer pandas versions no longer apply.
mpg.kurt(numeric_only=True)
C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\2295031750.py:1: FutureWarning: The default value of numeric_only in DataFrame.kurt is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. mpg.kurt()
mpg -0.515993 cylinders -1.398199 displacement -0.778317 horsepower 0.696947 weight -0.809259 acceleration 0.444234 model year -1.167446 dtype: float64
Plot the distribution of MPG as it is, overlaid with fitted Johnson SU, normal and log-normal distributions.
# Compare the MPG histogram against three candidate distributions, one
# figure per candidate. `sns.distplot` is deprecated, so the density
# histogram is drawn with `sns.histplot` and the fitted PDF is overlaid
# by hand.
y = mpg['mpg']
grid = np.linspace(y.min(), y.max(), 200)  # x values at which each PDF is drawn
candidates = [
    ('Johnson SU', st.johnsonsu),
    ('Normal', st.norm),
    ('Log Normal', st.lognorm),
]
for fig_num, (title, dist) in enumerate(candidates, start=1):
    plt.figure(fig_num)
    plt.title(title)
    # stat='density' puts the bars on the same scale as the PDF curve.
    fit_ax = sns.histplot(y, stat='density', alpha=0.4)
    params = dist.fit(y)  # distribution parameters estimated from the data
    fit_ax.plot(grid, dist.pdf(grid, *params), color='black')
C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\1571773139.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(y, kde=False, fit=st.johnsonsu) C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\1571773139.py:5: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(y, kde=False, fit=st.norm) C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\1571773139.py:7: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(y, kde=False, fit=st.lognorm)
<Axes: title={'center': 'Log Normal'}, xlabel='mpg'>
MPG is most normally distributed without any transformation.
# Histogram + KDE of the per-column skewness values.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` draws
# the equivalent density histogram with a KDE curve. numeric_only=True
# keeps the string columns out of the skewness computation.
skew_ax = sns.histplot(mpg.skew(numeric_only=True), kde=True, stat='density', color='blue')
skew_ax.set_xlabel('Skewness')
C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\916493348.py:1: FutureWarning: The default value of numeric_only in DataFrame.skew is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. sns.distplot(mpg.skew(),color='blue',axlabel ='Skewness') C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\916493348.py:1: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(mpg.skew(),color='blue',axlabel ='Skewness')
<Axes: xlabel='Skewness', ylabel='Density'>
# Distribution of the per-column kurtosis values.
# Compute the kurtosis once (numeric columns only) and reuse it for both layers.
kurtosis = mpg.kurt(numeric_only=True)

plt.figure(figsize=(12, 8))
# Replacement for the deprecated `sns.distplot`: density histogram with KDE.
kurt_ax = sns.histplot(kurtosis, kde=True, stat='density', color='r')
kurt_ax.set_xlabel('Kurtosis')
# NOTE(review): this overlay uses raw counts while the layer above is a
# density, so the two share one y-axis on different scales - kept as in the
# original; confirm whether both layers are really wanted.
plt.hist(kurtosis, orientation='vertical', histtype='bar', label='Kurtosis', color='blue')
plt.show()
C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\755977596.py:2: FutureWarning: The default value of numeric_only in DataFrame.kurt is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. sns.distplot(mpg.kurt(),color='r',axlabel ='Kurtosis',norm_hist= False, kde = True,rug = False) C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\755977596.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(mpg.kurt(),color='r',axlabel ='Kurtosis',norm_hist= False, kde = True,rug = False) C:\Users\leigh\AppData\Local\Temp\ipykernel_23200\755977596.py:3: FutureWarning: The default value of numeric_only in DataFrame.kurt is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. plt.hist(mpg.kurt(),orientation = 'vertical',histtype = 'bar',label ='Kurtosis', color ='blue')
# Plain histogram of the raw MPG values.
plt.hist(
    mpg['mpg'],
    color='blue',
    histtype='bar',
    orientation='vertical',
)
plt.show()
# Natural log of MPG: check its skewness and plot its histogram.
target = mpg['mpg'].pipe(np.log)
target.skew()
plt.hist(target, color='blue')
(array([ 3., 10., 40., 46., 66., 57., 66., 58., 38., 8.]), array([2.19722458, 2.36166217, 2.52609977, 2.69053737, 2.85497496, 3.01941256, 3.18385016, 3.34828775, 3.51272535, 3.67716294, 3.84160054]), <BarContainer object of 10 artists>)
Find Correlation coefficients between numeric features and MPG
# Pairwise correlation matrix of the numeric columns.
correlation = numeric_features.corr()

# Correlation of each feature with MPG, strongest positive first.
mpg_correlations = correlation['mpg'].sort_values(ascending=False)
print(mpg_correlations, '\n')
mpg 1.000000 model year 0.580541 acceleration 0.423329 cylinders -0.777618 horsepower -0.778427 displacement -0.805127 weight -0.832244 Name: mpg, dtype: float64
To explore further we will start with the following visualisation methods to analyze the data better:
# Heatmap of the full numeric correlation matrix.
f, ax = plt.subplots(figsize=(14, 12))
ax.set_title('Correlation of Numeric Features with MPG', y=1, size=16)
sns.heatmap(correlation, square=True, ax=ax)
<Axes: title={'center': 'Correlation of Numeric Features with MPG'}>
Cylinders, displacement, horsepower and weight are all positively correlated with each other, and negatively correlated with MPG.
# Annotated heatmap of the (up to) k features most positively correlated
# with MPG. k is larger than the number of numeric columns, so every
# numeric column is kept, ordered by its correlation with MPG.
k = 11
cols = correlation['mpg'].nlargest(k).index
print(cols)

# Recompute the correlation matrix in the selected column order
# (rowvar=False: each column of the array is one variable).
cm = np.corrcoef(mpg[cols].to_numpy(), rowvar=False)

f, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    cm,
    vmax=.8,
    linewidths=0.01,
    linecolor="white",
    square=True,
    annot=True,
    annot_kws={'size': 12},
    cmap='viridis',
    xticklabels=cols.values,
    yticklabels=cols.values,
    ax=ax,
)
Index(['mpg', 'model year', 'acceleration', 'cylinders', 'horsepower', 'displacement', 'weight'], dtype='object')
<Axes: >
Cylinders, horsepower, displacement and weight are all positively correlated with each other, and negatively correlated with MPG. There is also a moderate positive correlation of model year and acceleration with MPG (origin is categorical, so it does not appear in this correlation matrix).
Visualisation of the 'cylinders', 'horsepower', 'displacement', 'weight', 'model year' and 'acceleration' features with respect to MPG in the form of a scatter pair plot for better understanding.
# Scatter-plot matrix of MPG against the numeric features, with a KDE of
# each variable on the diagonal.
sns.set()
columns = ['mpg', 'cylinders', 'horsepower', 'displacement', 'weight', 'model year', 'acceleration']
# `height` is the current name of the per-facet size argument; the old
# `size` spelling is deprecated and triggers a UserWarning.
sns.pairplot(mpg[columns], height=2, kind='scatter', diag_kind='kde')
plt.show()
C:\Users\leigh\anaconda3\lib\site-packages\seaborn\axisgrid.py:2095: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning)
The previously identified correlations are all clearly illustrated here too.
# 3x2 grid of MPG-versus-feature scatter plots, each with a fitted
# regression line.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(
    nrows=3, ncols=2, figsize=(14, 10)
)

# Two-column frames (MPG plus one feature) backing each panel.
cylinders_scatter_plot = mpg[['mpg', 'cylinders']]
horsepower_scatter_plot = mpg[['mpg', 'horsepower']]
displacement_scatter_plot = mpg[['mpg', 'displacement']]
weight_scatter_plot = mpg[['mpg', 'weight']]
acceleration_scatter_plot = mpg[['mpg', 'acceleration']]
model_year_scatter_plot = mpg[['mpg', 'model year']]

# One panel per feature, filled row by row.
sns.regplot(data=cylinders_scatter_plot, x='cylinders', y='mpg',
            scatter=True, fit_reg=True, ax=ax1)
sns.regplot(data=horsepower_scatter_plot, x='horsepower', y='mpg',
            scatter=True, fit_reg=True, ax=ax2)
sns.regplot(data=displacement_scatter_plot, x='displacement', y='mpg',
            scatter=True, fit_reg=True, ax=ax3)
sns.regplot(data=weight_scatter_plot, x='weight', y='mpg',
            scatter=True, fit_reg=True, ax=ax4)
sns.regplot(data=acceleration_scatter_plot, x='acceleration', y='mpg',
            scatter=True, fit_reg=True, ax=ax5)
sns.regplot(data=model_year_scatter_plot, x='model year', y='mpg',
            scatter=True, fit_reg=True, ax=ax6)
<Axes: xlabel='model year', ylabel='mpg'>
Again, we can clearly see the negative correlations between MPG and cylinders, horsepower, displacement and weight, which would be intuitive.
The positive correlation between MPG and acceleration arises because acceleration is measured as a 0-60 time, so lower numbers actually represent faster acceleration. So although statistically the correlation is positive, in real terms it's negative, in that the greater the acceleration (the lower the number) the lower the MPG.
Model year is also positively correlated with MPG, suggesting that newer cars are more fuel efficient.
Now plot a bar chart of median MPG by model year to examine the last correlation in more detail.
# Median MPG for each model year, shown as a bar chart.
# The string alias 'median' selects pandas' built-in aggregation; passing
# the NumPy callable (np.median) for this is deprecated in recent pandas.
mpg_model_year = mpg.pivot_table(index='model year', values='mpg', aggfunc='median')
mpg_model_year.plot(kind='bar', color='blue')
plt.xlabel('Model Year')
plt.ylabel('Median MPG')
plt.show()
Box plot model year and cylinders
# Spread of MPG within each model year.
var = 'model year'
data = mpg[['mpg', var]]
f, ax = plt.subplots(figsize=(12, 8))
fig = sns.boxplot(data=data, x=var, y="mpg", ax=ax)
# `fig` holds what seaborn returned for the plot; fix the y-range to 0-50.
fig.axis(ymin=0, ymax=50);
# Spread of MPG for each cylinder count.
var = 'cylinders'
data = mpg[['mpg', var]]
f, ax = plt.subplots(figsize=(12, 8))
fig = sns.boxplot(data=data, x=var, y="mpg", ax=ax)
# `fig` holds what seaborn returned for the plot; fix the y-range to 0-50.
fig.axis(ymin=0, ymax=50);
Box Plot - origin
# Spread of MPG for each region of origin.
var = 'origin'
data = mpg[['mpg', var]]
f, ax = plt.subplots(figsize=(16, 10))
fig = sns.boxplot(data=data, x=var, y="mpg", ax=ax)
# `fig` holds what seaborn returned for the plot; fix the y-range to 0-50.
fig.axis(ymin=0, ymax=50);
# Tilt the category labels on the x-axis.
xt = plt.xticks(rotation=45)
Cars from America have the lowest MPG, followed by Europe, and then finally cars from Asia have the highest MPG.