Download the Notebook Here
This exploratory data analysis is a personal learning exercise: in it, I analyse a dataset that I downloaded from kaggle.com.
Import Necessary Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import os
warnings.filterwarnings('ignore')
Importing Datasets
# Read the abalone CSV from its local path into a DataFrame, then preview the first rows.
csv_path = r'C:\Users\user\dl-course-data\abalone.csv'
df = pd.read_csv(csv_path)
df.head()
data:image/s3,"s3://crabby-images/4f1d8/4f1d859f12a1ccd846deb3879b3cb59cbc074f8f" alt="Image description"
# Dimensions of the dataset as a (rows, columns) tuple
df.shape
data:image/s3,"s3://crabby-images/b3890/b38904385b2ea1b7756c5d69d7ca13d115fb52ce" alt="Image description"
# Count the missing (null) values in each column of the dataset
df.isnull().sum()
data:image/s3,"s3://crabby-images/96865/96865dd731fa61506c6894b3f6603c9ba59d3acc" alt="image 1.jpg"
# Print a concise summary: column names, non-null counts and dtypes
df.info()
data:image/s3,"s3://crabby-images/51e87/51e87a30560a98fcb656a32541fb02af04698a77" alt="image 2.jpg"
# Summary statistics of the dataset, transposed so that each described
# column becomes a row and each statistic becomes a column
df.describe().T
data:image/s3,"s3://crabby-images/7bf27/7bf27f64c91f7ae56ec3095701fd85a6c3b15672" alt="image 3.jpg"
# Distinct labels that occur in the 'Type' column
type_column = df["Type"]
a = type_column.unique()
print(a)
data:image/s3,"s3://crabby-images/074b5/074b507c6eebfaaf376d5e75fe6af6a0fc961f90" alt="image 4.jpg"
# Number of rows for each 'Type' label, most frequent first
type_labels = df["Type"]
b = type_labels.value_counts()
print(b)
data:image/s3,"s3://crabby-images/100bc/100bc8f90d6d6783bee037fe5b968a6fe96beeda" alt="image 5.jpg"
# Number of non-null 'Rings' observations for each Type, as a two-column frame
rings_per_type = df.groupby("Type")["Rings"]
rings_per_type.count().reset_index(name="count")
data:image/s3,"s3://crabby-images/b2732/b27325ee8f88acad4b2eccbb9136129b3bf72f78" alt="image 6.jpg"
Adding ID Column to dataset
# Attach a 1-based sequential identifier to every row, then preview the result
row_total = len(df)
df["id"] = range(1, row_total + 1)
df.head()
data:image/s3,"s3://crabby-images/3f50b/3f50b708a91ce28190678154b27b5e36f84d05ab" alt="image 7.jpg"
Correlation
# Imported here so that this cell runs on its own: it needs plotly.express
# to draw the heatmap.
import plotly.express as px

# Pairwise correlation of the numeric columns only. 'Type' holds string
# labels (e.g. 'M'), and pandas >= 2.0 raises on non-numeric columns
# instead of silently dropping them, so they are filtered out explicitly.
correlation = df.select_dtypes(include="number").corr()

# Heatmap with each coefficient written inside its cell
fig = px.imshow(correlation, text_auto=True, aspect="auto")
fig.show()
data:image/s3,"s3://crabby-images/15fda/15fda25cfd976b4a42b22959a15b3188af1ecdbe" alt="image 8.jpg"
# Pie chart of how the samples are split across Types
# (Type M has the largest share)
import plotly.express as px
import pandas as pd

# One row per Type with its number of samples. Slices are sized by this
# count; summing a running row identifier instead would weight rows by
# their position in the file rather than counting them.
type_counts = df.groupby("Type").size().reset_index(name="count")

# Fill colours for the slices, one per entry in `pull` below
colors = ["#636EFA", "#EF553B", "#00CC96"]

fig = px.pie(type_counts, values="count", names="Type", title="Abalone Share by Type")
fig.update_traces(
    hoverinfo="label+percent",
    textinfo="label+percent",
    textfont_size=20,
    pull=[0.1, 0.1, 0.1],  # pull every slice slightly away from the centre
    marker=dict(colors=colors, line=dict(color="#000000", width=2)),
)
fig.show()
data:image/s3,"s3://crabby-images/2867e/2867e21850866c8d19c2ccf85053def0188d89cb" alt="image 9.jpg"
import plotly.express as px

# Bar chart with Type on the x-axis; 'id' drives both the bar height and the colour scale
fig = px.bar(data_frame=df, x="Type", y="id", color="id")
fig.show()
data:image/s3,"s3://crabby-images/c6b76/c6b760bcb34f66d7f3d4c8e99b96c049ee469c0e" alt="image 10.jpg"
# Histogram of 'id', coloured by Type.
# Pass nbins=<number_of_bins> to control how many bins are drawn.
px.histogram(data_frame=df, x="id", color="Type")
data:image/s3,"s3://crabby-images/19c64/19c64591ff5861592d8a89824da944ae5f2b8b8f" alt="image 11.jpg"
# Type x Rings contingency table of observation counts.
# margins=True appends an "All" row and column holding the totals.
cross_tab = pd.crosstab(index=df["Type"], columns=df["Rings"], margins=True)
cross_tab
data:image/s3,"s3://crabby-images/d1754/d1754adba4214bd3829c83e3fd20b079e4955d35" alt="image 12.jpg"
# Point plot of Rings for each Type.
# `factorplot` was renamed to `catplot` in seaborn 0.9 and has since been
# removed; kind="point" reproduces factorplot's default plot type. Columns
# are passed by name with data=df because current seaborn releases no
# longer accept x/y positionally.
sns.catplot(data=df, x="Type", y="Rings", kind="point")
data:image/s3,"s3://crabby-images/15d63/15d633421beacf2c83fca970693552d6f97fe0cf" alt="image 13.jpg"
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

# Chi-square test of independence between Type and Rings.
alpha = 0.05  # significance level

# The test must see observed counts only. Drop the "All" totals row/column
# that pd.crosstab(..., margins=True) appends; including them would inflate
# the statistic and the degrees of freedom. errors="ignore" makes this a
# no-op when the table carries no margins.
observed = cross_tab.drop(index="All", columns="All", errors="ignore")

stats, p_value, degrees_of_freedom, expected = chi2_contingency(observed)
if p_value > alpha:
    print(f'Accept Null Hypothesis\n p_value is {p_value}\n Rings are independent of Types')
else:
    print(f'Reject Null Hypothesis\n p_value is {p_value}\n Rings are not independent of Types')
data:image/s3,"s3://crabby-images/36f5d/36f5d9ef65ea27ea9ea469c4460838e1a37baaaa" alt="image 14.jpg"