Download Notebook Here
This exploratory data analysis is a personal learning exercise in which I analyze datasets that I downloaded from kaggle.com.
Import Necessary Libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from scipy.stats import chi2_contingency

warnings.filterwarnings('ignore')
Importing Datasets
# Load the abalone dataset from the local CSV file and preview the first rows.
csv_path = r'C:\Users\user\dl-course-data\abalone.csv'
df = pd.read_csv(csv_path)
df.head()

# Dimensions of the dataset as (rows, columns).
df.shape

# Number of missing values in each column.
df.isna().sum()

# Column names, dtypes and non-null counts.
df.info()

# Summary statistics, transposed so that each dataset column becomes a row.
df.describe().transpose()

# Distinct values found in the 'Type' column.
a = pd.unique(df['Type'])
print(a)

# Number of rows for each 'Type' value.
b = df['Type'].value_counts()
print(b)

# Number of non-null 'Rings' entries per 'Type', as a frame with a 'count' column.
(
    df.groupby("Type")["Rings"]
    .count()
    .reset_index(name="count")
)

Adding ID Column to dataset
# Add a 1-based sequential row identifier, then preview the result.
row_total = df.shape[0]
df['id'] = range(1, row_total + 1)
df.head()

Correlation
# Pairwise correlation between the numeric columns, drawn as an annotated heatmap.
# Non-numeric columns (e.g. the categorical 'Type' column) are excluded first:
# pandas >= 2.0 raises on them instead of silently dropping them.
# Note: any synthetic 'id' column present in df is numeric and is still included.
numeric_columns = df.select_dtypes(include="number")
correlation = numeric_columns.corr()
fig = px.imshow(correlation, text_auto=True, aspect="auto")
fig.show()

# Share of each abalone type, based on the number of rows per type.
# Rows are counted (rather than summing the arbitrary 'id' values) so that the
# sector sizes reflect how often each type occurs in the dataset.
type_counts = df.groupby('Type').size().reset_index(name='count')
# Palette for the sectors; plotly assigns one colour per sector in order.
colors = px.colors.qualitative.Plotly
fig = px.pie(type_counts, values='count', names='Type', title='Abalone Share By Type')
fig.update_traces(
    hoverinfo='label+percent',
    textinfo='label+percent',
    textfont_size=20,
    # Pull every sector out slightly, however many types there are.
    pull=[0.1] * len(type_counts),
    marker=dict(colors=colors, line=dict(color='#000000', width=2)),
)
fig.show()

import plotly.express as px

# Bar chart of the 'id' values for each 'Type', with bars coloured by 'id'.
fig = px.bar(
    data_frame=df,
    x='Type',
    y='id',
    color='id',
)
fig.show()

# Histogram of 'id' split by 'Type'; pass nbins=<number_of_bins> to control the bin count.
px.histogram(data_frame=df, x="id", color="Type")

# Contingency table of Type (rows) against Rings (columns); margins=True
# appends the "All" row and column that hold the totals.
cross_tab = pd.crosstab(
    index=df["Type"],
    columns=df["Rings"],
    margins=True,
)
cross_tab

# Point plot of Rings for each Type.
# `factorplot` was renamed to `catplot` and later removed from seaborn;
# kind="point" reproduces factorplot's default plot type. x and y are passed
# by column name because recent seaborn releases reject positional data arguments.
sns.catplot(data=df, x="Type", y="Rings", kind="point")

# Chi-square test of independence between Type and Rings.
# Null hypothesis: Rings is independent of Type.
alpha = 0.05
# Build the observed-frequency table WITHOUT margins: the "All" totals are not
# observations, and including them distorts both the test statistic and the
# degrees of freedom.
observed = pd.crosstab(df["Type"], df["Rings"])
stats, p_value, degrees_of_freedom, expected = chi2_contingency(observed)
# A p-value above alpha means there is not enough evidence to reject the
# null hypothesis at the chosen significance level.
if p_value > alpha:
    print(f'Accept Null Hypothesis\n p_value is {p_value}\n Rings are independent of Types')
else:
    print(f'Reject Null Hypothesis\n p_value is {p_value}\n Rings are not independent of Types')
