Exploratory Data Analysis With Chi Square Contingency

Exploratory Data Analysis With Chi Square Contingency

Download the Notebook Here

This exploratory data analysis is a personal learning exercise. In it, I analyse a dataset that I downloaded from kaggle.com.

Plotly For Visualization

Seaborn For Visualization

Import Necessary Libraries

# Libraries for data manipulation
import pandas as pd
import numpy as np

# Libraries for visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Libraries for the operating system and warning control
import warnings
import os
warnings.filterwarnings('ignore')

Importing Datasets

# Load the abalone CSV from its local path into a DataFrame and preview the first rows
csv_path = r'C:\Users\user\dl-course-data\abalone.csv'
df = pd.read_csv(csv_path)
df.head()

Image description

Checking data information

# Shape of the dataset as a (rows, columns) tuple
df.shape

Image description

# Count the missing (null/NaN) values in each column of the dataset
df.isna().sum()

image 1.jpg

# Information about the dataset (printed summary of its columns)
df.info()

image 2.jpg

# Summary statistics of the dataset, transposed so each column becomes a row
df.describe().transpose()

image 3.jpg

# Distinct values that occur in the Type column
a = pd.unique(df['Type'])
print(a)

image 4.jpg

# Finding the number of rows for each value of Type
b = df['Type'].value_counts()
print(b)

image 5.jpg

# Count the non-null Rings values within each Type and return them as a
# two-column table (Type, count)

rings_by_type = df.groupby("Type")["Rings"]
rings_by_type.count().reset_index(name="count")

image 6.jpg

Adding ID Column to dataset

# Attach a 1-based sequential id to every row, then preview the result
row_total = len(df)
df['id'] = range(1, row_total + 1)
df.head()

image 7.jpg

Correlation

# Correlation matrix of the dataset, drawn as an annotated heatmap.
# plotly.express is imported here because this cell uses `px` before any
# other cell has imported it.
import plotly.express as px

# Correlation is only defined for numeric columns, and recent pandas versions
# raise on the string-valued Type column, so restrict to numeric data first.
correlation = df.select_dtypes(include="number").corr()
# Author's observation: Longest Shell has the highest positive correlation value
# NOTE(review): the synthetic id column is numeric, so it also appears in the
# matrix when it has already been added to df.

fig = px.imshow(correlation, text_auto=True, aspect="auto")
fig.show()

image 8.jpg

# Share of observations belonging to each abalone Type, shown as a pie chart.
# Author's observation: Type M has the highest percentage.

import plotly.express as px
import pandas as pd 

# One row per Type with the number of observations of that Type. The slices
# are sized by these counts; sizing them by the sum of the row ids would make
# the shares depend on row order rather than on how many rows each Type has.
type_counts = df['Type'].value_counts().rename_axis('Type').reset_index(name='count')

# Slice colours (the name was previously used without being defined).
colors = px.colors.qualitative.Plotly

fig = px.pie(type_counts, values='count', names='Type', title='Abalone Share By Type')
# Pull every slice out slightly, whatever the number of Types.
fig.update_traces(hoverinfo='label+percent', textinfo='label+percent', textfont_size=20,
                  pull=[0.1] * len(type_counts),
                  marker=dict(colors=colors, line=dict(color='#000000', width=2)))
fig.show()

image 9.jpg

# Number of observations per abalone Type, shown as a bar chart.
# Author's observation: Type M has the highest count.

import plotly.express as px

# Plot the row count of each Type; stacking the synthetic id values would give
# the sum of ids per Type, which is not a count.
type_counts = df['Type'].value_counts().rename_axis('Type').reset_index(name='count')

fig = px.bar(type_counts, x='Type', y='count', color='Type')
fig.show()

image 10.jpg

# Histogram of the id column, coloured by Type.
# Pass nbins=<number_of_bins> to px.histogram to control the number of bins.

px.histogram(df, x="id", color="Type")

image 11.jpg

# Contingency table of Type (rows) against Rings (columns); margins=True
# appends the "All" row and column holding the totals

cross_tab = pd.crosstab(index=df["Type"], columns=df["Rings"], margins=True)
cross_tab

image 12.jpg

# Point plot of Rings for each Type.
# Author's note: the F type is the factor determinant for the whole parameters
# sns.factorplot was renamed to catplot and has since been removed from
# seaborn; its default kind was "point", so that kind is requested explicitly.
# x and y are given as column names by keyword, as current seaborn requires.

sns.catplot(data=df, x="Type", y="Rings", kind="point")

image 13.jpg

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

# Chi-square test of independence between Type and Rings.
# Null hypothesis: Rings is independent of Type.

alpha = 0.05  # significance level

# Observed-frequency table built WITHOUT margins: the "All" totals row and
# column are not observations, and passing them to the test distorts both the
# statistic and the degrees of freedom.
observed = pd.crosstab(df["Type"], df["Rings"])

stats,p_value,degrees_of_freedom,expected = chi2_contingency(observed)
# NOTE(review): the chi-square approximation is unreliable when many cells of
# `expected` are small (a common rule of thumb is at least 5) - confirm
# against the `expected` array before relying on the p-value.

# A p-value above alpha means the data give no grounds to reject the null
# hypothesis; it does not prove the null hypothesis true.
if p_value > alpha:
    print(f'Fail to reject Null Hypothesis\n p_value is {p_value}\n No evidence that Rings depend on Types')
else:
    print(f'Reject Null Hypothesis\n p_value is {p_value}\n Rings are not independent of Types')

image 14.jpg

References