import pandas as pd
import numpy as np
import seaborn as sns
import random
import plotly.offline as py
import plotly.graph_objects as go
import plotly.express as px
#import chart_studio.plotly as py
Introduction
Description and Simulation: In this worksheet, I will create an artificial data set, the age of a population, using a composition technique. In other words, I'll create six sample sets of the population, each with its own mean age to portray one age interval, and then explore the data with descriptive statistics, enabling us to understand the data from different perspectives.
Construct a Data Set
"""np.random.normal generates a vector of random values that follow a normal distribution
with a specific mean and standard deviation: mean, sd, size """
#Note: Because the data is made up by sampling from normal distributions, some observations are negative,
#but age can't be negative, so that needs to be handled later
# Build the artificial age population from six normally distributed groups.
random.seed(112)     # seeds the stdlib RNG (random.choice / random.sample)
np.random.seed(112)  # np.random.normal uses NumPy's own global RNG, so seed it too
# NOTE(review): a and b use identical parameters (loc=5, scale=7) - confirm this is intended
a = np.random.normal(loc=5, scale=7, size=2000)
b = np.random.normal(loc=5, scale=7, size=2000)
c = np.random.normal(loc=35, scale=5, size=2500)
d = np.random.normal(loc=50, scale=8, size=3000)
e = np.random.normal(loc=70, scale=5, size=1000)
f = np.random.normal(loc=80, scale=7, size=1000)
# combine all the vectors
pop = np.concatenate((a, b, c, d, e, f))
pop.shape  # recorded notebook output: (11500,)
# make every value absolute (no negative ages) and cast the type to int
pop = np.absolute(pop).astype(int)
# exclude age == 0
pop = pop[pop != 0]
pop  # recorded notebook output (unseeded run): array([ 8, 4, 9, ..., 75, 87, 63])
# the number of observations does not decrease significantly, so it should be fine
pop.shape  # recorded notebook output (unseeded run): (11136,)
# Put the ages into a DataFrame and prepare the list of sex labels.
df = pd.DataFrame(pop, columns=['Age'])
sex = ['Male', 'Female']
# random.choice() picks one random value from a list
print(random.choice(sex))  # recorded notebook output: Female
df['Sex'] = random.choice(sex)  # need to fill up the attribute with some value first
df['Sex'] = [ random.choice(sex) for i in df['Sex'] ] #apply random choice with list expressions
df
| | Age | Sex |
|---|---|---|
| 0 | 8 | Female |
| 1 | 4 | Female |
| 2 | 9 | Male |
| 3 | 1 | Female |
| 4 | 6 | Male |
| ... | ... | ... |
| 11131 | 67 | Male |
| 11132 | 73 | Female |
| 11133 | 75 | Female |
| 11134 | 87 | Male |
| 11135 | 63 | Female |
11136 rows × 2 columns
df
| | Age | Sex |
|---|---|---|
| 0 | 8 | Female |
| 1 | 4 | Female |
| 2 | 9 | Male |
| 3 | 1 | Female |
| 4 | 6 | Male |
| ... | ... | ... |
| 11131 | 67 | Male |
| 11132 | 73 | Female |
| 11133 | 75 | Female |
| 11134 | 87 | Male |
| 11135 | 63 | Female |
11136 rows × 2 columns
df.groupby(['Sex']).count()
| Sex | Age |
|---|---|
| Female | 5524 |
| Male | 5612 |
Explore data with some visualizations
#Prepare the data
"""
I want to make a pyramid population catagorized by gender aging interval. First step is that
I might need to put each person into different bins depending on their age
"""
'\nI want to make a pyramid population catagorized by gender aging interval. First step is that \nI might need to put each person into different bins depending on their age\n'
# Assign every person to one of 5 age bins, stored as a label in 'Age_Interval'.
# Each entry is (lower bound, upper bound, which bounds are inclusive, label).
age_bins = [
    (1, 20, 'both', '1-20'),
    (20, 40, 'right', '21-40'),
    (40, 60, 'right', '41-60'),
    (60, 80, 'right', '61-80'),
    (80, 100, 'right', '81-100'),
]
for lower, upper, closed, label in age_bins:
    in_bin = df['Age'].between(lower, upper, closed)
    df.loc[in_bin, 'Age_Interval'] = label
#Credit: https://medium.com/towards-data-science/how-to-bin-numerical-data-with-pandas-fe5146c9dc55
df
| | Age | Sex | Age_Interval |
|---|---|---|---|
| 0 | 8 | Female | 1-20 |
| 1 | 4 | Female | 1-20 |
| 2 | 9 | Male | 1-20 |
| 3 | 1 | Female | 1-20 |
| 4 | 6 | Male | 1-20 |
| ... | ... | ... | ... |
| 11131 | 67 | Male | 61-80 |
| 11132 | 73 | Female | 61-80 |
| 11133 | 75 | Female | 61-80 |
| 11134 | 87 | Male | 81-100 |
| 11135 | 63 | Female | 61-80 |
11136 rows × 3 columns
# Count the people in every (Age_Interval, Sex) group.
# The first attribute repeats across rows: this is long format, so it needs to
# be converted to wide format. For analysis we mostly want wide format, but
# plotting with R or some analytical tools might require long format.
df1 = (
    df.groupby(['Age_Interval', 'Sex'])[['Age']]
    .count()
    .reset_index()
    .rename(columns={'Age': 'Number_of_Pop'})
)
df1
| | Age_Interval | Sex | Number_of_Pop |
|---|---|---|---|
| 0 | 1-20 | Female | 1812 |
| 1 | 1-20 | Male | 1775 |
| 2 | 21-40 | Female | 1295 |
| 3 | 21-40 | Male | 1379 |
| 4 | 41-60 | Female | 1314 |
| 5 | 41-60 | Male | 1334 |
| 6 | 61-80 | Female | 890 |
| 7 | 61-80 | Male | 886 |
| 8 | 81-100 | Female | 213 |
| 9 | 81-100 | Male | 238 |
# Reshape from long to wide: one row per Age_Interval, one column per Sex,
# with Number_of_Pop as the cell values.
df2 = df1.pivot(index='Age_Interval', columns='Sex', values='Number_of_Pop')
#Credit: https://towardsdatascience.com/reshaping-a-pandas-dataframe-long-to-wide-and-vice-versa-517c7f0995ad
df2
| Sex | Female | Male |
|---|---|---|
| Age_Interval | | |
| 1-20 | 1812 | 1775 |
| 21-40 | 1295 | 1379 |
| 41-60 | 1314 | 1334 |
| 61-80 | 890 | 886 |
| 81-100 | 213 | 238 |
# Prepare the bar values, layout and traces for the population pyramid.
df2['Female'].dtype  # recorded notebook output: dtype('int64')
# Negate the female counts so their bars extend to the left of zero.
women_bins = -np.array(df2['Female'])
len(women_bins)  # recorded notebook output: 5
men_bins = np.array(df2['Male'])
df3 = df2.reset_index()  # turn Age_Interval from the index back into a column
y = list(df3['Age_Interval'])  # age-interval labels, used as the y-axis categories

layout = go.Layout(
    yaxis=go.layout.YAxis(title='Age'),
    xaxis=go.layout.XAxis(
        range=[-2200, 2200],
        tickvals=[-2000, -1500, -1000, -500, 0, 500, 1000, 1500, 2000],
        # tick labels are positive on both sides of zero
        ticktext=[2000, 1500, 1000, 500, 0, 500, 1000, 1500, 2000],
        title='Number_of_Population'),
    barmode='overlay',
    bargap=0.1)

data = [
    go.Bar(
        y=y,
        x=men_bins,
        orientation='h',
        name='Men',
        hoverinfo='x',
        marker=dict(color='powderblue'),
    ),
    go.Bar(
        y=y,
        x=women_bins,
        orientation='h',
        name='Women',
        # hover text shows the positive count even though the bar value is negative
        text=-1 * women_bins.astype('int'),
        hoverinfo='text',
        marker=dict(color='seagreen'),
    ),
]
py.iplot(dict(data=data, layout=layout), filename='EXAMPLES/bar_pyramid')
The pyramid graph above gives an overall sense of the population grouped by age interval and sex. We can see that the majority of the population is in the range of 1-20 years old.
Measures of Central Tendency and Dispersion of Data
Now I want to explore the central tendency of the population's age, so I go back to working with the data as it was before being categorized into bins. To build a better understanding, I'll show some of the calculations behind the descriptive statistics before using functions to get those results.
pop  # age of individuals in the population; recorded notebook output: array([ 8, 4, 9, ..., 75, 87, 63])
pop.shape  # number of individuals; recorded notebook output: (11136,)
# Arithmetic mean computed by hand (total / count), rounded to a whole year
mean_pop = round(pop.sum() / len(pop))
mean_pop  # recorded notebook output: 37
# Average age of population is 37 years old
np.median(pop)  # recorded notebook output: 37.0
# Median of pop is 37
# Function that calculates the mode(s) of a sequence of values.
def mode(lst):
    """Return every value that occurs most often in ``lst``.

    Args:
        lst: An iterable of hashable values, e.g. a list or a 1-D NumPy array.

    Returns:
        A list with the value(s) tied for the highest frequency, in order of
        first appearance. An empty input yields an empty list.
    """
    from collections import Counter  # local import keeps the block self-contained

    # map each distinct value to the number of times it occurs
    freq = Counter(lst)
    if not freq:
        # max() would raise ValueError on an empty sequence
        return []
    # highest frequency found among all values
    hf = max(freq.values())
    # keep every value that reaches the highest frequency
    return [value for value, count in freq.items() if count == hf]
# Call mode() with the population array and print the most frequent age(s)
print(mode(pop))  # recorded notebook output: [1]
# Credit: https://www.geeksforgeeks.org/how-to-calculate-the-mode-of-numpy-array/
pop  # recorded notebook output: array([ 8, 4, 9, ..., 75, 87, 63])
# Observe the dispersion of the data through the deviations from the mean:
# subtract the mean from each element.
# NOTE: mean_pop was rounded to a whole number when it was computed, so these
# deviations are integers and their mean is close to, but not exactly, zero.
dev = pop - mean_pop  # deviation of each age from the (rounded) mean
dev  # recorded notebook output: array([-29, -33, -28, ..., 38, 50, 26])
# deviations from the exact mean would average to zero
np.mean(dev)  # recorded notebook output: -0.20510057471264367
# absolute deviation from the mean
dev = abs(dev)
dev  # recorded notebook output: array([29, 33, 28, ..., 38, 50, 26])
# Mean Absolute Deviation or 'MAD'
np.mean(dev)
print("Mean Absolute deviation is % s "
      % (np.mean(dev)))  # recorded notebook output: Mean Absolute deviation is 20.97162356321839
import statistics
# statistics.stdev computes the sample standard deviation of the ages
pop_list = pop.tolist()
print("Standard Deviation of sample is % s "
      % (statistics.stdev(pop_list)))  # recorded notebook output: Standard Deviation of sample is 25.072255798045703
Mean absolute deviation (MAD) is a measure of the average absolute distance between each data value and the mean of a data set. Similar to standard deviation, MAD is a parameter or statistic that measures the spread, or variation, in your data.
Although both MAD and SD measure the spread of data, SD is usually bigger than MAD because SD is more sensitive to values that are farther away from the mean. For more detail on MAD and SD: https://articles.outlier.org/mean-absolute-deviation-meaning
Describing Dispersion
#Using describe to see basic descriptive measurements
df['Age'].describe()
count 11136.000000
mean 36.794899
std 25.072256
min 1.000000
25% 11.000000
50% 37.000000
75% 55.000000
max 100.000000
Name: Age, dtype: float64
# Range: the difference between the largest and the smallest age.
# Stored as age_range so the built-in range() is not shadowed.
age_range = max(df['Age']) - min(df['Age'])
print('Range is %s'
      % age_range)  # recorded notebook output: Range is 99
# Interquartile range = Q3 - Q1
q1, q3 = np.percentile(df.Age, [25, 75])
iqr = q3 - q1
print(iqr)  # recorded notebook output: 44.0
Detecting Outliers
The definition of an outlier here is any data point which lies beyond the lower limit (Q1 - 1.5 IQR) or the upper limit (Q3 + 1.5 IQR).
# find the lower limit and the upper limit (1.5 * IQR beyond the quartiles)
lower_limit, upper_limit = q1 - (1.5 * iqr), q3 + (1.5 * iqr)
print(lower_limit, upper_limit)  # recorded notebook output: -55.0 121.0
# so anyone in our population older than the upper limit (121) would be
# considered an outlier
# Draw a random sample of 2,500 ages (without replacement) and look at its
# statistical measurements
age_list = df.Age.tolist()
sample = random.sample(age_list, 2500)
sample = pd.DataFrame(sample)
sample.describe()
| | 0 |
|---|---|
| count | 2500.000000 |
| mean | 36.736800 |
| std | 24.606489 |
| min | 1.000000 |
| 25% | 11.000000 |
| 50% | 37.000000 |
| 75% | 54.000000 |
| max | 94.000000 |
df['Age'].describe()
count 11136.000000
mean 36.794899
std 25.072256
min 1.000000
25% 11.000000
50% 37.000000
75% 55.000000
max 100.000000
Name: Age, dtype: float64
# the statistical measurements of the sample set and the population are quite similar
# the sample set represents the population well
df
| | Age | Sex | Age_Interval |
|---|---|---|---|
| 0 | 8 | Female | 1-20 |
| 1 | 4 | Female | 1-20 |
| 2 | 9 | Male | 1-20 |
| 3 | 1 | Female | 1-20 |
| 4 | 6 | Male | 1-20 |
| ... | ... | ... | ... |
| 11131 | 67 | Male | 61-80 |
| 11132 | 73 | Female | 61-80 |
| 11133 | 75 | Female | 61-80 |
| 11134 | 87 | Male | 81-100 |
| 11135 | 63 | Female | 61-80 |
11136 rows × 3 columns
# Box plot of the population's age distribution, categorized by sex
df_box = df
fig = px.box(data_frame=df_box, x="Sex", y="Age")
fig.show()
# from our artificial data here, I'd try adding a 'Country' attribute and making a more dynamic visualization
df
| | Age | Sex | Age_Interval |
|---|---|---|---|
| 0 | 8 | Female | 1-20 |
| 1 | 4 | Female | 1-20 |
| 2 | 9 | Male | 1-20 |
| 3 | 1 | Female | 1-20 |
| 4 | 6 | Male | 1-20 |
| ... | ... | ... | ... |
| 11131 | 67 | Male | 61-80 |
| 11132 | 73 | Female | 61-80 |
| 11133 | 75 | Female | 61-80 |
| 11134 | 87 | Male | 81-100 |
| 11135 | 63 | Female | 61-80 |
11136 rows × 3 columns
# Candidate countries for the artificial 'Country' attribute
country = ['Thailand', 'Taiwan', 'Japan', 'Germany']
df['Country'] = random.choice(country)  # fill the new column with a single value first
df['Country'] = [ random.choice(country) for i in df['Country'] ]
df
| | Age | Sex | Age_Interval | Country |
|---|---|---|---|---|
| 0 | 8 | Female | 1-20 | Germany |
| 1 | 4 | Female | 1-20 | Japan |
| 2 | 9 | Male | 1-20 | Japan |
| 3 | 1 | Female | 1-20 | Germany |
| 4 | 6 | Male | 1-20 | Japan |
| ... | ... | ... | ... | ... |
| 11131 | 67 | Male | 61-80 | Taiwan |
| 11132 | 73 | Female | 61-80 | Japan |
| 11133 | 75 | Female | 61-80 | Japan |
| 11134 | 87 | Male | 81-100 | Thailand |
| 11135 | 63 | Female | 61-80 | Thailand |
11136 rows × 4 columns
from dash import Dash, dcc, html, Input, Output
from jupyter_dash import JupyterDash
# Build the Dash app: a heading, a checklist that selects the x-axis grouping
# column(s), a radio group that selects the y-axis column, and the graph that
# the callback fills in.
app = JupyterDash(__name__)
app.layout = html.Div([
    html.H4("Analysis of Age Distribution in Population"),
    html.P("x-axis:"),
    dcc.Checklist(
        id='x-axis',
        options=['Country', 'Sex'],
        inline=True,
    ),
    html.P("y-axis:"),
    dcc.RadioItems(
        id='y-axis',
        # Without `options` the radio group has no entries to display;
        # 'Age' is the only numeric column of df, so it is the only choice.
        options=['Age'],
        value='Age',
        inline=True,
    ),
    dcc.Graph(id="graph"),
])
@app.callback(
    Output("graph", "figure"),
    Input("x-axis", "value"),
    Input("y-axis", "value"))
def generate_chart(x, y):
    """Build the box plot shown in the ``graph`` component.

    Args:
        x: Current value of the ``x-axis`` checklist, used as the x grouping.
        y: Current value of the ``y-axis`` radio group, used as the y column.

    Returns:
        The Plotly Express box-plot figure drawn from ``df``.
    """
    # The unused local alias `df_box` was removed; the figure is built from df.
    return px.box(df, x=x, y=y)
# Start the Dash server only when this file is executed as a script
if __name__ == '__main__':
    app.run_server(mode="inline")