import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
np.set_printoptions(suppress=True)
# Step-1: Copy and paste the path to that file below within quotes to variable 'path'
#path = '-add your path here-'
adult = pd.read_csv(path + 'adult.data', sep=',', header=None, engine='python')
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
'occupation','relationship','race','gender', 'capital-gain',
'capital-loss','hours-per-week','native-country','income-cat']
adult.columns = cols
# Step-4: Look at the first five records using .head()
print(adult.shape)
adult.head(2)
(32561, 15)
| age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | gender | capital-gain | capital-loss | hours-per-week | native-country | income-cat | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
# use dataframe 'adult' from Q#76
# 0- Print the number of rows and columns in the dataframe
# 1- Print type of data in each column
# 2- Print the number of missing values in each column
# 3- Check to see if any column has the same value in all rows. Drop column if it has the same value in all the rows
# Hint: .unique()
# Hint: use for loop
# 4- Print shape of sea
# ---
# 0- Print the number of rows and columns in the dataframe
print("Number of rows:", adult.shape[0])
print('Number of columns:', adult.shape[1],'\n')
# 1- Print type of data in each column
print('Data types:\n',adult.dtypes, '\n')
# 2- Print the number of missing values in each column
print('Missing values:\n', adult.isna().sum())
# 3- Check to see if any column has the same value in all rows. Hint: .unique()
for col in adult.columns:
num = adult[col].nunique()
if (num == 1):
adult = adult.drop(col, axis=1)
print(col + ' was dropped!')
# 4- Print shape of adult
print('\nShape:\n', adult.shape)
Number of rows: 32561 Number of columns: 15 Data types: age int64 workclass object fnlwgt int64 education object education-num int64 marital-status object occupation object relationship object race object gender object capital-gain int64 capital-loss int64 hours-per-week int64 native-country object income-cat object dtype: object Missing values: age 0 workclass 0 fnlwgt 0 education 0 education-num 0 marital-status 0 occupation 0 relationship 0 race 0 gender 0 capital-gain 0 capital-loss 0 hours-per-week 0 native-country 0 income-cat 0 dtype: int64 Shape: (32561, 15)
# 1-
# clean column names: remove spaces, replace hyphes with underscores
# 2-
# Strip leading and trailing spaces for values in column 'race'
# 3-
# Strip leading and trailing spaces for values in column 'gender'
# 4-
# Strip leading and trailing spaces for values in column 'income_cat'
# 5-
# Strip leading and trailing spaces for values in column 'education'
# 6-
# Strip leading and trailing spaces for values in column 'workclass'
# 7-
# Strip leading and trailing spaces for values in column 'marital_status'
# ---
# 1-
cols = [col.strip().replace("-", "_") for col in adult.columns]
adult.columns = cols
adult.head(2)
# 2-
adult['race'] = adult['race'].apply(lambda x: x.strip())
# 3-
adult['gender'] = adult['gender'].apply(lambda x: x.strip())
# 4-
adult['income_cat'] = adult['income_cat'].apply(lambda x: x.strip())
# 5-
adult['education'] = adult['education'].apply(lambda x: x.strip())
# 6-
adult['workclass'] = adult['workclass'].apply(lambda x: x.strip())
# 7-
adult['marital_status'] = adult['marital_status'].apply(lambda x: x.strip())
# Histogram plot of distribution of variables below and comment.
# 1- age
# 2- education_num
# 3- capital_gain
# 4- hours_per_week
# Bar plot of proportion of categories for variables below and comment:
# Also save df for each below as given
# 5- workclass --> df_workclass
# 6- education --> df_education
# 7- marital_status --> df_marital_status
# 8- occupation --> df_occupation
# 9- relatioship --> df_relatioship
# 10- race --> df_race
# 11- gender --> df_gender
# 12- native_country --> df_native_country
# 13- income_cat --> df_income_cat
# ---
# Function to plot histogram
def plot_hist(df, var):
plt.hist(df[var])
plt.xlabel(var)
plt.ylabel('count')
plt.show()
def plot_proportions(df, var):
# get proportions
df = df.groupby([var])['age'].count().reset_index().rename(columns={'age':'count'}).sort_values(by='count', ascending=True)
df['percent'] = (100.0 * df['count']/(df['count'].sum())).round(2)
# plot
plt.barh(df[var], df['percent'])
plt.xlabel('Percent')
plt.ylabel(var)
plt.show()
return df
# 1- age
plot_hist(adult, 'age')
# 2- education_num
plot_hist(adult, 'education_num')
# 3- capital_gain
plot_hist(adult, 'capital_gain')
# 4- hours_per_week
plot_hist(adult, 'hours_per_week')
# 5- workclass
df_workclass = plot_proportions(adult, 'workclass')
# 6- education
df_education = plot_proportions(adult, 'education')
# 7- marital_status
df_marital_status = plot_proportions(adult, 'marital_status')
# 8- occupation
df_occupation = plot_proportions(adult, 'occupation')
# 9- relationship
df_relationship = plot_proportions(adult, 'relationship')
# 10- race
df_race = plot_proportions(adult, 'race')
# 11- gender
df_gender = plot_proportions(adult, 'gender')
# 12- native_country
df_native_country = plot_proportions(adult, 'native_country')
# 13- income_cat
df_income_cat = plot_proportions(adult, 'income_cat')
# Lets find out income differences!
# 1- between race --> White and Black
# 2- between gender --> Male and Female
# In each case specify which one is higher and by what percentage?
# ---
# 1-
df = adult[adult['race'].isin(['White', 'Black'])]
df = df.groupby(['race', 'income_cat'])['age'].count().reset_index().rename(columns={'age':'count'})
df['percent'] = (100.0 * df['count']/(df['count'].sum())).round(2)
# 2-
df2 = adult[adult['gender'].isin(['Male', 'Female'])]
df2 = df2.groupby(['gender', 'income_cat'])['age'].count().reset_index().rename(columns={'age':'count'})
df2['percent'] = (100.0 * df2['count']/(df2['count'].sum())).round(2)
df2
| gender | income_cat | count | percent | |
|---|---|---|---|---|
| 0 | Female | <=50K | 9592 | 29.46 |
| 1 | Female | >50K | 1179 | 3.62 |
| 2 | Male | <=50K | 15128 | 46.46 |
| 3 | Male | >50K | 6662 | 20.46 |
# 1- Does higher education help you earn more?
# 2- Does higher education and being a man help make more money?
# can above questions be answered? If no, why not? If yes, write code below.
# ---
# 1-
df_total = adult.groupby(['education'])['age'].count().reset_index().rename(columns={'age':'total'})
df_inc_cat = adult.groupby(['education', 'income_cat'])['age'].count().reset_index().rename(columns={'age':'count'})
df = pd.merge(df_inc_cat, df_total, on='education', how='inner')
df['%_of_total'] = round(100.0*df['count']/df['total'])
# 1-
df_total = adult.groupby(['gender'])['age'].count().reset_index().rename(columns={'age':'total'})
df_inc_cat = adult.groupby(['gender', 'income_cat'])['age'].count().reset_index().rename(columns={'age':'count'})
df = pd.merge(df_inc_cat, df_total, on='gender', how='inner')
df['%_of_total'] = round(100.0*df['count']/df['total'])
df
| gender | income_cat | count | total | %_of_total | |
|---|---|---|---|---|---|
| 0 | Female | <=50K | 9592 | 10771 | 89.0 |
| 1 | Female | >50K | 1179 | 10771 | 11.0 |
| 2 | Male | <=50K | 15128 | 21790 | 69.0 |
| 3 | Male | >50K | 6662 | 21790 | 31.0 |
# What is a predominant race that is employed in government jobs such as:
# 'State-gov', 'Federal-gov', 'Local-gov' ?
# can above questions be answered? If no, why not? If yes, write code below.
# ---
df = adult[adult['workclass'].isin(['State-gov', 'Federal-gov', 'Local-gov'])][['race', 'gender', 'income_cat']]
df = df.groupby(['race', 'gender'])['income_cat'].count().reset_index()
df
| race | gender | income_cat | |
|---|---|---|---|
| 0 | Amer-Indian-Eskimo | Female | 36 |
| 1 | Amer-Indian-Eskimo | Male | 34 |
| 2 | Asian-Pac-Islander | Female | 44 |
| 3 | Asian-Pac-Islander | Male | 97 |
| 4 | Black | Female | 328 |
| 5 | Black | Male | 288 |
| 6 | Other | Female | 7 |
| 7 | Other | Male | 14 |
| 8 | White | Female | 1224 |
| 9 | White | Male | 2279 |
# What percentage of self employeed adults earn more than 50K?
# 'Self-emp-inc', 'Self-emp-not-inc'
# ---
df = adult[adult['workclass'].isin(['Self-emp-inc', 'Self-emp-not-inc'])]
df_total = df.groupby(['workclass'])['age'].count().reset_index().rename(columns={'age':'total'})
df_inc_cat = df.groupby(['workclass', 'income_cat'])['age'].count().reset_index().rename(columns={'age':'count'})
df = pd.merge(df_inc_cat, df_total, on='workclass', how='inner')
df['%_of_total'] = round(100.0*df['count']/df['total'])
df
| workclass | income_cat | count | total | %_of_total | |
|---|---|---|---|---|---|
| 0 | Self-emp-inc | <=50K | 494 | 1116 | 44.0 |
| 1 | Self-emp-inc | >50K | 622 | 1116 | 56.0 |
| 2 | Self-emp-not-inc | <=50K | 1817 | 2541 | 72.0 |
| 3 | Self-emp-not-inc | >50K | 724 | 2541 | 28.0 |
# 1- Calculate percentage of divorced men vs. women?
# 2- What fraction for each earns >50K
# ---
# 1-
df = adult[adult['marital_status'].isin(['Divorced'])]
df = df.groupby(['gender'])['age'].count().reset_index().rename(columns={'age':'count'})
df['percent'] = (100.0 * df['count']/(df['count'].sum())).round(2)
# 2-
df = adult[adult['marital_status'].isin(['Divorced'])]
df_total = df.groupby(['gender'])['age'].count().reset_index().rename(columns={'age':'total'})
df_inc_cat = df.groupby(['gender', 'income_cat'])['age'].count().reset_index().rename(columns={'age':'count'})
df = pd.merge(df_inc_cat, df_total, on='gender', how='inner')
df['%_of_total'] = round(100.0*df['count']/df['total'])
df
| gender | income_cat | count | total | %_of_total | |
|---|---|---|---|---|---|
| 0 | Female | <=50K | 2493 | 2672 | 93.0 |
| 1 | Female | >50K | 179 | 2672 | 7.0 |
| 2 | Male | <=50K | 1487 | 1771 | 84.0 |
| 3 | Male | >50K | 284 | 1771 | 16.0 |
# Plot average hours_per_week with error bars that men vs. women work?
# ---
df1 = adult.groupby(['gender'])['hours_per_week'].mean().reset_index().rename(columns={'hours_per_week':'mean'})
df2 = adult.groupby(['gender'])['hours_per_week'].std().reset_index().rename(columns={'hours_per_week':'sd'})
df = pd.merge(df1, df2, on='gender', how='inner')
df
plt.bar(df['gender'], df['mean'])
plt.errorbar(df['gender'], df['mean'],
yerr=df['sd'],
linestyle='', capsize=5, ecolor='red')
plt.ylabel('Mean hours per week')
plt.show()
# Plot average hours_per_week with error bars in Private vs. Goverment (i.e. 'State-gov', 'Federal-gov', 'Local-gov' combined)?
# ---
df = adult[adult['workclass'].isin(['Private', 'State-gov', 'Federal-gov', 'Local-gov'])][['workclass', 'hours_per_week']]
df['workclass'] = df['workclass'].apply(lambda x: 'Government' if (x=='State-gov')|(x=='Federal-gov')|(x=='Local-gov') else
x
)
df1 = df.groupby(['workclass'])['hours_per_week'].mean().reset_index().rename(columns={'hours_per_week':'mean'})
df2 = df.groupby(['workclass'])['hours_per_week'].std().reset_index().rename(columns={'hours_per_week':'sd'})
df = pd.merge(df1, df2, on='workclass', how='inner')
plt.bar(df['workclass'], df['mean'])
plt.errorbar(df['workclass'], df['mean'],
yerr=df['sd'],
linestyle='', capsize=5, ecolor='red')
plt.ylabel('Mean hours per week')
plt.show()