1994 USA Census Data¶

Import libraries¶

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
np.set_printoptions(suppress=True) 

Read data¶

In [4]:
# Step-1: Copy and paste the path to that file below within quotes to variable 'path'
#path = '-add your path here-'
adult = pd.read_csv(path + 'adult.data', sep=',', header=None, engine='python')

cols = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
        'occupation','relationship','race','gender', 'capital-gain',
        'capital-loss','hours-per-week','native-country','income-cat']

adult.columns = cols

# Step-4: Look at the first five records using .head()
print(adult.shape)
adult.head(2)
(32561, 15)
Out[4]:
age workclass fnlwgt education education-num marital-status occupation relationship race gender capital-gain capital-loss hours-per-week native-country income-cat
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K

Data exploration¶

In [5]:
# use dataframe 'adult' from Q#76
# 0- Print the number of rows and columns in the dataframe
# 1- Print type of data in each column
# 2- Print the number of missing values in each column
# 3- Check to see if any column has the same value in all rows. Drop column if it has the same value in all the rows
# Hint: .unique()
# Hint: use for loop 
# 4- Print shape of sea


# ---

# 0- Print the number of rows and columns in the dataframe
print("Number of rows:", adult.shape[0])
print('Number of columns:', adult.shape[1],'\n')

# 1- Print type of data in each column
print('Data types:\n',adult.dtypes, '\n')

# 2- Print the number of missing values in each column
print('Missing values:\n', adult.isna().sum())

# 3- Check to see if any column has the same value in all rows. Hint: .unique()
for col in adult.columns:
    num = adult[col].nunique()
    if (num == 1):
        adult = adult.drop(col, axis=1)
        print(col + ' was dropped!')
        
# 4- Print shape of adult
print('\nShape:\n', adult.shape)
Number of rows: 32561
Number of columns: 15 

Data types:
 age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
gender            object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income-cat        object
dtype: object 

Missing values:
 age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
gender            0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income-cat        0
dtype: int64

Shape:
 (32561, 15)
In [6]:
# 1-
# clean column names: remove spaces, replace hyphes with underscores 

# 2-
# Strip leading and trailing spaces for values in column 'race'

# 3-
# Strip leading and trailing spaces for values in column 'gender'


# 4-
# Strip leading and trailing spaces for values in column 'income_cat'

# 5-
# Strip leading and trailing spaces for values in column 'education'


# 6-
# Strip leading and trailing spaces for values in column 'workclass'

# 7-
# Strip leading and trailing spaces for values in column 'marital_status'



# --- 

# 1-
cols = [col.strip().replace("-", "_") for col in adult.columns]
adult.columns = cols

adult.head(2)

# 2-
adult['race'] = adult['race'].apply(lambda x: x.strip())

# 3-
adult['gender'] = adult['gender'].apply(lambda x: x.strip())

# 4-
adult['income_cat'] = adult['income_cat'].apply(lambda x: x.strip())

# 5-
adult['education'] = adult['education'].apply(lambda x: x.strip())

# 6-
adult['workclass'] = adult['workclass'].apply(lambda x: x.strip())

# 7-
adult['marital_status'] = adult['marital_status'].apply(lambda x: x.strip())
In [7]:
# Histogram plot of distribution of variables below and comment.
# 1- age
# 2- education_num
# 3- capital_gain
# 4- hours_per_week

# Bar plot of proportion of categories for variables below and comment:
# Also save df for each below as given
# 5- workclass --> df_workclass
# 6- education --> df_education
# 7- marital_status --> df_marital_status
# 8- occupation --> df_occupation
# 9- relatioship --> df_relatioship
# 10- race --> df_race
# 11- gender --> df_gender
# 12- native_country --> df_native_country
# 13- income_cat --> df_income_cat


# --- 

# Function to plot histogram
def plot_hist(df, var):
    plt.hist(df[var])
    plt.xlabel(var)
    plt.ylabel('count')
    plt.show()
    
    
def plot_proportions(df, var):
    # get proportions
    df = df.groupby([var])['age'].count().reset_index().rename(columns={'age':'count'}).sort_values(by='count', ascending=True)
    df['percent'] = (100.0 * df['count']/(df['count'].sum())).round(2)

    # plot
    plt.barh(df[var], df['percent'])
    plt.xlabel('Percent')
    plt.ylabel(var)
    plt.show()
    
    return df
    
    
# 1- age
plot_hist(adult, 'age')

# 2- education_num
plot_hist(adult, 'education_num')

# 3- capital_gain
plot_hist(adult, 'capital_gain')

# 4- hours_per_week
plot_hist(adult, 'hours_per_week')



# 5- workclass
df_workclass = plot_proportions(adult, 'workclass') 

# 6- education
df_education = plot_proportions(adult, 'education') 

# 7- marital_status
df_marital_status = plot_proportions(adult, 'marital_status') 

# 8- occupation
df_occupation = plot_proportions(adult, 'occupation') 

# 9- relationship
df_relationship = plot_proportions(adult, 'relationship') 

# 10- race
df_race = plot_proportions(adult, 'race') 

# 11- gender
df_gender = plot_proportions(adult, 'gender') 

# 12- native_country
df_native_country = plot_proportions(adult, 'native_country') 

# 13- income_cat
df_income_cat = plot_proportions(adult, 'income_cat') 

Data Analysis¶

In [8]:
# Lets find out income differences!
# 1- between race --> White and Black
# 2- between gender --> Male and Female

# In each case specify which one is higher and by what percentage?

# --- 

# 1-
df = adult[adult['race'].isin(['White', 'Black'])]
df = df.groupby(['race', 'income_cat'])['age'].count().reset_index().rename(columns={'age':'count'})
df['percent'] = (100.0 * df['count']/(df['count'].sum())).round(2)


# 2-
df2 = adult[adult['gender'].isin(['Male', 'Female'])]
df2 = df2.groupby(['gender', 'income_cat'])['age'].count().reset_index().rename(columns={'age':'count'})
df2['percent'] = (100.0 * df2['count']/(df2['count'].sum())).round(2)
df2
Out[8]:
gender income_cat count percent
0 Female <=50K 9592 29.46
1 Female >50K 1179 3.62
2 Male <=50K 15128 46.46
3 Male >50K 6662 20.46
In [9]:
# 1- Does higher education help you earn more?
# 2- Does higher education and being a man help make more money?

# can above questions be answered? If no, why not? If yes, write code below.

# --- 

# 1-
df_total = adult.groupby(['education'])['age'].count().reset_index().rename(columns={'age':'total'})
df_inc_cat = adult.groupby(['education', 'income_cat'])['age'].count().reset_index().rename(columns={'age':'count'})
df = pd.merge(df_inc_cat, df_total, on='education', how='inner')
df['%_of_total'] = round(100.0*df['count']/df['total'])



# 1-
df_total = adult.groupby(['gender'])['age'].count().reset_index().rename(columns={'age':'total'})
df_inc_cat = adult.groupby(['gender', 'income_cat'])['age'].count().reset_index().rename(columns={'age':'count'})
df = pd.merge(df_inc_cat, df_total, on='gender', how='inner')
df['%_of_total'] = round(100.0*df['count']/df['total'])
df
Out[9]:
gender income_cat count total %_of_total
0 Female <=50K 9592 10771 89.0
1 Female >50K 1179 10771 11.0
2 Male <=50K 15128 21790 69.0
3 Male >50K 6662 21790 31.0
In [10]:
# What is a predominant race that is employed in government jobs such as:
# 'State-gov', 'Federal-gov', 'Local-gov' ?

# can above questions be answered? If no, why not? If yes, write code below.

# --- 

df = adult[adult['workclass'].isin(['State-gov', 'Federal-gov', 'Local-gov'])][['race', 'gender', 'income_cat']]
df = df.groupby(['race', 'gender'])['income_cat'].count().reset_index()
df
Out[10]:
race gender income_cat
0 Amer-Indian-Eskimo Female 36
1 Amer-Indian-Eskimo Male 34
2 Asian-Pac-Islander Female 44
3 Asian-Pac-Islander Male 97
4 Black Female 328
5 Black Male 288
6 Other Female 7
7 Other Male 14
8 White Female 1224
9 White Male 2279
In [11]:
# What percentage of self employeed adults earn more than 50K?
# 'Self-emp-inc',  'Self-emp-not-inc'

# --- 

df = adult[adult['workclass'].isin(['Self-emp-inc',  'Self-emp-not-inc'])]
df_total = df.groupby(['workclass'])['age'].count().reset_index().rename(columns={'age':'total'})
df_inc_cat = df.groupby(['workclass', 'income_cat'])['age'].count().reset_index().rename(columns={'age':'count'})
df = pd.merge(df_inc_cat, df_total, on='workclass', how='inner')
df['%_of_total'] = round(100.0*df['count']/df['total'])
df
Out[11]:
workclass income_cat count total %_of_total
0 Self-emp-inc <=50K 494 1116 44.0
1 Self-emp-inc >50K 622 1116 56.0
2 Self-emp-not-inc <=50K 1817 2541 72.0
3 Self-emp-not-inc >50K 724 2541 28.0
In [12]:
# 1- Calculate percentage of divorced men vs. women?
# 2- What fraction for each earns >50K

# --- 

# 1-
df = adult[adult['marital_status'].isin(['Divorced'])]
df = df.groupby(['gender'])['age'].count().reset_index().rename(columns={'age':'count'})
df['percent'] = (100.0 * df['count']/(df['count'].sum())).round(2)



# 2-
df = adult[adult['marital_status'].isin(['Divorced'])]
df_total = df.groupby(['gender'])['age'].count().reset_index().rename(columns={'age':'total'})
df_inc_cat = df.groupby(['gender', 'income_cat'])['age'].count().reset_index().rename(columns={'age':'count'})
df = pd.merge(df_inc_cat, df_total, on='gender', how='inner')
df['%_of_total'] = round(100.0*df['count']/df['total'])
df
Out[12]:
gender income_cat count total %_of_total
0 Female <=50K 2493 2672 93.0
1 Female >50K 179 2672 7.0
2 Male <=50K 1487 1771 84.0
3 Male >50K 284 1771 16.0
In [685]:
# Plot average hours_per_week with error bars that men vs. women work?

# --- 

df1 = adult.groupby(['gender'])['hours_per_week'].mean().reset_index().rename(columns={'hours_per_week':'mean'})
df2 = adult.groupby(['gender'])['hours_per_week'].std().reset_index().rename(columns={'hours_per_week':'sd'})
df = pd.merge(df1, df2, on='gender', how='inner')
df

plt.bar(df['gender'], df['mean'])
plt.errorbar(df['gender'], df['mean'], 
                 yerr=df['sd'], 
                 linestyle='', capsize=5, ecolor='red')
plt.ylabel('Mean hours per week')
plt.show()
In [691]:
# Plot average hours_per_week with error bars in Private vs. Goverment (i.e. 'State-gov', 'Federal-gov', 'Local-gov' combined)?

# --- 

df = adult[adult['workclass'].isin(['Private', 'State-gov', 'Federal-gov', 'Local-gov'])][['workclass', 'hours_per_week']]
df['workclass'] = df['workclass'].apply(lambda x: 'Government' if (x=='State-gov')|(x=='Federal-gov')|(x=='Local-gov') else
                                           x
                                       )
df1 = df.groupby(['workclass'])['hours_per_week'].mean().reset_index().rename(columns={'hours_per_week':'mean'})
df2 = df.groupby(['workclass'])['hours_per_week'].std().reset_index().rename(columns={'hours_per_week':'sd'})
df = pd.merge(df1, df2, on='workclass', how='inner')


plt.bar(df['workclass'], df['mean'])
plt.errorbar(df['workclass'], df['mean'], 
                 yerr=df['sd'], 
                 linestyle='', capsize=5, ecolor='red')
plt.ylabel('Mean hours per week')
plt.show()
In [ ]:
 
In [ ]:
 
In [ ]: