import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
np.set_printoptions(suppress=True)


# Step-1: Copy and paste the path to that file below within quotes to variable 'path'
#path = '-add your path here-'
adult = pd.read_csv(path + 'adult.data', sep=',', header=None, engine='python')

cols = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
        'occupation','relationship','race','gender', 'capital-gain',
        'capital-loss','hours-per-week','native-country','income-cat']

adult.columns = cols

# Step-4: Look at the first five records using .head()
print(adult.shape)
adult.head(2)

(32561, 15)


# use dataframe 'adult' from Q#76
# 0- Print the number of rows and columns in the dataframe
# 1- Print type of data in each column
# 2- Print the number of missing values in each column
# 3- Check to see if any column has the same value in all rows. Drop column if it has the same value in all the rows
# Hint: .unique()
# Hint: use for loop 
# 4- Print shape of sea


# ---

# 0- Print the number of rows and columns in the dataframe
print("Number of rows:", adult.shape[0])
print('Number of columns:', adult.shape[1],'\n')

# 1- Print type of data in each column
print('Data types:\n',adult.dtypes, '\n')

# 2- Print the number of missing values in each column
print('Missing values:\n', adult.isna().sum())

# 3- Check to see if any column has the same value in all rows. Hint: .unique()
for col in adult.columns:
    num = adult[col].nunique()
    if (num == 1):
        adult = adult.drop(col, axis=1)
        print(col + ' was dropped!')
        
# 4- Print shape of adult
print('\nShape:\n', adult.shape)

Number of rows: 32561
Number of columns: 15 

Data types:
 age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
gender            object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income-cat        object
dtype: object 

Missing values:
 age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
gender            0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income-cat        0
dtype: int64

Shape:
 (32561, 15)


# 1-
# clean column names: remove spaces, replace hyphes with underscores 

# 2-
# Strip leading and trailing spaces for values in column 'race'

# 3-
# Strip leading and trailing spaces for values in column 'gender'


# 4-
# Strip leading and trailing spaces for values in column 'income_cat'

# 5-
# Strip leading and trailing spaces for values in column 'education'


# 6-
# Strip leading and trailing spaces for values in column 'workclass'

# 7-
# Strip leading and trailing spaces for values in column 'marital_status'



# --- 

# 1-
cols = [col.strip().replace("-", "_") for col in adult.columns]
adult.columns = cols

adult.head(2)

# 2-
adult['race'] = adult['race'].apply(lambda x: x.strip())

# 3-
adult['gender'] = adult['gender'].apply(lambda x: x.strip())

# 4-
adult['income_cat'] = adult['income_cat'].apply(lambda x: x.strip())

# 5-
adult['education'] = adult['education'].apply(lambda x: x.strip())

# 6-
adult['workclass'] = adult['workclass'].apply(lambda x: x.strip())

# 7-
adult['marital_status'] = adult['marital_status'].apply(lambda x: x.strip())


# Histogram plot of distribution of variables below and comment.
# 1- age
# 2- education_num
# 3- capital_gain
# 4- hours_per_week

# Bar plot of proportion of categories for variables below and comment:
# Also save df for each below as given
# 5- workclass --> df_workclass
# 6- education --> df_education
# 7- marital_status --> df_marital_status
# 8- occupation --> df_occupation
# 9- relatioship --> df_relatioship
# 10- race --> df_race
# 11- gender --> df_gender
# 12- native_country --> df_native_country
# 13- income_cat --> df_income_cat


# --- 

# Function to plot histogram
def plot_hist(df, var):
    plt.hist(df[var])
    plt.xlabel(var)
    plt.ylabel('count')
    plt.show()
    
    
def plot_proportions(df, var):
    # get proportions
    df = df.groupby([var])['age'].count().reset_index().rename(columns={'age':'count'}).sort_values(by='count', ascending=True)
    df['percent'] = (100.0 * df['count']/(df['count'].sum())).round(2)

    # plot
    plt.barh(df[var], df['percent'])
    plt.xlabel('Percent')
    plt.ylabel(var)
    plt.show()
    
    return df
    
    
# 1- age
plot_hist(adult, 'age')

# 2- education_num
plot_hist(adult, 'education_num')

# 3- capital_gain
plot_hist(adult, 'capital_gain')

# 4- hours_per_week
plot_hist(adult, 'hours_per_week')



# 5- workclass
df_workclass = plot_proportions(adult, 'workclass') 

# 6- education
df_education = plot_proportions(adult, 'education') 

# 7- marital_status
df_marital_status = plot_proportions(adult, 'marital_status') 

# 8- occupation
df_occupation = plot_proportions(adult, 'occupation') 

# 9- relationship
df_relationship = plot_proportions(adult, 'relationship') 

# 10- race
df_race = plot_proportions(adult, 'race') 

# 11- gender
df_gender = plot_proportions(adult, 'gender') 

# 12- native_country
df_native_country = plot_proportions(adult, 'native_country') 

# 13- income_cat
df_income_cat = plot_proportions(adult, 'income_cat')


# Lets find out income differences!
# 1- between race --> White and Black
# 2- between gender --> Male and Female

# In each case specify which one is higher and by what percentage?

# --- 

# 1-
df = adult[adult['race'].isin(['White', 'Black'])]
df = df.groupby(['race', 'income_cat'])['age'].count().reset_index().rename(columns={'age':'count'})
df['percent'] = (100.0 * df['count']/(df['count'].sum())).round(2)


# 2-
df2 = adult[adult['gender'].isin(['Male', 'Female'])]
df2 = df2.groupby(['gender', 'income_cat'])['age'].count().reset_index().rename(columns={'age':'count'})
df2['percent'] = (100.0 * df2['count']/(df2['count'].sum())).round(2)
df2


# 1- Does higher education help you earn more?
# 2- Does higher education and being a man help make more money?

# can above questions be answered? If no, why not? If yes, write code below.

# --- 

# 1-
df_total = adult.groupby(['education'])['age'].count().reset_index().rename(columns={'age':'total'})
df_inc_cat = adult.groupby(['education', 'income_cat'])['age'].count().reset_index().rename(columns={'age':'count'})
df = pd.merge(df_inc_cat, df_total, on='education', how='inner')
df['%_of_total'] = round(100.0*df['count']/df['total'])



# 1-
df_total = adult.groupby(['gender'])['age'].count().reset_index().rename(columns={'age':'total'})
df_inc_cat = adult.groupby(['gender', 'income_cat'])['age'].count().reset_index().rename(columns={'age':'count'})
df = pd.merge(df_inc_cat, df_total, on='gender', how='inner')
df['%_of_total'] = round(100.0*df['count']/df['total'])
df


# What is a predominant race that is employed in government jobs such as:
# 'State-gov', 'Federal-gov', 'Local-gov' ?

# can above questions be answered? If no, why not? If yes, write code below.

# --- 

df = adult[adult['workclass'].isin(['State-gov', 'Federal-gov', 'Local-gov'])][['race', 'gender', 'income_cat']]
df = df.groupby(['race', 'gender'])['income_cat'].count().reset_index()
df


# What percentage of self employeed adults earn more than 50K?
# 'Self-emp-inc',  'Self-emp-not-inc'

# --- 

df = adult[adult['workclass'].isin(['Self-emp-inc',  'Self-emp-not-inc'])]
df_total = df.groupby(['workclass'])['age'].count().reset_index().rename(columns={'age':'total'})
df_inc_cat = df.groupby(['workclass', 'income_cat'])['age'].count().reset_index().rename(columns={'age':'count'})
df = pd.merge(df_inc_cat, df_total, on='workclass', how='inner')
df['%_of_total'] = round(100.0*df['count']/df['total'])
df


# 1- Calculate percentage of divorced men vs. women?
# 2- What fraction for each earns >50K

# --- 

# 1-
df = adult[adult['marital_status'].isin(['Divorced'])]
df = df.groupby(['gender'])['age'].count().reset_index().rename(columns={'age':'count'})
df['percent'] = (100.0 * df['count']/(df['count'].sum())).round(2)



# 2-
df = adult[adult['marital_status'].isin(['Divorced'])]
df_total = df.groupby(['gender'])['age'].count().reset_index().rename(columns={'age':'total'})
df_inc_cat = df.groupby(['gender', 'income_cat'])['age'].count().reset_index().rename(columns={'age':'count'})
df = pd.merge(df_inc_cat, df_total, on='gender', how='inner')
df['%_of_total'] = round(100.0*df['count']/df['total'])
df


# Plot average hours_per_week with error bars that men vs. women work?

# --- 

df1 = adult.groupby(['gender'])['hours_per_week'].mean().reset_index().rename(columns={'hours_per_week':'mean'})
df2 = adult.groupby(['gender'])['hours_per_week'].std().reset_index().rename(columns={'hours_per_week':'sd'})
df = pd.merge(df1, df2, on='gender', how='inner')
df

plt.bar(df['gender'], df['mean'])
plt.errorbar(df['gender'], df['mean'], 
                 yerr=df['sd'], 
                 linestyle='', capsize=5, ecolor='red')
plt.ylabel('Mean hours per week')
plt.show()


# Plot average hours_per_week with error bars in Private vs. Goverment (i.e. 'State-gov', 'Federal-gov', 'Local-gov' combined)?

# --- 

df = adult[adult['workclass'].isin(['Private', 'State-gov', 'Federal-gov', 'Local-gov'])][['workclass', 'hours_per_week']]
df['workclass'] = df['workclass'].apply(lambda x: 'Government' if (x=='State-gov')|(x=='Federal-gov')|(x=='Local-gov') else
                                           x
                                       )
df1 = df.groupby(['workclass'])['hours_per_week'].mean().reset_index().rename(columns={'hours_per_week':'mean'})
df2 = df.groupby(['workclass'])['hours_per_week'].std().reset_index().rename(columns={'hours_per_week':'sd'})
df = pd.merge(df1, df2, on='workclass', how='inner')


plt.bar(df['workclass'], df['mean'])
plt.errorbar(df['workclass'], df['mean'], 
                 yerr=df['sd'], 
                 linestyle='', capsize=5, ecolor='red')
plt.ylabel('Mean hours per week')
plt.show()

	gender	income_cat	count	total	%_of_total
0	Female	<=50K	9592	10771	89.0
1	Female	>50K	1179	10771	11.0
2	Male	<=50K	15128	21790	69.0
3	Male	>50K	6662	21790	31.0

	workclass	income_cat	count	total	%_of_total
0	Self-emp-inc	<=50K	494	1116	44.0
1	Self-emp-inc	>50K	622	1116	56.0
2	Self-emp-not-inc	<=50K	1817	2541	72.0
3	Self-emp-not-inc	>50K	724	2541	28.0

	gender	income_cat	count	total	%_of_total
0	Female	<=50K	2493	2672	93.0
1	Female	>50K	179	2672	7.0
2	Male	<=50K	1487	1771	84.0
3	Male	>50K	284	1771	16.0

1994 USA Census Data¶

Import libraries¶

Read data¶

Data exploration¶

Data Analysis¶

	age	workclass	fnlwgt	education	education-num	marital-status	occupation	relationship	race	gender	capital-gain	capital-loss	hours-per-week	native-country	income-cat
0	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	0	40	United-States	<=50K
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	0	13	United-States	<=50K

	race	gender	income_cat
0	Amer-Indian-Eskimo	Female	36
1	Amer-Indian-Eskimo	Male	34
2	Asian-Pac-Islander	Female	44
3	Asian-Pac-Islander	Male	97
4	Black	Female	328
5	Black	Male	288
6	Other	Female	7
7	Other	Male	14
8	White	Female	1224
9	White	Male	2279