import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
np.set_printoptions(suppress=True)
# Question # 1
# Note: The data for this question was downloaded from:
# https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population_density
# Step-1: Download the file: 'population.tsv' into the same folder as this notebook
# Step-2: Copy and paste the path to that file below within quotes to variable 'path'
# Step-3: Read the file using pd.read_csv() using the '\t' separator (tab)
# Directory that holds 'population.tsv'. Step-1 places the file in the same
# folder as this notebook, so the default is the current directory; replace it
# with a folder path ending in '/' to load the file from somewhere else.
# (Previously this assignment was commented out, so 'path' was undefined here.)
path = ''
# The file is tab-separated, hence sep='\t'.
pop = pd.read_csv(path + 'population.tsv', sep='\t')
# Step-4: Look at the first five records using .head()
pop.head()
| Country | Unnamed: 1 | population | area_sq_km | area_sqmi | pop_den_per_km2 | pop_den_per_sqmi | |
|---|---|---|---|---|---|---|---|
| 0 | Macau | Macau | 6,86,607 | 33 | 13 | 20,806 | 53,888 |
| 1 | Monaco | Monaco | 36,686 | 2 | 1 | 18,343 | 47,508 |
| 2 | Singapore | Singapore | 54,53,600 | 716 | 276 | 7,617 | 19,727 |
| 3 | Hong Kong | Hong Kong | 74,94,578 | 1,104 | 426 | 6,789 | 17,582 |
| 4 | Gibraltar (BOT) | Gibraltar (BOT) | 32,669 | 6 | 2 | 5,445 | 14,102 |
# Remove the 'Unnamed: 1' column (it repeats the country name in the preview
# above), then look at the first five records again.
pop = pop.drop(columns=['Unnamed: 1'])
pop.head()
| Country | population | area_sq_km | area_sqmi | pop_den_per_km2 | pop_den_per_sqmi | |
|---|---|---|---|---|---|---|
| 0 | Macau | 6,86,607 | 33 | 13 | 20,806 | 53,888 |
| 1 | Monaco | 36,686 | 2 | 1 | 18,343 | 47,508 |
| 2 | Singapore | 54,53,600 | 716 | 276 | 7,617 | 19,727 |
| 3 | Hong Kong | 74,94,578 | 1,104 | 426 | 6,789 | 17,582 |
| 4 | Gibraltar (BOT) | 32,669 | 6 | 2 | 5,445 | 14,102 |
# Show the first row only (positional index 0); the result is a Series with
# one entry per column of 'pop'.
pop.iloc[0]
Country Macau population 6,86,607 area_sq_km 33 area_sqmi 13 pop_den_per_km2 20,806 pop_den_per_sqmi 53,888 Name: 0, dtype: object
# List the rows whose 'Country' text contains 'World'; these are removed from
# 'pop' later during preprocessing.
pop[pop['Country'].str.contains('World')]
| Country | population | area_sq_km | area_sqmi | pop_den_per_km2 | pop_den_per_sqmi | |
|---|---|---|---|---|---|---|
| 156 | World (excluding Antarctica) | 7,99,96,50,000 | 13,47,40,000 | 5,20,23,114 | 59 | 154 |
| 161 | World (all land) | 7,99,96,50,000 | 14,89,40,000 | 5,75,05,734 | 54 | 139 |
# use 'pop' and print
#1: shape
#2: data types in each column
#3: number of rows
#4: number of columns
#5: check for missing values in each columns. Hint: .isna().sum()
#6: total number of values or elements or items in the entire dataframe
# ---
# Summarise the raw table: dimensions, dtypes, missing values, cell count.
n_rows, n_cols = pop.shape
#1: shape
print("shape:", (n_rows, n_cols))
#2: dtypes
print('\ndata types:\n', pop.dtypes)
#3: number of rows (first entry of the shape)
print("\nnumber of rows:", n_rows)
#4: number of columns (second entry of the shape)
print("\nnumber of columns:", n_cols)
#5: missing values per column
missing_per_column = pop.isna().sum()
print('\nmissing value count:\n', missing_per_column)
#6: total number of cells (rows * columns)
print('total number of values:', pop.size)
shape: (250, 6) data types: Country object population object area_sq_km object area_sqmi object pop_den_per_km2 object pop_den_per_sqmi object dtype: object number of rows: 250 number of columns: 6 missing value count: Country 0 population 0 area_sq_km 0 area_sqmi 0 pop_den_per_km2 0 pop_den_per_sqmi 0 dtype: int64 total number of values: 1500
# A week has seven weekdays "Monday" to "Sunday". Create:
#1 a list of weekdays 'wk_days'
#2 a dictionary 'd_wk' with key: 'wk_days'
#3 a Series 'wk_days'
#4 a DataFrame 'df_wk' with column 'wk_days'
#Print each output to get expected output
# ---
# Hold the same seven weekday names in four containers: list, dict, Series and
# DataFrame. The names are written once and reused, so the four containers
# cannot drift apart.
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
#1: list 'wk_days'
wk_days = list(weekday_names)
print("wk_days:\n", wk_days, '\n')
#2: dictionary 'd_wk' with the single key 'wk_days'
d_wk = {'wk_days': list(weekday_names)}
print("d_wk:\n", d_wk, '\n')
#3: Series 'wk_days' (the exercise reuses the name, so this rebinds the list)
wk_days = pd.Series(weekday_names)
print("wk_days:\n", wk_days, '\n')
#4: DataFrame 'df_wk' with column 'wk_days'. It previously overwrote
#   'wk_days', so 'df_wk' was never created; the printed label is unchanged.
df_wk = pd.DataFrame(d_wk)
print("wk_days:\n", df_wk, '\n')
#wk_days
wk_days:
['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
d_wk:
{'wk_days': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']}
wk_days:
0 Monday
1 Tuesday
2 Wednesday
3 Thursday
4 Friday
5 Saturday
6 Sunday
dtype: object
wk_days:
wk_days
0 Monday
1 Tuesday
2 Wednesday
3 Thursday
4 Friday
5 Saturday
6 Sunday
# Fix the seed so the "random" values are reproducible from run to run.
np.random.seed(0)
#1 Find and use a numpy function to create 5 random float values and save in 'x'
# Print:
#2 type of x
#3 sum of x
#4 mean of x
#5 value at index 2
#6 sum of value of index 2 and 4
# ---
#1 five random floats drawn by np.random.rand
x = np.random.rand(5)
print('x:', x)
#2 type of x
x_type = type(x)
print('Type:', x_type)
#3 sum of x (array method instead of the module-level function)
xsum = x.sum()
print('Sum:', xsum)
#4 mean of x
xmean = x.mean()
print('Mean:', xmean)
#5 value at index 2
val_2 = x[2]
print('Value at index 2:', val_2)
#6 sum of the values at index 2 and index 4
val_4 = x[4]
sum_2_4 = val_2 + val_4
print("Sum of values at index 2 and 4:", sum_2_4)
x: [0.5488135 0.71518937 0.60276338 0.54488318 0.4236548 ] Type: <class 'numpy.ndarray'> Sum: 2.83530422870719 Mean: 0.567060845741438 Value at index 2: 0.6027633760716439 Sum of values at index 2 and 4: 1.0264181754105486
# Fix the seed so the "random" values are reproducible from run to run.
np.random.seed(0)
#1 Find and use a numpy function to create 5 random integer values between 0 to 10 and save in 'x'
# Print:
#2 type of x
#3 sum of x then multiply by 100
#4 mean of x then divide by standard deviation of x
#5 swap items from index 2 and index 0 and save in 'xnew' (i.e. move from 2 to 0 and 0 to 2)
#6 find absolute difference between x and xnew. Hint: abs()
# ---
#1 five random integers; note randint's upper bound (10) is exclusive
x = np.random.randint(0, 10, 5)
print('x:', x)
#2 type of x
print('Type:', type(x))
#3 sum of x then multiply by 100
xsum = x.sum() * 100
print('Sum:', xsum)
#4 mean of x then divide by standard deviation of x
xmean = x.mean() / x.std()
print('Mean:', xmean)
#5 swap items from index 2 and index 0 on a copy, leaving 'x' untouched
i_2 = x[2]
i_0 = x[0]
xnew = x.copy()
xnew[0], xnew[2] = i_2, i_0
print('xnew:', xnew)
#6 element-wise absolute difference between x and xnew
x_diff = abs(x - xnew)
print("Absolute difference between x and xnew:", x_diff)
x: [5 0 3 3 7] Type: <class 'numpy.ndarray'> Sum: 1800 Mean: 1.5434872662825798 xnew: [3 0 5 3 7] Absolute difference between x and xnew: [2 0 2 0 0]
# A tree has 20 large branches
# Each large branch has 10 small branches
# Each small branch has 50 stems or twigs
# Each twig has 10 leaves
# How many leaves does the tree have?
# Create your own variables
# ---
# Count at each level of the tree, each one "per parent" of the level above.
large_branches = 20   # per tree
small_branches = 10   # per large branch
twigs = 50            # per small branch
leaves = 10           # per twig
# Multiply level by level to get the number of leaves on the whole tree.
total_leaves = 1
for count_per_parent in (large_branches, small_branches, twigs, leaves):
    total_leaves *= count_per_parent
print("Total leaves:", total_leaves)
Total leaves: 100000
# The column country has a hidden string as prefix '\xa0'
# This can happen during import. It is important to remove this
# Write code to remove it from column 'Country'
# --- write code below this line ---
# Remove the non-breaking space character ('\xa0') from every country name.
# Note: str.replace removes every occurrence in the string, not only a
# leading one.
pop['Country'] = pop['Country'].str.replace('\xa0', '')
# Preprocess dataframe 'pop'
#1: Remove commas from numeric values
#2: convert numeric columns to float
#3:
# print('Before:',pop.shape)
# Then, remove all rows that have the word 'World' in the 'Country' column
# print('After:', pop.shape)
# Notice that the number of rows should be lesser!
#Hint: pop = pop[~pop['Country'].str.contains('World')]
# ---
#1,2: every column except 'Country' holds numbers stored as text with comma
#     separators; strip the commas, then cast the column to float
numeric_cols = [name for name in pop.columns if name != 'Country']
for name in numeric_cols:
    without_commas = pop[name].str.replace(',', '')
    pop[name] = without_commas.astype('float')
#3: drop the rows whose country name contains 'World'
print('Before:',pop.shape)
is_world_row = pop['Country'].str.contains('World')
pop = pop[~is_world_row]
print('After:', pop.shape)
Before: (250, 6) After: (248, 6)
# use 'pop'
#1: find highest population density per square km
#2: find country with highest population density
# ---
#1: largest value in the per-km2 density column
density_km2 = pop['pop_den_per_km2']
highest_pop = density_km2.max()
print("Highest population density:", highest_pop)
#2: first country whose density equals that maximum
at_maximum = density_km2 == highest_pop
country = pop.loc[at_maximum, 'Country'].iloc[0]
print("Country with highest population density:", country)
Highest population density: 20806.0 Country with highest population density: Macau
# use 'pop'
#1: find lowest population density per square km
#2: find country with lowest population density
# ---
#1: smallest value in the per-km2 density column
density_km2 = pop['pop_den_per_km2']
lowest_pop = density_km2.min()
print("Lowest population density:", lowest_pop)
#2: first country whose density equals that minimum
at_minimum = density_km2 == lowest_pop
country = pop.loc[at_minimum, 'Country'].iloc[0]
print("Country with lowest population density:", country)
Lowest population density: 0.03 Country with lowest population density: Greenland(Denmark)
#0: create a copy of 'pop' in 'df'
#1: calculate the population per square km and put it in a new column 'calc_pop_den_sqkm'
#2: calculate the difference in values between 'calc_pop_den_sqkm' and 'pop_den_per_km2' and put in column 'pop_den_diff'
#3: square the difference from 2 and save in column 'squared_diff' Hint: .apply()
#4: calculate sum of the column 'squared_diff' and save in variable 'total_sum_of_squares'
# Print expected output and comment
# ---
# Recompute population density from population and area, then measure how far
# it is from the density column shipped with the data (sum of squared errors).
# 0: independent copy of 'pop', restricted to rows with a positive area so the
#    division in step 1 never divides by zero. The copy is taken *after* the
#    filter, so the new columns are added to a real frame rather than to a
#    filtered slice (which triggers pandas' SettingWithCopyWarning).
df = pop[pop['area_sq_km'] > 0.0].copy()
# 1: density computed from the raw columns
df['calc_pop_den_sqkm'] = df['population'] / df['area_sq_km']
# 2: computed density minus the published density
df['pop_den_diff'] = df['calc_pop_den_sqkm'] - df['pop_den_per_km2']
# 3: square each difference (the exercise asks for .apply())
df['squared_diff'] = df['pop_den_diff'].apply(np.square)
# 4: total of the squared differences
total_sum_of_squares = df['squared_diff'].sum()
print("total_sum_of_squares = ", total_sum_of_squares)
total_sum_of_squares = 20.681494417412495
# use pop
#1: save names of all countries into a list 'country'
#2: keep only those country names that start with the letter 'M'
#3: how many countries are these?
#4: from this list, print only countries with four letter words
# ---
# 1: all country names as a plain Python list
country = pop['Country'].tolist()
# 2: keep the names that start with 'M'. startswith() is used instead of
#    indexing c[0] so an empty name is skipped rather than raising IndexError.
country = [c for c in country if c.startswith('M')]
print('country =\n', country)
#3: how many such countries there are
len_country = len(country)
print('\nCountries start with "M" are', len_country)
#4: of those, the names that are exactly four characters long
four_letter_country = [c for c in country if len(c) == 4]
print('\nCountries 4 letter words', four_letter_country)
country = ['Macau', 'Monaco', 'Maldives', 'Malta', 'Mayotte(France)', 'Mauritius', 'Martinique(France)', 'Marshall Islands[note 3]', 'Micronesia[note 4]', 'Malawi', 'Moldova', 'Malaysia', 'Morocco', 'Myanmar', 'Mexico', 'Montserrat(BOT)', 'Montenegro', 'Madagascar', 'Mozambique', 'Mali', 'Mauritania', 'Mongolia'] Countries start with "M" are 22 Countries 4 letter words ['Mali']
# use pop
# using indexing find the population density per square km of country 'Malta'
# ---
# Boolean mask picks the row(s) named 'Malta'; take the first matching value
# from the per-km2 density column.
is_malta = pop['Country'] == 'Malta'
malta_pop_den = pop.loc[is_malta, 'pop_den_per_km2'].iloc[0]
print("Population density of Malta (per sq. km.):", malta_pop_den)
Population density of Malta (per sq. km.): 1667.0
# use pop
#1: get values from the column 'area_sq_km' and save in an array 'area'
#2: find mean and standard deviation for area
# Print output
# ---
#1: the area column as a NumPy array
area = pop['area_sq_km'].to_numpy()
#2: mean and standard deviation, via the array's own methods
area_mean = area.mean()
area_std = area.std()
print("Mean area (sq.km.):", area_mean, "SD area:", area_std)
Mean area (sq.km.): 545636.9677419355 SD area: 1696921.636869445
# use pop
#0: create a copy of pop in df and remove all rows that have the 'World' in the 'Country' column
#1: find the country with largest population as 'pop_max'
#2: find the country with largest area in square miles as 'area_max'
#3: If they are the same print "Yes same!" else 'Not same!'
# ---
#1: largest population and the first country that reaches it
population_col = pop['population']
pop_max = population_col.max()
pop_max_country = pop.loc[population_col == pop_max, 'Country'].iloc[0]
print("Country with highest population:", pop_max_country)
#2: largest area (square miles) and the first country that reaches it
area_col = pop['area_sqmi']
area_max = area_col.max()
area_max_country = pop.loc[area_col == area_max, 'Country'].iloc[0]
print("Country with highest area (sq.miles):", area_max_country)
print("pop_max = ", pop_max)
print("area_max =", area_max)
#3: are the most populous and the largest country the same one?
verdict = "Yes same!" if pop_max_country == area_max_country else "Not same!"
print(verdict)
Country with highest population: China Country with highest area (sq.miles): Russia[note 12] pop_max = 1425893465.0 area_max = 6601633.0 Not same!
# use pop
#1: create df1 with columns 'Country' and 'population'. Sort by 'population' high to low
#1A: subset df1 with top 10 rows with highest values
#1C: create subplot bar plot with 'Country' on x-axis and 'population' on y-axis
#2: create df2 with columns 'Country' and 'pop_den_per_km2'. Sort by 'pop_den_per_km2'
#2A: subset df2 with top 10 rows with highest values
#2B: create subplot bar plot with 'Country on x-axis' and 'pop_den_per_km2' on y-axis
# Hint:
# plt.barh()
# ---
#1: ten countries with the largest population
by_population = pop[['Country', 'population']].sort_values(by='population', ascending=False)
df1 = by_population.iloc[:10]
#2: ten countries with the largest population density per km2
by_density = pop[['Country', 'pop_den_per_km2']].sort_values(by='pop_den_per_km2', ascending=False)
df2 = by_density.iloc[:10]
#1C: two horizontal bar charts side by side (population left, density right)
fig, (ax_population, ax_density) = plt.subplots(1, 2, figsize=(10, 5))
ax_population.barh(df1['Country'], df1['population'])
ax_density.barh(df2['Country'], df2['pop_den_per_km2'])
fig.tight_layout()
plt.show()
# use pop
#1: Create an array 'numbers' with numeric columns from pop
# such that each column in the dataframe is a column in the array
# print its shape, size, type
#2: Slice the array to get first 5 rows and 3 columns and save in 'new_num' again
#2: Zero mean and scale values in each column that is:
# 2a- subtract each value in a column by its respective column mean
# 2b- then, divide the value from 2a by column standard deviation
# then print output array
#3- print min and max values for each column in 'numbers' and 'new_num'.
# Comment on how values between these two arrays compare with each other.
# ---
#1: every column after the first ('Country') as a 2-D array
numbers = pop.iloc[:, 1:].to_numpy()
for label, value in (("shape", numbers.shape), ("size", numbers.size), ("type", type(numbers))):
    print(label + ":\n", value, "\n")
#2: first 5 rows x first 3 columns, then standardise each column:
#   subtract the column mean and divide by the column standard deviation
new_num = numbers[:5, :3]
print('new_num:\n', new_num)
col_mean = new_num.mean(axis=0)
print('col_mean:', col_mean)
col_std = new_num.std(axis=0)
print('col_std:', col_std)
centered = new_num - col_mean
new_num = centered / col_std
print('new_num:\n', new_num)
#3: per-column minimum and maximum of the raw and the standardised arrays
numbers_col_min = numbers.min(axis=0)
numbers_col_max = numbers.max(axis=0)
new_num_col_min = new_num.min(axis=0)
new_num_col_max = new_num.max(axis=0)
print('\nnumbers_col_min:', numbers_col_min)
print('numbers_col_max:', numbers_col_max)
print('new_num_col_min:', new_num_col_min)
print('new_num_col_max:', new_num_col_max)
shape: (248, 5) size: 1240 type: <class 'numpy.ndarray'> new_num: [[ 686607. 33. 13.] [ 36686. 2. 1.] [5453600. 716. 276.] [7494578. 1104. 426.] [ 32669. 6. 2.]] col_mean: [2740828. 372.2 143.6] col_std: [3124855.32557173 456.05631231 175.90974959] new_num: [[-0.65738115 -0.7437678 -0.74242616] [-0.8653655 -0.81174186 -0.81064296] [ 0.86812723 0.75385427 0.75265868] [ 1.52127043 1.6046264 1.60536867] [-0.866651 -0.80297102 -0.80495823]] numbers_col_min: [47. 0. 0. 0.03 0.07] numbers_col_max: [1.42589346e+09 1.70982460e+07 6.60163300e+06 2.08060000e+04 5.38880000e+04] new_num_col_min: [-0.866651 -0.81174186 -0.81064296] new_num_col_max: [1.52127043 1.6046264 1.60536867]
# use pop
#0: Create a list countries with values from column 'Country'.
#0A: Filter it to keep only those country names that start with letters, A, B, C, and D
#0B: Sort the list in ascending order i.e. from A to D
#1: Create an empty dictionary 'd' with empty lists for keys 'A', 'B', 'C', 'D'
# append values for countries starting with respective alphabet of a key
# For example: key 'A' would have all countries with names ['Andorra', 'Anguilla(BOT)' ...]
# Hint:
# .to_list()
# ---
#0: names starting with A, B, C or D, sorted alphabetically.
#   startswith() with a tuple replaces indexing name[0], so an empty name is
#   skipped instead of raising IndexError.
initials = ('A', 'B', 'C', 'D')
countries = sorted(name for name in pop['Country'].to_list() if name.startswith(initials))
print('countries:\n', countries)
#1: one list per initial; every kept name is non-empty and starts with one of
#   the keys, so its first letter can be used directly as the dictionary key.
#   The loop variable is 'name' so it does not overwrite the earlier 'country'.
d = {letter: [] for letter in initials}
for name in countries:
    d[name[0]].append(name)
print('\nd:\n',d)
countries:
['Afghanistan', 'Albania', 'Algeria', 'American Samoa(US)', 'Andorra', 'Angola', 'Anguilla(BOT)', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Artsakh[note 10]', 'Aruba(Netherlands)', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda(BOT)', 'Bhutan', 'Bolivia', 'Bonaire(Netherlands)', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'British Virgin Islands(BOT)', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands(BOT)', 'Central African Republic', 'Chad', 'Chile', 'China', 'Christmas Island(Australia)', 'Cocos (Keeling) Islands(Australia)', 'Colombia', 'Comoros', 'Congo', 'Cook Islands[note 7]', 'Costa Rica', 'Croatia', 'Cuba', 'Curaçao(Netherlands)', 'Cyprus[note 5]', 'Czech Republic', 'DR Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic']
d:
{'A': ['Afghanistan', 'Albania', 'Algeria', 'American Samoa(US)', 'Andorra', 'Angola', 'Anguilla(BOT)', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Artsakh[note 10]', 'Aruba(Netherlands)', 'Australia', 'Austria', 'Azerbaijan'], 'B': ['Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda(BOT)', 'Bhutan', 'Bolivia', 'Bonaire(Netherlands)', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'British Virgin Islands(BOT)', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi'], 'C': ['Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands(BOT)', 'Central African Republic', 'Chad', 'Chile', 'China', 'Christmas Island(Australia)', 'Cocos (Keeling) Islands(Australia)', 'Colombia', 'Comoros', 'Congo', 'Cook Islands[note 7]', 'Costa Rica', 'Croatia', 'Cuba', 'Curaçao(Netherlands)', 'Cyprus[note 5]', 'Czech Republic'], 'D': ['DR Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic']}
# use pop
# create scatter plots as subplots for two items below
#1: population and area_sqmi
#2: np.log(population) and np.log(area_sqmi)
# Learn this!
# Taking a logarithm of values makes small values larger so that we can see it on a plot
# Do you see them in plot #2 ?
# --- write code below this line ---
# Two scatter plots of population against area: raw values on the left,
# natural logarithms on the right.
plt.figure(figsize=(10,5))
plt.subplot(1,2,1)
plt.scatter(pop['population'], pop['area_sqmi'])
plt.title('original')
plt.xlabel('population')
plt.ylabel('area_sqmi')
plt.subplot(1,2,2)
# log(0) is -inf and raises "divide by zero encountered in log"; such points
# cannot be drawn anyway, so keep only rows where both values are positive.
positive = pop[(pop['population'] > 0) & (pop['area_sqmi'] > 0)]
plt.scatter(np.log(positive['population']), np.log(positive['area_sqmi']))
# This panel shows the transformed data, so it must not be titled 'original'.
plt.title('log-transformed')
plt.xlabel('log[population]')
plt.ylabel('log[area_sqmi]')
# tight_layout must be *called*; the bare attribute reference did nothing.
plt.tight_layout()
plt.show()
/opt/anaconda3/lib/python3.8/site-packages/pandas/core/arraylike.py:397: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
#1:
# create a copy of 'pop' in 'df'
# subset df to keep only two columns 'Country' and 'population'
# create a new column 'class' in 'df' such that it has values:
# 'low' if population <= 1000000
# 'medium' if 1000000 < population <= 500000000
# 'high' if 500000000 < population <= 1000000000
# 'very_high' if population > 1000000000
#2:
# use line of code below to create to group values
# dfa = df.groupby(['class'])['Country'].count().reset_index().rename(columns={'Country':'count'})
# Then use: sns.barplot(data=dfa, x="class", y="count") to create a bar plot
# What do you see? Comment!
#3:
# print countries with very_high population in df
# ---
#1: two-column copy of 'pop' (an explicit copy, so adding 'class' below writes
#   to an independent frame rather than to a slice of 'pop')
df = pop[['Country', 'population']].copy()
# Class boundaries are inclusive at the top, as the exercise specifies:
#   low <= 1000000 < medium <= 500000000 < high <= 1000000000 < very_high
# np.select takes the first condition that is true, so each upper bound only
# needs to be tested once. The old '< 1000000' test sent a population of
# exactly 1000000 to 'very_high'.
population = df['population']
df['class'] = np.select(
    [population <= 1000000, population <= 500000000, population <= 1000000000],
    ['low', 'medium', 'high'],
    default='very_high',
)
#2: number of countries in each class, shown as a bar plot
dfa = df.groupby(['class'])['Country'].count().reset_index().rename(columns={'Country':'count'})
sns.barplot(data=dfa, x="class", y="count")
plt.show()
#3: the countries in the 'very_high' class
very_high_countries = df[df['class']=='very_high']
very_high_countries
| Country | population | class | |
|---|---|---|---|
| 29 | India | 1.407564e+09 | very_high |
| 84 | China | 1.425893e+09 | very_high |
# use pop
# Create a scatter plot of 'pop_den_per_km2' vs. 'pop_den_per_sqmi'
# Do you see any trend? Comment why?
# ---
# Plot the two density columns against each other (per km2 on x, per square
# mile on y).
density_per_km2 = pop['pop_den_per_km2']
density_per_sqmi = pop['pop_den_per_sqmi']
plt.scatter(density_per_km2, density_per_sqmi)
plt.title('Population density')
plt.xlabel('pop_den_per_km2')
plt.ylabel('pop_den_per_sqmi')
plt.show()
# Question - 24
geo = ['India', ['Maharashtra','Gujarat',['Kerala']], 'USA', [['Texas',['California']], 'Nebraska']]
# write code using indices to print
# 'India'
# 'Kerala'
# 'USA'
# 'California'
# --- write code below this line ---
# Layout: geo[0] and geo[2] are country names; geo[1] and geo[3] are the
# nested lists that follow each of them.
after_india = geo[1]
after_usa = geo[3]
# 'India' is the first top-level element
item = geo[0]
print(item)
# 'Kerala' sits alone in a list at index 2 of the list after 'India'
kerala_wrapper = after_india[2]
item = kerala_wrapper[0]
print(item)
# 'USA' is the third top-level element
item = geo[2]
print(item)
# 'California': first element of the list after 'USA', then index 1, then 0
texas_group = after_usa[0]
item = texas_group[1][0]
print(item)
India Kerala USA California
#A:
# Read file 'population_india.tsv' (tsv --> tab separated)
# using the method shown Q#1
# and save it into a dataframe 'popi'
#B:
#1: shape
#2: data types in each column
#3: number of rows
#4: number of columns
#5: check for missing values in each columns. Hint: .isna().sum()
#6: total number of values
# ---
#A: read the tab-separated India population file into 'popi'
# NOTE(review): 'path' is an absolute, machine-specific folder; adjust it to
# wherever the data lives on your machine.
path = '/Users/erv/Desktop/learndataa/Students/Python/Exams_Exercise/Exam_3/data/'
popi = pd.read_csv(path + 'population_india.tsv', sep='\t')
#B: summarise the table
n_rows, n_cols = popi.shape
#1: shape
print("shape:", (n_rows, n_cols))
#2: dtypes
print('\ndata types:\n', popi.dtypes)
#3: number of rows (first entry of the shape)
print("\nnumber of rows:", n_rows)
#4: number of columns (second entry of the shape)
print("\nnumber of columns:", n_cols)
#5: missing values per column
missing_per_column = popi.isna().sum()
print('\nmissing value count:\n', missing_per_column)
#6: total number of cells (rows * columns)
print('total number of values:', popi.size)
shape: (73, 11) data types: Year object Mid-year population object Births per year object Deaths per year object Natural change per year object Crude birth rate\n(per 1000) float64 Crude death rate\n(per 1000) float64 Natural change\n(per 1000) float64 Total Fertility rate float64 Infant mortality (per 1000) float64 Life expectancy float64 dtype: object number of rows: 73 number of columns: 11 missing value count: Year 0 Mid-year population 1 Births per year 1 Deaths per year 1 Natural change per year 1 Crude birth rate\n(per 1000) 1 Crude death rate\n(per 1000) 1 Natural change\n(per 1000) 1 Total Fertility rate 1 Infant mortality (per 1000) 1 Life expectancy 1 dtype: int64 total number of values: 803
#1: remove all rows with missing values.
#1a: Print dataframe shape before and after values are dropped.
#1b: Check again for missing values in each column. They should be zero. Hint: popi.isna().sum()
# Hint: popi = popi.dropna()
# --- write code below this line ---
#1, 1a: drop every row that contains at least one missing value, printing the
#       shape before and after so the number of removed rows is visible
print('Before:', popi.shape)
popi = popi.dropna()
print('After:', popi.shape)
#1b: check again for missing values; every count should now be zero
#    (this step was requested by the exercise but was not implemented)
print('\nmissing value count:\n', popi.isna().sum())
Before: (73, 11) After: (72, 11)
# use 'popi' from Q#26
#1:
# Convert columns below to float:
# - 'Mid-year population'
# - 'Births per year'
# - 'Deaths per year'
# - 'Natural change per year'
# Hint: remove the commas before converting
#2:
# Convert column 'Year' to datetime
#Hint: popi['Year'] = pd.to_datetime(popi.loc[:,'Year'])
#3:
# print data types for preprocessed dataframe popi
#4:
# output dataframe popi.head(3)
# ---
#1: the four count columns are text with comma separators; strip the commas
#   and cast each column to float
count_columns = [
    'Mid-year population',
    'Births per year',
    'Deaths per year',
    'Natural change per year',
]
for column in count_columns:
    popi[column] = popi[column].str.replace(',','').astype(float)
#2: parse 'Year' as a datetime, then keep only its year component
years_as_dates = pd.to_datetime(popi.loc[:,'Year'])
popi['Year'] = years_as_dates.dt.year
#3: data types of the preprocessed dataframe
print(popi.dtypes)
#4: first three rows
popi.head(3)
Year int64 Mid-year population float64 Births per year float64 Deaths per year float64 Natural change per year float64 Crude birth rate\n(per 1000) float64 Crude death rate\n(per 1000) float64 Natural change\n(per 1000) float64 Total Fertility rate float64 Infant mortality (per 1000) float64 Life expectancy float64 dtype: object
| Year | Mid-year population | Births per year | Deaths per year | Natural change per year | Crude birth rate\n(per 1000) | Crude death rate\n(per 1000) | Natural change\n(per 1000) | Total Fertility rate | Infant mortality (per 1000) | Life expectancy | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1950 | 357021000.0 | 15651000.0 | 7942000.0 | 7709000.0 | 43.8 | 22.2 | 21.6 | 5.73 | 181.2 | 41.7 |
| 1 | 1951 | 364922000.0 | 16042000.0 | 8171000.0 | 7871000.0 | 44.0 | 22.4 | 21.6 | 5.77 | 180.1 | 41.7 |
| 2 | 1952 | 372997000.0 | 16458000.0 | 8293000.0 | 8165000.0 | 44.1 | 22.2 | 21.9 | 5.82 | 177.5 | 42.0 |
# use 'popi' from Q#27
# format the column names as:
# 1- strip leading and trailing spaces
# - replace '-', '\n', '(', or spaces with '_' (underscore) and ')' with '' nothing
# - convert text to lowercase
# - print column names
# 2- rename popi with these cleaned column names
# - output first three rows of popi
# ---
# 1- normalise each column name: trim surrounding whitespace, apply the
#    character substitutions in this exact order, then lower-case the result
substitutions = [('-', '_'), ('\n', '_'), ('(', '_'), (')', ''), (' ', '_')]
cols = []
for raw_name in popi.columns:
    cleaned = raw_name.strip()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    cols.append(cleaned.lower())
print('Formatted column names:\n', cols)
# 2- install the cleaned names on the dataframe and preview it
popi.columns = cols
popi.head(3)
Formatted column names: ['year', 'mid_year_population', 'births_per_year', 'deaths_per_year', 'natural_change_per_year', 'crude_birth_rate__per_1000', 'crude_death_rate__per_1000', 'natural_change__per_1000', 'total_fertility_rate', 'infant_mortality__per_1000', 'life_expectancy']
| year | mid_year_population | births_per_year | deaths_per_year | natural_change_per_year | crude_birth_rate__per_1000 | crude_death_rate__per_1000 | natural_change__per_1000 | total_fertility_rate | infant_mortality__per_1000 | life_expectancy | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1950 | 357021000.0 | 15651000.0 | 7942000.0 | 7709000.0 | 43.8 | 22.2 | 21.6 | 5.73 | 181.2 | 41.7 |
| 1 | 1951 | 364922000.0 | 16042000.0 | 8171000.0 | 7871000.0 | 44.0 | 22.4 | 21.6 | 5.77 | 180.1 | 41.7 |
| 2 | 1952 | 372997000.0 | 16458000.0 | 8293000.0 | 8165000.0 | 44.1 | 22.2 | 21.9 | 5.82 | 177.5 | 42.0 |
# use 'popi' from Q#28
# 1- line plot: (x-axis) 'year' vs. (y-axis) 'mid_year_population'
# 2- line plot: (x-axis) 'year' vs. (y-axis) 'births_per_year'
# 3- line plot: (x-axis) 'year' vs. (y-axis) 'deaths_per_year'
# 4- line plot: (x-axis) 'year' vs. (y-axis) 'natural_change_per_year'
# 5- Comment on what you observe in each of the above plots.
# ---
# 1-4: one line plot per series against 'year', laid out on a 2x2 grid.
# Each entry pairs a column of 'popi' with the y-axis label of its panel.
panels = [
    ('mid_year_population', 'Mid year population'),
    ('births_per_year', 'Births per year'),
    ('deaths_per_year', 'Deaths per year'),
    ('natural_change_per_year', 'Natural change year population'),
]
plt.figure(figsize=(15,8))
for position, (column, y_label) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.plot(popi['year'], popi[column], marker='x')
    plt.xlabel('Year')
    plt.ylabel(y_label)
plt.tight_layout()
plt.show()
# use 'popi' from Q#28
# 1-
# create one overlay plot for:
# line plot: (x-axis) 'year' vs. (y-axis) 'deaths_per_year'
# line plot: (x-axis) 'year' vs. (y-axis) 'births_per_year'
# 2-
# draw a vertical line at Year 2000 and 2020
# Hint: plt.axvline(x=2000, ymin=0, ymax=3.0e7, color='red', linestyle='--')
# 3-
# Comment on what you observe in the plot
# ---
# 1- births and deaths per year drawn on the same axes
plt.plot(popi['year'], popi['births_per_year'], marker='x', label='births')
plt.plot(popi['year'], popi['deaths_per_year'], marker='x', label='deaths')
# 2- vertical markers at the years 2000 and 2020.
# axvline's ymin/ymax are fractions of the axes height (0 to 1), not data
# values, so the old ymax=3.0e7 was out of range. The defaults (0 and 1)
# already span the full height of the plot.
plt.axvline(x=2000, color='red', linestyle='--')
plt.axvline(x=2020, color='green', linestyle='--')
plt.xlabel('Year')
plt.ylabel('Count per year')
plt.legend()
plt.show()
# use 'popi' from Q#28
# 0-
#- create a copy of 'popi' and save in 'dfx'
#- drop columns 'total_fertility_rate', 'infant_mortality__per_1000', 'life_expectancy'
# Hint: dfx.loc[:,~dfx.columns.isin(['total_fertility_rate', ....])]
# note that that '~' sign negates the output of dfx.columns.isin()
# 1- Now use 'dfx' Calculate:
# - births per 1000 people in a year; save in a new column 'calc_birth_rate_per_1000'
# - deaths per 1000 people in a year; save in a new column 'calc_deaths_rate_per_1000'
# - natural change per 1000 people in a year; save in a new column 'calc_natural_change_per_1000'
# - round all above calculated percentages in columns to just one decimal place
# Hint: Calculate percentages based on 'mid_year_population'
# 2-
# - calculate difference between
# - 'calc_birth_rate_per_1000' and 'crude_birth_rate__per_1000' --> save in col 'diff_birth_rate'
# - 'calc_death_rate_per_1000' and 'crude_death_rate__per_1000' --> save in col 'diff_death_rate'
# - 'calc_natural_change_per_1000' and 'natural_change__per_1000' --> save in col 'diff_natural_change'
# 3-
# calculate the sum of columns
# - 'diff_birth_rate' as 'sum_diff_birth_rate'
# - 'diff_death_rate' as 'sum_diff_death_rate'
# - 'diff_natural_change' as 'sum_diff_natural_change'
# - round the sum to zero decimal places
# print output
# 4-
# - Comment on the sum calculated in #3. What do you observe
# ---
# 0- copy 'popi' and keep every column except the three listed ones
dfx = popi.copy()
unwanted = ['total_fertility_rate', 'infant_mortality__per_1000', 'life_expectancy']
keep_column = ~dfx.columns.isin(unwanted)
dfx = dfx.loc[:, keep_column]
# 1- events per 1000 people, based on the mid-year population and rounded to
#    one decimal place
mid_year = dfx['mid_year_population']
for calc_col, count_col in (
    ('calc_birth_rate_per_1000', 'births_per_year'),
    ('calc_death_rate_per_1000', 'deaths_per_year'),
    ('calc_natural_change_per_1000', 'natural_change_per_year'),
):
    dfx[calc_col] = ((1000 * dfx[count_col]) / mid_year).round(1)
# 2- calculated rate minus the rate already present in the data
for diff_col, calc_col, given_col in (
    ('diff_birth_rate', 'calc_birth_rate_per_1000', 'crude_birth_rate__per_1000'),
    ('diff_death_rate', 'calc_death_rate_per_1000', 'crude_death_rate__per_1000'),
    ('diff_natural_change', 'calc_natural_change_per_1000', 'natural_change__per_1000'),
):
    dfx[diff_col] = dfx[calc_col] - dfx[given_col]
# 3- column totals of the differences, rounded to zero decimal places
sum_diff_birth_rate = round(dfx['diff_birth_rate'].sum())
sum_diff_death_rate = round(dfx['diff_death_rate'].sum())
sum_diff_natural_change = round(dfx['diff_natural_change'].sum())
print("sum_diff_birth_rate:", sum_diff_birth_rate)
print("sum_diff_death_rate:", sum_diff_death_rate)
print("sum_diff_natural_change:", sum_diff_natural_change)
# Output:
# sum_diff_birth_rate: 0
# sum_diff_death_rate: 0
# sum_diff_natural_change: 0
# use 'popi' from Q#28
# 1-
# - create a copy of 'popi' and save in 'dfy'
# 2-
# - Comment on whether it is possible to calculate the values in the columns
# 'natural_change_per_year' and 'natural_change__per_1000'
# based on other columns in the dataframe
#
# - If no, why not?
# 3- If yes, calculate
# - values in 'natural_change_per_year' and save in column 'new_natural_change_per_year'
# - values in 'natural_change__per_1000' and save in column 'new_natural_change__per_1000'
# 4-
# - calculate difference between
# - 'new_natural_change_per_year' and 'natural_change_per_year' --> save in col 'diff_change'
# - 'new_natural_change__per_1000' and 'natural_change__per_1000' --> save in col 'diff_change_rate'
# 5-
# calculate the sum of columns
# - 'diff_change' as 'sum_diff_change'
# - 'diff_change_rate' as 'sum_diff_change_rate'
# - round the sum to zero decimal places
# print output
# 6-
# - Comment on the sum calculated in #5. What do you observe
# ---
# 1-
dfy = popi.copy()

# 2-
# Comment: Yes it is possible
# 3- Natural change recomputed as births minus deaths, as yearly counts and as rates per 1000.
dfy = dfy.assign(
    new_natural_change_per_year=lambda d: d['births_per_year'] - d['deaths_per_year'],
    new_natural_change__per_1000=lambda d: d['crude_birth_rate__per_1000'] - d['crude_death_rate__per_1000'],
)

# 4- Recomputed value minus the value already present in the data:
# - 'new_natural_change_per_year' vs 'natural_change_per_year' --> 'diff_change'
# - 'new_natural_change__per_1000' vs 'natural_change__per_1000' --> 'diff_change_rate'
dfy = dfy.assign(
    diff_change=lambda d: d['new_natural_change_per_year'] - d['natural_change_per_year'],
    diff_change_rate=lambda d: d['new_natural_change__per_1000'] - d['natural_change__per_1000'],
)

# 5- Sum each difference column, round to a whole number, and print the result.
sum_diff_change = round(dfy['diff_change'].sum())
sum_diff_change_rate = round(dfy['diff_change_rate'].sum())
print(f"sum_diff_change = {sum_diff_change}")
print(f"sum_diff_change_rate = {sum_diff_change_rate}")
# Output:
# sum_diff_change = 3000
# sum_diff_change_rate = 0
# use 'popi' from Q#28
# Note: Create plots to visualize data for interpretation
# 1-
# - line plot: (x-axis) 'year' vs. (y-axis) 'total_fertility_rate'
# - make sure that the y-axis starts at zero
# - What do you observe in the plot? Comment
# - Is the fertility rate increasing or decreasing or is the same over the years?
# 2- create
# - line plot: (x-axis) 'year' vs. (y-axis) 'total_fertility_rate'
# - line plot: (x-axis) 'year' vs. (y-axis) 'crude_birth_rate__per_1000'
# - line plot: (x-axis) 'year' vs. (y-axis) 'crude_death_rate__per_1000'
# - line plot: (x-axis) 'year' vs. (y-axis) 'life_expectancy'
# Comment: What do you observe?
# 3A- create
# create a dataframe df_temp with columns: 'variable', 'mean_rate', 'sd_rate'
# variable --> has names ['total_fertility_rate','crude_birth_rate__per_1000', 'crude_death_rate__per_1000']
# mean_rate --> has mean values for each
# sd_rate --> has standard deviation (SD) values for each
# 3B-
# Then, rename values in column 'variable' as below:
# 'total_fertility_rate' --> 'fertility'
# 'crude_birth_rate__per_1000' --> 'birth'
# 'crude_death_rate__per_1000' --> 'death'
# Hint: nested np.where()
# 3C-
# Then use df_temp to create a bar plot:
# (x-axis): 'total_fertility_rate', 'crude_birth_rate__per_1000', 'crude_death_rate__per_1000'
# (y-axis): mean
# (error bars): standard deviation
# Hint-1:
# mean --> popi['total_fertility_rate'].mean()
# standard deviation --> popi['total_fertility_rate'].std()
#
# Hint-2: plt.errorbar(x=df_temp['variable'], y=df_temp['mean_rate'],
# yerr=df_temp['sd_rate'],
# linestyle='', capsize=5, ecolor='red')
#
# Comment: What do you observe?
# ---
# 1-
# Fertility rate by year; the y-axis is fixed to the range 0-7 so it starts at zero.
plt.plot(
    popi['year'],
    popi['total_fertility_rate'],
    marker='x',
)
plt.xlabel('Year')
plt.ylabel('Total fertility rate')
plt.ylim(bottom=0, top=7)
plt.show()
# 2-
# One line per series on shared axes; each pair is (column in 'popi', legend label).
overlay_series = [
    ('total_fertility_rate', 'fertility'),
    ('crude_birth_rate__per_1000', 'birth'),
    ('crude_death_rate__per_1000', 'death'),
    ('life_expectancy', 'life_exp'),
]
for series_column, series_label in overlay_series:
    plt.plot(popi['year'], popi[series_column], marker='x', label=series_label)
plt.title('Population')
plt.xlabel('Year')
plt.ylabel('Rate')
plt.legend()
plt.show()
# 3-
# create DataFrame
# Mean and standard deviation of each of the three rate columns.
rate_columns = ['total_fertility_rate', 'crude_birth_rate__per_1000', 'crude_death_rate__per_1000']
mean_fr, mean_br, mean_dr = (popi[rate_column].mean() for rate_column in rate_columns)
sd_fr, sd_br, sd_dr = (popi[rate_column].std() for rate_column in rate_columns)

df_temp = pd.DataFrame({
    'variable': rate_columns,
    'mean_rate': [mean_fr, mean_br, mean_dr],
    'sd_rate': [sd_fr, sd_br, sd_dr],
})
# Bare expression: shows the frame only if it is the last statement of a notebook cell.
df_temp

# Shorten the labels: fertility / birth, and everything else becomes 'death'.
is_fertility = df_temp['variable'] == 'total_fertility_rate'
is_birth = df_temp['variable'] == 'crude_birth_rate__per_1000'
df_temp['variable'] = np.where(
    is_fertility,
    'fertility',
    np.where(is_birth, 'birth', 'death'),
)

# Bars show the means; the red caps show one standard deviation either side.
plt.bar(df_temp['variable'], df_temp['mean_rate'])
plt.errorbar(
    x=df_temp['variable'],
    y=df_temp['mean_rate'],
    yerr=df_temp['sd_rate'],
    linestyle='',
    capsize=5,
    ecolor='red',
)
plt.ylabel('Rate')
plt.show()
# use 'popi' from Q#28
# Note: Create plots to visualize data for interpretation
# 1-
# Create three scatter subplots:
# - (x-axis) 'total_fertility_rate' vs. (y-axis) 'total_fertility_rate'
# - (x-axis) 'total_fertility_rate' vs. (y-axis) 'crude_birth_rate__per_1000'
# - (x-axis) 'total_fertility_rate' vs. (y-axis) 'crude_death_rate__per_1000'
# - Make sure that both x-axis and y-axis start at zero --> plt.xlim(); plt.ylim()
# Hint: plt.scatter(popi['total_fertility_rate'], popi['total_fertility_rate'], alpha=0.5)
# Comment: What does each plot tell us? For example, as fertility rate increases,
# what happens to values on y-axis? Do they increase or decrease?
# --- write code below this line ---
# Three side-by-side scatter plots with the fertility rate on every x-axis.
# Each entry is (y-axis column, y-axis label, upper y-axis limit).
scatter_panels = [
    ('total_fertility_rate', 'Fertility rate', 7),
    ('crude_birth_rate__per_1000', 'Birth rate', 45),
    ('crude_death_rate__per_1000', 'Death rate', 24),
]

plt.figure(figsize=(15, 5))
for panel_number, (y_column, y_label, y_upper) in enumerate(scatter_panels, start=1):
    plt.subplot(1, 3, panel_number)
    plt.scatter(popi['total_fertility_rate'], popi[y_column], alpha=0.5)
    plt.xlabel('Fertility rate')
    plt.ylabel(y_label)
    # Both axes start at zero, as the task requires.
    plt.xlim(0, 7)
    plt.ylim(0, y_upper)
plt.show()
# use 'popi' from Q#28
# Note: Create plots to visualize data for interpretation
# 1-
# Create an overlay plot:
# - (x-axis) 'year' vs. (y-axis) 'infant_mortality__per_1000'
# - (x-axis) 'year' vs. (y-axis) 'crude_death_rate__per_1000'
# Comment: What do you observe?
# ---
# Overlay infant mortality and the crude death rate by year. Labels, a legend
# and axis titles are set so the two lines can be told apart, matching the
# other overlay plots in this file.
plt.plot(popi['year'], popi['infant_mortality__per_1000'], marker='+', label='infant mortality')
plt.plot(popi['year'], popi['crude_death_rate__per_1000'], marker='.', label='death')
plt.xlabel('Year')
plt.ylabel('Rate per 1000')
plt.legend()
plt.show()
# use 'popi' from Q#28
# Note: Create plots to visualize data for interpretation
# Calculate percentage increase or decrease in:
# 1- 'mid_year_population' from 1950 to 2020 as 'percent_change_population'
# 2- 'crude_birth_rate__per_1000' from 1950 to 2020 as 'percent_change_birth'
# 3- 'crude_death_rate__per_1000' from 1950 to 2020 as 'percent_change_death'
# 4- 'infant_mortality__per_1000' from 1950 to 2020 as 'percent_change_infant_mortality'
# 5- 'life_expectancy' from 1950 to 2020 as 'percent_change_life_expectancy'
# Note: All above as percent of values in 1950
# Note: Round to zero decimal places
# Comment: What do you see?
# ---
def _percent_change(df, column, start_year=1950, end_year=2020):
    """Return the change in `column` between two years as a rounded percent.

    The change is expressed as a percent of the `start_year` value and rounded
    to zero decimal places. The first row matching each year in df['year'] is
    used.

    Raises:
        IndexError: if `df` has no row for `start_year` or `end_year`.
    """
    start_value = df[df['year'] == start_year][column].values[0]
    end_value = df[df['year'] == end_year][column].values[0]
    return round(100 * (end_value - start_value) / start_value)


# Percent change from 1950 to 2020, each relative to its 1950 value.
percent_change_population = _percent_change(popi, 'mid_year_population')
print(f'percent_change_population = {percent_change_population}%')

percent_change_birth = _percent_change(popi, 'crude_birth_rate__per_1000')
print(f'percent_change_birth = {percent_change_birth}%')

percent_change_death = _percent_change(popi, 'crude_death_rate__per_1000')
print(f'percent_change_death = {percent_change_death}%')

percent_change_infant_mortality = _percent_change(popi, 'infant_mortality__per_1000')
print(f'percent_change_infant_mortality = {percent_change_infant_mortality}%')

percent_change_life_expectancy = _percent_change(popi, 'life_expectancy')
print(f'percent_change_life_expectancy = {percent_change_life_expectancy}%')
# Output:
# percent_change_population = 291%
# percent_change_birth = -62%
# percent_change_death = -67%
# percent_change_infant_mortality = -85%
# percent_change_life_expectancy = 68%