import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Raise pandas' console display limits so long/wide frames print in full.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Suppress scientific notation when NumPy prints small floating-point values.
np.set_printoptions(suppress=True)


# Question # 1
# Note: The data for this question was downloaded from:
# https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population_density

# Step-1: Download the file: 'population.tsv' into the same folder as this notebook
# Step-2: Copy and paste the path to that file below within quotes to variable 'path'


# Step-3: Read the file using pd.read_csv() using the '\t' separator (tab)
# Folder that holds 'population.tsv'. Step-1 puts the file in the same folder
# as this notebook, so the current folder is the default; replace it with your
# own folder (keep the trailing slash) if the file lives elsewhere.
# 'path' must be assigned here: leaving it commented out makes the read below
# fail with NameError.
path = './'
pop = pd.read_csv(path + 'population.tsv', sep='\t')

# Step-4: Look at the first five records using .head()
pop.head()


# Drop column 'Unnamed: 1' 
# and look at first five records

# Discard the 'Unnamed: 1' column, then preview the first five rows.
pop = pop.drop(columns=['Unnamed: 1'])

pop.head()


# print values from the first row only

pop.iloc[0]  # first row selected by position; a bare expression is only displayed when run as a notebook cell

Country                Macau
population          6,86,607
area_sq_km                33
area_sqmi                 13
pop_den_per_km2       20,806
pop_den_per_sqmi      53,888
Name: 0, dtype: object


# Select the rows whose 'Country' value contains the text 'World'.
pop[pop['Country'].str.contains('World')]


# use 'pop' and print
#1: shape 
#2: data types in each column
#3: number of rows
#4: number of columns
#5: check for missing values in each columns. Hint: .isna().sum()
#6: total number of values or elements or items in the entire dataframe

# --- 

# Unpack the dimensions once; the row and column counts below reuse them.
n_rows, n_cols = pop.shape

#1: shape
print("shape:", (n_rows, n_cols))

#2: data type of each column
print('\ndata types:\n', pop.dtypes)

#3: number of rows
print("\nnumber of rows:", n_rows)

#4: number of columns
print("\nnumber of columns:", n_cols)

#5: count of missing values in each column
missing_per_column = pop.isna().sum()
print('\nmissing value count:\n', missing_per_column)

#6: total number of values (rows x columns)
print('total number of values:', pop.size)

shape: (250, 6)

data types:
 Country             object
population          object
area_sq_km          object
area_sqmi           object
pop_den_per_km2     object
pop_den_per_sqmi    object
dtype: object

number of rows: 250

number of columns: 6

missing value count:
 Country             0
population          0
area_sq_km          0
area_sqmi           0
pop_den_per_km2     0
pop_den_per_sqmi    0
dtype: int64
total number of values: 1500


# A week has seven weekdays "Monday" to "Sunday". Create:
#1 a list of weekdays 'wk_days'
#2 a dictionary 'd_wk' with key: 'wk_days'
#3 a Series 'wk_days'
#4 a DataFrame 'df_wk' with column 'wk_days'
#Print each output to get expected output

# --- 

#1: plain Python list holding the seven weekday names
wk_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
print("wk_days:\n", wk_days, '\n')

#2: dictionary with the single key 'wk_days'; list(...) stores an independent
#   copy so the dictionary does not alias the list above
d_wk = {'wk_days': list(wk_days)}
print("d_wk:\n", d_wk, '\n')

#3: Series built from the list (the task asks for it under the same name 'wk_days')
wk_days = pd.Series(wk_days)
print("wk_days:\n", wk_days, '\n')

#4: DataFrame 'df_wk' with the single column 'wk_days', as the task specifies.
#   The printed label is kept as "wk_days:" so the output matches the expected output.
df_wk = pd.DataFrame(d_wk)
print("wk_days:\n", df_wk, '\n')

wk_days:
 ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] 

d_wk:
 {'wk_days': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']} 

wk_days:
 0       Monday
1      Tuesday
2    Wednesday
3     Thursday
4       Friday
5     Saturday
6       Sunday
dtype: object 

wk_days:
      wk_days
0     Monday
1    Tuesday
2  Wednesday
3   Thursday
4     Friday
5   Saturday
6     Sunday


# Fix the seed so the random draws below are reproducible.
np.random.seed(0)

#1 Find and use a numpy function to create 5 random float values and save in 'x'
# Print:
#2 type of x
#3 sum of x
#4 mean of x
#5 value at index 2
#6 sum of value of index 2 and 4


# ---

#1 five random floats drawn by np.random.rand
x = np.random.rand(5)
print('x:', x)

#2 type of x
x_type = type(x)
print('Type:', x_type)

#3 sum of x
xsum = x.sum()
print('Sum:', xsum)

#4 mean of x
xmean = x.mean()
print('Mean:', xmean)

#5 value at index 2
val_2 = x[2]
print('Value at index 2:', val_2)

#6 sum of the values at index 2 and index 4
sum_2_4 = val_2 + x[4]
print("Sum of values at index 2 and 4:", sum_2_4)

x: [0.5488135  0.71518937 0.60276338 0.54488318 0.4236548 ]
Type: <class 'numpy.ndarray'>
Sum: 2.83530422870719
Mean: 0.567060845741438
Value at index 2: 0.6027633760716439
Sum of values at index 2 and 4: 1.0264181754105486


# Fix the seed so the random draws below are reproducible.
np.random.seed(0)

#1 Find and use a numpy function to create 5 random integer values between 0 to 10 and save in 'x'
# Print:
#2 type of x
#3 sum of x then multiply by 100
#4 mean of x then divide by standard deviation of x
#5 swap items from index 2 and index 0 and save in 'xnew' (i.e. move from 2 to 0 and 0 to 2)
#6 find absolute difference between x and xnew. Hint: abs()


# ---

#1 five random integers from np.random.randint (the upper bound 10 is exclusive)
x = np.random.randint(low=0, high=10, size=5)
print('x:', x)

#2 type of x
print('Type:', type(x))

#3 sum of x, scaled by 100
xsum = x.sum() * 100
print('Sum:', xsum)

#4 mean of x divided by its standard deviation
xmean = x.mean() / x.std()
print('Mean:', xmean)

#5 swap the items at index 2 and index 0 on a copy, leaving x unchanged
i_2 = x[2]
i_0 = x[0]
xnew = x.copy()
xnew[0] = i_2
xnew[2] = i_0
print('xnew:', xnew)

#6 element-wise absolute difference between x and xnew
x_diff = np.abs(x - xnew)
print("Absolute difference between x and xnew:", x_diff)

x: [5 0 3 3 7]
Type: <class 'numpy.ndarray'>
Sum: 1800
Mean: 1.5434872662825798
xnew: [3 0 5 3 7]
Absolute difference between x and xnew: [2 0 2 0 0]


# A tree has 20 large branches
# Each large branch has 10 small branches
# Each small branch has 50 stems or twigs
# Each twig has 10 leaves
# How many leaves does the tree have?

# Create your own variables

# --- 

# Count at each level of the tree, per item of the level above it.
large_branches = 20  # per tree
small_branches = 10  # per large branch
twigs = 50           # per small branch
leaves = 10          # per twig

# Multiply the counts together level by level.
total_leaves = 1
for count_per_parent in (large_branches, small_branches, twigs, leaves):
    total_leaves *= count_per_parent

print("Total leaves:", total_leaves)

Total leaves: 100000


# The column country has a hidden string as prefix '\xa0'
# This can happen during import. It is important to remove this
# Write code to remove it from column 'Country'


# --- write code below this line ---

# Remove every non-breaking space character ('\xa0') from the 'Country' values.
pop['Country'] = pop['Country'].str.replace('\xa0', '')


# Preprocess dataframe 'pop'
#1: Remove commas from numeric values
#2: convert numeric columns to float

#3:
# print('Before:',pop.shape)
# Then, remove all rows that have the word 'World' in the 'Country' column
# print('After:', pop.shape)
# Notice that the number of rows should be lesser!
#Hint: pop = pop[~pop['Country'].str.contains('World')]

# --- 

#1,2: for every column except 'Country', strip the thousands separators
#     and cast the cleaned text to float
numeric_cols = [col for col in pop.columns if col != 'Country']
for col in numeric_cols:
    pop[col] = pop[col].str.replace(',', '').astype('float')

#3: report the shape, drop the rows mentioning 'World', report the shape again
print('Before:',pop.shape)
is_world = pop['Country'].str.contains('World')
pop = pop[~is_world]
print('After:', pop.shape)

Before: (250, 6)
After: (248, 6)


# use 'pop'
#1: find highest population density per square km
#2: find country with highest population density

# --- 

#1: find highest population density per square km
#1: largest value in the density-per-square-km column
highest_pop = pop['pop_den_per_km2'].max()
print("Highest population density:", highest_pop)

#2: first country whose density equals that maximum
is_densest = pop['pop_den_per_km2'] == highest_pop
country = pop.loc[is_densest, 'Country'].values[0]
print("Country with highest population density:", country)

Highest population density: 20806.0
Country with highest population density: Macau


# use 'pop'
#1: find lowest population density per square km
#2: find country with lowest population density

# ---

#1: find lowest population density per square km
#1: smallest value in the density-per-square-km column
lowest_pop = pop['pop_den_per_km2'].min()
print("Lowest population density:", lowest_pop)

#2: first country whose density equals that minimum
is_sparsest = pop['pop_den_per_km2'] == lowest_pop
country = pop.loc[is_sparsest, 'Country'].values[0]
print("Country with lowest population density:", country)

Lowest population density: 0.03
Country with lowest population density: Greenland(Denmark)


#0: create a copy of 'pop' in 'df'
#1: calculate the population per square km and put it in a new column 'calc_pop_den_sqkm'
#2: calculate the difference in values between 'calc_pop_den_sqkm' and 'pop_den_per_km2' and put in column 'pop_den_diff'
#3: square the difference  from 2 and save in column 'squared_diff' Hint: .apply()
#4: calculate sum of the column 'squared_diff' and save in variable 'total_sum_of_squares'
# Print expected output and comment

# --- 

# 0:
# 0: work on a copy so 'pop' itself is left untouched
df = pop.copy()

# Keep only rows with a positive area, so the division in step 1 never
# divides by zero.
has_area = df['area_sq_km'] > 0.0
df = df[has_area]

# 1: density recomputed from the raw population and area columns
df['calc_pop_den_sqkm'] = df['population'] / df['area_sq_km']

# 2: recomputed density minus the density reported in the data
df['pop_den_diff'] = df['calc_pop_den_sqkm'] - df['pop_den_per_km2']

# 3: square of each difference
df['squared_diff'] = np.square(df['pop_den_diff'])

# 4: total of the squared differences
total_sum_of_squares = df['squared_diff'].sum()

print("total_sum_of_squares = ", total_sum_of_squares)

total_sum_of_squares =  20.681494417412495


# use pop
#1: save names of all countries into a list 'country'
#2: keep only those country names that start with the letter 'M'
#3: how many countries are these?
#4: from this list, print only countries with four letter words

# --- 

# 1:
# 1: every country name as a plain Python list
country = list(pop['Country'])

# 2: keep the names beginning with 'M'. startswith() also handles an empty
#    name, where indexing c[0] would raise IndexError.
country = [c for c in country if c.startswith('M')]
print('country =\n', country)

#3: how many names are left
len_country = len(country)
print('\nCountries start with "M" are', len_country)

#4: of those, the names that are exactly four characters long
four_letter_country = [c for c in country if len(c) == 4]
print('\nCountries 4 letter words', four_letter_country)

country =
 ['Macau', 'Monaco', 'Maldives', 'Malta', 'Mayotte(France)', 'Mauritius', 'Martinique(France)', 'Marshall Islands[note 3]', 'Micronesia[note 4]', 'Malawi', 'Moldova', 'Malaysia', 'Morocco', 'Myanmar', 'Mexico', 'Montserrat(BOT)', 'Montenegro', 'Madagascar', 'Mozambique', 'Mali', 'Mauritania', 'Mongolia']

Countries start with "M" are 22

Countries 4 letter words ['Mali']


# use pop
# using indexing find the population density per square km of country 'Malta'

# --- 

# Mask the row(s) named 'Malta', then read the density of the first match.
is_malta = pop['Country'] == 'Malta'
malta_pop_den = pop.loc[is_malta, 'pop_den_per_km2'].values[0]
print("Population density of Malta (per sq. km.):", malta_pop_den)

Population density of Malta (per sq. km.): 1667.0


# use pop
#1: get values from the column 'area_sq_km' and save in an array 'area'
#2: find mean and standard deviation for area
# Print output

# --- 

#1:
#1: the area column as a NumPy array
area = pop['area_sq_km'].to_numpy()

#2: mean and standard deviation of the array (NumPy's default std, ddof=0)
area_mean = area.mean()
area_std = area.std()

print("Mean area (sq.km.):", area_mean, "SD area:", area_std)

Mean area (sq.km.): 545636.9677419355 SD area: 1696921.636869445


# use pop
#0: create a copy of pop in df and remove all rows that have the 'World' in the 'Country' column
#1: find the country with largest population as 'pop_max'
#2: find the country with largest area in square miles as 'area_max'
#3: If they are the same print "Yes same!" else 'Not same!'

# --- 

#1:
#1: largest population, and the first country that has it
pop_max = pop['population'].max()
has_max_pop = pop['population'] == pop_max
pop_max_country = pop.loc[has_max_pop, 'Country'].values[0]
print("Country with highest population:", pop_max_country)

#2: largest area in square miles, and the first country that has it
area_max = pop['area_sqmi'].max()
has_max_area = pop['area_sqmi'] == area_max
area_max_country = pop.loc[has_max_area, 'Country'].values[0]
print("Country with highest area (sq.miles):", area_max_country)

print("pop_max = ", pop_max)
print("area_max =", area_max)

#3: are the two countries the same one?
verdict = "Yes same!" if pop_max_country == area_max_country else "Not same!"
print(verdict)

Country with highest population: China
Country with highest area (sq.miles): Russia[note 12]
pop_max =  1425893465.0
area_max = 6601633.0
Not same!


# use pop
#1: create df1 with columns 'Country' and 'population'. Sort by 'population' high to low
#1A: subset df1 with top 10 rows with highest values
#1C: create subplot bar plot with 'Country' on x-axis and 'population' on y-axis

#2: create df2 with columns 'Country' and 'pop_den_per_km2'. Sort by 'pop_den_per_km2' 
#2A: subset df2 with top 10 rows with highest values
#2B: create subplot bar plot with 'Country on x-axis' and 'pop_den_per_km2' on y-axis

# Hint:
# plt.barh()

# --- 

#1:
#1, 1A: ten rows with the highest population, largest first
df1 = pop[['Country', 'population']].sort_values(by='population', ascending=False).head(10)

#2, 2A: ten rows with the highest density per square km, largest first
df2 = pop[['Country', 'pop_den_per_km2']].sort_values(by='pop_den_per_km2', ascending=False).head(10)

#1C, 2B: one horizontal bar chart per ranking, side by side
fig, (ax_population, ax_density) = plt.subplots(1, 2, figsize=(10, 5))
ax_population.barh(df1['Country'], df1['population'])
ax_density.barh(df2['Country'], df2['pop_den_per_km2'])

plt.tight_layout()
plt.show()


# use pop
#1: Create an array 'numbers' with numeric columns from pop
# such that each column in the dataframe is a column in the array
# print its shape, size, type

#2: Slice the array to get first 5 rows and 3 columns and save in 'new_num' again
#2: Zero mean and scale values in each column that is:
# 2a- subtract each value in a column by its respective column mean
# 2b- then, divide the value from 2a by column standard deviation
# then print output array

#3- print min and max values for each column in 'numbers' and 'new_num'. 
# Comment on how values between these two arrays compare with each other.


# --- 

#1:
#1: every column after the first one, as a 2-D NumPy array
numbers = pop.iloc[:, 1:].to_numpy()

print("shape:\n", numbers.shape, "\n")
print("size:\n", numbers.size, "\n")
print("type:\n", type(numbers), "\n")

#2: first five rows and first three columns, then standardise each column:
#   subtract the column mean and divide by the column standard deviation
new_num = numbers[:5, :3]
print('new_num:\n', new_num)
col_mean = new_num.mean(axis=0)
print('col_mean:', col_mean)
col_std = new_num.std(axis=0)
print('col_std:', col_std)
new_num = (new_num - col_mean) / col_std
print('new_num:\n', new_num)

#3: per-column minimum and maximum of the raw and the standardised arrays
numbers_col_min = numbers.min(axis=0)
numbers_col_max = numbers.max(axis=0)

new_num_col_min = new_num.min(axis=0)
new_num_col_max = new_num.max(axis=0)

print('\nnumbers_col_min:', numbers_col_min)
print('numbers_col_max:', numbers_col_max)
print('new_num_col_min:', new_num_col_min)
print('new_num_col_max:', new_num_col_max)

shape:
 (248, 5) 

size:
 1240 

type:
 <class 'numpy.ndarray'> 

new_num:
 [[ 686607.      33.      13.]
 [  36686.       2.       1.]
 [5453600.     716.     276.]
 [7494578.    1104.     426.]
 [  32669.       6.       2.]]
col_mean: [2740828.      372.2     143.6]
col_std: [3124855.32557173     456.05631231     175.90974959]
new_num:
 [[-0.65738115 -0.7437678  -0.74242616]
 [-0.8653655  -0.81174186 -0.81064296]
 [ 0.86812723  0.75385427  0.75265868]
 [ 1.52127043  1.6046264   1.60536867]
 [-0.866651   -0.80297102 -0.80495823]]

numbers_col_min: [47.    0.    0.    0.03  0.07]
numbers_col_max: [1.42589346e+09 1.70982460e+07 6.60163300e+06 2.08060000e+04
 5.38880000e+04]
new_num_col_min: [-0.866651   -0.81174186 -0.81064296]
new_num_col_max: [1.52127043 1.6046264  1.60536867]


# use pop
#0: Create a list countries with values from column 'Country'. 
#0A: Filter it to keep only those country names that start with letters, A, B, C, and D
#0B: Sort the list in ascending order i.e. from A to D

#1: Create an empty dictionary 'd' with empty lists for keys 'A', 'B', 'C', 'D'
# append values for countries starting with respective alphabet of a key
# For example: key 'A' would have all countries with names ['Andorra', 'Anguilla(BOT)' ...]

# Hint:
# .to_list()

# --- 

#0: Create list
#0: country names that begin with A, B, C or D, sorted ascending.
#   startswith() accepts a tuple of prefixes and, unlike indexing name[0],
#   does not raise IndexError on an empty name.
countries = pop['Country'].to_list()
countries = sorted(name for name in countries if name.startswith(('A', 'B', 'C', 'D')))
print('countries:\n', countries)

#1: group the names under their first letter. Every name kept above starts
#   with one of the four keys, so the lookup d[country[0]] always succeeds.
d = {'A':[], 'B':[], 'C':[], 'D':[]}
for country in countries:
    d[country[0]].append(country)

print('\nd:\n',d)

countries:
 ['Afghanistan', 'Albania', 'Algeria', 'American Samoa(US)', 'Andorra', 'Angola', 'Anguilla(BOT)', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Artsakh[note 10]', 'Aruba(Netherlands)', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda(BOT)', 'Bhutan', 'Bolivia', 'Bonaire(Netherlands)', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'British Virgin Islands(BOT)', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands(BOT)', 'Central African Republic', 'Chad', 'Chile', 'China', 'Christmas Island(Australia)', 'Cocos (Keeling) Islands(Australia)', 'Colombia', 'Comoros', 'Congo', 'Cook Islands[note 7]', 'Costa Rica', 'Croatia', 'Cuba', 'Curaçao(Netherlands)', 'Cyprus[note 5]', 'Czech Republic', 'DR Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic']

d:
 {'A': ['Afghanistan', 'Albania', 'Algeria', 'American Samoa(US)', 'Andorra', 'Angola', 'Anguilla(BOT)', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Artsakh[note 10]', 'Aruba(Netherlands)', 'Australia', 'Austria', 'Azerbaijan'], 'B': ['Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda(BOT)', 'Bhutan', 'Bolivia', 'Bonaire(Netherlands)', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'British Virgin Islands(BOT)', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi'], 'C': ['Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands(BOT)', 'Central African Republic', 'Chad', 'Chile', 'China', 'Christmas Island(Australia)', 'Cocos (Keeling) Islands(Australia)', 'Colombia', 'Comoros', 'Congo', 'Cook Islands[note 7]', 'Costa Rica', 'Croatia', 'Cuba', 'Curaçao(Netherlands)', 'Cyprus[note 5]', 'Czech Republic'], 'D': ['DR Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic']}


# use pop
# create scatter plots as subplots for two items below
#1: population and area_sqmi
#2: np.log(population) and np.log(area_sqmi)

# Learn this!
# Taking a logarithm of values makes small values larger so that we can see it on a plot
# Do you see them in plot #2 ?

# --- write code below this line ---

plt.figure(figsize=(10,5))

# Left panel: raw values.
plt.subplot(1,2,1)
plt.scatter(pop['population'], pop['area_sqmi'])
plt.title('original')
plt.xlabel('population')
plt.ylabel('area_sqmi')

# Right panel: natural log of both axes. log(0) is -inf and raises
# "RuntimeWarning: divide by zero encountered in log", so only rows where
# both values are strictly positive are transformed; the excluded rows have
# no finite position on a log plot.
positive = pop[(pop['population'] > 0) & (pop['area_sqmi'] > 0)]
plt.subplot(1,2,2)
plt.scatter(np.log(positive['population']), np.log(positive['area_sqmi']))
plt.title('log-transformed')
plt.xlabel('log[population]')
plt.ylabel('log[area_sqmi]')

# tight_layout must be called; the bare attribute 'plt.tight_layout' does nothing.
plt.tight_layout()
plt.show()

/opt/anaconda3/lib/python3.8/site-packages/pandas/core/arraylike.py:397: RuntimeWarning: divide by zero encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)


#1:
# create a copy of 'pop' in 'df'
# subset df to keep only two columns 'Country' and 'population'
# create a new column 'class' in 'df' such that it has values:
# 'low' if population <= 1000000
# 'medium' if  1000000 < population <= 500000000
# 'high' if 500000000 < population <= 1000000000
# 'very_high' if population > 1000000000 


#2:
# use line of code below to create to group values
# dfa = df.groupby(['class'])['Country'].count().reset_index().rename(columns={'Country':'count'})
# Then use: sns.barplot(data=dfa, x="class", y="count") to create a bar plot
# What do you see? Comment!

#3:
# print countries with very_high population in df

# --- 

#1:
#1: independent two-column copy of 'pop'
df = pop[['Country', 'population']].copy()

# Label each row with the first band its population fits into. The bounds are
# inclusive, as the task specifies, so a population of exactly 1000000 is
# 'low'; a strict '<' would let it fall through every band into 'very_high'.
# Rows matching no band (population > 1000000000) get the default label.
population = df['population']
df['class'] = np.select(
    [population <= 1000000, population <= 500000000, population <= 1000000000],
    ['low', 'medium', 'high'],
    default='very_high',
)


#2: number of countries per class, drawn as a bar plot
dfa = df.groupby(['class'])['Country'].count().reset_index().rename(columns={'Country':'count'})
sns.barplot(data=dfa, x="class", y="count")
plt.show()


#3: rows labelled 'very_high'
very_high_countries = df[df['class']=='very_high']
very_high_countries


# use pop
# Create a scatter plot of 'pop_den_per_km2' vs. 'pop_den_per_sqmi'
# Do you see any trend? Comment why?

# --- 

# Density per square km against density per square mile, drawn on the current axes.
ax = plt.gca()
ax.scatter(pop['pop_den_per_km2'], pop['pop_den_per_sqmi'])
ax.set_xlabel('pop_den_per_km2')
ax.set_ylabel('pop_den_per_sqmi')
ax.set_title('Population density')
plt.show()


# Question - 24

geo = ['India', ['Maharashtra','Gujarat',['Kerala']], 'USA', [['Texas',['California']], 'Nebraska']]

# write code using indices to print
# 'India'
# 'Kerala'
# 'USA'
# 'California'

# --- write code below this line  ---

# Each name is reached by indexing one nesting level at a time.
india = geo[0]
kerala = geo[1][2][0]
usa = geo[2]
california = geo[3][0][1][0]

for item in (india, kerala, usa, california):
    print(item)

India
Kerala
USA
California


#A:
# Read file 'population_india.tsv' (tsv --> tab-separated values)
# using the method shown Q#1
# and save it into a dataframe 'popi'

#B:
#1: shape 
#2: data types in each column
#3: number of rows
#4: number of columns
#5: check for missing values in each columns. Hint: .isna().sum()
#6: total number of values

# ---

#A:
#A: folder holding the data file, then load the tab-separated table from it
path = '/Users/erv/Desktop/learndataa/Students/Python/Exams_Exercise/Exam_3/data/'
popi = pd.read_csv(path + 'population_india.tsv', sep='\t')

# Unpack the dimensions once; the row and column counts below reuse them.
popi_rows, popi_cols = popi.shape

#1: shape
print("shape:", (popi_rows, popi_cols))

#2: data type of each column
print('\ndata types:\n', popi.dtypes)

#3: number of rows
print("\nnumber of rows:", popi_rows)

#4: number of columns
print("\nnumber of columns:", popi_cols)

#5: count of missing values in each column
popi_missing = popi.isna().sum()
print('\nmissing value count:\n', popi_missing)

#6: total number of values (rows x columns)
print('total number of values:', popi.size)

shape: (73, 11)

data types:
 Year                             object
Mid-year population              object
Births per year                  object
Deaths per year                  object
Natural change per year          object
Crude birth rate\n(per 1000)    float64
Crude death rate\n(per 1000)    float64
Natural change\n(per 1000)      float64
Total Fertility rate            float64
Infant mortality (per 1000)     float64
Life expectancy                 float64
dtype: object

number of rows: 73

number of columns: 11

missing value count:
 Year                            0
Mid-year population             1
Births per year                 1
Deaths per year                 1
Natural change per year         1
Crude birth rate\n(per 1000)    1
Crude death rate\n(per 1000)    1
Natural change\n(per 1000)      1
Total Fertility rate            1
Infant mortality (per 1000)     1
Life expectancy                 1
dtype: int64
total number of values: 803


#1: remove all rows with missing values. 
#1a: Print dataframe shape before and after values are dropped.
#1b: Check again for missing values in each column. They should be zero. Hint: popi.isna().sum()
# Hint: popi = popi.dropna()

# --- write code below this line ---

# Report the shape, drop every row that has at least one missing value,
# then report the shape again.
shape_before = popi.shape
print('Before:', shape_before)
popi = popi.dropna(axis=0, how='any')
print('After:', popi.shape)

Before: (73, 11)
After: (72, 11)


# use 'popi' from Q#26
#1:
# Convert columns below to float:
# - 'Mid-year population'
# - 'Births per year'
# - 'Deaths per year'
# - 'Natural change per year'
# Hint: remove the commas before converting 

#2:
# Convert column 'Year' to datetime
#Hint: popi['Year'] = pd.to_datetime(popi.loc[:,'Year'])

#3:
# print data types for preprocessed dataframe popi

#4:
# output dataframe popi.head(3)

# ---  


#1:
# Convert columns below to float:
# - 'Mid-year population'
# replace commas
#1: the four count columns hold text with thousands separators; strip the
#   commas and cast each one to float
count_columns = [
    'Mid-year population',
    'Births per year',
    'Deaths per year',
    'Natural change per year',
]
for column in count_columns:
    without_commas = popi[column].str.replace(',','')
    popi[column] = without_commas.astype(float)


#2: parse 'Year' as a datetime and keep only its year component
popi['Year'] = pd.to_datetime(popi['Year']).dt.year

#3: data types of the preprocessed dataframe
print(popi.dtypes)

#4: first three rows
popi.head(3)

Year                              int64
Mid-year population             float64
Births per year                 float64
Deaths per year                 float64
Natural change per year         float64
Crude birth rate\n(per 1000)    float64
Crude death rate\n(per 1000)    float64
Natural change\n(per 1000)      float64
Total Fertility rate            float64
Infant mortality (per 1000)     float64
Life expectancy                 float64
dtype: object


# use 'popi' from Q#27
# format the column names as:
# 1- strip leading and trailing spaces 
#  - replace '-', '\n', '(', or spaces with '_' (underscore) and ')' with '' nothing
#  - convert text to lowercase
#  - print column names
# 2- rename popi with these cleaned column names
#  - output first three rows of popi


# --- 

# 1-
# 1- one-pass character table: '-', newline, '(' and space become '_';
#    ')' is deleted
char_table = str.maketrans({'-': '_', '\n': '_', '(': '_', ' ': '_', ')': None})

# Trim surrounding whitespace, apply the table, then lowercase each name.
cols = [col.strip().translate(char_table).lower() for col in popi.columns]
print('Formatted column names:\n', cols)

# 2- install the cleaned names on the dataframe and preview three rows
popi.columns = cols
popi.head(3)

Formatted column names:
 ['year', 'mid_year_population', 'births_per_year', 'deaths_per_year', 'natural_change_per_year', 'crude_birth_rate__per_1000', 'crude_death_rate__per_1000', 'natural_change__per_1000', 'total_fertility_rate', 'infant_mortality__per_1000', 'life_expectancy']


# use 'popi' from Q#28

# 1- line plot: (x-axis) 'year' vs. (y-axis) 'mid_year_population'
# 2- line plot: (x-axis) 'year' vs. (y-axis) 'births_per_year'
# 3- line plot: (x-axis) 'year' vs. (y-axis) 'deaths_per_year'
# 4- line plot: (x-axis) 'year' vs. (y-axis) 'natural_change_per_year'

# 5- Comment on what you observe in each of the above plots. 

# --- 

# 1- 
plt.figure(figsize=(15,8))

# (column to plot, y-axis label) for each cell of the 2x2 grid, in reading order
panels = [
    ('mid_year_population', 'Mid year population'),
    ('births_per_year', 'Births per year'),
    ('deaths_per_year', 'Deaths per year'),
    ('natural_change_per_year', 'Natural change year population'),
]

for position, (column, y_label) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.plot(popi['year'], popi[column], marker='x')
    plt.xlabel('Year')
    plt.ylabel(y_label)

plt.tight_layout()
plt.show()


# use 'popi' from Q#28
# 1- 
# create one overlay plot for:
# line plot: (x-axis) 'year' vs. (y-axis) 'deaths_per_year'
# line plot: (x-axis) 'year' vs. (y-axis) 'births_per_year'

# 2-
# draw a vertical line at Year 2000 and 2020
# Hint: plt.axvline(x=2000, ymin=0, ymax=3.0e7, color='red', linestyle='--')

# 3-
# Comment on what you observe in the plot

# --- 
# Births and deaths per year overlaid on a single set of axes.
plt.plot(popi['year'], popi['births_per_year'], marker='x', label='births')
plt.plot(popi['year'], popi['deaths_per_year'], marker='x', label='deaths')

# Vertical markers at the years 2000 and 2020. axvline's ymin/ymax are
# fractions of the axes height (0 = bottom, 1 = top), not data values, so
# passing 3.0e7 was a misuse; the defaults already span the full height.
plt.axvline(x=2000, color='red', linestyle='--')
plt.axvline(x=2020, color='green', linestyle='--')
plt.xlabel('Year')
plt.ylabel('Count per year')
plt.legend()
plt.show()


# use 'popi' from Q#28
# 0- 
#- create a copy of 'popi' and save in 'dfx'
#- drop columns 'total_fertility_rate', 'infant_mortality__per_1000', 'life_expectancy'
# Hint: dfx.loc[:,~dfx.columns.isin(['total_fertility_rate', ....])]
# note that the '~' sign negates the output of dfx.columns.isin()

# 1- Now use 'dfx' Calculate:
# - births per 1000 people in a year; save in a new column 'calc_birth_rate_per_1000'  
# - deaths per 1000 people in a year; save in a new column 'calc_deaths_rate_per_1000' 
# - natural change per 1000 people in a year; save in a new column 'calc_natural_change_per_1000' 
# - round all above calculated percentages in columns to just one decimal place
# Hint: Calculate percentages based on 'mid_year_population'

# 2-
# - calculate difference between
# - 'calc_birth_rate_per_1000' and 'crude_birth_rate__per_1000' --> save in col 'diff_birth_rate'
# - 'calc_death_rate_per_1000' and 'crude_death_rate__per_1000' --> save in col 'diff_death_rate'
# - 'calc_natural_change_per_1000' and 'natural_change__per_1000' --> save in col 'diff_natural_change'


# 3-
# calculate the sum of columns 
# - 'diff_birth_rate' as 'sum_diff_birth_rate'
# - 'diff_death_rate' as 'sum_diff_death_rate'
# - 'diff_natural_change' as 'sum_diff_natural_change'
# - round the sum to zero decimal places
# print output

# 4- 
# - Comment on the sum calculated in #3. What do you observe

# --- 

# 0- 
# 0- copy 'popi' and keep every column except the three listed here
unused_columns = ['total_fertility_rate', 'infant_mortality__per_1000', 'life_expectancy']
dfx = popi.copy()
dfx = dfx.loc[:, ~dfx.columns.isin(unused_columns)]

# 1- yearly counts re-expressed per 1000 people of the mid-year population,
#    rounded to one decimal place
mid_year_pop = dfx['mid_year_population']
dfx['calc_birth_rate_per_1000'] = (dfx['births_per_year'] * 1000 / mid_year_pop).round(1)
dfx['calc_death_rate_per_1000'] = (dfx['deaths_per_year'] * 1000 / mid_year_pop).round(1)
dfx['calc_natural_change_per_1000'] = (dfx['natural_change_per_year'] * 1000 / mid_year_pop).round(1)

# 2- calculated rate minus the rate reported in the data:
# - 'calc_birth_rate_per_1000' and 'crude_birth_rate__per_1000' --> 'diff_birth_rate'
# - 'calc_death_rate_per_1000' and 'crude_death_rate__per_1000' --> 'diff_death_rate'
# - 'calc_natural_change_per_1000' and 'natural_change__per_1000' --> 'diff_natural_change'
dfx['diff_birth_rate'] = dfx['calc_birth_rate_per_1000'].sub(dfx['crude_birth_rate__per_1000'])
dfx['diff_death_rate'] = dfx['calc_death_rate_per_1000'].sub(dfx['crude_death_rate__per_1000'])
dfx['diff_natural_change'] = dfx['calc_natural_change_per_1000'].sub(dfx['natural_change__per_1000'])

# 3- column totals of the differences, rounded to zero decimal places
sum_diff_birth_rate = round(dfx['diff_birth_rate'].sum())
sum_diff_death_rate = round(dfx['diff_death_rate'].sum())
sum_diff_natural_change = round(dfx['diff_natural_change'].sum())

print("sum_diff_birth_rate:", sum_diff_birth_rate)
print("sum_diff_death_rate:", sum_diff_death_rate)
print("sum_diff_natural_change:", sum_diff_natural_change)

sum_diff_birth_rate: 0
sum_diff_death_rate: 0
sum_diff_natural_change: 0


# use 'popi' from Q#28
# 1- 
# - create a copy of 'popi' and save in 'dfy'

# 2-
# - Comment if it is it possible to calculate the values in the columns 
# 'natural_change_per_year' and 'natural_change__per_1000'
# based on other columns in the dataframe
#
# - If no, why not?

# 3- If yes, calculate 
# - values in 'natural_change_per_year' and save in column 'new_natural_change_per_year'
# - values in 'natural_change__per_1000' and save in column 'new_natural_change__per_1000'

# 4-
# - calculate difference between
# - 'new_natural_change_per_year' and 'natural_change_per_year' --> save in col 'diff_change'
# - 'new_natural_change__per_1000' and 'natural_change__per_1000' --> save in col 'diff_change_rate'


# 5-
# calculate the sum of columns 
# - 'diff_change' as 'sum_diff_change'
# - 'diff_change_rate' as 'sum_diff_change_rate'
# - round the sum to zero decimal places
# print output

# 6- 
# - Comment on the sums calculated in #5. What do you observe?

# --- 

# 1- 
# 1- work on a copy of 'popi'
dfy = popi.copy()

# 2-
# Comment: Yes it is possible

# 3- natural change recomputed as births minus deaths, once from the yearly
#    counts and once from the per-1000 rates
dfy['new_natural_change_per_year'] = dfy['births_per_year'].sub(dfy['deaths_per_year'])
dfy['new_natural_change__per_1000'] = dfy['crude_birth_rate__per_1000'].sub(dfy['crude_death_rate__per_1000'])

# 4- recomputed value minus the value reported in the data:
# - 'new_natural_change_per_year' and 'natural_change_per_year' --> 'diff_change'
# - 'new_natural_change__per_1000' and 'natural_change__per_1000' --> 'diff_change_rate'
dfy['diff_change'] = dfy['new_natural_change_per_year'].sub(dfy['natural_change_per_year'])
dfy['diff_change_rate'] = dfy['new_natural_change__per_1000'].sub(dfy['natural_change__per_1000'])

# 5- column totals of the differences, rounded to zero decimal places
sum_diff_change = round(dfy['diff_change'].sum())
sum_diff_change_rate = round(dfy['diff_change_rate'].sum())

print("sum_diff_change =", sum_diff_change)
print("sum_diff_change_rate =", sum_diff_change_rate)

sum_diff_change = 3000
sum_diff_change_rate = 0


# use 'popi' from Q#28
# Note: Create plots to visualize data for interpretation
 
# 1- 
# - line plot: (x-axis) 'year' vs. (y-axis) 'total_fertility_rate'
# - make sure that the y-axis starts at zero
# - What do you observe in the plot? Comment
# - Is the fertility rate increasing or decreasing or is the same over the years?


# 2- create
# - line plot: (x-axis) 'year' vs. (y-axis) 'total_fertility_rate'
# - line plot: (x-axis) 'year' vs. (y-axis) 'crude_birth_rate__per_1000'
# - line plot: (x-axis) 'year' vs. (y-axis) 'crude_death_rate__per_1000'
# - line plot: (x-axis) 'year' vs. (y-axis) 'life_expectancy'
# Comment: What do you observe?


# 3A- create
# create a dataframe df_temp with columns: 'variable', 'mean_rate', 'sd_rate'
# variable --> has names ['total_fertility_rate','crude_birth_rate__per_1000', 'crude_death_rate__per_1000']
# mean_rate --> has mean values for each
# sd_rate --> has standard deviation (SD) values for each

# 3B-
# Then, rename values in column 'variable' as below:
# 'total_fertility_rate' --> 'fertility'
# 'crude_birth_rate__per_1000' --> 'birth'
# 'crude_death_rate__per_1000' --> 'death'
# Hint: nested np.where()

# 3C-
# Then use df_temp to create a bar plot: 
# (x-axis): 'total_fertility_rate', 'crude_birth_rate__per_1000', 'crude_death_rate__per_1000'
# (y-axis): mean
# (error bars): standard deviation
# Hint-1: 
# mean --> popi['total_fertility_rate'].mean()
# standard deviation --> popi['total_fertility_rate'].std()
#
# Hint-2: plt.errorbar(x=df_temp['variable'], y=df_temp['mean_rate'], 
#             yerr=df_temp['sd_rate'], 
#             linestyle='', capsize=5, ecolor='red')
#
# Comment: What do you observe?


# --- 

# 1-
# Total fertility rate by year, with the y-axis fixed to run from 0 to 7.
ax = plt.gca()
ax.plot(popi['year'], popi['total_fertility_rate'], marker='x')
ax.set_ylim(0, 7)
ax.set_xlabel('Year')
ax.set_ylabel('Total fertility rate')
plt.show()

# 2-
# (column to plot, legend label) for each line of the overlay, in draw order
series_to_plot = [
    ('total_fertility_rate', 'fertility'),
    ('crude_birth_rate__per_1000', 'birth'),
    ('crude_death_rate__per_1000', 'death'),
    ('life_expectancy', 'life_exp'),
]
for column, legend_label in series_to_plot:
    plt.plot(popi['year'], popi[column], marker='x', label=legend_label)

plt.legend()
plt.title('Population')
plt.xlabel('Year')
plt.ylabel('Rate')
plt.show()


# 3-
# Columns summarised below, in the order they appear in df_temp
rate_columns = [
    'total_fertility_rate',
    'crude_birth_rate__per_1000',
    'crude_death_rate__per_1000',
]

# Mean and standard deviation of each rate across all rows of popi
mean_fr, mean_br, mean_dr = [popi[col].mean() for col in rate_columns]
sd_fr, sd_br, sd_dr = [popi[col].std() for col in rate_columns]

# One row per rate: original column name, its mean and its SD
df_temp = pd.DataFrame({
    'variable': rate_columns,
    'mean_rate': [mean_fr, mean_br, mean_dr],
    'sd_rate': [sd_fr, sd_br, sd_dr],
})

# Shorten the names in 'variable' with a nested np.where();
# anything that is neither fertility nor birth is labelled 'death'.
is_fertility = df_temp['variable'] == 'total_fertility_rate'
is_birth = df_temp['variable'] == 'crude_birth_rate__per_1000'
df_temp['variable'] = np.where(
    is_fertility, 'fertility',
    np.where(is_birth, 'birth', 'death'),
)

# Bar height = mean; red error bars = +/- one standard deviation
plt.bar(df_temp['variable'], df_temp['mean_rate'])
plt.errorbar(
    x=df_temp['variable'],
    y=df_temp['mean_rate'],
    yerr=df_temp['sd_rate'],
    linestyle='',
    capsize=5,
    ecolor='red',
)
plt.ylabel('Rate')
plt.show()


# use 'popi' from Q#28
# Note: Create plots to visualize data for interpretation
 
# 1- 
# Create three scatter subplots:
# - (x-axis) 'total_fertility_rate' vs. (y-axis) 'total_fertility_rate'
# - (x-axis) 'total_fertility_rate' vs. (y-axis) 'crude_birth_rate__per_1000'
# - (x-axis) 'total_fertility_rate' vs. (y-axis) 'crude_death_rate__per_1000'
# - Make sure that both x-axis and y-axis start at zero --> plt.xlim(); plt.ylim()
# Hint: plt.scatter(popi['total_fertility_rate'], popi['total_fertility_rate'], alpha=0.5)

# Comment: What does each plot tell us? For example, as fertility rate increases, 
# what happens to values on y-axis? Do they increase or decrease?


# --- write code below this line ---

# One row of three scatter panels, each with total fertility rate on the x-axis.
# Every entry is (y column in popi, y-axis label, upper y-axis limit).
panels = [
    ('total_fertility_rate', 'Fertility rate', 7),
    ('crude_birth_rate__per_1000', 'Birth rate', 45),
    ('crude_death_rate__per_1000', 'Death rate', 24),
]

plt.figure(figsize=(15, 5))
for position, (y_column, y_label, y_max) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.scatter(popi['total_fertility_rate'], popi[y_column], alpha=0.5)
    plt.xlabel('Fertility rate')
    plt.ylabel(y_label)
    # Both axes start at zero
    plt.xlim(0, 7)
    plt.ylim(0, y_max)

plt.show()


# use 'popi' from Q#28
# Note: Create plots to visualize data for interpretation
 
# 1- 
# Create an overlay plot:
# - (x-axis) 'year' vs. (y-axis) 'infant_mortality__per_1000'
# - (x-axis) 'year' vs. (y-axis) 'crude_death_rate__per_1000'

# Comment: What do you observe?

# --- 

# Overlay infant mortality and crude death rate against year. Both columns
# are per-1000 rates, so they share one y-axis. Each line gets a label and
# the plot gets a legend and axis titles so the two series can be told apart.
plt.plot(popi['year'], popi['infant_mortality__per_1000'], marker='+', label='infant mortality')
plt.plot(popi['year'], popi['crude_death_rate__per_1000'], marker='.', label='death')
plt.legend()
plt.xlabel('Year')
plt.ylabel('Rate (per 1000)')
plt.show()


# use 'popi' from Q#28
# Note: Create plots to visualize data for interpretation
 
# Calculate percentage increase or decrease in:
# 1- 'mid_year_population' from 1950 to 2020 as 'percent_change_population'
# 2- 'crude_birth_rate__per_1000' from 1950 to 2020 as 'percent_change_birth'
# 3- 'crude_death_rate__per_1000' from 1950 to 2020 as 'percent_change_death'
# 4- 'infant_mortality__per_1000' from 1950 to 2020 as 'percent_change_infant_mortality'
# 5- 'life_expectancy' from 1950 to 2020 as 'percent_change_life_expectancy'
# Note: All above as percent of values in 1950
# Note: Round to zero decimal places

# Comment: What do you see?

# --- 


def _percent_change(df, column, start_year=1950, end_year=2020):
    """Return the percent change in `column` from `start_year` to `end_year`.

    The change is expressed as a percentage of the `start_year` value and
    rounded to zero decimal places. The first row matching each year is used;
    an IndexError is raised if either year has no row in `df`.
    """
    start_value = df.loc[df['year'] == start_year, column].values[0]
    end_value = df.loc[df['year'] == end_year, column].values[0]
    return round(100 * (end_value - start_value) / start_value)


# Percent change from 1950 to 2020 for each indicator, relative to 1950
percent_change_population = _percent_change(popi, 'mid_year_population')
print('percent_change_population = {}%'.format(percent_change_population))

percent_change_birth = _percent_change(popi, 'crude_birth_rate__per_1000')
print('percent_change_birth = {}%'.format(percent_change_birth))

percent_change_death = _percent_change(popi, 'crude_death_rate__per_1000')
print('percent_change_death = {}%'.format(percent_change_death))

percent_change_infant_mortality = _percent_change(popi, 'infant_mortality__per_1000')
print('percent_change_infant_mortality = {}%'.format(percent_change_infant_mortality))

percent_change_life_expectancy = _percent_change(popi, 'life_expectancy')
print('percent_change_life_expectancy = {}%'.format(percent_change_life_expectancy))

percent_change_population = 291%
percent_change_birth = -62%
percent_change_death = -67%
percent_change_infant_mortality = -85%
percent_change_life_expectancy = 68%

	Country	population	area_sq_km	area_sqmi	pop_den_per_km2	pop_den_per_sqmi
156	World (excluding Antarctica)	7,99,96,50,000	13,47,40,000	5,20,23,114	59	154
161	World (all land)	7,99,96,50,000	14,89,40,000	5,75,05,734	54	139

	Year	Mid-year population	Births per year	Deaths per year	Natural change per year	Crude birth rate\n(per 1000)	Crude death rate\n(per 1000)	Natural change\n(per 1000)	Total Fertility rate	Infant mortality (per 1000)	Life expectancy
0	1950	357021000.0	15651000.0	7942000.0	7709000.0	43.8	22.2	21.6	5.73	181.2	41.7
1	1951	364922000.0	16042000.0	8171000.0	7871000.0	44.0	22.4	21.6	5.77	180.1	41.7
2	1952	372997000.0	16458000.0	8293000.0	8165000.0	44.1	22.2	21.9	5.82	177.5	42.0

	year	mid_year_population	births_per_year	deaths_per_year	natural_change_per_year	crude_birth_rate__per_1000	crude_death_rate__per_1000	natural_change__per_1000	total_fertility_rate	infant_mortality__per_1000	life_expectancy
0	1950	357021000.0	15651000.0	7942000.0	7709000.0	43.8	22.2	21.6	5.73	181.2	41.7
1	1951	364922000.0	16042000.0	8171000.0	7871000.0	44.0	22.4	21.6	5.77	180.1	41.7
2	1952	372997000.0	16458000.0	8293000.0	8165000.0	44.1	22.2	21.9	5.82	177.5	42.0

Country and Dependencies by Population Density: 2022-2023¶

Import libraries¶

Read data¶

Preprocessing¶

Data Exploration¶

Read data¶

Data analysis¶

	Country	Unnamed: 1	population	area_sq_km	area_sqmi	pop_den_per_km2	pop_den_per_sqmi
0	Macau	Macau	6,86,607	33	13	20,806	53,888
1	Monaco	Monaco	36,686	2	1	18,343	47,508
2	Singapore	Singapore	54,53,600	716	276	7,617	19,727
3	Hong Kong	Hong Kong	74,94,578	1,104	426	6,789	17,582
4	Gibraltar (BOT)	Gibraltar (BOT)	32,669	6	2	5,445	14,102