import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
np.set_printoptions(suppress=True)
#path = '-add your path here-'
path = '/Users/erv/Desktop/learndataa/Students/Python/Exams_Exercise/Exam_3/data/'
# Step-3: Read the file using pd.read_csv() using the '\t' separator (tab)
sea = pd.read_csv(path + 'Black Sea Grain Initiative Voyages - Data.csv', sep=',')
# Step-4: Look at the first five records using .head()
print(sea.shape)
sea.head(2)
(697, 15)
| Status | Outbound Sequence | Vessel name | IMO | Departure port | Country | Commodity | Tonnage | Departure | Inspection İstanbul | Income group | Flag | World Bank region | UN region | Development category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Outbound | 1 | RAZONI | 9086526 | Odesa | Türkiye | Corn | 1,527 | 31-Jul-22 | 02-Aug-22 | upper-middle-income | Sierra Leone | Europe & Central Asia | Asia-Pacific | developing |
| 1 | Outbound + | 1 | RAZONI | 9086526 | Odesa | Egypt | Corn | 25,000 | 31-Jul-22 | 02-Aug-22 | lower-middle income | Sierra Leone | Middle East & North Africa | Africa | developing |
# use dataframe 'sea' from Q#38
# 0- Print the number of rows and columns in the dataframe
# 1- Print type of data in each column
# 2- Print the number of missing values in each column
# 3- Check to see if any column has the same value in all rows. Drop column if it has the same value in all the rows
# Hint: .unique()
# Hint: use for loop
# 4- Print shape of sea
# ---
# 0- Print the number of rows and columns in the dataframe
print("Number of rows:", sea.shape[0])
print('Number of columns:', sea.shape[1],'\n')
# 1- Print type of data in each column
print('Data types:\n',sea.dtypes, '\n')
# 2- Print the number of missing values in each column
print('Missing values:\n', sea.isna().sum())
# 3- Check to see if any column has the same value in all rows. Hint: .unique()
for col in sea.columns:
num = sea[col].nunique()
if (num == 1):
sea = sea.drop(col, axis=1)
print(col + ' was dropped!')
# 4- Print shape of sea
print('\nShape:\n', sea.shape)
Number of rows: 697 Number of columns: 15 Data types: Status object Outbound Sequence int64 Vessel name object IMO int64 Departure port object Country object Commodity object Tonnage object Departure object Inspection İstanbul object Income group object Flag object World Bank region object UN region object Development category object dtype: object Missing values: Status 0 Outbound Sequence 0 Vessel name 0 IMO 0 Departure port 0 Country 0 Commodity 0 Tonnage 0 Departure 0 Inspection İstanbul 41 Income group 0 Flag 1 World Bank region 0 UN region 0 Development category 0 dtype: int64 Shape: (697, 15)
# 1-
col_names = sea.columns
col_names = [ col.strip().replace(' ', '_').lower() for col in col_names]
sea.columns = col_names
print("Columns:\n", sea.columns.values)
# 2-
sea['departure'] = pd.to_datetime(sea['departure'])
sea['inspection_i̇stanbul'] = pd.to_datetime(sea['inspection_i̇stanbul'])
# 3-
sea['tonnage'] = sea['tonnage'].str.replace(',','').astype(float)
# 4-
sea['dep_year'] = sea['departure'].dt.year
sea['dep_month'] = sea['departure'].dt.month
sea['dep_day'] = sea['departure'].dt.day
sea['dep_dow'] = sea['departure'].dt.day_of_week
# 5-
sea['insp_year'] = sea['inspection_i̇stanbul'].dt.year
sea['insp_month'] = sea['inspection_i̇stanbul'].dt.month
sea['insp_day'] = sea['inspection_i̇stanbul'].dt.day
sea['insp_dow'] = sea['inspection_i̇stanbul'].dt.day_of_week
# 6-
print("\nData types:\n", sea.dtypes)
print('\nShape: ', sea.shape)
sea.head(3)
Columns: ['status' 'outbound_sequence' 'vessel_name' 'imo' 'departure_port' 'country' 'commodity' 'tonnage' 'departure' 'inspection_i̇stanbul' 'income_group' 'flag' 'world_bank_region' 'un_region' 'development_category'] Data types: status object outbound_sequence int64 vessel_name object imo int64 departure_port object country object commodity object tonnage float64 departure datetime64[ns] inspection_i̇stanbul datetime64[ns] income_group object flag object world_bank_region object un_region object development_category object dep_year int64 dep_month int64 dep_day int64 dep_dow int64 insp_year float64 insp_month float64 insp_day float64 insp_dow float64 dtype: object Shape: (697, 23)
| status | outbound_sequence | vessel_name | imo | departure_port | country | commodity | tonnage | departure | inspection_i̇stanbul | income_group | flag | world_bank_region | un_region | development_category | dep_year | dep_month | dep_day | dep_dow | insp_year | insp_month | insp_day | insp_dow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Outbound | 1 | RAZONI | 9086526 | Odesa | Türkiye | Corn | 1527.0 | 2022-07-31 | 2022-08-02 | upper-middle-income | Sierra Leone | Europe & Central Asia | Asia-Pacific | developing | 2022 | 7 | 31 | 6 | 2022.0 | 8.0 | 2.0 | 1.0 |
| 1 | Outbound + | 1 | RAZONI | 9086526 | Odesa | Egypt | Corn | 25000.0 | 2022-07-31 | 2022-08-02 | lower-middle income | Sierra Leone | Middle East & North Africa | Africa | developing | 2022 | 7 | 31 | 6 | 2022.0 | 8.0 | 2.0 | 1.0 |
| 2 | Outbound | 2 | NAVI STAR | 9590979 | Odesa | Ireland | Corn | 33000.0 | 2022-08-04 | 2022-08-05 | high-income | Panama | Europe & Central Asia | Western Europe and Others | developed | 2022 | 8 | 4 | 3 | 2022.0 | 8.0 | 5.0 | 4.0 |
# use 'sea' from Q#40
# Objective: Now that the dataset is clean and ready for analysis
# we can start looking into each column
# and see what we can find !!! Exciting !!!
# 1-
# How many typs of 'status' are there for ships leaving the port and coming into the port?
# 2-
# How many unique vessel names?
# 3-
# What are the names of departure ports?
# 4-
# How many unique countries?
# 5-
# What are the different types of commodities that are transported by the ships?
# 6-
# What are different types of income groups?
# 7-
# How many different flags are there? Is it one or more flags per country?
# 8-
# What is the minimum and maximum tonnage for the ships?
# ---
# 1-
# How many typs of 'status' are there for ships leaving the port and coming into the port?
x = sea['status'].unique()
xnum = sea['status'].nunique()
print("status:\n", x)
print("num:\n", xnum)
# 2-
# How many unique vessel names?
x = sea['vessel_name'].nunique()
print("\nvessel nam:\n", x)
# 3-
# What are the names of departure ports?
x = sea['departure_port'].unique()
print("\ndeparture_port:\n", x)
# 4-
# How many unique countries?
x = sea['country'].nunique()
print("\ncountry:\n", x)
# 5-
# What are the different types of commodities that are transported by the ships?
x = sea['commodity'].unique()
print("\ncommodity:\n", x)
# 6-
# What are different types of income groups?
x = sea['income_group'].unique()
print("\nincome_group:\n", x)
# 7-
# How many different flags are there? Is it one or more flags per country?
x = sea['flag'].nunique()
print("\nflag:", x)
print("There are fewer flags than countries.")
# 8-
# What is the minimum and maximum tonnage for the ships?
xmin = sea['tonnage'].min()
xmax = sea['tonnage'].max()
print("\nTonnage (min):", xmin, "\nTonnage (max):", xmax)
status: ['Outbound' 'Outbound +'] num: 2 vessel nam: 481 departure_port: ['Odesa' 'Chornomorsk' 'Yuzhny/Pivdennyi'] country: 42 commodity: ['Corn' 'Sunflower meal' 'Sunflower oil' 'Soya beans' 'Wheat' 'Sunflower seed' 'Sugar beet pellets' 'Rapeseed' 'Peas' 'Barley' 'Wheat bran pellets' 'Canola' 'Rapeseed meal' 'Mixed' 'Sunflower pellets' 'Soya oil' 'Vegetable oil'] income_group: ['upper-middle-income' 'lower-middle income' 'high-income' 'low-income'] flag: 36 There are fewer flags than countries. Tonnage (min): 68.0 Tonnage (max): 74500.0
# 1-
# Create a histogram plot of tonnage. Comment
# 2-
# Create a timeseries plot of departure (on x-axis) vs. tonnage (on y-axis). Comment
# ---
# 1-
plt.hist(sea['tonnage'])
plt.title('Ship tonnage transported')
plt.xlabel('Tons')
plt.ylabel('Count')
plt.show()
# 2-
plt.plot(sea['departure'], sea['tonnage'])
plt.title('Ship tonnage transported')
plt.xlabel('Tons')
plt.ylabel('Count')
plt.show()
# 0-
# create new dataframe 'df' that is a subset 'sea'
# such that it has only those countrys that have more than 50 rows of data
# Hint: use groupby to get the count of rows per country. Then use only these countries to get the subset.
# temp = sea.groupby(['country'])['status'].count().reset_index().rename(columns={'status':'count'})
# temp = temp[temp['count']>50]
# subset_country_list = temp['country'].tolist()
# print shape of df
# 1-
# Create a histogram plot of tonnage by country. Comment
# sns.histplot(data=df, x="tonnage", hue="country", element="step", fill=True)
# 2-
# Create a timeseries plot of departure (on x-axis) vs. tonnage (on y-axis) by country. Comment
# Hint: sns.lineplot(data=df, x='departure', y='tonnage', hue='country')
# ---
# 0-
temp = sea.groupby(['country'])['status'].count().reset_index().rename(columns={'status':'count'})
temp = temp[temp['count']>50]
subset_country_list = temp['country'].tolist()
print('subset_country_list:', subset_country_list)
df = sea[sea['country'].isin(subset_country_list)]
print('Shape:', df.shape)
# 1-
sns.histplot(data=df, x="tonnage", hue="country", element="step", fill=True)
plt.show()
# 2-
sns.lineplot(data=df, x='departure', y='tonnage', hue='country')
plt.show()
subset_country_list: ['China', 'Italy', 'Spain', 'Türkiye'] Shape: (415, 23)
# 1-
# Groupby on 'status' to find total 'tonnage' for each category
# Step-1: peroform groupby
# Step-2: reset index
# Step-3: rename columns
# Step-4: show ouput
### Option -1
# Step-1: peroform groupby sum
df = sea.groupby(['status'])['tonnage'].sum()
# Step-2: reset index
df = df.reset_index()
# Step-3: rename columns
df = df.rename(columns = {'tonnage':'tonnage_sum'})
### Option -3
# Step-1 to 3: peroform groupby sum, reset index and rename in same line
#df = sea.groupby(['status'])['tonnage'].sum().reset_index().rename(columns = {'tonnage':'tonnage_sum'})
# Step-4: show ouput
print(df)
# Using steps above calculate mean tonnage instead of sum
# --- write code below this line ---
df = sea.groupby(['status'])['tonnage'].mean().reset_index().rename(columns = {'tonnage':'tonnage_mean'})
df
status tonnage_sum 0 Outbound 15263021.0 1 Outbound + 1071644.0
| status | tonnage_mean | |
|---|---|---|
| 0 | Outbound | 24737.473258 |
| 1 | Outbound + | 13395.550000 |
# 1- calculate the total tonnage for each vessel name and save in dataframe 'df'.
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row). Then, subset top 25 rows
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment
# ---
# 1-
df = sea.groupby(['vessel_name'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})
# 2-
df = df.sort_values(by='tonnage_sum', ascending=False)
df = df.head(25)
# 3-
plt.figure(figsize=(10,5))
plt.barh(df['vessel_name'], df['tonnage_sum'])
plt.title('Tonnage by vessel name')
plt.xlabel('Total tonnage')
plt.ylabel('Vessel name')
plt.tight_layout()
plt.show()
# 1- calculate the total tonnage for each departure_port and save in dataframe 'df'.
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row).
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment
# ---
# 1-
df = sea.groupby(['departure_port'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})
# 2-
df = df.sort_values(by='tonnage_sum', ascending=True)
# 3-
plt.figure(figsize=(5,3))
plt.barh(df['departure_port'], df['tonnage_sum'])
plt.title('Tonnage by departure port')
plt.xlabel('Total tonnage')
plt.ylabel('Departure port')
plt.tight_layout()
plt.show()
# 1- calculate the total tonnage for each country and save in dataframe 'df'.
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row). Then, subset top 25 rows
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment
# ---
# 1-
df = sea.groupby(['country'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})
# 2-
df = df.sort_values(by='tonnage_sum', ascending=False)
df = df.head(25)
# 3-
plt.figure(figsize=(10,5))
plt.barh(df['country'], df['tonnage_sum'])
plt.title('Tonnage by country')
plt.xlabel('Total tonnage')
plt.ylabel('Country')
plt.tight_layout()
plt.show()
# 1- calculate the total tonnage for each commodity and save in dataframe 'df'.
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row).
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment
# ---
# 1-
df = sea.groupby(['commodity'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})
# 2-
df = df.sort_values(by='tonnage_sum', ascending=False)
# 3-
plt.figure(figsize=(10,5))
plt.barh(df['commodity'], df['tonnage_sum'])
plt.title('Tonnage by commodity')
plt.xlabel('Total tonnage')
plt.ylabel('Commodity')
plt.tight_layout()
plt.show()
# 1- calculate the total tonnage for each commodity and save in dataframe 'df'.
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row).
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment
# ---
# 1-
df = sea.groupby(['income_group'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})
# 2-
df = df.sort_values(by='tonnage_sum', ascending=False)
# 3-
plt.figure(figsize=(10,5))
plt.barh(df['income_group'], df['tonnage_sum'])
plt.title('Tonnage by income group')
plt.xlabel('Total tonnage')
plt.ylabel('Income group')
plt.tight_layout()
plt.show()
# 1- calculate the total tonnage for each flag and save in dataframe 'df'.
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row). Then, subset top 25 rows
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment
# --- write code below this line ---
# 1-
df = sea.groupby(['flag'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})
# 2-
df = df.sort_values(by='tonnage_sum', ascending=False)
df = df.head(25)
# 3-
plt.figure(figsize=(10,5))
plt.barh(df['flag'], df['tonnage_sum'])
plt.title('Tonnage by flag')
plt.xlabel('Total tonnage')
plt.ylabel('Flag')
plt.tight_layout()
plt.show()
# 1- calculate the total tonnage for each 'dep_month' and save in dataframe 'df'.
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row).
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment
# ---
# 1-
df = sea.groupby(['dep_month'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})
# 2-
df['dep_month'] = df['dep_month'].astype(str)
df = df.sort_values(by='tonnage_sum', ascending=False)
# 3-
plt.figure(figsize=(10,5))
plt.barh(df['dep_month'], df['tonnage_sum'])
plt.title('Tonnage by month')
plt.xlabel('Total tonnage')
plt.ylabel('Departure month')
plt.tight_layout()
plt.show()
# 1- calculate the total tonnage for each 'dep_day' and save in dataframe 'df'.
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row).
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment
# ---
# 1-
df = sea.groupby(['dep_day'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})
# 2-
df['dep_day'] = df['dep_day'].astype(str)
df = df.sort_values(by='tonnage_sum', ascending=False)
# 3-
plt.figure(figsize=(10,5))
plt.barh(df['dep_day'], df['tonnage_sum'])
plt.title('Tonnage by day')
plt.xlabel('Total tonnage')
plt.ylabel('Departure day')
plt.tight_layout()
plt.show()
# 1- calculate the total tonnage for each 'dep_dow' and save in dataframe 'df'.
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row).
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment
# ---
# 1-
df = sea.groupby(['dep_dow'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})
# 2-
df['dep_dow'] = df['dep_dow'].astype(str)
df = df.sort_values(by='tonnage_sum', ascending=False)
# 3-
plt.figure(figsize=(10,5))
plt.barh(df['dep_dow'], df['tonnage_sum'])
plt.title('Tonnage by day')
plt.xlabel('Total tonnage')
plt.ylabel('Departure day of week')
plt.tight_layout()
plt.show()
# 1-
# Create a function 'get_list' that
# - takes three input arguments: (1) dfx, (2) col_var, (3) threshold=5
# - returns a list of unique values from a col_var
# Use threshold=5 to keep only those values
# from col_var that have a count greater than 5
# Hint:
def get_list(input_x, input_y):
xy_list = [input_x, input_y]
return xy_list
z = get_list(1, 2)
print('z:', z, '\n\n')
# 2-
# run:
# z = get_counts(sea, 'vessel_name')
# ---
# 1-
def get_list(dfx, col_var, threshold=5):
# Groupby
temp = dfx.groupby([col_var])['tonnage'].count().reset_index().rename(columns={'tonnage':'count'})
# Sort
temp = temp.sort_values(by='count', ascending=False)
# Subset
temp = temp[temp['count'] >= threshold]
# List
temp_list = temp[col_var].tolist()
return temp_list
# 2-
z = get_list(sea, 'vessel_name')
z
z: [1, 2]
['MAVKA', 'INOI', 'FULMAR S', 'ENEIDA']
# 1-
# Create a function 'get_mean_sd' that
# A- takes three input arguments: (1) dfx, (2) col_var
# B- subsets dfx with only those categories found in 'category_list'.
# Get category_list by calling function get_list() from Q-53 above
# For example: in case of col_var = 'vessel_name'
# only rows where the column 'vessel_name' has values ['MAVKA', 'INOI', 'FULMAR S', 'ENEIDA'] would be used
# C- Calculates the 'mean' and 'sd' for the col var
# D- calculates a dataframe with columns:
# - the name in col_var
# - mean
# - sd
# E - from function return dataframe with upto top 25 rows only
# 2-
# run:
# df = get_mean_sd(sea, 'vessel_name')
# ---
# 1-
def get_mean_sd(dfx, col_var):
# 1A-
# Get category list
category_list = get_list(dfx, col_var)
# Subset
dfx = dfx[dfx[col_var].isin(category_list)]
# mean and sd
df = dfx.groupby([col_var])['tonnage'].mean().reset_index().rename(columns={'tonnage':'mean'})
df['sd'] = dfx.groupby([col_var])['tonnage'].std().reset_index(drop=True)
# convert to string
df[col_var] = df[col_var].astype(str)
# sort
df = df.sort_values(by='mean', ascending=False)
df = df.head(25)
return df
# 2-
df = get_mean_sd(sea, 'vessel_name')
df
| vessel_name | mean | sd | |
|---|---|---|---|
| 0 | ENEIDA | 26249.6 | 16337.863272 |
| 2 | INOI | 12346.8 | 5060.280743 |
| 3 | MAVKA | 9890.0 | 3330.427900 |
| 1 | FULMAR S | 8227.0 | 4045.972689 |
# 1-
# Create a function get_plot()
# that takes two input arguments:
# (1) dfx,
# (2) col_var to be plotted, convert it to string
# (3) plot title,
# (4) label for x-axis,
# (5) label for y-axis
# creates a bar plot of mean values with sd error bars
# 2-
# run using df output from Q#54:
# get_plot(df, col_var='vessel_name',
# 'Tonnage by vessel name',
# 'Tonnage',
# 'Vessel name')
# ---
def get_plot(dfx, col_var, title, xlabel, ylabel):
# convert to string
dfx[col_var] = dfx[col_var].astype(str)
# plot
plt.figure(figsize=(10,5))
plt.barh(dfx[col_var], dfx['mean'])
plt.errorbar(y=dfx[col_var], x=dfx['mean'],
xerr=dfx['sd'],
linestyle='', capsize=5, ecolor='red')
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.tight_layout()
plt.show()
# 2-
get_plot(df, 'vessel_name',
'Tonnage by vessel name',
'Tonnage (mean+/-sd)',
'Vessel name')
# 1- Using the function 'get_mean_sd' from Q#54
# create dataframes below with col_var given
# (such as 'vessel_name', 'departure_port' etc.)
# 1A= vessel_name -> as 'df_vessel_name'
# 1B= departure_port -> as 'df_dep_port'
# 1C= country -> as 'df_country'
# 1D= commodity -> as 'df_commodity'
# 1E= income_group -> as 'df_income_group'
# 1F= flag -> as 'df_flag'
# 1G= dep_month -> as 'df_dep_month'
# 1H= dep_day -> as 'df_dep_day'
# 1I= dep_dow -> as 'df_dep_dow'
# ---
# 1A= vessel_name -> as 'df_vessel_name'
df_vessel_name = get_mean_sd(dfx = sea, col_var = 'vessel_name')
get_plot(df_vessel_name, 'vessel_name',
title='Tonnage by vessel name',
xlabel='Tonnage (mean+/-sd)',
ylabel='Vessel name')
plt.show()
print('\n')
# 1B= departure_port -> as 'df_dep_port'
df_dep_port = get_mean_sd(dfx = sea, col_var = 'departure_port')
get_plot(df_dep_port, 'departure_port',
title='Tonnage by departure port',
xlabel='Tonnage (mean+/-sd)',
ylabel='Departure port')
plt.show()
print('\n')
# 1C= country -> as 'df_country'
df_country = get_mean_sd(dfx = sea, col_var = 'country')
get_plot(df_country, 'country',
title='Tonnage by country',
xlabel='Tonnage (mean+/-sd)',
ylabel='Country')
plt.show()
print('\n')
# 1D= commodity -> as 'df_commodity'
df_commodity = get_mean_sd(dfx = sea, col_var = 'commodity')
get_plot(df_commodity, 'commodity',
title='Tonnage by commodity',
xlabel='Tonnage (mean+/-sd)',
ylabel='Commodity')
plt.show()
print('\n')
# 1E= income_group -> as 'df_income_group'
df_income_group = get_mean_sd(dfx = sea, col_var = 'income_group')
get_plot(df_income_group, 'income_group',
title='Tonnage by income group',
xlabel='Tonnage (mean+/-sd)',
ylabel='Income group')
plt.show()
print('\n')
# 1F= flag -> as 'df_flag'
df_flag = get_mean_sd(dfx = sea, col_var = 'flag')
get_plot(df_flag, 'flag',
title='Tonnage by flag',
xlabel='Tonnage (mean+/-sd)',
ylabel='Flag')
plt.show()
print('\n')
# 1G= dep_month -> as 'df_dep_month'
df_dep_month = get_mean_sd(dfx = sea, col_var = 'dep_month')
get_plot(df_dep_month, 'dep_month',
title='Tonnage by departure month',
xlabel='Tonnage (mean+/-sd)',
ylabel='Departure month')
plt.show()
print('\n')
# 1H= dep_day -> as 'df_dep_day'
df_dep_day = get_mean_sd(dfx = sea, col_var = 'dep_day')
get_plot(df_dep_day, 'dep_day',
title='Tonnage by departure day',
xlabel='Tonnage (mean+/-sd)',
ylabel='Departure day')
plt.show()
print('\n')
# 1I= dep_dow -> as 'df_dep_dow'
df_dep_dow = get_mean_sd(dfx = sea, col_var = 'dep_dow')
get_plot(df_dep_dow, 'dep_dow',
title='Tonnage by departure day of week',
xlabel='Tonnage (mean+/-sd)',
ylabel='Departure day of week')
plt.show()
print('\n')
# 1-
# Which country is the main source of crop corn? i.e. total corn is highest?
# 2-
# What percentage of total does it represent?
# ---
# 1-
df = sea[sea['commodity']=='Corn']
df = df.groupby(['country'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})
df = df.sort_values(by='tonnage_sum', ascending=False)
# 2-
df['percent'] = round(100.0*df['tonnage_sum']/df['tonnage_sum'].sum())
df
| country | tonnage_sum | percent | |
|---|---|---|---|
| 1 | China | 1694236.0 | 23.0 |
| 14 | Spain | 1355828.0 | 18.0 |
| 8 | Italy | 848262.0 | 11.0 |
| 17 | Türkiye | 629172.0 | 8.0 |
| 15 | The Netherlands | 559178.0 | 8.0 |
| 2 | Egypt | 398031.0 | 5.0 |
| 7 | Israel | 343520.0 | 5.0 |
| 12 | Republic of Korea | 246720.0 | 3.0 |
| 10 | Libya | 217300.0 | 3.0 |
| 13 | Romania | 203085.0 | 3.0 |
| 3 | Germany | 194260.0 | 3.0 |
| 16 | Tunisia | 176398.0 | 2.0 |
| 0 | Belgium | 158395.0 | 2.0 |
| 11 | Portugal | 138180.0 | 2.0 |
| 5 | Iran | 126234.0 | 2.0 |
| 9 | Lebanon | 54457.0 | 1.0 |
| 4 | Greece | 46346.0 | 1.0 |
| 6 | Ireland | 33000.0 | 0.0 |
# 1-
# What commodity does the country India transport? What is the total tonnage?
# 2- What percent of total does it represent?
# ---
# 1-
df = sea[sea['country']=='India'].groupby(['commodity'])['tonnage'].sum()
print(df)
# 2-
df = sea[sea['commodity']=='Sunflower oil']
df = df.groupby(['country'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})
df = df.sort_values(by='tonnage_sum', ascending=False)
#
df['percent'] = round(100.0*df['tonnage_sum']/df['tonnage_sum'].sum())
df
commodity Sunflower oil 350110.0 Name: tonnage, dtype: float64
| country | tonnage_sum | percent | |
|---|---|---|---|
| 5 | India | 350110.0 | 36.0 |
| 15 | Türkiye | 160646.0 | 17.0 |
| 1 | China | 142750.0 | 15.0 |
| 7 | Italy | 73899.0 | 8.0 |
| 11 | Romania | 47200.0 | 5.0 |
| 13 | Spain | 42797.0 | 4.0 |
| 6 | Iraq | 33000.0 | 3.0 |
| 10 | Oman | 32150.0 | 3.0 |
| 17 | United Kingdom | 18515.0 | 2.0 |
| 16 | United Arab Emirates | 11900.0 | 1.0 |
| 14 | The Netherlands | 10200.0 | 1.0 |
| 0 | Bulgaria | 9701.0 | 1.0 |
| 4 | France | 9000.0 | 1.0 |
| 2 | Djibouti | 6200.0 | 1.0 |
| 12 | Saudi Arabia | 4000.0 | 0.0 |
| 9 | Malaysia | 4000.0 | 0.0 |
| 3 | Egypt | 3100.0 | 0.0 |
| 8 | Lebanon | 2973.0 | 0.0 |
# 1-
# What is the commodity with highest number of vessel names?
# 2-
# What is the commodity with least number of vessel names?
# ---
# 1-
df = sea.groupby(['commodity'])['vessel_name'].count().reset_index().rename(columns={'vessel_name':'vessel_count'})
df = df.sort_values('vessel_count', ascending=False)
df
| commodity | vessel_count | |
|---|---|---|
| 2 | Corn | 231 |
| 15 | Wheat | 192 |
| 11 | Sunflower oil | 83 |
| 10 | Sunflower meal | 44 |
| 7 | Soya beans | 41 |
| 0 | Barley | 40 |
| 5 | Rapeseed | 26 |
| 13 | Sunflower seed | 18 |
| 4 | Peas | 8 |
| 6 | Rapeseed meal | 3 |
| 1 | Canola | 3 |
| 12 | Sunflower pellets | 2 |
| 8 | Soya oil | 2 |
| 9 | Sugar beet pellets | 1 |
| 14 | Vegetable oil | 1 |
| 3 | Mixed | 1 |
| 16 | Wheat bran pellets | 1 |
# 1-
# What is the departure port sees highest number of commodity types?
# 2-
# What is the departure port sees the least number of commodity types?
# ---
# 1-
df = sea.groupby(['departure_port'])['commodity'].count().reset_index().rename(columns={'commodity':'commodity_count'})
df = df.sort_values('commodity_count', ascending=False)
df
| departure_port | commodity_count | |
|---|---|---|
| 0 | Chornomorsk | 285 |
| 1 | Odesa | 222 |
| 2 | Yuzhny/Pivdennyi | 190 |
# 1-
# Is there a commodity that is shipped through only one departure_port?
# 2- What are those ports?
# ---
# 1-
df = sea.groupby(['commodity'])['departure_port'].count().reset_index().rename(columns={'departure_port':'departure_port_count'})
df = df.sort_values('departure_port_count', ascending=True)
xlist = df[df['departure_port_count']==1]['commodity'].tolist()
print(xlist)
# 2-
df = sea[sea['commodity'].isin(xlist)]['departure_port'].unique()
df
['Wheat bran pellets', 'Vegetable oil', 'Mixed', 'Sugar beet pellets']
array(['Chornomorsk'], dtype=object)