Black Sea Grain Initiative Vessel Movements: 2022-2023¶

Import libraries¶

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
np.set_printoptions(suppress=True) 

Read data¶

In [16]:
#path = '-add your path here-'
path = '/Users/erv/Desktop/learndataa/Students/Python/Exams_Exercise/Exam_3/data/'

# Step-3: Read the file using pd.read_csv() using the '\t' separator (tab)
sea = pd.read_csv(path + 'Black Sea Grain Initiative Voyages - Data.csv', sep=',')

# Step-4: Look at the first five records using .head()
print(sea.shape)
sea.head(2)
(697, 15)
Out[16]:
Status Outbound Sequence Vessel name IMO Departure port Country Commodity Tonnage Departure Inspection İstanbul Income group Flag World Bank region UN region Development category
0 Outbound 1 RAZONI 9086526 Odesa Türkiye Corn 1,527 31-Jul-22 02-Aug-22 upper-middle-income Sierra Leone Europe & Central Asia Asia-Pacific developing
1 Outbound + 1 RAZONI 9086526 Odesa Egypt Corn 25,000 31-Jul-22 02-Aug-22 lower-middle income Sierra Leone Middle East & North Africa Africa developing

Data preprocessing¶

In [17]:
# use dataframe 'sea' from Q#38
# 0- Print the number of rows and columns in the dataframe
# 1- Print type of data in each column
# 2- Print the number of missing values in each column
# 3- Check to see if any column has the same value in all rows. Drop column if it has the same value in all the rows
# Hint: .unique()
# Hint: use for loop 
# 4- Print shape of sea


# --- 

# 0- Print the number of rows and columns in the dataframe
print("Number of rows:", sea.shape[0])
print('Number of columns:', sea.shape[1],'\n')

# 1- Print type of data in each column
print('Data types:\n',sea.dtypes, '\n')

# 2- Print the number of missing values in each column
print('Missing values:\n', sea.isna().sum())

# 3- Check to see if any column has the same value in all rows. Hint: .unique()
for col in sea.columns:
    num = sea[col].nunique()
    if (num == 1):
        sea = sea.drop(col, axis=1)
        print(col + ' was dropped!')
        
# 4- Print shape of sea
print('\nShape:\n', sea.shape)
Number of rows: 697
Number of columns: 15 

Data types:
 Status                  object
Outbound Sequence        int64
Vessel name             object
IMO                      int64
Departure port          object
Country                 object
Commodity               object
Tonnage                 object
Departure               object
Inspection İstanbul     object
Income group            object
Flag                    object
World Bank region       object
UN region               object
Development category    object
dtype: object 

Missing values:
 Status                   0
Outbound Sequence        0
Vessel name              0
IMO                      0
Departure port           0
Country                  0
Commodity                0
Tonnage                  0
Departure                0
Inspection İstanbul     41
Income group             0
Flag                     1
World Bank region        0
UN region                0
Development category     0
dtype: int64

Shape:
 (697, 15)

Feature engineering¶

In [18]:
# 1-
col_names = sea.columns
col_names = [ col.strip().replace(' ', '_').lower() for col in col_names]
sea.columns = col_names
print("Columns:\n", sea.columns.values)

# 2-
sea['departure'] = pd.to_datetime(sea['departure'])
sea['inspection_i̇stanbul'] = pd.to_datetime(sea['inspection_i̇stanbul'])

# 3-
sea['tonnage'] = sea['tonnage'].str.replace(',','').astype(float)

# 4-
sea['dep_year'] = sea['departure'].dt.year
sea['dep_month'] = sea['departure'].dt.month
sea['dep_day'] = sea['departure'].dt.day
sea['dep_dow'] = sea['departure'].dt.day_of_week

# 5-
sea['insp_year'] = sea['inspection_i̇stanbul'].dt.year
sea['insp_month'] = sea['inspection_i̇stanbul'].dt.month
sea['insp_day'] = sea['inspection_i̇stanbul'].dt.day
sea['insp_dow'] = sea['inspection_i̇stanbul'].dt.day_of_week

# 6-
print("\nData types:\n", sea.dtypes)
print('\nShape: ', sea.shape)

sea.head(3)
Columns:
 ['status' 'outbound_sequence' 'vessel_name' 'imo' 'departure_port'
 'country' 'commodity' 'tonnage' 'departure' 'inspection_i̇stanbul'
 'income_group' 'flag' 'world_bank_region' 'un_region'
 'development_category']

Data types:
 status                          object
outbound_sequence                int64
vessel_name                     object
imo                              int64
departure_port                  object
country                         object
commodity                       object
tonnage                        float64
departure               datetime64[ns]
inspection_i̇stanbul    datetime64[ns]
income_group                    object
flag                            object
world_bank_region               object
un_region                       object
development_category            object
dep_year                         int64
dep_month                        int64
dep_day                          int64
dep_dow                          int64
insp_year                      float64
insp_month                     float64
insp_day                       float64
insp_dow                       float64
dtype: object

Shape:  (697, 23)
Out[18]:
status outbound_sequence vessel_name imo departure_port country commodity tonnage departure inspection_i̇stanbul income_group flag world_bank_region un_region development_category dep_year dep_month dep_day dep_dow insp_year insp_month insp_day insp_dow
0 Outbound 1 RAZONI 9086526 Odesa Türkiye Corn 1527.0 2022-07-31 2022-08-02 upper-middle-income Sierra Leone Europe & Central Asia Asia-Pacific developing 2022 7 31 6 2022.0 8.0 2.0 1.0
1 Outbound + 1 RAZONI 9086526 Odesa Egypt Corn 25000.0 2022-07-31 2022-08-02 lower-middle income Sierra Leone Middle East & North Africa Africa developing 2022 7 31 6 2022.0 8.0 2.0 1.0
2 Outbound 2 NAVI STAR 9590979 Odesa Ireland Corn 33000.0 2022-08-04 2022-08-05 high-income Panama Europe & Central Asia Western Europe and Others developed 2022 8 4 3 2022.0 8.0 5.0 4.0

Data exploration¶

In [19]:
# use 'sea' from Q#40
# Objective: Now that the dataset is clean and ready for analysis 
# we can start looking into each column 
# and see what we can find !!! Exciting !!!

# 1-
# How many typs of 'status' are there for ships leaving the port and coming into the port?

# 2-
# How many unique vessel names?

# 3- 
# What are the names of departure ports?

# 4-
# How many unique countries?

# 5-
# What are the different types of commodities that are transported by the ships?

# 6-
# What are different types of income groups?

# 7-
# How many different flags are there? Is it one or more flags per country? 

# 8-
# What is the minimum and maximum tonnage for the ships?

# --- 


# 1-
# How many typs of 'status' are there for ships leaving the port and coming into the port?
x = sea['status'].unique()
xnum = sea['status'].nunique()
print("status:\n", x)
print("num:\n", xnum)


# 2-
# How many unique vessel names?
x = sea['vessel_name'].nunique()
print("\nvessel nam:\n", x)

# 3- 
# What are the names of departure ports?
x = sea['departure_port'].unique()
print("\ndeparture_port:\n", x)

# 4-
# How many unique countries?
x = sea['country'].nunique()
print("\ncountry:\n", x)

# 5-
# What are the different types of commodities that are transported by the ships?
x = sea['commodity'].unique()
print("\ncommodity:\n", x)

# 6-
# What are different types of income groups?
x = sea['income_group'].unique()
print("\nincome_group:\n", x)

# 7-
# How many different flags are there? Is it one or more flags per country? 
x = sea['flag'].nunique()
print("\nflag:", x)
print("There are fewer flags than countries.")

# 8-
# What is the minimum and maximum tonnage for the ships?
xmin = sea['tonnage'].min()
xmax = sea['tonnage'].max()

print("\nTonnage (min):", xmin, "\nTonnage (max):", xmax)
status:
 ['Outbound' 'Outbound +']
num:
 2

vessel nam:
 481

departure_port:
 ['Odesa' 'Chornomorsk' 'Yuzhny/Pivdennyi']

country:
 42

commodity:
 ['Corn' 'Sunflower meal' 'Sunflower oil' 'Soya beans' 'Wheat'
 'Sunflower seed' 'Sugar beet pellets' 'Rapeseed' 'Peas' 'Barley'
 'Wheat bran pellets' 'Canola' 'Rapeseed meal' 'Mixed' 'Sunflower pellets'
 'Soya oil' 'Vegetable oil']

income_group:
 ['upper-middle-income' 'lower-middle income' 'high-income' 'low-income']

flag: 36
There are fewer flags than countries.

Tonnage (min): 68.0 
Tonnage (max): 74500.0
In [20]:
# 1- 
# Create a histogram plot of tonnage. Comment

# 2-
# Create a timeseries plot of departure (on x-axis) vs. tonnage (on y-axis). Comment

# --- 

# 1-
plt.hist(sea['tonnage'])
plt.title('Ship tonnage transported')
plt.xlabel('Tons')
plt.ylabel('Count')
plt.show()

# 2-
plt.plot(sea['departure'], sea['tonnage'])
plt.title('Ship tonnage transported')
plt.xlabel('Tons')
plt.ylabel('Count')
plt.show()
In [21]:
# 0-
# create new dataframe 'df' that is a subset 'sea' 
# such that it has only those countrys that have more than 50 rows of data
# Hint: use groupby to get the count of rows per country. Then use only these countries to get the subset.
# temp = sea.groupby(['country'])['status'].count().reset_index().rename(columns={'status':'count'})
# temp = temp[temp['count']>50]
# subset_country_list = temp['country'].tolist()

# print shape of df

# 1- 
# Create a histogram plot of tonnage by country. Comment
# sns.histplot(data=df, x="tonnage", hue="country", element="step", fill=True)

# 2-
# Create a timeseries plot of departure (on x-axis) vs. tonnage (on y-axis) by country. Comment
# Hint: sns.lineplot(data=df, x='departure', y='tonnage', hue='country')


# --- 

# 0-
temp = sea.groupby(['country'])['status'].count().reset_index().rename(columns={'status':'count'})
temp = temp[temp['count']>50]
subset_country_list = temp['country'].tolist()
print('subset_country_list:', subset_country_list)

df = sea[sea['country'].isin(subset_country_list)]
print('Shape:', df.shape)

# 1- 
sns.histplot(data=df, x="tonnage", hue="country", element="step", fill=True)
plt.show()

# 2-
sns.lineplot(data=df, x='departure', y='tonnage', hue='country')
plt.show()
subset_country_list: ['China', 'Italy', 'Spain', 'Türkiye']
Shape: (415, 23)

Analysis¶

In [22]:
# 1- 
# Groupby on 'status' to find total 'tonnage' for each category

# Step-1: peroform groupby
# Step-2: reset index
# Step-3: rename columns
# Step-4: show ouput


### Option -1 
# Step-1: peroform groupby sum
df = sea.groupby(['status'])['tonnage'].sum()
# Step-2: reset index
df = df.reset_index()
# Step-3: rename columns
df = df.rename(columns = {'tonnage':'tonnage_sum'})

### Option -3
# Step-1 to 3: peroform groupby sum, reset index and rename in same line
#df = sea.groupby(['status'])['tonnage'].sum().reset_index().rename(columns = {'tonnage':'tonnage_sum'})

# Step-4: show ouput
print(df)

# Using steps above calculate mean tonnage instead of sum

# --- write code below this line ---


df = sea.groupby(['status'])['tonnage'].mean().reset_index().rename(columns = {'tonnage':'tonnage_mean'})
df
       status  tonnage_sum
0    Outbound   15263021.0
1  Outbound +    1071644.0
Out[22]:
status tonnage_mean
0 Outbound 24737.473258
1 Outbound + 13395.550000
In [23]:
# 1- calculate the total tonnage for each vessel name and save in dataframe 'df'. 
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row). Then, subset top 25 rows
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment

# --- 

# 1-
df = sea.groupby(['vessel_name'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})

# 2-
df = df.sort_values(by='tonnage_sum', ascending=False)
df = df.head(25)

# 3-
plt.figure(figsize=(10,5))
plt.barh(df['vessel_name'], df['tonnage_sum'])
plt.title('Tonnage by vessel name')
plt.xlabel('Total tonnage')
plt.ylabel('Vessel name')
plt.tight_layout()
plt.show()
In [24]:
# 1- calculate the total tonnage for each departure_port and save in dataframe 'df'. 
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row).
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment

# --- 

# 1-
df = sea.groupby(['departure_port'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})

# 2-
df = df.sort_values(by='tonnage_sum', ascending=True)


# 3-
plt.figure(figsize=(5,3))
plt.barh(df['departure_port'], df['tonnage_sum'])
plt.title('Tonnage by departure port')
plt.xlabel('Total tonnage')
plt.ylabel('Departure port')
plt.tight_layout()
plt.show()
In [25]:
# 1- calculate the total tonnage for each country and save in dataframe 'df'. 
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row). Then, subset top 25 rows
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment

# --- 

# 1-
df = sea.groupby(['country'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})

# 2-
df = df.sort_values(by='tonnage_sum', ascending=False)
df = df.head(25)

# 3-
plt.figure(figsize=(10,5))
plt.barh(df['country'], df['tonnage_sum'])
plt.title('Tonnage by country')
plt.xlabel('Total tonnage')
plt.ylabel('Country')
plt.tight_layout()
plt.show()
In [26]:
# 1- calculate the total tonnage for each commodity and save in dataframe 'df'. 
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row).
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment

# --- 

# 1-
df = sea.groupby(['commodity'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})

# 2-
df = df.sort_values(by='tonnage_sum', ascending=False)


# 3-
plt.figure(figsize=(10,5))
plt.barh(df['commodity'], df['tonnage_sum'])
plt.title('Tonnage by commodity')
plt.xlabel('Total tonnage')
plt.ylabel('Commodity')
plt.tight_layout()
plt.show()
In [27]:
# 1- calculate the total tonnage for each commodity and save in dataframe 'df'. 
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row).
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment

# --- 

# 1-
df = sea.groupby(['income_group'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})

# 2-
df = df.sort_values(by='tonnage_sum', ascending=False)


# 3-
plt.figure(figsize=(10,5))
plt.barh(df['income_group'], df['tonnage_sum'])
plt.title('Tonnage by income group')
plt.xlabel('Total tonnage')
plt.ylabel('Income group')
plt.tight_layout()
plt.show()
In [28]:
# 1- calculate the total tonnage for each flag and save in dataframe 'df'. 
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row). Then, subset top 25 rows
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment

# --- write code below this line ---

# 1-
df = sea.groupby(['flag'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})

# 2-
df = df.sort_values(by='tonnage_sum', ascending=False)
df = df.head(25)

# 3-
plt.figure(figsize=(10,5))
plt.barh(df['flag'], df['tonnage_sum'])
plt.title('Tonnage by flag')
plt.xlabel('Total tonnage')
plt.ylabel('Flag')
plt.tight_layout()
plt.show()
In [29]:
# 1- calculate the total tonnage for each 'dep_month' and save in dataframe 'df'. 
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row). 
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment

# --- 

# 1-
df = sea.groupby(['dep_month'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})

# 2-
df['dep_month'] = df['dep_month'].astype(str)
df = df.sort_values(by='tonnage_sum', ascending=False)


# 3-
plt.figure(figsize=(10,5))
plt.barh(df['dep_month'], df['tonnage_sum'])
plt.title('Tonnage by month')
plt.xlabel('Total tonnage')
plt.ylabel('Departure month')
plt.tight_layout()
plt.show()
In [30]:
# 1- calculate the total tonnage for each 'dep_day' and save in dataframe 'df'. 
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row). 
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment

# --- 

# 1-
df = sea.groupby(['dep_day'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})

# 2-
df['dep_day'] = df['dep_day'].astype(str)
df = df.sort_values(by='tonnage_sum', ascending=False)


# 3-
plt.figure(figsize=(10,5))
plt.barh(df['dep_day'], df['tonnage_sum'])
plt.title('Tonnage by day')
plt.xlabel('Total tonnage')
plt.ylabel('Departure day')
plt.tight_layout()
plt.show()
In [31]:
# 1- calculate the total tonnage for each 'dep_dow' and save in dataframe 'df'. 
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row). 
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment

# --- 

# 1-
df = sea.groupby(['dep_dow'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})

# 2-
df['dep_dow'] = df['dep_dow'].astype(str)
df = df.sort_values(by='tonnage_sum', ascending=False)


# 3-
plt.figure(figsize=(10,5))
plt.barh(df['dep_dow'], df['tonnage_sum'])
plt.title('Tonnage by day')
plt.xlabel('Total tonnage')
plt.ylabel('Departure day of week')
plt.tight_layout()
plt.show()
In [32]:
# 1-
# Create a function 'get_list' that 
# - takes three input arguments: (1) dfx, (2) col_var, (3) threshold=5
# - returns a list of unique values from a col_var

# Use threshold=5 to keep only those values 
# from col_var that have a count greater than 5

# Hint:
def get_list(input_x, input_y):
    xy_list = [input_x, input_y]
    return xy_list

z = get_list(1, 2)
print('z:', z, '\n\n')

# 2-
# run: 
# z = get_counts(sea, 'vessel_name')

# --- 

# 1-
def get_list(dfx, col_var, threshold=5):
    # Groupby
    temp = dfx.groupby([col_var])['tonnage'].count().reset_index().rename(columns={'tonnage':'count'})
    # Sort
    temp = temp.sort_values(by='count', ascending=False)
    # Subset
    temp = temp[temp['count'] >= threshold]
    # List
    temp_list = temp[col_var].tolist()
    
    return temp_list

# 2-
z = get_list(sea, 'vessel_name')
z
z: [1, 2] 


Out[32]:
['MAVKA', 'INOI', 'FULMAR S', 'ENEIDA']
In [33]:
# 1-
# Create a function 'get_mean_sd' that 
# A- takes three input arguments: (1) dfx, (2) col_var
# B- subsets dfx with only those categories found in 'category_list'. 
# Get category_list by calling function get_list() from Q-53 above
# For example: in case of col_var = 'vessel_name'
# only rows where the column 'vessel_name' has values ['MAVKA', 'INOI', 'FULMAR S', 'ENEIDA'] would be used

# C- Calculates the 'mean' and 'sd' for the col var

# D- calculates a dataframe with columns:
# - the name in col_var
# - mean
# - sd

# E - from function return dataframe with upto top 25 rows only


# 2-
# run:
# df = get_mean_sd(sea, 'vessel_name')


# --- 

# 1-
def get_mean_sd(dfx, col_var):
    
    # 1A-
    # Get category list
    category_list = get_list(dfx, col_var)
    
    # Subset
    dfx = dfx[dfx[col_var].isin(category_list)]

    # mean and sd
    df = dfx.groupby([col_var])['tonnage'].mean().reset_index().rename(columns={'tonnage':'mean'})
    df['sd'] = dfx.groupby([col_var])['tonnage'].std().reset_index(drop=True)

    # convert to string
    df[col_var] = df[col_var].astype(str)
    
    # sort
    df = df.sort_values(by='mean', ascending=False)
    df = df.head(25)

    return df

# 2-
df = get_mean_sd(sea, 'vessel_name')
df
Out[33]:
vessel_name mean sd
0 ENEIDA 26249.6 16337.863272
2 INOI 12346.8 5060.280743
3 MAVKA 9890.0 3330.427900
1 FULMAR S 8227.0 4045.972689
In [34]:
# 1-
# Create a function get_plot() 
# that takes two input arguments: 
# (1) dfx, 
# (2) col_var to be plotted, convert it to string
# (3) plot title, 
# (4) label for x-axis, 
# (5) label for y-axis
# creates a bar plot of mean values with sd error bars



# 2-
# run using df output from Q#54:
# get_plot(df, col_var='vessel_name', 
#     'Tonnage by vessel name', 
#      'Tonnage', 
#      'Vessel name')

# --- 

def get_plot(dfx, col_var, title, xlabel, ylabel):
    
    # convert to string
    dfx[col_var] = dfx[col_var].astype(str)
    
    # plot
    plt.figure(figsize=(10,5))
    plt.barh(dfx[col_var], dfx['mean'])

    plt.errorbar(y=dfx[col_var], x=dfx['mean'], 
                 xerr=dfx['sd'], 
                 linestyle='', capsize=5, ecolor='red')

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.tight_layout()
    plt.show()

    
# 2-
get_plot(df, 'vessel_name', 
         'Tonnage by vessel name', 
         'Tonnage (mean+/-sd)',
         'Vessel name')   
In [35]:
# 1- Using the function 'get_mean_sd' from Q#54 
# create dataframes below with col_var given 
# (such as 'vessel_name', 'departure_port' etc.)

# 1A= vessel_name -> as 'df_vessel_name'
# 1B= departure_port -> as 'df_dep_port'
# 1C= country -> as 'df_country'
# 1D= commodity -> as 'df_commodity'
# 1E= income_group -> as 'df_income_group'
# 1F= flag -> as 'df_flag'
# 1G= dep_month -> as 'df_dep_month'
# 1H= dep_day -> as 'df_dep_day'
# 1I= dep_dow -> as 'df_dep_dow'



# --- 


# 1A= vessel_name -> as 'df_vessel_name'
df_vessel_name = get_mean_sd(dfx = sea, col_var = 'vessel_name')
get_plot(df_vessel_name, 'vessel_name', 
         title='Tonnage by vessel name', 
         xlabel='Tonnage (mean+/-sd)',
         ylabel='Vessel name')  
plt.show()
print('\n')

# 1B= departure_port -> as 'df_dep_port'
df_dep_port = get_mean_sd(dfx = sea, col_var = 'departure_port')
get_plot(df_dep_port, 'departure_port', 
         title='Tonnage by departure port', 
         xlabel='Tonnage (mean+/-sd)',
         ylabel='Departure port')  
plt.show()
print('\n')

# 1C= country -> as 'df_country'
df_country = get_mean_sd(dfx = sea, col_var = 'country')
get_plot(df_country, 'country', 
         title='Tonnage by country', 
         xlabel='Tonnage (mean+/-sd)',
         ylabel='Country')  
plt.show()
print('\n')

# 1D= commodity -> as 'df_commodity'
df_commodity = get_mean_sd(dfx = sea, col_var = 'commodity')
get_plot(df_commodity, 'commodity', 
         title='Tonnage by commodity', 
         xlabel='Tonnage (mean+/-sd)',
         ylabel='Commodity')  
plt.show()
print('\n')

# 1E= income_group -> as 'df_income_group'
df_income_group = get_mean_sd(dfx = sea, col_var = 'income_group')
get_plot(df_income_group, 'income_group', 
         title='Tonnage by income group', 
         xlabel='Tonnage (mean+/-sd)',
         ylabel='Income group')  
plt.show()
print('\n')


# 1F= flag -> as 'df_flag'
df_flag = get_mean_sd(dfx = sea, col_var = 'flag')
get_plot(df_flag, 'flag', 
         title='Tonnage by flag', 
         xlabel='Tonnage (mean+/-sd)',
         ylabel='Flag')  
plt.show()
print('\n')

# 1G= dep_month -> as 'df_dep_month'
df_dep_month = get_mean_sd(dfx = sea, col_var = 'dep_month')
get_plot(df_dep_month, 'dep_month', 
         title='Tonnage by departure month', 
         xlabel='Tonnage (mean+/-sd)',
         ylabel='Departure month')  
plt.show()
print('\n')

# 1H= dep_day -> as 'df_dep_day'
df_dep_day = get_mean_sd(dfx = sea, col_var = 'dep_day')
get_plot(df_dep_day, 'dep_day', 
         title='Tonnage by departure day', 
         xlabel='Tonnage (mean+/-sd)',
         ylabel='Departure day')  
plt.show()
print('\n')

# 1I= dep_dow -> as 'df_dep_dow'
df_dep_dow = get_mean_sd(dfx = sea, col_var = 'dep_dow')
get_plot(df_dep_dow, 'dep_dow', 
         title='Tonnage by departure day of week', 
         xlabel='Tonnage (mean+/-sd)',
         ylabel='Departure day of week')  
plt.show()
print('\n')









In [36]:
# 1- 
# Which country is the main source of crop corn? i.e. total corn is highest?

# 2-
# What percentage of total does it represent?

# --- 

# 1-
df = sea[sea['commodity']=='Corn']
df = df.groupby(['country'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})
df = df.sort_values(by='tonnage_sum', ascending=False)

# 2-
df['percent'] = round(100.0*df['tonnage_sum']/df['tonnage_sum'].sum())

df
Out[36]:
country tonnage_sum percent
1 China 1694236.0 23.0
14 Spain 1355828.0 18.0
8 Italy 848262.0 11.0
17 Türkiye 629172.0 8.0
15 The Netherlands 559178.0 8.0
2 Egypt 398031.0 5.0
7 Israel 343520.0 5.0
12 Republic of Korea 246720.0 3.0
10 Libya 217300.0 3.0
13 Romania 203085.0 3.0
3 Germany 194260.0 3.0
16 Tunisia 176398.0 2.0
0 Belgium 158395.0 2.0
11 Portugal 138180.0 2.0
5 Iran 126234.0 2.0
9 Lebanon 54457.0 1.0
4 Greece 46346.0 1.0
6 Ireland 33000.0 0.0
In [39]:
# 1- 
# What commodity does the country India transport? What is the total tonnage? 

# 2- What percent of total does it represent?

# --- 

# 1-
df = sea[sea['country']=='India'].groupby(['commodity'])['tonnage'].sum()
print(df)

# 2-
df = sea[sea['commodity']=='Sunflower oil']
df = df.groupby(['country'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})
df = df.sort_values(by='tonnage_sum', ascending=False)

# 
df['percent'] = round(100.0*df['tonnage_sum']/df['tonnage_sum'].sum())
df
commodity
Sunflower oil    350110.0
Name: tonnage, dtype: float64
Out[39]:
country tonnage_sum percent
5 India 350110.0 36.0
15 Türkiye 160646.0 17.0
1 China 142750.0 15.0
7 Italy 73899.0 8.0
11 Romania 47200.0 5.0
13 Spain 42797.0 4.0
6 Iraq 33000.0 3.0
10 Oman 32150.0 3.0
17 United Kingdom 18515.0 2.0
16 United Arab Emirates 11900.0 1.0
14 The Netherlands 10200.0 1.0
0 Bulgaria 9701.0 1.0
4 France 9000.0 1.0
2 Djibouti 6200.0 1.0
12 Saudi Arabia 4000.0 0.0
9 Malaysia 4000.0 0.0
3 Egypt 3100.0 0.0
8 Lebanon 2973.0 0.0
In [40]:
# 1- 
# What is the commodity with highest number of vessel names?

# 2- 
# What is the commodity with least number of vessel names?

# --- 

# 1-
df = sea.groupby(['commodity'])['vessel_name'].count().reset_index().rename(columns={'vessel_name':'vessel_count'})
df = df.sort_values('vessel_count', ascending=False)
df
Out[40]:
commodity vessel_count
2 Corn 231
15 Wheat 192
11 Sunflower oil 83
10 Sunflower meal 44
7 Soya beans 41
0 Barley 40
5 Rapeseed 26
13 Sunflower seed 18
4 Peas 8
6 Rapeseed meal 3
1 Canola 3
12 Sunflower pellets 2
8 Soya oil 2
9 Sugar beet pellets 1
14 Vegetable oil 1
3 Mixed 1
16 Wheat bran pellets 1
In [41]:
# 1- 
# What is the departure port sees highest number of commodity types?

# 2- 
# What is the departure port sees the least number of commodity types?

# ---

# 1-
df = sea.groupby(['departure_port'])['commodity'].count().reset_index().rename(columns={'commodity':'commodity_count'})
df = df.sort_values('commodity_count', ascending=False)
df
Out[41]:
departure_port commodity_count
0 Chornomorsk 285
1 Odesa 222
2 Yuzhny/Pivdennyi 190
In [42]:
# 1- 
# Is there a commodity that is shipped through only one departure_port?

# 2- What are those ports?


# --- 

# 1-
df = sea.groupby(['commodity'])['departure_port'].count().reset_index().rename(columns={'departure_port':'departure_port_count'})
df = df.sort_values('departure_port_count', ascending=True)
xlist = df[df['departure_port_count']==1]['commodity'].tolist()
print(xlist)

# 2-
df = sea[sea['commodity'].isin(xlist)]['departure_port'].unique()
df
['Wheat bran pellets', 'Vegetable oil', 'Mixed', 'Sugar beet pellets']
Out[42]:
array(['Chornomorsk'], dtype=object)
In [ ]: