import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
np.set_printoptions(suppress=True)


#path = '-add your path here-'
path = '/Users/erv/Desktop/learndataa/Students/Python/Exams_Exercise/Exam_3/data/'

# Step-3: Read the file using pd.read_csv() using the '\t' separator (tab)
sea = pd.read_csv(path + 'Black Sea Grain Initiative Voyages - Data.csv', sep=',')

# Step-4: Look at the first five records using .head()
print(sea.shape)
sea.head(2)

(697, 15)


# use dataframe 'sea' from Q#38
# 0- Print the number of rows and columns in the dataframe
# 1- Print type of data in each column
# 2- Print the number of missing values in each column
# 3- Check to see if any column has the same value in all rows. Drop column if it has the same value in all the rows
# Hint: .unique()
# Hint: use for loop 
# 4- Print shape of sea


# --- 

# 0- Print the number of rows and columns in the dataframe
print("Number of rows:", sea.shape[0])
print('Number of columns:', sea.shape[1],'\n')

# 1- Print type of data in each column
print('Data types:\n',sea.dtypes, '\n')

# 2- Print the number of missing values in each column
print('Missing values:\n', sea.isna().sum())

# 3- Check to see if any column has the same value in all rows. Hint: .unique()
for col in sea.columns:
    num = sea[col].nunique()
    if (num == 1):
        sea = sea.drop(col, axis=1)
        print(col + ' was dropped!')
        
# 4- Print shape of sea
print('\nShape:\n', sea.shape)

Number of rows: 697
Number of columns: 15 

Data types:
 Status                  object
Outbound Sequence        int64
Vessel name             object
IMO                      int64
Departure port          object
Country                 object
Commodity               object
Tonnage                 object
Departure               object
Inspection İstanbul     object
Income group            object
Flag                    object
World Bank region       object
UN region               object
Development category    object
dtype: object 

Missing values:
 Status                   0
Outbound Sequence        0
Vessel name              0
IMO                      0
Departure port           0
Country                  0
Commodity                0
Tonnage                  0
Departure                0
Inspection İstanbul     41
Income group             0
Flag                     1
World Bank region        0
UN region                0
Development category     0
dtype: int64

Shape:
 (697, 15)


# 1-
col_names = sea.columns
col_names = [ col.strip().replace(' ', '_').lower() for col in col_names]
sea.columns = col_names
print("Columns:\n", sea.columns.values)

# 2-
sea['departure'] = pd.to_datetime(sea['departure'])
sea['inspection_i̇stanbul'] = pd.to_datetime(sea['inspection_i̇stanbul'])

# 3-
sea['tonnage'] = sea['tonnage'].str.replace(',','').astype(float)

# 4-
sea['dep_year'] = sea['departure'].dt.year
sea['dep_month'] = sea['departure'].dt.month
sea['dep_day'] = sea['departure'].dt.day
sea['dep_dow'] = sea['departure'].dt.day_of_week

# 5-
sea['insp_year'] = sea['inspection_i̇stanbul'].dt.year
sea['insp_month'] = sea['inspection_i̇stanbul'].dt.month
sea['insp_day'] = sea['inspection_i̇stanbul'].dt.day
sea['insp_dow'] = sea['inspection_i̇stanbul'].dt.day_of_week

# 6-
print("\nData types:\n", sea.dtypes)
print('\nShape: ', sea.shape)

sea.head(3)

Columns:
 ['status' 'outbound_sequence' 'vessel_name' 'imo' 'departure_port'
 'country' 'commodity' 'tonnage' 'departure' 'inspection_i̇stanbul'
 'income_group' 'flag' 'world_bank_region' 'un_region'
 'development_category']

Data types:
 status                          object
outbound_sequence                int64
vessel_name                     object
imo                              int64
departure_port                  object
country                         object
commodity                       object
tonnage                        float64
departure               datetime64[ns]
inspection_i̇stanbul    datetime64[ns]
income_group                    object
flag                            object
world_bank_region               object
un_region                       object
development_category            object
dep_year                         int64
dep_month                        int64
dep_day                          int64
dep_dow                          int64
insp_year                      float64
insp_month                     float64
insp_day                       float64
insp_dow                       float64
dtype: object

Shape:  (697, 23)


# use 'sea' from Q#40
# Objective: Now that the dataset is clean and ready for analysis 
# we can start looking into each column 
# and see what we can find !!! Exciting !!!

# 1-
# How many typs of 'status' are there for ships leaving the port and coming into the port?

# 2-
# How many unique vessel names?

# 3- 
# What are the names of departure ports?

# 4-
# How many unique countries?

# 5-
# What are the different types of commodities that are transported by the ships?

# 6-
# What are different types of income groups?

# 7-
# How many different flags are there? Is it one or more flags per country? 

# 8-
# What is the minimum and maximum tonnage for the ships?

# --- 


# 1-
# How many typs of 'status' are there for ships leaving the port and coming into the port?
x = sea['status'].unique()
xnum = sea['status'].nunique()
print("status:\n", x)
print("num:\n", xnum)


# 2-
# How many unique vessel names?
x = sea['vessel_name'].nunique()
print("\nvessel nam:\n", x)

# 3- 
# What are the names of departure ports?
x = sea['departure_port'].unique()
print("\ndeparture_port:\n", x)

# 4-
# How many unique countries?
x = sea['country'].nunique()
print("\ncountry:\n", x)

# 5-
# What are the different types of commodities that are transported by the ships?
x = sea['commodity'].unique()
print("\ncommodity:\n", x)

# 6-
# What are different types of income groups?
x = sea['income_group'].unique()
print("\nincome_group:\n", x)

# 7-
# How many different flags are there? Is it one or more flags per country? 
x = sea['flag'].nunique()
print("\nflag:", x)
print("There are fewer flags than countries.")

# 8-
# What is the minimum and maximum tonnage for the ships?
xmin = sea['tonnage'].min()
xmax = sea['tonnage'].max()

print("\nTonnage (min):", xmin, "\nTonnage (max):", xmax)

status:
 ['Outbound' 'Outbound +']
num:
 2

vessel nam:
 481

departure_port:
 ['Odesa' 'Chornomorsk' 'Yuzhny/Pivdennyi']

country:
 42

commodity:
 ['Corn' 'Sunflower meal' 'Sunflower oil' 'Soya beans' 'Wheat'
 'Sunflower seed' 'Sugar beet pellets' 'Rapeseed' 'Peas' 'Barley'
 'Wheat bran pellets' 'Canola' 'Rapeseed meal' 'Mixed' 'Sunflower pellets'
 'Soya oil' 'Vegetable oil']

income_group:
 ['upper-middle-income' 'lower-middle income' 'high-income' 'low-income']

flag: 36
There are fewer flags than countries.

Tonnage (min): 68.0 
Tonnage (max): 74500.0


# 1- 
# Create a histogram plot of tonnage. Comment

# 2-
# Create a timeseries plot of departure (on x-axis) vs. tonnage (on y-axis). Comment

# --- 

# 1-
plt.hist(sea['tonnage'])
plt.title('Ship tonnage transported')
plt.xlabel('Tons')
plt.ylabel('Count')
plt.show()

# 2-
plt.plot(sea['departure'], sea['tonnage'])
plt.title('Ship tonnage transported')
plt.xlabel('Tons')
plt.ylabel('Count')
plt.show()


# 0-
# create new dataframe 'df' that is a subset 'sea' 
# such that it has only those countrys that have more than 50 rows of data
# Hint: use groupby to get the count of rows per country. Then use only these countries to get the subset.
# temp = sea.groupby(['country'])['status'].count().reset_index().rename(columns={'status':'count'})
# temp = temp[temp['count']>50]
# subset_country_list = temp['country'].tolist()

# print shape of df

# 1- 
# Create a histogram plot of tonnage by country. Comment
# sns.histplot(data=df, x="tonnage", hue="country", element="step", fill=True)

# 2-
# Create a timeseries plot of departure (on x-axis) vs. tonnage (on y-axis) by country. Comment
# Hint: sns.lineplot(data=df, x='departure', y='tonnage', hue='country')


# --- 

# 0-
temp = sea.groupby(['country'])['status'].count().reset_index().rename(columns={'status':'count'})
temp = temp[temp['count']>50]
subset_country_list = temp['country'].tolist()
print('subset_country_list:', subset_country_list)

df = sea[sea['country'].isin(subset_country_list)]
print('Shape:', df.shape)

# 1- 
sns.histplot(data=df, x="tonnage", hue="country", element="step", fill=True)
plt.show()

# 2-
sns.lineplot(data=df, x='departure', y='tonnage', hue='country')
plt.show()

subset_country_list: ['China', 'Italy', 'Spain', 'Türkiye']
Shape: (415, 23)


# 1- 
# Groupby on 'status' to find total 'tonnage' for each category

# Step-1: peroform groupby
# Step-2: reset index
# Step-3: rename columns
# Step-4: show ouput


### Option -1 
# Step-1: peroform groupby sum
df = sea.groupby(['status'])['tonnage'].sum()
# Step-2: reset index
df = df.reset_index()
# Step-3: rename columns
df = df.rename(columns = {'tonnage':'tonnage_sum'})

### Option -3
# Step-1 to 3: peroform groupby sum, reset index and rename in same line
#df = sea.groupby(['status'])['tonnage'].sum().reset_index().rename(columns = {'tonnage':'tonnage_sum'})

# Step-4: show ouput
print(df)

# Using steps above calculate mean tonnage instead of sum

# --- write code below this line ---


df = sea.groupby(['status'])['tonnage'].mean().reset_index().rename(columns = {'tonnage':'tonnage_mean'})
df

       status  tonnage_sum
0    Outbound   15263021.0
1  Outbound +    1071644.0


# 1- calculate the total tonnage for each vessel name and save in dataframe 'df'. 
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row). Then, subset top 25 rows
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment

# --- 

# 1-
df = sea.groupby(['vessel_name'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})

# 2-
df = df.sort_values(by='tonnage_sum', ascending=False)
df = df.head(25)

# 3-
plt.figure(figsize=(10,5))
plt.barh(df['vessel_name'], df['tonnage_sum'])
plt.title('Tonnage by vessel name')
plt.xlabel('Total tonnage')
plt.ylabel('Vessel name')
plt.tight_layout()
plt.show()


# 1- calculate the total tonnage for each departure_port and save in dataframe 'df'. 
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row).
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment

# --- 

# 1-
df = sea.groupby(['departure_port'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})

# 2-
df = df.sort_values(by='tonnage_sum', ascending=True)


# 3-
plt.figure(figsize=(5,3))
plt.barh(df['departure_port'], df['tonnage_sum'])
plt.title('Tonnage by departure port')
plt.xlabel('Total tonnage')
plt.ylabel('Departure port')
plt.tight_layout()
plt.show()


# 1- calculate the total tonnage for each country and save in dataframe 'df'. 
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row). Then, subset top 25 rows
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment

# --- 

# 1-
df = sea.groupby(['country'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})

# 2-
df = df.sort_values(by='tonnage_sum', ascending=False)
df = df.head(25)

# 3-
plt.figure(figsize=(10,5))
plt.barh(df['country'], df['tonnage_sum'])
plt.title('Tonnage by country')
plt.xlabel('Total tonnage')
plt.ylabel('Country')
plt.tight_layout()
plt.show()


# 1- calculate the total tonnage for each commodity and save in dataframe 'df'. 
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row).
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment

# --- 

# 1-
df = sea.groupby(['commodity'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})

# 2-
df = df.sort_values(by='tonnage_sum', ascending=False)


# 3-
plt.figure(figsize=(10,5))
plt.barh(df['commodity'], df['tonnage_sum'])
plt.title('Tonnage by commodity')
plt.xlabel('Total tonnage')
plt.ylabel('Commodity')
plt.tight_layout()
plt.show()


# 1- calculate the total tonnage for each commodity and save in dataframe 'df'. 
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row).
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment

# --- 

# 1-
df = sea.groupby(['income_group'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})

# 2-
df = df.sort_values(by='tonnage_sum', ascending=False)


# 3-
plt.figure(figsize=(10,5))
plt.barh(df['income_group'], df['tonnage_sum'])
plt.title('Tonnage by income group')
plt.xlabel('Total tonnage')
plt.ylabel('Income group')
plt.tight_layout()
plt.show()


# 1- calculate the total tonnage for each flag and save in dataframe 'df'. 
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row). Then, subset top 25 rows
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment

# --- write code below this line ---

# 1-
df = sea.groupby(['flag'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})

# 2-
df = df.sort_values(by='tonnage_sum', ascending=False)
df = df.head(25)

# 3-
plt.figure(figsize=(10,5))
plt.barh(df['flag'], df['tonnage_sum'])
plt.title('Tonnage by flag')
plt.xlabel('Total tonnage')
plt.ylabel('Flag')
plt.tight_layout()
plt.show()


# 1- calculate the total tonnage for each 'dep_month' and save in dataframe 'df'. 
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row). 
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment

# --- 

# 1-
df = sea.groupby(['dep_month'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})

# 2-
df['dep_month'] = df['dep_month'].astype(str)
df = df.sort_values(by='tonnage_sum', ascending=False)


# 3-
plt.figure(figsize=(10,5))
plt.barh(df['dep_month'], df['tonnage_sum'])
plt.title('Tonnage by month')
plt.xlabel('Total tonnage')
plt.ylabel('Departure month')
plt.tight_layout()
plt.show()


# 1- calculate the total tonnage for each 'dep_day' and save in dataframe 'df'. 
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row). 
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment

# --- 

# 1-
df = sea.groupby(['dep_day'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})

# 2-
df['dep_day'] = df['dep_day'].astype(str)
df = df.sort_values(by='tonnage_sum', ascending=False)


# 3-
plt.figure(figsize=(10,5))
plt.barh(df['dep_day'], df['tonnage_sum'])
plt.title('Tonnage by day')
plt.xlabel('Total tonnage')
plt.ylabel('Departure day')
plt.tight_layout()
plt.show()


# 1- calculate the total tonnage for each 'dep_dow' and save in dataframe 'df'. 
# Rename column with tonnage sum values .as 'tonnage_sum'
# 2- sort df by tonnage_sum high (first row) and low (last row). 
# 3- create a bar plot: (x-axis) vessel name, (y-axis) total tonnage
# 4- comment

# --- 

# 1-
df = sea.groupby(['dep_dow'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})

# 2-
df['dep_dow'] = df['dep_dow'].astype(str)
df = df.sort_values(by='tonnage_sum', ascending=False)


# 3-
plt.figure(figsize=(10,5))
plt.barh(df['dep_dow'], df['tonnage_sum'])
plt.title('Tonnage by day')
plt.xlabel('Total tonnage')
plt.ylabel('Departure day of week')
plt.tight_layout()
plt.show()


# 1-
# Create a function 'get_list' that 
# - takes three input arguments: (1) dfx, (2) col_var, (3) threshold=5
# - returns a list of unique values from a col_var

# Use threshold=5 to keep only those values 
# from col_var that have a count greater than 5

# Hint:
def get_list(input_x, input_y):
    xy_list = [input_x, input_y]
    return xy_list

z = get_list(1, 2)
print('z:', z, '\n\n')

# 2-
# run: 
# z = get_counts(sea, 'vessel_name')

# --- 

# 1-
def get_list(dfx, col_var, threshold=5):
    # Groupby
    temp = dfx.groupby([col_var])['tonnage'].count().reset_index().rename(columns={'tonnage':'count'})
    # Sort
    temp = temp.sort_values(by='count', ascending=False)
    # Subset
    temp = temp[temp['count'] >= threshold]
    # List
    temp_list = temp[col_var].tolist()
    
    return temp_list

# 2-
z = get_list(sea, 'vessel_name')
z

z: [1, 2]

['MAVKA', 'INOI', 'FULMAR S', 'ENEIDA']


# 1-
# Create a function 'get_mean_sd' that 
# A- takes three input arguments: (1) dfx, (2) col_var
# B- subsets dfx with only those categories found in 'category_list'. 
# Get category_list by calling function get_list() from Q-53 above
# For example: in case of col_var = 'vessel_name'
# only rows where the column 'vessel_name' has values ['MAVKA', 'INOI', 'FULMAR S', 'ENEIDA'] would be used

# C- Calculates the 'mean' and 'sd' for the col var

# D- calculates a dataframe with columns:
# - the name in col_var
# - mean
# - sd

# E - from function return dataframe with upto top 25 rows only


# 2-
# run:
# df = get_mean_sd(sea, 'vessel_name')


# --- 

# 1-
def get_mean_sd(dfx, col_var):
    
    # 1A-
    # Get category list
    category_list = get_list(dfx, col_var)
    
    # Subset
    dfx = dfx[dfx[col_var].isin(category_list)]

    # mean and sd
    df = dfx.groupby([col_var])['tonnage'].mean().reset_index().rename(columns={'tonnage':'mean'})
    df['sd'] = dfx.groupby([col_var])['tonnage'].std().reset_index(drop=True)

    # convert to string
    df[col_var] = df[col_var].astype(str)
    
    # sort
    df = df.sort_values(by='mean', ascending=False)
    df = df.head(25)

    return df

# 2-
df = get_mean_sd(sea, 'vessel_name')
df


# 1-
# Create a function get_plot() 
# that takes two input arguments: 
# (1) dfx, 
# (2) col_var to be plotted, convert it to string
# (3) plot title, 
# (4) label for x-axis, 
# (5) label for y-axis
# creates a bar plot of mean values with sd error bars



# 2-
# run using df output from Q#54:
# get_plot(df, col_var='vessel_name', 
#     'Tonnage by vessel name', 
#      'Tonnage', 
#      'Vessel name')

# --- 

def get_plot(dfx, col_var, title, xlabel, ylabel):
    
    # convert to string
    dfx[col_var] = dfx[col_var].astype(str)
    
    # plot
    plt.figure(figsize=(10,5))
    plt.barh(dfx[col_var], dfx['mean'])

    plt.errorbar(y=dfx[col_var], x=dfx['mean'], 
                 xerr=dfx['sd'], 
                 linestyle='', capsize=5, ecolor='red')

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.tight_layout()
    plt.show()

    
# 2-
get_plot(df, 'vessel_name', 
         'Tonnage by vessel name', 
         'Tonnage (mean+/-sd)',
         'Vessel name')


# 1- Using the function 'get_mean_sd' from Q#54 
# create dataframes below with col_var given 
# (such as 'vessel_name', 'departure_port' etc.)

# 1A= vessel_name -> as 'df_vessel_name'
# 1B= departure_port -> as 'df_dep_port'
# 1C= country -> as 'df_country'
# 1D= commodity -> as 'df_commodity'
# 1E= income_group -> as 'df_income_group'
# 1F= flag -> as 'df_flag'
# 1G= dep_month -> as 'df_dep_month'
# 1H= dep_day -> as 'df_dep_day'
# 1I= dep_dow -> as 'df_dep_dow'



# --- 


# 1A= vessel_name -> as 'df_vessel_name'
df_vessel_name = get_mean_sd(dfx = sea, col_var = 'vessel_name')
get_plot(df_vessel_name, 'vessel_name', 
         title='Tonnage by vessel name', 
         xlabel='Tonnage (mean+/-sd)',
         ylabel='Vessel name')  
plt.show()
print('\n')

# 1B= departure_port -> as 'df_dep_port'
df_dep_port = get_mean_sd(dfx = sea, col_var = 'departure_port')
get_plot(df_dep_port, 'departure_port', 
         title='Tonnage by departure port', 
         xlabel='Tonnage (mean+/-sd)',
         ylabel='Departure port')  
plt.show()
print('\n')

# 1C= country -> as 'df_country'
df_country = get_mean_sd(dfx = sea, col_var = 'country')
get_plot(df_country, 'country', 
         title='Tonnage by country', 
         xlabel='Tonnage (mean+/-sd)',
         ylabel='Country')  
plt.show()
print('\n')

# 1D= commodity -> as 'df_commodity'
df_commodity = get_mean_sd(dfx = sea, col_var = 'commodity')
get_plot(df_commodity, 'commodity', 
         title='Tonnage by commodity', 
         xlabel='Tonnage (mean+/-sd)',
         ylabel='Commodity')  
plt.show()
print('\n')

# 1E= income_group -> as 'df_income_group'
df_income_group = get_mean_sd(dfx = sea, col_var = 'income_group')
get_plot(df_income_group, 'income_group', 
         title='Tonnage by income group', 
         xlabel='Tonnage (mean+/-sd)',
         ylabel='Income group')  
plt.show()
print('\n')


# 1F= flag -> as 'df_flag'
df_flag = get_mean_sd(dfx = sea, col_var = 'flag')
get_plot(df_flag, 'flag', 
         title='Tonnage by flag', 
         xlabel='Tonnage (mean+/-sd)',
         ylabel='Flag')  
plt.show()
print('\n')

# 1G= dep_month -> as 'df_dep_month'
df_dep_month = get_mean_sd(dfx = sea, col_var = 'dep_month')
get_plot(df_dep_month, 'dep_month', 
         title='Tonnage by departure month', 
         xlabel='Tonnage (mean+/-sd)',
         ylabel='Departure month')  
plt.show()
print('\n')

# 1H= dep_day -> as 'df_dep_day'
df_dep_day = get_mean_sd(dfx = sea, col_var = 'dep_day')
get_plot(df_dep_day, 'dep_day', 
         title='Tonnage by departure day', 
         xlabel='Tonnage (mean+/-sd)',
         ylabel='Departure day')  
plt.show()
print('\n')

# 1I= dep_dow -> as 'df_dep_dow'
df_dep_dow = get_mean_sd(dfx = sea, col_var = 'dep_dow')
get_plot(df_dep_dow, 'dep_dow', 
         title='Tonnage by departure day of week', 
         xlabel='Tonnage (mean+/-sd)',
         ylabel='Departure day of week')  
plt.show()
print('\n')


# 1- 
# Which country is the main source of crop corn? i.e. total corn is highest?

# 2-
# What percentage of total does it represent?

# --- 

# 1-
df = sea[sea['commodity']=='Corn']
df = df.groupby(['country'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})
df = df.sort_values(by='tonnage_sum', ascending=False)

# 2-
df['percent'] = round(100.0*df['tonnage_sum']/df['tonnage_sum'].sum())

df


# 1- 
# What commodity does the country India transport? What is the total tonnage? 

# 2- What percent of total does it represent?

# --- 

# 1-
df = sea[sea['country']=='India'].groupby(['commodity'])['tonnage'].sum()
print(df)

# 2-
df = sea[sea['commodity']=='Sunflower oil']
df = df.groupby(['country'])['tonnage'].sum().reset_index().rename(columns={'tonnage':'tonnage_sum'})
df = df.sort_values(by='tonnage_sum', ascending=False)

# 
df['percent'] = round(100.0*df['tonnage_sum']/df['tonnage_sum'].sum())
df

commodity
Sunflower oil    350110.0
Name: tonnage, dtype: float64


# 1- 
# What is the commodity with highest number of vessel names?

# 2- 
# What is the commodity with least number of vessel names?

# --- 

# 1-
df = sea.groupby(['commodity'])['vessel_name'].count().reset_index().rename(columns={'vessel_name':'vessel_count'})
df = df.sort_values('vessel_count', ascending=False)
df


# 1- 
# What is the departure port sees highest number of commodity types?

# 2- 
# What is the departure port sees the least number of commodity types?

# ---

# 1-
df = sea.groupby(['departure_port'])['commodity'].count().reset_index().rename(columns={'commodity':'commodity_count'})
df = df.sort_values('commodity_count', ascending=False)
df


# 1- 
# Is there a commodity that is shipped through only one departure_port?

# 2- What are those ports?


# --- 

# 1-
df = sea.groupby(['commodity'])['departure_port'].count().reset_index().rename(columns={'departure_port':'departure_port_count'})
df = df.sort_values('departure_port_count', ascending=True)
xlist = df[df['departure_port_count']==1]['commodity'].tolist()
print(xlist)

# 2-
df = sea[sea['commodity'].isin(xlist)]['departure_port'].unique()
df

['Wheat bran pellets', 'Vegetable oil', 'Mixed', 'Sugar beet pellets']

array(['Chornomorsk'], dtype=object)

	vessel_name	mean	sd
0	ENEIDA	26249.6	16337.863272
2	INOI	12346.8	5060.280743
3	MAVKA	9890.0	3330.427900
1	FULMAR S	8227.0	4045.972689

	country	tonnage_sum	percent
1	China	1694236.0	23.0
14	Spain	1355828.0	18.0
8	Italy	848262.0	11.0
17	Türkiye	629172.0	8.0
15	The Netherlands	559178.0	8.0
2	Egypt	398031.0	5.0
7	Israel	343520.0	5.0
12	Republic of Korea	246720.0	3.0
10	Libya	217300.0	3.0
13	Romania	203085.0	3.0
3	Germany	194260.0	3.0
16	Tunisia	176398.0	2.0
0	Belgium	158395.0	2.0
11	Portugal	138180.0	2.0
5	Iran	126234.0	2.0
9	Lebanon	54457.0	1.0
4	Greece	46346.0	1.0
6	Ireland	33000.0	0.0

	country	tonnage_sum	percent
5	India	350110.0	36.0
15	Türkiye	160646.0	17.0
1	China	142750.0	15.0
7	Italy	73899.0	8.0
11	Romania	47200.0	5.0
13	Spain	42797.0	4.0
6	Iraq	33000.0	3.0
10	Oman	32150.0	3.0
17	United Kingdom	18515.0	2.0
16	United Arab Emirates	11900.0	1.0
14	The Netherlands	10200.0	1.0
0	Bulgaria	9701.0	1.0
4	France	9000.0	1.0
2	Djibouti	6200.0	1.0
12	Saudi Arabia	4000.0	0.0
9	Malaysia	4000.0	0.0
3	Egypt	3100.0	0.0
8	Lebanon	2973.0	0.0

Black Sea Grain Initiative Vessel Movements: 2022-2023¶

Import libraries¶

Read data¶

Data preprocessing¶

Feature engineering¶

Data exploration¶

Analysis¶

	Status	Outbound Sequence	Vessel name	IMO	Departure port	Country	Commodity	Tonnage	Departure	Inspection İstanbul	Income group	Flag	World Bank region	UN region	Development category
0	Outbound	1	RAZONI	9086526	Odesa	Türkiye	Corn	1,527	31-Jul-22	02-Aug-22	upper-middle-income	Sierra Leone	Europe & Central Asia	Asia-Pacific	developing
1	Outbound +	1	RAZONI	9086526	Odesa	Egypt	Corn	25,000	31-Jul-22	02-Aug-22	lower-middle income	Sierra Leone	Middle East & North Africa	Africa	developing

	commodity	vessel_count
2	Corn	231
15	Wheat	192
11	Sunflower oil	83
10	Sunflower meal	44
7	Soya beans	41
0	Barley	40
5	Rapeseed	26
13	Sunflower seed	18
4	Peas	8
6	Rapeseed meal	3
1	Canola	3
12	Sunflower pellets	2
8	Soya oil	2
9	Sugar beet pellets	1
14	Vegetable oil	1
3	Mixed	1
16	Wheat bran pellets	1

	status	tonnage_mean
0	Outbound	24737.473258
1	Outbound +	13395.550000

	departure_port	commodity_count
0	Chornomorsk	285
1	Odesa	222
2	Yuzhny/Pivdennyi	190