GIS with and
Part III: Data Munging...Combining GIS with Other Tools¶
Set-up our environment as before¶
Let's import the packages we will use and set the paths for outputs.
# Let's import pandas and some other basic packages we will use
from __future__ import division
import pandas as pd
import numpy as np
import os, sys
# GIS packages
import geopandas as gpd
from geopandas.tools import overlay
from shapely.geometry import Polygon, Point
import georasters as gr
# Alias for Geopandas
gp = gpd
# Plotting
import matplotlib as mpl
import seaborn as sns
# Setup seaborn
sns.set()
# Mapping
import geoplot as gplt
import geoplot.crs as gcrs
import mapclassify as mc
import textwrap
%pylab --no-import-all
%matplotlib inline
# Functions for plotting
def center_wrap(text, cwidth=32, **kw):
'''Center Text (to be used in legend)'''
lines = text
#lines = textwrap.wrap(text, **kw)
return "\n".join(line.center(cwidth) for line in lines)
def MyChoropleth(mydf, myfile='', myvar='',
mylegend='',
k=5,
extent=[-180, -90, 180, 90],
bbox_to_anchor=(0.2, 0.5),
edgecolor='white', facecolor='lightgray',
scheme='FisherJenks', bins=None, pct=None,
legend_labels=None,
save=True,
percent=False,
cmap='Reds',
**kwargs):
# Chloropleth
# Color scheme
if scheme=='EqualInterval':
scheme = mc.EqualInterval(mydf[myvar], k=k)
elif scheme=='Quantiles':
scheme = mc.Quantiles(mydf[myvar], k=k)
elif scheme=='BoxPlot':
scheme = mc.BoxPlot(mydf[myvar], k=k)
elif scheme=='FisherJenks':
scheme = mc.FisherJenks(mydf[myvar], k=k)
elif scheme=='FisherJenksSampled':
scheme = mc.FisherJenksSampled(mydf[myvar], k=k)
elif scheme=='HeadTailBreaks':
scheme = mc.HeadTailBreaks(mydf[myvar], k=k)
elif scheme=='JenksCaspall':
scheme = mc.JenksCaspall(mydf[myvar], k=k)
elif scheme=='JenksCaspallForced':
scheme = mc.JenksCaspallForced(mydf[myvar], k=k)
elif scheme=='JenksCaspallSampled':
scheme = mc.JenksCaspallSampled(mydf[myvar], k=k)
elif scheme=='KClassifiers':
scheme = mc.KClassifiers(mydf[myvar], k=k)
elif scheme=='Percentiles':
scheme = mc.Percentiles(mydf[myvar], pct=pct)
elif scheme=='UserDefined':
scheme = mc.UserDefined(mydf[myvar], bins=bins)
if legend_labels is None:
# Format legend
upper_bounds = scheme.bins
# get and format all bounds
bounds = []
for index, upper_bound in enumerate(upper_bounds):
if index == 0:
lower_bound = mydf[myvar].min()
else:
lower_bound = upper_bounds[index-1]
# format the numerical legend here
if percent:
bound = f'{lower_bound:.0%} - {upper_bound:.0%}'
else:
bound = f'{float(lower_bound):,.0f} - {float(upper_bound):,.0f}'
bounds.append(bound)
legend_labels = bounds
#Plot
ax = gplt.choropleth(
mydf, hue=myvar, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
edgecolor='white', linewidth=1,
cmap=cmap, legend=True,
scheme=scheme,
legend_kwargs={'bbox_to_anchor': bbox_to_anchor,
'frameon': True,
'title':mylegend,
},
legend_labels = legend_labels,
figsize=(24, 16),
rasterized=True,
)
gplt.polyplot(
countries, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
edgecolor=edgecolor, facecolor=facecolor,
ax=ax,
rasterized=True,
extent=extent,
)
if save:
plt.savefig(pathgraphs + myfile + '_' + myvar +'.pdf', dpi=300, bbox_inches='tight')
plt.savefig(pathgraphs + myfile + '_' + myvar +'.png', dpi=300, bbox_inches='tight')
pass
# Paths
pathout = './data/'
if not os.path.exists(pathout):
os.mkdir(pathout)
pathgraphs = './graphs/'
if not os.path.exists(pathgraphs):
os.mkdir(pathgraphs)
Let's plot the countries for which Colombian citizens do not require visas¶
The Colombian Cancillery's website has a list with visa requirements for colombians. Let's use it to map countries for which visas are not required. Below is the link to the information. The problem is that it is a pdf file. Let's open the website and check it out
# Import display options for showing websites
from IPython.display import IFrame
url = 'https://www.cancilleria.gov.co/sites/default/files/FOTOS2020/relacion_de_paises_que_exigen_o_no_visas_a_colombianos_17-04-2020.pdf'
IFrame(url, width=800, height=400)
# Import package for downloading internet content and save it to file
import requests
url = 'https://www.cancilleria.gov.co/sites/default/files/FOTOS2020/relacion_de_paises_que_exigen_o_no_visas_a_colombianos_17-04-2020.pdf'
response = requests.get(url)
with open(pathout + 'visas.pdf', 'wb') as f:
f.write(response.content)
# Import package to read pdf tables
import camelot
visas = camelot.read_pdf(pathout + 'visas.pdf', pages='1-7')
Let's explore the visas object
visas
So there are 7 tables in visas. What does Table 1 have?
visas[0]
visas[0].df
Ok, let's concatenate all these pandas
dataframes.
visadf = pd.concat([i.df for i in visas]).reset_index(drop=True)
visadf
We need to correct the header¶
visadf.columns = visadf.iloc[5]
visadf.head(10)
visadf = visadf.iloc[6:].copy()
visadf.columns.name = ''
visadf.head(10)
Let's code SI (YES) as 1 and NO as 0
visadf['visa_req'] = visadf.SI.map({'X':1, '':0})
Let's check whether things were mapped correctly
visadf.loc[visadf.visa_req.isna()]
IFrame(url, width=800, height=400)
visadf.loc[(visadf.SI=='X X') | (visadf.SI.shift(1)=='X X') | (visadf.SI.shift(-1)=='X X')]
visadf.loc[(visadf.SI=='X X X') | (visadf.SI.shift(1)=='X X X') | (visadf.SI.shift(-1)=='X X X')]
Ok it seems we have two types of errors. First, notince that sometimes the type of visa is defined, e.g., Azerbayán. Second, the OCR software has mixed some rows, so that now we have XX, XXX, etc. Looking at the pdf it seems this is due to assigning an X from a previous row to the current row ("X X") or from both the previous and next ("X X X"). Let's try to correct these errors programatically (obviously sometimes it may just be faster and better to export the dataframe, correct it by hand snd then load the corrected one, but we're here to learn, right?).
First, let's replace the repeated X with what seems to be the correct data.
X X
visadf.loc[(visadf.SI=='X X') | (visadf.SI.shift(-1)=='X X'), 'visa_req'] = 1
visadf.loc[(visadf.SI=='X X') | (visadf.SI.shift(-1)=='X X')]
X X X
visadf.loc[(visadf.SI=='X X X') | (visadf.SI.shift(1)=='X X X') | (visadf.SI.shift(-1)=='X X X'), 'visa_req'] =1
visadf.loc[(visadf.SI=='X X X') | (visadf.SI.shift(1)=='X X X') | (visadf.SI.shift(-1)=='X X X')]
X X X X
visadf.loc[(visadf.SI=='X X X X') | (visadf.SI.shift(1)=='X X X X') | (visadf.SI.shift(-1)=='X X X X') | (visadf.SI.shift(2)=='X X X X') | (visadf.SI.shift(-2)=='X X X X') | (visadf.SI.shift(-3)=='X X X X')]
visadf.loc[(visadf.SI=='X X X X') | (visadf.SI.shift(1)=='X X X X') | (visadf.SI.shift(-1)=='X X X X') | (visadf.SI.shift(-2)=='X X X X'), 'visa_req'] = 1
visadf.loc[(visadf.SI=='X X X X') | (visadf.SI.shift(1)=='X X X X') | (visadf.SI.shift(-1)=='X X X X') | (visadf.SI.shift(-2)=='X X X X')]
X X X X X
visadf.loc[(visadf.SI=='X X X X X') | (visadf.SI.shift(1)=='X X X X X') | (visadf.SI.shift(-1)=='X X X X X') | (visadf.SI.shift(-2)=='X X X X X') | (visadf.SI.shift(2)=='X X X X X')]
visadf.loc[(visadf.SI=='X X X X X') | (visadf.SI.shift(1)=='X X X X X') | (visadf.SI.shift(-1)=='X X X X X') | (visadf.SI.shift(-2)=='X X X X X') | (visadf.SI.shift(2)=='X X X X X'), 'visa_req'] = 1
visadf.loc[(visadf.SI=='X X X X X') | (visadf.SI.shift(1)=='X X X X X') | (visadf.SI.shift(-1)=='X X X X X') | (visadf.SI.shift(-2)=='X X X X X') | (visadf.SI.shift(2)=='X X X X X')]
X X X X X X
visadf.loc[(visadf.SI=='X X X X X X') | (visadf.SI.shift(1)=='X X X X X X') | (visadf.SI.shift(-1)=='X X X X X X') | (visadf.SI.shift(-2)=='X X X X X X') | (visadf.SI.shift(2)=='X X X X X X') | (visadf.SI.shift(-3)=='X X X X X X') | (visadf.SI.shift(3)=='X X X X X X')]
visadf.loc[(visadf.SI=='X X X X X X') | (visadf.SI.shift(1)=='X X X X X X') | (visadf.SI.shift(-1)=='X X X X X X') | (visadf.SI.shift(-2)=='X X X X X X') | (visadf.SI.shift(2)=='X X X X X X') | (visadf.SI.shift(-3)=='X X X X X X'), 'visa_req'] = 1
visadf.loc[(visadf.SI=='X X X X X X') | (visadf.SI.shift(1)=='X X X X X X') | (visadf.SI.shift(-1)=='X X X X X X') | (visadf.SI.shift(-2)=='X X X X X X') | (visadf.SI.shift(2)=='X X X X X X') | (visadf.SI.shift(-3)=='X X X X X X')]
Let's also replace visa required for any row that has the word "visa".
visadf.loc[visadf.SI.str.lower().str.find('visa')!=-1]
visadf.loc[visadf.SI.str.lower().str.find('visa')!=-1, 'visa_req'] = 1
visadf.loc[visadf.SI.str.lower().str.find('visa')!=-1]
Let's check again
visadf.loc[visadf.visa_req.isna()]
Ok, it seems we have coded which countries need and which do not need visa for colombian citizens. Let's analyze this data a bit.
visadf['visa_req_YN'] = visadf.visa_req.map({0:'NO', 1:'YES'})
visadf
visadf.hist()
visadf.visa_req.describe()
df = visadf.groupby('visa_req_YN').count().reset_index()
df
sns.set(rc={'figure.figsize':(11.7,8.27)})
#sns.reset_orig()
sns.set_context("talk")
# Plot
fig, ax = plt.subplots()
sns.barplot(x='visa_req_YN', y='visa_req', data=df, alpha=1)
ax.tick_params(axis = 'both', which = 'major')
ax.tick_params(axis = 'both', which = 'minor')
ax.set_xlabel('Visa Required')
ax.set_ylabel('Number of Countries')
Let's try to map these countries. First let's get the Natural Earth shapefile.
import requests
import io
#headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}
url = 'https://naturalearth.s3.amazonaws.com/10m_cultural/ne_10m_admin_0_countries.zip'
r = requests.get(url, headers=headers)
countries = gp.read_file(io.BytesIO(r.content))
#countries = gpd.read_file('https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_0_countries.zip')
countries
Luckily there are country names in Spanish. Let's see if we can merge these two data sets.
countries.NAME_ES
col_visa = countries.merge(visadf, left_on='NAME_ES', right_on='PAIS')
cmap = mpl.colors.ListedColormap(['blue', 'red'])
mylegend = center_wrap(["Visa Requirements", "For Colombian Citizens"], cwidth=32, width=32)
MyChoropleth(mydf=col_visa, myfile='col_visa', myvar='visa_req', mylegend=mylegend, k=1, bbox_to_anchor=(0.25, 0.3),
edgecolor='white', facecolor='lightgray', cmap=cmap, scheme='UserDefined', bins=[0,1], legend_labels=['NO', 'YES'],
save=False)
So it seems not everything merged correctly
col_visa.shape
visadf.shape
col_visa.loc[col_visa.visa_req.isna(), 'NAME_ES'].sort_values()
So we are not linking all countries. This is usually due to symbols like accents and ~, but in this case also because the tail of the data frame includes territories of countries, so their names are non-standard (and OCR may have made some mistakes).
visadf.tail(25)
Let's correct the country names to improve matching. It's always a good practice to keep the original names.
visadf['PAIS_OR'] = visadf.PAIS
visadf.loc[visadf.PAIS.str.find('(')!=-1, 'PAIS'] = visadf.loc[visadf.PAIS_OR.str.find('(')!=-1, 'PAIS_OR'].apply(lambda x: x[:x.find('(')])
visadf.PAIS = visadf.PAIS.str.strip()
visadf.tail(30)
col_visa = countries.merge(visadf, left_on='NAME_ES', right_on='PAIS')
cmap = mpl.colors.ListedColormap(['blue', 'red'])
mylegend = center_wrap(["Visa Requirements", "For Colombian Citizens"], cwidth=32, width=32)
MyChoropleth(mydf=col_visa, myfile='col_visa', myvar='visa_req', mylegend=mylegend, k=1, bbox_to_anchor=(0.25, 0.3),
edgecolor='white', facecolor='lightgray', cmap=cmap, scheme='UserDefined', bins=[0,1], legend_labels=['NO', 'YES'],
save=False)
col_visa.shape
Ok, that helped a bit. Let's see what else is different. Let's start by finding which countries are not linked.
miss_countries = list(set(countries.NAME_ES).difference(col_visa.NAME_ES))
#miss_countries.remove(None)
#miss_countries.sort()
miss_visadf = list(set(visadf.PAIS).difference(col_visa.PAIS))
miss_visadf.remove('')
miss_visadf.sort()
print('Misssing countries', miss_countries)
print('')
print('Missing PAIS', miss_visadf)
Let's choose one example to see why/how they differ
countries.loc[countries.NAME_ES.str.find('Congo')!=-1, 'NAME_ES']
visadf.loc[visadf.PAIS.str.find('Congo')!=-1, 'PAIS']
OK, so not an easy fix. We can correct by hand the missing ones or perhaps if we can find a way of linking for each missing country in one dataframe the most similar country in the other we may be able to simplify our work. If you google for help you will find e.g., that the package difflib
can help.
# Import package to match text
import difflib
Let's create a dataframe to keep the matches we create between the country name in countries
and visadf
.
matches = pd.DataFrame(miss_countries, columns=['countries'])
matches = matches.loc[matches.countries.isna()==False].reset_index(drop=True).copy()
matches
Now, let's use the difflib.get_close_matches
function to find the closest match to each country name in countries
to visadf
.
matches['visadf'] = matches.countries.apply(lambda x: difflib.get_close_matches(x, miss_visadf, cutoff=0.8))
matches.loc[matches.visadf.apply(lambda x: x!=[])]
So it works! Of course now we need to improve matches and try to find as many as we can so we do not have to do it by hand. One way to do it is to keep the correct matches and decrease the cutoff required for a match.
matches.loc[matches.visadf.apply(lambda x: x!=[] and len(x)==1), 'k'] = 0.8
matches.loc[matches.visadf.apply(lambda x: x!=[] and len(x)==1), 'visadf_matched'] = matches.loc[matches.visadf.apply(lambda x: x!=[] and len(x)==1), 'visadf'].apply(lambda x: x[0])
matches.loc[matches.visadf.apply(lambda x: x!=[] and len(x)==1)]
for k in np.arange(0.9,0.1,-0.025):
if matches.visadf_matched.isna().sum()!=0:
print(k)
matches['visadf'] = matches.countries.apply(lambda x: difflib.get_close_matches(x, miss_visadf, cutoff=k))
matches.loc[(matches.visadf.apply(lambda x: x!=[] and len(x)==1)) & (matches.visadf_matched.isna()), 'k'] = k
matches.loc[(matches.visadf.apply(lambda x: x!=[] and len(x)==1)) & (matches.visadf_matched.isna()), 'visadf_matched'] = matches.loc[(matches.visadf.apply(lambda x: x!=[] and len(x)==1)) & (matches.visadf_matched.isna()), 'visadf'].apply(lambda x: x[0])
matches
matches.sort_values('k', ascending=False)
Let's create the opposite match
matches2 = pd.DataFrame(miss_visadf, columns=['visadf'])
matches2 = matches2.loc[matches2.visadf.isna()==False].reset_index(drop=True).copy()
matches2['countries'] = matches2.visadf.apply(lambda x: difflib.get_close_matches(x, miss_countries, cutoff=0.9))
matches2.loc[matches2.countries.apply(lambda x: x!=[] and len(x)==1), 'k'] = 0.8
matches2.loc[matches2.countries.apply(lambda x: x!=[] and len(x)==1), 'countries_matched'] = matches2.loc[matches2.countries.apply(lambda x: x!=[] and len(x)==1), 'countries'].apply(lambda x: x[0])
for k in np.arange(0.9,0.1,-0.025):
if matches2.countries_matched.isna().sum()!=0:
print(k)
matches2['countries'] = matches2.visadf.apply(lambda x: difflib.get_close_matches(x, miss_countries, cutoff=k))
matches2.loc[(matches2.countries.apply(lambda x: x!=[] and len(x)==1)) & (matches2.countries_matched.isna()), 'k'] = k
matches2.loc[(matches2.countries.apply(lambda x: x!=[] and len(x)==1)) & (matches2.countries_matched.isna()), 'countries_matched'] = matches2.loc[(matches2.countries.apply(lambda x: x!=[] and len(x)==1)) & (matches2.countries_matched.isna()), 'countries'].apply(lambda x: x[0])
matches2
Clearly, this still will need some work...some were linked correctly, others not (although the correct ones seem to be in the list countries
) and still there are some that do not seem to be there at all! This is partly due to the fact that the Natural Earth shapefile does not seem to have some countries (e.g., Bonaire, Sint Eustatius, Saba, Reunion). Given the missing locations in countries
it may be easier to use matches2
to finish the matching.
namecols = ['SOVEREIGNT', 'NAME_ES'] + [col for col in countries.columns if col.find('NAME')!=-1]
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('bonaire').any(), axis=1), namecols]
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('eust').any(), axis=1), namecols]
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('sab').any(), axis=1), namecols]
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('reun').any(), axis=1), namecols]
Or have different writing/names (e.g., Myanmar, Swaziland) or because the Spanish name used by the Colombian Cancillery is non-standard (e.g., Santa Sede vs Vaticano)
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('myan').any(), axis=1), namecols]
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('swazi').any(), axis=1), namecols]
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('vatic').any(), axis=1), namecols]
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('china').any(), axis=1), namecols]
Let's try to see how goodf the best macthes are
matches2.sort_values('k', ascending=False)
Seems we won't be able to improve, so let's finish by hand (using code of course, since we want replicability of our results).
# Correct matches2
matches2.loc[matches2.visadf=='Suazilandia', 'countries_matched'] = 'eSwatini'
matches2.loc[matches2.visadf=='Laos República Democrática P', 'countries_matched'] = 'Laos'
matches2.loc[matches2.visadf=='Corea República Popular Dem.', 'countries_matched'] = 'Corea del Norte'
matches2.loc[matches2.visadf=='Corea República', 'countries_matched'] = 'Corea del Sur'
matches2.loc[matches2.visadf=='Martinica', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Santa Sede', 'countries_matched'] = 'Ciudad del Vaticano'
matches2.loc[matches2.visadf=='Bonaire', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Myanmar', 'countries_matched'] = 'Birmania'
matches2.loc[matches2.visadf=='Rusia Federación', 'countries_matched'] = 'Rusia'
matches2.loc[matches2.visadf=='Réunion', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Mayotte', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Reino Unido Gran Bretaña e Irlanda del Norte', 'countries_matched'] = 'Reino Unido'
matches2.loc[matches2.visadf=='Congo', 'countries_matched'] = 'República del Congo'
matches2.loc[matches2.visadf=='Sint Eustatius', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Guadalupe', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Guyana Francesa', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Brunei Darussalam', 'countries_matched'] = 'Brunéi'
matches2.loc[matches2.visadf=='Palau', 'countries_matched'] = 'Palaos'
matches2.loc[matches2.visadf=='Saba', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='China República Popular', 'countries_matched'] = 'China'
#matches2.loc[matches2.visadf=='', 'countries_matched'] = ''
#matches2.loc[matches2.visadf=='', 'countries_matched'] = ''
matches2.sort_values('k', ascending=False)
#countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('aba').any(), axis=1), namecols]
This is as good as we can do it with this dataset and shapefile (of course we may need a different shapefile if we really need to ensure that we are plotting all the correct information. E.g., does French Guyana have the same visa requirements than France and the other French Territories represented in Natural Earth's shapefile as France? If so, then we are ok! Otherwise we would need another shapefile or transform this one).
visadf['countries_matched'] = visadf.PAIS
visadf.loc[visadf.PAIS.apply(lambda x: x in miss_visadf), 'countries_matched'] = visadf.loc[visadf.PAIS.apply(lambda x: x in miss_visadf)].PAIS.map(matches2[['visadf', 'countries_matched']].set_index('visadf').to_dict()['countries_matched'])
visadf
col_visa = countries.merge(visadf, left_on='NAME_ES', right_on='countries_matched')
cmap = mpl.colors.ListedColormap(['blue', 'red'])
mylegend = center_wrap(["Visa Requirements", "For Colombian Citizens"], cwidth=32, width=32)
MyChoropleth(mydf=col_visa, myfile='col_visa', myvar='visa_req', mylegend=mylegend, k=1, bbox_to_anchor=(0.25, 0.3),
edgecolor='white', facecolor='lightgray', cmap=cmap, scheme='UserDefined', bins=[0,1], legend_labels=['NO', 'YES'],
save=False)
Let's check whether all Franch Territories depicted have the correct visa assignment.
col_visa.loc[col_visa.PAIS_OR.str.contains('Francia'), ['SOVEREIGNT', 'NAME_ES', 'ADM0_A3'] + visadf.columns.to_list()]
visadf.loc[visadf.PAIS_OR.str.contains('Francia')]
Seems the ones we are missing have the same equirements as mainland France, so we are lucky and do not seem to need to do more.
Exercise¶
- Merge the
col_visa
data with data from the World Development indicators - Explore the characteristics of the two sets of countries. Compare them in terms of income per capita, population, trade.
- Find trade, travel, FDI data for each country in relation to Colombia. What do you find?
- Can you provide the correlates of visa requirements for Colombia?