GIS with Python and IPython

Part III: Data Munging...Combining GIS with Other Tools

Set-up our environment as before

Let's import the packages we will use and set the paths for outputs.

In [1]:
# Let's import pandas and some other basic packages we will use 
from __future__ import division

import pandas as pd
import numpy as np
import os, sys

# GIS packages
import geopandas as gpd
from geopandas.tools import overlay
from shapely.geometry import Polygon, Point
import georasters as gr
# Alias for Geopandas
gp = gpd

# Plotting
import matplotlib as mpl
import seaborn as sns
# Setup seaborn
sns.set()

# Mapping
import geoplot as gplt
import geoplot.crs as gcrs
import mapclassify as mc
import textwrap

%pylab --no-import-all
%matplotlib inline
Using matplotlib backend: <object object at 0x195b1fa70>
%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib
In [2]:
# Functions for plotting
def center_wrap(text, cwidth=32, **kw):
    '''Center Text (to be used in legend)'''
    lines = text
    #lines = textwrap.wrap(text, **kw)
    return "\n".join(line.center(cwidth) for line in lines)

def MyChoropleth(mydf, myfile='', myvar='',
                  mylegend='',
                  k=5,
                  extent=[-180, -90, 180, 90],
                  bbox_to_anchor=(0.2, 0.5),
                  edgecolor='white', facecolor='lightgray',
                  scheme='FisherJenks', bins=None, pct=None,
                  legend_labels=None,
                  save=True,
                  percent=False,
                  cmap='Reds',
                  **kwargs):
    # Chloropleth
    # Color scheme
    if scheme=='EqualInterval':
        scheme = mc.EqualInterval(mydf[myvar], k=k)
    elif scheme=='Quantiles':
        scheme = mc.Quantiles(mydf[myvar], k=k)
    elif scheme=='BoxPlot':
        scheme = mc.BoxPlot(mydf[myvar], k=k)
    elif scheme=='FisherJenks':
        scheme = mc.FisherJenks(mydf[myvar], k=k)
    elif scheme=='FisherJenksSampled':
        scheme = mc.FisherJenksSampled(mydf[myvar], k=k)
    elif scheme=='HeadTailBreaks':
        scheme = mc.HeadTailBreaks(mydf[myvar], k=k)
    elif scheme=='JenksCaspall':
        scheme = mc.JenksCaspall(mydf[myvar], k=k)
    elif scheme=='JenksCaspallForced':
        scheme = mc.JenksCaspallForced(mydf[myvar], k=k)
    elif scheme=='JenksCaspallSampled':
        scheme = mc.JenksCaspallSampled(mydf[myvar], k=k)
    elif scheme=='KClassifiers':
        scheme = mc.KClassifiers(mydf[myvar], k=k)
    elif scheme=='Percentiles':
        scheme = mc.Percentiles(mydf[myvar], pct=pct)
    elif scheme=='UserDefined':
        scheme = mc.UserDefined(mydf[myvar], bins=bins)
    
    if legend_labels is None:
        # Format legend
        upper_bounds = scheme.bins
        # get and format all bounds
        bounds = []
        for index, upper_bound in enumerate(upper_bounds):
            if index == 0:
                lower_bound = mydf[myvar].min()
            else:
                lower_bound = upper_bounds[index-1]
            # format the numerical legend here
            if percent:
                bound = f'{lower_bound:.0%} - {upper_bound:.0%}'
            else:
                bound = f'{float(lower_bound):,.0f} - {float(upper_bound):,.0f}'
            bounds.append(bound)
        legend_labels = bounds
    #Plot
    ax = gplt.choropleth(
        mydf, hue=myvar, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
        edgecolor='white', linewidth=1,
        cmap=cmap, legend=True,
        scheme=scheme,
        legend_kwargs={'bbox_to_anchor': bbox_to_anchor,
                       'frameon': True,
                       'title':mylegend,
                       },
        legend_labels = legend_labels,
        figsize=(24, 16),
        rasterized=True,
    )
    gplt.polyplot(
        countries, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
        edgecolor=edgecolor, facecolor=facecolor,
        ax=ax,
        rasterized=True,
        extent=extent,
    )
    if save:
        plt.savefig(pathgraphs + myfile + '_' + myvar +'.pdf', dpi=300, bbox_inches='tight')
        plt.savefig(pathgraphs + myfile + '_' + myvar +'.png', dpi=300, bbox_inches='tight')
    pass
In [3]:
# Paths
pathout = './data/'

if not os.path.exists(pathout):
    os.mkdir(pathout)
    
pathgraphs = './graphs/'
if not os.path.exists(pathgraphs):
    os.mkdir(pathgraphs)

Let's plot the countries for which Colombian citizens do not require visas

The Colombian Cancillery's website has a list with visa requirements for colombians. Let's use it to map countries for which visas are not required. Below is the link to the information. The problem is that it is a pdf file. Let's open the website and check it out

In [4]:
# Import display options for showing websites
from IPython.display import IFrame

url = 'https://www.cancilleria.gov.co/sites/default/files/FOTOS2020/relacion_de_paises_que_exigen_o_no_visas_a_colombianos_17-04-2020.pdf'
IFrame(url, width=800, height=400)
Out[4]:

Roadblock

Someone forgot to make our life easy and made the data available in a pdf.

Only Human?What Shall We Do now?

Luckily python has tools to deal with this.

New

So let's download it, save it to disk and use these tools to process the pdf into a pandas.DataFrame.

In [5]:
# Import package for downloading internet content and save it to file
import requests

url = 'https://www.cancilleria.gov.co/sites/default/files/FOTOS2020/relacion_de_paises_que_exigen_o_no_visas_a_colombianos_17-04-2020.pdf'
response = requests.get(url)
with open(pathout + 'visas.pdf', 'wb') as f:
    f.write(response.content)
In [6]:
# Import package to read pdf tables
import camelot
visas = camelot.read_pdf(pathout + 'visas.pdf', pages='1-7')

Let's explore the visas object

In [7]:
visas
Out[7]:
<TableList n=7>

So there are 7 tables in visas. What does Table 1 have?

In [8]:
visas[0]
Out[8]:
<Table shape=(28, 3)>
In [9]:
visas[0].df
Out[9]:
0 1 2
0 MINISTERIO DE RELACIONES EXTERIORES DE COLOMBIA
1 DIRECCION DE ASUNTOS MIGRATORIOS, CONSULARES Y...
2 COORDINACION DE VISAS E INMIGRACION
3 Estados y territorios que exigen o NO visas a ...
4 EXIGEN VISA A
5 PAIS SI NO
6 Afganistán X
7 Albania X
8 Alemania X
9 Andorra X
10 Angola X
11 Antigua y Barbuda X
12 Arabia Saudita X
13 Argelia X
14 Argentina X
15 Armenia
16 Australia X X
17 Austria X
18 Azerbaiyán X (Visa electrónica)
19 Bahamas X
20 Bahréin X (visa a la llegada y visa electrónica)
21 Bangladesh X
22 Barbados X
23 Bélgica X
24 Belice X
25 Benin
26 Belarús X X
27 Bolivia X

Ok, let's concatenate all these pandas dataframes.

In [10]:
visadf = pd.concat([i.df for i in visas]).reset_index(drop=True)
visadf
Out[10]:
0 1 2
0 MINISTERIO DE RELACIONES EXTERIORES DE COLOMBIA
1 DIRECCION DE ASUNTOS MIGRATORIOS, CONSULARES Y...
2 COORDINACION DE VISAS E INMIGRACION
3 Estados y territorios que exigen o NO visas a ...
4 EXIGEN VISA A
... ... ... ...
220 Taiwan X Visa electrónica
221 Wallis y Futuna (Francia) X
222
223 Actualización 21 -10-2019
224 El presente cuadro presenta generalidades sobr...

225 rows × 3 columns

We need to correct the header

In [11]:
visadf.columns = visadf.iloc[5]
In [12]:
visadf.head(10)
Out[12]:
5 PAIS SI NO
0 MINISTERIO DE RELACIONES EXTERIORES DE COLOMBIA
1 DIRECCION DE ASUNTOS MIGRATORIOS, CONSULARES Y...
2 COORDINACION DE VISAS E INMIGRACION
3 Estados y territorios que exigen o NO visas a ...
4 EXIGEN VISA A
5 PAIS SI NO
6 Afganistán X
7 Albania X
8 Alemania X
9 Andorra X
In [13]:
visadf = visadf.iloc[6:].copy()
In [14]:
visadf.columns.name = ''
In [15]:
visadf.head(10)
Out[15]:
PAIS SI NO
6 Afganistán X
7 Albania X
8 Alemania X
9 Andorra X
10 Angola X
11 Antigua y Barbuda X
12 Arabia Saudita X
13 Argelia X
14 Argentina X
15 Armenia

Let's code SI (YES) as 1 and NO as 0

In [16]:
visadf['visa_req'] = visadf.SI.map({'X':1, '':0})

Let's check whether things were mapped correctly

In [17]:
visadf.loc[visadf.visa_req.isna()]
Out[17]:
PAIS SI NO visa_req
16 Australia X X NaN
18 Azerbaiyán X (Visa electrónica) NaN
20 Bahréin X (visa a la llegada y visa electrónica) NaN
26 Belarús X X NaN
34 Burundi X X X NaN
36 Cabo Verde X (Visa a la llegada) NaN
37 Camboya X (Visa a la llegada) NaN
39 Canadá X X X NaN
46 Congo X X X NaN
50 Costa de Marfil X X NaN
58 Egipto X (Visa a la llegada) NaN
68 Fiji X X NaN
76 Granada X X NaN
80 Guinea-Bissau X X X NaN
88 Irán X X X X X NaN
93 Islas Salomón X X NaN
98 Jordania X X NaN
100 Kenia X Visa a la llegada NaN
102 Kiribati X X NaN
105 Laos República Democrática P X Visa a la llegada NaN
110 Libia X X NaN
116 Malasia X X X NaN
122 Mauricio X X X NaN
131 Myanmar X (Visa a la llegada) NaN
135 Nicaragua X (visa a la llegada para titulares de visa de... NaN
137 Nigeria X X NaN
140 Omán X (Visa de turismo al ingreso a Omán en los pu... NaN
143 Palau X X NaN
156 Ruanda X (Visa electrónica) NaN
167 Sierra Leona X X NaN
172 Sudáfrica X X X X X X NaN
179 Tailandia X X NaN
180 Tanzania X Visa a la llegada NaN
183 Togo X X X X NaN
194 Vanuatu X X NaN
197 Yemen X X X NaN
207 Macao (SARG-China) (*) X Visa a la llegada NaN
220 Taiwan X Visa electrónica NaN
In [18]:
IFrame(url, width=800, height=400)
Out[18]:
In [19]:
visadf.loc[(visadf.SI=='X X') | (visadf.SI.shift(1)=='X X')  | (visadf.SI.shift(-1)=='X X')]
Out[19]:
PAIS SI NO visa_req
15 Armenia 0.0
16 Australia X X NaN
17 Austria X 0.0
25 Benin 0.0
26 Belarús X X NaN
27 Bolivia X 0.0
49 Corea República Popular Dem. 0.0
50 Costa de Marfil X X NaN
51 Costa Rica X A titulares de Visa de EE UU o Schengen vigen... 1.0
67 Etiopía 0.0
68 Fiji X X NaN
69 Filipinas X Hasta por 30 días 0.0
75 Ghana 0.0
76 Granada X X NaN
77 Grecia X 0.0
92 Islas Marshall 0.0
93 Islas Salomón X X NaN
94 Israel X 0.0
97 Japón 0.0
98 Jordania X X NaN
99 Kazajstán X (Hasta por 30 días) 0.0
101 Kirguistán 0.0
102 Kiribati X X NaN
103 Kuwait X 1.0
109 Liberia 0.0
110 Libia X X NaN
111 Liechtenstein X 0.0
136 Níger 0.0
137 Nigeria X X NaN
138 Noruega X 0.0
142 Pakistán 0.0
143 Palau X X NaN
144 Panamá X 0.0
166 Seychelles 0.0
167 Sierra Leona X X NaN
168 Singapur X Hasta por 30 días 0.0
178 Suazilandia 0.0
179 Tailandia X X NaN
180 Tanzania X Visa a la llegada NaN
193 Uzbekistán 0.0
194 Vanuatu X X NaN
195 Venezuela X 0.0
In [20]:
visadf.loc[(visadf.SI=='X X X') | (visadf.SI.shift(1)=='X X X')  | (visadf.SI.shift(-1)=='X X X')]
Out[20]:
PAIS SI NO visa_req
33 Burkina Faso 0.0
34 Burundi X X X NaN
35 Bután 0.0
38 Camerún 0.0
39 Canadá X X X NaN
40 Chad 0.0
45 Comoras 0.0
46 Congo X X X NaN
47 Congo República Democrática 0.0
79 Guinea 0.0
80 Guinea-Bissau X X X NaN
81 Guinea Ecuatorial 0.0
115 Madagascar 0.0
116 Malasia X X X NaN
117 Malawi 0.0
121 Marruecos 0.0
122 Mauricio X X X NaN
123 Mauritania 0.0
196 Vietnam 0.0
197 Yemen X X X NaN
198 Zambia 0.0

Ok it seems we have two types of errors. First, notince that sometimes the type of visa is defined, e.g., Azerbayán. Second, the OCR software has mixed some rows, so that now we have XX, XXX, etc. Looking at the pdf it seems this is due to assigning an X from a previous row to the current row ("X X") or from both the previous and next ("X X X"). Let's try to correct these errors programatically (obviously sometimes it may just be faster and better to export the dataframe, correct it by hand snd then load the corrected one, but we're here to learn, right?).

First, let's replace the repeated X with what seems to be the correct data.

X X

In [21]:
visadf.loc[(visadf.SI=='X X') | (visadf.SI.shift(-1)=='X X'), 'visa_req'] = 1
visadf.loc[(visadf.SI=='X X') | (visadf.SI.shift(-1)=='X X')]
Out[21]:
PAIS SI NO visa_req
15 Armenia 1.0
16 Australia X X 1.0
25 Benin 1.0
26 Belarús X X 1.0
49 Corea República Popular Dem. 1.0
50 Costa de Marfil X X 1.0
67 Etiopía 1.0
68 Fiji X X 1.0
75 Ghana 1.0
76 Granada X X 1.0
92 Islas Marshall 1.0
93 Islas Salomón X X 1.0
97 Japón 1.0
98 Jordania X X 1.0
101 Kirguistán 1.0
102 Kiribati X X 1.0
109 Liberia 1.0
110 Libia X X 1.0
136 Níger 1.0
137 Nigeria X X 1.0
142 Pakistán 1.0
143 Palau X X 1.0
166 Seychelles 1.0
167 Sierra Leona X X 1.0
178 Suazilandia 1.0
179 Tailandia X X 1.0
193 Uzbekistán 1.0
194 Vanuatu X X 1.0

X X X

In [22]:
visadf.loc[(visadf.SI=='X X X') | (visadf.SI.shift(1)=='X X X')  | (visadf.SI.shift(-1)=='X X X'), 'visa_req'] =1
visadf.loc[(visadf.SI=='X X X') | (visadf.SI.shift(1)=='X X X')  | (visadf.SI.shift(-1)=='X X X')]
Out[22]:
PAIS SI NO visa_req
33 Burkina Faso 1.0
34 Burundi X X X 1.0
35 Bután 1.0
38 Camerún 1.0
39 Canadá X X X 1.0
40 Chad 1.0
45 Comoras 1.0
46 Congo X X X 1.0
47 Congo República Democrática 1.0
79 Guinea 1.0
80 Guinea-Bissau X X X 1.0
81 Guinea Ecuatorial 1.0
115 Madagascar 1.0
116 Malasia X X X 1.0
117 Malawi 1.0
121 Marruecos 1.0
122 Mauricio X X X 1.0
123 Mauritania 1.0
196 Vietnam 1.0
197 Yemen X X X 1.0
198 Zambia 1.0

X X X X

In [23]:
visadf.loc[(visadf.SI=='X X X X') | (visadf.SI.shift(1)=='X X X X')  | (visadf.SI.shift(-1)=='X X X X') | (visadf.SI.shift(2)=='X X X X')  | (visadf.SI.shift(-2)=='X X X X')  | (visadf.SI.shift(-3)=='X X X X')]
Out[23]:
PAIS SI NO visa_req
180 Tanzania X Visa a la llegada NaN
181 Tayikistán 0.0
182 Timor Oriental 0.0
183 Togo X X X X NaN
184 Tonga 0.0
185 Trinidad y Tobago X 0.0
In [24]:
visadf.loc[(visadf.SI=='X X X X') | (visadf.SI.shift(1)=='X X X X')  | (visadf.SI.shift(-1)=='X X X X') | (visadf.SI.shift(-2)=='X X X X'), 'visa_req'] = 1
visadf.loc[(visadf.SI=='X X X X') | (visadf.SI.shift(1)=='X X X X')  | (visadf.SI.shift(-1)=='X X X X') | (visadf.SI.shift(-2)=='X X X X')]
Out[24]:
PAIS SI NO visa_req
181 Tayikistán 1.0
182 Timor Oriental 1.0
183 Togo X X X X 1.0
184 Tonga 1.0

X X X X X

In [25]:
visadf.loc[(visadf.SI=='X X X X X') | (visadf.SI.shift(1)=='X X X X X')  | (visadf.SI.shift(-1)=='X X X X X') | (visadf.SI.shift(-2)=='X X X X X') | (visadf.SI.shift(2)=='X X X X X')]
Out[25]:
PAIS SI NO visa_req
86 India 0.0
87 Indonesia 0.0
88 Irán X X X X X NaN
89 Iraq 0.0
90 Irlanda 0.0
In [26]:
visadf.loc[(visadf.SI=='X X X X X') | (visadf.SI.shift(1)=='X X X X X')  | (visadf.SI.shift(-1)=='X X X X X') | (visadf.SI.shift(-2)=='X X X X X') | (visadf.SI.shift(2)=='X X X X X'), 'visa_req'] = 1
visadf.loc[(visadf.SI=='X X X X X') | (visadf.SI.shift(1)=='X X X X X')  | (visadf.SI.shift(-1)=='X X X X X') | (visadf.SI.shift(-2)=='X X X X X') | (visadf.SI.shift(2)=='X X X X X')]
Out[26]:
PAIS SI NO visa_req
86 India 1.0
87 Indonesia 1.0
88 Irán X X X X X 1.0
89 Iraq 1.0
90 Irlanda 1.0

X X X X X X

In [27]:
visadf.loc[(visadf.SI=='X X X X X X') | (visadf.SI.shift(1)=='X X X X X X')  | (visadf.SI.shift(-1)=='X X X X X X') | (visadf.SI.shift(-2)=='X X X X X X') | (visadf.SI.shift(2)=='X X X X X X') | (visadf.SI.shift(-3)=='X X X X X X') | (visadf.SI.shift(3)=='X X X X X X')]
Out[27]:
PAIS SI NO visa_req
169 Siria 0.0
170 Somalia 0.0
171 Sri Lanka 0.0
172 Sudáfrica X X X X X X NaN
173 Sudán del Sur 0.0
174 Sudán 0.0
175 Suecia X 0.0
In [28]:
visadf.loc[(visadf.SI=='X X X X X X') | (visadf.SI.shift(1)=='X X X X X X')  | (visadf.SI.shift(-1)=='X X X X X X') | (visadf.SI.shift(-2)=='X X X X X X') | (visadf.SI.shift(2)=='X X X X X X') | (visadf.SI.shift(-3)=='X X X X X X'), 'visa_req'] = 1
visadf.loc[(visadf.SI=='X X X X X X') | (visadf.SI.shift(1)=='X X X X X X')  | (visadf.SI.shift(-1)=='X X X X X X') | (visadf.SI.shift(-2)=='X X X X X X') | (visadf.SI.shift(2)=='X X X X X X') | (visadf.SI.shift(-3)=='X X X X X X')]
Out[28]:
PAIS SI NO visa_req
169 Siria 1.0
170 Somalia 1.0
171 Sri Lanka 1.0
172 Sudáfrica X X X X X X 1.0
173 Sudán del Sur 1.0
174 Sudán 1.0

Let's also replace visa required for any row that has the word "visa".

In [29]:
visadf.loc[visadf.SI.str.lower().str.find('visa')!=-1]
Out[29]:
PAIS SI NO visa_req
18 Azerbaiyán X (Visa electrónica) NaN
20 Bahréin X (visa a la llegada y visa electrónica) NaN
36 Cabo Verde X (Visa a la llegada) NaN
37 Camboya X (Visa a la llegada) NaN
58 Egipto X (Visa a la llegada) NaN
100 Kenia X Visa a la llegada NaN
105 Laos República Democrática P X Visa a la llegada NaN
131 Myanmar X (Visa a la llegada) NaN
135 Nicaragua X (visa a la llegada para titulares de visa de... NaN
140 Omán X (Visa de turismo al ingreso a Omán en los pu... NaN
156 Ruanda X (Visa electrónica) NaN
180 Tanzania X Visa a la llegada NaN
207 Macao (SARG-China) (*) X Visa a la llegada NaN
220 Taiwan X Visa electrónica NaN
In [30]:
visadf.loc[visadf.SI.str.lower().str.find('visa')!=-1, 'visa_req'] = 1
visadf.loc[visadf.SI.str.lower().str.find('visa')!=-1]
Out[30]:
PAIS SI NO visa_req
18 Azerbaiyán X (Visa electrónica) 1.0
20 Bahréin X (visa a la llegada y visa electrónica) 1.0
36 Cabo Verde X (Visa a la llegada) 1.0
37 Camboya X (Visa a la llegada) 1.0
58 Egipto X (Visa a la llegada) 1.0
100 Kenia X Visa a la llegada 1.0
105 Laos República Democrática P X Visa a la llegada 1.0
131 Myanmar X (Visa a la llegada) 1.0
135 Nicaragua X (visa a la llegada para titulares de visa de... 1.0
140 Omán X (Visa de turismo al ingreso a Omán en los pu... 1.0
156 Ruanda X (Visa electrónica) 1.0
180 Tanzania X Visa a la llegada 1.0
207 Macao (SARG-China) (*) X Visa a la llegada 1.0
220 Taiwan X Visa electrónica 1.0

Let's check again

In [31]:
visadf.loc[visadf.visa_req.isna()]
Out[31]:
PAIS SI NO visa_req

Ok, it seems we have coded which countries need and which do not need visa for colombian citizens. Let's analyze this data a bit.

In [32]:
visadf['visa_req_YN'] = visadf.visa_req.map({0:'NO', 1:'YES'})
visadf
Out[32]:
PAIS SI NO visa_req visa_req_YN
6 Afganistán X 1.0 YES
7 Albania X 0.0 NO
8 Alemania X 0.0 NO
9 Andorra X 0.0 NO
10 Angola X 1.0 YES
... ... ... ... ... ...
220 Taiwan X Visa electrónica 1.0 YES
221 Wallis y Futuna (Francia) X 0.0 NO
222 0.0 NO
223 Actualización 21 -10-2019 0.0 NO
224 El presente cuadro presenta generalidades sobr... 0.0 NO

219 rows × 5 columns

In [33]:
visadf.hist()
visadf.visa_req.describe()
Out[33]:
count    219.000000
mean       0.547945
std        0.498836
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: visa_req, dtype: float64
In [34]:
df = visadf.groupby('visa_req_YN').count().reset_index()
df
Out[34]:
visa_req_YN PAIS SI NO visa_req
0 NO 99 99 99 99
1 YES 120 120 120 120
In [35]:
sns.set(rc={'figure.figsize':(11.7,8.27)})
#sns.reset_orig()
sns.set_context("talk")
# Plot
fig, ax = plt.subplots()
sns.barplot(x='visa_req_YN', y='visa_req', data=df, alpha=1)
ax.tick_params(axis = 'both', which = 'major')
ax.tick_params(axis = 'both', which = 'minor')
ax.set_xlabel('Visa Required')
ax.set_ylabel('Number of Countries')
Out[35]:
Text(0, 0.5, 'Number of Countries')

Let's try to map these countries. First let's get the Natural Earth shapefile.

In [36]:
import requests
import io

#headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}

url = 'https://naturalearth.s3.amazonaws.com/10m_cultural/ne_10m_admin_0_countries.zip'
r = requests.get(url, headers=headers)
countries = gp.read_file(io.BytesIO(r.content))
#countries = gpd.read_file('https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_0_countries.zip')
In [37]:
countries
Out[37]:
featurecla scalerank LABELRANK SOVEREIGNT SOV_A3 ADM0_DIF LEVEL TYPE TLC ADMIN ... FCLASS_TR FCLASS_ID FCLASS_PL FCLASS_GR FCLASS_IT FCLASS_NL FCLASS_SE FCLASS_BD FCLASS_UA geometry
0 Admin-0 country 0 2 Indonesia IDN 0 2 Sovereign country 1 Indonesia ... None None None None None None None None None MULTIPOLYGON (((117.70361 4.16341, 117.70361 4...
1 Admin-0 country 0 3 Malaysia MYS 0 2 Sovereign country 1 Malaysia ... None None None None None None None None None MULTIPOLYGON (((117.70361 4.16341, 117.69711 4...
2 Admin-0 country 0 2 Chile CHL 0 2 Sovereign country 1 Chile ... None None None None None None None None None MULTIPOLYGON (((-69.51009 -17.50659, -69.50611...
3 Admin-0 country 0 3 Bolivia BOL 0 2 Sovereign country 1 Bolivia ... None None None None None None None None None POLYGON ((-69.51009 -17.50659, -69.51009 -17.5...
4 Admin-0 country 0 2 Peru PER 0 2 Sovereign country 1 Peru ... None None None None None None None None None MULTIPOLYGON (((-69.51009 -17.50659, -69.63832...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
253 Admin-0 country 0 4 China CH1 1 2 Country 1 Macao S.A.R ... None None None None None None None None None MULTIPOLYGON (((113.55860 22.16303, 113.56943 ...
254 Admin-0 country 6 5 Australia AU1 1 2 Dependency 1 Ashmore and Cartier Islands ... None None None None None None None None None POLYGON ((123.59702 -12.42832, 123.59775 -12.4...
255 Admin-0 country 6 8 Bajo Nuevo Bank (Petrel Is.) BJN 0 2 Indeterminate 1 Bajo Nuevo Bank (Petrel Is.) ... Unrecognized Unrecognized Unrecognized Unrecognized Unrecognized Unrecognized Unrecognized Unrecognized Unrecognized POLYGON ((-79.98929 15.79495, -79.98782 15.796...
256 Admin-0 country 6 5 Serranilla Bank SER 0 2 Indeterminate 1 Serranilla Bank ... Unrecognized Unrecognized Unrecognized Unrecognized Unrecognized Unrecognized Unrecognized Unrecognized Unrecognized POLYGON ((-78.63707 15.86209, -78.64041 15.864...
257 Admin-0 country 6 6 Scarborough Reef SCR 0 2 Indeterminate 1 Scarborough Reef ... None None None None None None None None None POLYGON ((117.75389 15.15437, 117.75569 15.151...

258 rows × 169 columns

Luckily there are country names in Spanish. Let's see if we can merge these two data sets.

In [38]:
countries.NAME_ES
Out[38]:
0                    Indonesia
1                      Malasia
2                        Chile
3                      Bolivia
4                         Perú
                ...           
253                      Macao
254    Islas Ashmore y Cartier
255                 Bajo Nuevo
256            Isla Serranilla
257           Bajo de Masinloc
Name: NAME_ES, Length: 258, dtype: object
In [39]:
col_visa = countries.merge(visadf, left_on='NAME_ES', right_on='PAIS')
In [40]:
cmap = mpl.colors.ListedColormap(['blue', 'red'])
mylegend = center_wrap(["Visa Requirements", "For Colombian Citizens"], cwidth=32, width=32)
MyChoropleth(mydf=col_visa, myfile='col_visa', myvar='visa_req', mylegend=mylegend, k=1, bbox_to_anchor=(0.25, 0.3),
                  edgecolor='white', facecolor='lightgray', cmap=cmap, scheme='UserDefined', bins=[0,1], legend_labels=['NO', 'YES'],
                  save=False)

So it seems not everything merged correctly

In [41]:
col_visa.shape
Out[41]:
(164, 174)
In [42]:
visadf.shape
Out[42]:
(219, 5)
In [43]:
col_visa.loc[col_visa.visa_req.isna(), 'NAME_ES'].sort_values()
Out[43]:
Series([], Name: NAME_ES, dtype: object)

So we are not linking all countries. This is usually due to symbols like accents and ~, but in this case also because the tail of the data frame includes territories of countries, so their names are non-standard (and OCR may have made some mistakes).

In [44]:
visadf.tail(25)
Out[44]:
PAIS SI NO visa_req visa_req_YN
200 OTROS TERRITORIOS 0.0 NO
201 Aruba (Países Bajos) X 0.0 NO
202 Bonaire (Países Bajos) X 0.0 NO
203 Curazao (Países Bajos) X 0.0 NO
204 Guadalupe (Francia) X 0.0 NO
205 Guyana Francesa X 0.0 NO
206 Hong Kong (SARG-China) X Por 90 días 0.0 NO
207 Macao (SARG-China) (*) X Visa a la llegada 1.0 YES
208 Martinica (Francia) X 0.0 NO
209 Mayotte (Francia) X 0.0 NO
210 Nueva Caledonia (Francia) X 0.0 NO
211 Palestina X 1.0 YES
212 Polinesia Francesa X 0.0 NO
213 Réunion (Francia) X 0.0 NO
214 Saba (Países Bajos) X 0.0 NO
215 Saint Barthélémy (Francia) X 1.0 YES
216 Saint Pïerre et Miquelon (Francia) X 0.0 NO
217 Saint Martin (Francia) X 1.0 YES
218 Sint Maarten (Países Bajos) X 0.0 NO
219 Sint Eustatius (Países Bajos) X 0.0 NO
220 Taiwan X Visa electrónica 1.0 YES
221 Wallis y Futuna (Francia) X 0.0 NO
222 0.0 NO
223 Actualización 21 -10-2019 0.0 NO
224 El presente cuadro presenta generalidades sobr... 0.0 NO

Let's correct the country names to improve matching. It's always a good practice to keep the original names.

In [45]:
visadf['PAIS_OR'] = visadf.PAIS
In [46]:
visadf.loc[visadf.PAIS.str.find('(')!=-1, 'PAIS'] = visadf.loc[visadf.PAIS_OR.str.find('(')!=-1, 'PAIS_OR'].apply(lambda x: x[:x.find('(')])
visadf.PAIS = visadf.PAIS.str.strip()
In [47]:
visadf.tail(30)
Out[47]:
PAIS SI NO visa_req visa_req_YN PAIS_OR
195 Venezuela X 0.0 NO Venezuela
196 Vietnam 1.0 YES Vietnam
197 Yemen X X X 1.0 YES Yemen
198 Zambia 1.0 YES Zambia
199 Zimbabwe X 1.0 YES Zimbabwe
200 OTROS TERRITORIOS 0.0 NO OTROS TERRITORIOS
201 Aruba X 0.0 NO Aruba (Países Bajos)
202 Bonaire X 0.0 NO Bonaire (Países Bajos)
203 Curazao X 0.0 NO Curazao (Países Bajos)
204 Guadalupe X 0.0 NO Guadalupe (Francia)
205 Guyana Francesa X 0.0 NO Guyana Francesa
206 Hong Kong X Por 90 días 0.0 NO Hong Kong (SARG-China)
207 Macao X Visa a la llegada 1.0 YES Macao (SARG-China) (*)
208 Martinica X 0.0 NO Martinica (Francia)
209 Mayotte X 0.0 NO Mayotte (Francia)
210 Nueva Caledonia X 0.0 NO Nueva Caledonia (Francia)
211 Palestina X 1.0 YES Palestina
212 Polinesia Francesa X 0.0 NO Polinesia Francesa
213 Réunion X 0.0 NO Réunion (Francia)
214 Saba X 0.0 NO Saba (Países Bajos)
215 Saint Barthélémy X 1.0 YES Saint Barthélémy (Francia)
216 Saint Pïerre et Miquelon X 0.0 NO Saint Pïerre et Miquelon (Francia)
217 Saint Martin X 1.0 YES Saint Martin (Francia)
218 Sint Maarten X 0.0 NO Sint Maarten (Países Bajos)
219 Sint Eustatius X 0.0 NO Sint Eustatius (Países Bajos)
220 Taiwan X Visa electrónica 1.0 YES Taiwan
221 Wallis y Futuna X 0.0 NO Wallis y Futuna (Francia)
222 0.0 NO
223 Actualización 21 -10-2019 0.0 NO Actualización 21 -10-2019
224 El presente cuadro presenta generalidades sobr... 0.0 NO El presente cuadro presenta generalidades sobr...
In [48]:
col_visa = countries.merge(visadf, left_on='NAME_ES', right_on='PAIS')
cmap = mpl.colors.ListedColormap(['blue', 'red'])
mylegend = center_wrap(["Visa Requirements", "For Colombian Citizens"], cwidth=32, width=32)
MyChoropleth(mydf=col_visa, myfile='col_visa', myvar='visa_req', mylegend=mylegend, k=1, bbox_to_anchor=(0.25, 0.3),
                  edgecolor='white', facecolor='lightgray', cmap=cmap, scheme='UserDefined', bins=[0,1], legend_labels=['NO', 'YES'],
                  save=False)