Search
Countries

Cleaning of countries dataset

from IPython.display import display_markdown

# Render the project README inline so the dataset description is shown at
# the top of the notebook. The file is opened in a context manager so the
# handle is closed deterministically, and decoded explicitly as UTF-8 so the
# result does not depend on the platform's default encoding.
with open("README.md", encoding="utf-8") as readme:
    display_markdown(readme.read(), raw=True)

Countries data

This dataset contains administrative boundaries of countries.

  • Source: Natural Earth
  • URL

https://www.naturalearthdata.com/downloads/10m-cultural-vectors/10m-admin-0-countries/

  • Processing: transformations documented in countries_cleaning.ipynb
    • Clean file: countries_clean.geojson
%matplotlib inline

import geopandas
  • Remove small islands
# Download location of the countries archive, assembled from adjacent string
# literals (implicit concatenation inside the parentheses).
# NOTE(review): the second "http//www.naturalearthdata.com" segment looks like
# a typo but appears to be the literal path the downloads are served from --
# confirm before "fixing" it.
source_url = (
    "https://www.naturalearthdata.com/"
    "http//www.naturalearthdata.com/download/"
    "10m/cultural/ne_10m_admin_0_countries.zip"
)
source_url
'https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_0_countries.zip'
# Read the countries layer directly from the zip archive at `source_url`
# into a GeoDataFrame, then draw it as a quick visual sanity check.
ctys = geopandas.read_file(source_url)
ctys.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f3811b3fa10>
# Compute one area value per feature after reprojecting to EPSG:3857, then
# look at the distribution to pick a cut-off for "small" features.
# NOTE(review): EPSG:3857 (Web Mercator) is not an equal-area projection, so
# areas are inflated toward the poles -- confirm the threshold used further
# down is tuned with that distortion in mind.
projected = ctys.to_crs(epsg=3857)
areas = projected.area
areas.plot.hist(bins=100)
<matplotlib.axes._subplots.AxesSubplot at 0x7f3811976250>
# Anything smaller than 1/4000 of the largest feature counts as "small".
smallest = areas.max() / 4000

# Index labels of the small features, then the table without those rows.
is_small = areas < smallest
small = areas[is_small].index
large = ctys.drop(index=small)

# Visual check of what is left after removing the small features.
large.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f38119b8450>
  • Remove Antarctica
# Drop the feature(s) whose centroid has the smallest y coordinate, i.e. the
# southernmost centroid; every row strictly above that minimum is kept.
ys = large.centroid.y
large = large[ys > ys.min()]

# Reproject the remaining features to EPSG:3857 in place of the original
# table; `%time` is an IPython magic that reports how long the statement took.
%time large = large.to_crs(epsg=3857)
CPU times: user 1.67 s, sys: 20.9 ms, total: 1.7 s
Wall time: 1.7 s
# Plot the reprojected table to check the result visually.
large.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f3811b79650>
  • Keep only relevant columns
# List every column with its non-null count and dtype, to decide which
# columns are worth keeping in the cleaned file.
large.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 160 entries, 0 to 239
Data columns (total 95 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   featurecla  160 non-null    object  
 1   scalerank   160 non-null    int64   
 2   LABELRANK   160 non-null    int64   
 3   SOVEREIGNT  160 non-null    object  
 4   SOV_A3      160 non-null    object  
 5   ADM0_DIF    160 non-null    int64   
 6   LEVEL       160 non-null    int64   
 7   TYPE        160 non-null    object  
 8   ADMIN       160 non-null    object  
 9   ADM0_A3     160 non-null    object  
 10  GEOU_DIF    160 non-null    int64   
 11  GEOUNIT     160 non-null    object  
 12  GU_A3       160 non-null    object  
 13  SU_DIF      160 non-null    int64   
 14  SUBUNIT     160 non-null    object  
 15  SU_A3       160 non-null    object  
 16  BRK_DIFF    160 non-null    int64   
 17  NAME        160 non-null    object  
 18  NAME_LONG   160 non-null    object  
 19  BRK_A3      160 non-null    object  
 20  BRK_NAME    160 non-null    object  
 21  BRK_GROUP   0 non-null      object  
 22  ABBREV      160 non-null    object  
 23  POSTAL      160 non-null    object  
 24  FORMAL_EN   158 non-null    object  
 25  FORMAL_FR   5 non-null      object  
 26  NAME_CIAWF  159 non-null    object  
 27  NOTE_ADM0   5 non-null      object  
 28  NOTE_BRK    4 non-null      object  
 29  NAME_SORT   160 non-null    object  
 30  NAME_ALT    2 non-null      object  
 31  MAPCOLOR7   160 non-null    int64   
 32  MAPCOLOR8   160 non-null    int64   
 33  MAPCOLOR9   160 non-null    int64   
 34  MAPCOLOR13  160 non-null    int64   
 35  POP_EST     160 non-null    int64   
 36  POP_RANK    160 non-null    int64   
 37  GDP_MD_EST  160 non-null    float64 
 38  POP_YEAR    160 non-null    int64   
 39  LASTCENSUS  160 non-null    int64   
 40  GDP_YEAR    160 non-null    int64   
 41  ECONOMY     160 non-null    object  
 42  INCOME_GRP  160 non-null    object  
 43  WIKIPEDIA   160 non-null    int64   
 44  FIPS_10_    160 non-null    object  
 45  ISO_A2      160 non-null    object  
 46  ISO_A3      160 non-null    object  
 47  ISO_A3_EH   160 non-null    object  
 48  ISO_N3      160 non-null    object  
 49  UN_A3       160 non-null    object  
 50  WB_A2       160 non-null    object  
 51  WB_A3       160 non-null    object  
 52  WOE_ID      160 non-null    int64   
 53  WOE_ID_EH   160 non-null    int64   
 54  WOE_NOTE    160 non-null    object  
 55  ADM0_A3_IS  160 non-null    object  
 56  ADM0_A3_US  160 non-null    object  
 57  ADM0_A3_UN  160 non-null    int64   
 58  ADM0_A3_WB  160 non-null    int64   
 59  CONTINENT   160 non-null    object  
 60  REGION_UN   160 non-null    object  
 61  SUBREGION   160 non-null    object  
 62  REGION_WB   160 non-null    object  
 63  NAME_LEN    160 non-null    int64   
 64  LONG_LEN    160 non-null    int64   
 65  ABBREV_LEN  160 non-null    int64   
 66  TINY        160 non-null    int64   
 67  HOMEPART    160 non-null    int64   
 68  MIN_ZOOM    160 non-null    float64 
 69  MIN_LABEL   160 non-null    float64 
 70  MAX_LABEL   160 non-null    float64 
 71  NE_ID       160 non-null    int64   
 72  WIKIDATAID  160 non-null    object  
 73  NAME_AR     160 non-null    object  
 74  NAME_BN     160 non-null    object  
 75  NAME_DE     160 non-null    object  
 76  NAME_EN     160 non-null    object  
 77  NAME_ES     160 non-null    object  
 78  NAME_FR     160 non-null    object  
 79  NAME_EL     160 non-null    object  
 80  NAME_HI     160 non-null    object  
 81  NAME_HU     160 non-null    object  
 82  NAME_ID     160 non-null    object  
 83  NAME_IT     160 non-null    object  
 84  NAME_JA     160 non-null    object  
 85  NAME_KO     160 non-null    object  
 86  NAME_NL     160 non-null    object  
 87  NAME_PL     160 non-null    object  
 88  NAME_PT     160 non-null    object  
 89  NAME_RU     160 non-null    object  
 90  NAME_SV     160 non-null    object  
 91  NAME_TR     160 non-null    object  
 92  NAME_VI     160 non-null    object  
 93  NAME_ZH     160 non-null    object  
 94  geometry    160 non-null    geometry
dtypes: float64(4), geometry(1), int64(27), object(63)
memory usage: 120.0+ KB
# Columns retained in the cleaned output: the "ADMIN" attribute and the
# geometry itself.
tokeep = ["ADMIN", "geometry"]
  • Write out to file
# Write only the selected columns to disk using the GPKG driver.
# NOTE(review): the README text shown at the top of this notebook names
# "countries_clean.geojson" as the clean file, while this cell writes
# "countries_clean.gpkg" -- confirm which one is intended.
cleaned = large[tokeep]
cleaned.to_file("countries_clean.gpkg", driver="GPKG")