Constructing a consistent US County Per Capita Income Dataset 1969-2017#

This notebook builds a cleaned geopackage for use in the inequality chapter of the book. It does so by carrying out a number of data processing steps to:

  • Generalize the boundaries of the county shapefile to faciltate processing

  • Joining income attributes with geometries

  • Handling the birth and deaths of counties over the time series

import pandas
from zipfile import ZipFile
import glob
from urllib.request import urlopen
from io import BytesIO
!pip install topojson
import fiona.io
import geopandas as gpd
import pandas
import requests
import topojson as tp
url = 'https://github.com/gdsbook/data_archive/raw/master/us_county_income/tl_2019_us_county.zip'

response = requests.get(url)
data_bytes = response.content
with fiona.io.ZipMemoryFile(data_bytes) as zip_memory_file:
    with zip_memory_file.open('tl_2019_us_county.shp') as collection:
        gdf = gpd.GeoDataFrame.from_features(collection, crs=collection.crs)
#gdf.to_file('tl_2019_us_county.shp')
gdf.shape
topo = tp.Topology(gdf, prequantize=False)
gdf_simplified = topo.toposimplify(5).to_gdf()
gdf_simplified.shape
gdf_simplified.to_file('tl_2019_us_county.shp')

Read the remote file with county income data#

caincurl = 'https://github.com/gdsbook/data_archive/raw/master/us_county_income/CAINC1.zip'
with urlopen(caincurl) as zipresp:
    with ZipFile(BytesIO(zipresp.read())) as zf:
        zf.extractall()
csv_files = glob.glob("*.csv")
csv_files
data = pandas.read_csv('CAINC1__ALL_STATES_1969_2017.csv', encoding='latin-1', 
                      skipfooter=3, engine='python')
data.head()
virginia = data[data.GeoFIPS.str.contains('\"51')] # select virginia
virginia.head()
virginia.shape
virginia.to_csv('virginia.csv')
data.head(20)
data.tail(10)
# fips codes source: https://www.nrcs.usda.gov/wps/portal/nrcs/detail/?cid=nrcs143_013696
fips = """Alabama 	AL 	01
Alaska 	AK 	02
Arizona 	AZ 	04
Arkansas 	AR 	05
California 	CA 	06
Colorado 	CO 	08
Connecticut 	CT 	09
Delaware 	DE 	10
Florida 	FL 	12
Georgia 	GA 	13
Hawaii 	HI 	15
Idaho 	ID 	16
Illinois 	IL 	17
Indiana 	IN 	18
Iowa 	IA 	19
Kansas 	KS 	20
Kentucky 	KY 	21
Louisiana 	LA 	22
Maine 	ME 	23
Maryland 	MD 	24
Massachusetts 	MA 	25
Michigan 	MI 	26
Minnesota 	MN 	27
Mississippi 	MS 	28
Missouri 	MO 	29
Montana 	MT 	30
Nebraska 	NE 	31
Nevada 	NV 	32
New Hampshire 	NH 	33
New Jersey 	NJ 	34
New Mexico 	NM 	35
New York 	NY 	36
North Carolina 	NC 	37
North Dakota 	ND 	38
Ohio 	OH 	39
Oklahoma 	OK 	40
Oregon 	OR 	41
Pennsylvania 	PA 	42
Rhode Island 	RI 	44
South Carolina 	SC 	45
South Dakota 	SD 	46
Tennessee 	TN 	47
Texas 	TX 	48
Utah 	UT 	49
Vermont 	VT 	50
Virginia 	VA 	51
Washington 	WA 	53
West Virginia 	WV 	54
Wisconsin 	WI 	55
Wyoming 	WY 	56
American Samoa 	AS 	60
Guam 	GU 	66
Northern Mariana Islands 	MP 	69
Puerto Rico 	PR 	72
Virgin Islands 	VI 	78"""
fips_df = pandas.DataFrame([fip.split("\t") for fip in fips.split("\n")])
fips_df.columns=["State", "Abbreviation", 'FIPS']
fips_df
omit_fips = [" \""+str(fip) for fip in ['02', 15, 60, 66, 69, 72, 78, 90, 91, 92, 93, 94, 95, 96, 97, 98]]
omit_fips
small = data.copy()
for omit in omit_fips:
    small = small[~small.GeoFIPS.str.contains(omit)]
    
small.shape
data.shape
geofips = set(code[:4] for code in data.GeoFIPS.values)
geofips
geofips = set(code[:4] for code in small.GeoFIPS.values)
geofips
small.shape
for fip in geofips:
    sdf = small[small.GeoFIPS.str.contains(fip)]
    sdf.to_csv(fip[2:]+".csv")

We are done as all the csv files have been extracted.

## old 015
import pandas
import geopandas
%matplotlib inline
from zipfile import ZipFile

import glob

Read remote shapefile with county geometries#

gdf = geopandas.read_file("tl_2019_us_county.shp")

State Specific Data Frames#

#gdf = geopandas.read_file("zip:tl_2019_us_county.zip!tl_2019_us_county.shp")
csv_files = glob.glob("??.csv")
csv_files.sort()
csv_files.pop(0) # kick out US
mismatch = []
gdfs = []
#for csv_file in csv_files[:1]:
for csv_file in csv_files:

    #print(csv_file)
    csv = pandas.read_csv(csv_file)
    st = csv_file[:2]
    st_gdf = gdf[gdf.STATEFP==st]
    csv = csv.iloc[3:] # kick out the state level records
    nc, kc = csv.shape
    if nc/3 != st_gdf.shape[0]:
        mismatch.append(st)
        print(st)
    else:
        csv['GeoFIPS'] = [fip.strip().replace("\"", "") for fip in csv.GeoFIPS]
        csv['GEOID'] = csv.GeoFIPS.astype(str)
        merged_gdf = st_gdf.merge(csv, on='GEOID')
        gdfs.append(merged_gdf)
        
csv.GEOID
len(gdfs)
gdf = pandas.concat(gdfs)
gdf.plot()
gdf.shape[0]/3
gdf.head()
gdf.columns
years = [str(year) for year in range(1969,2018)]
dtypes = list(zip(years,[gdf[year].dtype for year in years]))
dtypes

Issues#

  1. Virginia

  2. Wisconsin

  3. Object data types for 1969-2001

gdf.to_file('pcincome0.shp')
# old 020

Virginia independent cities#

Reference https://en.wikipedia.org/wiki/Independent_city_(United_States)

In the United States, an independent city is a city that is not in the territory of any county or counties with exceptions noted below. Of the 41 independent U.S. cities,[1] 38 are in Virginia, whose state constitution makes them a special case. The three independent cities outside Virginia are Baltimore, Maryland; St. Louis, Missouri; and Carson City, Nevada. The U.S. Census Bureau uses counties as its base unit for presentation of statistical information, and treats independent cities as county equivalents for those purposes. The most populous of them is Baltimore, Maryland.

From BEA Income Data:#

Virginia combination areas consist of one or two independent cities with 1980 populations of less than 100,000 combined with an adjacent county. The county name appears first, followed by the city name(s). Separate estimates for the jurisdictions making up the combination area are not available. Bedford County, VA includes the independent city of Bedford for all years.

Virginia#

The Commonwealth of Virginia is divided into 95 counties, along with 38 independent cities that are considered county-equivalents for census purposes. The map in this article, taken from the official United States Census Bureau site, includes Clifton Forge and Bedford as independent cities. This reflected the political reality at the time of the 2000 Census. However, both have since chosen to revert to town status. In Virginia, cities are co-equal levels of government to counties, but towns are part of counties. For some counties, for statistical purposes, the Bureau of Economic Analysis combines any independent cities with the county that it was once part of (before the legislation creating independent cities took place in 1871).

Source

Approach#

Dissolve boundaries of independent cities that BEA does not disclose values for with their adjacent county.

csv = pandas.read_csv('51.csv')
csv.head()
csv['GeoFIPS'] = [fip.strip().replace("\"", "") for fip in csv.GeoFIPS]
gdf = geopandas.read_file('tl_2019_us_county.shp')

gdf = geopandas.read_file(“zip:tl_2019_us_county.zip!tl_2019_us_county.shp”) zipfile = “zip:tl_2019_us_county.zip!tl_2019_us_county.shp”

virginia_gdf = gdf[gdf.STATEFP=="51"]
virginia_gdf.plot()
virginia_gdf.head()
data = csv
data.head()
data['GEOID'] = data.GeoFIPS
data.GEOID

gdf records missing income values#

#data = data[data.LineCode==3]
merged_gdf = virginia_gdf.merge(data, on='GEOID')
merged_gdf.plot()
merged_gdf.head()
merged_gdf.shape[0]/3
matched_names = set(merged_gdf.NAMELSAD)
matched_names
missing_gdf = virginia_gdf[~virginia_gdf.GEOID.isin(merged_gdf.GEOID)]
missing_names = set(missing_gdf.NAMELSAD)
missing_names
missing_gdf.plot()
missing_gdf.shape
missing_gdf
#names = missing_gdf.NAME
names = missing_gdf.NAMELSAD
virginia_income = data[data.GeoName.str.contains(", VA")]
virginia_income.shape
virginia_income.head()
geonames = pandas.unique(virginia_income.GeoName)
geonames
class Combination:
    def __init__(self, label):
        n_commas = label.count(",")
        self.label = label
        self.n_commas = n_commas
        if "Fairfax City" in label:
            label = label.replace('Fairfax City', 'Fairfax')
        if n_commas == 1:
            if "Bedford" in label:
                self.county = 'Bedford'
                self.cities = []
            else:
                words = label.split("+")
                county = words[0].strip()
                self.county = county
                self.cities = [words[-1].split(",")[0].strip()+" city"]
        elif n_commas == 2:
            words = label.split(",")
            self.county = words[0].strip()
            cities = words[1]
            cities = cities.split("+")
            
            self.cities = [city.strip()+" city" for city in cities]
        else:
            print('Bad label: ', label)
        self.county = self.county + " County"
        self.NAMELSAD = self.label
        
combinations_0 = [name for name in geonames if "VA*" in name]
combinations = [Combination(c) for c in combinations_0]
c0 = combinations[0]
c0.label
matches = {}
for name in names:
    hits = []
    matches[name] = []
    if "city" in name:
        hits = [comb for comb in combinations if name in comb.cities]
    else:
        hits = [comb for comb in combinations if name == comb.county]
    if hits:
        matches[name] = hits[0].label
    
        
        

Match combinations to rows of df#

for combination in combinations:
    print("label: ",combination.label)
    comb_names = [combination.county]+combination.cities
    print(comb_names)
merged_dfs = []
geoids = []
for combination in combinations:
    places = combination.cities + [combination.county]
    print(places)
    rows = virginia_gdf[virginia_gdf.NAMELSAD.isin(places)]
    
    if len(rows) != len(places):
        print('missed:', places)
    #d_df = rows.dissolve(by='STATEFP')
    rd = rows.dissolve(by='STATEFP').head()
    rd['NAMELSAD']= combination.label
    geoid = virginia_income[virginia_income.GeoName==combination.label].GEOID.values[0]
    rd['GEOID'] = geoid
    merged_dfs.append(rd)
    
rows.head()
rd.head()
dissolved = pandas.concat(merged_dfs)
dissolved.head()
d_merge = dissolved.merge(virginia_income, on='GEOID')
d_merge.head()
d_merge.shape
d_merge.plot()
merged_gdf.plot()
final_gdf = pandas.concat([merged_gdf, d_merge])
final_gdf.plot()
final_gdf.shape
virginia_gdf.shape
final_gdf.head()
final_gdf.tail()
final_gdf.plot(column='1978')
final_gdf['1979'] = final_gdf['1979'].astype(int)
final_gdf.plot(column='1979', legend=False, scheme='Quantiles', k=9)
final_gdf.columns
final_gdf.head()
final_gdf.shape
327859/37657 * 1000
318/3
final_gdf.STATEFP='51'
final_gdf[final_gdf.NAME=='Charlottesville']
727271*1000/93879

Reverse Matching#

Which income records do not match a record in the shapefile

virginia_gdf = gdf[gdf.STATEFP=="51"]
#data = data[data.LineCode==3]
merged_df = data.merge(virginia_gdf, on='GEOID')
merged_df.shape
missing_df = data[~data.GEOID.isin(merged_df.GEOID)]
#~virginia_gdf.GEOID.isin(merged_gdf.GEOID)
set(missing_df.GeoName)
pcincome_gdf = geopandas.read_file('pcincome0.shp')
pcincome_gdf.head()
final_gdf.head()
final_gdf.columns
pcincome_gdf.columns
common = set(pcincome_gdf.columns).intersection(final_gdf.columns)
common
us = pcincome_gdf[common]
va = final_gdf[common]
va.reset_index(inplace=True)
dup = va[va[["NAME", "GEOID", "1969"]].duplicated()]
dup.NAME
dup
va = va.drop(dup.index)
usva = pandas.concat([us,va])
usva.plot()
usva.shape
9027/3
years = [str(year) for year in range(1969,2018)]
dtypes = list(zip(years,[usva[year].dtype for year in years]))
dtypes
usva.to_file('usva.shp')
usva.head()
usva.columns
namedf = usva[['GeoName', 'STATEFP', '1969']]
namedf[namedf.duplicated()]
namedf[namedf.GeoName=='Bedford, VA*']

Wisconsin#

csv = pandas.read_csv('55.csv')
csv.head()
csv[csv.GeoName.str.startswith('Shawano')]
csv[csv.GeoName.str.startswith('Menominee')]
csv = csv.replace('(NA)', 0)
menominee = csv.iloc[[120, 121, 122]]
shawano = csv.iloc[[177, 178, 179]]
combined = csv.iloc[[219, 220, 221]]
years = [str(y) for y in range(1989, 2018)]
menominee[years] = menominee[years].astype(int)
shawano[years] = shawano[years].astype(int)
combined[years] = combined[years].astype(int)
cindex = [ combined.columns.values.tolist().index(y) for y in years]

for c,y in zip(cindex,years):
    #print(menominee[y].dtype)
    #print(shawano[y].dtype)
    #print(combined[y].dtype)
    combined.iloc[0,c] = menominee.iloc[0,c] + shawano.iloc[0,c]
    combined.iloc[1,c] = menominee.iloc[1,c] + shawano.iloc[1,c]
    combined.iloc[2,c] = int(combined.iloc[0,c]*1000/ combined.iloc[1,c])
    csv.iloc[219,c] = combined.iloc[0,c]
    csv.iloc[220,c] = combined.iloc[1,c]
    csv.iloc[221,c] = combined.iloc[2,c]
combined
csv.iloc[[219,220,221]]

Drop shawano and menomiee from csv#

drop_ids = [120, 121, 122, 177, 178, 179]
csv = csv.drop(drop_ids)
years = [str(y) for y in range(1969, 2018)]
csv[years] = csv[years].astype(int)
for year in range(1969,2018):
    print(csv[str(year)].dtype)
    

Dissolve shawano and menominee geometries in gdf#

dissolve and assign geofips

gdf = geopandas.read_file("tl_2019_us_county.shp")
wisconsin_gdf = gdf[gdf.STATEFP=="55"]
wisconsin_gdf.plot()
wisconsin_gdf[wisconsin_gdf.NAME=='Shawano']
wisconsin_gdf[wisconsin_gdf.NAME=='Menominee']
combined = wisconsin_gdf[wisconsin_gdf.COUNTYFP.isin(['115', '078'])].dissolve(by='STATEFP')
combined.plot()
combined.head()
combined['NAME'] = "Shawano+Menominee"
combined['NAMELSAD'] = "Shawano+Menominee Counties"
combined.head()
combined['GEOID'] = '55901'
combined.head()
wisc0 = wisconsin_gdf[~wisconsin_gdf.COUNTYFP.isin(['115', '078'])]
wisc0.plot()
wisc1 = pandas.concat([wisc0, combined])
wisc1.plot()
wisc1.head()
wisc1.shape
wisconsin_gdf.shape

Merge gdf and csv#

csv['GeoFIPS'] = [fip.strip().replace("\"", "") for fip in csv.GeoFIPS]
wisc1.head()
wisc1.STATEFP='55'
data = csv
data.head()
data['GEOID'] = data.GeoFIPS
data.GEOID

gdf records missing income values#

#data = data[data.LineCode==3]
merged_gdf = wisc1.merge(data, on='GEOID')
set(wisc1.GEOID)
merged_gdf.plot()
merged_gdf.head()
merged_gdf.shape[0]/3
matched_names = set(merged_gdf.NAMELSAD)
matched_names
missing_gdf = wisc1[~wisc1.GEOID.isin(merged_gdf.GEOID)]
missing_names = set(missing_gdf.NAMELSAD)
missing_names
missing_gdf.head()
usva = geopandas.read_file('usva.shp')
us_no_wisc = usva[usva.STATEFP!='55']
us = pandas.concat([merged_gdf, us_no_wisc])
us.shape
9098/3
us_no_wisc.plot()
us.plot()
us.shape
9240/3
us.head()
for y in range(1969, 2018):
    print(us[str(y)].dtype)
us.to_file('usincome.shp')

Other#

Issue from BEA Income Data:#

  • * Cibola, NM was separated from Valencia in June 1981, but in these estimates, Valencia includes Cibola through the end of 1981.
  • * La Paz County, AZ was separated from Yuma County on January 1, 1983. The Yuma, AZ MSA contains the area that became La Paz County, AZ through 1982 and excludes it beginning with 1983.
  • * Broomfield County, CO, was created from parts of Adams, Boulder, Jefferson, and Weld counties effective November 15, 2001. Estimates for Broomfield county begin with 2002.
  • Approach#

    • combine Cibola NM with Valencia for 1981-2017

    • combine La Paz County AZ with Yuma for 1983-2017

    • combine Broomfield County with Boulder CO all years (<2002 Boulder, >2002 Boulder+Broomfield)

    import pandas
    import geopandas
    %matplotlib inline
    

    NM#

    csv = pandas.read_csv('35.csv')
    
    csv = csv.replace('(NA)', 0)
    
    cibola = csv[csv.GeoName.str.startswith('Cibola')]
    valencia = csv[csv.GeoName.str.startswith('Valencia')]
    
    cibola
    
    valencia
    
    years = [ str(y) for y in range(1969, 2018)]
    
    combined = cibola.copy()
    cibola[years] = cibola[years].astype(int)
    valencia[years] = valencia[years].astype(int)
    combined[years] = combined[years].astype(int)
    
    cindex = [ combined.columns.values.tolist().index(y) for y in years]
    
    for c,y in zip(cindex,years):
        #print(menominee[y].dtype)
        #print(shawano[y].dtype)
        #print(combined[y].dtype)
        
        
        combined.iloc[0,c] = valencia.iloc[0,c] + cibola.iloc[0,c]
        combined.iloc[1,c] = valencia.iloc[1,c] + cibola.iloc[1,c]
        combined.iloc[2,c] = int(combined.iloc[0,c]*1000/ combined.iloc[1,c])
        #csv.iloc[219,c] = combined.iloc[0,c]
        #csv.iloc[220,c] = combined.iloc[1,c]
        #csv.iloc[221,c] = combined.iloc[2,c]
    
    combined
    

    Drop Cibola and Valencia and Update Combined Fields#

    csv = csv.drop([12, 13, 14, 99, 100, 101])
    combined['GeoName'] = 'Cibola+Valencia'
    combined['GeoFIPS'] = '"35061"'
    
    combined = combined[csv.columns]
    
    csv = pandas.concat([csv, combined])
    
    csv.head()
    
    csv.tail()
    
    gdf = geopandas.read_file("tl_2019_us_county.shp")
    
    nm_gdf = gdf[gdf.STATEFP=="35"]
    
    nm_gdf.plot()
    
    nm_gdf[nm_gdf.NAME.isin(['Valencia', 'Cibola'])]
    
    combined = nm_gdf[nm_gdf.NAME.isin(['Valencia', 'Cibola'])].dissolve(by='STATEFP')
    
    combined.plot()
    
    combined['NAME'] = 'Cibola+Valencia'
    combined['NAMELSAD'] = 'Cibola+Valencia Counties'
    combined['STATEFP'] = 35
    
    combined['GEOID'] =  '35061' # Valencia
    
    nm0 = nm_gdf[~nm_gdf.COUNTYFP.isin(['006', '061'])]
    
    nm0.plot()
    
    nm1 = pandas.concat([nm0, combined])
    
    nm1.plot()
    
    nm_gdf.shape
    
    nm1.shape
    
    csv['GeoFIPS'] = [fip.strip().replace("\"", "") for fip in csv.GeoFIPS]
    
    data = csv
    data['GEOID'] = data.GeoFIPS
    
    merged_gdf = nm1.merge(data, on='GEOID')
    
    merged_gdf.shape
    
    merged_gdf.head()
    
    us = geopandas.read_file('usincome.shp')
    
    us1 = pandas.concat([us[us.STATEFP!='35'], merged_gdf])
    
    us1.shape
    
    us1.head()
    
    us1.tail()
    
    9237/3
    
    merged_gdf.head()
    
    us1[us1.NAME.str.startswith('Cibo')]
    

    Arizona#

    • combine La Paz County AZ with Yuma for 1983-2017

    csv = pandas.read_csv('04.csv')
    
    csv.head()
    
    csv = csv.replace('(NA)', 0)
    
    csv[csv.GeoName.str.startswith('La Paz')]
    
    csv[csv.GeoName.str.startswith('Yuma')]
    
    la_paz = csv.iloc[[21, 22, 23]]
    yuma = csv.iloc[[45, 46, 47]]
    combined = yuma.copy()
    years = [str(y) for y in range(1969, 2018)]
    la_paz[years] = la_paz[years].astype(int)
    yuma[years] = yuma[years].astype(int)
    combined[years] = combined[years].astype(int)
    
    cindex = [ combined.columns.values.tolist().index(y) for y in years]
    
    for c,y in zip(cindex,years):
        #print(menominee[y].dtype)
        #print(shawano[y].dtype)
        #print(combined[y].dtype)
        combined.iloc[0,c] = la_paz.iloc[0,c] + yuma.iloc[0,c]
        combined.iloc[1,c] = la_paz.iloc[1,c] + yuma.iloc[1,c]
        combined.iloc[2,c] = int(combined.iloc[0,c]*1000/ combined.iloc[1,c])
    
    combined
    
    csv = csv.drop([21, 22, 23, 45, 46, 47])
    combined['GeoName'] = 'Yuma+La Paz'
    combined['GeoFIPS'] = '"04027"'
    combined['STATEFP'] = 4
    
    combined = combined[csv.columns]
    
    combined.head()
    
    csv = pandas.concat([csv, combined])
    
    csv.shape
    
    csv.head()
    
    gdf = geopandas.read_file("tl_2019_us_county.shp")
    az_gdf = gdf[gdf.STATEFP=='04']
    
    az_gdf.plot()
    
    az_gdf[az_gdf.NAME=='La Paz']
    
    az_gdf[az_gdf.NAME=='Yuma']
    
    combined = az_gdf[az_gdf.COUNTYFP.isin(['027', '012'])].dissolve(by='STATEFP')
    
    combined.plot()
    
    combined['NAME']= "Yuma+La Paz"
    combined['NAMELSAD'] = "Yuma+La Paz Counties"
    
    combined.head()
    
    az0 = az_gdf[~az_gdf.COUNTYFP.isin(['027', '012'])]
    az0.plot()
    
    az1 = pandas.concat([az0, combined])
    
    az1.plot()
    

    Merge gdf and csv#

    csv['GeoFIPS'] = [fip.strip().replace("\"", "") for fip in csv.GeoFIPS]
    
    data = csv
    data.head()
    
    data['GEOID'] = data.GeoFIPS
    
    merged_gdf = az1.merge(data, on='GEOID')
    merged_gdf['STATEFP'] = '04'
    
    merged_gdf.plot()
    
    us2 = pandas.concat([us1[us1.STATEFP!='04'], merged_gdf])
    
    us2[us2.STATEFP=='04'].shape
    
    us2.shape[0]/3
    

    Colorado#

    • combine Broomfield County with Boulder CO all years (<2002 Boulder, >2002 Boulder+Broomfield)

    csv = pandas.read_csv('08.csv')
    
    csv.head()
    
    csv[csv.GeoName.str.startswith('Boulder')]
    
    csv[csv.GeoName.str.startswith('Broom')]
    
    csv = csv.replace('(NA)', 0)
    boulder = csv.iloc[[21, 22, 23]]
    broomfield = csv.iloc[[24, 25, 26]]
    combined = boulder.copy()
    years = [str(y) for y in range(1989, 2018)]
    boulder[years] = boulder[years].astype(int)
    broomfield[years] = broomfield[years].astype(int)
    combined[years] = combined[years].astype(int)
    
    cindex = [ combined.columns.values.tolist().index(y) for y in years]
    
    for c,y in zip(cindex,years):
        #print(menominee[y].dtype)
        #print(shawano[y].dtype)
        #print(combined[y].dtype)
        combined.iloc[0,c] = boulder.iloc[0,c] + broomfield.iloc[0,c]
        combined.iloc[1,c] = boulder.iloc[1,c] + broomfield.iloc[1,c]
        combined.iloc[2,c] = int(combined.iloc[0,c]*1000/ combined.iloc[1,c])
        
    
    csv = csv.drop([21, 22, 23, 24, 25, 26])
    years = [str(y) for y in range(1969, 2018)]
    csv[years] = csv[years].astype(int)
    
    combined['GeoName'] = 'Boulder+Broomfield'
    combined['GeoFIPS'] = '"08013"'
    
    combined = combined[csv.columns]
    
    csv = pandas.concat([csv, combined])
    
    csv.head()
    
    csv.tail()
    
    csv[csv.GeoName.str.startswith('Bould')]
    
    gdf = geopandas.read_file("tl_2019_us_county.shp")
    
    co_gdf = gdf[gdf.STATEFP=='08']
    
    co_gdf.plot()
    
    co_gdf[co_gdf.NAME=='Boulder']
    
    co_gdf[co_gdf.NAME=='Broomfield']
    
    combined = co_gdf[co_gdf.COUNTYFP.isin(['013', '014'])].dissolve(by='STATEFP')
    
    combined.plot()
    
    combined.head()
    
    combined['NAME'] = "Boulder+Bloomfield"
    combined['NAMELSAD'] = "Boulder+Bloomfield Counties"
    
    combined['GEOID'] = '08013'
    
    combined.columns
    
    co0 = co_gdf[~co_gdf.COUNTYFP.isin(['013', '014'])]
    co0.plot()
    
    co1 = pandas.concat([co0, combined])
    co1.plot()
    
    co_gdf.plot()
    
    co1.shape
    
    co_gdf.shape
    
    csv['GeoFIPS'] = [fip.strip().replace("\"", "") for fip in csv.GeoFIPS]
    
    data = csv
    data['GEOID'] = data.GeoFIPS
    
    merged = co1.merge(data, on='GEOID')
    merged['STATEFP'] = '08'
    
    merged[merged.NAME.str.startswith('Bould')]['STATEFP']
    
    co1.shape
    
    merged.shape
    
    data.shape
    
    merged.plot()
    
    merged.head()
    
    us2.head()
    
    us3 = pandas.concat([us2[us2.STATEFP!='08'], merged])
    
    us3.shape[0]/3
    
    us2.shape[0]/3
    
    us3
    
    us3[us3.STATEFP=='08'].tail()
    
    us3[us3.NAME.str.startswith('Boulder')]
    
    us3.columns
    
    columns = us3.columns
    print(columns)
    
    us3.shape
    
    columns = us3.columns.values.tolist()
    columns
    
    us3.loc[:,'Unnamed: 0']
    
    us3.shape
    
    us4 = us3.drop(columns=['Unnamed: 0', 'Unnamed_ 0', 'IndustryCl'])
    
    us4.shape
    
    for year in range(1969, 2018):
        year = str(year)
        us4[year] = us4[year].astype('int')
    
    us4.to_file('usincome_final.shp')
    
    us4.shape
    

    Shrinking file size#

    We are duplicating the shapes three times (once for each attribute)

    Split out the attributes from the geometries, two different dataframes

    gdf = gpd.read_file('usincome_final.shp')
    
    uscountyincome = pandas.DataFrame(gdf.drop(columns='geometry'))
    
    uscountyincome.to_csv('uscountyincome.csv')
    
    gdf.head()
    
    gdf.columns
    
    gdf.LineCode
    
    gdf = gdf[gdf.LineCode==3]
    
    gdf.shape
    
    %pylab inline
    
    gdf.plot(column='2017')
    
    gdf.to_file("uscountypcincome.gpkg", layer='pcincome', driver="GPKG")
    
    gdf.columns
    
    gdf.shape