16  Airbnb

%matplotlib inline

import requests
import pandas as pd
import geopandas as gpd
#import googlemaps
from scipy.spatial.distance import cdist

16.1 Download files

  • Download detailed listings file:
# Fetch the detailed listings archive and store it next to the notebook.
url = ('http://data.insideairbnb.com/united-states/'
       'ca/san-diego/2016-07-07/data/'
       'listings.csv.gz')
r = requests.get(url)
# Stop on an HTTP error instead of silently saving the error page as the
# archive, which would only fail later when the file is parsed.
r.raise_for_status()
with open('listings.csv.gz', 'wb') as fo:
    fo.write(r.content)
  • Download calendar file with prices
# Fetch the calendar archive and store it next to the notebook.
url = ('http://data.insideairbnb.com/united-states/'
       'ca/san-diego/2016-07-07/data/'
       'calendar.csv.gz')
r = requests.get(url)
# Stop on an HTTP error instead of silently saving the error page as the
# archive, which would only fail later when the file is parsed.
r.raise_for_status()
with open('calendar.csv.gz', 'wb') as fo:
    fo.write(r.content)
  • Download neighborhoods
# Fetch the neighbourhood boundaries (GeoJSON) and store them locally.
url = ('http://data.insideairbnb.com/united-states/'
       'ca/san-diego/2016-07-07/visualisations/'
       'neighbourhoods.geojson')
r = requests.get(url)
# Stop on an HTTP error instead of silently saving the error page as the
# GeoJSON file, which would only fail later when the file is parsed.
r.raise_for_status()
with open('neighbourhoods.geojson', 'wb') as fo:
    fo.write(r.content)

16.2 Variable setup

16.2.1 Parse price

import numpy as np

# Load the listings table and derive a numeric price plus its natural log.
lst = pd.read_csv('listings.csv.gz')
# The conversion treats `price` as text: thousands separators (',') are
# removed and the '$' sign is stripped before casting to float.
lst['priceN'] = lst['price'].apply(
    lambda x: float(str(x).replace(',', '').strip('$')))
# `pd.np` is no longer available in current pandas releases, so NumPy is
# called directly.
lst['l_price'] = np.log(lst['priceN'])

16.2.2 GeoDataFrame

from shapely.geometry import Point

# One point geometry per listing, built from its (longitude, latitude) pair.
xys = lst[['longitude', 'latitude']]\
        .apply(lambda row: Point(*row), axis=1)
# "EPSG:4326" is the authority-string form of the same lon/lat CRS; the
# "+init=epsg:4326" spelling is deprecated by pyproj and triggers warnings.
gdb = gpd.GeoDataFrame(lst.assign(geometry=xys),
                       crs="EPSG:4326")

16.2.3 Variables

  • Pool
# Split every listing's `amenities` string (outer braces removed) on commas.
# Working on the whole column avoids the per-row `.loc[i, ...]` access, which
# only works when the index is exactly 0..n-1.
amenity_lists = gdb['amenities'].apply(
    lambda r: r.strip('{').strip('}').split(','))
# Flat list of every amenity entry found, kept for inspection below.
ams = [amenity for pcs in amenity_lists for amenity in pcs]
# 1 when the listing has an entry that is exactly 'Pool', 0 otherwise.
gdb['pool'] = amenity_lists.apply(lambda pcs: int('Pool' in pcs))
set(ams)
  • Distance to Balboa Park


View Larger Map

We can use geopy to find out its location:

from geopy.geocoders import Nominatim
# Nominatim requires an application-specific user agent; geopy 2.x raises a
# ConfigurationError when the default one is used.
geolocator = Nominatim(user_agent="airbnb_san_diego_notebook")

Just type the name into the locator:

# Look the park up by name; `geocode` returns None when nothing matches.
bp = geolocator.geocode("Balboa Park, San Diego, US")
if bp is None:
    # Fail with a clear message rather than an AttributeError further down
    raise ValueError("Geocoder returned no result for Balboa Park")
bp
# Stored as (longitude, latitude), i.e. x/y order
b_ll = bp.longitude, bp.latitude
b_ll

Then calculate distance to the park from each house:

# Target projection with metre units: USA Contiguous Albers Equal Area
# (http://epsg.io/102003)
tgt_crs = ("+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=37.5 "
           "+lon_0=-96 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs")

# Reproject the park location and keep it as a plain (x, y) tuple
b_xy = gpd.GeoSeries(Point(b_ll), crs=gdb.crs).to_crs(tgt_crs)[0]
b_xy = (b_xy.x, b_xy.y)


def d2b(pt):
    """Return the distance from point `pt` to `b_xy`, in Km."""
    metres = cdist([(pt.x, pt.y)], [b_xy])[0][0]
    return metres / 1000


# Distance of every listing to the park, measured in the projected CRS
gdb['d2balboa'] = gdb['geometry'].to_crs(tgt_crs).apply(d2b)
gdb.plot(column='d2balboa',
         scheme='quantiles',
         k=9,
         cmap='viridis_r',
         s=1)
  • Elevation
key = open('../google_maps_key').readline().strip('\n')
gmaps = googlemaps.Client(key=key)
# Google takes lat/lon instead of lon/lat
gmaps.elevation([b_ll[::-1]])
pts = gdb['geometry'].apply(lambda pt: (pt.y, pt.x))
%time ele = gmaps.elevation(pts.head().tolist())
ele
extract_ele = lambda x: pd.Series(x)['elevation']
eleS = pd.Series(ele).apply(extract_ele)
eleS
  • Coastal neighborhood?

NOTE: There still seem to be some mistakes, but `neighbourhood_cleansed` works much better than `neighbourhood`.

# Neighbourhood names treated as coastal. 'Del Mar Heights' was previously
# misspelled ('Del Mar Heighs'), so it could never match a listing.
coastal_neighborhoods = ['Wooded Area', 'Ocean Beach', 'Pacific Beach',
                         'La Jolla', 'Torrey Pines', 'Del Mar Heights',
                         'Mission Bay']


def coastal(neigh):
    """Return 1 if `neigh` is listed in `coastal_neighborhoods`, else 0."""
    return int(neigh in coastal_neighborhoods)
# Flag each listing by whether its cleansed neighbourhood is coastal
gdb['coastal_neig'] = gdb['neighbourhood_cleansed'].map(coastal)

# Categorical map of the flag; the trailing ';' suppresses the cell output
gdb.plot(column='coastal_neig',
         categorical=True,
         legend=True,
         s=1);
  • Large neighborhood

We keep only observations in neighborhoods with more than 25 Airbnb listings, so that neighborhood fixed effects (FE) make sense.

# Boolean lookup indexed by neighbourhood: True when it has more than 25 rows
lrg_nei = gdb.groupby('neighbourhood_cleansed').size().gt(25)
# Attach the flag of its neighbourhood to every listing
gdb['lrg_nei'] = gdb['neighbourhood_cleansed'].map(lrg_nei)
  • List to keep
# Columns carried over into the output table
xs = [
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'neighbourhood_cleansed',
    'pool',
    'd2balboa',
    'coastal_neig',
    'lrg_nei',
    'priceN',
    'l_price',
    'geometry',
    'id',
]

16.2.4 Dummies

  • Room type
# Dummy columns for `room_type`, prefixed 'rt'; spaces in the resulting
# column names are replaced by underscores.
rt = pd.get_dummies(gdb['room_type'], prefix='rt')
rt = rt.rename(columns=lambda col: col.replace(' ', '_'))
  • Property type
def simplify(p):
    """Return `p` if it is one of the four main property types, else 'Other'."""
    main_types = ('House', 'Apartment', 'Condominium', 'Townhouse')
    return p if p in main_types else 'Other'

# Collapse `property_type` into the groups returned by `simplify`
gdb['property_group'] = gdb['property_type'].map(simplify)
# Dummy columns for the grouped property type, prefixed 'pg'
pg = pd.get_dummies(gdb['property_group'], prefix='pg')

16.3 Write out

import os

gdb[['lrg_nei']].info()

# Remove any previous output before writing (presumably because the GeoJSON
# driver does not overwrite an existing file -- TODO confirm). Done once, in
# Python, so it works without a shell and stays silent when no file exists.
if os.path.exists('regression_db.geojson'):
    os.remove('regression_db.geojson')

# Assemble the output table: selected columns plus both sets of dummies,
# restricted to large neighbourhoods, with incomplete rows dropped.
final = (
    gdb[xs]
    .join(pg)
    .join(rt)
    .rename(columns={'priceN': 'price'})
    # `== True` is deliberate: it also yields False for missing flags
    .loc[gdb['lrg_nei'] == True, :]  # noqa: E712
    .drop(['lrg_nei'], axis=1)
    .dropna()
    .rename(columns={'neighbourhood_cleansed': 'neighborhood',
                     'coastal_neig': 'coastal',
                     'l_price': 'log_price'})
)
final.to_file('regression_db.geojson', driver='GeoJSON')
final.info()