You can find details about how I obtained the NYC Airbnb dataset here
Let’s set some global options
# Default figure settings shared by every chunk in this document:
# figure width, height-to-width aspect ratio, and displayed output width.
knitr::opts_chunk$set(fig.width = 6, fig.asp = 0.6, out.width = "90%")
library(tidyverse)
library(viridis)
Let’s clean the data a bit
# Load the listings and tidy them for the exploration below.
nyc_airbnb <- read_csv("data/nyc_airbnb.csv") %>%
  # NOTE(review): X1 is presumably the unnamed index column that read_csv
  # auto-named -- confirm against the raw CSV.
  select(-X1) %>%
  rename(borough = neighbourhood_group) %>%
  # stars is the location review score halved; presumably this converts a
  # 10-point score to a 5-star scale -- TODO confirm.
  mutate(stars = review_scores_location / 2)
Now that we have our data and libraries ready, let the exploration begin!
# Number of listings for each room type, rendered as a table.
nyc_airbnb %>%
  count(room_type) %>%
  knitr::kable()
room_type | n |
---|---|
Entire home/apt | 26050 |
Private room | 22822 |
Shared room | 1169 |
# Number of listings per borough, largest count first, rendered as a table.
nyc_airbnb %>%
  count(borough, sort = TRUE) %>%
  knitr::kable()
borough | n |
---|---|
Manhattan | 23052 |
Brooklyn | 20234 |
Queens | 5454 |
Bronx | 960 |
Staten Island | 341 |
# Per-neighbourhood summary of `stars`: median, mean, and listing count.
# Missing stars are dropped before averaging, so a neighbourhood whose stars
# are all missing shows NA / NaN; n() still counts every listing.
nyc_airbnb %>%
  group_by(neighbourhood) %>%
  summarize(
    median = median(stars, na.rm = TRUE),
    mean = mean(stars, na.rm = TRUE),
    number_of_listings = n()
  ) %>%
  knitr::kable()
neighbourhood | median | mean | number_of_listings |
---|---|---|---|
Allerton | 4.50 | 4.553571 | 28 |
Arden Heights | 4.25 | 4.250000 | 2 |
Arrochar | 5.00 | 4.718750 | 19 |
Arverne | 5.00 | 4.700000 | 81 |
Astoria | 5.00 | 4.789148 | 940 |
Bath Beach | 5.00 | 4.818182 | 13 |
Battery Park City | 5.00 | 4.918919 | 66 |
Bay Ridge | 5.00 | 4.791262 | 133 |
Bay Terrace | 4.50 | 4.500000 | 5 |
Bay Terrace, Staten Island | 4.75 | 4.750000 | 3 |
Baychester | 4.25 | 4.250000 | 6 |
Bayside | 4.75 | 4.708333 | 45 |
Bayswater | 4.50 | 4.416667 | 17 |
Bedford-Stuyvesant | 4.50 | 4.534142 | 3552 |
Belle Harbor | 5.00 | 5.000000 | 6 |
Bellerose | 5.00 | 4.444444 | 14 |
Belmont | 4.50 | 4.642857 | 17 |
Bensonhurst | 4.50 | 4.448980 | 62 |
Bergen Beach | 4.50 | 4.571429 | 12 |
Boerum Hill | 5.00 | 4.932099 | 196 |
Borough Park | 4.50 | 4.494444 | 124 |
Breezy Point | 4.75 | 4.750000 | 2 |
Briarwood | 4.50 | 4.613636 | 53 |
Brighton Beach | 5.00 | 4.769231 | 65 |
Bronxdale | 5.00 | 4.687500 | 18 |
Brooklyn Heights | 5.00 | 4.961538 | 148 |
Brownsville | 4.50 | 4.241667 | 74 |
Bull’s Head | 5.00 | 5.000000 | 5 |
Bushwick | 4.50 | 4.625798 | 2461 |
Cambria Heights | 4.50 | 4.692308 | 19 |
Canarsie | 4.50 | 4.471963 | 127 |
Carroll Gardens | 5.00 | 4.925258 | 230 |
Castle Hill | 4.75 | 4.750000 | 5 |
Castleton Corners | 5.00 | 4.875000 | 6 |
Chelsea | 5.00 | 4.934833 | 1161 |
Chelsea, Staten Island | NA | NaN | 1 |
Chinatown | 5.00 | 4.722570 | 394 |
City Island | 5.00 | 5.000000 | 15 |
Civic Center | 5.00 | 4.794872 | 67 |
Claremont Village | 4.50 | 4.300000 | 22 |
Clason Point | 4.50 | 4.708333 | 19 |
Clifton | 4.50 | 4.227273 | 13 |
Clinton Hill | 5.00 | 4.805319 | 594 |
Co-op City | 4.50 | 4.666667 | 6 |
Cobble Hill | 5.00 | 4.929348 | 103 |
College Point | 4.50 | 4.676471 | 20 |
Columbia St | 5.00 | 4.763889 | 43 |
Concord | 5.00 | 4.735294 | 19 |
Concourse | 4.50 | 4.475000 | 52 |
Concourse Village | 4.50 | 4.312500 | 37 |
Coney Island | 4.50 | 4.447368 | 22 |
Corona | 4.50 | 4.670454 | 53 |
Crown Heights | 4.50 | 4.585004 | 1608 |
Cypress Hills | 4.50 | 4.395238 | 118 |
Ditmars Steinway | 5.00 | 4.770270 | 346 |
Dongan Hills | 4.75 | 4.750000 | 2 |
Douglaston | 5.00 | 4.875000 | 8 |
Downtown Brooklyn | 5.00 | 4.896552 | 84 |
DUMBO | 5.00 | 4.804348 | 38 |
Dyker Heights | 4.50 | 4.638889 | 23 |
East Elmhurst | 4.50 | 4.692000 | 152 |
East Flatbush | 4.50 | 4.416667 | 397 |
East Harlem | 4.50 | 4.509424 | 1191 |
East Morrisania | 4.50 | 4.428571 | 8 |
East New York | 4.50 | 4.267956 | 210 |
East Village | 5.00 | 4.875550 | 2012 |
Eastchester | 4.50 | 4.687500 | 12 |
Edenwald | 5.00 | 4.750000 | 9 |
Edgemere | 4.50 | 4.166667 | 9 |
Elmhurst | 4.50 | 4.663889 | 226 |
Eltingville | 5.00 | 5.000000 | 3 |
Emerson Hill | 5.00 | 5.000000 | 4 |
Far Rockaway | 4.50 | 4.500000 | 29 |
Fieldston | 5.00 | 4.916667 | 13 |
Financial District | 5.00 | 4.910405 | 604 |
Flatbush | 4.50 | 4.633745 | 646 |
Flatiron District | 5.00 | 4.981013 | 107 |
Flatlands | 4.50 | 4.418182 | 70 |
Flushing | 4.50 | 4.634561 | 422 |
Fordham | 4.50 | 4.214286 | 35 |
Forest Hills | 5.00 | 4.754310 | 148 |
Fort Greene | 5.00 | 4.860955 | 451 |
Fort Hamilton | 5.00 | 4.822222 | 57 |
Fort Wadsworth | NA | NaN | 1 |
Fresh Meadows | 4.50 | 4.673913 | 27 |
Gerritsen Beach | 4.75 | 4.750000 | 3 |
Glen Oaks | 4.50 | 4.500000 | 1 |
Glendale | 4.50 | 4.428571 | 47 |
Gowanus | 5.00 | 4.774510 | 245 |
Gramercy | 5.00 | 4.923701 | 408 |
Graniteville | 4.50 | 4.333333 | 5 |
Grant City | 5.00 | 4.833333 | 3 |
Gravesend | 4.50 | 4.662500 | 56 |
Great Kills | 5.00 | 4.666667 | 9 |
Greenpoint | 5.00 | 4.795073 | 1157 |
Greenwich Village | 5.00 | 4.974359 | 412 |
Grymes Hill | 5.00 | 4.833333 | 4 |
Harlem | 4.50 | 4.631371 | 2807 |
Hell’s Kitchen | 5.00 | 4.907673 | 2087 |
Highbridge | 4.50 | 4.483333 | 32 |
Hollis | 5.00 | 4.444444 | 13 |
Holliswood | 4.50 | 4.500000 | 2 |
Howard Beach | 5.00 | 4.818182 | 16 |
Howland Hook | 4.50 | 4.500000 | 1 |
Huguenot | 5.00 | 5.000000 | 3 |
Hunts Point | 4.00 | 4.227273 | 14 |
Inwood | 4.50 | 4.548781 | 266 |
Jackson Heights | 5.00 | 4.744828 | 186 |
Jamaica | 4.50 | 4.547771 | 191 |
Jamaica Estates | 5.00 | 4.791667 | 21 |
Jamaica Hills | 5.00 | 5.000000 | 3 |
Kensington | 4.50 | 4.628906 | 171 |
Kew Gardens | 5.00 | 4.807692 | 43 |
Kew Gardens Hills | 4.50 | 4.625000 | 17 |
Kingsbridge | 4.50 | 4.575472 | 66 |
Kips Bay | 5.00 | 4.834232 | 511 |
Laurelton | 4.50 | 4.583333 | 15 |
Lighthouse Hill | 4.75 | 4.750000 | 3 |
Little Italy | 5.00 | 4.842857 | 141 |
Little Neck | 5.00 | 5.000000 | 2 |
Long Island City | 5.00 | 4.748307 | 546 |
Longwood | 4.50 | 4.473684 | 50 |
Lower East Side | 5.00 | 4.819608 | 981 |
Manhattan Beach | 4.50 | 4.571429 | 9 |
Marble Hill | 5.00 | 4.750000 | 12 |
Mariners Harbor | 4.25 | 4.250000 | 8 |
Maspeth | 4.50 | 4.493506 | 97 |
Melrose | 4.00 | 4.125000 | 6 |
Middle Village | 4.50 | 4.692308 | 28 |
Midland Beach | 5.00 | 4.700000 | 6 |
Midtown | 5.00 | 4.919551 | 1696 |
Midwood | 4.50 | 4.628049 | 121 |
Mill Basin | 4.50 | 4.500000 | 6 |
Morningside Heights | 5.00 | 4.829670 | 398 |
Morris Heights | 4.50 | 4.500000 | 23 |
Morris Park | 5.00 | 4.909091 | 16 |
Morrisania | 4.50 | 4.583333 | 13 |
Mott Haven | 4.50 | 4.409091 | 51 |
Mount Eden | 4.50 | 4.555556 | 10 |
Mount Hope | 4.50 | 4.571429 | 14 |
Murray Hill | 5.00 | 4.834936 | 488 |
Navy Yard | 4.50 | 4.681818 | 13 |
Neponsit | 5.00 | 5.000000 | 2 |
New Brighton | 4.50 | 4.333333 | 4 |
New Dorp | 5.00 | 5.000000 | 2 |
New Dorp Beach | 4.50 | 4.500000 | 4 |
New Springville | 5.00 | 4.900000 | 8 |
NoHo | 5.00 | 4.976923 | 81 |
Nolita | 5.00 | 4.952586 | 339 |
North Riverdale | 4.75 | 4.625000 | 10 |
Norwood | 4.50 | 4.520000 | 37 |
Oakwood | 5.00 | 5.000000 | 2 |
Olinville | 5.00 | 5.000000 | 3 |
Ozone Park | 4.50 | 4.557143 | 49 |
Park Slope | 5.00 | 4.956221 | 544 |
Parkchester | 4.50 | 4.637931 | 33 |
Pelham Bay | 4.50 | 4.346154 | 15 |
Pelham Gardens | 4.75 | 4.750000 | 25 |
Pleasant Plains | NA | NaN | 1 |
Port Morris | 4.50 | 4.357143 | 38 |
Port Richmond | 4.50 | 4.357143 | 10 |
Prince’s Bay | 5.00 | 5.000000 | 3 |
Prospect Heights | 5.00 | 4.920128 | 371 |
Prospect-Lefferts Gardens | 4.50 | 4.612766 | 574 |
Queens Village | 4.50 | 4.583333 | 45 |
Randall Manor | 4.50 | 4.500000 | 18 |
Red Hook | 4.50 | 4.663934 | 72 |
Rego Park | 5.00 | 4.641975 | 96 |
Richmond Hill | 4.50 | 4.557971 | 79 |
Richmondtown | 5.00 | 5.000000 | 1 |
Ridgewood | 4.50 | 4.583612 | 413 |
Riverdale | 5.00 | 4.833333 | 15 |
Rockaway Beach | 5.00 | 4.738095 | 70 |
Roosevelt Island | 5.00 | 4.692983 | 83 |
Rosebank | 4.50 | 4.500000 | 4 |
Rosedale | 4.50 | 4.413793 | 37 |
Rossville | 5.00 | 5.000000 | 1 |
Schuylerville | 4.50 | 4.600000 | 7 |
Sea Gate | 5.00 | 4.666667 | 5 |
Sheepshead Bay | 5.00 | 4.763441 | 154 |
Shore Acres | 4.75 | 4.750000 | 9 |
Silver Lake | 5.00 | 5.000000 | 2 |
SoHo | 5.00 | 4.977032 | 380 |
Soundview | 4.75 | 4.750000 | 9 |
South Beach | 5.00 | 4.875000 | 5 |
South Ozone Park | 4.50 | 4.558823 | 48 |
South Slope | 5.00 | 4.939076 | 290 |
Springfield Gardens | 4.50 | 4.500000 | 57 |
Spuyten Duyvil | 4.75 | 4.625000 | 5 |
St. Albans | 4.50 | 4.561224 | 62 |
St. George | 5.00 | 4.743590 | 47 |
Stapleton | 4.50 | 4.586207 | 34 |
Stuyvesant Town | 5.00 | 4.769231 | 28 |
Sunnyside | 5.00 | 4.785185 | 350 |
Sunset Park | 4.50 | 4.656151 | 410 |
Theater District | 5.00 | 4.903704 | 250 |
Throgs Neck | 5.00 | 4.846154 | 14 |
Todt Hill | 5.00 | 5.000000 | 1 |
Tompkinsville | 4.50 | 4.590909 | 38 |
Tottenville | 5.00 | 4.833333 | 4 |
Tremont | 4.00 | 4.187500 | 10 |
Tribeca | 5.00 | 4.954918 | 171 |
Two Bridges | 4.50 | 4.638889 | 69 |
Unionport | 4.50 | 4.500000 | 4 |
University Heights | 4.50 | 4.473684 | 23 |
Upper East Side | 5.00 | 4.875772 | 1952 |
Upper West Side | 5.00 | 4.901019 | 2156 |
Van Nest | 4.50 | 4.363636 | 15 |
Vinegar Hill | 5.00 | 4.708333 | 31 |
Wakefield | 4.50 | 4.500000 | 41 |
Washington Heights | 4.50 | 4.585714 | 919 |
West Brighton | 4.50 | 4.535714 | 17 |
West Farms | 4.50 | 4.166667 | 5 |
West Village | 5.00 | 4.974097 | 815 |
Westchester Square | 4.50 | 4.583333 | 7 |
Westerleigh | 5.00 | 5.000000 | 3 |
Whitestone | 5.00 | 4.857143 | 12 |
Williamsbridge | 4.50 | 4.465517 | 39 |
Williamsburg | 5.00 | 4.856574 | 4168 |
Willowbrook | 4.50 | 4.500000 | 1 |
Windsor Terrace | 5.00 | 4.855634 | 173 |
Woodhaven | 4.50 | 4.416667 | 73 |
Woodlawn | 4.75 | 4.750000 | 8 |
Woodrow | 4.50 | 4.500000 | 2 |
Woodside | 5.00 | 4.708333 | 211 |
# Median and mean `stars` (ignoring missing values) plus the number of
# listings for each room type.
nyc_airbnb %>%
  group_by(room_type) %>%
  summarize(
    median_stars = median(stars, na.rm = TRUE),
    average_stars = mean(stars, na.rm = TRUE),
    number_of_listings = n()
  )
## # A tibble: 3 x 4
## room_type median_stars average_stars number_of_listings
## <chr> <dbl> <dbl> <int>
## 1 Entire home/apt 5 4.78 26050
## 2 Private room 5 4.71 22822
## 3 Shared room 5 4.68 1169
# Histogram of `stars`, one panel per room type, with bars filled by borough.
# The borough mapping must live inside aes(): the original passed
# `color = boro` as a stray argument to ggplot(), where it was silently
# ignored (and `boro` is not a column -- the data calls it `borough`).
# `fill` is used rather than `color` because, for bars, `color` only affects
# the outline.
ggplot(nyc_airbnb, aes(x = stars, fill = borough)) +
  geom_histogram() +
  facet_grid(~room_type)
# Mean `stars` (missing values ignored) for every borough / room-type pair,
# reshaped so each room type becomes its own column.
nyc_airbnb %>%
  group_by(borough, room_type) %>%
  summarise(mean_stars = mean(stars, na.rm = TRUE)) %>%
  spread(room_type, mean_stars)
## # A tibble: 5 x 4
## # Groups: borough [5]
## borough `Entire home/apt` `Private room` `Shared room`
## <chr> <dbl> <dbl> <dbl>
## 1 Bronx 4.59 4.50 4.35
## 2 Brooklyn 4.73 4.67 4.61
## 3 Manhattan 4.84 4.79 4.80
## 4 Queens 4.71 4.68 4.60
## 5 Staten Island 4.69 4.62 5
# Scatter of Manhattan listings priced under 1000 by longitude / latitude,
# coloured by price on the viridis scale, one panel per room type.
nyc_airbnb %>%
  filter(borough == "Manhattan" & price < 1000) %>%
  ggplot(aes(x = longitude, y = latitude, color = price)) +
  geom_point(alpha = 0.2) +
  facet_grid(~room_type) +
  coord_cartesian() +
  scale_color_viridis()
Entire homes are more expensive than other room types, and are more clustered in southern Manhattan.
# Manhattan neighbourhoods with the five highest median prices, most
# expensive first. top_n() keeps ties, so more than five rows are possible.
nyc_airbnb %>%
  filter(borough == "Manhattan") %>%
  select(borough, neighbourhood, price) %>%
  group_by(neighbourhood) %>%
  summarize(
    mean_price = mean(price, na.rm = TRUE),
    median_price = median(price, na.rm = TRUE)
  ) %>%
  top_n(n = 5, wt = median_price) %>%
  arrange(desc(median_price))
## # A tibble: 5 x 3
## neighbourhood mean_price median_price
## <chr> <dbl> <dbl>
## 1 Tribeca 413. 251
## 2 NoHo 265. 245
## 3 Battery Park City 295. 225
## 4 Flatiron District 318. 218
## 5 Midtown 278. 210
# Brooklyn neighbourhoods with the five highest median prices, most
# expensive first. top_n() keeps ties, so more than five rows are possible.
nyc_airbnb %>%
  filter(borough == "Brooklyn") %>%
  select(borough, neighbourhood, price) %>%
  group_by(neighbourhood) %>%
  summarize(
    mean_price = mean(price, na.rm = TRUE),
    median_price = median(price, na.rm = TRUE)
  ) %>%
  top_n(n = 5, wt = median_price) %>%
  arrange(desc(median_price))
## # A tibble: 6 x 3
## neighbourhood mean_price median_price
## <chr> <dbl> <dbl>
## 1 DUMBO 190. 162.
## 2 Vinegar Hill 164. 159
## 3 Boerum Hill 180. 150
## 4 Brooklyn Heights 195. 150
## 5 Carroll Gardens 181. 150
## 6 Downtown Brooklyn 162. 150
# Manhattan neighbourhoods with the five lowest median prices, cheapest
# first. A negative n makes top_n() select from the bottom; ties are kept,
# so more than five rows are possible.
nyc_airbnb %>%
  filter(borough == "Manhattan") %>%
  select(borough, neighbourhood, price) %>%
  group_by(neighbourhood) %>%
  summarize(
    mean_price = mean(price, na.rm = TRUE),
    median_price = median(price, na.rm = TRUE)
  ) %>%
  top_n(n = -5, wt = median_price) %>%
  arrange(median_price)
## # A tibble: 6 x 3
## neighbourhood mean_price median_price
## <chr> <dbl> <dbl>
## 1 Washington Heights 89.4 70
## 2 Inwood 88.3 75
## 3 Roosevelt Island 103. 79
## 4 Marble Hill 82.2 89
## 5 Harlem 120. 90
## 6 Morningside Heights 109. 90
# Brooklyn neighbourhoods with the five lowest median prices, cheapest
# first. A negative n makes top_n() select from the bottom; ties are kept,
# so more than five rows are possible.
nyc_airbnb %>%
  filter(borough == "Brooklyn") %>%
  select(borough, neighbourhood, price) %>%
  group_by(neighbourhood) %>%
  summarize(
    mean_price = mean(price, na.rm = TRUE),
    median_price = median(price, na.rm = TRUE)
  ) %>%
  top_n(n = -5, wt = median_price) %>%
  arrange(median_price)
## # A tibble: 6 x 3
## neighbourhood mean_price median_price
## <chr> <dbl> <dbl>
## 1 Gerritsen Beach 74 50
## 2 Borough Park 69.7 55
## 3 Gravesend 81.3 61
## 4 Cypress Hills 80.2 61.5
## 5 Brownsville 86.3 65
## 6 Bushwick 81.8 65