So this week we have some data on San Francisco trees. This week I’m more interested in plotting some fancy maps than in anything else, so I researched a little bit how to achieve it. I found a package called leaflet. It allows you to plot interactive maps using coordinates and also add markers to it. The markers are fully customizable. The only thing I didn’t like is that I didn’t find good docs about it.

library(tidyverse)
knitr::opts_chunk$set(echo = TRUE, fig.align="center")
knitr::opts_chunk$set(fig.width=12, fig.height=8) 

sf_trees <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-28/sf_trees.csv')

glimpse(sf_trees)
## Observations: 192,987
## Variables: 12
## $ tree_id      <dbl> 53719, 30313, 30312, 30314, 30315, 30316, 48435, 30319...
## $ legal_status <chr> "Permitted Site", "Permitted Site", "Permitted Site", ...
## $ species      <chr> "Tree(s) ::", "Tree(s) ::", "Tree(s) ::", "Pittosporum...
## $ address      <chr> "2963 Webster St", "501 Arkansas St", "501 Arkansas St...
## $ site_order   <dbl> 1, 3, 2, 1, 5, 6, 4, 2, 1, 3, 1, 3, 1, 2, 4, 1, 1, 1, ...
## $ site_info    <chr> "Sidewalk: Curb side : Cutout", "Sidewalk: Curb side :...
## $ caretaker    <chr> "Private", "Private", "Private", "Private", "Private",...
## $ date         <date> 1955-09-19, 1955-10-20, 1955-10-20, 1955-10-20, 1955-...
## $ dbh          <dbl> NA, NA, NA, 16, NA, NA, NA, NA, NA, NA, 2, NA, NA, NA,...
## $ plot_size    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ latitude     <dbl> 37.79787, 37.75984, 37.75984, 37.75977, 37.79265, 37.7...
## $ longitude    <dbl> -122.4341, -122.3981, -122.3981, -122.3981, -122.4124,...
trees <- sf_trees %>%
  mutate_if(is.character, fct_explicit_na)

Basic data visualization and exploratory analysis

trees %>%
  count(species) %>%
  arrange(n)
## # A tibble: 571 x 2
##    species                                             n
##    <fct>                                           <int>
##  1 Acer campestre :: Hedge Maple                       1
##  2 Acer ginnela :: Amur Maple                          1
##  3 Acer paxii :: Evergreen Maple                       1
##  4 Acer platanoides 'Crimson King' :: Norway maple     1
##  5 Acer platanoides :: Norway Maple                    1
##  6 Acer saccharum :: Sugar Maple                       1
##  7 Acer tegmentosum :: Manchurian snakebark maple      1
##  8 Acer x 'Autumn Blaze' :: Hybrid Maple               1
##  9 Aesculus spp :: Horsechestnut                       1
## 10 Alnus rubra :: Red Alder                            1
## # ... with 561 more rows

We have a lot of species with less than 100 observations, may be worth to discard those samples. Let’s take a look at legal_status

trees %>%
  count(legal_status, sort = TRUE)
## # A tibble: 10 x 2
##    legal_status                      n
##    <fct>                         <int>
##  1 DPW Maintained               141725
##  2 Permitted Site                39732
##  3 Undocumented                   8106
##  4 Significant Tree               1648
##  5 Planning Code 138.1 required    971
##  6 Property Tree                   316
##  7 Section 143                     230
##  8 Private                         163
##  9 (Missing)                        54
## 10 Landmark tree                    42

Here we can see that it happens also with legal_status. I’m going to do some geom_col() plots for them and also the other variables, to get a better idea of the dataset.

trees %>%
  count(species, sort = TRUE) %>%
  head(20) %>%
  mutate(species = reorder(species, n)) %>%
  ggplot(aes(x = species, y = n)) + geom_col() + coord_flip()

trees %>%
  count(legal_status, sort = TRUE) %>%
  mutate(legal_status = reorder(legal_status, n)) %>%
  ggplot(aes(x = legal_status, y = n)) + geom_col() + coord_flip()

trees %>%
  count(site_info, sort = TRUE) %>%
  mutate(site_info = reorder(site_info, n)) %>%
  ggplot(aes(x = site_info, y = n)) + geom_col() + coord_flip()

trees %>%
  count(caretaker, sort = TRUE) %>%
  mutate(caretaker = reorder(caretaker, n)) %>%
  ggplot(aes(x = caretaker, y = n)) + geom_col() + coord_flip()

So visually we can see that there are some factors that it’s count is really low. For demonstration purposes, I’m going to use fct_lump to collapse a little bit the species column.

trees_lump <- trees %>% 
  mutate(species = fct_lump_min(species, 100))

trees_lump %>% 
  count(species, sort = TRUE)
## # A tibble: 156 x 2
##    species                                            n
##    <fct>                                          <int>
##  1 Tree(s) ::                                     11629
##  2 Platanus x hispanica :: Sycamore: London Plane 11557
##  3 Metrosideros excelsa :: New Zealand Xmas Tree   8744
##  4 Lophostemon confertus :: Brisbane Box           8581
##  5 Other                                           7842
##  6 Tristaniopsis laurina :: Swamp Myrtle           7197
##  7 Pittosporum undulatum :: Victorian Box          7122
##  8 Prunus cerasifera :: Cherry Plum                6716
##  9 Magnolia grandiflora :: Southern Magnolia       6285
## 10 Arbutus 'Marina' :: Hybrid Strawberry Tree      5702
## # ... with 146 more rows
trees_lump %>%
  filter(species == "Other")
## # A tibble: 7,842 x 12
##    tree_id legal_status species address site_order site_info caretaker
##      <dbl> <fct>        <fct>   <fct>        <dbl> <fct>     <fct>    
##  1   30350 DPW Maintai~ Other   440 Ar~          1 Sidewalk~ Private  
##  2   53216 DPW Maintai~ Other   828 Ar~          1 Sidewalk~ Private  
##  3   53249 DPW Maintai~ Other   348 Te~          1 Sidewalk~ Private  
##  4   53309 Permitted S~ Other   1775 1~          1 Sidewalk~ Private  
##  5   53302 Permitted S~ Other   38 Lev~          1 Sidewalk~ Private  
##  6   53426 Permitted S~ Other   26 Par~          1 Sidewalk~ Private  
##  7   53496 DPW Maintai~ Other   37 Coo~          1 Sidewalk~ Private  
##  8   53927 Permitted S~ Other   616-61~          1 Sidewalk~ Private  
##  9   53947 DPW Maintai~ Other   100 Ga~          1 Sidewalk~ Private  
## 10   54018 DPW Maintai~ Other   850 35~          1 Sidewalk~ Private  
## # ... with 7,832 more rows, and 5 more variables: date <date>, dbh <dbl>,
## #   plot_size <fct>, latitude <dbl>, longitude <dbl>

I’m interested in check wheter the caretaker is private or not, so I’m going to create a new column to check it, and later make a model with tidymodels.

trees_proc <- trees_lump %>%
  mutate(isPrivate = case_when(
    as.character(caretaker) == "Private" ~ as.character(caretaker),
            TRUE ~ "Other")) %>%
  filter(longitude > -125) %>%
  select(-caretaker)

trees_proc %>%
  count(isPrivate, sort = TRUE)
## # A tibble: 2 x 2
##   isPrivate      n
##   <chr>      <int>
## 1 Private   160061
## 2 Other      29964

Cool! Let’s end the visualization making a plot with the lat and long For the sake of complexity and visualization, I’m only going to plot 2000 samples in the interactive plot, and 10000 in the static one.

library(leaflet)

set.seed(555)
pal <- colorFactor(c("green", "navy"), domain = c("Other", "Private"))
trees_proc %>%
  sample_n(2000) %>%
  leaflet() %>%
  addTiles() %>%
  addCircleMarkers(
    radius = 0.5,
    color = ~pal(isPrivate)
  ) %>%
  addLegend("bottomright", pal = pal, values = ~isPrivate,
             title = "Caretaker type")
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
trees_proc %>%
  sample_n(10000) %>%
  ggplot(aes(x=latitude, y = longitude, color = isPrivate)) + geom_point(alpha = 0.2) + theme_minimal()

Time to model things! I’m going to downsample isPrivate, because we are working with an unbalanced dataset. Also I’m gonna do a little bit more of preprocessing, recipes makes it really easy. In Julia Silge’s screencast I saw how she used the doParallel library, and after checking the package out I can’t figure why I didn’t use it until now. The training may take a while…

library(tidymodels)
library(lubridate)

trees_proc <- trees_proc %>%
  mutate(site_order = as_factor(site_order)) %>%
  mutate(site_order = fct_explicit_na(site_order, na_level = "-1"))

trees_split <- initial_split(trees_proc)
trees_train <- training(trees_split)
trees_test <- testing(trees_split)

trees_recipe <- trees_train %>%
  recipe(isPrivate ~ .) %>%
  update_role(address, new_role = "loc") %>%
  update_role(tree_id, new_role = "id") %>%
  step_rm(date, plot_size, dbh) %>%
  step_downsample(isPrivate) %>%
  step_dummy(all_predictors(), -all_outcomes(), -all_numeric()) %>%
  prep()
  

trees_train <- juice(trees_recipe)
trees_testing <- trees_recipe %>%
  bake(trees_test)

library(doParallel)
doParallel::registerDoParallel()
trees_ranger <- rand_forest(trees = 100, mode = "classification") %>%
  set_engine("ranger", importance = "permutation") %>%
  fit(isPrivate ~ ., data = trees_train)

predict(trees_ranger, trees_testing)
## # A tibble: 47,506 x 1
##    .pred_class
##    <fct>      
##  1 Private    
##  2 Private    
##  3 Private    
##  4 Private    
##  5 Private    
##  6 Private    
##  7 Private    
##  8 Private    
##  9 Private    
## 10 Private    
## # ... with 47,496 more rows
trees_ranger %>%
  predict(trees_testing) %>%
  bind_cols(trees_testing) %>%
  metrics(truth = isPrivate, estimate = .pred_class)
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.942
## 2 kap      binary         0.798
trees_ranger %>%
  predict(trees_testing) %>%
  count(.pred_class)
## # A tibble: 2 x 2
##   .pred_class     n
##   <fct>       <int>
## 1 Other        9014
## 2 Private     38492