UseR conference Wickham notes

2016/08/01

Notes from Hadley Wickham's useR! 2016 presentation.

library(dplyr)    # tools: filter, slice, summarise, select, rename, arrange, sample_n
library(tidyr)    # tools: gather, separate, spread
library(stringr)
library(readr)
library(ggplot2)

library(purrr)
library(modelr) # experimental



# Generate some data and visualize ----------------------------------------------------------------------------

# (this is the real model behind the phenomenon)
true_model <- function(x) {
  # Deterministic part of the process: a straight line with intercept 1
  # and slope 2.
  signal <- 1 + 2 * x
  # One independent Gaussian error (sd = 0.25) per element of x.
  noise <- rnorm(length(x), sd = 0.25)
  signal + noise
}

# Example: a single noisy draw from the true model at x = 9
true_model(9)

# Simulate the observations we are going to model: 20 evenly spaced x values
# on [0, 1], each paired with one noisy draw from the true model.
# tibble() replaces the deprecated data_frame(), and `length.out` is spelled
# out instead of relying on partial matching of `length`.
df <- tibble(
  x = seq(0, 1, length.out = 20),
  y = true_model(x)
)

# Scatter plot of the simulated observations
df %>%
  ggplot(aes(x, y)) +
  geom_point()


# Fit an overly complicated model -----------------------------------------------------------------------------

my_model <- function(df) {
  # Deliberately over-flexible: regress y on a degree-8 polynomial in x.
  # `df` must provide columns named `x` and `y`.
  fit <- lm(y ~ poly(x, 8), data = df)
  fit
}
# Fit the polynomial model to the full set of observations
mod <- my_model(df)

# 50-point grid over the observed range of x, with the model's predictions
# stored in a column named "y" so the line layer can reuse the plot's aes(x, y)
grid <- tidyr::expand(df, x = seq_range(x, 50))
preds <- add_predictions(grid, mod, var = "y")

# Observed points with the fitted curve drawn underneath
ggplot(df, aes(x, y)) +
  geom_line(data = preds) +
  geom_point()

# RMSE of the model on the same data it was fitted to
rmse(mod, df)


# Get a better estimate of error -----------------------------------------------------------------------------

# Build 100 train/test partitions of df with modelr::crossv_mc()
cv <- crossv_mc(df, 100)

# Refit the model on every training set and score it on the matching test set.
# The result is assigned back to `cv` (it was previously stored in `cd`, which
# left `cv` without the `mod` and `rmse` columns that the code below reads).
# Inside mutate(), the `rmse` on the right-hand side is still the modelr
# function, because the `rmse` column does not exist until this call finishes.
cv <- cv %>%
  mutate(
    mod = map(train, my_model),
    rmse = map2_dbl(mod, test, rmse)
  )
cv

# Distribution of the per-split test RMSEs. The vertical reference line marks
# the RMSE of the full-data fit evaluated on its own training data.
ggplot(cv, aes(rmse)) +
  geom_ref_line(v = rmse(mod, df)) +
  geom_freqpoly(binwidth = 0.2) +
  geom_rug()

# Inspect the training sets behind the worst splits (test RMSE above 5):
# one row of points per split, showing which x values were used for training.
# The resample objects are converted to data frames with mutate() first and
# then unnested by column name; passing an expression straight to unnest()
# is the old tidyr interface.
cv %>%
  filter(rmse > 5) %>%
  mutate(train = map(train, as.data.frame)) %>%
  tidyr::unnest(train) %>%
  ggplot(aes(x, .id)) +
  geom_point() +
  xlim(0, 1)



# END

List-columns visualisation

library(ggplot2)
library(tibble)
library(dplyr)
library(USAboundaries) # by Lincoln Mullen

# Boundaries as of 1820-01-01, from the USAboundaries package
c18 <- us_boundaries(as.Date("1820-01-01"))
class(c18)

# Extract the locations -------------------------------------------------------------------
# Flatten the boundary object into one row per polygon vertex.
# as_tibble() replaces the deprecated as_data_frame().
# NOTE(review): fortify() on spatial objects is reportedly deprecated in recent
# ggplot2 releases — confirm against the installed version.
borders <- c18 %>% fortify() %>% as_tibble()
borders

# Outline-only map: one unfilled polygon per `group`
ggplot(borders, aes(long, lat)) +
  geom_polygon(aes(group = group), colour = "grey50", fill = NA) +
  coord_quickmap()

# Extract the metadata ---------------------------------------------------------------------

# One row of attributes per boundary. as_tibble() replaces the deprecated
# as_data_frame().
# `id` is built as a character version of `id_num - 1` so that it can serve as
# the join key against `borders$id`.
# NOTE(review): presumably the fortified ids are zero-based — verify.
metadata <- c18 %>%
  as_tibble() %>%
  mutate(id = as.character(id_num - 1)) %>%
  select(id, name, terr_type, start_posix, end_posix)
metadata

# Attach the per-boundary metadata to every vertex row
borders <- left_join(borders, metadata, by = "id")

# Same map as before, now filled by territory type
ggplot(data = borders, mapping = aes(long, lat)) +
  geom_polygon(mapping = aes(group = group, fill = terr_type)) +
  coord_quickmap()

# But this is quite inefficient: the metadata columns are repeated on every
# vertex row of a boundary.
borders

# Instead we could use list-columns ---------------------------------------------------------

# Collapse to one row per boundary id, keeping all of its coordinates in the
# list-columns `long` and `lat`, then attach the metadata once per boundary.
summarise(
  group_by(borders, id),
  long = list(long),
  lat = list(lat)
) %>%
  left_join(metadata, by = "id")

# END --------------------------------------------------------------------------------------