# Notes from Hadley Wickham's useR! 2016 presentation.
library(dplyr) # tools: filter, slice, summarise, select, rename, arrange, sample_n
library(tidyr) # tools: gather, separate, spread
library(stringr)
library(readr)
library(ggplot2)
library(purrr)
library(modelr) # experimental
# Generate some data and visualize ----------------------------------------------------------------------------
# (this is the real model behind the phenomenon)
# Simulate noisy observations from the underlying linear process.
#
# x: numeric vector of input values.
# Returns a numeric vector the same length as `x`: the line 1 + 2 * x plus
# independent Gaussian noise with standard deviation 0.25.
true_model <- function(x) {
  signal <- 1 + 2 * x
  noise <- rnorm(length(x), sd = 0.25)
  signal + noise
}
# Example of what the model gives you for a single input
true_model(9)

# Make data of observations. This is what we are modelling.
# tibble() replaces the deprecated data_frame(); it also builds columns in
# order, so `y` can refer to the `x` defined on the line above it.
df <- tibble(
  # `length.out` is spelled out; `length` only worked via partial matching
  x = seq(0, 1, length.out = 20),
  y = true_model(x)
)

# Draw observations
df %>%
  ggplot(aes(x, y)) +
  geom_point()
# Fit an overly complicated model -----------------------------------------------------------------------------
# Fit a deliberately over-flexible model to a set of observations.
#
# df: data frame (or anything lm() accepts as `data`) with columns `x` and `y`.
# Returns the fitted `lm` object for `y` regressed on an 8th-degree
# polynomial in `x`.
my_model <- function(df) {
  lm(y ~ poly(x, 8), data = df)
}
# Fit the polynomial model to the full set of observations
mod <- my_model(df)

# Evaluate the fitted curve on a grid of 50 x values spanning the observed
# range; predictions go into a column named `y` so the line layer below can
# reuse the plot's aes(x, y) mapping.
grid <- tidyr::expand(df, x = seq_range(x, 50))
preds <- add_predictions(grid, mod, var = "y")

# Overlay the fitted curve on the raw observations
ggplot(df, aes(x, y)) +
  geom_line(data = preds) +
  geom_point()

# RMSE measured on the same data the model was fitted to
rmse(mod, df)
# Get a better estimate of error -----------------------------------------------------------------------------
# Generate 100 random train/test partitions of the observations
cv <- crossv_mc(df, 100)

# Fit the model on every training set and score it on the matching test set.
# The result has to be stored back in `cv` (it was previously assigned to a
# stray `cd`): every step below reads the `rmse` column from `cv`.
cv <- cv %>%
  mutate(
    mod = map(train, my_model),
    rmse = map2_dbl(mod, test, rmse)
  )
cv

# Distribution of the held-out RMSE across partitions; the reference line
# marks the RMSE of the full-data model on the data it was fitted to.
cv %>%
  ggplot(aes(rmse)) +
  geom_ref_line(v = rmse(mod, df)) +
  geom_freqpoly(binwidth = 0.2) +
  geom_rug()

# For the partitions with the largest errors, show which x values ended up
# in the training set. The resample objects are converted to data frames
# first and then unnested by column name, instead of passing an expression
# to unnest() (legacy interface).
filter(cv, rmse > 5) %>%
  mutate(train = map(train, as.data.frame)) %>%
  tidyr::unnest(train) %>%
  ggplot(aes(x, .id)) +
  geom_point() +
  xlim(0, 1)
# END
# List-columns visualisation
library(ggplot2)
library(tibble)
library(dplyr)
library(USAboundaries) # by Lincoln Mullen
# Boundaries as of 1 January 1820
c18 <- us_boundaries(as.Date("1820-01-01"))
# NOTE(review): the fortify() call below presumably expects an sp Spatial*
# object here -- confirm against the installed USAboundaries version.
class(c18)
# Extract the locations -------------------------------------------------------------------
# One row per polygon vertex. as_tibble() replaces the deprecated
# as_data_frame() alias.
borders <- c18 %>%
  fortify() %>%
  as_tibble()
borders

# Outline only: one polygon per `group`, no fill
ggplot(borders, aes(long, lat)) +
  geom_polygon(aes(group = group), colour = "grey50", fill = NA) +
  coord_quickmap()
# Extract the metadata ---------------------------------------------------------------------
# One row of attributes per territory. as_tibble() replaces the deprecated
# as_data_frame() alias.
metadata <- c18 %>%
  as_tibble() %>%
  # Build the join key: `id_num` shifted down by one, as character, to match
  # the `id` column of `borders` (presumably zero-based ids from fortify() --
  # TODO confirm).
  mutate(id = as.character(id_num - 1)) %>%
  select(id, name, terr_type, start_posix, end_posix)
metadata

# Attach the territory attributes to every vertex row
borders <- borders %>%
  left_join(metadata, by = "id")

# Fill each polygon by territory type
ggplot(borders, aes(long, lat)) +
  geom_polygon(aes(group = group, fill = terr_type)) +
  coord_quickmap()
# But this is quite inefficient: after the join, every vertex row carries a
# full copy of its territory's metadata.
borders
# Instead we could use list-columns ---------------------------------------------------------
# Collapse to one row per `id`, keeping the coordinates as list-columns, and
# attach the metadata once per territory.
borders %>%
  group_by(id) %>%
  summarise(
    long = list(long),
    lat = list(lat)
  ) %>%
  left_join(metadata, by = "id")
# END --------------------------------------------------------------------------------------