Data: tidy practise

2016/07/03

Tidy-Practise

# Tidy data practices

library(tidyverse)

# Remember these functions
# spread - data, key, value             (long -> wide)
# gather - data, key, value, columns    (wide -> long)
# 


# Religion data, make it tidy -------------------------------------------------------
# Load the Pew table. check.names = FALSE keeps the raw column headers
# (e.g. `<$10k`) instead of rewriting them into syntactic R names.
pew <- read.delim(
  "http://stat405.had.co.nz/data/pew.txt",
  check.names = FALSE,
  stringsAsFactors = FALSE,
  header = TRUE
)

# How many variables are there?
head(pew)
# Three: religion, income and frequency. Income is hidden in the column
# headers and frequency in the cell values.

# To tidy the table, melt (stack) it from wide to long: the income column
# headers go into `income`, the counts go into `freq`.
pew_tidy <- pew %>%
  gather(key = "income", value = "freq", `<$10k`:`Don't know/refused`)
head(pew_tidy)


# Weather data, make it tidy ------------------------------------------
# A more complicated example.

# stringsAsFactors = FALSE keeps `id` and `element` as character on every
# R version (same setting as the pew import above).
weather <- read.delim(
  file = "http://stat405.had.co.nz/data/weather.txt",
  stringsAsFactors = FALSE,
  strip.white = FALSE
)

# What does the dataset look like?
head(weather)
# This data has two problems:
# First, it has variables stored in rows, in the column 'element'.
# Second, it has a variable (day of month) spread across the column headers
# d1..d31.

# Step 1 - melt (wide -> long): the column names d1:d31 go into the variable
# 'day' and their cell contents into the variable 'value'.
weather_tidy <- gather(weather, key = "day", value = "value", d1:d31)

# Build a real Date from year, month and the numeric part of 'day'
# ("d17" -> "17"). The explicit format stops as.Date() from guessing the
# format from the first element; impossible calendar days (e.g. Feb 30)
# simply become NA instead of risking an error.
weather_tidy$date <- as.Date(
  paste(
    weather_tidy$year,
    weather_tidy$month,
    sub("^d", "", weather_tidy$day),
    sep = "-"
  ),
  format = "%Y-%m-%d"
)

# Keep only id, date, element and value (year, month and day are now
# redundant) and drop the rows that have no measurement.
weather_tidy <- weather_tidy[
  !is.na(weather_tidy$value),
  c("id", "date", "element", "value")
]

head(weather_tidy) # check data

# Step 2 - cast (long -> wide): 'element' actually holds two variables,
# TMAX and TMIN, so spread it out into one column per variable.
weather_tidy <- spread(weather_tidy, key = element, value = value)

head(weather_tidy) # done!



# Final words -----------------------------------------------------

# There are various features of messy data