2 Data
In this section we are going to download and explore datasets before editing and storing data into duckdb. Data sets consists of the population and disease prevalances information. It can be downloaded as (several) CSV-files from the links below:
Download data manually and store them to data/ folder. There are many csv-files so loading those can be done easily with vroom and fs packages. Before transferring datasets into mighty duckdb database, we find out little bit more what these data sets holds.
2.1 Population
We are interested of the population by age and gender in European countries.
2.1.1 Load data into R
Load the vroom() library. Load one data set into IHME2010 object and use the id argument to add the file name to the data frame. Use file_name as the argument’s value. Load the file spec into a variable called spec, using the spec() command.
library(vroom)
IHME2010 <- vroom("data/IHME_GBD_2019_POP_2010_2019_SYA/IHME_GBD_2019_POP_SYA_2010_Y2021M01D28.CSV", id = "file_name")
spec <- spec(IHME2010)
spec2.1.2 Explore dataset
Explore dataset and it’s variables by summary()
summary(IHME2010)
rm(IHME2010) # Remove dataset from memory2.1.3 Load and modify multiple files
Load the fs and dplyr libraries. Load to a variable named files list files in the ./data/ folder using the dir_ls() function, use the glob argument to pass a wildcard to list CSV files only.
library(fs)
library(dplyr)
loc <- ("data/IHME_GBD_2019_POP_2010_2019_SYA/")
files <- dir_ls(loc, glob = "*.CSV")
filesGroup population by location_name, year_id, sex_name, age_group_name, file_name and get the total of val and the number of items. Name them total_population and items respectively. Name the new dataset countries. Define the countries variable as NULL prior to the for loop and add a bind_rows() step to countries to preserve each summarized view. Remove the population variable at the end of each cycle.
countries <- NULL
for(i in seq_along(files)) {
population <- vroom(files[i], id = "file_name")
countries <- population %>%
group_by(location_name, year_id, sex_name, age_group_name, age_group_id) %>%
summarise(total_population = sum(val, na.rm = T)) %>%
bind_rows(countries)
rm(population)
}Preview the countries dataset by checking how many countries are in the dataset and calculating total population by year
## How many countries/locations are included?
length(unique(countries$location_name))
## Calculate countries population by year
countries %>%
filter(sex_name == "both") %>%
group_by(location_name, year_id, sex_name) %>%
summarise(pop = sum(total_population)) -> countries_population
head(countries_population)2.1.4 Included European countries
Below is a list of 51 European countries. See which of these countries are NOT included in population dataset
europe <- c("Albania","Andorra","Armenia","Austria","Azerbaijan","Belarus","Belgium","Bosnia and Herzegovina","Bulgaria","Croatia","Cyprus","Czechia","Denmark","Estonia","Finland","France","Georgia","Germany","Greece","Hungary","Iceland","Ireland","Italy","Kazakhstan","Latvia","Lithuania","Luxembourg","Malta","Republic of Moldova","Monaco","Montenegro","Netherlands","North Macedonia","Norway","Poland","Portugal","Romania","Russian Federation","San Marino","Serbia","Slovakia","Slovenia","Spain","Sweden","Switzerland","Turkey","Ukraine","United Kingdom")Create new dataset which has only europe countries and compare which are not included
eucountries <- countries %>%
filter(location_name %in% europe)
## How many European countries are included in the dataset?
length(unique(eucountries$location_name))
## These countries are missing
europe[!europe %in% eucountries$location_name]2.1.5 Europe total population in 2019
Calculate Total population in 2019 of selected European countries
eucountries %>%
filter(sex_name == "both" & year_id == 2019 & location_name %in% europe) %>%
group_by(2019) %>%
summarise(pop = sum(total_population)) -> total_population_2019
total_population_20192.1.6 PLOT: Finland population 2019
Draw a plot of Finland population in 2019 by age and gender
## re-adjust age_group variable of the population for filtering
eucountries$age_group <- eucountries$age_group_name
eucountries$age_group[eucountries$age_group == "<1 year"] <- 0
eucountries$age_group[eucountries$age_group == "95 plus"] <- 95
eucountries$age_group <- as.integer(eucountries$age_group)
## plot age distribution
library(ggplot2)
ggplot(data = eucountries[eucountries$sex_name %in% c("female", "male") & eucountries$location_name == "Finland" & eucountries$year_id == 2019, ]) +
geom_bar(aes(x=age_group, y=total_population, fill=sex_name), stat = "identity") +
labs(x="age", y="freq", title="Finland population 2019")2.1.7 PLOT: European countries population
Filter and group dataset to get total population by years and countries
eucountries %>%
filter(sex_name == "both" & year_id == 2019) %>%
group_by(location_name, year_id) %>%
summarise(total_population = sum(total_population)) -> populations
populations$year_id <- as.integer(populations$year_id)Draw ordered population bar diagram of the populations
options(scipen = 999)
ggplot(data = populations) +
geom_bar(aes(x=reorder(location_name,total_population), y=total_population, group=year_id), position="dodge", stat="identity") +
coord_flip() +
labs(x="", y="population", title = "Countries total population in 2019")2.2 IHME dataset
Exploring IHME dataset which is publicly available at GBD webpage. Dataset has prevalences and incidences of different disease by countries, age group and sex. You can download full dataset by this link.
2.2.1 Load few datasets
Firstly load libraries
library(vroom)
library(fs)
library(dplyr)Downloaded data is stored in data/ihmedata/. List files under the data location by using fs-package and load few files
files <- fs::dir_ls ("data/ihmedata/", recurse = T, glob = "*.csv")
vroom(file = files, n_max = 1000, id = "file_name")Let’s find out what two datasets has using command spec() and summary()
d1 <- vroom(file = files[1])
spec(d1)
summary(d1)d2 <- vroom(file = files[2])
spec(d2)
summary(d2)Locations on each datasets
length(unique(d1$location_name))
length(unique(d2$location_name))Causes on each datasets
length(unique(d1$cause_name))
length(unique(d2$cause_name))d1 %>%
group_by(location_name, cause_name) %>%
summarise(n = n(),
val=sum(val)) %>%
arrange(n) %>%
tail(10)
d2 %>%
group_by(location_name, cause_name) %>%
summarise(n = n(),
val=sum(val)) %>%
arrange(n) %>%
tail(10)2.2.2 Load all datasets
Load all datasets to one object and filter only Europe countries
europe <- c("Albania","Andorra","Armenia","Austria","Azerbaijan","Belarus","Belgium","Bosnia and Herzegovina","Bulgaria","Croatia","Cyprus","Czechia","Denmark","Estonia","Finland","France","Georgia","Germany","Greece","Hungary","Iceland","Ireland","Italy","Kazakhstan","Kosovo","Latvia","Liechtenstein","Lithuania","Luxembourg","Malta","Moldova","Monaco","Montenegro","Netherlands","North Macedonia","Norway","Poland","Portugal","Romania","Russian Federation","San Marino","Serbia","Slovakia","Slovenia","Spain","Sweden","Switzerland","Turkey","Ukraine","United Kingdom","Vatican City")
ihme <- NULL
for(i in seq_along(files)) {
ihme <- ihme %>%
rbind(vroom(files[i], id = "file_name")) %>%
filter(location_name %in% europe)
}