2 Data

In this section we are going to download and explore datasets before editing and storing data into duckdb. Data sets consists of the population and disease prevalances information. It can be downloaded as (several) CSV-files from the links below:

Download data manually and store them to data/ folder. There are many csv-files so loading those can be done easily with vroom and fs packages. Before transferring datasets into mighty duckdb database, we find out little bit more what these data sets holds.

2.1 Population

We are interested of the population by age and gender in European countries.

2.1.1 Load data into R

Load the vroom() library. Load one data set into IHME2010 object and use the id argument to add the file name to the data frame. Use file_name as the argument’s value. Load the file spec into a variable called spec, using the spec() command.

library(vroom)
IHME2010 <- vroom("data/IHME_GBD_2019_POP_2010_2019_SYA/IHME_GBD_2019_POP_SYA_2010_Y2021M01D28.CSV", id = "file_name")
spec <- spec(IHME2010)
spec

2.1.2 Explore dataset

Explore dataset and it’s variables by summary()

summary(IHME2010)
rm(IHME2010) # Remove dataset from memory

2.1.3 Load and modify multiple files

Load the fs and dplyr libraries. Load to a variable named files list files in the ./data/ folder using the dir_ls() function, use the glob argument to pass a wildcard to list CSV files only.

library(fs)
library(dplyr)
loc <- ("data/IHME_GBD_2019_POP_2010_2019_SYA/")
files <- dir_ls(loc, glob = "*.CSV")
files

Group population by location_name, year_id, sex_name, age_group_name, file_name and get the total of val and the number of items. Name them total_population and items respectively. Name the new dataset countries. Define the countries variable as NULL prior to the for loop and add a bind_rows() step to countries to preserve each summarized view. Remove the population variable at the end of each cycle.

countries <- NULL
for(i in seq_along(files)) {
  population <- vroom(files[i], id = "file_name")
  
  countries <- population %>% 
    group_by(location_name, year_id, sex_name, age_group_name, age_group_id) %>% 
    summarise(total_population = sum(val, na.rm = T)) %>% 
    bind_rows(countries)
  
  rm(population)
}

Preview the countries dataset by checking how many countries are in the dataset and calculating total population by year

## How many countries/locations are included?    
length(unique(countries$location_name))

## Calculate countries population by year 
countries %>% 
  filter(sex_name == "both") %>% 
  group_by(location_name, year_id, sex_name) %>% 
  summarise(pop = sum(total_population)) -> countries_population

head(countries_population)

2.1.4 Included European countries

Below is a list of 51 European countries. See which of these countries are NOT included in population dataset

europe <- c("Albania","Andorra","Armenia","Austria","Azerbaijan","Belarus","Belgium","Bosnia and Herzegovina","Bulgaria","Croatia","Cyprus","Czechia","Denmark","Estonia","Finland","France","Georgia","Germany","Greece","Hungary","Iceland","Ireland","Italy","Kazakhstan","Latvia","Lithuania","Luxembourg","Malta","Republic of Moldova","Monaco","Montenegro","Netherlands","North Macedonia","Norway","Poland","Portugal","Romania","Russian Federation","San Marino","Serbia","Slovakia","Slovenia","Spain","Sweden","Switzerland","Turkey","Ukraine","United Kingdom")

Create new dataset which has only europe countries and compare which are not included

eucountries <- countries %>% 
  filter(location_name %in% europe)

## How many European countries are included in the dataset?
length(unique(eucountries$location_name))

## These countries are missing
europe[!europe %in% eucountries$location_name]

2.1.5 Europe total population in 2019

Calculate Total population in 2019 of selected European countries

eucountries %>% 
  filter(sex_name == "both" & year_id == 2019 & location_name %in% europe) %>% 
  group_by(2019) %>%
  summarise(pop = sum(total_population)) -> total_population_2019

total_population_2019

2.1.6 PLOT: Finland population 2019

Draw a plot of Finland population in 2019 by age and gender

## re-adjust age_group variable of the population for filtering
eucountries$age_group <- eucountries$age_group_name
eucountries$age_group[eucountries$age_group == "<1 year"] <- 0
eucountries$age_group[eucountries$age_group == "95 plus"] <- 95
eucountries$age_group <- as.integer(eucountries$age_group)

## plot age distribution
library(ggplot2)
ggplot(data = eucountries[eucountries$sex_name %in% c("female", "male") & eucountries$location_name == "Finland" & eucountries$year_id == 2019, ]) +
  geom_bar(aes(x=age_group, y=total_population, fill=sex_name), stat = "identity") + 
  labs(x="age", y="freq", title="Finland population 2019")

2.1.7 PLOT: European countries population

Filter and group dataset to get total population by years and countries

eucountries %>% 
  filter(sex_name == "both" & year_id == 2019) %>% 
  group_by(location_name, year_id) %>% 
  summarise(total_population = sum(total_population)) -> populations
populations$year_id <- as.integer(populations$year_id)

Draw ordered population bar diagram of the populations

options(scipen = 999)
ggplot(data = populations) +
  geom_bar(aes(x=reorder(location_name,total_population), y=total_population, group=year_id), position="dodge", stat="identity") +
  coord_flip() +
  labs(x="", y="population", title = "Countries total population in 2019")

2.2 IHME dataset

Exploring IHME dataset which is publicly available at GBD webpage. Dataset has prevalences and incidences of different disease by countries, age group and sex. You can download full dataset by this link.

2.2.1 Load few datasets

Firstly load libraries

library(vroom)
library(fs)
library(dplyr)

Downloaded data is stored in data/ihmedata/. List files under the data location by using fs-package and load few files

files <- fs::dir_ls ("data/ihmedata/", recurse = T, glob = "*.csv")
vroom(file = files, n_max = 1000, id = "file_name")

Let’s find out what two datasets has using command spec() and summary()

d1 <- vroom(file = files[1])
spec(d1)
summary(d1)
d2 <- vroom(file = files[2])
spec(d2)
summary(d2)

Locations on each datasets

length(unique(d1$location_name))
length(unique(d2$location_name))

Causes on each datasets

length(unique(d1$cause_name))
length(unique(d2$cause_name))
d1 %>% 
  group_by(location_name, cause_name) %>% 
  summarise(n = n(),
            val=sum(val)) %>% 
  arrange(n) %>% 
  tail(10)

d2 %>% 
  group_by(location_name, cause_name) %>% 
  summarise(n = n(),
            val=sum(val)) %>% 
  arrange(n) %>% 
  tail(10)

2.2.2 Load all datasets

Load all datasets to one object and filter only Europe countries

europe <- c("Albania","Andorra","Armenia","Austria","Azerbaijan","Belarus","Belgium","Bosnia and Herzegovina","Bulgaria","Croatia","Cyprus","Czechia","Denmark","Estonia","Finland","France","Georgia","Germany","Greece","Hungary","Iceland","Ireland","Italy","Kazakhstan","Kosovo","Latvia","Liechtenstein","Lithuania","Luxembourg","Malta","Moldova","Monaco","Montenegro","Netherlands","North Macedonia","Norway","Poland","Portugal","Romania","Russian Federation","San Marino","Serbia","Slovakia","Slovenia","Spain","Sweden","Switzerland","Turkey","Ukraine","United Kingdom","Vatican City")

ihme <- NULL
for(i in seq_along(files)) {
  ihme <- ihme %>% 
    rbind(vroom(files[i], id = "file_name")) %>% 
    filter(location_name %in% europe)
}

2.2.3 Summary

Get basic info of the dataset

summary(ihme)