2 Data
In this section we are going to download and explore datasets before editing and storing data into duckdb
. Data sets consists of the population and disease prevalances information. It can be downloaded as (several) CSV-files from the links below:
Download data manually and store them to data/
folder. There are many csv-files so loading those can be done easily with vroom
and fs
packages. Before transferring datasets into mighty duckdb
database, we find out little bit more what these data sets holds.
2.1 Population
We are interested of the population by age and gender in European countries.
2.1.1 Load data into R
Load the vroom()
library. Load one data set into IHME2010
object and use the id
argument to add the file name to the data frame. Use file_name as the argument’s value. Load the file spec into a variable called spec
, using the spec()
command.
library(vroom)
<- vroom("data/IHME_GBD_2019_POP_2010_2019_SYA/IHME_GBD_2019_POP_SYA_2010_Y2021M01D28.CSV", id = "file_name")
IHME2010 <- spec(IHME2010)
spec spec
2.1.2 Explore dataset
Explore dataset and it’s variables by summary()
summary(IHME2010)
rm(IHME2010) # Remove dataset from memory
2.1.3 Load and modify multiple files
Load the fs
and dplyr
libraries. Load to a variable named files
list files in the ./data/ folder using the dir_ls()
function, use the glob
argument to pass a wildcard to list CSV files only.
library(fs)
library(dplyr)
<- ("data/IHME_GBD_2019_POP_2010_2019_SYA/")
loc <- dir_ls(loc, glob = "*.CSV")
files files
Group population
by location_name, year_id, sex_name, age_group_name, file_name
and get the total of val
and the number of items. Name them total_population
and items
respectively. Name the new dataset countries
. Define the countries
variable as NULL
prior to the for loop and add a bind_rows()
step to countries
to preserve each summarized view. Remove the population
variable at the end of each cycle.
<- NULL
countries for(i in seq_along(files)) {
<- vroom(files[i], id = "file_name")
population
<- population %>%
countries group_by(location_name, year_id, sex_name, age_group_name, age_group_id) %>%
summarise(total_population = sum(val, na.rm = T)) %>%
bind_rows(countries)
rm(population)
}
Preview the countries
dataset by checking how many countries are in the dataset and calculating total population by year
## How many countries/locations are included?
length(unique(countries$location_name))
## Calculate countries population by year
%>%
countries filter(sex_name == "both") %>%
group_by(location_name, year_id, sex_name) %>%
summarise(pop = sum(total_population)) -> countries_population
head(countries_population)
2.1.4 Included European countries
Below is a list of 51 European countries. See which of these countries are NOT included in population dataset
<- c("Albania","Andorra","Armenia","Austria","Azerbaijan","Belarus","Belgium","Bosnia and Herzegovina","Bulgaria","Croatia","Cyprus","Czechia","Denmark","Estonia","Finland","France","Georgia","Germany","Greece","Hungary","Iceland","Ireland","Italy","Kazakhstan","Latvia","Lithuania","Luxembourg","Malta","Republic of Moldova","Monaco","Montenegro","Netherlands","North Macedonia","Norway","Poland","Portugal","Romania","Russian Federation","San Marino","Serbia","Slovakia","Slovenia","Spain","Sweden","Switzerland","Turkey","Ukraine","United Kingdom") europe
Create new dataset which has only europe countries and compare which are not included
<- countries %>%
eucountries filter(location_name %in% europe)
## How many European countries are included in the dataset?
length(unique(eucountries$location_name))
## These countries are missing
!europe %in% eucountries$location_name] europe[
2.1.5 Europe total population in 2019
Calculate Total population in 2019 of selected European countries
%>%
eucountries filter(sex_name == "both" & year_id == 2019 & location_name %in% europe) %>%
group_by(2019) %>%
summarise(pop = sum(total_population)) -> total_population_2019
total_population_2019
2.1.6 PLOT: Finland population 2019
Draw a plot of Finland population in 2019 by age and gender
## re-adjust age_group variable of the population for filtering
$age_group <- eucountries$age_group_name
eucountries$age_group[eucountries$age_group == "<1 year"] <- 0
eucountries$age_group[eucountries$age_group == "95 plus"] <- 95
eucountries$age_group <- as.integer(eucountries$age_group)
eucountries
## plot age distribution
library(ggplot2)
ggplot(data = eucountries[eucountries$sex_name %in% c("female", "male") & eucountries$location_name == "Finland" & eucountries$year_id == 2019, ]) +
geom_bar(aes(x=age_group, y=total_population, fill=sex_name), stat = "identity") +
labs(x="age", y="freq", title="Finland population 2019")
2.1.7 PLOT: European countries population
Filter and group dataset to get total population by years and countries
%>%
eucountries filter(sex_name == "both" & year_id == 2019) %>%
group_by(location_name, year_id) %>%
summarise(total_population = sum(total_population)) -> populations
$year_id <- as.integer(populations$year_id) populations
Draw ordered population bar diagram of the populations
options(scipen = 999)
ggplot(data = populations) +
geom_bar(aes(x=reorder(location_name,total_population), y=total_population, group=year_id), position="dodge", stat="identity") +
coord_flip() +
labs(x="", y="population", title = "Countries total population in 2019")
2.2 IHME dataset
Exploring IHME dataset which is publicly available at GBD webpage. Dataset has prevalences and incidences of different disease by countries, age group and sex. You can download full dataset by this link.
2.2.1 Load few datasets
Firstly load libraries
library(vroom)
library(fs)
library(dplyr)
Downloaded data is stored in data/ihmedata/
. List files under the data location by using fs
-package and load few files
<- fs::dir_ls ("data/ihmedata/", recurse = T, glob = "*.csv")
files vroom(file = files, n_max = 1000, id = "file_name")
Let’s find out what two datasets has using command spec()
and summary()
<- vroom(file = files[1])
d1 spec(d1)
summary(d1)
<- vroom(file = files[2])
d2 spec(d2)
summary(d2)
Locations on each datasets
length(unique(d1$location_name))
length(unique(d2$location_name))
Causes on each datasets
length(unique(d1$cause_name))
length(unique(d2$cause_name))
%>%
d1 group_by(location_name, cause_name) %>%
summarise(n = n(),
val=sum(val)) %>%
arrange(n) %>%
tail(10)
%>%
d2 group_by(location_name, cause_name) %>%
summarise(n = n(),
val=sum(val)) %>%
arrange(n) %>%
tail(10)
2.2.2 Load all datasets
Load all datasets to one object and filter only Europe countries
<- c("Albania","Andorra","Armenia","Austria","Azerbaijan","Belarus","Belgium","Bosnia and Herzegovina","Bulgaria","Croatia","Cyprus","Czechia","Denmark","Estonia","Finland","France","Georgia","Germany","Greece","Hungary","Iceland","Ireland","Italy","Kazakhstan","Kosovo","Latvia","Liechtenstein","Lithuania","Luxembourg","Malta","Moldova","Monaco","Montenegro","Netherlands","North Macedonia","Norway","Poland","Portugal","Romania","Russian Federation","San Marino","Serbia","Slovakia","Slovenia","Spain","Sweden","Switzerland","Turkey","Ukraine","United Kingdom","Vatican City")
europe
<- NULL
ihme for(i in seq_along(files)) {
<- ihme %>%
ihme rbind(vroom(files[i], id = "file_name")) %>%
filter(location_name %in% europe)
}