3 Database
In this section we are going to clean and store datasets into duckdb
database. We are going to filter datasets only to include these countries:
## [1] "Albania" "Armenia" "Austria" "Azerbaijan" "Belarus"
## [6] "Belgium" "Bosnia and Herzegovina" "Bulgaria" "Croatia" "Cyprus"
## [11] "Czechia" "Denmark" "Estonia" "Finland" "France"
## [16] "Georgia" "Germany" "Greece" "Hungary" "Iceland"
## [21] "Ireland" "Italy" "Kazakhstan" "Latvia" "Lithuania"
## [26] "Luxembourg" "Malta" "Republic of Moldova" "Montenegro" "Netherlands"
## [31] "North Macedonia" "Norway" "Poland" "Portugal" "Romania"
## [36] "Russian Federation" "Serbia" "Slovakia" "Slovenia" "Spain"
## [41] "Sweden" "Switzerland" "Turkey" "Ukraine" "United Kingdom"
First load necessary packages
library(fs)
library(dplyr)
library(vroom)
library(readxl)
3.1 Load datasets
3.1.1 Population dataset
Population dataset is in folder data/IHME_GBD_2019_POP_2010_2019_SYA/
. Let’s load all file locations
<- ("data/IHME_GBD_2019_POP_2010_2019_SYA/")
loc <- dir_ls(loc, glob = "*.CSV")
files files
Load all files and check available countries and years
Load all files and clean the datasets
<- NULL
POP for(i in seq_along(files)) {
<- vroom(files[i], id = "file_name")
population
<- population %>%
POP group_by(location_name, year_id, sex_name, age_group_name, age_group_id) %>%
summarise(total_population = sum(val, na.rm = T)) %>%
bind_rows(POP)
rm(population)
}
## create a new age_group variable by re-adjusting age_group_name variable to integer
unique(POP$age_group_name)
$age_group <- POP$age_group_name
POP$age_group[POP$age_group == "<1 year"] <- 0
POP$age_group[POP$age_group == "95 plus"] <- 95
POP$age_group <- as.integer(POP$age_group)
POP
## filter only europe countries
<- POP %>%
POP filter(location_name %in% europe)
## Change location_names to universal
## change original to this: "Bosnia and Herz."> "Bosnia and Herzegovina"
## change original to this: Czechia" > ("Czech Rep.") > "Czech Republic"
## change original to this: "Russian Federation" > "Russia"
## change original to this: "North Macedonia" > "Macedonia"
## change original to this: "Republic of Moldova" > "Moldova"
# unique(POP$location_name)
# POP[POP$location_name == "Bosnia and Herz.",] <- "Bosnia and Herzegovina"
$location_name[POP$location_name == "Czechia"] <- "Czech Republic"
POP$location_name[POP$location_name == "Russian Federation"] <- "Russia"
POP$location_name[POP$location_name == "North Macedonia"] <- "Macedonia"
POP$location_name[POP$location_name == "Republic of Moldova"] <- "Moldova" POP
Calculate Total population in 2019 of selected European countries
## Calculate Total population in 2019 of selected European countries
%>%
POP filter(sex_name == "both" & year_id == 2019) %>%
group_by(2019) %>%
summarise(pop = sum(total_population)) -> total_population_2019
total_population_2019
3.1.2 IHME dataset
Datasets are in folder data/ihmedata/
. Load all the file location into a object.
<- fs::dir_ls ("data/ihmedata/", recurse = T, glob = "*.csv")
files # files
Load all files and filter only selected countries
<- NULL
IHME for(i in seq_along(files)) {
<- IHME %>%
IHME rbind(vroom(files[i], id = "file_name")) %>%
filter(location_name %in% europe)
}
3.1.3 Calculator files
We have included a Excel file which holds data for the calculator. These data are taken from the article Ameni et al. (2019) Cost-of-illness study of Obstructive Sleep Apnea Syndrome (OSAS) in Italy
<- readxl::read_xlsx("files/osa_tables.xlsx", sheet = "causes")
tbl1 <- readxl::read_xlsx("files/osa_tables.xlsx", sheet = "causes_simple")
tbl4 <- readxl::read_xlsx("files/osa_tables.xlsx", sheet = "osa_armeni")
tbl2 <- readxl::read_xlsx("files/osa_tables.xlsx", sheet = "osa_benjafield") tbl3
3.2 Moving data to duckdb
In this section we move created datasets to duckdb database and trasform to parquet file.
Further reading:
Firstly install duckdb and arrow packages
install.packages("duckdb")
install.packages("arrow")
Create a connection and move datasets POP
and IHME
into duckdb.
library(duckdb)
= dbConnect(duckdb::duckdb(), dbdir="./data/sleep22.duckdb", read_only=FALSE)
con dbWriteTable(con, "IHME", IHME, overwrite=TRUE)
dbWriteTable(con, "POP", POP, overwrite=TRUE)
dbWriteTable(con, "causes", tbl1, overwrite=TRUE) # old slapnea
dbWriteTable(con, "osa_armeni", tbl2, overwrite=TRUE)
dbWriteTable(con, "osa_benjafield", tbl3, overwrite=TRUE)
dbWriteTable(con, "causes_simple", tbl4, overwrite=TRUE)
dbDisconnect(conn = con)
3.2.1 Transform to parquet
Transform whole database to parquet file system
= dbConnect(duckdb::duckdb(), dbdir="./data/sleep22.duckdb", read_only=FALSE)
con if(dir.exists("data/parquet/")) fs::dir_delete("data/parquet/")
if(!dir.exists("data/parquet/")) dir.create("data/parquet/")
dbSendQuery(con, "EXPORT DATABASE 'data/parquet/' (FORMAT PARQUET);")
dbDisconnect(conn = con)
#END