3 Database

In this section we clean the datasets and store them in a DuckDB database. The datasets are filtered to include only the following countries:

##  [1] "Albania"                "Armenia"                "Austria"                "Azerbaijan"             "Belarus"               
##  [6] "Belgium"                "Bosnia and Herzegovina" "Bulgaria"               "Croatia"                "Cyprus"                
## [11] "Czechia"                "Denmark"                "Estonia"                "Finland"                "France"                
## [16] "Georgia"                "Germany"                "Greece"                 "Hungary"                "Iceland"               
## [21] "Ireland"                "Italy"                  "Kazakhstan"             "Latvia"                 "Lithuania"             
## [26] "Luxembourg"             "Malta"                  "Republic of Moldova"    "Montenegro"             "Netherlands"           
## [31] "North Macedonia"        "Norway"                 "Poland"                 "Portugal"               "Romania"               
## [36] "Russian Federation"     "Serbia"                 "Slovakia"               "Slovenia"               "Spain"                 
## [41] "Sweden"                 "Switzerland"            "Turkey"                 "Ukraine"                "United Kingdom"

First, load the necessary packages:

library(fs)
library(dplyr)
library(vroom)
library(readxl)
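
The country filter used throughout this section relies on a character vector named europe (the one printed above). Its definition is not shown in the chunks below, but it could look like this:

## selected countries (assumed definition of the `europe` vector)
europe <- c("Albania", "Armenia", "Austria", "Azerbaijan", "Belarus",
            "Belgium", "Bosnia and Herzegovina", "Bulgaria", "Croatia", "Cyprus",
            "Czechia", "Denmark", "Estonia", "Finland", "France",
            "Georgia", "Germany", "Greece", "Hungary", "Iceland",
            "Ireland", "Italy", "Kazakhstan", "Latvia", "Lithuania",
            "Luxembourg", "Malta", "Republic of Moldova", "Montenegro", "Netherlands",
            "North Macedonia", "Norway", "Poland", "Portugal", "Romania",
            "Russian Federation", "Serbia", "Slovakia", "Slovenia", "Spain",
            "Sweden", "Switzerland", "Turkey", "Ukraine", "United Kingdom")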

3.1 Load datasets

3.1.1 Population dataset

The population dataset is in the folder data/IHME_GBD_2019_POP_2010_2019_SYA/. Let's list all the file locations:

loc <- "data/IHME_GBD_2019_POP_2010_2019_SYA/"
files <- dir_ls(loc, glob = "*.CSV")
files

Load the files and check the available countries and years.
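A minimal sketch of such a check, peeking at the first file only and assuming it has the location_name and year_id columns used below:

## quick look at the countries and years available in the first file
peek <- vroom(files[1], id = "file_name")
sort(unique(peek$location_name))
sort(unique(peek$year_id))
rm(peek)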

Then load all the files and clean the datasets:

POP <- NULL
for(i in seq_along(files)) {
  ## read one population file at a time to keep memory use low
  population <- vroom(files[i], id = "file_name")
  
  ## aggregate to one row per country, year, sex and age group, then append
  POP <- population %>% 
    group_by(location_name, year_id, sex_name, age_group_name, age_group_id) %>% 
    summarise(total_population = sum(val, na.rm = TRUE), .groups = "drop") %>% 
    bind_rows(POP)
  
  rm(population)
}

## create a new integer age_group variable from age_group_name
unique(POP$age_group_name)
POP$age_group <- POP$age_group_name
POP$age_group[POP$age_group == "<1 year"] <- 0   # infants coded as age 0
POP$age_group[POP$age_group == "95 plus"] <- 95  # open-ended top group coded as 95
POP$age_group <- as.integer(POP$age_group)

## keep only the selected European countries
POP <- POP %>% 
  filter(location_name %in% europe) 

## Recode location names to a common naming convention:
##   "Czechia"             -> "Czech Republic"
##   "Russian Federation"  -> "Russia"
##   "North Macedonia"     -> "Macedonia"
##   "Republic of Moldova" -> "Moldova"
## ("Bosnia and Herzegovina" already matches, so it needs no recoding)
# unique(POP$location_name)
POP$location_name[POP$location_name == "Czechia"] <- "Czech Republic"
POP$location_name[POP$location_name == "Russian Federation"] <- "Russia"
POP$location_name[POP$location_name == "North Macedonia"] <- "Macedonia"
POP$location_name[POP$location_name == "Republic of Moldova"] <- "Moldova"

Calculate the total population of the selected European countries in 2019:

## Calculate the total population in 2019 of the selected European countries
POP %>% 
  filter(sex_name == "both" & year_id == 2019) %>% 
  group_by(year_id) %>%
  summarise(pop = sum(total_population)) -> total_population_2019
total_population_2019

3.1.2 IHME dataset

The datasets are in the folder data/ihmedata/. Load all the file locations into an object:

files <- fs::dir_ls("data/ihmedata/", recurse = TRUE, glob = "*.csv")
# files

Load all the files and keep only the selected countries:

IHME <- NULL
for(i in seq_along(files)) {
  ## read one file, append it to the rows collected so far,
  ## and keep only the selected countries
  IHME <- IHME %>% 
    rbind(vroom(files[i], id = "file_name")) %>% 
    filter(location_name %in% europe)
}
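
As a side note, vroom can read a vector of file paths in one call, so the same result could be achieved without an explicit loop (a sketch, assuming the files share a common column layout):

## alternative: read all files at once, then filter
IHME <- vroom(files, id = "file_name") %>% 
  filter(location_name %in% europe)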

3.1.3 Calculator files

We have included an Excel file which holds the data for the calculator. The data are taken from the article by Armeni et al. (2019), Cost-of-illness study of Obstructive Sleep Apnea Syndrome (OSAS) in Italy.

tbl1 <- readxl::read_xlsx("files/osa_tables.xlsx", sheet = "causes")
tbl4 <- readxl::read_xlsx("files/osa_tables.xlsx", sheet = "causes_simple")
tbl2 <- readxl::read_xlsx("files/osa_tables.xlsx", sheet = "osa_armeni")
tbl3 <- readxl::read_xlsx("files/osa_tables.xlsx", sheet = "osa_benjafield")
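
To confirm that the workbook contains the expected sheets before reading them, readxl can list them:

readxl::excel_sheets("files/osa_tables.xlsx")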

3.2 Moving data to duckdb

In this section we move the created datasets into a DuckDB database and then transform the database into Parquet files.


First, install the duckdb and arrow packages:

install.packages("duckdb")
install.packages("arrow")

Create a connection and write the datasets POP and IHME, along with the calculator tables, into DuckDB:

library(duckdb)
con = dbConnect(duckdb::duckdb(), dbdir="./data/sleep22.duckdb", read_only=FALSE)
dbWriteTable(con, "IHME", IHME, overwrite=TRUE)
dbWriteTable(con, "POP", POP, overwrite=TRUE)
dbWriteTable(con, "causes", tbl1, overwrite=TRUE) # old slapnea
dbWriteTable(con, "osa_armeni", tbl2, overwrite=TRUE)
dbWriteTable(con, "osa_benjafield", tbl3, overwrite=TRUE)
dbWriteTable(con, "causes_simple", tbl4, overwrite=TRUE)
dbDisconnect(conn = con)
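
To verify that the tables landed in the database, the connection can be reopened read-only and the tables listed (a quick sanity check, not part of the original pipeline):

con = dbConnect(duckdb::duckdb(), dbdir="./data/sleep22.duckdb", read_only=TRUE)
dbListTables(con)                                # should list IHME, POP and the calculator tables
dbGetQuery(con, "SELECT count(*) AS n FROM POP") # row count of the population table
dbDisconnect(conn = con)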

3.2.1 Transform to parquet

Export the whole database to a Parquet-based file layout:

con = dbConnect(duckdb::duckdb(), dbdir="./data/sleep22.duckdb", read_only=FALSE)
if(dir.exists("data/parquet/")) fs::dir_delete("data/parquet/")
if(!dir.exists("data/parquet/")) dir.create("data/parquet/")
dbExecute(con, "EXPORT DATABASE 'data/parquet/' (FORMAT PARQUET);")  ## dbExecute runs the statement and releases the result
dbDisconnect(conn = con)
#END
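
The exported folder can then be read back with arrow. A sketch, assuming EXPORT DATABASE wrote one Parquet file per table (e.g. data/parquet/POP.parquet; check the folder for the exact file names):

library(arrow)
pop_parquet <- read_parquet("data/parquet/POP.parquet")
head(pop_parquet)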