3 Database

In this section we clean the datasets and store them in a DuckDB database. The datasets are filtered to include only the following countries:

##  [1] "Albania"                "Armenia"                "Austria"                "Azerbaijan"             "Belarus"               
##  [6] "Belgium"                "Bosnia and Herzegovina" "Bulgaria"               "Croatia"                "Cyprus"                
## [11] "Czechia"                "Denmark"                "Estonia"                "Finland"                "France"                
## [16] "Georgia"                "Germany"                "Greece"                 "Hungary"                "Iceland"               
## [21] "Ireland"                "Italy"                  "Kazakhstan"             "Latvia"                 "Lithuania"             
## [26] "Luxembourg"             "Malta"                  "Republic of Moldova"    "Montenegro"             "Netherlands"           
## [31] "North Macedonia"        "Norway"                 "Poland"                 "Portugal"               "Romania"               
## [36] "Russian Federation"     "Serbia"                 "Slovakia"               "Slovenia"               "Spain"                 
## [41] "Sweden"                 "Switzerland"            "Turkey"                 "Ukraine"                "United Kingdom"

First, load the necessary packages:

library(fs)
library(dplyr)
library(vroom)
library(readxl)
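
The country filter used throughout this section relies on a character vector named europe (the one printed above). Its definition is not shown in the chunks below, but it could look like this:

## selected countries (assumed definition of the `europe` vector)
europe <- c("Albania", "Armenia", "Austria", "Azerbaijan", "Belarus",
            "Belgium", "Bosnia and Herzegovina", "Bulgaria", "Croatia", "Cyprus",
            "Czechia", "Denmark", "Estonia", "Finland", "France",
            "Georgia", "Germany", "Greece", "Hungary", "Iceland",
            "Ireland", "Italy", "Kazakhstan", "Latvia", "Lithuania",
            "Luxembourg", "Malta", "Republic of Moldova", "Montenegro", "Netherlands",
            "North Macedonia", "Norway", "Poland", "Portugal", "Romania",
            "Russian Federation", "Serbia", "Slovakia", "Slovenia", "Spain",
            "Sweden", "Switzerland", "Turkey", "Ukraine", "United Kingdom")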

3.1 Load datasets

3.1.1 Population dataset

The population dataset is in the folder data/IHME_GBD_2019_POP_2010_2019_SYA/. Let's list all the file locations:

loc <- "data/IHME_GBD_2019_POP_2010_2019_SYA/"
files <- dir_ls(loc, glob = "*.CSV")
files

Load the files and check the available countries and years.
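A minimal sketch of such a check, peeking at the first file only and assuming it has the location_name and year_id columns used below:

## quick look at the countries and years available in the first file
peek <- vroom(files[1], id = "file_name")
sort(unique(peek$location_name))
sort(unique(peek$year_id))
rm(peek)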

Then load all the files and clean the datasets:

POP <- NULL
for(i in seq_along(files)) {
  ## read one population file at a time to keep memory use low
  population <- vroom(files[i], id = "file_name")
  
  ## aggregate to one row per country, year, sex and age group, then append
  POP <- population %>% 
    group_by(location_name, year_id, sex_name, age_group_name, age_group_id) %>% 
    summarise(total_population = sum(val, na.rm = TRUE), .groups = "drop") %>% 
    bind_rows(POP)
  
  rm(population)
}

## create a new integer age_group variable from age_group_name
unique(POP$age_group_name)
POP$age_group <- POP$age_group_name
POP$age_group[POP$age_group == "<1 year"] <- 0   # infants coded as age 0
POP$age_group[POP$age_group == "95 plus"] <- 95  # open-ended top group coded as 95
POP$age_group <- as.integer(POP$age_group)

## keep only the selected European countries
POP <- POP %>% 
  filter(location_name %in% europe) 

## Recode location names to a common naming convention:
##   "Czechia"             -> "Czech Republic"
##   "Russian Federation"  -> "Russia"
##   "North Macedonia"     -> "Macedonia"
##   "Republic of Moldova" -> "Moldova"
## ("Bosnia and Herzegovina" already matches, so it needs no recoding)
# unique(POP$location_name)
POP$location_name[POP$location_name == "Czechia"] <- "Czech Republic"
POP$location_name[POP$location_name == "Russian Federation"] <- "Russia"
POP$location_name[POP$location_name == "North Macedonia"] <- "Macedonia"
POP$location_name[POP$location_name == "Republic of Moldova"] <- "Moldova"

Calculate the total population of the selected European countries in 2019:

## Calculate the total population in 2019 of the selected European countries
POP %>% 
  filter(sex_name == "both" & year_id == 2019) %>% 
  group_by(year_id) %>%
  summarise(pop = sum(total_population)) -> total_population_2019
total_population_2019

3.1.2 IHME dataset

The datasets are in the folder data/ihmedata/. Load all the file locations into an object:

files <- fs::dir_ls("data/ihmedata/", recurse = TRUE, glob = "*.csv")
# files

Load all the files and keep only the selected countries:

IHME <- NULL
for(i in seq_along(files)) {
  ## read one file, append it to the rows collected so far,
  ## and keep only the selected countries
  IHME <- IHME %>% 
    rbind(vroom(files[i], id = "file_name")) %>% 
    filter(location_name %in% europe)
}
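
As a side note, vroom can read a vector of file paths in one call, so the same result could be achieved without an explicit loop (a sketch, assuming the files share a common column layout):

## alternative: read all files at once, then filter
IHME <- vroom(files, id = "file_name") %>% 
  filter(location_name %in% europe)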

3.1.3 Calculator files

We have included an Excel file which holds the data for the calculator. The data are taken from the article by Armeni et al. (2019), Cost-of-illness study of Obstructive Sleep Apnea Syndrome (OSAS) in Italy.

tbl1 <- readxl::read_xlsx("files/osa_tables.xlsx", sheet = "causes")
tbl4 <- readxl::read_xlsx("files/osa_tables.xlsx", sheet = "causes_simple")
tbl2 <- readxl::read_xlsx("files/osa_tables.xlsx", sheet = "osa_armeni")
tbl3 <- readxl::read_xlsx("files/osa_tables.xlsx", sheet = "osa_benjafield")
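
To confirm that the workbook contains the expected sheets before reading them, readxl can list them:

readxl::excel_sheets("files/osa_tables.xlsx")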

3.2 Moving data to duckdb

In this section we move the created datasets into a DuckDB database and then transform the database into Parquet files.


First, install the duckdb and arrow packages:

install.packages("duckdb")
install.packages("arrow")

Create a connection and write the datasets POP and IHME, along with the calculator tables, into DuckDB:

library(duckdb)
con = dbConnect(duckdb::duckdb(), dbdir="./data/sleep22.duckdb", read_only=FALSE)
dbWriteTable(con, "IHME", IHME, overwrite=TRUE)
dbWriteTable(con, "POP", POP, overwrite=TRUE)
dbWriteTable(con, "causes", tbl1, overwrite=TRUE) # old slapnea
dbWriteTable(con, "osa_armeni", tbl2, overwrite=TRUE)
dbWriteTable(con, "osa_benjafield", tbl3, overwrite=TRUE)
dbWriteTable(con, "causes_simple", tbl4, overwrite=TRUE)
dbDisconnect(conn = con)
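
To verify that the tables landed in the database, the connection can be reopened read-only and the tables listed (a quick sanity check, not part of the original pipeline):

con = dbConnect(duckdb::duckdb(), dbdir="./data/sleep22.duckdb", read_only=TRUE)
dbListTables(con)                                # should list IHME, POP and the calculator tables
dbGetQuery(con, "SELECT count(*) AS n FROM POP") # row count of the population table
dbDisconnect(conn = con)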

3.2.1 Transform to parquet

Export the whole database to a Parquet-based file layout:

con = dbConnect(duckdb::duckdb(), dbdir="./data/sleep22.duckdb", read_only=FALSE)
if(dir.exists("data/parquet/")) fs::dir_delete("data/parquet/")
if(!dir.exists("data/parquet/")) dir.create("data/parquet/")
dbExecute(con, "EXPORT DATABASE 'data/parquet/' (FORMAT PARQUET);")  ## dbExecute runs the statement and releases the result
dbDisconnect(conn = con)
#END
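
The exported folder can then be read back with arrow. A sketch, assuming EXPORT DATABASE wrote one Parquet file per table (e.g. data/parquet/POP.parquet; check the folder for the exact file names):

library(arrow)
pop_parquet <- read_parquet("data/parquet/POP.parquet")
head(pop_parquet)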