Developing Survival Function

2026/02/03

Currently I’m working with three overlapping projects

  1. Creating analysis results for Dementia and Hip Fracture connection

  2. Creating a survival function which can handle input cases

  3. Scaling this analyse to Shiny application

This memo is for myself problems which encountered and a possible solutions.

Survival Function

library(tidyr)
library(dplyr)
library(lubridate)
library(cmprsk)
library(cuminc)
library(survminer)
library(ggcompetingrisks)

survival_analysis <- function(exposure_diagnoses,
                              response_diagnoses,
                              dpop,
                              start = c("DATE_EXPOSURE", "DATE_RESPONSE", "DATE_50"),
                              # DATE_EXPOSURE
                              # DATE_RESPONSE
                              # DATE_50
                              censoring_date = as.Date("2024-12-21"),
                              pre_entry_handling = c("truncate", "skip", "asis")
                              # truncate: diagnosis before entry is assigned to entry date
                              # skip: diagnosis before entry is ignored; first post-entry diagnosis is used
                              # asis: diagnosis before entry is used as it is
) {
  # type <- match.arg(type)
  internal_function <- function() {
    .safe_inc_progress(1/3)
    ## ESIMERKKI CASE
    if(FALSE){
      start = "DATE_50"
      censoring_date = as.Date("2023-12-01")
      pre_entry_handling = "truncate" #c("truncate", "skip", "asis")
    }
    colors_groups <- c(
      "non-exposure" = "#5BC0DE",
      "exposure"     = "#D9534F",
      "non-response" = "#F0AD4E",
      "response"     = "#5CB85C",
      "dead"         = "#292B2C"
    )
    
    ## PHASE 0 - MUODOSTETAAN PVM DATA
    ### Aineisto pvm per ID
    if(TRUE){
      ## Populaation oleelliset pvm-muuttujat
      d0 <- dpop |>
        dplyr::mutate(
          DATE_BIRTH = as.Date(DATE_BIRTH),
          # DATE_DEATH = as.Date(DATE_DEATH),
          DATE_DEATH = as.Date(ifelse(DATE_DEATH > censoring_date, NA, DATE_DEATH), origin = "1970-01-01"),
          # DATE_MIGRATION = as.Date(DATE_MIGRATION),
          DATE_MIGRATION = as.Date(ifelse(DATE_MIGRATION > censoring_date, NA, DATE_MIGRATION), origin = "1970-01-01"),
          DATE_50 = DATE_BIRTH + 50 * 365.25 ## TODO this should be in dpop, and named as DATE_START
        ) |>
        select(ID, DATE_BIRTH, DATE_DEATH, DATE_MIGRATION, DATE_50)
      
      ## EXPOSURE DIAGNOSES
      d1 <- exposure_diagnoses |>
        dplyr::filter(DATE <= censoring_date) |>
        dplyr::left_join(d0, by = "ID") |>
        dplyr::select(ID, DATE, DATE_50) |>
        dplyr::rename(DATE_EXPOSURE = DATE) |>
        dplyr::mutate(DATE_EXPOSURE_ORIGINAL = DATE_EXPOSURE)
      ### pre_entry_handling c("truncate", "skip", "asis")
      if (pre_entry_handling == "truncate") {
        d1 <- d1 |> mutate(
          DATE_EXPOSURE = as.Date(ifelse(DATE_EXPOSURE < DATE_50, DATE_50, DATE_EXPOSURE), origin = "1970-01-01")
        )
      }
      if (pre_entry_handling == "skip") {
        d1 <- dplyr::filter(d1, DATE_EXPOSURE >= DATE_50)
      }
      ### Otetaan vain ensimmäisen exposure tapauksen DATE per id
      d1 <- d1 |>
        arrange(DATE_EXPOSURE) |>
        group_by(ID) |>
        summarise(DATE_EXPOSURE = dplyr::first(DATE_EXPOSURE),
                  DATE_EXPOSURE_ORIGINAL = dplyr::first(DATE_EXPOSURE_ORIGINAL))
      
      ## RESPONSE_DIAGNOSES
      d2 <- response_diagnoses |>
        dplyr::filter(DATE <= censoring_date) |>
        dplyr::left_join(d0, by = "ID") |>
        dplyr::select(ID, DATE, DATE_50) |>
        dplyr::rename(DATE_RESPONSE = DATE) |>
        dplyr::mutate(DATE_RESPONSE_ORIGINAL = DATE_RESPONSE)
      ### pre_entry_handling c("truncate", "skip", "asis")
      if (pre_entry_handling == "truncate") {
        d2 <- d2 |> mutate(
          DATE_RESPONSE_ORIGINAL = DATE_RESPONSE, # TODO this should be carried along
          DATE_RESPONSE = as.Date(ifelse(DATE_RESPONSE < DATE_50, DATE_50, DATE_RESPONSE) , origin = "1970-01-01")
        )
      }
      if (pre_entry_handling == "skip") {
        d2 <- dplyr::filter(d2, DATE_RESPONSE >= DATE_50)
      }
      ### Otetaan vain ensimmäisen response tapauksen DATE per id
      d2 <- d2 |>
        arrange(DATE_RESPONSE) |>
        group_by(ID) |>
        summarise(DATE_RESPONSE = dplyr::first(DATE_RESPONSE),
                  DATE_RESPONSE_ORIGINAL = dplyr::first(DATE_RESPONSE_ORIGINAL))
      
      
      
      ## FINAL (ensimmäiset DATE_EXPOSURE, DATE_RESPONSE ja DATE_CENSOR -tapaukset)
      df <- d0 |>
        dplyr::left_join(d1, by = "ID") |>
        dplyr::left_join(d2, by = "ID") |>
        filter(is.na(DATE_DEATH) | DATE_DEATH > DATE_50) |> ## tiputetaan kuolleet ennen seurantaa pois
        dplyr::mutate(
          DATE_CENSOR = pmin(DATE_MIGRATION, censoring_date, na.rm = TRUE)
        )
      rm(list = c("d0", "d1", "d2"))
      
      # df %>% filter(DATE_DEATH < DATE_50)
      # df$DATE_EXPOSURE <- as.Date(df$DATE_EXPOSURE, origin = "1970-01-01")
      # df$DATE_RESPONSE <- as.Date(df$DATE_RESPONSE, origin = "1970-01-01")
    }
    
    ## DEBUG
    if(FALSE){
      ## Katsotaan kuinka monta tapausta joissa expsure ennen response
      df %>% filter(DATE_EXPOSURE <= DATE_RESPONSE) %>% count()
      ## pysyy oikein
    }
    
    
    # Phase1 - calculate times (years from starting point)
    if(TRUE){
      dsurv_uusi <- df |>
        dplyr::mutate(
          ## tämä on joko DATE_EXPOSURE, DATE_RESPONSE tai DATE_50
          apvm =  case_when(start == "DATE_EXPOSURE" ~ DATE_EXPOSURE,
                            start == "DATE_RESPONSE" ~ DATE_RESPONSE,
                            start == "DATE_50" ~ DATE_50,
                            TRUE ~ NA_Date_),
          ## sensurointi
          epvm = DATE_CENSOR,
          ## lasketaan ajat
          time_exposure = trunc((apvm%--% DATE_EXPOSURE) / days(1)),
          # time_exposure = trunc(lubridate::time_length(apvm %--% DATE_EXPOSURE, "years")),
          time_response = trunc((apvm %--% DATE_RESPONSE) / days(1)),
          # time_response = trunc(lubridate::time_length(apvm %--% DATE_RESPONSE, "years")),
          time_dead = ifelse(!is.na(DATE_DEATH),
                             trunc((apvm %--% DATE_DEATH) / days(1)), NA),
          # trunc(lubridate::time_length(apvm %--% DATE_DEATH, "years")), NA),
          time_censoring = trunc((apvm %--% epvm) / days(1))
          # time_censoring = trunc(lubridate::time_length(apvm %--% epvm, "years"))
        )
      ## to long format
      dsurv_uusi2 <- dsurv_uusi |>
        tidyr::pivot_longer(cols = c(time_exposure, time_response, time_dead, time_censoring)) |>
        dplyr::filter(!is.na(value)) |>
        select(ID, name, value)
    }
    
    ## DEBUG
    if(FALSE){
      ## Katsotaan kuinka monta tapausta exp<=resp
      dsurv_uusi2 %>% filter(name == "time_response") %>% count()
      dsurv_uusi2 %>% filter(name == "time_response" & value >=0) %>% count()
      ## tässä muuttuu tapausten määrä
      ## Tarkastellaan, miten on alkuperäiset DATE määreet
      temp <- dsurv_uusi2 %>% filter(name == "time_response" & value >=0) %>% left_join(df, by = "ID")
      temp <- temp %>% mutate(flag = ifelse(DATE_RESPONSE < DATE_EXPOSURE, 1, 0))
      ## Tämä johtuu siitä että time pyöristää valmiiksi!
      ## TODO lasketaan ajat päivinä. Loppuvaiheessa voidaan muuttaa vuosiksi
    }
    
    
    ## Phase3 - create event coding
    if(TRUE){
      dsurv_uusi2 <- dsurv_uusi2 |>
        dplyr::mutate(
          event = case_when(
            ## Start == DATE_EXPOSURE
            start == "DATE_EXPOSURE" ~ case_when(
              name == "time_response"  ~ 1L,
              name == "time_dead"      ~ 2L,
              name == "time_censoring" ~ 3L,
              TRUE ~ NA_integer_
            ),
            ## Start == DATE_RESPONSE
            start == "DATE_RESPONSE" ~ case_when(
              name == "time_exposure"  ~ 1L,
              name == "time_dead"  ~ 2L,
              name == "time_censoring" ~ 3L,
              TRUE ~ NA_integer_
            ),
            # start == "DATE_50"
            start == "DATE_50" ~ case_when(
              name == "time_exposure"  ~ 1L,
              name == "time_response"  ~ 2L,
              name == "time_dead"      ~ 3L,
              name == "time_censoring" ~ 4L,
              TRUE ~ NA_integer_
            ),
            TRUE ~ NA_integer_
          )
        )
    }
    
    ## Limiting results only from starting point
    dsurv_uusi2 <- dsurv_uusi2 |> filter(value >=0)
    
    ## PHASE 5 - CR and plot
    if(TRUE){
      ## Otetaan vain ensimmäinen tapaus
      dsurv_cr <- dsurv_uusi2 |>
        filter(!is.na(event)) |> ## Tyhjät tapausrivit pois
        dplyr::group_by(ID) |>
        dplyr::slice_min(value, with_ties = FALSE) |>
        dplyr::ungroup()
      
      ## Aikamääre päivä ja vuosi
      dsurv_cr <- dsurv_cr %>%
        mutate(
          days = value,
          years = value / 365.25
        )
      
      
      ## Sensuroinnin koodaus (sensurointi joko 3 tai 4)
      if(start == "DATE_EXPOSURE" | start == "DATE_RESPONSE") {
        CR_days <- cmprsk::cuminc(
          ftime   = dsurv_cr$days,
          fstatus = dsurv_cr$event,
          cencode = 3
        )
        CR_years <- cmprsk::cuminc(
          ftime   = dsurv_cr$years,
          fstatus = dsurv_cr$event,
          cencode = 3
        )
      }
      if(start == "DATE_50") {
        CR_days <- cmprsk::cuminc(
          ftime   = dsurv_cr$value,
          fstatus = dsurv_cr$event,
          cencode = 4
        )
        CR_years <- cmprsk::cuminc(
          ftime   = dsurv_cr$years,
          fstatus = dsurv_cr$event,
          cencode = 4
        )
      }
      
      
      ## Lisätään värit ja nimet
      ### exposure eteenpäin tapaukset Response, Death
      if(start == "DATE_EXPOSURE"){
        ## Piirretään cumulative incidence ratio kuvaaja
        p_days <- survminer::ggcompetingrisks(
          fit = CR_days,
          multiple_panels = FALSE,
          # conf.int = TRUE,
          xlab = "Time after Exposure (days)",
          ylab = "Cumulative incidence",
          title = "Competing Risk Model: Exposure to Response/Death",
          subtitle = paste0("pre_entry_handling=", pre_entry_handling)
        ) +
          ggplot2::scale_color_manual(
            values = c("1" = colors_groups[["response"]],
                       "2" = colors_groups[["dead"]]),
            labels = c("1" = "Response",
                       "2" = "Death"),
            name = "Event"
          )
        ## Piirretään cumulative incidence ratio kuvaaja
        p_years <- survminer::ggcompetingrisks(
          fit = CR_years,
          multiple_panels = FALSE,
          xlab = "Time after Exposure (years)",
          ylab = "Cumulative incidence",
          title = "Competing Risk Model: Exposure to Response/Death",
          subtitle = paste0("pre_entry_handling=", pre_entry_handling)
        ) +
          ggplot2::scale_color_manual(
            values = c("1" = colors_groups[["response"]],
                       "2" = colors_groups[["dead"]]),
            labels = c("1" = "Response",
                       "2" = "Death"),
            name = "Event"
          )
      }
      ### Response eteenpäin tapaukset exposure, Death
      if(start == "DATE_RESPONSE"){
        ## Piirretään cumulative incidence ratio kuvaaja
        p_days <- survminer::ggcompetingrisks(
          fit = CR_days,
          multiple_panels = FALSE,
          xlab = "Time after Response (days)",
          ylab = "Cumulative incidence",
          title = "Competing Risk Model: Response to Exposure/Death",
          subtitle = paste0("pre_entry_handling=", pre_entry_handling)
        )  +
          scale_color_manual(
            values = c("1" = colors_groups[["exposure"]],
                       "2" = colors_groups[["dead"]]),
            labels = c("1" = "Exposure",
                       "2" = "Death"),
            name = "Event"
          )
        ## Piirretään cumulative incidence ratio kuvaaja
        p_years <- survminer::ggcompetingrisks(
          fit = CR_years,
          multiple_panels = FALSE,
          xlab = "Time after Response (years)",
          ylab = "Cumulative incidence",
          title = "Competing Risk Model: Response to Exposure/Death",
          subtitle = paste0("pre_entry_handling=", pre_entry_handling)
        )  +
          scale_color_manual(
            values = c("1" = colors_groups[["exposure"]],
                       "2" = colors_groups[["dead"]]),
            labels = c("1" = "Exposure",
                       "2" = "Death"),
            name = "Event"
          )
      }
      ### 50v eteenpäin Exposure, Response tai Death
      if(start == "DATE_50"){
        ## Piirretään cumulative incidence ratio kuvaaja
        p_days <- survminer::ggcompetingrisks(
          fit = CR_days,
          multiple_panels = FALSE,
          xlab = "Time after follow up start (50 years old, days)",
          ylab = "Cumulative incidence",
          title = "Competing Risk Model: From Follow up to Exposure/Response/Death",
          subtitle = paste0("pre_entry_handling=", pre_entry_handling)
        )  +
          ggplot2::scale_color_manual(
            values = c("1" = colors_groups[["exposure"]],
                       "2" = colors_groups[["response"]],
                       "3" = colors_groups[["dead"]]),
            labels = c("1" = "Exposure",
                       "2" = "Response",
                       "3" = "Death"),
            name = "Event"
          )
        ## Piirretään cumulative incidence ratio kuvaaja
        p_years <- survminer::ggcompetingrisks(
          fit = CR_years,
          multiple_panels = FALSE,
          xlab = "Time after follow up start (50 years old, years)",
          ylab = "Cumulative incidence",
          title = "Competing Risk Model: From Follow up to Exposure/Response/Death",
          subtitle = paste0("pre_entry_handling=", pre_entry_handling)
        )  +
          ggplot2::scale_color_manual(
            values = c("1" = colors_groups[["exposure"]],
                       "2" = colors_groups[["response"]],
                       "3" = colors_groups[["dead"]]),
            labels = c("1" = "Exposure",
                       "2" = "Response",
                       "3" = "Death"),
            name = "Event"
          )
      }
    }
    ## Kootaan kaikki tulokset listaan, joka palautetaan
    d <- list(plot_days = p_days,
              plot_years = p_years,
              CR_days = CR_days,
              CR_years = CR_years,
              dsurv = dsurv_cr
    )
    .safe_inc_progress(3/3)
    return(d)
  }
  
  # Run with or without shiny progress
  if (shiny::isRunning()) {
    withProgress(message = "Creating Survival Analysis", value = 0, {
      return(internal_function())
    })
  } else {
    return(internal_function())
  }
}

This function has now something to be tested