0. Setup and data preparation

Authors

Francois Mercier

Daniel Sabanés Bové

Published

2025-06-25

Setup

First we need to load the necessary packages and set some default options for the MCMC sampling. We also set the theme for the plots to theme_bw with a base size of 12.

Show the code
library(bayesplot)
library(brms)
library(ggplot2)
library(gt)
library(here)
library(janitor)
library(jmpost)
library(modelr)
library(posterior)
library(readxl)
library(rstan)
library(tidybayes)
library(tidyverse)
library(truncnorm)
library(fuzzyjoin)
library(sn)
library(glue)

if (require(cmdstanr)) {
    # If cmdstanr is available, instruct brms to use cmdstanr as backend
    # and cache all Stan binaries
    options(
        brms.backend = "cmdstanr",
        cmdstanr_write_stan_file_dir = here("_brms-cache")
    )
    dir.create(here("_brms-cache"), FALSE) # create cache directory if not yet available
} else {
    rstan::rstan_options(auto_write = TRUE)
}

# MCMC options
options(mc.cores = 4)
ITER <- 1000 # number of sampling iterations after warm up
WARMUP <- 2000 # number of warm up iterations
CHAINS <- 4
BAYES.SEED <- 878
REFRESH <- 500

theme_set(theme_bw(base_size = 12))

We also need a small function definition, which is still missing in brms:

Show the code
int_step <- function(x) {
    stopifnot(is.logical(x))
    ifelse(x, 1, 0)
}

Data preparation

We prepare now the same dataset as before in the OS session.

We will again use the OAK study data.

For the tumor growth data, we are using the S1 data set from here. For simplicity, we have copied the data set in this GitHub repository.

Show the code
tumor_data <- here("data/journal.pcbi.1009822.s006.xlsx") |> 
    read_excel(sheet = "Study4") |> 
    clean_names() |> 
    mutate(
        id = factor(as.character(patient_anonmyized)),
        day = as.integer(treatment_day),
        year = day / 365.25,
        target_lesion_long_diam_mm = case_match(
            target_lesion_long_diam_mm,
            "TOO SMALL TO MEASURE" ~ "2",
            "NOT EVALUABLE" ~ NA_character_,
            .default = target_lesion_long_diam_mm
        ),
        sld = as.numeric(target_lesion_long_diam_mm),
        sld = ifelse(sld == 0, 2, sld),
        study = factor(gsub("^Study_(\\d+)_Arm_\\d+$", "\\1", study_arm)),
        arm = factor(gsub("^Study_\\d+_Arm_(\\d+)$", "\\1", study_arm)),
        arm = fct_recode(arm, "Docetaxel" = "1", "MPDL3280A" = "2")
    ) |> 
    select(id, year, sld, arm)

head(tumor_data)
# A tibble: 6 × 4
  id                       year   sld arm      
  <fct>                   <dbl> <dbl> <fct>    
1 -594245265511412736  -0.00821  42   Docetaxel
2 -594245265511412736   0.0986   44   Docetaxel
3 -594245265511412736   0.222    44   Docetaxel
4 -4078394139718744064 -0.0110   65.8 Docetaxel
5 -4078394139718744064  0.832    66   Docetaxel
6 -4078394139718744064  1.07     68   Docetaxel
Show the code
summary(tumor_data)
                    id            year              sld        
 5394902984416364544 :  16   Min.   :-0.1068   Min.   :  2.00  
 7160320731596789760 :  16   1st Qu.: 0.1123   1st Qu.: 19.05  
 -4159496062492130816:  15   Median : 0.2519   Median : 32.00  
 -7900338178541499392:  15   Mean   : 0.3995   Mean   : 38.16  
 4878279915140903936 :  15   3rd Qu.: 0.5749   3rd Qu.: 51.00  
 9202154259772598272 :  15   Max.   : 2.0753   Max.   :228.00  
 (Other)             :4034                     NA's   :27      
        arm      
 Docetaxel:1658  
 MPDL3280A:2468  
                 
                 
                 
                 
                 

Here we have 701 patients, and we know from Fig. 1 in the publication that this is the subset of patients with at least 3 tumor size measurements. We have guessed from the second Excel sheet with parameter estimates that arm 1 means docetaxel and arm 2 means MPDL3280A. This also seems reasonable given the larger number of measurements per patient:

Show the code
# make a table
tumor_data |> 
    group_by(id) |> 
    summarize(arm = arm[1], n = n())  |> 
    group_by(arm) |> 
    summarize(mean_n = mean(n))
# A tibble: 2 × 2
  arm       mean_n
  <fct>      <dbl>
1 Docetaxel   5.10
2 MPDL3280A   6.56

For the OS data, we will be using the Supplementary Table 8 from here. Again, we have copied the data set in this GitHub repository.

From the supplementary material, we know that:

  • PFS/OS: time in months
  • PFS.CNSR/OS.CNSR: 1 for censor, 0 for event

Therefore we will convert the time to years and the censoring to a logical variable:

Show the code
os_data <- here("data/41591_2018_134_MOESM3_ESM.xlsx") |> 
    read_excel(sheet = "OAK_Clinical_Data") |> 
    clean_names() |> 
    rename(
        age = bage,
        race = race2,
        ecog = ecoggr,
        arm = trt01p,
        response = bcor
    ) |> 
    mutate(
        id = factor(as.character(pt_id)),
        sld = as.numeric(na_if(bl_sld, ".")),
        pfs_time = as.numeric(pfs) / 12,
        pfs_event = as.numeric(pfs_cnsr) == 0,
        os_time = as.numeric(os) / 12,
        os_event = as.numeric(os_cnsr) == 0,
        race = factor(race),
        ecog = factor(ecog),
        arm = factor(arm),
        response = factor(
          response, 
          levels = c("CR", "PR", "SD", "PD", "NE")
        ),
        sex = factor(sex)
    ) |> 
    select(
      id,
      arm,
      ecog,
      age,
      race,
      sex,
      sld,
      response,
      pfs_time,
      pfs_event,
      os_time,
      os_event
    )

head(os_data)
# A tibble: 6 × 12
  id    arm    ecog    age race  sex     sld response pfs_time pfs_event os_time
  <fct> <fct>  <fct> <dbl> <fct> <fct> <dbl> <fct>       <dbl> <lgl>       <dbl>
1 318   Docet… 1        64 WHITE M     152   PD          0.107 TRUE        0.433
2 1088  Docet… 0        65 WHITE M      36   PD          0.194 TRUE        0.402
3 345   Docet… 1        75 WHITE M      92   <NA>        0.162 TRUE        0.162
4 588   Docet… 0        61 WHITE F      36   SD          1.02  TRUE        2.05 
5 306   MPDL3… 1        53 WHITE F      86   PD          0.118 TRUE        0.498
6 718   Docet… 1        80 WHITE M      45.7 SD          0.712 TRUE        1.60 
# ℹ 1 more variable: os_event <lgl>
Show the code
summary(os_data)
       id             arm      ecog         age           race     sex    
 1000   :  1   Docetaxel:425   0:315   Min.   :33.00   ASIAN:180   F:330  
 1001   :  1   MPDL3280A:425   1:535   1st Qu.:57.00   OTHER: 72   M:520  
 1002   :  1                           Median :64.00   WHITE:598          
 1003   :  1                           Mean   :63.23                      
 1004   :  1                           3rd Qu.:70.00                      
 1005   :  1                           Max.   :85.00                      
 (Other):844                                                              
      sld         response      pfs_time        pfs_event      
 Min.   : 10.00   CR  :  7   Min.   :0.002738   Mode :logical  
 1st Qu.: 41.00   PR  :108   1st Qu.:0.117728   FALSE:95       
 Median : 66.00   SD  :327   Median :0.240931   TRUE :755      
 Mean   : 76.69   PD  :304   Mean   :0.443174                  
 3rd Qu.: 99.00   NE  : 24   3rd Qu.:0.580424                  
 Max.   :316.00   NA's: 80   Max.   :2.242300                  
 NA's   :1                                                     
    os_time          os_event      
 Min.   :0.002738   Mode :logical  
 1st Qu.:0.364134   FALSE:281      
 Median :0.803559   TRUE :569      
 Mean   :0.949841                  
 3rd Qu.:1.620808                  
 Max.   :2.253251                  
                                   

Now we would like to join the two data sets, i.e. know which tumor growth data belongs to which baseline covariates and overall survival time and event indicator. Unfortunately we cannot use the id variable as it is not consistent between the two data sets, because both were anonymized. Instead, we will use the sld variable in combination with the treatment arm. We can also double check with the best overall response variable, because that is closely related to the tumor growth.

First, we will summarize the patient information int he tumor data set. We will calculate the baseline SLD, determine the time of the last tumor measurement and approximate the best overall response by looking at the change from baseline:

Show the code
get_baseline <- function(sld, year) {
    which_base <- tail(which(year <= 0), 1L)
    if (length(which_base) == 0) {
      which_base <- which.min(year)
    }
    sld[which_base]
}

get_contig_below_thresh <- function(sld, year, bsld, thresh) {
    rle_res <- with(
        rle(((sld[year > 0] - bsld) / bsld) < 0.2), 
        lengths[values]
    )
    if (length(rle_res) == 0) {
      0
    } else {
      max(rle_res)
    }
}

tumor_data_summary <- tumor_data |> 
    group_by(id) |> 
    arrange(year) |>
    summarize(
      arm = arm[1L],
      bsld = get_baseline(sld, year),
      last_year = tail(year, 1L),
      nadir = min(sld[year >= 0], na.rm = TRUE),
      max_cfn = max((sld[year >= 0] - nadir) / nadir, na.rm = TRUE),
      min_cfb = min((sld[year >= 0] - bsld) / bsld, na.rm = TRUE),
      contig_below_0.2 = get_contig_below_thresh(sld, year, bsld, thresh = 0.2),
      approx_response = case_when(
        min_cfb <= -0.3 ~ "PR",
        contig_below_0.2 >= 2 ~ "SD",
        max_cfn >= 0.2 ~ "PD",        
        .default = "NE"
      )   
    )
head(tumor_data_summary)
# A tibble: 6 × 9
  id                arm    bsld last_year nadir max_cfn min_cfb contig_below_0.2
  <fct>             <fct> <dbl>     <dbl> <dbl>   <dbl>   <dbl>            <dbl>
1 -100096848245159… Doce…    53     0.356    55  0       0.0377                3
2 -102777882269110… MPDL…    11     0.901     2  2      -0.818                 7
3 -105049343897269… MPDL…    20     0.345    21  1.62    0.05                  2
4 -106934201929572… Doce…    15     1.72     11  0.273  -0.267                12
5 -108949367808374… Doce…    33     0.709    31  0.0968 -0.0606                6
6 -110624773747042… MPDL…    24     1.32     16  0.562  -0.333                 9
# ℹ 1 more variable: approx_response <chr>

Now let’s try to fuzzy join this with the OS data set, based on the baseline SLD and the treatment arm. In addition, we know that the last time point in the tumor data needs to be before the OS time point. We will also check that the best overall response is the same in both data sets. There will be some errors here: For example, we don’t know whether there were new lesions leading to a progression assessment, or whether the patient had later better results. So we need to expect that the matching will not be perfect.

Show the code
os_data_keys <- os_data |> 
    select(id, arm, sld, pfs_time, os_time, response)
head(os_data_keys)
# A tibble: 6 × 6
  id    arm         sld pfs_time os_time response
  <fct> <fct>     <dbl>    <dbl>   <dbl> <fct>   
1 318   Docetaxel 152      0.107   0.433 PD      
2 1088  Docetaxel  36      0.194   0.402 PD      
3 345   Docetaxel  92      0.162   0.162 <NA>    
4 588   Docetaxel  36      1.02    2.05  SD      
5 306   MPDL3280A  86      0.118   0.498 PD      
6 718   Docetaxel  45.7    0.712   1.60  SD      
Show the code
dist_match <- function(v1, v2) {
    dist <- abs(v1 - v2)
    data.frame(include = (dist <= 0.05))
}

less_match <- function(lower, upper) {
    data.frame(include = lower <= upper)
}

tumor_os_data_joined <- tumor_data_summary |> 
    fuzzy_left_join(
        os_data_keys,
        by = c(
          "arm" = "arm", 
          "bsld" = "sld",          
          "last_year" = "os_time",
          "approx_response" = "response"
        ),
        match_fun = list(
          `==`, 
          dist_match, 
          less_match,
          `==`
        )
    )
nrow(tumor_os_data_joined)
[1] 954

We see that we cannot match the two data sets exactly:

  • We have less patients to start with in the tumor size data set.
  • We have patients in the tumor size data set which match multiple patients in the OS data set based on the treatment arm and the baseline SLD, and also the last measurement time point cannot disambiguate them.
  • We have patients in the tumor size data set which do not match any patient in the OS data set.

However, for the purpose of this training we don’t need to be able to recover the true full data set. Therefore we will just subset the matched data set to a reasonable combination of the two data sets:

Show the code
tumor_os_data_joined_subset <- tumor_os_data_joined |> 
    na.omit() |> 
    filter(!duplicated(id.y)) |>
    filter(!duplicated(id.x))
nrow(tumor_os_data_joined_subset)
[1] 203

Now let’s save the joining information:

Show the code
tgi_os_join_keys <- tumor_os_data_joined_subset |> 
    select(id.x, id.y, arm.x) |> 
    rename(
        id_tgi = id.x,
        id_os = id.y,
        arm = arm.x
    )
table(tgi_os_join_keys$arm)

Docetaxel MPDL3280A 
       96       107 

So we have about 100 patients in each arm.

We subset the two data sets accordingly:

Show the code
tumor_data <- tumor_data |> 
    inner_join(
      tgi_os_join_keys, 
      by = c("id" = "id_tgi", "arm" = "arm")
    ) |> 
    select(-id) |> 
    rename(id = id_os)

os_data <- os_data |>
    inner_join(
      tgi_os_join_keys, 
      by = c("id" = "id_os", "arm" = "arm")
    ) |> 
    select(-id_tgi)