时间分层的病例交叉研究

文献实例

  • Chen, R., Jiang, Y., Hu, J., Chen, H., Li, H., Meng, X., … & Kan, H. (2022). Hourly air pollutants and acute coronary syndrome onset in 1.29 million patients. Circulation (IF=39.9), 145(24), 1749-1760. PDF

  • Liu, Y., Pan, J., Fan, C., Xu, R., Wang, Y., Xu, C., … & Zhang, L. (2021). Short-term exposure to ambient air pollution and mortality from myocardial infarction. Journal of the American College of Cardiology (IF=27.2), 77(3), 271-281. PDF

  • Cai, M., Wei, J., Zhang, S., Liu, W., Wang, L., Qian, Z., Lin, H., Liu, E., McMillin, S.E., Cao, Y., Yin, P. (2023). Short-term air pollution exposure associated with death from kidney diseases: a nationwide time-stratified case-crossover study in China from 2015 to 2019. BMC Medicine (IF=11.8), 21(1), 32. PDF

R语言代码实战

数据读取

pacman::p_load(
  arrow, dplyr, ggplot2, data.table, broom, survival, splines, purrr, latex2exp, patchwork, tableone, lubridate)

cco_pm = arrow::read_parquet(
  'Data/cco_synthetic.parquet') %>% 
  mutate(id = as.integer(id))

glimpse(cco_pm)
Rows: 36,393
Columns: 13
$ id                <int> 8, 8, 8, 8, 18, 18, 18, 18, 21, 21, 21, 21, 21, 30, …
$ case              <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0…
$ date_control      <date> 2015-07-04, 2015-07-11, 2015-07-18, 2015-07-25, 201…
$ date_event        <dttm> 2015-07-18, 2015-07-18, 2015-07-18, 2015-07-18, 201…
$ Temperature_MA_07 <dbl> 25.157479, 26.654723, 27.834193, 26.476728, 5.590609…
$ RH_MA_07          <dbl> 56.29268, 56.55889, 63.31046, 78.44241, 39.37510, 41…
$ PM2.5_MA_01       <dbl> 49.64125, 71.13610, 59.45812, 61.66376, 42.55674, 63…
$ PM2.5_MA_02       <dbl> 51.20873, 69.57587, 61.56069, 61.54747, 51.69360, 72…
$ PM2.5_MA_03       <dbl> 52.53419, 69.62826, 62.17871, 60.31168, 59.29741, 81…
$ PM2.5_MA_04       <dbl> 50.22526, 69.04128, 62.92857, 60.95178, 63.24530, 82…
$ PM2.5_MA_05       <dbl> 50.45423, 67.57404, 64.49090, 61.53650, 65.97250, 85…
$ PM2.5_MA_06       <dbl> 54.99141, 65.71766, 65.11285, 61.19540, 74.70118, 84…
$ PM2.5_MA_07       <dbl> 57.36381, 64.22242, 65.32272, 60.85549, 81.05209, 79…
cco_pm
           id case date_control date_event Temperature_MA_07 RH_MA_07
    1:      8    0   2015-07-04 2015-07-18         25.157479 56.29268
    2:      8    0   2015-07-11 2015-07-18         26.654723 56.55889
    3:      8    1   2015-07-18 2015-07-18         27.834193 63.31046
    4:      8    0   2015-07-25 2015-07-18         26.476728 78.44241
    5:     18    1   2015-03-04 2015-03-04          5.590609 39.37510
   ---                                                               
36389: 101967    0   2019-05-28 2019-05-21         23.694810 43.10004
36390: 101975    0   2019-10-06 2019-10-27         18.622571 56.82667
36391: 101975    0   2019-10-13 2019-10-27         15.517046 74.24209
36392: 101975    0   2019-10-20 2019-10-27         13.168067 62.80148
36393: 101975    1   2019-10-27 2019-10-27         13.901184 61.54964
       PM2.5_MA_01 PM2.5_MA_02 PM2.5_MA_03 PM2.5_MA_04 PM2.5_MA_05 PM2.5_MA_06
    1:    49.64125    51.20873    52.53419    50.22526    50.45423    54.99141
    2:    71.13610    69.57587    69.62826    69.04128    67.57404    65.71766
    3:    59.45812    61.56069    62.17871    62.92857    64.49090    65.11285
    4:    61.66376    61.54747    60.31168    60.95178    61.53650    61.19540
    5:    42.55674    51.69360    59.29741    63.24530    65.97250    74.70118
   ---                                                                        
36389:    24.23324    29.30652    33.04356    33.84328    34.94796    35.59586
36390:    22.31457    29.18031    32.21481    33.99911    35.63098    37.01669
36391:    35.99716    34.55549    32.92004    37.70209    39.71017    39.65420
36392:    78.43440    80.46264    77.49954    73.02039    65.72646    59.30752
36393:    50.84817    44.64038    54.94739    60.00659    63.58455    65.79182
       PM2.5_MA_07
    1:    57.36381
    2:    64.22242
    3:    65.32272
    4:    60.85549
    5:    81.05209
   ---            
36389:    35.43058
36390:    38.24265
36391:    38.10932
36392:    56.35800
36393:    66.64737

创建时间分层的数据

original_data = cco_pm %>% 
  filter(case == 1) %>% 
  select(id, date_event)

original_data
          id date_event
   1:      8 2015-07-18
   2:     18 2015-03-04
   3:     21 2015-01-29
   4:     30 2015-01-13
   5:     32 2015-06-12
  ---                  
8276: 101900 2019-07-02
8277: 101948 2019-05-26
8278: 101955 2019-10-07
8279: 101967 2019-05-21
8280: 101975 2019-10-27
dstrata = data.table(
    date_control = ymd(as.IDate(as.Date(min(original_data$date_event)):
                                  as.Date(max(original_data$date_event))))) %>% 
    .[,`:=`(strata_id = paste0(
      year(date_control),
      stringr::str_pad(month(date_control), width = 2, side = 'left', pad = '0'),
      '0',
      wday(date_control)))]
  
# Create controls for each case
case_control = original_data %>%
  .[, `:=`(strata_id = paste0(
    year(date_event),
    stringr::str_pad(
      month(date_event),
      width = 2,
      side = 'left',
      pad = '0'
    ),
    '0',
    wday(date_event)
  ))] %>%
  merge(dstrata,
        by = 'strata_id',
        all.x = TRUE,
        allow.cartesian = TRUE) %>%
  select(-strata_id) %>% 
  setkey(id, date_control)

case_control
           id date_event date_control
    1:      8 2015-07-18   2015-07-04
    2:      8 2015-07-18   2015-07-11
    3:      8 2015-07-18   2015-07-18
    4:      8 2015-07-18   2015-07-25
    5:     18 2015-03-04   2015-03-04
   ---                               
36389: 101967 2019-05-21   2019-05-28
36390: 101975 2019-10-27   2019-10-06
36391: 101975 2019-10-27   2019-10-13
36392: 101975 2019-10-27   2019-10-20
36393: 101975 2019-10-27   2019-10-27

检查生成的数据框是否一致

identical(
  cco_pm[,.(id, date_event, date_control)] %>% 
    setkey(id, date_control), 
  case_control)
[1] TRUE

描述性统计

var_vec = c("Temperature_MA_07", 
"RH_MA_07", "PM2.5_MA_01", "PM2.5_MA_02", "PM2.5_MA_03", "PM2.5_MA_04", 
"PM2.5_MA_05", "PM2.5_MA_06", "PM2.5_MA_07")

tab1 = cco_pm %>% 
  CreateTableOne(
    vars = var_vec, 
    #factorVars = fact_var,
    strata = 'case', 
    addOverall = TRUE) %>% 
  print(formatOptions = list(big.mark = ","),
        missing = TRUE,
        catDigits = 2,
        contDigits = 2,
        showAllLevels = TRUE,
        printToggle = FALSE) %>% 
    as.data.frame() %>% 
    setDT(x = ., keep.rownames = 'variables') %>% 
    dplyr::select(-test)

tab1
                        variables level        Overall              0
 1:                             n               36,393         28,113
 2: Temperature_MA_07 (mean (SD))         14.83 (9.82)   14.86 (9.81)
 3:          RH_MA_07 (mean (SD))        68.83 (15.65)  68.86 (15.64)
 4:       PM2.5_MA_01 (mean (SD))        44.05 (29.72)  43.95 (29.63)
 5:       PM2.5_MA_02 (mean (SD))        44.09 (28.82)  44.01 (28.67)
 6:       PM2.5_MA_03 (mean (SD))        44.11 (28.13)  44.04 (28.02)
 7:       PM2.5_MA_04 (mean (SD))        44.13 (27.52)  44.05 (27.42)
 8:       PM2.5_MA_05 (mean (SD))        44.11 (27.01)  44.04 (26.91)
 9:       PM2.5_MA_06 (mean (SD))        44.09 (26.62)  44.03 (26.52)
10:       PM2.5_MA_07 (mean (SD))        44.10 (26.31)  44.03 (26.22)
                1      p Missing
 1:         8,280               
 2:  14.73 (9.84)  0.310     0.0
 3: 68.70 (15.68)  0.391     0.0
 4: 44.38 (30.03)  0.254     0.0
 5: 44.36 (29.31)  0.324     0.0
 6: 44.36 (28.51)  0.367     0.0
 7: 44.37 (27.87)  0.358     0.0
 8: 44.35 (27.35)  0.364     0.0
 9: 44.32 (26.94)  0.375     0.0
10: 44.31 (26.63)  0.391     0.0

统计建模-条件逻辑回归

fit_clogit = function(x, tm_RH_lag = 'MA_07', data = cco_pm){
  print(glue::glue('\n----------{x}; {tm_RH_lag}----------\n'))
  iqr_x = IQR(data[[x]], na.rm = TRUE)
  
  clean_fit = clogit(
    formula = as.formula(paste0("case ~ ", 
                    x, 
                    ' + ns(Temperature_',
                    tm_RH_lag,
                    ', df = 5) + ns(RH_',
                    tm_RH_lag,
                    ', df = 5)',
                    ' + strata(id)')),
    data = data,
    method = 'approximate') %>% 
    broom::tidy() %>% 
    mutate(OR = exp(estimate*iqr_x), 3,
           OR_round = round(exp(estimate*iqr_x), 3),
           CI_low = exp(estimate*iqr_x - 1.96*std.error*iqr_x),
           CI_high = exp(estimate*iqr_x + 1.96*std.error*iqr_x),
           CI_low_round = round(exp(estimate*iqr_x - 1.96*std.error*iqr_x), 3),
           CI_high_round = round(exp(estimate*iqr_x + 1.96*std.error*iqr_x), 3),
           predictor = x,
           iqr = iqr_x,
           tm_RH_lag = tm_RH_lag) %>% 
    mutate(result = paste0(format(OR_round, nsmall = 3), 
                           ' (', 
                           format(CI_low_round, nsmall = 3), 
                           '-', 
                           format(CI_high_round, nsmall = 3),
                           ')')) %>% 
    filter(predictor == term) %>% 
    mutate(result = gsub('000e\\+00| ', '', result)) %>% 
    select(predictor, term, iqr, result, OR, CI_low, CI_high, p.value)
  
  return(clean_fit)
}

pm_cols = cco_pm %>% 
  select(starts_with('PM2.5')) %>% 
  names()

pm_cols
[1] "PM2.5_MA_01" "PM2.5_MA_02" "PM2.5_MA_03" "PM2.5_MA_04" "PM2.5_MA_05"
[6] "PM2.5_MA_06" "PM2.5_MA_07"
fit_cco = map_dfr(
  pm_cols,
  fit_clogit,
  data = cco_pm)
----------PM2.5_MA_01; MA_07----------
----------PM2.5_MA_02; MA_07----------
----------PM2.5_MA_03; MA_07----------
----------PM2.5_MA_04; MA_07----------
----------PM2.5_MA_05; MA_07----------
----------PM2.5_MA_06; MA_07----------
----------PM2.5_MA_07; MA_07----------
fit_cco 
# A tibble: 7 × 8
  predictor   term          iqr result                OR CI_low CI_high p.value
  <chr>       <chr>       <dbl> <chr>              <dbl>  <dbl>   <dbl>   <dbl>
1 PM2.5_MA_01 PM2.5_MA_01  29.4 1.023(0.984-1.063)  1.02  0.984    1.06   0.249
2 PM2.5_MA_02 PM2.5_MA_02  29.2 1.022(0.980-1.065)  1.02  0.980    1.07   0.318
3 PM2.5_MA_03 PM2.5_MA_03  29.0 1.023(0.978-1.070)  1.02  0.978    1.07   0.324
4 PM2.5_MA_04 PM2.5_MA_04  28.8 1.029(0.980-1.080)  1.03  0.980    1.08   0.249
5 PM2.5_MA_05 PM2.5_MA_05  28.7 1.035(0.983-1.090)  1.04  0.983    1.09   0.194
6 PM2.5_MA_06 PM2.5_MA_06  28.7 1.040(0.983-1.099)  1.04  0.983    1.10   0.172
7 PM2.5_MA_07 PM2.5_MA_07  28.6 1.042(0.982-1.106)  1.04  0.982    1.11   0.175

暴露反应关系曲线

cco_pm[,quantile(PM2.5_MA_01, c(0.25, 0.5, 0.75))]
     25%      50%      75% 
24.85516 36.29862 54.21549 
fit_pm1 = clogit(
  case ~ ns(PM2.5_MA_01, df = 4, knots = c(24.85516, 36.29862, 54.21549)) + 
    ns(Temperature_MA_07, df = 5) + ns(RH_MA_07, df = 5) + strata(id),
  data = cco_pm) 

sp_pm1 = fit_pm1 %>%
  termplot(se = T, plot = F) %>%
  .[['PM2.5_MA_01']] %>%
  as.data.table() %>% 
  .[,`:=`(HR = exp(y),
          HRmin = exp(y - 1.96*se),
          HRmax = exp(y + 1.96*se))] %>% 
  .[, `:=`(pollutant = 'PM2.5')] %>% 
  .[]

sp_pm1
                  x            y        se        HR     HRmin    HRmax
    1:   0.01227085  0.042335668 0.2042817 1.0432446 0.6990336 1.556948
    2:   0.01321473  0.042334044 0.2042703 1.0432429 0.6990480 1.556911
    3:   0.01686711  0.042327760 0.2042264 1.0432364 0.6991039 1.556767
    4:   0.01824977  0.042325381 0.2042097 1.0432339 0.6991251 1.556713
    5:   0.03312462  0.042299789 0.2040306 1.0432072 0.6993527 1.556126
   ---                                                                 
36342: 364.39734347  0.015824585 0.4892802 1.0159505 0.3893936 2.650673
36343: 372.32051222  0.006956253 0.5136792 1.0069805 0.3679328 2.755964
36344: 382.24794690 -0.004490001 0.5447729 0.9955201 0.3422394 2.895810
36345: 395.03642367 -0.019732329 0.5856061 0.9804611 0.3111375 3.089643
36346: 482.52457202 -0.133471700 0.8801825 0.8750522 0.1558861 4.912024
       pollutant
    1:     PM2.5
    2:     PM2.5
    3:     PM2.5
    4:     PM2.5
    5:     PM2.5
   ---          
36342:     PM2.5
36343:     PM2.5
36344:     PM2.5
36345:     PM2.5
36346:     PM2.5
arr = list('PM1' = TeX("PM$_{1}$"),  
           "PM2.5" = TeX("PM$_{2.5}$"), 
           "PM10" = TeX("PM$_{10}$"))
mylabel = function(val) { return(lapply(val, function(x) arr[x])) }
cco_pm[,quantile(PM2.5_MA_01, c(0.01, 0.99))]
        1%        99% 
  7.021625 154.188829 
color7 = c('#7570b3', '#d95f02', '#1b9e77')

p_pm1 = sp_pm1 %>% 
  .[x >= 7 & x <= 154] %>% 
  .[sample(.N, 1000)] %>% 
  ggplot(aes(x = x, y = HR)) +
  geom_hline(yintercept = 1, linewidth = 0.8, alpha = 0.6, linetype = 'dashed') +
  geom_ribbon(aes(ymin = HRmin, ymax = HRmax), alpha = 0.3, color = NA, fill = color7[1]) +
  geom_line(linewidth = 1.5, color = color7[1]) +
  facet_wrap(~ pollutant, scales = 'free', labeller = mylabel) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0.05)) +
  labs(x = NULL, y = 'Odds of death') +
  theme_test(base_size = 20, base_family = 'serif') + # , base_family = 'serif'
  theme(plot.margin = unit(c(0, 0.5, 0, 0.2), 'cm'),
        plot.title = element_text(size = 20, face = 'bold'),
        strip.background = element_rect(fill = ggplot2::alpha(color7[1], 0.3), color = NA),
        strip.text = element_text(size = 18, face = 'bold'))

p_pm1

ggsave('cco_splines.pdf', p_pm1, width = 8, height = 6, device = cairo_pdf)