队列研究

文献实例

Di, Q., Wang, Y., Zanobetti, A., Wang, Y., Koutrakis, P., Choirat, C., … Schwartz, J. D. (2017). Air pollution and mortality in the Medicare population. New England Journal of Medicine (IF=176), 376(26), 2513-2522. PDF DOI: 10.1056/NEJMoa1702747
Yu, K., Qiu, G., Chan, K. H., Lam, K. B. H., Kurmi, O. P., Bennett, D. A., … Wu, T. (2018). Association of solid fuel use with risk of cardiovascular and all-cause mortality in rural China. JAMA (IF=157), 319(13), 1351-1361. PDF DOI: https://doi.org/10.1001/jama.2018.2151
Cai, M., Li, H., Wu, Y., Zhang. S., Wang. X., Zhang. Z., Lin. H. (2022) Ambient Air Pollution Associated with Body Fat Percentages at Different Body Compartments: A Cohort Study of UK Biobank Participants. Environmental Health Perspectives (IF=11.0). 130(6): 067702. PDF
Cai, M., Lin, X., Wang, X., Zhang, S., Qian, Z., McMillin, S.E., Aaron, H.E., Lin, H., Wei, J., Zhang, Z., Pan, J. (2022) Ambient particulate matter pollution of different sizes associated with recurrent stroke hospitalization in China: a cohort study of 1.07 million stroke patients. Science of the Total Environment (IF=10.8). 159104. PDF

R语言代码实战

Cox回归

读取数据

pacman::p_load(
  arrow, dplyr, ggplot2, data.table, broom, survival, splines, survival, purrr, latex2exp, patchwork, rms)

cohort_cox = arrow::read_parquet(
  'Data/cohort_Cox_synthetic.parquet')

glimpse(cohort_cox)

Rows: 8,262
Columns: 12
$ rehosp_staus      <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0…
$ t2rehosp          <int> 741, 1267, 510, 126, 1076, 402, 980, 589, 502, 524, …
$ age               <dbl> 74, 79, 72, 79, 52, 75, 68, 62, 75, 73, 77, 70, 76, …
$ sex               <fct> Female, Female, Female, Female, Female, Female, Fema…
$ ethnicity         <fct> Han, Han, Han, Han, Han, Han, Han, Han, Han, Han, Ha…
$ marriage          <fct> Married, Married, Married, Married, Married, Married…
$ occupation        <fct> Farmer, Farmer, Other, Other, Other, Farmer, Other, …
$ PM1_MA_365        <dbl> 26.94837, 21.58845, 26.15891, 25.80991, 34.91308, 36…
$ PM2.5_MA_365      <dbl> 41.79592, 31.49986, 36.12802, 42.01941, 48.28277, 44…
$ PM10_MA_365       <dbl> 68.51770, 63.77828, 57.15183, 67.48609, 76.22407, 72…
$ Temperature_MA_07 <dbl> 5.847840, 24.166486, 26.243227, 9.688386, 6.529096, …
$ RH_MA_07          <dbl> 73.08185, 67.75124, 87.87124, 72.36949, 68.59028, 71…

拟合模型

fit0 = coxph(
    formula = Surv(t2rehosp, rehosp_staus) ~ PM1_MA_365 + age + ethnicity + 
      marriage + occupation + ns(Temperature_MA_07, df = 5) + ns(RH_MA_07, df = 5), 
    data = cohort_cox)

class(fit0)

[1] "coxph"

fit0

Call:
coxph(formula = Surv(t2rehosp, rehosp_staus) ~ PM1_MA_365 + age + 
    ethnicity + marriage + occupation + ns(Temperature_MA_07, 
    df = 5) + ns(RH_MA_07, df = 5), data = cohort_cox)

                                    coef exp(coef)  se(coef)      z       p
PM1_MA_365                      0.011054  1.011115  0.003903  2.832 0.00462
age                            -0.001324  0.998677  0.002192 -0.604 0.54576
ethnicityNon-han               -0.058235  0.943428  0.201408 -0.289 0.77247
marriageUnmarried               0.186563  1.205100  0.104045  1.793 0.07296
marriageWidowed                 0.038527  1.039279  0.096506  0.399 0.68973
marriageDivorced                0.207441  1.230525  0.159879  1.297 0.19446
marriageOther                  -0.118606  0.888158  0.158128 -0.750 0.45322
occupationPrivate institution  -0.134948  0.873761  0.151888 -0.888 0.37429
occupationFarmer               -0.101090  0.903852  0.128781 -0.785 0.43247
occupationJobless              -0.149602  0.861050  0.177534 -0.843 0.39942
occupationRetired              -0.101371  0.903598  0.143299 -0.707 0.47931
occupationOther                -0.241753  0.785250  0.131212 -1.842 0.06541
ns(Temperature_MA_07, df = 5)1  0.010090  1.010141  0.762198  0.013 0.98944
ns(Temperature_MA_07, df = 5)2  0.036154  1.036816  0.810714  0.045 0.96443
ns(Temperature_MA_07, df = 5)3 -0.044010  0.956944  0.474768 -0.093 0.92614
ns(Temperature_MA_07, df = 5)4  0.292356  1.339580  1.608833  0.182 0.85580
ns(Temperature_MA_07, df = 5)5 -0.025618  0.974708  0.371333 -0.069 0.94500
ns(RH_MA_07, df = 5)1           0.419264  1.520842  0.664890  0.631 0.52832
ns(RH_MA_07, df = 5)2           0.450642  1.569319  0.691113  0.652 0.51437
ns(RH_MA_07, df = 5)3           0.077616  1.080708  0.392458  0.198 0.84323
ns(RH_MA_07, df = 5)4           0.620296  1.859479  1.408819  0.440 0.65972
ns(RH_MA_07, df = 5)5           0.194996  1.215307  0.419406  0.465 0.64198

Likelihood ratio test=34.99  on 22 df, p=0.03881
n= 8262, number of events= 2474

# 提取结果成数据框，方便导出excel
fit0 %>% 
  tidy()

# A tibble: 22 × 5
   term                          estimate std.error statistic p.value
   <chr>                            <dbl>     <dbl>     <dbl>   <dbl>
 1 PM1_MA_365                     0.0111    0.00390     2.83  0.00462
 2 age                           -0.00132   0.00219    -0.604 0.546  
 3 ethnicityNon-han              -0.0582    0.201      -0.289 0.772  
 4 marriageUnmarried              0.187     0.104       1.79  0.0730 
 5 marriageWidowed                0.0385    0.0965      0.399 0.690  
 6 marriageDivorced               0.207     0.160       1.30  0.194  
 7 marriageOther                 -0.119     0.158      -0.750 0.453  
 8 occupationPrivate institution -0.135     0.152      -0.888 0.374  
 9 occupationFarmer              -0.101     0.129      -0.785 0.432  
10 occupationJobless             -0.150     0.178      -0.843 0.399  
# ℹ 12 more rows

# 提取结果为exp后的HR值，而非线性项的值
fit0 %>% 
  tidy(exponentiate = TRUE)

# A tibble: 22 × 5
   term                          estimate std.error statistic p.value
   <chr>                            <dbl>     <dbl>     <dbl>   <dbl>
 1 PM1_MA_365                       1.01    0.00390     2.83  0.00462
 2 age                              0.999   0.00219    -0.604 0.546  
 3 ethnicityNon-han                 0.943   0.201      -0.289 0.772  
 4 marriageUnmarried                1.21    0.104       1.79  0.0730 
 5 marriageWidowed                  1.04    0.0965      0.399 0.690  
 6 marriageDivorced                 1.23    0.160       1.30  0.194  
 7 marriageOther                    0.888   0.158      -0.750 0.453  
 8 occupationPrivate institution    0.874   0.152      -0.888 0.374  
 9 occupationFarmer                 0.904   0.129      -0.785 0.432  
10 occupationJobless                0.861   0.178      -0.843 0.399  
# ℹ 12 more rows

# 提取结果为exp后的HR值，而非线性项的值
fit0 %>% 
  tidy(exponentiate = TRUE, conf.int = T, conf.level = 0.95)

# A tibble: 22 × 7
   term                  estimate std.error statistic p.value conf.low conf.high
   <chr>                    <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
 1 PM1_MA_365               1.01    0.00390     2.83  0.00462    1.00       1.02
 2 age                      0.999   0.00219    -0.604 0.546      0.994      1.00
 3 ethnicityNon-han         0.943   0.201      -0.289 0.772      0.636      1.40
 4 marriageUnmarried        1.21    0.104       1.79  0.0730     0.983      1.48
 5 marriageWidowed          1.04    0.0965      0.399 0.690      0.860      1.26
 6 marriageDivorced         1.23    0.160       1.30  0.194      0.900      1.68
 7 marriageOther            0.888   0.158      -0.750 0.453      0.651      1.21
 8 occupationPrivate in…    0.874   0.152      -0.888 0.374      0.649      1.18
 9 occupationFarmer         0.904   0.129      -0.785 0.432      0.702      1.16
10 occupationJobless        0.861   0.178      -0.843 0.399      0.608      1.22
# ℹ 12 more rows

fit_cox = function(x, dat = drehosp, scale_num = 10){
  clean_fit = coxph(
    formula = as.formula(paste0("Surv(t2rehosp, rehosp_staus) ~ ", 
                    x, 
                    ' + age + ethnicity + marriage + occupation + 
                    ns(Temperature_MA_07, df = 5) + ns(RH_MA_07, df = 5)')), 
    data = dat) %>% 
    tidy() %>% 
    filter(grepl('PM|O3|CO|NO2|SO2', term)) %>% 
    mutate(OR = exp(estimate*scale_num), 3,
           OR_round = round(exp(estimate*scale_num), 3),
           CI_low = exp(estimate*scale_num - 1.96*std.error*scale_num),
           CI_high = exp(estimate*scale_num + 1.96*std.error*scale_num),
           CI_low_round = round(exp(estimate*scale_num - 1.96*std.error*scale_num), 3),
           CI_high_round = round(exp(estimate*scale_num + 1.96*std.error*scale_num), 3),
           predictor = x) %>% 
    mutate(result = paste0(format(OR_round, nsmall = 3), 
                           ' (', 
                           format(CI_low_round, nsmall = 3), 
                           '-', 
                           format(CI_high_round, nsmall = 3),
                           ')')) %>% 
    select(predictor, term, result, OR, CI_low, CI_high, p.value) 
  
  print(x)
  
  return(clean_fit)
}

estimate_cox = map_dfr(
     c("PM1_MA_365", "PM2.5_MA_365", "PM10_MA_365"), 
     fit_cox, 
     dat = cohort_cox,
     scale_num = 10)

[1] "PM1_MA_365"
[1] "PM2.5_MA_365"
[1] "PM10_MA_365"

estimate_cox

# A tibble: 3 × 7
  predictor    term         result                 OR CI_low CI_high     p.value
  <chr>        <chr>        <chr>               <dbl>  <dbl>   <dbl>       <dbl>
1 PM1_MA_365   PM1_MA_365   1.117 (1.035-1.206)  1.12   1.03    1.21     4.62e-3
2 PM2.5_MA_365 PM2.5_MA_365 1.112 (1.062-1.165)  1.11   1.06    1.17     7.28e-6
3 PM10_MA_365  PM10_MA_365  1.098 (1.061-1.136)  1.10   1.06    1.14     6.20e-8

暴露反应关系

quantile(cohort_cox$PM2.5_MA_365, c(0.25, 0.5, 0.75))

     25%      50%      75% 
38.12038 42.49512 48.53397

quantile(cohort_cox$PM2.5_MA_365, c(0.01, 0.99))

      1%      99% 
21.55512 67.39910

fit_cox_pm2.5 = coxph(
  Surv(t2rehosp, rehosp_staus) ~ ns(PM2.5_MA_365, knots = c(40.48, 60, 63)) + 
    age + ethnicity + marriage + occupation + ns(Temperature_MA_07, df = 5) + ns(RH_MA_07, df = 5),
  data = cohort_cox %>% 
    filter(PM2.5_MA_365 > 21.55 & PM2.5_MA_365 < 67)) 

sp_cox_pm2.5 = fit_cox_pm2.5 %>%
  termplot(se = T, plot = F) %>%
  .[['PM2.5_MA_365']] %>%
  as.data.table() %>% 
  .[,`:=`(HR = exp(y),
          HRmin = exp(y - 1.96*se),
          HRmax = exp(y + 1.96*se))] %>% 
  .[, `:=`(pollutant = 'PM2.5')] %>% 
  .[]

sp_cox_pm2.5

             x          y        se        HR     HRmin     HRmax pollutant
   1: 21.57385 -0.3434542 0.1633379 0.7093160 0.5149958 0.9769578     PM2.5
   2: 21.57581 -0.3434234 0.1633122 0.7093378 0.5150376 0.9769386     PM2.5
   3: 21.60621 -0.3429450 0.1629124 0.7096772 0.5156881 0.9766404     PM2.5
   4: 21.73148 -0.3409740 0.1612655 0.7110774 0.5183760 0.9754136     PM2.5
   5: 21.77162 -0.3403425 0.1607381 0.7115266 0.5192400 0.9750213     PM2.5
  ---                                                                      
8048: 66.53952  0.1896596 0.2733318 1.2088381 0.7074625 2.0655364     PM2.5
8049: 66.64180  0.1940871 0.2833337 1.2142020 0.6968069 2.1157749     PM2.5
8050: 66.73012  0.1979229 0.2920360 1.2188684 0.6876552 2.1604434     PM2.5
8051: 66.93246  0.2067362 0.3121532 1.2296581 0.6669207 2.2672248     PM2.5
8052: 66.95539  0.2077358 0.3144453 1.2308879 0.6645953 2.2797108     PM2.5

arr = list('PM1' = TeX("PM$_{1}$"),  
           "PM2.5" = TeX("PM$_{2.5}$"), 
           "PM10" = TeX("PM$_{10}$"))
mylabel = function(val) { return(lapply(val, function(x) arr[x])) }

color7 = c('#7570b3', '#d95f02', '#1b9e77')

p_cox_pm2.5 = sp_cox_pm2.5 %>% 
  ggplot(aes(x = x, y = HR)) +
  geom_hline(yintercept = 1, linewidth = 0.8, alpha = 0.6, linetype = 'dashed') +
  geom_ribbon(aes(ymin = HRmin, ymax = HRmax), alpha = 0.3, color = NA, fill = color7[1]) +
  geom_line(linewidth = 1.5, color = color7[1]) +
  facet_wrap(~ pollutant, scales = 'free', labeller = mylabel) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0.05)) +
  labs(x = NULL, y = 'Hazard of death') +
  theme_test(base_size = 20, base_family = 'serif') + # , base_family = 'serif'
  theme(plot.margin = unit(c(0, 0.5, 0, 0.2), 'cm'),
        plot.title = element_text(size = 20, face = 'bold'),
        strip.background = element_rect(fill = ggplot2::alpha(color7[1], 0.3), color = NA),
        strip.text = element_text(size = 18, face = 'bold'))

p_cox_pm2.5

可以尝试调节不同节点和不同df对曲线的影响。

多个图形进行拼图

先拟合一个PM\(_1\)的图形：

quantile(cohort_cox$PM10_MA_365, c(0.25, 0.5, 0.75))

     25%      50%      75% 
61.18076 69.45975 77.40257

quantile(cohort_cox$PM10_MA_365, c(0.01, 0.99))

       1%       99% 
 39.12063 105.25648

fit_cox_pm10 = coxph(
  Surv(t2rehosp, rehosp_staus) ~ ns(PM10_MA_365, knots = c(40.48, 80, 90)) + 
    age + ethnicity + marriage + occupation + ns(Temperature_MA_07, df = 5) + ns(RH_MA_07, df = 5),
  data = cohort_cox %>% 
    filter(PM10_MA_365 > 40 & PM10_MA_365 < 105)) 

sp_cox_pm10 = fit_cox_pm10 %>%
  termplot(se = T, plot = F) %>%
  .[['PM10_MA_365']] %>%
  as.data.table() %>% 
  .[,`:=`(HR = exp(y),
          HRmin = exp(y - 1.96*se),
          HRmax = exp(y + 1.96*se))] %>% 
  .[, `:=`(pollutant = 'PM10')] %>% 
  .[]

p_cox_pm10 = sp_cox_pm10 %>% 
  ggplot(aes(x = x, y = HR)) +
  geom_hline(yintercept = 1, linewidth = 0.8, alpha = 0.6, linetype = 'dashed') +
  geom_ribbon(aes(ymin = HRmin, ymax = HRmax), alpha = 0.3, color = NA, fill = color7[2]) +
  geom_line(linewidth = 1.5, color = color7[2]) +
  facet_wrap(~ pollutant, scales = 'free', labeller = mylabel) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0.05)) +
  labs(x = NULL, y = 'Hazard of death') +
  theme_test(base_size = 20, base_family = 'serif') + # , base_family = 'serif'
  theme(plot.margin = unit(c(0, 0.5, 0, 0.2), 'cm'),
        plot.title = element_text(size = 20, face = 'bold'),
        strip.background = element_rect(fill = ggplot2::alpha(color7[2], 0.3), color = NA),
        strip.text = element_text(size = 18, face = 'bold'))

p_cox_pm10

使用patchwork包进行拼图

p_cox_pm2.5|p_cox_pm10

p_cox_pm2.5/p_cox_pm10

限制性立方样条

rcs_cox_pm2.5 <- cph(Surv(t2rehosp, rehosp_staus) ~ rcs(PM2.5_MA_365, c(40.48, 60, 63)) + age + ethnicity + marriage + occupation + rcs(Temperature_MA_07, df = 5), 
                     data = cohort_cox %>% 
    filter(PM2.5_MA_365 > 21.55 & PM2.5_MA_365 < 67))

anova(rcs_cox_pm2.5)

                Wald Statistics          Response: Surv(t2rehosp, rehosp_staus) 

 Factor            Chi-Square d.f. P     
 PM2.5_MA_365      26.36       2   <.0001
  Nonlinear         3.44       1   0.0637
 age                0.62       1   0.4327
 ethnicity          0.00       1   0.9448
 marriage           5.03       4   0.2839
 occupation        12.64       5   0.0270
 Temperature_MA_07  4.29       4   0.3682
  Nonlinear         1.92       3   0.5894
 TOTAL NONLINEAR    5.21       4   0.2661
 TOTAL             44.41      17   0.0003

dd <- datadist(cohort_cox %>% 
    filter(PM2.5_MA_365 > 21.55 & PM2.5_MA_365 < 67))
options(datadist = 'dd')
HR <- Predict(rcs_cox_pm2.5, PM2.5_MA_365, fun = exp, ref.zero = TRUE) %>% 
  setDT()
p2 <- HR %>% 
  ggplot(aes(x = PM2.5_MA_365, y = yhat)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.3) +
  geom_line(linewidth = 1) +
  theme_classic()
p2

混合线性模型

读取数据

pacman::p_load(
  arrow, dplyr, ggplot2, data.table, broom.mixed, lme4, purrr, glue, stringr)

cohort_lme = arrow::read_parquet(
  'Data/cohort_lmer_synthetic.parquet')

glimpse(cohort_lme)

Rows: 22,138
Columns: 13
$ id              <int> 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, …
$ delta_t         <dbl> 0.00, 10.54, 0.00, 9.20, 0.00, 4.71, 9.06, 0.00, 7.54,…
$ PM2.5           <dbl> 10.4, 10.4, 10.4, 10.4, 10.9, 10.9, 10.9, 9.7, 9.7, 11…
$ body_fat_p      <dbl> 1.94000, 25.15878, 2.30000, 27.85871, 2.23000, 14.0947…
$ arm_left_fat_p  <dbl> 0.17300, 20.53660, 0.22500, 24.00601, 0.21500, 10.7182…
$ arm_right_fat_p <dbl> 1.70000, 21.46575, 1.99000, 24.08298, 1.97000, 11.3556…
$ leg_left_fat_p  <dbl> 17.3, 18.7, 17.1, 21.8, 17.5, 19.6, 22.4, 37.2, 40.6, …
$ leg_right_fat_p <dbl> 17.7, 20.3, 14.3, 20.8, 17.2, 20.7, 22.5, 37.4, 41.1, …
$ trunk_fat_p     <dbl> 20.8, 23.5, 27.5, 31.2, 25.4, 27.8, 28.8, 32.6, 37.2, …
$ age             <dbl> 59, 59, 44, 44, 49, 49, 49, 45, 45, 50, 50, 50, 50, 60…
$ sex             <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Male"…
$ ethnicity       <chr> "White", "White", "White", "White", "White", "White", …
$ smoking_status  <fct> 0. Never, 0. Never, 0. Never, 0. Never, 1. Previous, 1…

混合效应模型

\[y_{ij} = \beta_0 + \beta_{1}\text{PM}_{i}\times\text{year}_{ij} + \beta_{2}\text{year}_{ij} + \beta_3\text{age}_i + \beta_4\text{sex}_i + \beta_5\text{smoking}_i + \epsilon_{ij}\]

fit_lme = function(y){
  f0 = lmer(as.formula(glue('{y} ~ PM2.5:delta_t + delta_t + age + sex + smoking_status +
              (1|id)')), data = cohort_lme)
  
  f0 %>% 
    tidy(conf.int = TRUE) %>% 
    filter(grepl('PM2.5', term)) %>% 
    mutate(estimate = estimate,
           conf.low = conf.low,
           conf.high = conf.high) %>% 
    mutate(CI = paste0(str_pad(round(estimate, 3), width = 5, side = 'right', pad = '0'), 
                       ' (', 
                       str_pad(round(conf.low, 3), width = 5, side = 'right', pad = '0'), 
                       ', ', 
                       str_pad(round(conf.high, 3), width = 5, side = 'right', pad = '0'), 
                       ')')) %>%
    mutate(y = y,
           N = nobs(f0)) %>% 
    dplyr::select(y, term, estimate, conf.low, conf.high, CI, std.error)
}

res = map_dfr(
  c("body_fat_p", "arm_left_fat_p", "arm_right_fat_p", 
    "leg_left_fat_p", "leg_right_fat_p", "trunk_fat_p"),
  fit_lme)

res

# A tibble: 6 × 7
  y               term          estimate conf.low conf.high CI         std.error
  <chr>           <chr>            <dbl>    <dbl>     <dbl> <chr>          <dbl>
1 body_fat_p      PM2.5:delta_t  0.0283   0.0186     0.0381 0.028 (0.…   0.00498
2 arm_left_fat_p  PM2.5:delta_t  0.0577   0.0437     0.0717 0.058 (0.…   0.00713
3 arm_right_fat_p PM2.5:delta_t  0.0770   0.0633     0.0907 0.077 (0.…   0.00699
4 leg_left_fat_p  PM2.5:delta_t  0.00748  0.00134    0.0136 0.007 (0.…   0.00313
5 leg_right_fat_p PM2.5:delta_t  0.00783  0.00129    0.0144 0.008 (0.…   0.00333
6 trunk_fat_p     PM2.5:delta_t  0.00659 -0.00237    0.0155 0.007 (-0…   0.00457