时间序列研究

研究设计

背景与参考文献

  • Liu, C., Chen, R., Sera, F., Vicedo-Cabrera, A. M., Guo, Y., Tong, S., … & Kan, H. (2019). Ambient particulate air pollution and daily mortality in 652 cities. New England Journal of Medicine (IF=176), 381(8), 705-715. PDF

  • Chen, G., Guo, Y., Yue, X., Tong, S., Gasparrini, A., Bell, M. L., … & Li, S. (2021). Mortality risk attributable to wildfire-related PM\(_{2.5}\) pollution: a global time series study in 749 locations. The Lancet Planetary Health (IF=29), 5(9), e579-e587. PDF

  • Chen, R., Yin, P., Wang, L., Liu, C., Niu, Y., Wang, W., … & Zhou, M. (2018). Association between ambient temperature and mortality risk and burden: time series study in 272 main Chinese cities. BMJ (IF=93), 363.

  • Chen, R., Yin, P., Meng, X., Liu, C., Wang, L., Xu, X., … & Zhou, M. (2017). Fine particulate air pollution and daily mortality. A nationwide analysis in 272 Chinese cities. American journal of Respiratory and Critical Care Medicine (IF=31), 196(1), 73-81.

模型

两阶段模型法

第一阶段:广义加性线性模型

广义加性线性模型(generalized additive model):

  • 计数型因变量(如发病人数)–> 类泊松连接函数
  • 连续型因变量(如YLL ) –> 高斯连接函数

\[\log E(Y_t) = \alpha + \beta_0 \times\text{PM}_{2.5} \\ + \beta_1 \times\text{day of week}\\ + \beta_2 \times\text{public holidays}\\ s(t, \text{df} = \frac{6}{\text{year}}) \\ + s(\text{temperature}, \text{df} = 6)\\ + s(\text{relative humidity}, \text{df} = 6) \]

第二阶段:随机效应模型meta回归

I\(^2\)统计量用来评价不同城市间的异质性。

文献实例

  • Liu, C., Chen, R., Sera, F., Vicedo-Cabrera, A. M., Guo, Y., Tong, S., … & Kan, H. (2019). Ambient particulate air pollution and daily mortality in 652 cities. New England Journal of Medicine, 381(8), 705-715. PDF

  • Chen, G., Guo, Y., Yue, X., Tong, S., Gasparrini, A., Bell, M. L., … & Li, S. (2021). Mortality risk attributable to wildfire-related PM\(_{2.5}\) pollution: a global time series study in 749 locations. The Lancet Planetary Health (IF=29), 5(9), e579-e587. PDF

  • Jiang, Y., Huang, J., Li, G., Wang, W., Wang, K., Wang, J., … & Wu, S. (2023). Ozone pollution and hospital admissions for cardiovascular events. European Heart Journal, ehad091. DOI: 10.1093/eurheartj/ehad091

R语言代码实战

数据读取与检查

pacman::p_load(lubridate, glue, tidyr, purrr, broom, mgcv, dplyr, data.table, ggplot2)

ts_city = arrow::read_parquet('Data/ts_synthetic.parquet') %>% 
  mutate(admit_date_int = as.integer(admit_date))

glimpse(ts_city)
Rows: 22,958
Columns: 13
$ City              <chr> "city_01", "city_01", "city_01", "city_01", "city_01…
$ admit_date        <date> 2014-01-01, 2014-01-02, 2014-01-03, 2014-01-04, 201…
$ N                 <int> 71, 30, 53, 37, 33, 27, 26, 41, 46, 38, 30, 30, 40, …
$ PM2.5_MA_07       <dbl> 43.48810, 47.60833, 53.47381, 59.12381, 61.48333, 62…
$ SO4_MA_07         <dbl> 8.103643, 8.859619, 9.862833, 10.802631, 10.981190, …
$ NO3_MA_07         <dbl> 11.321107, 12.452488, 14.035452, 15.446714, 15.80534…
$ NH4_MA_07         <dbl> 7.374893, 8.115286, 9.126012, 10.078940, 10.428202, …
$ OM_MA_07          <dbl> 10.740298, 11.796440, 13.649619, 15.619893, 16.65766…
$ BC_MA_07          <dbl> 1.664417, 1.873250, 2.214726, 2.557393, 2.782690, 2.…
$ Temperature_MA_07 <dbl> 8.647635, 8.697264, 9.145584, 9.433634, 9.423733, 10…
$ RH_MA_07          <dbl> 75.89122, 74.89766, 76.77965, 79.10466, 80.76195, 79…
$ dow               <ord> Sunday, Monday, Tuesday, Wednesday, Thursday, Friday…
$ admit_date_int    <int> 16071, 16072, 16073, 16074, 16075, 16076, 16077, 160…
fit_ts_sep = function(pm_components, df_date, y, dat){
  
  fit_model = glue('{y} ~ {pm_components} +
                    s(admit_date_int, k = {df_date}, fx = T, bs = "cr") +
                    dow + s(Temperature_MA_07, k = 6) +
                    s(RH_MA_07, k = 6)') %>%
    as.formula() %>%
    gam(family = quasipoisson(link = "log"),
        scale = -1,
        control = gam.control(epsilon = 0.0000001,
                              maxit = 1000),
        na.action = na.omit,
        data = dat)
  
  coef_vec = summary(fit_model)$p.coeff
  se_vec = summary(fit_model)$se
  pval_vec = summary(fit_model)$p.pv
  
  coef_point = coef_vec[names(coef_vec) == pm_components]
  se_point = se_vec[names(se_vec) == pm_components]
  pval_point = pval_vec[names(pval_vec) == pm_components]
  
  ER = (exp(coef_point) - 1)*100
  CI_lower = (exp(coef_point - 1.96*se_point) - 1)*100
  CI_upper = (exp(coef_point + 1.96*se_point) - 1)*100
  
  ER_95CI = glue("{formatC(round(ER, 2), format = 'f', digits = 2)} ",
                 "({formatC(round(CI_lower, 2), format = 'f', digits = 2)} to ",
                 "{formatC(round(CI_upper, 2), format = 'f', digits = 2)})")
  
  output = data.table(
    pm_components = pm_components, 
    #pm_component_IQR = formatC(round(IQR_range, 2), format = 'f', digits = 2), 
    n = dat[,.N],
    ER_95CI = ER_95CI,
    ER = ER, 
    CI_lower = CI_lower, 
    CI_upper = CI_upper, 
    p_value = pval_point,
    log_OR = coef_point, 
    log_OR_se = se_point)
  
  return(output)
}

map_city = function(dat, df_date, y){
  
  #pb$tick()$print()
  df_date_vector = rep(df_date, length(pollutant_vec7))
  
  res = map2_dfr(pollutant_vec7, df_date, fit_ts_sep, dat = dat, y = y)
  
  return(res)
}

第一阶段GAM建模

计算各污染物的四分位间距

pollutant_vec7 = c(
  "PM2.5_MA_07", "BC_MA_07", "OM_MA_07", "SO4_MA_07", "NO3_MA_07", "NH4_MA_07")

IQR_pollutant = map_dfr(
  pollutant_vec7,
  ~data.table(pollutant = .x, 
              IQR = IQR(ts_city[[.x]], na.rm = TRUE)))

IQR_pollutant
     pollutant        IQR
1: PM2.5_MA_07 22.6991687
2:    BC_MA_07  0.9554241
3:    OM_MA_07  5.6129462
4:   SO4_MA_07  4.1445419
5:   NO3_MA_07  6.7363995
6:   NH4_MA_07  4.3471178

设置各个城市的观测时长的自由度

city_obs_length = ts_city %>% 
  group_by(City) %>% 
  summarise(admit_date_min = min(admit_date),
            admit_date_max = max(admit_date),
            obs_length = as.integer(max(admit_date) - min(admit_date))/365,
            df = round(obs_length)*4) %>% 
  ungroup()

city_obs_length
# A tibble: 21 × 5
   City    admit_date_min admit_date_max obs_length    df
   <chr>   <date>         <date>              <dbl> <dbl>
 1 city_01 2014-01-01     2016-12-30           3.00    12
 2 city_02 2014-01-01     2016-12-30           3.00    12
 3 city_03 2014-01-01     2016-12-30           3.00    12
 4 city_04 2014-01-01     2016-12-30           3.00    12
 5 city_05 2014-01-01     2016-12-30           3.00    12
 6 city_06 2014-01-01     2016-12-30           3.00    12
 7 city_07 2014-01-01     2016-12-30           3.00    12
 8 city_08 2014-01-01     2016-12-30           3.00    12
 9 city_09 2014-01-01     2016-12-29           2.99    12
10 city_10 2014-01-01     2016-12-30           3.00    12
# ℹ 11 more rows

分城市进行建模

models_city = ts_city %>% 
  mutate(across(PM2.5_MA_07:BC_MA_07, ~.x/IQR(.x))) %>% 
  group_by(City) %>% 
  nest() %>%
  left_join(city_obs_length %>% 
              dplyr::select(City, df),
            by = 'City') %>% 
  mutate(models = map2(data, df, map_city, y = 'N'))

models_city
# A tibble: 21 × 4
# Groups:   City [21]
   City    data                     df models      
   <chr>   <list>                <dbl> <list>      
 1 city_01 <tibble [1,095 × 12]>    12 <dt [6 × 8]>
 2 city_02 <tibble [1,095 × 12]>    12 <dt [6 × 8]>
 3 city_03 <tibble [1,095 × 12]>    12 <dt [6 × 8]>
 4 city_04 <tibble [1,095 × 12]>    12 <dt [6 × 8]>
 5 city_05 <tibble [1,095 × 12]>    12 <dt [6 × 8]>
 6 city_06 <tibble [1,095 × 12]>    12 <dt [6 × 8]>
 7 city_07 <tibble [1,095 × 12]>    12 <dt [6 × 8]>
 8 city_08 <tibble [1,095 × 12]>    12 <dt [6 × 8]>
 9 city_09 <tibble [1,094 × 12]>    12 <dt [6 × 8]>
10 city_10 <tibble [1,095 × 12]>    12 <dt [6 × 8]>
# ℹ 11 more rows

计算各城市的模型估计值

estimate_city = models_city %>% 
  select(-data) %>% 
  unnest(cols = c(models)) %>% 
  setDT()
estimate_city
        City df pm_components               ER_95CI         ER   CI_lower
  1: city_01 12   PM2.5_MA_07  0.79 (-2.83 to 4.55)  0.7930900 -2.8296791
  2: city_01 12      BC_MA_07 -0.85 (-3.47 to 1.83) -0.8534384 -3.4659267
  3: city_01 12      OM_MA_07 -0.51 (-3.34 to 2.40) -0.5118881 -3.3424207
  4: city_01 12     SO4_MA_07  2.59 (-1.14 to 6.46)  2.5889907 -1.1405705
  5: city_01 12     NO3_MA_07  1.55 (-2.76 to 6.05)  1.5528460 -2.7551925
 ---                                                                     
122: city_21 12      BC_MA_07  0.38 (-5.67 to 6.82)  0.3789915 -5.6708142
123: city_21 12      OM_MA_07 5.76 (-1.36 to 13.39)  5.7578337 -1.3584923
124: city_21 12     SO4_MA_07  9.94 (0.33 to 20.46)  9.9379058  0.3315034
125: city_21 12     NO3_MA_07 20.06 (8.06 to 33.40) 20.0614717  8.0555993
126: city_21 12     NH4_MA_07 12.56 (2.03 to 24.17) 12.5562146  2.0258434
      CI_upper      p_value       log_OR  log_OR_se
  1:  4.550926 0.6723891302  0.007899615 0.01867575
  2:  1.829751 0.5294120184 -0.008571010 0.01362405
  3:  2.401534 0.7275376000 -0.005132028 0.01472629
  4:  6.459253 0.1763879549  0.025560438 0.01889372
  5:  6.051735 0.4861197066  0.015409127 0.02211619
 ---                                               
122:  6.816802 0.9050826372  0.003782751 0.03171546
123: 13.387555 0.1155189939  0.055981707 0.03554069
124: 20.464089 0.0425078336  0.094745528 0.04665101
125: 33.401296 0.0006954683  0.182833690 0.05375407
126: 24.173455 0.0184442512  0.118282596 0.05011563

第二阶段meta分析

tidy_rma = function(x, dat){
  dat %>% 
    .[pm_components == x] %>% 
    .[,metafor::rma(yi = log_OR, sei = log_OR_se)] %>% 
    tidy(conf.int = T, exponentiate = T) %>% 
    mutate(pollut = x, 
           `ER (95% CI)` = glue(
           "{formatC((estimate - 1)*100, 
           format = 'f', digits = 2)}% ({formatC((conf.low - 1)*100, 
           format = 'f', digits = 2)} to {formatC((conf.high - 1)*100, 
           format = 'f', digits = 2)}%)")) %>% 
    dplyr::select(pollut, 
                  `ER (95% CI)`,
                  OR = estimate, 
                  OR_CI_low = conf.low, 
                  OR_CI_up = conf.high, 
                  p_val = p.value, 
                  se = std.error)
}
estimate_meta = map_dfr(pollutant_vec7, tidy_rma, dat = estimate_city)
estimate_meta
# A tibble: 6 × 7
  pollut      `ER (95% CI)`             OR OR_CI_low OR_CI_up      p_val      se
  <chr>       <glue>                 <dbl>     <dbl>    <dbl>      <dbl>   <dbl>
1 PM2.5_MA_07 0.94% (0.40 to 1.49%)   1.01     1.00      1.01 0.000710   0.00277
2 BC_MA_07    0.04% (-0.33 to 0.41%)  1.00     0.997     1.00 0.824      0.00188
3 OM_MA_07    0.36% (-0.07 to 0.79%)  1.00     0.999     1.01 0.0970     0.00219
4 SO4_MA_07   1.39% (0.82 to 1.98%)   1.01     1.01      1.02 0.00000216 0.00292
5 NO3_MA_07   1.41% (0.67 to 2.15%)   1.01     1.01      1.02 0.000177   0.00373
6 NH4_MA_07   0.97% (0.34 to 1.60%)   1.01     1.00      1.02 0.00239    0.00317

结果可视化

estimate_meta %>%
  mutate(pollutant = gsub('_MA_07', '', pollut)) %>%
  ggplot(aes(pollutant, OR)) +
  geom_hline(yintercept = 1, color = 'red', linewidth = 1) +
  geom_errorbar(aes(ymin = OR_CI_low, ymax = OR_CI_up), width = 0.25) +
  geom_point(size = 4) +
  labs(x = 'Pollutants', y = 'OR (95%)') +
  theme_classic()