<- map(
df_LISS .x = c("./data/ch07a_2p_EN.sav", "./data/ch08b_EN_1.3p.sav",
"./data/ch09c_EN_1.1p.sav", "./data/ch10d_EN_1.0p.sav",
"./data/ch11e_EN_1.0p.sav", "./data/ch12f_EN_1.0p.sav",
"./data/ch13g_EN_1.0p.sav", "./data/ch15h_EN_1.2p.sav",
"./data/ch16i_EN_1.0p.sav", "./data/ch17j_EN_1.0p.sav",
"./data/ch18k_EN_1.0p.sav", "./data/ch19l_EN_1.0p.sav",
"./data/ch20m_EN_1.0p.sav", "./data/ch21n_EN_1.0p.sav"),
read.spss, use.value.labels = FALSE,
to.data.frame = TRUE
|>
) reduce(full_join, by = "nomem_encr") |>
select( # Select exposure, outcome, and covariates
nomem_encr, ends_with("017"), # Weight
ends_with("126"), # Smoking status ("Do you smoke?", 1 = yes, 2 = no, I quit)
ends_with("001"), # Sex
ends_with("002"), # Age
ends_with("005"), # Is your health better or worse than last year?
ends_with("127"), # What do/did you smoke? Cigarettes (including shag)
ends_with("128"), # What do/did you smoke? Pipe
ends_with("129"), # What do/did you smoke? Cigars
ends_with("130"), # Number of cigarettes per day
ends_with("131"), # Number of pipes per day
ends_with("132"), # Number of cigars per day
ends_with("133"), # Alcohol (average last 12 months, categories)
ends_with("080"), # Doctor diagnoses angina (last year, 0 = no, 1 = yes)
ends_with("081"), # Doctor diagnoses heart problems (last year, 0 = no, 1 = yes)
ends_with("082"), # Doctor diagnoses high blood pressure (last year, 0 = no, 1 = yes)
ends_with("083"), # Doctor diagnoses high cholesterol (last year, 0 = no, 1 = yes)
ends_with("084"), # Doctor diagnoses beroerte (last year, 0 = no, 1 = yes)
ends_with("085"), # Doctor diagnoses diabetes (last year, 0 = no, 1 = yes)
ends_with("086"), # Doctor diagnoses lung disease (last year, 0 = no, 1 = yes)
ends_with("087"), # Doctor diagnoses astma (last year, 0 = no, 1 = yes)
ends_with("088"), # Doctor diagnoses artritis (last year, 0 = no, 1 = yes)
ends_with("089"), # Doctor diagnoses cancer (last year, 0 = no, 1 = yes)
ends_with("090"), # Doctor diagnoses gastric ulcer (last year, 0 = no, 1 = yes)
ends_with("092"), # Doctor diagnoses staar (last year, 0 = no, 1 = yes)
ends_with("096"), # Doctor diagnoses goedaardige tumor (last year, 0 = no, 1 = yes)
ends_with("186"), # Hours heavy physical activity (average last week)
ends_with("071") # Do you regularly suffer from heart problems, angina, pain in chest?
|>
) rename(
ID = nomem_encr,
weight_07 = ch07a017, weight_08 = ch08b017, weight_09 = ch09c017, weight_10 = ch10d017,
weight_11 = ch11e017, weight_12 = ch12f017, weight_13 = ch13g017, weight_15 = ch15h017,
weight_16 = ch16i017, weight_17 = ch17j017, weight_18 = ch18k017, weight_19 = ch19l017,
weight_20 = ch20m017, weight_21 = ch21n017,
smoke_07 = ch07a126, smoke_08 = ch08b126, smoke_09 = ch09c126, smoke_10 = ch10d126,
smoke_11 = ch11e126, smoke_12 = ch12f126, smoke_13 = ch13g126, smoke_15 = ch15h126,
smoke_16 = ch16i126, smoke_17 = ch17j126, smoke_18 = ch18k126, smoke_19 = ch19l126,
smoke_20 = ch20m126, smoke_21 = ch21n126,
sex_08 = ch08b001, sex_09 = ch09c001, sex_10 = ch10d001,
sex_11 = ch11e001, sex_12 = ch12f001, sex_13 = ch13g001, sex_15 = ch15h001,
sex_16 = ch16i001, sex_17 = ch17j001, sex_18 = ch18k001, sex_19 = ch19l001,
sex_20 = ch20m001, sex_21 = ch21n001,
age_08 = ch08b002, age_09 = ch09c002, age_10 = ch10d002,
age_11 = ch11e002, age_12 = ch12f002, age_13 = ch13g002, age_15 = ch15h002,
age_16 = ch16i002, age_17 = ch17j002, age_18 = ch18k002, age_19 = ch19l002,
age_20 = ch20m002, age_21 = ch21n002,
geznd_07 = ch07a005, geznd_08 = ch08b005, geznd_09 = ch09c005, geznd_10 = ch10d005,
geznd_11 = ch11e005, geznd_12 = ch12f005, geznd_13 = ch13g005, geznd_15 = ch15h005,
geznd_16 = ch16i005, geznd_17 = ch17j005, geznd_18 = ch18k005, geznd_19 = ch19l005,
geznd_20 = ch20m005, geznd_21 = ch21n005,
tCret_07 = ch07a127, tCret_08 = ch08b127, tCret_09 = ch09c127, tCret_10 = ch10d127,
tCret_11 = ch11e127, tCret_12 = ch12f127, tCret_13 = ch13g127, tCret_15 = ch15h127,
tCret_16 = ch16i127, tCret_17 = ch17j127, tCret_18 = ch18k127, tCret_19 = ch19l127,
tCret_20 = ch20m127, tCret_21 = ch21n127,
tPipe_07 = ch07a128, tPipe_08 = ch08b128, tPipe_09 = ch09c128, tPipe_10 = ch10d128,
tPipe_11 = ch11e128, tPipe_12 = ch12f128, tPipe_13 = ch13g128, tPipe_15 = ch15h128,
tPipe_16 = ch16i128, tPipe_17 = ch17j128, tPipe_18 = ch18k128, tPipe_19 = ch19l128,
tPipe_20 = ch20m128, tPipe_21 = ch21n128,
tCigar_07 = ch07a129, tCigar_08 = ch08b129, tCigar_09 = ch09c129, tCigar_10 = ch10d129,
tCigar_11 = ch11e129, tCigar_12 = ch12f129, tCigar_13 = ch13g129, tCigar_15 = ch15h129,
tCigar_16 = ch16i129, tCigar_17 = ch17j129, tCigar_18 = ch18k129, tCigar_19 = ch19l129,
tCigar_20 = ch20m129, tCigar_21 = ch21n129,
nCret_07 = ch07a130, nCret_08 = ch08b130, nCret_09 = ch09c130, nCret_10 = ch10d130,
nCret_11 = ch11e130, nCret_12 = ch12f130, nCret_13 = ch13g130, nCret_15 = ch15h130,
nCret_16 = ch16i130, nCret_17 = ch17j130, nCret_18 = ch18k130, nCret_19 = ch19l130,
nCret_20 = ch20m130, nCret_21 = ch21n130,
nPipe_07 = ch07a131, nPipe_08 = ch08b131, nPipe_09 = ch09c131, nPipe_10 = ch10d131,
nPipe_11 = ch11e131, nPipe_12 = ch12f131, nPipe_13 = ch13g131, nPipe_15 = ch15h131,
nPipe_16 = ch16i131, nPipe_17 = ch17j131, nPipe_18 = ch18k131, nPipe_19 = ch19l131,
nPipe_20 = ch20m131, nPipe_21 = ch21n131,
nCigar_07 = ch07a132, nCigar_08 = ch08b132, nCigar_09 = ch09c132, nCigar_10 = ch10d132,
nCigar_11 = ch11e132, nCigar_12 = ch12f132, nCigar_13 = ch13g132, nCigar_15 = ch15h132,
nCigar_16 = ch16i132, nCigar_17 = ch17j132, nCigar_18 = ch18k132, nCigar_19 = ch19l132,
nCigar_20 = ch20m132, nCigar_21 = ch21n132,
alc_07 = ch07a133, alc_08 = ch08b133, alc_09 = ch09c133, alc_10 = ch10d133,
alc_11 = ch11e133, alc_12 = ch12f133, alc_13 = ch13g133, alc_15 = ch15h133,
alc_16 = ch16i133, alc_17 = ch17j133, alc_18 = ch18k133, alc_19 = ch19l133,
alc_20 = ch20m133, alc_21 = ch21n133,
dAngina_07 = ch07a080, dAngina_08 = ch08b080, dAngina_09 = ch09c080, dAngina_10 = ch10d080,
dAngina_11 = ch11e080, dAngina_12 = ch12f080, dAngina_13 = ch13g080, dAngina_15 = ch15h080,
dAngina_16 = ch16i080, dAngina_17 = ch17j080, dAngina_18 = ch18k080, dAngina_19 = ch19l080,
dAngina_20 = ch20m080, dAngina_21 = ch21n080,
dHeart_07 = ch07a081, dHeart_08 = ch08b081, dHeart_09 = ch09c081, dHeart_10 = ch10d081,
dHeart_11 = ch11e081, dHeart_12 = ch12f081, dHeart_13 = ch13g081, dHeart_15 = ch15h081,
dHeart_16 = ch16i081, dHeart_17 = ch17j081, dHeart_18 = ch18k081, dHeart_19 = ch19l081,
dHeart_20 = ch20m081, dHeart_21 = ch21n081,
dBlood_07 = ch07a082, dBlood_08 = ch08b082, dBlood_09 = ch09c082, dBlood_10 = ch10d082,
dBlood_11 = ch11e082, dBlood_12 = ch12f082, dBlood_13 = ch13g082, dBlood_15 = ch15h082,
dBlood_16 = ch16i082, dBlood_17 = ch17j082, dBlood_18 = ch18k082, dBlood_19 = ch19l082,
dBlood_20 = ch20m082, dBlood_21 = ch21n082,
dChol_07 = ch07a083, dChol_08 = ch08b083, dChol_09 = ch09c083, dChol_10 = ch10d083,
dChol_11 = ch11e083, dChol_12 = ch12f083, dChol_13 = ch13g083, dChol_15 = ch15h083,
dChol_16 = ch16i083, dChol_17 = ch17j083, dChol_18 = ch18k083, dChol_19 = ch19l083,
dChol_20 = ch20m083, dChol_21 = ch21n083,
dBeroe_07 = ch07a084, dBeroe_08 = ch08b084, dBeroe_09 = ch09c084, dBeroe_10 = ch10d084,
dBeroe_11 = ch11e084, dBeroe_12 = ch12f084, dBeroe_13 = ch13g084, dBeroe_15 = ch15h084,
dBeroe_16 = ch16i084, dBeroe_17 = ch17j084, dBeroe_18 = ch18k084, dBeroe_19 = ch19l084,
dBeroe_20 = ch20m084, dBeroe_21 = ch21n084,
dDiab_07 = ch07a085, dDiab_08 = ch08b085, dDiab_09 = ch09c085, dDiab_10 = ch10d085,
dDiab_11 = ch11e085, dDiab_12 = ch12f085, dDiab_13 = ch13g085, dDiab_15 = ch15h085,
dDiab_16 = ch16i085, dDiab_17 = ch17j085, dDiab_18 = ch18k085, dDiab_19 = ch19l085,
dDiab_20 = ch20m085, dDiab_21 = ch21n085,
dLung_07 = ch07a086, dLung_08 = ch08b086, dLung_09 = ch09c086, dLung_10 = ch10d086,
dLung_11 = ch11e086, dLung_12 = ch12f086, dLung_13 = ch13g086, dLung_15 = ch15h086,
dLung_16 = ch16i086, dLung_17 = ch17j086, dLung_18 = ch18k086, dLung_19 = ch19l086,
dLung_20 = ch20m086, dLung_21 = ch21n086,
dAstm_07 = ch07a087, dAstm_08 = ch08b087, dAstm_09 = ch09c087, dAstm_10 = ch10d087,
dAstm_11 = ch11e087, dAstm_12 = ch12f087, dAstm_13 = ch13g087, dAstm_15 = ch15h087,
dAstm_16 = ch16i087, dAstm_17 = ch17j087, dAstm_18 = ch18k087, dAstm_19 = ch19l087,
dAstm_20 = ch20m087, dAstm_21 = ch21n087,
dArtr_07 = ch07a088, dArtr_08 = ch08b088, dArtr_09 = ch09c088, dArtr_10 = ch10d088,
dArtr_11 = ch11e088, dArtr_12 = ch12f088, dArtr_13 = ch13g088, dArtr_15 = ch15h088,
dArtr_16 = ch16i088, dArtr_17 = ch17j088, dArtr_18 = ch18k088, dArtr_19 = ch19l088,
dArtr_20 = ch20m088, dArtr_21 = ch21n088,
dKank_07 = ch07a089, dKank_08 = ch08b089, dKank_09 = ch09c089, dKank_10 = ch10d089,
dKank_11 = ch11e089, dKank_12 = ch12f089, dKank_13 = ch13g089, dKank_15 = ch15h089,
dKank_16 = ch16i089, dKank_17 = ch17j089, dKank_18 = ch18k089, dKank_19 = ch19l089,
dKank_20 = ch20m089, dKank_21 = ch21n089,
dUlcer_07 = ch07a090, dUlcer_08 = ch08b090, dUlcer_09 = ch09c090, dUlcer_10 = ch10d090,
dUlcer_11 = ch11e090, dUlcer_12 = ch12f090, dUlcer_13 = ch13g090, dUlcer_15 = ch15h090,
dUlcer_16 = ch16i090, dUlcer_17 = ch17j090, dUlcer_18 = ch18k090, dUlcer_19 = ch19l090,
dUlcer_20 = ch20m090, dUlcer_21 = ch21n090,
dStaa_07 = ch07a092, dStaa_08 = ch08b092, dStaa_09 = ch09c092, dStaa_10 = ch10d092,
dStaa_11 = ch11e092, dStaa_12 = ch12f092, dStaa_13 = ch13g092, dStaa_15 = ch15h092,
dStaa_16 = ch16i092, dStaa_17 = ch17j092, dStaa_18 = ch18k092, dStaa_19 = ch19l092,
dStaa_20 = ch20m092, dStaa_21 = ch21n092,
dTumo_07 = ch07a096, dTumo_08 = ch08b096, dTumo_09 = ch09c096, dTumo_10 = ch10d096,
dTumo_11 = ch11e096, dTumo_12 = ch12f096, dTumo_13 = ch13g096, dTumo_15 = ch15h096,
dTumo_16 = ch16i096, dTumo_17 = ch17j096, dTumo_18 = ch18k096, dTumo_19 = ch19l096,
dTumo_20 = ch20m096, dTumo_21 = ch21n096,
hourHPA_07 = ch07a186, hourHPA_08 = ch08b186, hourHPA_09 = ch09c186, hourHPA_10 = ch10d186,
hourHPA_11 = ch11e186, hourHPA_12 = ch12f186, hourHPA_13 = ch13g186, hourHPA_15 = ch15h186,
hourHPA_16 = ch16i186, hourHPA_17 = ch17j186, hourHPA_18 = ch18k186, # From 2019 onwards not measured
sHeart_07 = ch07a071, sHeart_08 = ch08b071, sHeart_09 = ch09c071, sHeart_10 = ch10d071,
sHeart_11 = ch11e071, sHeart_12 = ch12f071, sHeart_13 = ch13g071, sHeart_15 = ch15h071,
sHeart_16 = ch16i071, sHeart_17 = ch17j071, sHeart_18 = ch18k071, sHeart_19 = ch19l071,
sHeart_20 = ch20m071, sHeart_21 = ch21n071
|>
) remove_all_labels()
# For each enrollment year, select target population
<- df_LISS |> filter(smoke_07 == 1)
df_LISS07
<- df_LISS |> filter(is.na(weight_07) & smoke_08 == 1)
df_LISS08
<- df_LISS |>
df_LISS09 filter(is.na(weight_07) & is.na(weight_08) & smoke_09 == 1)
<- df_LISS |>
df_LISS10 filter(is.na(weight_07) & is.na(weight_08) & is.na(weight_09) & smoke_10 == 1)
<- df_LISS |>
df_LISS11 filter(
is.na(weight_07) & is.na(weight_08) & is.na(weight_09) & is.na(weight_10) &
== 1
smoke_11
)
<- df_LISS |>
df_LISS12 filter(
is.na(weight_07) & is.na(weight_08) & is.na(weight_09) & is.na(weight_10) &
is.na(weight_11) &
== 1
smoke_12
)
<- df_LISS |>
df_LISS13 filter(
is.na(weight_07) & is.na(weight_08) & is.na(weight_09) & is.na(weight_10) &
is.na(weight_11) & is.na(weight_12) &
== 1
smoke_13
)
<- df_LISS |>
df_LISS15 filter(
is.na(weight_07) & is.na(weight_08) & is.na(weight_09) & is.na(weight_10) &
is.na(weight_11) & is.na(weight_12) & is.na(weight_13) &
== 1
smoke_15
)
<- df_LISS |>
df_LISS16 filter(
is.na(weight_07) & is.na(weight_08) & is.na(weight_09) & is.na(weight_10) &
is.na(weight_11) & is.na(weight_12) & is.na(weight_13) & is.na(weight_15) &
== 1
smoke_16
)
<- df_LISS |>
df_LISS17 filter(
is.na(weight_07) & is.na(weight_08) & is.na(weight_09) & is.na(weight_10) &
is.na(weight_11) & is.na(weight_12) & is.na(weight_13) & is.na(weight_15) &
is.na(weight_16) &
== 1
smoke_17
)
<- df_LISS |>
df_LISS18 filter(
is.na(weight_07) & is.na(weight_08) & is.na(weight_09) & is.na(weight_10) &
is.na(weight_11) & is.na(weight_12) & is.na(weight_13) & is.na(weight_15) &
is.na(weight_16) & is.na(weight_17) &
== 1
smoke_18
)
<- df_LISS |>
df_LISS19 filter(
is.na(weight_07) & is.na(weight_08) & is.na(weight_09) & is.na(weight_10) &
is.na(weight_11) & is.na(weight_12) & is.na(weight_13) & is.na(weight_15) &
is.na(weight_16) & is.na(weight_17) & is.na(weight_18) &
== 1
smoke_19
)
<- df_LISS |>
df_LISS20 filter(
is.na(weight_07) & is.na(weight_08) & is.na(weight_09) & is.na(weight_10) &
is.na(weight_11) & is.na(weight_12) & is.na(weight_13) & is.na(weight_15) &
is.na(weight_16) & is.na(weight_17) & is.na(weight_18) & is.na(weight_19) &
== 1
smoke_20
)
# Merge target population from year-specific datasets together
<- df_LISS07 |>
df_target bind_rows(
df_LISS08, df_LISS09, df_LISS10, df_LISS11, df_LISS12, df_LISS13,
df_LISS15, df_LISS16, df_LISS17, df_LISS18, df_LISS19, df_LISS20|>
) pivot_longer(
cols = -ID,
names_to = c("var", "time"),
names_sep = "_",
names_transform = list(time = as.numeric)
|>
) pivot_wider(values_from = value, names_from = var) |>
drop_na(smoke) |>
group_by(ID) |>
mutate(time = 0:(n() - 1)) |>
ungroup() |>
filter(time <= 3) |>
pivot_wider(names_from = time, values_from = c(weight:sHeart))
rm(df_LISS07, df_LISS08, df_LISS09, df_LISS10, df_LISS11, df_LISS12, df_LISS13,
df_LISS15, df_LISS16, df_LISS17, df_LISS18, df_LISS19, df_LISS20)
Online supplementary materials
``Estimating causal effects of time-varying exposures: The overlap and differences between structural equation modeling and marginal structural models’’ by Mulder, Luijken, Penning-de Vries, and Hamaker (under review)
Introduction
This document contains the online supplementary materials of Mulder, Luijken, Penning-de Vries, and Hamaker (under review). R code and Mplus syntax for the analysis of the empirical example, and for the simulation study can be found below. All analyses in R were done using version 4.2.2 R Core Team (2022b). We relied on functions from the R packages
- WeightIt (version 0.14.0) by Greifer (2023b);
- cobalt (version 4.5.0) by Greifer (2023a);
- boot (version 1.3-28) by Canty and Ripley (2022);
- mice (version 3.16.0) by Buuren and Groothuis-Oudshoorn (2011);
- foreign (version 0.8-83) by R Core Team (2022a);
- dplyr (version 1.1.4) by Wickham et al. (2023);
- purrr (version 1.0.2) by Wickham and Henry (2023); and
- ggplot2 (version 3.4.4) by Wickham (2016).
Analyses in Mplus were done using version 8.9 Muthén and Muthén (2017).
Empirical example
For the empirical example we made use of the health survey data of the Longitudinal Internet studies for the Social Sciences panel, administered by Centerdata (Tilburg University, The Netherlands). More information about the LISS panel can be found at https://www.lissdata.nl Scherpenzeel (2018). The wave-specific health datasets (from 2007 to 2021) were first imported into R and merged together.
Data cleaning and preparation
Exploration of the data with describe(df_target)
from the psych package revealed individuals with implausible values: These were removed from the dataset. Missing values in this sample were then filled in by single imputation, and the data were exported to Mplus.
# Descriptive sample
::describe(df_target)
psych
# Prepare sample data for analyses
<- df_target |>
df_target_clean filter(is.na(weight_0) | (weight_0 > 20 & weight_0 < 155)) |>
filter(is.na(weight_1) | (weight_1 > 30 & weight_1 < 160)) |>
filter(is.na(weight_3) | (weight_3 > 30 & weight_3 < 160)) |>
filter(is.na(hourHPA_3) | hourHPA_3 < 168) |>
rowwise(ID) |>
mutate(
age = mean(c(age_0, age_1, age_2, age_3), na.rm = TRUE),
sex = mean(c(sex_0, sex_1, sex_2, sex_3), na.rm = TRUE) - 1,
comor1_0 = if_else(sum(c_across(starts_with("d") & ends_with("_0"))) == 1, 1, 0),
comor2_0 = if_else(sum(c_across(starts_with("d") & ends_with("_0"))) > 1, 1, 0),
comor1_1 = if_else(sum(c_across(starts_with("d") & ends_with("_1"))) == 1, 1, 0),
comor2_1 = if_else(sum(c_across(starts_with("d") & ends_with("_1"))) > 1, 1, 0),
comor1_2 = if_else(sum(c_across(starts_with("d") & ends_with("_2"))) == 1, 1, 0),
comor2_2 = if_else(sum(c_across(starts_with("d") & ends_with("_2"))) > 1, 1, 0),
smoke_0 = smoke_0 - 1,
smoke_1 = smoke_1 - 1,
smoke_2 = smoke_2 - 1,
smoke_3 = smoke_3 - 1
|>
) ungroup() |>
select(
-contains("Cret"),
-contains("Pipe"),
-contains("Cigar"),
-starts_with("d"),
-sex_0, -sex_1, -sex_2, -sex_3,
-age_0, -age_1, -age_2, -age_3,
-smoke_0, -smoke_3,
-hourHPA_3, -geznd_3, -sHeart_3, -alc_3
|>
) filter(sex == 0 | sex == 1) |>
mutate_at(c("smoke_1", "smoke_2", "sHeart_0", "sHeart_1", "sHeart_2", "sex",
"comor1_0", "comor2_0", "comor1_1", "comor2_1", "comor1_2",
"comor2_2"), as.factor) |>
mutate_at(c("geznd_0", "geznd_1", "geznd_2", "alc_0", "alc_1", "alc_2"), as.ordered)
# Single imputation
<- mice(df_target_clean, m = 1, seed = 20230728)
fit_imp <- complete(fit_imp)
df_target_imp summary(df_target_imp)
# Export for Mplus
<- function(f){ as.numeric(levels(f))[f] }
factor_to_numeric
|>
df_target_imp mutate_if(is.factor, factor_to_numeric) |>
mutate(smXco1_2 = smoke_1 * comor1_2) |>
::write.table(
utilsfile = "./Mplus/df_target_imp.dat",
sep = "\t",
col.names = FALSE,
row.names = FALSE,
na = "-999",
quote = FALSE
)
Weighting
Stabilized inverse-probability-of-exposure weights were computed, and covariate imbalance was assessed before and after weighted. Below you can also find R code for the creation of Figure 3 in the main manuscript.
# Compute inverse-probability-of-exposure weights
<- weightitMSM(
SW_smoke list(
~ age + sex + # Baseline
smoke_1 + weight_0 + alc_0 + hourHPA_0 + sHeart_0 + comor1_0 + comor2_0 + # Lag 1
geznd_0 + weight_1 + alc_1 + hourHPA_1 + sHeart_1 + comor1_1 + comor2_1, # Lag 0
geznd_1 ~ age + sex + # Baseline
smoke_2 + weight_0 + alc_0 + hourHPA_0 + sHeart_0 + comor1_0 + comor2_0 + # Lag 2
geznd_0 + weight_1 + alc_1 + hourHPA_1 + sHeart_1 + comor1_1 + comor2_1 + # Lag 1
geznd_1 + weight_2 + alc_2 + hourHPA_2 + sHeart_2 + comor1_2 + comor2_2 # Lag 0
geznd_2
),data = df_target_imp,
method = "glm",
stabilize = TRUE,
link = "logit"
)
summary(SW_smoke)
$SW_smoke <- SW_smoke$weights
df_target_imp$PS_1 <- SW_smoke$ps.list[[1]]
df_target_imp$PS_2 <- SW_smoke$ps.list[[2]]
df_target_imp
# Examine positivity by overlap PS distributions
<- df_target_imp |>
plot_PS select(ID, starts_with("smoke"), starts_with("PS")) |>
mutate_at(c("smoke_1", "smoke_2"), as.numeric) |>
pivot_longer(
cols = -ID,
names_to = c("var", "time"),
names_sep = "_",
names_transform = list(time = as.numeric)
|>
) pivot_wider(values_from = value, names_from = var) |>
mutate(facet_time = factor(time, labels = c("Time 1", "Time 2"))) |>
ggplot(aes(x = PS, fill = as.factor(smoke))) +
geom_density(alpha = .7) +
facet_wrap( ~ facet_time) +
theme(
panel.background = element_rect(fill = "white"),
axis.text.x = element_text(color = "black"),
axis.title.y = element_blank(),
panel.border = element_rect(fill = NA, color = "black"),
plot.background = element_blank(),
legend.background = element_blank(),
legend.key = element_blank(),
legend.position = "bottom",
text = element_text(size = 20)
+
) scale_fill_discrete(name = "Exposure", labels = c("Smoking", "Quit smoking"))
ggsave(plot_PS, filename = "./figures/plot_PS.pdf")
# Examine imbalance after reweighing
<- bal.tab(
tab_bal x = list(
~ age + sex + # Baseline
smoke_1 + weight_0 + alc_0 + hourHPA_0 + sHeart_0 + comor1_0 + comor2_0 + # Lag 1
geznd_0 + weight_1 + alc_1 + hourHPA_1 + sHeart_1 + comor1_1 + comor2_1, # Lag 0
geznd_1 ~ age + sex + # Baseline
smoke_2 + weight_0 + alc_0 + hourHPA_0 + sHeart_0 + comor1_0 + comor2_0 + # Lag 2
geznd_0 + weight_1 + alc_1 + hourHPA_1 + sHeart_1 + comor1_1 + comor2_1 + # Lag 1
geznd_1 + weight_2 + alc_2 + hourHPA_2 + sHeart_2 + comor1_2 + comor2_2 # Lag 0
geznd_2
),data = df_target_imp,
stats = c("m", "ks"),
weights = df_target_imp$SW_smoke,
which.time = .all,
un = TRUE
)
<- c(
plot_love_names age = "Age (years)",
sex = "Sex: male",
geznd_0_1 = "Health change t0: considerably poorer ",
geznd_0_2 = "Health change t0: somewhat poorer",
geznd_0_3 = "Health change t0: same ",
geznd_0_4 = "Health change t0: somewhat better ",
geznd_0_5 = "Health change t0: considerably better",
weight_0 = "Weight t0 (kg)",
alc_0_1 = "Alcohol frequency t0: daily",
alc_0_2 = "Alcohol frequency t0: 5/6 PW",
alc_0_3 = "Alcohol frequency t0: 3/4 PW",
alc_0_4 = "Alcohol frequency t0: 1/2 PW",
alc_0_5 = "Alcohol frequency t0: 1/2 PM",
alc_0_6 = "Alcohol frequency t0: 1 P2M",
alc_0_7 = "Alcohol frequency t0: 1/2 PY",
alc_0_8 = "Alcohol frequency t0: never",
hourHPA_0 = "Physical activity t0 (hours per week)",
sHeart_0 = "Heart problems t0: yes",
comor1_0 = "Comorbidity t0: 1 diagnosis",
comor2_0 = "Comorbidity t0: > 1 diagnoses",
geznd_1_1 = "Health change t1: considerably poorer",
geznd_1_2 = "Health change t1: somewhat poorer",
geznd_1_3 = "Health change t1: same ",
geznd_1_4 = "Health change t1: somewhat better ",
geznd_1_5 = "Health change t1: considerably better",
weight_1 = "Weight t1 (kg)",
alc_1_1 = "Alcohol frequency t1: daily",
alc_1_2 = "Alcohol frequency t1: 5/6 PW",
alc_1_3 = "Alcohol frequency t1: 3/4 PW",
alc_1_4 = "Alcohol frequency t1: 1/2 PW",
alc_1_5 = "Alcohol frequency t1: 1/2 PM",
alc_1_6 = "Alcohol frequency t1: 1 P2M",
alc_1_7 = "Alcohol frequency t1: 1/2 PY",
alc_1_8 = "Alcohol frequency t1: never",
hourHPA_1 = "Physical activity t1 (hours per week)",
sHeart_1 = "Heart problems t1: yes",
comor1_1 = "Comorbidity t1: 1 diagnosis",
comor2_1 = "Comorbidity t1: > 1 diagnoses"
)
<- love.plot(
plot_love x = tab_bal,
drop.distance = TRUE,
abs = TRUE,
thresholds = c(m = .2),
var.names = plot_love_names,
shapes = c("triangle filled", "circle filled"),
sample.names = c("Unweighted", "PS weighted"),
line = TRUE,
which.time = .all,
colors = TRUE,
stars = "raw",
position = "bottom"
+
) labs(title = NULL)
ggsave(plot_love, filename = "./figures/plot_love.pdf", height = 8, width = 7)
Estimating effects
The below R code estimates the joint effect of smoking cessation on end-of-study body weight, and computed 95% bootstrapped confidence intervals.
# Fit MSM
## Using bootstrap for 95% CI
<- function(data, i) {
fn_boot <- data[i,]
df_boot
# PS weighting for exposure
<- weightitMSM(
SW_smoke list(
~ age + sex + # Baseline
smoke_1 + weight_0 + alc_0 + hourHPA_0 + sHeart_0 + comor1_0 + comor2_0 + # Lag 1
geznd_0 + weight_1 + alc_1 + hourHPA_1 + sHeart_1 + comor1_1 + comor2_1, # Lag 0
geznd_1 ~ age + sex + # Baseline
smoke_2 + weight_0 + alc_0 + hourHPA_0 + sHeart_0 + comor1_0 + comor2_0 + # Lag 2
geznd_0 + weight_1 + alc_1 + hourHPA_1 + sHeart_1 + comor1_1 + comor2_1 + # Lag 1
geznd_1 + weight_2 + alc_2 + hourHPA_2 + sHeart_2 + comor1_2 + comor2_2 # Lag 0
geznd_2
),data = df_boot,
method = "glm",
stabilize = TRUE,
link = "logit"
)
# Bring weights into the dataset
$weights <- SW_smoke$weights
df_boot
# Fit outcome model
<- glm(
fit formula = weight_3 ~ smoke_1 + smoke_2,
data = df_boot,
weights = weights
)
return(
c(
coef(fit)["smoke_11"],
coef(fit)["smoke_21"],
coef(fit)["smoke_11"] + coef(fit)["smoke_21"]
)
)
}
### Perform bootstrap
set.seed(20230725)
<- boot(df_target_imp, fn_boot, R = 999)
out_boot boot.ci(out_boot, type = "perc", index = 1)
Simulation study
The simulations are based on several helper functions, which are introduced and defined below. Data generation was performed in R, while data analysis was performed both in R (for IPW estimation-based methods) and Mplus (for SEM-based methods). The simulation results from R and Mplus were then extracted and merged across the simulation conditions. Finally, the simulation results were visualized.
Helper functions
Three helper functions were defined which all work together to generate data and compute inverse-probability-of-exposure weights for a specific simulation condition.
This function generates data according to the data generating mechanism in Figure 4 of the main paper. Structural relations between variables are specified as polynomials of order 2. By default the parameters of the quadratic term (denoted by B2_...
) are set to 0 such that only linear relationships between variables are specified (i.e., DGM 1). Using the argument param_funcForm
, the parameters of quadratic terms can be set to other values, thereby allowing for data generated under DGMs 2 to 5. The params_prop_exposed
argument sets the intercepts of the exposure-variables, thereby controlling the proportion exposed for each sample.
<- function(
simulate_data
sample_size, params_funcForm, params_prop_exposed, # Wave 1
B0_L1 = 1, B_C0L1 = 0.1, B2_C0L1 = 0,
B0_A1 = 0, B_C0A1 = 0.1, B_L1A1 = 0.5, B2_L1A1 = 0,
# Wave 2
B0_L2 = 1, B_L1L2 = 0.3, B_A1L2 = 0.4, B_C0L2 = 0.1,
B2_C0L2 = 0, B2_L1L2 = 0,
B0_A2 = -.545, B_A1A2 = 0.8, B_L1A2 = 0.25, B_L2A2 = 0.5, B_C0A2 = 0.1,
B2_L1A2 = 0, B2_L2A2 = 0,
# Outcome
B0_Y = 0, B_L2Y = 0.3, B_L1Y = 0.15, B_A2Y = 0.4, B_A1Y = 0.2, B_C0Y = 0.1,
B2_L1Y = 0, B2_L2Y = 0, B2_C0Y = 0
) {
# Set (mis)specified population values
eval(parse(text = params_funcForm))
# Set intercepts exposure model for proportion exposed
eval(parse(text = params_prop_exposed))
# Simulate data
<- rnorm(sample_size, mean = 4)
C0
<- B0_L1 + # Intercept
L1 * C0 + # Linear
B_C0L1 * I((C0 - mean(C0))^2) + # Quadratic
B2_C0L1 rnorm(sample_size)
<- plogis(
Pr_A1 + # Intercept
B0_A1 * C0 + B_L1A1 * L1 + # Linear
B_C0A1 * I((L1 - mean(L1))^2) # Quadratic
B2_L1A1
)
<- rbinom(n = sample_size, size = 1, prob = Pr_A1)
A1
<- B0_L2 + # Intercept
L2 * L1 + B_A1L2 * A1 + B_C0L2 * C0 + # Linear
B_L1L2 * I((C0 - mean(C0))^2) + B2_L1L2 * I((L1 - mean(L1))^2) + # Quadratic
B2_C0L2 rnorm(sample_size)
<- plogis(
Pr_A2 + # Intercept
B0_A2 * L1 + B_A1A2 * A1 + B_L2A2 * L2 + B_C0A2 * C0 + # Linear
B_L1A2 * I((L1 - mean(L1))^2) + B2_L2A2 * I((L2 - mean(L2))^2) # Quadratic
B2_L1A2
)
<- rbinom(n = sample_size, size = 1, prob = Pr_A2)
A2
<- B0_Y + # Intercept
Y * L2 + B_L1Y * L1 + B_A2Y * A2 + B_A1Y * A1 + B_C0Y * C0 + # Linear
B_L2Y * I((L1 - mean(L1))^2) + B2_L2Y * I((L2 - mean(L2))^2) + # Quadratic
B2_L1Y * I((C0 - mean(C0))^2) +
B2_C0Y rnorm(sample_size)
<- data.frame(
df C0 = C0,
L1 = as.numeric(L1),
A1 = as.numeric(A1),
L2 = as.numeric(L2),
A2 = as.numeric(A2),
Y = as.numeric(Y),
Pr_A1 = Pr_A1,
Pr_A2 = Pr_A2
)
return(df)
}
This function computes stabilized inverse-probability-of-exposure weights using using both a linear propensity score model, and a quadratic propensity score model. For DGMs 4 and 5, the quadratic propensity score model is correct.
# Compute inverse probability of exposure weights
<- function(df, stabilized, m_A1_correct, m_A2_correct) {
compute_weights
# Compute probability of treatment conditional on covariates
<- glm(A1 ~ C0 + L1, data = df, family = "binomial")
fit_ps1 <- glm(A2 ~ C0 + L1 + A1 + L2, data = df, family = "binomial")
fit_ps2
<- glm(as.formula(m_A1_correct), data = df, family = "binomial")
fit_ps1_correct <- glm(as.formula(m_A2_correct), data = df, family = "binomial")
fit_ps2_correct
<- predict(fit_ps1, type = "response")
ps1_den <- predict(fit_ps2, type = "response")
ps2_den
<- predict(fit_ps1_correct, type = "response")
ps1_den_correct <- predict(fit_ps2_correct, type = "response")
ps2_den_correct
# Compute probability of treatment conditional on treatment history
<- glm(A1 ~ 1, data = df, family = "binomial")
fit_ps1_num <- glm(A2 ~ A1, data = df, family = "binomial")
fit_ps2_num
<- predict(fit_ps1_num, type = "response")
ps1_num <- predict(fit_ps2_num, type = "response")
ps2_num
# Compute stabilized inverse probability weights
<- df$A1 * (ps1_num / ps1_den) + (1 - df$A1) * ((1 - ps1_num) / (1 - ps1_den))
ipw1 <- df$A2 * (ps2_num / ps2_den) + (1 - df$A2) * ((1 - ps2_num) / (1 - ps2_den))
ipw2
<- df$A1 * (ps1_num / ps1_den_correct) + (1 - df$A1) * ((1 - ps1_num) / (1 - ps1_den_correct))
ipw1_correct <- df$A2 * (ps2_num / ps2_den_correct) + (1 - df$A2) * ((1 - ps2_num) / (1 - ps2_den_correct))
ipw2_correct
# Compute stabilized weights
<- ipw1 * ipw2
W <- ipw1_correct * ipw2_correct
W_correct <- df |>
df ::bind_cols(ps1 = ps1_den, ps2 = ps2_den, W = W, W_correct = W_correct)
dplyr
return(df)
}
This function controls the simulation process for a particular simulation scenario. Based on the population parameter values associated with a particular simulation scenario, it repeatedly generates data using simulate_data
, and estimates the always-exposed versus never-exposed effect for an IPW method, a lavaan-based SEM method (not in main manuscript), a t-test (not in main manuscript), and a naive regression method. It outputs three dataframes:
df_performance
, containing performance metrics such as bias, convergence issues, etc. per estimation method;df_estimates
, containing the parameter estimates across all replications; anddf_prop_exposed
, containing the proportion exposed in the sample per generated sample.
Each generated dataset is also saved, and exported for analysis in Mplus.
# Function to run simulations for single condition
<- function(condition, n_reps, p, save_folder) {
run_condition
# Extract information
<- as.numeric(condition["sample_size"])
sample_size <- as.numeric(condition["id_condition"])
id_condition <- as.numeric(condition["id_funcForm"])
id_funcForm <- as.numeric(condition["PV_A1"])
PV_A1 <- as.numeric(condition["PV_AvN"])
PV_AvN
# Memory allocation
<- df_regression_A1 <- df_correct_A1 <- df_path_A1 <-
df_IPW_A1 <- df_regression_AvN <- df_ttest_AvN <- df_path_AvN <-
df_IPW_AvN <-
df_correct_AvN data.frame(
matrix(
data = NA,
ncol = 2,
nrow = n_reps,
dimnames = list(NULL, c("PV","est"))
)|>
) bind_cols(inadmissible = FALSE, not_converged = FALSE, error = FALSE)
<- data.frame(
df_prop_exposed matrix(
data = NA,
ncol = 2,
nrow = n_reps,
dimnames = list(NULL, c("n_exposed_A1", "n_exposed_A2"))
)
)
# Create folder for data save
<- file.path(getwd(), save_folder, paste0("condition", id_condition))
save_path dir.create(save_path)
# Replications
for (i in 1:n_reps) {
# Fill in population values
"PV"] <- df_regression_A1[i, "PV"] <- df_path_A1[i, "PV"] <-
df_IPW_A1[i, "PV"] <- PV_A1
df_correct_A1[i,
"PV"] <- df_regression_AvN[i, "PV"] <-
df_IPW_AvN[i, "PV"] <- df_path_AvN[i, "PV"] <-
df_ttest_AvN[i, "PV"] <- PV_AvN
df_correct_AvN[i,
# Simulate data
<- simulate_data(sample_size,
df_dat params_funcForm = condition[["funcForm"]],
params_prop_exposed = condition[["prop_exposed_funcForm"]]
|>
) compute_weights(
stabilized = TRUE,
m_A1_correct = condition[["m_A1_correct"]],
m_A2_correct = condition[["m_A2_correct"]]
)
# Save data
::write.table(df_dat,
utilsfile = file.path(save_path, paste0("df", i, "_condition", id_condition, ".dat")),
sep = "\t", col.names = FALSE, row.names = FALSE, na = "-999"
)
# Fill in proportion exposed in sample
"n_exposed_A1"] <- sum(df_dat$A1)
df_prop_exposed[i, "n_exposed_A2"] <- sum(df_dat$A2)
df_prop_exposed[i,
# Fit models
<- tryCatch(
fit_IPW
{glm(Y ~ A1 + A2, data = df_dat, weights = W)
},error = function(e) {
"error"] <<- TRUE
df_IPW_A1[i, "error"] <<- TRUE
df_IPW_AvN[i,
}
)
<- tryCatch(
fit_regression
{lm(Y ~ A1 + A2, data = df_dat)
},error = function(e) {
"error"] <<- TRUE
df_regression_A1[i, "error"] <<- TRUE
df_regression_AvN[i,
}
)
<- tryCatch(
fit_path
{suppressWarnings(
lavaan(model = m_path, data = df_dat, ordered = c("A1", "A2"))
)
},error = function(e) {
"error"] <<- TRUE
df_path_A1[i, "error"] <<- TRUE
df_path_AvN[i,
}
)
<- tryCatch(
fit_correct
{glm(Y ~ A1 + A2, data = df_dat, weights = W_correct)
},error = function(e) {
"error"] <<- TRUE
df_correct[i,
}
)
<- tryCatch(
fit_ttest
{ <- filter(df_dat, A1 == 1 & A2 == 1 | A1 == 0 & A2 == 0)
df_AvN t.test(Y ~ A1, data = df_AvN)
},error = function(e) {
"error"] <<- TRUE
df_ttest_AvN[i,
}
)
# Check convergence and inadmissibility of IPTW-SEM results
if (is.null(fit_path) || !lavaan::lavInspect(fit_path, what = "converged")) {
"not_converged"] <- TRUE
df_path_A1[i, "not_converged"] <- TRUE
df_path_AvN[i,
}
if (!suppressWarnings(lavaan::lavInspect(fit_path, what = "post.check"))) {
"inadmissible"] <- TRUE
df_path_A1[i, "inadmissible"] <- TRUE
df_path_AvN[i,
}
# Extract point estimates
tryCatch(
{"est"] <- coef(fit_IPW)["A1"]
df_IPW_A1[i,
<- data.frame(C0 = df_dat$C0, A1 = 0, A2 = 0)
df_Y_0 $Y_0 <- predict(fit_IPW, newdata = df_Y_0)
df_Y_0
<- data.frame(C0 = df_dat$C0, A1 = 1, A2 = 1)
df_Y_1 $Y_1 <- predict(fit_IPW, newdata = df_Y_1)
df_Y_1
"est"] <- mean(df_Y_1$Y_1 - df_Y_0$Y_0)
df_IPW_AvN[i,
},error = function(e) {
"error"] <<- TRUE
df_IPW_A1[i, "error"] <<- TRUE
df_IPW_AvN[i,
}
)
tryCatch(
{"est"] <- coef(fit_regression)["A1"]
df_regression_A1[i,
<- data.frame(C0 = df_dat$C0, A1 = 0, A2 = 0)
df_Y_0 $Y_0 <- predict(fit_regression, newdata = df_Y_0)
df_Y_0
<- data.frame(C0 = df_dat$C0, A1 = 1, A2 = 1)
df_Y_1 $Y_1 <- predict(fit_regression, newdata = df_Y_1)
df_Y_1
"est"] <- mean(df_Y_1$Y_1 - df_Y_0$Y_0)
df_regression_AvN[i,
},error = function(e) {
"error"] <<- TRUE
df_regression_A1[i, "error"] <<- TRUE
df_regression_AvN[i,
}
)
tryCatch(
{"est"] <- parameterEstimates(fit_path)[32, "est"]
df_path_A1[i, "est"] <- parameterEstimates(fit_path)[33, "est"]
df_path_AvN[i,
},error = function(e) {
"error"] <<- TRUE
df_regression_A1[i, "error"] <<- TRUE
df_regression_AvN[i,
}
)
tryCatch(
{"est"] <- fit_ttest$estimate[2] - fit_ttest$estimate[1]
df_ttest_AvN[i,
},error = function(e) {
"error"] <<- TRUE
df_ttest_AvN[i,
}
)
tryCatch(
{"est"] <- coef(fit_correct)["A1"]
df_correct_A1[i,
<- data.frame(C0 = df_dat$C0, A1 = 0, A2 = 0)
df_Y_0 $Y_0 <- predict(fit_correct, newdata = df_Y_0)
df_Y_0
<- data.frame(C0 = df_dat$C0, A1 = 1, A2 = 1)
df_Y_1 $Y_1 <- predict(fit_correct, newdata = df_Y_1)
df_Y_1
"est"] <- mean(df_Y_1$Y_1 - df_Y_0$Y_0)
df_correct_AvN[i,
},error = function(e) {
"error"] <<- TRUE
df_correct_A1 [i, "error"] <<- TRUE
df_correct_AvN [i,
}
)
# Signal progress
p()
}
# Compute performance measures
<- list(
df_performance IPW_A1 = df_IPW_A1,
IPW_AvN = df_IPW_AvN,
regression_A1 = df_regression_A1,
regression_AvN = df_regression_AvN,
path_A1 = df_path_A1,
path_AvN = df_path_AvN,
ttest_AvN = df_ttest_AvN,
correct_A1 = df_correct_A1,
correct_AvN = df_correct_AvN
|>
) ::map_dfr(
purrrfunction(x) {
|>
x summarise(
error = sum(error),
inadmissible = sum(inadmissible),
not_converged = sum(not_converged),
est_bias = mean(est - PV, na.rm = TRUE),
est_bias = mean((est - PV)^2, na.rm = TRUE),
PV = mean(PV, na.rm = TRUE)
)
},.id = "model"
|>
) mutate(
id_funcForm = id_funcForm,
sample_size = sample_size,
prop_exposed = as.numeric(condition["prop_exposed"])
)
# Collect point estimates in long-format
<- bind_rows(
df_estimates list(
IPW_A1 = df_IPW_A1,
IPW_AvN = df_IPW_AvN,
regression_A1 = df_regression_A1,
regression_AvN = df_regression_AvN,
path_A1 = df_path_A1,
path_AvN = df_path_AvN,
ttest_AvN = df_ttest_AvN,
correct_A1 = df_correct_A1,
correct_AvN = df_correct_AvN
),.id = "model"
|>
) mutate(
bias = est - PV,
id_funcForm = id_funcForm,
sample_size = sample_size,
prop_exposed = as.numeric(condition["prop_exposed"])
|>
) select(model, bias, id_funcForm, sample_size, prop_exposed)
# Collect proportion exposed in sample
<- df_prop_exposed |>
df_prop_exposed mutate(
id_funcForm = id_funcForm,
sample_size = sample_size,
target_prop_exposed = as.numeric(condition["prop_exposed"]),
sample_prop_exposed_A1 = n_exposed_A1 / sample_size,
sample_prop_exposed_A2 = n_exposed_A2 / sample_size
|>
) select(id_funcForm:sample_prop_exposed_A2)
# Create repList
<- paste0("df", 1:n_reps, "_condition", id_condition, ".dat")
repList ::write.table(repList,
utilsfile = file.path(save_path, paste0("repList_condition", id_condition, ".dat")),
sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE
)
return(list(
df_performance = df_performance,
df_estimates = df_estimates,
df_prop_exposed = df_prop_exposed
)) }
Running simulations
The below R code for creates a dataframe with the different simulation scenarios in the rows, and the columns containing the factors that define a simulation scenario. The run_condition()
function is then called on each row resulting in the generation of data per scenario, and the analysis of the data by various methods. The generated data was exported such that it can be analysed in Mplus as well.
# Specify population parameter values for different data generating mechanisms
<- c(
funcForms "",
"B2_C0L1 <- B2_C0L2 <- 0.1; B2_L1L2 <- 0.3",
"B2_L2Y <- 0.3; B2_C0Y <- 0.1; B2_L1Y <- 0.15",
"B2_L1A1 <- 0.5; B2_L1A2 <- 0.25; B2_L2A2 <- 0.5",
"B2_L1A1 <- 0.5; B2_L1A2 <- 0.25; B2_L2A2 <- 0.5; B2_L2Y <- 0.3; B2_C0Y <- 0.1; B2_L1Y <- 0.15"
)
# Specify correct PS model for "correct" modeling approach
<- c(
m_A1_correct "A1 ~ C0 + L1",
"A1 ~ C0 + L1",
"A1 ~ C0 + L1",
"A1 ~ C0 + L1 + I( (L1 - mean(L1))^2 )",
"A1 ~ C0 + L1 + I( (L1 - mean(L1))^2 )"
)
<- c(
m_A2_correct "A2 ~ C0 + A1 + L1 + L2",
"A2 ~ C0 + A1 + L1 + L2",
"A2 ~ C0 + A1 + L1 + L2",
"A2 ~ C0 + A1 + L1 + L2 + I( (L1 - mean(L1))^2 ) + I( (L2 - mean(L2))^2 )",
"A2 ~ C0 + A1 + L1 + L2 + I( (L1 - mean(L1))^2 ) + I( (L2 - mean(L2))^2 )"
)
# Simulation factor 2 - Sample size
<- c(300, 1000)
sample_sizes
# Simulation factor 3 - Proportion exposed
<- c(.1, .5, .9)
props_exposed
# Set intercepts of binary expose for controlling proportion exposed
<- c(
prop_exposed_funcForm "B0_A1 <- -3.4; B0_A2 <- -4.18", # DGM 1
"B0_A1 <- -1.1; B0_A2 <- -2.16",
"B0_A1 <- 1.2; B0_A2 <- -0.13",
"B0_A1 <- -3.44; B0_A2 <- -4.45", # DGM 2
"B0_A1 <- -1.15; B0_A2 <- -2.39",
"B0_A1 <- 1.14; B0_A2 <- -0.36",
"B0_A1 <- -3.4; B0_A2 <- -4.18", # DGM 3
"B0_A1 <- -1.1; B0_A2 <- -2.15",
"B0_A1 <- 1.2; B0_A2 <- -0.13",
"B0_A1 <- -4.1; B0_A2 <- -5.31", # DGM 4
"B0_A1 <- -1.53; B0_A2 <- -2.87",
"B0_A1 <- 0.8; B0_A2 <- -0.82",
"B0_A1 <- -4.1; B0_A2 <- -5.31", # DGM 5
"B0_A1 <- -1.53; B0_A2 <- -2.87",
"B0_A1 <- 0.8; B0_A2 <- -0.82"
)
# Create simulation conditions
<- data.frame(
conditions funcForm = rep(funcForms, each = 3),
id_funcForm = rep(c(1:5), each = 3),
m_A1_correct = rep(m_A1_correct, each = 3),
m_A2_correct = rep(m_A2_correct, each = 3),
PV_A1 = rep(PVs_A1, each = 3),
PV_AvN = rep(PVs_AvN, each = 3),
prop_exposed = rep(props_exposed, times = 5),
prop_exposed_funcForm = prop_exposed_funcForm
)
<- conditions |>
df_conditions bind_rows(conditions) |>
mutate(
sample_size = rep(sample_sizes, each = nrow(conditions)),
id_condition = row_number()
)
<- asplit(df_conditions, 1)
conditions
## Run simulations
<- 1000
n_reps
::plan(multisession, workers = 7)
future
with_progress({
<- progressor(nrow(conditions)*n_reps)
p
<- furrr::future_map(
out .x = conditions,
.f = run_condition,
n_reps = n_reps,
p = p,
save_folder = "simulation",
.options = furrr::furrr_options(
seed = 20230429
)
)
})
::plan(sequential) future
Per simulation scenario, two Mplus input files were created: One for the analysis using only a linear path model, and one for the analysis using a path model that also contained quadratic relations (when relevant). The syntax below is the linear path model for simulation scenario 13 (DGM 5, sample size of 300, proportion exposed is 0.10). Mplus outputs a text file containing the parameter estimates for each analyzed dataset.
TITLE:
Traditional path analysis for estimating joint effects
and prolongued exposure effect.
- K = 2 (waves)
- All effects assumed linear
DATA:
FILE = repList_condition13.dat;
TYPE = MONTECARLO;
VARIABLE:
NAMES = C0 L1 A1 L2 A2 Y
Pr_A1 Pr_A2 ps1 ps2 W W_correct;
USEVARIABLES = C0-Y;
MISSING = .;
CATEGORICAL = A1 A2;
ANALYSIS:
ESTIMATOR = ML;
LINK = LOGIT; ! Alternatively, LINK = LOGIT
! When ESTIMATOR = WLSMV, LINK defaults to PROBIT.
! When using LINK = LOGIT, use Bootstrapped CI
MODEL:
! Lagged effects on Y
Y ON A1*0.2 A2*0.4 (B_A1Y B_A2Y);
Y ON L1*0.15 L2*0.3 (B_L1Y B_L2Y);
Y ON C0*0.1 (B_C0Y);
! Lagged effects on L
L2 ON L1*0.3 A1*0.4 C0*0.1 (B_L1L2 B_A1L2 B_C0L2);
L1 ON C0*0.1;
! Lagged effects on A
A2 ON C0 A1 L1 L2;
A1 ON C0 L1;
! Variance
C0;
! Residual variances
L1 L2 Y;
! Thresholds
[A1$1 A2$1] (tau1 tau2);
MODEL CONSTRAINT:
NEW(JTE_A1*0.32);
JTE_A1 = B_A1L2*B_L2Y + B_A1Y;
NEW(AvN*0.72);
AvN = JTE_A1 + B_A2Y;
SAVEDATA:
RESULTS = results_condition13.dat;
The syntax below is the quadratic path model (i.e., correctly specified) for simulation scenario 13 (DGM 5, sample size of 300, proportion exposed is 0.10).
TITLE:
Traditional path analysis for estimating joint effects
and prolongued exposure effect.
- K = 2 (waves)
- Model correctly specified
DATA:
FILE = repList_condition13.dat;
TYPE = MONTECARLO;
VARIABLE:
NAMES = C0 L1 A1 L2 A2 Y
Pr_A1 Pr_A2 ps1 ps2 W W_correct;
USEVARIABLES = C0-Y C0C0 L1L1 L2L2;
MISSING = .;
CATEGORICAL = A1 A2;
DEFINE:
CENTER C0 L1 L2 (GRANDMEAN);
C0C0 = C0**2;
L1L1 = L1**2;
L2L2 = L2**2;
ANALYSIS:
ESTIMATOR = ML;
LINK = LOGIT; ! Alternatively, LINK = LOGIT
! When ESTIMATOR = WLSMV, LINK defaults to PROBIT.
! When using LINK = LOGIT, use Bootstrapped CI
MODEL:
! Lagged effects on Y
Y ON A1*0.2 A2*0.4 (B_A1Y B_A2Y);
Y ON L1*0.15 L2*0.3 (B_L1Y B_L2Y);
Y ON C0*0.1 (B_C0Y);
Y ON C0C0*0.1 L1L1*0.15 L2L2*0.3;
! Lagged effects on L
L2 ON L1*0.3 A1*0.4 C0*0.1 (B_L1L2 B_A1L2 B_C0L2);
L2 ON C0C0*0.1 L1L1*0.15;
L1 ON C0*0.1 C0C0*0.1;
! Lagged effects on A
A2 ON C0 A1 L1 L2 L1L1 L2L2;
A1 ON C0 L1 L1L1;
! Variance
C0;
! Residual variances
L1 L2 Y;
! Thresholds
[A1$1 A2$1] (tau1 tau2);
MODEL CONSTRAINT:
NEW(JTE_A1*0.32);
JTE_A1 = B_A1L2*B_L2Y + B_A1Y;
NEW(AvN*0.72);
AvN = JTE_A1 + B_A2Y;
SAVEDATA:
RESULTS = results_correct_condition13.dat;
Extracting simulation results
Output from the analyses in R are saved in an object called out
, and output from Mplus analyses are saved in .dat files per simulation scenario. The below R code collects alls results, and merges them into a single large dataframe.
# Restructure results
<- out |>
out_performance ::map(function(x) {x$df_performance}) |>
purrrbind_rows(.id = "condition")
<- out |>
out_estimates ::map(function(x) {x$df_estimates}) |>
purrrbind_rows(.id = "condition")
<- out |>
out_prop_exposed ::map(function(x) {x$df_prop_exposed}) |>
purrrbind_rows(.id = "condition")
# Mplus results (linear)
<- map(
df_Mplus_linear .x = paste0(".\\simulation\\condition", 1:30, "\\results_condition", 1:30, ".dat"),
.f = read.delim,
header = FALSE
|> map(slice, seq(4, (8*n_reps) - 4, 8)) |>
) map(
.f = separate,
col = V1,
into = c(paste0("P", 21:25), "Mplus_linear_A1", "Mplus_linear_AvN"),
sep = c(15, 31, 47, 63, 79, 95)
|>
) map(select, Mplus_linear_A1, Mplus_linear_AvN) |>
map(
.f = pivot_longer,
cols = everything(),
names_to = "model",
values_to = "estimate",
values_transform = list(estimate = as.numeric)
|>
) list_rbind(names_to = "condition")
## Check if extraction of point estimates was successful: Compare to Mplus .out file
<- df_Mplus_linear |>
check_linear_A1 group_by(condition, model) |>
summarise(
average = mean(estimate),
SD = sd(estimate),
MSE = mean((estimate - .32)^2)
)
## Extract sample_size, prop_exposed, etc. per condition
<- out_estimates |>
df_Mplus_add group_by(condition) |>
slice_head(n = 2*n_reps) |>
mutate(condition = as.numeric(condition)) |>
select(-bias, -model) |>
arrange(condition) |>
ungroup()
## Add Mplus (linear) estimate to other results
<- df_Mplus_linear |>
out_estimates cbind(select(df_Mplus_add, id_funcForm, sample_size, prop_exposed)) |>
mutate(PV = if_else(model == "Mplus_linear_A1", .32, .72), bias = estimate - PV) |>
select(-estimate, - PV) |>
relocate(bias, .before = id_funcForm) |>
rbind(out_estimates)
rm(df_Mplus_add)
# Read in Mplus results DGM 1 (correct)
<- 2
n_params <- 3
n_props <- 2
n_sample_sizes
<- map(
df_Mplus_correct_DGM1 .x = paste0(".\\simulation\\condition", c(1:3, 16:18), "\\results_correct_condition", c(1:3, 16:18), ".dat"),
.f = read.delim,
header = FALSE
|> map(slice, seq(4, (8*n_reps) - 4, 8)) |>
) map(
.f = separate,
col = V1,
into = c(paste0("P", 21:25), "Mplus_correct_A1", "Mplus_correct_AvN"),
sep = c(15, 31, 47, 63, 79, 95)
|>
) map(select, Mplus_correct_A1, Mplus_correct_AvN) |>
map(
.f = pivot_longer,
cols = everything(),
names_to = "model",
values_to = "estimate",
values_transform = list(estimate = as.numeric)
|>
) list_rbind() |>
mutate(
condition = rep(c(1:3, 16:18), each = n_params*n_reps),
id_funcForm = 1,
sample_size = rep(c(300, 1000), each = n_params*n_reps*n_props),
prop_exposed = rep(rep(c(0.1, 0.5, 0.9), each = n_params*n_reps), times = n_sample_sizes),
PV = if_else(model == "Mplus_correct_A1", .32, .72),
bias = estimate - PV
|>
) relocate(bias, .before = id_funcForm) |>
relocate(condition, .before = model)
## Check if extraction of point estimates was successful: Compare to Mplus .out file
<- df_Mplus_correct_DGM1 |>
check_correct_DGM1 group_by(condition, model) |>
summarise(
average = mean(estimate),
SD = sd(estimate),
MSE = mean((estimate - PV)^2)
)
# Read in Mplus results DGM 2, 3, and 4 (correct)
<- map(
df_Mplus_correct_DGM234 .x = paste0(".\\simulation\\condition", c(4:12, 19:27), "\\results_correct_condition", c(4:12, 19:27), ".dat"),
.f = read.delim,
header = FALSE
|> map(slice, seq(4, (8*n_reps) - 4, 8)) |>
) map(
.f = separate,
col = V1,
into = c(paste0("P", 21:28), "Mplus_correct_A1", "Mplus_correct_AvN"),
sep = c(15, 31, 47, 63, 79, 95, 111, 127, 143)
|>
) map(select, Mplus_correct_A1, Mplus_correct_AvN) |>
map(
.f = pivot_longer,
cols = everything(),
names_to = "model",
values_to = "estimate",
values_transform = list(estimate = as.numeric)
|>
) list_rbind() |>
mutate(
condition = rep(c(4:12, 19:27), each = n_params*n_reps),
id_funcForm = rep(rep(c(2:4), each = n_params*n_props*n_reps), times = n_sample_sizes),
sample_size = rep(c(300, 1000), each = n_params*n_reps*n_props*3),
prop_exposed = rep(rep(c(0.1, 0.5, 0.9), each = n_params*n_reps), times = 3*n_sample_sizes),
PV = if_else(model == "Mplus_correct_A1", .32, .72),
bias = estimate - PV
|>
) relocate(bias, .before = id_funcForm) |>
relocate(condition, .before = model)
## Check if extraction of point estimates was successful: Compare to Mplus .out file
<- df_Mplus_correct_DGM234 |>
check_correct_DGM234 group_by(condition, model) |>
summarise(
average = mean(estimate),
SD = sd(estimate),
MSE = mean((estimate - PV)^2)
)
# Read in Mplus results DGM 5 (correct)
<- map(
df_Mplus_correct_DGM5 .x = paste0(".\\simulation\\condition", c(13:15, 28:30), "\\results_correct_condition", c(13:15, 28:30), ".dat"),
.f = read.delim,
header = FALSE
|> map(slice, seq(5, (10*n_reps) - 5, 10)) |>
) map(
.f = separate,
col = V1,
into = c(paste0("P", 31:34), "Mplus_correct_A1", "Mplus_correct_AvN"),
sep = c(15, 31, 47, 63, 79)
|>
) map(select, Mplus_correct_A1, Mplus_correct_AvN) |>
map(
.f = pivot_longer,
cols = everything(),
names_to = "model",
values_to = "estimate",
values_transform = list(estimate = as.numeric)
|>
) list_rbind() |>
mutate(
condition = rep(c(13:15, 28:30), each = n_params*n_reps),
id_funcForm = 5,
sample_size = rep(c(300, 1000), each = n_params*n_reps*n_props),
prop_exposed = rep(rep(c(0.1, 0.5, 0.9), each = n_params*n_reps), times = n_sample_sizes),
PV = if_else(model == "Mplus_correct_A1", .32, .72),
bias = estimate - PV
|>
) relocate(bias, .before = id_funcForm) |>
relocate(condition, .before = model)
## Check if extraction of point estimates was successful: Compare to Mplus .out file
<- df_Mplus_correct_DGM5 |>
check_correct_DGM5 group_by(condition, model) |>
summarise(
average = mean(estimate),
SD = sd(estimate),
MSE = mean((estimate - PV)^2)
)
# Add estimates of Mplus_correct to other estimates
<- map(
out_estimates .x = list(df_Mplus_correct_DGM1, df_Mplus_correct_DGM234, df_Mplus_correct_DGM5),
.f = select,
-PV, -estimate
|>
) reduce(rbind) |>
rbind(out_estimates)
Visualizing simulation results
The below R code visualizes the results. It can be used to create the Figures 6 and 7 in the main manuscript.
<- out_estimates |>
performance_AvN #filter(sample_size == 1000) |>
filter(model == "IPW_AvN" | model == "regression_AvN" |
== "Mplus_linear_AvN" | model == "Mplus_correct_AvN" |
model == "correct_AvN") |>
model mutate(est_AvN = bias + 0.72) |>
group_by(model, prop_exposed, id_funcForm, sample_size) |>
summarise(
est_bias = mean(bias),
SE_bias = sqrt( (1 / (1000 * 999)) * sum((est_AvN - mean(est_AvN))^2) ),
est_MSE = mean(bias^2),
SE_MSE = sqrt( (sum((bias^2 - est_MSE)^2)) / (1000 * 999))
|>
) ungroup()
# Visualize performance statistics
<- performance_AvN |>
plot_bias_AvN_complete #filter(prop_exposed == .1 | prop_exposed == .5) |>
filter(model != "ttest_AvN") |>
mutate(
model_mod = recode_factor(model,
"IPW_AvN" = "IPW (L)",
"Mplus_linear_AvN" = "Path (L)",
"correct_AvN" = "IPW (L, Q)",
"Mplus_correct_AvN" = "Path (L, Q)",
"regression_AvN" = "Unadjusted"
), prop_exposed_mod = recode(prop_exposed,
"0.1" = "10% exposed",
"0.5" = "50% exposed",
"0.9" = "90% exposed"
),id_funcForm_mod = recode(id_funcForm,
"1" = "DGM 1\nNo misspecification",
"2" = "DGM 2\nCovariate",
"3" = "DGM 3\nOutcome",
"4" = "DGM 4\nExposure",
"5" = "DGM 5\nCombined"
)|>
) ggplot(aes(x = est_bias, y = as.factor(model_mod), color = as.factor(sample_size))) +
geom_point(size = 3) +
#geom_pointrange(aes(xmin = est_bias - 1.96*SE_bias, xmax = est_bias + 1.96*SE_bias)) +
scale_y_discrete(limits = c("Unadjusted", "Path (L, Q)", "IPW (L, Q)", "Path (L)", "IPW (L)")) +
scale_x_continuous(n.breaks = 3, limits = c(-.5, .5)) +
geom_vline(xintercept = 0, linetype = 2) +
facet_grid(prop_exposed_mod ~ as.factor(id_funcForm_mod)) +
xlab("Bias") +
labs(color = "Sample size") +
theme(
legend.position = "bottom",
axis.title.y = element_blank(),
text = element_text(size = 20)
)
ggsave(
filename = "./figures/plot_bias_AvN_complete.png",
plot = plot_bias_AvN_complete,
width = 14,
height = 7
)
<- performance_AvN |>
plot_MSE_AvN #filter(prop_exposed == .1 | prop_exposed == .5) |>
filter(model != "ttest_AvN") |>
mutate(
model_mod = recode_factor(model,
"IPW_AvN" = "IPW (L)",
"Mplus_linear_AvN" = "Path (L)",
"correct_AvN" = "IPW (L, Q)",
"Mplus_correct_AvN" = "Path (L, Q)",
"regression_AvN" = "Unadjusted"
), prop_exposed_mod = recode(prop_exposed,
"0.1" = "10% exposed",
"0.5" = "50% exposed",
"0.9" = "90% exposed"
),id_funcForm_mod = recode(id_funcForm,
"1" = "DGM 1\nNo misspecification",
"2" = "DGM 2\nCovariate",
"3" = "DGM 3\nOutcome",
"4" = "DGM 4\nExposure",
"5" = "DGM 5\nCombined"
)|>
) ggplot(aes(x = est_MSE, y = as.factor(model_mod), color = as.factor(sample_size))) +
geom_point(size = 3) +
#geom_pointrange(aes(xmin = est_MSE - 1.96*SE_MSE, xmax = est_MSE + 1.96*SE_MSE)) +
scale_y_discrete(limits = c("Unadjusted", "Path (L, Q)", "IPW (L, Q)", "Path (L)", "IPW (L)")) +
scale_x_continuous(n.breaks = 3, limits = c(0, 1)) +
geom_vline(xintercept = 0, linetype = 2) +
facet_grid(prop_exposed_mod ~ as.factor(id_funcForm_mod)) +
xlab("MSE") +
labs(color = "Sample size") +
theme(
legend.position = "bottom",
axis.title.y = element_blank(),
text = element_text(size = 20)
)
ggsave(
filename = "./figures/plot_MSE_AvN_complete.png",
plot = plot_MSE_AvN,
width = 14,
height = 7
)
# Visualize raw data
<- out_estimates |>
plot_estimates_AvN filter(sample_size == 1000 & (prop_exposed == .1 | prop_exposed == .5)) |>
filter(model == "IPW_AvN" | model == "regression_AvN" |
== "Mplus_linear_AvN" | model == "Mplus_correct_AvN" |
model == "correct_AvN") |>
model mutate(
est_AvN = bias + 0.72,
model_mod = recode_factor(model,
"IPW_AvN" = "MSM (L)",
"Mplus_linear_AvN" = "SEM (L)",
"correct_AvN" = "MSM (C)",
"Mplus_correct_AvN" = "SEM (C)",
"regression_AvN" = "Unadjusted"
), prop_exposed_mod = recode(prop_exposed,
"0.1" = "10% exposed",
"0.5" = "50% exposed"
),id_funcForm_mod = recode(id_funcForm,
"1" = "DGM 1\nNo misspecification",
"2" = "DGM 2\nCovariate",
"3" = "DGM 3\nOutcome",
"4" = "DGM 4\nExposure",
"5" = "DGM 5\nCombined"
)|>
) ggplot(aes(x = est_AvN, y = as.factor(model_mod), fill = as.factor(model_mod))) +
::geom_density_ridges(
ggridgesscale = .9,
quantile_lines = TRUE,
quantiles = c(.025, .5, .975),
alpha = .7
+
) scale_y_discrete(limits = rev) +
geom_vline(xintercept = 0.72, linetype = 2) +
facet_grid(prop_exposed_mod ~ as.factor(id_funcForm_mod)) +
scale_x_continuous(n.breaks = 5, limits = c(-1, 3)) +
xlab(expression(hat(theta))) +
theme(
legend.position = "none",
axis.title.y = element_blank(),
text = element_text(size = 10)
+
) scale_fill_viridis_d(option = "D") # Colorblind-safe colours
ggsave(
filename = "./figures/plot_estimates_AvN.png",
plot = plot_estimates_AvN,
width = 7,
height = 7
)