This document processes and analyzes survey data. The analysis involves calculating various survey metrics, including weighted counts, weighted percentages with 95% confidence intervals, and unweighted counts.
Steps include:
Each step is detailed with specific code snippets to achieve the desired analysis.
Population Assessment of Tobacco and Health (PATH) Study
# remove all the objects present in the workspace
rm(list=ls())
# load the data
# this sample data is a subset of PATH wave1
PATH_wave1 <- read.csv("PATH_wave1_data.csv")
# tobacco product use status
## cigarette
## current: smoked 100+ cigarettes and at least 1 day in the past 30 days
## former: ever used but did not smoke at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(PATH_wave1$UMp1_W1_A_smkstat_Reg_1past30, useNA = "always")
##
## current former missing never <NA>
## 11208 5079 173 15849 0
## cigar (Non-premium)
## current: smoked 100+ cigars and at least 1 day in the past 30 days
## former: ever used but did not smoke at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(PATH_wave1$UM_W1_A_cigarstat_1past30, useNA = "always")
##
## current former missing never <NA>
## 1706 2002 1214 27387 0
## e-cigarette
## current: ever used and vaped at least 1 day in the past 30 days
## former: ever used but did not vape at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(PATH_wave1$UMp1_W1_A_ecigstat_Reg_1past30, useNA = "always")
##
## current former missing never <NA>
## 1369 892 70 29978 0
# age: 1: 18-24, 2: 25-34, 3: 35-54, 4: 55+
table(PATH_wave1$UM_W1_A_agecat, useNA = "always")
##
## 1 2 3 4 <NA>
## 9110 6338 9778 7083 0
# sex: 0: Female, 1: Male
table(PATH_wave1$UM_W1_A_male_imp, useNA = "always")
##
## 0 1 <NA>
## 15989 16320 0
# Filter out rows with missing value
PATH_wave1 = PATH_wave1[!is.na(PATH_wave1$UM_W1_A_agecat),] # age
PATH_wave1 = PATH_wave1[!is.na(PATH_wave1$UM_W1_A_male_imp),] # sex
# generate a combination of tobacco product use status for cigarettes, cigars, and e-cigarettes
# creates a vector that contains all possible combinations
cig_cigar_ecig_name=paste0(rep(c("Never","Former","Current"),each=9),"/",
rep(c("Never","Former","Current"),each=3),"/",
rep(c("Never","Former","Current")))
# loop through all combinations and assign values:
PATH_wave1$UM_W1_cig_cigar1_ecig_27stat=NA
m=1
for(i in c("never","former","current")){
for(j in c("never","former","current")){
for(k in c("never","former","current")){
PATH_wave1$UM_W1_cig_cigar1_ecig_27stat[PATH_wave1$UMp1_W1_A_smkstat_Reg_1past30 %in% i & PATH_wave1$UM_W1_A_cigarstat_1past30 %in% j & PATH_wave1$UMp1_W1_A_ecigstat_Reg_1past30 %in% k]=cig_cigar_ecig_name[m]
m=m+1
}
}
}
# for example, the loop will generate and assign the combination "Current/Former/Never" to the UM_W1_cig_cigar1_ecig_27stat column in PATH_wave1 for observations where cigarettes are "Current", cigars are "Former", and e-cigarettes are "Never".
PATH_wave1$UM_W1_cig_cigar1_ecig_27stat=factor(PATH_wave1$UM_W1_cig_cigar1_ecig_27stat, levels=unique(cig_cigar_ecig_name))
table(PATH_wave1$UM_W1_cig_cigar1_ecig_27stat,useNA = "always")
##
## Never/Never/Never Never/Never/Former Never/Never/Current
## 13978 71 137
## Never/Former/Never Never/Former/Former Never/Former/Current
## 444 25 8
## Never/Current/Never Never/Current/Former Never/Current/Current
## 503 14 45
## Former/Never/Never Former/Never/Former Former/Never/Current
## 3694 137 278
## Former/Former/Never Former/Former/Former Former/Former/Current
## 476 37 46
## Former/Current/Never Former/Current/Former Former/Current/Current
## 221 10 17
## Current/Never/Never Current/Never/Former Current/Never/Current
## 7903 406 612
## Current/Former/Never Current/Former/Former Current/Former/Current
## 769 104 85
## Current/Current/Never Current/Current/Former Current/Current/Current
## 723 65 99
## <NA>
## 1402
PATH_wave1=PATH_wave1[!is.na(PATH_wave1$UM_W1_cig_cigar1_ecig_27stat),]
# set options for survey analysis
options(survey.replicates.mse = TRUE) # compute the mean square error for replicate weights
options(warn = -1) # suppress warnings
options(digits = 20) # set 20 digits to display for numerical output
# create a survey design object using replicate weights
svy_design = svrepdesign(
id = ~PERSONID, # primary sampling unit (PSU) variable
weights = ~R01_A_PWGT, # sampling weights
repweights = "R01_A_PWGT[1-100]+", # replicate weights for PATH wave 1
type = "Fay", # replication method
rho = 0.3, # set the rho parameter for Fay's method
data = PATH_wave1 # your data set
)
# calculate overall survey metrics by grouping the data by tobacco use status combinations
# compute weighted counts, percentages with 95% confidence intervals, and unweighted counts
# add overall indicators for sex and age
metrics_all = as_survey_rep(svy_design) %>% # use as_survey_rep for replicate weights
# group by tobacco use status combinations, and ensures that all levels are included
group_by(UM_W1_cig_cigar1_ecig_27stat, .drop = FALSE) %>%
# calculate the outputs for each group
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se","ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
# indicate that the metrics are for which group, here for the overall population
mutate(sex = "Overall",age = "Overall") %>% ungroup()
metrics_all
## # A tibble: 27 × 10
## UM_W1_cig_cigar1_ecig_27stat `weighted count` `weighted count_se`
## <fct> <dbl> <dbl>
## 1 Never/Never/Never 132831175. 1367042.
## 2 Never/Never/Former 218596. 24553.
## 3 Never/Never/Current 453316. 45260.
## 4 Never/Former/Never 1990235. 123820.
## 5 Never/Former/Former 84903. 23102.
## 6 Never/Former/Current 23038. 11337.
## 7 Never/Current/Never 1707233. 91458.
## 8 Never/Current/Former 37719. 13190.
## 9 Never/Current/Current 146509. 24344.
## 10 Former/Never/Never 39264002. 1197085.
## # ℹ 17 more rows
## # ℹ 7 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>, sex <chr>, age <chr>
# calculation is similar to the overall survey metrics
# but it additionally groups the data by age categories, providing outputs for each age group within each tobacco use status combination
metrics_by_age = as_survey_rep(svy_design) %>%
group_by(UM_W1_A_agecat, UM_W1_cig_cigar1_ecig_27stat, .drop = FALSE) %>%
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
mutate(sex = "Overall") %>% ungroup()
metrics_by_age <- rename(metrics_by_age, age = UM_W1_A_agecat)
metrics_by_age
## # A tibble: 108 × 10
## age UM_W1_cig_cigar1_ecig_27stat `weighted count` `weighted count_se`
## <int> <fct> <dbl> <dbl>
## 1 1 Never/Never/Never 21200441. 203775.
## 2 1 Never/Never/Former 107800. 22259.
## 3 1 Never/Never/Current 203842. 32095.
## 4 1 Never/Former/Never 556847. 28795.
## 5 1 Never/Former/Former 45419. 12391.
## 6 1 Never/Former/Current 10372. 4674.
## 7 1 Never/Current/Never 616245. 59252.
## 8 1 Never/Current/Former 21998. 7329.
## 9 1 Never/Current/Current 59352. 15874.
## 10 1 Former/Never/Never 976824. 64343.
## # ℹ 98 more rows
## # ℹ 6 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>, sex <chr>
# calculation is similar to the overall survey metrics
# but it additionally groups the data by age and sex, providing outputs for each combination of sex, age, and tobacco use status within each tobacco use status combination
metrics_by_sex_age = as_survey_rep(svy_design) %>%
group_by(UM_W1_A_male_imp, UM_W1_A_agecat, UM_W1_cig_cigar1_ecig_27stat, .drop = FALSE) %>%
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
ungroup()
metrics_by_sex_age <- rename(metrics_by_sex_age, sex = UM_W1_A_male_imp)
metrics_by_sex_age <- rename(metrics_by_sex_age, age = UM_W1_A_agecat)
metrics_by_sex_age
## # A tibble: 216 × 10
## sex age UM_W1_cig_cigar1_ecig_27stat `weighted count` `weighted count_se`
## <int> <int> <fct> <dbl> <dbl>
## 1 0 1 Never/Never/Never 11312305. 121360.
## 2 0 1 Never/Never/Former 48967. 11601.
## 3 0 1 Never/Never/Current 80183. 15832.
## 4 0 1 Never/Former/Never 224587. 19852.
## 5 0 1 Never/Former/Former 8944. 6628.
## 6 0 1 Never/Former/Current 0 0
## 7 0 1 Never/Current/Never 194000. 34322.
## 8 0 1 Never/Current/Former 9437. 5015.
## 9 0 1 Never/Current/Current 19447. 7056.
## 10 0 1 Former/Never/Never 530845. 40526.
## # ℹ 206 more rows
## # ℹ 5 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>
# combine survey metrics into a data frame
data <- rbind.data.frame(metrics_all, metrics_by_age, metrics_by_sex_age)
# select relevant columns and recode for readability
PATH_wave1_metrics <- data %>%
dplyr::select(UM_W1_cig_cigar1_ecig_27stat, age, sex,
"weighted count", "unweighted_count", "weighted percent",
"weighted percent_se", "weighted percent_low", "weighted percent_upp") %>%
# filters out rows with zero unweighted counts
filter(unweighted_count > 0)
# add wave identifier and rename columns
PATH_wave1_metrics = PATH_wave1_metrics %>% mutate(wave = 1)
PATH_wave1_metrics <- rename(PATH_wave1_metrics, cig_cigar_ecig = UM_W1_cig_cigar1_ecig_27stat)
PATH_wave1_metrics <- rename(PATH_wave1_metrics, "unweighted count" = unweighted_count)
# recode sex and age variables
PATH_wave1_metrics$sex[PATH_wave1_metrics$sex %in% 0] = "Female"
PATH_wave1_metrics$sex[PATH_wave1_metrics$sex %in% 1] = "Male"
PATH_wave1_metrics$age[PATH_wave1_metrics$age %in% 1] = "18-24"
PATH_wave1_metrics$age[PATH_wave1_metrics$age %in% 2] = "25-34"
PATH_wave1_metrics$age[PATH_wave1_metrics$age %in% 3] = "35-54"
PATH_wave1_metrics$age[PATH_wave1_metrics$age %in% 4] = "55+"
# export the processed data
write.csv(PATH_wave1_metrics,"PATH_wave1_metrics.csv",row.names = F)
National Health Interview Survey (NHIS)
# remove all the objects present in the workspace
rm(list=ls())
# load the data
# this sample data is a subset of NHIS 2023
NHIS_2023 <- read.csv("NHIS_2023_data.csv")
# tobacco product use status
## cigarette
## 1: current: current ever day/some day smoker
## 2: former: former smoker
## 3: never: never smoker
table(NHIS_2023$cig_status, useNA = "always")
##
## 1 2 3 <NA>
## 3135 7235 18111 1041
## e-cigarette
## 1: current e-cigarette user
## 2: used e-cigarette, not current user
## 3: never e-cigarette user
table(NHIS_2023$ecig_status, useNA = "always")
##
## 1 2 3 <NA>
## 1514 3518 23470 1020
# Age: 1: 18-24, 2: 25-34, 3: 35-44, 4: 45-54, 5: 55-64, 6:65+
table(NHIS_2023$agegrp, useNA = "always")
##
## 1 2 3 4 5 6 <NA>
## 1857 4253 4540 4078 5053 9741 0
# Sex: 0: Female, 1: Male
table(NHIS_2023$male, useNA = "always")
##
## 0 1 <NA>
## 16059 13457 6
# Filter out rows with missing value
NHIS_2023=NHIS_2023[!is.na(NHIS_2023$agegrp),] # age
NHIS_2023=NHIS_2023[!is.na(NHIS_2023$male),] # sex
# generate a combination of tobacco product use status for cigarettes and e-cigarettes
# replace the numeric codes with the corresponding descriptive labels
NHIS_2023 <- NHIS_2023 %>%
mutate(
cig_status = recode(cig_status, `1` = "current", `2` = "former", `3` = "never"),
ecig_status = recode(ecig_status, `1` = "current", `2` = "former", `3` = "never")
)
# creates a vector that contains all possible combinations
cig_ecig_name=paste0(rep(c("Never","Former","Current"),each=3),"/",c("Never","Former","Current"))
# loop through all combinations and assign values:
NHIS_2023$NHIS_9stat=NA
k=1
for(i in c("never","former","current")){
for(j in c("never","former","current")){
NHIS_2023$NHIS_9stat[NHIS_2023$cig_status %in% i & NHIS_2023$ecig_status %in% j]=cig_ecig_name[k]
k=k+1
}
}
# for example, the loop will generate and assign the combination "Current/Former" to the NHIS_9stat column for observations where cigarettes are "Current" and e-cigarettes are "Former".
NHIS_2023$NHIS_9stat=factor(NHIS_2023$NHIS_9stat, levels=unique(cig_ecig_name))
NHIS_2023=NHIS_2023[!is.na(NHIS_2023$NHIS_9stat),]
table(NHIS_2023$NHIS_9stat,useNA = "always")
##
## Never/Never Never/Former Never/Current Former/Never Former/Former
## 16415 1299 389 5311 1238
## Former/Current Current/Never Current/Former Current/Current <NA>
## 682 1718 975 439 0
# set options for survey analysis
options(survey.lonely.psu = "adjust") # adjusts for lonely primary sampling units (PSUs) by redistributing weights
options(warn = -1) # suppress warnings
options(digits = 20) # set 20 digits to display for numerical output
# create a survey design object using the svydesign function
svy_design <- svydesign(
id = ~fpx, # primary sampling unit (PSU) variable
strata = ~pstrat, # stratification variable
weights = ~wtfa_sa, # sampling weights
PSU = ~ppsu, # primary sampling unit (PSU)
data = NHIS_2023, # your data set
nest = TRUE # specify that the strata and PSUs are nested
)
# calculate overall survey metrics by grouping the data by tobacco use status combinations
# compute weighted counts, percentages with 95% confidence intervals, and unweighted counts
# add overall indicators for sex and age
metrics_all = as_survey(svy_design) %>% # convert the survey design object to a survey design object
# group by tobacco use status combinations, and ensures that all levels are included
group_by(NHIS_9stat, .drop = FALSE) %>%
# calculate the outputs for each group
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se","ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
# indicate that the metrics are for which group, here for the overall population
mutate(sex = "Overall",age = "Overall") %>% ungroup()
metrics_all
## # A tibble: 9 × 10
## NHIS_9stat `weighted count` `weighted count_se` `weighted percent`
## <fct> <dbl> <dbl> <dbl>
## 1 Never/Never 146919837. 1045216. 0.591
## 2 Never/Former 14270696. 460667. 0.0574
## 3 Never/Current 4743082. 285991. 0.0191
## 4 Former/Never 37902940. 561263. 0.152
## 5 Former/Former 11011062. 357467. 0.0443
## 6 Former/Current 7000193. 311200. 0.0281
## 7 Current/Never 13837362. 386528. 0.0556
## 8 Current/Former 8585694. 320869. 0.0345
## 9 Current/Current 4511436. 248812. 0.0181
## # ℹ 6 more variables: `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>, sex <chr>, age <chr>
# calculation is similar to the overall survey metrics
# but it additionally groups the data by age categories, providing outputs for each age group within each tobacco use status combination
metrics_by_age = as_survey(svy_design) %>%
group_by(agegrp, NHIS_9stat, .drop = FALSE) %>%
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
mutate(sex = "Overall") %>% ungroup()
metrics_by_age <- rename(metrics_by_age, age = agegrp)
metrics_by_age
## # A tibble: 54 × 10
## age NHIS_9stat `weighted count` `weighted count_se` `weighted percent`
## <int> <fct> <dbl> <dbl> <dbl>
## 1 1 Never/Never 19712179. 638174. 0.660
## 2 1 Never/Former 4913474. 318508. 0.165
## 3 1 Never/Current 2558333. 232928. 0.0857
## 4 1 Former/Never 202699. 66884. 0.00679
## 5 1 Former/Former 516129. 91968. 0.0173
## 6 1 Former/Current 914901. 134160. 0.0306
## 7 1 Current/Never 152710. 50947. 0.00511
## 8 1 Current/Former 362278. 86702. 0.0121
## 9 1 Current/Current 530331. 95138. 0.0178
## 10 2 Never/Never 24745127. 566621. 0.579
## # ℹ 44 more rows
## # ℹ 5 more variables: `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>, sex <chr>
# calculation is similar to the overall survey metrics
# but it additionally groups the data by age and sex, providing outputs for each combination of sex, age, and tobacco use status within each tobacco use status combination
metrics_by_sex_age = as_survey(svy_design) %>%
group_by(male, agegrp, NHIS_9stat, .drop = FALSE) %>%
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
ungroup()
metrics_by_sex_age <- rename(metrics_by_sex_age, sex = male)
metrics_by_sex_age <- rename(metrics_by_sex_age, age = agegrp)
metrics_by_sex_age
## # A tibble: 108 × 10
## sex age NHIS_9stat `weighted count` `weighted count_se`
## <int> <int> <fct> <dbl> <dbl>
## 1 0 1 Never/Never 10032119. 456161.
## 2 0 1 Never/Former 2503567. 227313.
## 3 0 1 Never/Current 1265160. 164763.
## 4 0 1 Former/Never 92241. 47705.
## 5 0 1 Former/Former 244164. 59582.
## 6 0 1 Former/Current 334887. 81472.
## 7 0 1 Current/Never 58439. 28240.
## 8 0 1 Current/Former 121394. 50683.
## 9 0 1 Current/Current 199541. 57610.
## 10 0 2 Never/Never 13863422. 420756.
## # ℹ 98 more rows
## # ℹ 5 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>
# combine survey metrics into a data frame
NHIS_2023_metrics<-rbind.data.frame(metrics_all,metrics_by_age,metrics_by_sex_age)
# select relevant columns and recode for readability
NHIS_2023_metrics<-NHIS_2023_metrics %>%
dplyr::select(NHIS_9stat,age,sex,"weighted count","unweighted_count","weighted percent","weighted percent_se","weighted percent_low","weighted percent_upp") %>%
# filter to keep only rows where unweighted_count > 0
filter(unweighted_count>0)
NHIS_2023_metrics<-rename(NHIS_2023_metrics,cig_ecig=NHIS_9stat)
NHIS_2023_metrics<-rename(NHIS_2023_metrics,"unweighted count"=unweighted_count)
# recode sex and age variables
NHIS_2023_metrics$sex[NHIS_2023_metrics$sex %in% 1]="Male"
NHIS_2023_metrics$sex[NHIS_2023_metrics$sex %in% 0]="Female"
NHIS_2023_metrics$age[NHIS_2023_metrics$age %in% 1]="18-24"
NHIS_2023_metrics$age[NHIS_2023_metrics$age %in% 2]="25-34"
NHIS_2023_metrics$age[NHIS_2023_metrics$age %in% 3]="35-44"
NHIS_2023_metrics$age[NHIS_2023_metrics$age %in% 4]="45-54"
NHIS_2023_metrics$age[NHIS_2023_metrics$age %in% 5]="55-64"
NHIS_2023_metrics$age[NHIS_2023_metrics$age %in% 6]="65+"
# export the processed data
write.csv(NHIS_2023_metrics,"NHIS_2023_metrics.csv",row.names = T)
National Survey on Drug Use and Health (NSDUH)
# remove all the objects present in the workspace
rm(list=ls())
# load the data
# this sample data is a subset of NSDUH 2022
NSDUH_2022 <- read.csv("NSDUH_2022_data.csv")
# tobacco product use status
## cigarette
## current: smoked 100+ cigarettes and at least 1 day in the past 30 days
## former: ever used but did not smoke at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(NSDUH_2022$UM_smkstat, useNA = "always")
##
## current former missing never <NA>
## 7166 17709 59 34135 0
## cigar
## current: ever smoked and smoked at least 1 day in the past 30 days
## former: ever used but did not smoke at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(NSDUH_2022$UM_cigarstat, useNA = "always")
##
## current former missing never <NA>
## 2116 12893 46 44014 0
## e-cigarette
## current: ever used and vaped at least 1 day in the past 30 days
## former: ever used but did not vape at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(NSDUH_2022$UM_ecigstat, useNA = "always")
##
## current former missing never <NA>
## 6663 10489 136 41781 0
# Age: 1: ~17, 2: 18-25, 3: 26-34, 4: 35-49, 5: 50-64, 6: 65+
table(NSDUH_2022$UM_agecat, useNA = "always")
##
## 1 2 3 4 5 6 <NA>
## 11969 14307 9645 12588 5369 5191 0
NSDUH_2022 <- NSDUH_2022[!(NSDUH_2022$UM_agecat == 1),] # exclude non-adults for this analysis
# Sex: 1: Male, 2: Female
table(NSDUH_2022$irsex, useNA = "always")
##
## 1 2 <NA>
## 20852 26248 0
# Filter out rows with missing value
NSDUH_2022 = NSDUH_2022[!is.na(NSDUH_2022$UM_agecat),] # age
NSDUH_2022 = NSDUH_2022[!is.na(NSDUH_2022$irsex),] # sex
# generate a combination of tobacco product use status for cigarettes, cigars, and e-cigarettes
# creates a vector that contains all possible combinations
cig_cigar_ecig_name=paste0(rep(c("Never","Former","Current"),each=9),"/",
rep(c("Never","Former","Current"),each=3),"/",
rep(c("Never","Former","Current")))
# loop through all combinations and assign values:
NSDUH_2022$UM_cig_cigar_ecig_27stat=NA
m=1
for(i in c("never","former","current")){
for(j in c("never","former","current")){
for(k in c("never","former","current")){
NSDUH_2022$UM_cig_cigar_ecig_27stat[NSDUH_2022$UM_smkstat %in% i & NSDUH_2022$UM_cigarstat %in% j & NSDUH_2022$UM_ecigstat %in% k]=cig_cigar_ecig_name[m]
m=m+1
}
}
}
# for example, the loop will generate and assign the combination "Current/Former/Never" to the UM_cig_cigar_ecig_27stat column for observations where cigarettes are "Current", cigars are "Former", and e-cigarettes are "Never".
NSDUH_2022$UM_cig_cigar_ecig_27stat=factor(NSDUH_2022$UM_cig_cigar_ecig_27stat,levels=unique(cig_cigar_ecig_name))
table(NSDUH_2022$UM_cig_cigar_ecig_27stat,useNA = "always")
##
## Never/Never/Never Never/Never/Former Never/Never/Current
## 18064 1803 694
## Never/Former/Never Never/Former/Former Never/Former/Current
## 1513 424 137
## Never/Current/Never Never/Current/Former Never/Current/Current
## 183 124 70
## Former/Never/Never Former/Never/Former Former/Never/Current
## 5388 1667 1322
## Former/Former/Never Former/Former/Former Former/Former/Current
## 4434 2174 1259
## Former/Current/Never Former/Current/Former Former/Current/Current
## 279 250 205
## Current/Never/Never Current/Never/Former Current/Never/Current
## 1596 962 856
## Current/Former/Never Current/Former/Former Current/Former/Current
## 556 1216 870
## Current/Current/Never Current/Current/Former Current/Current/Current
## 222 322 366
## <NA>
## 144
NSDUH_2022=NSDUH_2022[!is.na(NSDUH_2022$UM_cig_cigar_ecig_27stat),]
# set options for survey analysis
options(survey.lonely.psu = "adjust") # adjusts for lonely primary sampling units (PSUs) by redistributing weights
options(warn = -1) # suppress warnings
options(digits = 20) # set 20 digits to display for numerical output
# Create a survey design object using the specified survey design parameters
svy_design <- svydesign(
id = ~verep, # primary sampling unit (PSU) variable
strata = ~VESTR_C, # variance stratum variable
weights = ~ANALWT2_C, # person level sampling weights
data = NSDUH_2022, # your data set
nest = TRUE # specify that the strata and PSUs are nested
)
# calculate overall survey metrics by grouping the data by tobacco use status combinations
# compute weighted counts, percentages with 95% confidence intervals, and unweighted counts
# add overall indicators for sex and age
metrics_all = as_survey(svy_design) %>% # convert the survey design object to a survey design object
# group by tobacco use status combinations, and ensures that all levels are included
group_by(UM_cig_cigar_ecig_27stat, .drop = FALSE) %>%
# calculate the outputs for each group
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se","ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
# indicate that the metrics are for which group, here for the overall population
mutate(sex = "Overall",age = "Overall") %>% ungroup()
metrics_all
## # A tibble: 27 × 10
## UM_cig_cigar_ecig_27stat `weighted count` `weighted count_se`
## <fct> <dbl> <dbl>
## 1 Never/Never/Never 93247611. 2316513.
## 2 Never/Never/Former 5344267. 266282.
## 3 Never/Never/Current 1864098. 155029.
## 4 Never/Former/Never 8696002. 437408.
## 5 Never/Former/Former 1337608. 84561.
## 6 Never/Former/Current 427498. 48556.
## 7 Never/Current/Never 775007. 128852.
## 8 Never/Current/Former 470227. 71045.
## 9 Never/Current/Current 205011. 43686.
## 10 Former/Never/Never 39690739. 904709.
## # ℹ 17 more rows
## # ℹ 7 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>, sex <chr>, age <chr>
# calculation is similar to the overall survey metrics
# but it additionally groups the data by age categories, providing outputs for each age group within each tobacco use status combination
metrics_by_age = as_survey(svy_design) %>%
group_by(UM_agecat, UM_cig_cigar_ecig_27stat, .drop = FALSE) %>%
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
mutate(sex = "Overall") %>% ungroup()
metrics_by_age <- rename(metrics_by_age, age = UM_agecat)
metrics_by_age
## # A tibble: 135 × 10
## age UM_cig_cigar_ecig_27stat `weighted count` `weighted count_se`
## <int> <fct> <dbl> <dbl>
## 1 2 Never/Never/Never 15412150. 486915.
## 2 2 Never/Never/Former 3790277. 187332.
## 3 2 Never/Never/Current 1388855. 127701.
## 4 2 Never/Former/Never 515225. 54784.
## 5 2 Never/Former/Former 691484. 52346.
## 6 2 Never/Former/Current 263985. 41478.
## 7 2 Never/Current/Never 85657. 16462.
## 8 2 Never/Current/Former 199776. 32278.
## 9 2 Never/Current/Current 144875. 29943.
## 10 2 Former/Never/Never 809773. 65073.
## # ℹ 125 more rows
## # ℹ 6 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>, sex <chr>
# calculation is similar to the overall survey metrics
# but it additionally groups the data by age and sex, providing outputs for each combination of sex, age, and tobacco use status within each tobacco use status combination
metrics_by_sex_age = as_survey(svy_design) %>%
group_by(irsex, UM_agecat, UM_cig_cigar_ecig_27stat, .drop = FALSE) %>%
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
ungroup()
metrics_by_sex_age <- rename(metrics_by_sex_age, sex = irsex)
metrics_by_sex_age <- rename(metrics_by_sex_age, age = UM_agecat)
metrics_by_sex_age
## # A tibble: 270 × 10
## sex age UM_cig_cigar_ecig_27stat `weighted count` `weighted count_se`
## <int> <int> <fct> <dbl> <dbl>
## 1 1 2 Never/Never/Never 7532616. 294270.
## 2 1 2 Never/Never/Former 1487542. 123413.
## 3 1 2 Never/Never/Current 593860. 88077.
## 4 1 2 Never/Former/Never 335326. 46714.
## 5 1 2 Never/Former/Former 445339. 51011.
## 6 1 2 Never/Former/Current 182022. 34459.
## 7 1 2 Never/Current/Never 54385. 13976.
## 8 1 2 Never/Current/Former 146846. 27080.
## 9 1 2 Never/Current/Current 98044. 22981.
## 10 1 2 Former/Never/Never 392555. 48034.
## # ℹ 260 more rows
## # ℹ 5 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>
# combine survey metrics into a data frame
NSDUH_2022_metrics<-rbind.data.frame(metrics_all,metrics_by_age,metrics_by_sex_age)
# select relevant columns and recode for readability
NSDUH_2022_metrics<-NSDUH_2022_metrics %>%
dplyr::select(UM_cig_cigar_ecig_27stat,age,sex,"weighted count","unweighted_count","weighted percent","weighted percent_se","weighted percent_low","weighted percent_upp") %>%
# filter to keep only rows where unweighted_count > 0
filter(unweighted_count>0)
NSDUH_2022_metrics<-rename(NSDUH_2022_metrics,cig_cigar_ecig=UM_cig_cigar_ecig_27stat)
NSDUH_2022_metrics<-rename(NSDUH_2022_metrics,"unweighted count"=unweighted_count)
# recode sex and age variables
NSDUH_2022_metrics$sex[NSDUH_2022_metrics$sex %in% 1]="Male"
NSDUH_2022_metrics$sex[NSDUH_2022_metrics$sex %in% 2]="Female"
# NSDUH_2022_metrics$age[NSDUH_2022_metrics$age %in% 1]="~17"
NSDUH_2022_metrics$age[NSDUH_2022_metrics$age %in% 2]="18-25"
NSDUH_2022_metrics$age[NSDUH_2022_metrics$age %in% 3]="26-34"
NSDUH_2022_metrics$age[NSDUH_2022_metrics$age %in% 4]="35-49"
NSDUH_2022_metrics$age[NSDUH_2022_metrics$age %in% 5]="50-64"
NSDUH_2022_metrics$age[NSDUH_2022_metrics$age %in% 6]="65+"
# export the processed data
write.csv(NSDUH_2022_metrics,"NSDUH_2022_metrics.csv",row.names = T)
Monitoring The Future (MTF)
# remove all the objects present in the workspace
rm(list=ls())
# load the data
# this sample data is a subset of MTF 2022
MTF_2022 <- read.csv("MTF_2022_data.csv")
# tobacco product use status
## cigarette
## current: ever smoked cigarettes and at least 1 day in the past 30 days
## former: ever used but did not smoke at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(MTF_2022$cig_st, useNA = "always")
##
## current former missing never <NA>
## 591 2576 1648 26623 0
## e-cigarette
## current: ever used and vaped at least 1 day in the past 30 days
## former: ever used but did not vape at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(MTF_2022$ecig_st, useNA = "always")
##
## current former missing never <NA>
## 4011 4040 2186 21201 0
# Sex: Male, Female
table(MTF_2022$sex, useNA = "always")
##
## Female Male Unknown <NA>
## 13188 14214 4036 0
# grade
table(MTF_2022$grade, useNA = "always")
##
## 8 10 12 <NA>
## 9889 11950 9599 0
# Filter out rows with missing value
table(MTF_2022$sex, useNA="always") # sex
##
## Female Male Unknown <NA>
## 13188 14214 4036 0
MTF_2022=MTF_2022[MTF_2022$sex!="Unknown",]
table(MTF_2022$grade, useNA="always") #grade
##
## 8 10 12 <NA>
## 8616 10311 8475 0
MTF_2022=MTF_2022[!is.na(MTF_2022$grade),]
# generate a combination of tobacco product use status for cigarettes and e-cigarettes
# creates a vector that contains all possible combinations
cig_ecig_name=paste0(rep(c("Never","Former","Current"),each=3),"/",c("Never","Former","Current"))
# loop through all combinations and assign values:
MTF_2022$MTF_9stat=NA
k=1
for(i in c("never","former","current")){
for(j in c("never","former","current")){
MTF_2022$MTF_9stat[MTF_2022$cig_st %in% i & MTF_2022$ecig_st %in% j]=cig_ecig_name[k]
k=k+1
}
}
# for example, the loop will generate and assign the combination "Current/Former" to the MTF_9stat column for observations where cigarettes are "Current" and e-cigarettes are "Former".
MTF_2022$MTF_9stat=factor(MTF_2022$MTF_9stat, levels=unique(cig_ecig_name))
MTF_2022=MTF_2022[!is.na(MTF_2022$MTF_9stat),]
table(MTF_2022$MTF_9stat,useNA = "always")
##
## Never/Never Never/Former Never/Current Former/Never Former/Former
## 19376 3017 2134 407 682
## Former/Current Current/Never Current/Former Current/Current <NA>
## 1168 83 50 337 0
# set options for survey analysis
options(survey.lonely.psu = "adjust") # adjusts for lonely primary sampling units (PSUs) by redistributing weights
options(warn = -1) # suppress warnings
options(digits = 20) # set 20 digits to display for numerical output
# Create a survey design object using the specified survey design parameters
svy_design <- svydesign(
id = ~psu, # primary sampling unit (PSU) variable
strata = ~strata, # variance stratum variable
weights = ~weight, # person level sampling weights
data = MTF_2022, # your data set
nest = TRUE # specify that the strata and PSUs are nested
)
# calculate overall survey metrics by grouping the data by tobacco use status combinations
# compute weighted counts, percentages with 95% confidence intervals, and unweighted counts
# add overall indicators for sex and age
metrics_all = as_survey(svy_design) %>% # convert the survey design object to a survey design object
# group by tobacco use status combinations, and ensures that all levels are included
group_by(MTF_9stat, .drop = FALSE) %>%
# calculate the outputs for each group
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se","ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
# indicate that the metrics are for which group, here for the overall population
mutate(sex = "Overall",grade = "Overall") %>% ungroup()
metrics_all
## # A tibble: 9 × 10
## MTF_9stat `weighted count` `weighted count_se` `weighted percent`
## <fct> <dbl> <dbl> <dbl>
## 1 Never/Never 19317. 3119. 0.704
## 2 Never/Former 3096. 495. 0.113
## 3 Never/Current 2178. 327. 0.0793
## 4 Former/Never 429. 90.2 0.0156
## 5 Former/Former 687. 145. 0.0250
## 6 Former/Current 1224. 215. 0.0446
## 7 Current/Never 77.2 21.0 0.00281
## 8 Current/Former 52.6 13.5 0.00191
## 9 Current/Current 394. 103. 0.0143
## # ℹ 6 more variables: `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>, sex <chr>, grade <chr>
# calculation is similar to the overall survey metrics
# but it additionally groups the data by sex categories, providing outputs for each sex group within each tobacco use status combination
metrics_by_sex = as_survey(svy_design) %>%
group_by(sex, MTF_9stat, .drop = FALSE) %>%
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
mutate(grade = "Overall") %>% ungroup()
metrics_by_sex <- rename(metrics_by_sex, sex = sex) # rename column names if needed
metrics_by_sex
## # A tibble: 18 × 10
## sex MTF_9stat `weighted count` `weighted count_se` `weighted percent`
## <chr> <fct> <dbl> <dbl> <dbl>
## 1 Female Never/Never 8903. 1427. 0.674
## 2 Female Never/Former 1721. 310. 0.130
## 3 Female Never/Current 1301. 209. 0.0986
## 4 Female Former/Never 160. 32.2 0.0122
## 5 Female Former/Former 324. 61.6 0.0245
## 6 Female Former/Current 589. 107. 0.0446
## 7 Female Current/Never 18.8 6.45 0.00142
## 8 Female Current/Former 20.7 5.73 0.00157
## 9 Female Current/Curre… 164. 27.2 0.0124
## 10 Male Never/Never 10413. 1699. 0.731
## 11 Male Never/Former 1375. 190. 0.0965
## 12 Male Never/Current 876. 122. 0.0615
## 13 Male Former/Never 269. 60.2 0.0189
## 14 Male Former/Former 363. 84.8 0.0255
## 15 Male Former/Current 635. 117. 0.0446
## 16 Male Current/Never 58.4 16.9 0.00410
## 17 Male Current/Former 31.8 10.6 0.00223
## 18 Male Current/Curre… 230. 77.7 0.0161
## # ℹ 5 more variables: `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>, grade <chr>
# calculation is similar to the overall survey metrics
# but it additionally groups the data by grade and sex, providing outputs for each combination of sex, grade, and tobacco use status within each tobacco use status combination
metrics_by_sex_grade = as_survey(svy_design) %>%
group_by(sex, grade, MTF_9stat, .drop = FALSE) %>%
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
ungroup()
metrics_by_sex_grade <- rename(metrics_by_sex_grade, sex = sex) # rename column names if needed
metrics_by_sex_grade <- rename(metrics_by_sex_grade, grade = grade) # rename column names if needed
metrics_by_sex_grade
## # A tibble: 54 × 10
## sex grade MTF_9stat `weighted count` `weighted count_se` `weighted percent`
## <chr> <int> <fct> <dbl> <dbl> <dbl>
## 1 Fema… 8 Never/Ne… 3299. 572. 0.798
## 2 Fema… 8 Never/Fo… 384. 118. 0.0929
## 3 Fema… 8 Never/Cu… 222. 80.1 0.0536
## 4 Fema… 8 Former/N… 43.4 10.1 0.0105
## 5 Fema… 8 Former/F… 64.2 20.6 0.0155
## 6 Fema… 8 Former/C… 97.4 31.4 0.0236
## 7 Fema… 8 Current/… 0.716 0.716 0.000173
## 8 Fema… 8 Current/… 5.13 2.31 0.00124
## 9 Fema… 8 Current/… 18.0 8.37 0.00434
## 10 Fema… 10 Never/Ne… 3226. 559. 0.661
## # ℹ 44 more rows
## # ℹ 4 more variables: `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>
# combine survey metrics into a data frame
MTF_2022_metrics<-rbind.data.frame(metrics_all,metrics_by_sex,metrics_by_sex_grade)
# select relevant columns and recode for readability
MTF_2022_metrics<-MTF_2022_metrics %>%
dplyr::select(MTF_9stat,grade,sex,"weighted count","unweighted_count","weighted percent","weighted percent_se","weighted percent_low","weighted percent_upp") %>%
# filter to keep only rows where unweighted_count > 0
filter(unweighted_count>0)
MTF_2022_metrics<-rename(MTF_2022_metrics,cig_ecig=MTF_9stat)
MTF_2022_metrics<-rename(MTF_2022_metrics,"unweighted count"=unweighted_count)
# recode sex and grade variables if needed
# export the processed data
write.csv(MTF_2022_metrics,"MTF_2022_metrics.csv",row.names = T)
National Youth Tobacco Survey (NYTS)
# remove all the objects present in the workspace
rm(list=ls())
# load the data
# this sample data is a subset of NYTS 2023
NYTS_2023 <- read.csv("NYTS_2023_data.csv")
# tobacco product use status
## cigarette
## current: smoked 100+ cigarettes and at least 1 day in the past 30 days
## former: not never nor current
## never: never used or smoked less than 100+ cigarettes
## missing: missing data
table(NYTS_2023$cigt, useNA = "always")
##
## Current Former Never <NA>
## 80 42 20006 0
## e-cigarette
## current: ever used and vaped at least 1 day in the past 30 days
## former: not never nor current
## never: never used
## missing: missing data
table(NYTS_2023$ecig, useNA = "always")
##
## Current Former Never <NA>
## 1382 1631 17115 0
# Age: 12-14, 15-17, 18+
table(NYTS_2023$age_cat, useNA = "always")
##
## 12-14 15-17 18+ <NA>
## 10517 8506 1105 0
# Sex: Male, Female
table(NYTS_2023$um_sex_lab, useNA = "always")
##
## Female Male <NA>
## 9859 10269 0
# Filter out rows with missing value
NYTS_2023 = NYTS_2023[!is.na(NYTS_2023$age_cat),] # age
NYTS_2023 = NYTS_2023[!is.na(NYTS_2023$um_sex_lab),] # sex
# generate a combination of tobacco product use status for cigarettes and e-cigarettes
# creates a vector that contains all possible combinations
cig_ecig_name=paste0(rep(c("Never","Former","Current"),each=3),"/",c("Never","Former","Current"))
# loop through all combinations and assign values:
NYTS_2023$NYTS_9stat=NA
k=1
for(i in c("Never","Former","Current")){
for(j in c("Never","Former","Current")){
NYTS_2023$NYTS_9stat[NYTS_2023$cigt %in% i & NYTS_2023$ecig %in% j]=cig_ecig_name[k]
k=k+1
}
}
# for example, the loop will generate and assign the combination "Current/Former" to the NYTS_9stat column for observations where cigarettes are "Current" and e-cigarettes are "Former".
NYTS_2023$NYTS_9stat=factor(NYTS_2023$NYTS_9stat, levels=unique(cig_ecig_name))
NYTS_2023=NYTS_2023[!is.na(NYTS_2023$NYTS_9stat),]
table(NYTS_2023$NYTS_9stat,useNA = "always")
##
## Never/Never Never/Former Never/Current Former/Never Former/Former
## 17092 1620 1294 6 9
## Former/Current Current/Never Current/Former Current/Current <NA>
## 27 17 2 61 0
# set options for survey analysis
options(survey.lonely.psu = "adjust") # adjusts for lonely primary sampling units (PSUs) by redistributing weights
options(warn = -1) # suppress warnings
options(digits = 20) # set 20 digits to display for numerical output
# Create a survey design object using the specified survey design parameters
svy_design <- svydesign(
id = ~id, # primary sampling unit (PSU) variable
PSU = ~psu, # primary sampling unit (PSU)
strata = ~strata, # variance stratum variable
weights = ~weights, # person-level sampling weights
data = NYTS_2023, # your data set
nest = TRUE # specify that the strata and PSUs are nested
)
# calculate overall survey metrics by grouping the data by tobacco use status combinations
# compute weighted counts, percentages with 95% confidence intervals, and unweighted counts
# add overall indicators for sex and age
metrics_all = as_survey(svy_design) %>% # convert the survey design object to a survey design object
# group by tobacco use status combinations, and ensures that all levels are included
group_by(NYTS_9stat, .drop = FALSE) %>%
# calculate the outputs for each group
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se","ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
# indicate that the metrics are for which group, here for the overall population
mutate(sex = "Overall",age = "Overall") %>% ungroup()
metrics_all
## # A tibble: 9 × 10
## NYTS_9stat `weighted count` `weighted count_se` `weighted percent`
## <fct> <dbl> <dbl> <dbl>
## 1 Never/Never 21811551. 280869. 0.836
## 2 Never/Former 2320582. 111346. 0.0889
## 3 Never/Current 1809811. 105831. 0.0694
## 4 Former/Never 3467. 1857. 0.000133
## 5 Former/Former 10922. 4821. 0.000419
## 6 Former/Current 35906. 11027. 0.00138
## 7 Current/Never 32199. 12233. 0.00123
## 8 Current/Former 3330. 2587. 0.000128
## 9 Current/Current 60935. 11953. 0.00234
## # ℹ 6 more variables: `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>, sex <chr>, age <chr>
# calculation is similar to the overall survey metrics
# but it additionally groups the data by age categories, providing outputs for each age group within each tobacco use status combination
metrics_by_age = as_survey(svy_design) %>%
group_by(age_cat, NYTS_9stat, .drop = FALSE) %>%
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
mutate(sex = "Overall") %>% ungroup()
metrics_by_age <- rename(metrics_by_age, age = age_cat)
metrics_by_age
## # A tibble: 27 × 10
## age NYTS_9stat `weighted count` `weighted count_se` `weighted percent`
## <chr> <fct> <dbl> <dbl> <dbl>
## 1 12-14 Never/Never 10630294. 188545. 0.903
## 2 12-14 Never/Former 571979. 46266. 0.0486
## 3 12-14 Never/Current 523608. 42064. 0.0445
## 4 12-14 Former/Never 1774. 1250. 0.000151
## 5 12-14 Former/Former 1710. 1193. 0.000145
## 6 12-14 Former/Current 9183. 7727. 0.000780
## 7 12-14 Current/Never 16172. 9967. 0.00137
## 8 12-14 Current/Former 3330. 2587. 0.000283
## 9 12-14 Current/Current 16622. 7062. 0.00141
## 10 15-17 Never/Never 9439850. 216608. 0.783
## # ℹ 17 more rows
## # ℹ 5 more variables: `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>, sex <chr>
# calculation is similar to the overall survey metrics
# but it additionally groups the data by age and sex, providing outputs for each combination of sex, age, and tobacco use status within each tobacco use status combination
metrics_by_sex_age = as_survey(svy_design) %>%
group_by(um_sex_lab, age_cat, NYTS_9stat, .drop = FALSE) %>%
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
ungroup()
metrics_by_sex_age <- rename(metrics_by_sex_age, sex = um_sex_lab)
metrics_by_sex_age <- rename(metrics_by_sex_age, age = age_cat)
metrics_by_sex_age
## # A tibble: 54 × 10
## sex age NYTS_9stat `weighted count` `weighted count_se`
## <chr> <chr> <fct> <dbl> <dbl>
## 1 Female 12-14 Never/Never 5212285. 137153.
## 2 Female 12-14 Never/Former 303541. 35919.
## 3 Female 12-14 Never/Current 300967. 29767.
## 4 Female 12-14 Former/Never 0 0
## 5 Female 12-14 Former/Former 802. 775.
## 6 Female 12-14 Former/Current 1630. 1630.
## 7 Female 12-14 Current/Never 1646. 1122.
## 8 Female 12-14 Current/Former 0 0
## 9 Female 12-14 Current/Current 6258. 4948.
## 10 Female 15-17 Never/Never 4512586. 165435.
## # ℹ 44 more rows
## # ℹ 5 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>
# combine survey metrics into a data frame
NYTS_2023_metrics<-rbind.data.frame(metrics_all,metrics_by_age,metrics_by_sex_age)
# select relevant columns and recode for readability
NYTS_2023_metrics<-NYTS_2023_metrics %>%
dplyr::select(NYTS_9stat,age,sex,"weighted count","unweighted_count","weighted percent","weighted percent_se","weighted percent_low","weighted percent_upp") %>%
# filter to keep only rows where unweighted_count > 0
filter(unweighted_count>0)
NYTS_2023_metrics<-rename(NYTS_2023_metrics,cig_ecig=NYTS_9stat)
NYTS_2023_metrics<-rename(NYTS_2023_metrics,"unweighted count"=unweighted_count)
# recode sex and age variables if needed
# export the processed data
write.csv(NYTS_2023_metrics,"NYTS_2023_metrics.csv",row.names = T)
The Tobacco Use Supplement to the Current Population Survey (TUS-CPS)
rm(list=ls()) # remove all the objects present in the workspace
# load the data
# this data set is a subset of TUSCPS 2022
TUSCPS_2022 <- read.csv("TUSCPS_2022_data.csv")
# tobacco product use status
## cigarette
## 1: current: smoked 100+ cigarettes and at least 1 day in the past 30 days
## 2: former: ever used but did not smoke at least 1 day in the past 30 days
## 3: never: never used
table(TUSCPS_2022$cig_status, useNA = "always")
##
## current former missing never <NA>
## 87 83 3 249 0
## e-cigarette
## 1: current: ever used and vaped at least 1 day in the past 30 days
## 2: former: ever used but did not vape at least 1 day in the past 30 days
## 3: never: never used
table(TUSCPS_2022$ecig_status, useNA = "always")
##
## current former missing never <NA>
## 14 50 10 348 0
# Age: 1: 18-24, 2: 25-34, 3: 35-44, 4: 45-54, 5: 55-64, 6:65+
table(TUSCPS_2022$Age, useNA = "always")
##
## 1 2 3 4 5 6 <NA>
## 25 90 79 71 60 97 0
# Sex:1: Male, 2: Female
table(TUSCPS_2022$Sex, useNA = "always")
##
## 1 2 <NA>
## 166 256 0
# Filter out rows with missing value
TUSCPS_2022 = TUSCPS_2022[!is.na(TUSCPS_2022$Age),] # age
TUSCPS_2022 = TUSCPS_2022[!is.na(TUSCPS_2022$Sex),] # sex
# generate a combination of tobacco product use status for cigarettes and e-cigarettes
# creates a vector that contains all possible combinations
cig_ecig_name=paste0(rep(c("Never","Former","Current"),each=3),"/",c("Never","Former","Current"))
# loop through all combinations and assign values:
TUSCPS_2022$TUSCPS_9stat=NA
k=1
for(i in c("never","former","current")){
for(j in c("never","former","current")){
TUSCPS_2022$TUSCPS_9stat[TUSCPS_2022$cig_status %in% i & TUSCPS_2022$ecig_status %in% j]=cig_ecig_name[k]
k=k+1
}
}
# for example, the loop will generate and assign the combination "Current/Former" to the TUSCPS_9stat column for observations where cigarettes are "Current" and e-cigarettes are "Former"
TUSCPS_2022$TUSCPS_9stat=factor(TUSCPS_2022$TUSCPS_9stat, levels=unique(cig_ecig_name))
TUSCPS_2022 = TUSCPS_2022[!is.na(TUSCPS_2022$TUSCPS_9stat),]
table(TUSCPS_2022$TUSCPS_9stat,useNA = "always")
##
## Never/Never Never/Former Never/Current Former/Never Former/Former
## 231 13 1 65 13
## Former/Current Current/Never Current/Former Current/Current <NA>
## 3 51 24 10 0
# set options for survey analysis
options(survey.replicates.mse = TRUE) # compute the mean square error for replicate weights
options(warn = -1) # suppress warnings
options(digits = 20) # set 20 digits to display for numerical output
# create a survey design object using replicate weights
svy_design = svydesign(
id = ~1, # primary sampling unit (PSU) variable; 1 means no clustering, and each row is treated as an independent unit
weights = ~PWSRWGT, # sampling weight
data = TUSCPS_2022 # your data set
)
# calculate overall survey metrics by grouping the data by tobacco use status combinations
# compute weighted counts, percentages with 95% confidence intervals, and unweighted counts
# add overall indicators for sex and age
metrics_all = as_survey_rep(svy_design) %>% # use as_survey_rep for replicate weights
# group by tobacco use status combinations, and ensures that all levels are included
group_by(TUSCPS_9stat, .drop = FALSE) %>%
# calculate the outputs for each group
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se","ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
# indicate that the metrics are for which group, here for the overall population
mutate(sex = "Overall",age = "Overall") %>% ungroup()
metrics_all
## # A tibble: 9 × 10
## TUSCPS_9stat `weighted count` `weighted count_se` `weighted percent`
## <fct> <dbl> <dbl> <dbl>
## 1 Never/Never 1486492. 108540. 0.574
## 2 Never/Former 88446. 31291. 0.0341
## 3 Never/Current 14738. 14738. 0.00569
## 4 Former/Never 376635. 57181. 0.145
## 5 Former/Former 70903. 24565. 0.0274
## 6 Former/Current 38429. 25465. 0.0148
## 7 Current/Never 257866. 44528. 0.0995
## 8 Current/Former 165959. 38210. 0.0640
## 9 Current/Current 92278. 41550. 0.0356
## # ℹ 6 more variables: `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>, sex <chr>, age <chr>
# calculation is similar to the overall survey metrics
# but it additionally groups the data by age categories, providing outputs for each age group within each tobacco use status combination
metrics_by_age = as_survey_rep(svy_design) %>%
group_by(Age, TUSCPS_9stat, .drop = FALSE) %>%
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
mutate(sex = "Overall") %>% ungroup()
metrics_by_age <- rename(metrics_by_age, age = Age)
metrics_by_age
## # A tibble: 54 × 10
## age TUSCPS_9stat `weighted count` `weighted count_se` `weighted percent`
## <int> <fct> <dbl> <dbl> <dbl>
## 1 1 Never/Never 117605. 42322. 0.507
## 2 1 Never/Former 20664. 15332. 0.0891
## 3 1 Never/Current 14738. 14738. 0.0635
## 4 1 Former/Never 7872. 5629. 0.0339
## 5 1 Former/Former 1897. 1897. 0.00818
## 6 1 Former/Current 0 0 0
## 7 1 Current/Never 29331. 17660. 0.126
## 8 1 Current/Former 9962. 9962. 0.0430
## 9 1 Current/Current 29856. 28482. 0.129
## 10 2 Never/Never 408645. 69467. 0.636
## # ℹ 44 more rows
## # ℹ 5 more variables: `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>, sex <chr>
# calculation is similar to the overall survey metrics
# but it additionally groups the data by age and sex, providing outputs for each combination of sex, age, and tobacco use status within each tobacco use status combination
metrics_by_sex_age = as_survey_rep(svy_design) %>%
group_by(Sex, Age, TUSCPS_9stat, .drop = FALSE) %>%
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
ungroup()
metrics_by_sex_age <- rename(metrics_by_sex_age, sex = Sex)
metrics_by_sex_age <- rename(metrics_by_sex_age, age = Age)
metrics_by_sex_age
## # A tibble: 108 × 10
## sex age TUSCPS_9stat `weighted count` `weighted count_se`
## <int> <int> <fct> <dbl> <dbl>
## 1 1 1 Never/Never 85087. 38085.
## 2 1 1 Never/Former 0 0
## 3 1 1 Never/Current 0 0
## 4 1 1 Former/Never 4557. 4557.
## 5 1 1 Former/Former 0 0
## 6 1 1 Former/Current 0 0
## 7 1 1 Current/Never 5580. 5580.
## 8 1 1 Current/Former 9962. 9962.
## 9 1 1 Current/Current 28451. 28451.
## 10 1 2 Never/Never 162327. 51167.
## # ℹ 98 more rows
## # ℹ 5 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>
# combine survey metrics into a data frame
data <- rbind.data.frame(metrics_all, metrics_by_age, metrics_by_sex_age)
# select relevant columns and recode for readability
TUSCPS_2022_metrics <- data %>%
dplyr::select(TUSCPS_9stat, age, sex,
"weighted count", "unweighted_count", "weighted percent",
"weighted percent_se", "weighted percent_low", "weighted percent_upp") %>%
# filters out rows with zero unweighted counts
filter(unweighted_count > 0)
# rename columns
TUSCPS_2022_metrics <- rename(TUSCPS_2022_metrics, cig_ecig = TUSCPS_9stat)
TUSCPS_2022_metrics <- rename(TUSCPS_2022_metrics, "unweighted count" = unweighted_count)
# recode sex and age variables
TUSCPS_2022_metrics$sex[TUSCPS_2022_metrics$sex %in% 2] = "Female"
TUSCPS_2022_metrics$sex[TUSCPS_2022_metrics$sex %in% 1] = "Male"
TUSCPS_2022_metrics$age[TUSCPS_2022_metrics$age %in% 1] = "18-24"
TUSCPS_2022_metrics$age[TUSCPS_2022_metrics$age %in% 2] = "25-34"
TUSCPS_2022_metrics$age[TUSCPS_2022_metrics$age %in% 3] = "35-44"
TUSCPS_2022_metrics$age[TUSCPS_2022_metrics$age %in% 4] = "45-54"
TUSCPS_2022_metrics$age[TUSCPS_2022_metrics$age %in% 5] = "55-64"
TUSCPS_2022_metrics$age[TUSCPS_2022_metrics$age %in% 6] = "65+"
# export the processed data
write.csv(TUSCPS_2022_metrics,"TUSCPS_2022_metrics.csv",row.names = F)
Youth Risk Behavior Surveillance System
(YRBSS)
Youth Risk Behavior Survey (YRBS)
# remove all the objects present in the workspace
rm(list=ls())
# load the data
# this sample data is a subset of YRBS 2021
YRBS_2021 <- read.csv("YRBS_2021_data.csv")
# tobacco product use status
## cigarette
## current: ever smoked cigarettes and at least 1 day in the past 30 days
## former: ever used but did not smoke at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(YRBS_2021$cig_st, useNA = "always")
##
## current former missing never <NA>
## 646 1735 3536 11156 0
## e-cigarette
## current: ever used and vaped at least 1 day in the past 30 days
## former: ever used but did not vape at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(YRBS_2021$ecig_st, useNA = "always")
##
## current former missing never <NA>
## 2892 2262 1243 10676 0
# Age: ~12, 13, 14, 15, 16, 17, 18+, Missing
table(YRBS_2021$Q1, useNA = "always")
##
## 12 years old or younger 13 years old 14 years old
## 29 61 3391
## 15 years old 16 years old 17 years old
## 4410 4257 3894
## 18 years old or older Missing <NA>
## 1011 20 0
# Sex: Male, Female, Missing
table(YRBS_2021$sex, useNA = "always")
##
## Female Male Missing <NA>
## 8128 8765 180 0
# Filter out rows with missing value
YRBS_2021 = YRBS_2021[YRBS_2021$Q1 != "Missing",] # age
YRBS_2021 = YRBS_2021[!is.na(YRBS_2021$Q1),]
table(YRBS_2021$Q1, useNA="always")
##
## 12 years old or younger 13 years old 14 years old
## 29 61 3391
## 15 years old 16 years old 17 years old
## 4410 4257 3894
## 18 years old or older <NA>
## 1011 0
YRBS_2021 = YRBS_2021[YRBS_2021$sex != "Missing",] # sex
YRBS_2021 = YRBS_2021[!is.na(YRBS_2021$sex),]
table(YRBS_2021$sex, useNA="always")
##
## Female Male <NA>
## 8121 8755 0
# generate a combination of tobacco product use status for cigarettes and e-cigarettes
# creates a vector that contains all possible combinations
cig_ecig_name=paste0(rep(c("Never","Former","Current"),each=3),"/",c("Never","Former","Current"))
# loop through all combinations and assign values:
YRBS_2021$YRBS_9stat=NA
k=1
for(i in c("never","former","current")){
for(j in c("never","former","current")){
YRBS_2021$YRBS_9stat[YRBS_2021$cig_st %in% i & YRBS_2021$ecig_st %in% j]=cig_ecig_name[k]
k=k+1
}
}
# for example, the loop will generate and assign the combination "Current/Former" to the YRBS_9stat column for observations where cigarettes are "Current" and e-cigarettes are "Former".
YRBS_2021$YRBS_9stat=factor(YRBS_2021$YRBS_9stat, levels=unique(cig_ecig_name))
YRBS_2021=YRBS_2021[!is.na(YRBS_2021$YRBS_9stat),]
table(YRBS_2021$YRBS_9stat,useNA = "always")
##
## Never/Never Never/Former Never/Current Former/Never Former/Former
## 8329 1203 956 246 511
## Former/Current Current/Never Current/Former Current/Current <NA>
## 754 28 25 538 0
# set options for survey analysis
options(survey.lonely.psu = "adjust") # adjusts for lonely primary sampling units (PSUs) by redistributing weights
options(warn = -1) # suppress warnings
options(digits = 20) # set 20 digits to display for numerical output
# Create a survey design object using the specified survey design parameters
svy_design <- svydesign(
id = ~PSU, # primary sampling unit (PSU) variable
strata = ~STRATUM, # variance stratum variable
weights = ~WEIGHT, # sampling weights
data = YRBS_2021, # your data set
nest = TRUE # specify that the strata and PSUs are nested
)
# calculate overall survey metrics by grouping the data by tobacco use status combinations
# compute weighted counts, percentages with 95% confidence intervals, and unweighted counts
# add overall indicators for sex and age
metrics_all = as_survey(svy_design) %>% # convert the survey design object to a survey design object
# group by tobacco use status combinations, and ensures that all levels are included
group_by(YRBS_9stat, .drop = FALSE) %>%
# calculate the outputs for each group
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se","ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
# indicate that the metrics are for which group, here for the overall population
mutate(sex = "Overall",age = "Overall") %>% ungroup()
metrics_all
## # A tibble: 9 × 10
## YRBS_9stat `weighted count` `weighted count_se` `weighted percent`
## <fct> <dbl> <dbl> <dbl>
## 1 Never/Never 8931. 745. 0.658
## 2 Never/Former 1335. 128. 0.0983
## 3 Never/Current 1118. 118. 0.0823
## 4 Former/Never 234. 32.8 0.0172
## 5 Former/Former 546. 42.6 0.0402
## 6 Former/Current 828. 79.0 0.0610
## 7 Current/Never 27.0 8.50 0.00199
## 8 Current/Former 34.3 7.10 0.00252
## 9 Current/Current 528. 58.6 0.0389
## # ℹ 6 more variables: `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>, sex <chr>, age <chr>
# calculation is similar to the overall survey metrics
# but it additionally groups the data by age categories, providing outputs for each age group within each tobacco use status combination
metrics_by_age = as_survey(svy_design) %>%
group_by(Q1, YRBS_9stat, .drop = FALSE) %>%
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
mutate(sex = "Overall") %>% ungroup()
metrics_by_age <- rename(metrics_by_age, age = Q1)
metrics_by_age
## # A tibble: 63 × 10
## age YRBS_9stat `weighted count` `weighted count_se` `weighted percent`
## <chr> <fct> <dbl> <dbl> <dbl>
## 1 12 years … Never/Nev… 2.87 1.68 0.553
## 2 12 years … Never/For… 0 0 0
## 3 12 years … Never/Cur… 0 0 0
## 4 12 years … Former/Ne… 0 0 0
## 5 12 years … Former/Fo… 0.410 0.410 0.0790
## 6 12 years … Former/Cu… 0 0 0
## 7 12 years … Current/N… 0 0 0
## 8 12 years … Current/F… 0 0 0
## 9 12 years … Current/C… 1.91 1.13 0.368
## 10 13 years … Never/Nev… 36.5 8.16 0.815
## # ℹ 53 more rows
## # ℹ 5 more variables: `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>, sex <chr>
# calculation is similar to the overall survey metrics
# but it additionally groups the data by age and sex, providing outputs for each combination of sex, age, and tobacco use status within each tobacco use status combination
metrics_by_sex_age = as_survey(svy_design) %>%
group_by(sex, Q1, YRBS_9stat, .drop = FALSE) %>%
summarize("weighted count" = survey_total(),
"weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
"unweighted_count" = unweighted(n())) %>%
ungroup()
metrics_by_sex_age <- rename(metrics_by_sex_age, sex = sex)
metrics_by_sex_age <- rename(metrics_by_sex_age, age = Q1)
metrics_by_sex_age
## # A tibble: 126 × 10
## sex age YRBS_9stat `weighted count` `weighted count_se`
## <chr> <chr> <fct> <dbl> <dbl>
## 1 Female 12 years old or young… Never/Nev… 0 0
## 2 Female 12 years old or young… Never/For… 0 0
## 3 Female 12 years old or young… Never/Cur… 0 0
## 4 Female 12 years old or young… Former/Ne… 0 0
## 5 Female 12 years old or young… Former/Fo… 0.410 0.410
## 6 Female 12 years old or young… Former/Cu… 0 0
## 7 Female 12 years old or young… Current/N… 0 0
## 8 Female 12 years old or young… Current/F… 0 0
## 9 Female 12 years old or young… Current/C… 1.16 1.05
## 10 Female 13 years old Never/Nev… 23.5 6.68
## # ℹ 116 more rows
## # ℹ 5 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## # `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## # unweighted_count <int>
# combine survey metrics into a data frame
YRBS_2021_metrics<-rbind.data.frame(metrics_all,metrics_by_age,metrics_by_sex_age)
# select relevant columns and recode for readability
YRBS_2021_metrics<-YRBS_2021_metrics %>%
dplyr::select(YRBS_9stat,age,sex,"weighted count","unweighted_count","weighted percent","weighted percent_se","weighted percent_low","weighted percent_upp") %>%
# filter to keep only rows where unweighted_count > 0
filter(unweighted_count>0)
YRBS_2021_metrics<-rename(YRBS_2021_metrics,cig_ecig=YRBS_9stat)
YRBS_2021_metrics<-rename(YRBS_2021_metrics,"unweighted count"=unweighted_count)
# recode sex and age variables if needed
# export the processed data
write.csv(YRBS_2021_metrics,"YRBS_2021_metrics.csv",row.names = T)