This document processes and analyzes survey data. The analysis involves calculating various survey metrics, including weighted counts, weighted percentages with 95% confidence intervals, and unweighted counts.

Steps include:

  1. Prepare Data: Clean and filter the data and ensure it is ready for analysis.
  2. Calculate Survey Metrics: Set up a survey design object with weights and configure options for analysis. Compute weighted and unweighted metrics for tobacco use status combinations, or by other category variables.
  3. Export Data: Clean and format the data, and then export the processed data to a CSV file.

Each step is detailed with specific code snippets to achieve the desired analysis.


1 PATH

Population Assessment of Tobacco and Health (PATH) Study

1.1 Prepare data

# remove all the objects present in the workspace
rm(list=ls())    

# load the data
# this sample data is a subset of PATH wave1
PATH_wave1 <- read.csv("PATH_wave1_data.csv")

# tobacco product use status
## cigarette
## current: smoked 100+ cigarettes and at least 1 day in the past 30 days 
## former: ever used but did not smoke at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(PATH_wave1$UMp1_W1_A_smkstat_Reg_1past30, useNA = "always")
## 
## current  former missing   never    <NA> 
##   11208    5079     173   15849       0
## cigar (Non-premium)
## current: smoked 100+ cigars and at least 1 day in the past 30 days 
## former: ever used but did not smoke at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(PATH_wave1$UM_W1_A_cigarstat_1past30, useNA = "always")
## 
## current  former missing   never    <NA> 
##    1706    2002    1214   27387       0
## e-cigarette
## current: ever used and vaped at least 1 day in the past 30 days
## former: ever used but did not vape at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(PATH_wave1$UMp1_W1_A_ecigstat_Reg_1past30, useNA = "always")
## 
## current  former missing   never    <NA> 
##    1369     892      70   29978       0
# age: 1: 18-24, 2: 25-34, 3: 35-54, 4: 55+
table(PATH_wave1$UM_W1_A_agecat, useNA = "always")
## 
##    1    2    3    4 <NA> 
## 9110 6338 9778 7083    0
# sex: 0: Female, 1: Male
table(PATH_wave1$UM_W1_A_male_imp, useNA = "always")
## 
##     0     1  <NA> 
## 15989 16320     0
# Filter out rows with missing value
PATH_wave1 = PATH_wave1[!is.na(PATH_wave1$UM_W1_A_agecat),]    # age
PATH_wave1 = PATH_wave1[!is.na(PATH_wave1$UM_W1_A_male_imp),]  # sex
# generate a combination of tobacco product use status for cigarettes, cigars, and e-cigarettes
# creates a vector that contains all possible combinations
cig_cigar_ecig_name=paste0(rep(c("Never","Former","Current"),each=9),"/",
                           rep(c("Never","Former","Current"),each=3),"/",
                           rep(c("Never","Former","Current")))

# loop through all combinations and assign values:
PATH_wave1$UM_W1_cig_cigar1_ecig_27stat=NA
m=1
for(i in c("never","former","current")){
  for(j in c("never","former","current")){
    for(k in c("never","former","current")){
      PATH_wave1$UM_W1_cig_cigar1_ecig_27stat[PATH_wave1$UMp1_W1_A_smkstat_Reg_1past30 %in% i & PATH_wave1$UM_W1_A_cigarstat_1past30 %in% j & PATH_wave1$UMp1_W1_A_ecigstat_Reg_1past30 %in% k]=cig_cigar_ecig_name[m]
      m=m+1
    }
  }
}

# for example, the loop will generate and assign the combination "Current/Former/Never" to the UM_W1_cig_cigar1_ecig_27stat column in PATH_wave1 for observations where cigarettes are "Current", cigars are "Former", and e-cigarettes are "Never".
PATH_wave1$UM_W1_cig_cigar1_ecig_27stat=factor(PATH_wave1$UM_W1_cig_cigar1_ecig_27stat, levels=unique(cig_cigar_ecig_name))
table(PATH_wave1$UM_W1_cig_cigar1_ecig_27stat,useNA = "always")
## 
##       Never/Never/Never      Never/Never/Former     Never/Never/Current 
##                   13978                      71                     137 
##      Never/Former/Never     Never/Former/Former    Never/Former/Current 
##                     444                      25                       8 
##     Never/Current/Never    Never/Current/Former   Never/Current/Current 
##                     503                      14                      45 
##      Former/Never/Never     Former/Never/Former    Former/Never/Current 
##                    3694                     137                     278 
##     Former/Former/Never    Former/Former/Former   Former/Former/Current 
##                     476                      37                      46 
##    Former/Current/Never   Former/Current/Former  Former/Current/Current 
##                     221                      10                      17 
##     Current/Never/Never    Current/Never/Former   Current/Never/Current 
##                    7903                     406                     612 
##    Current/Former/Never   Current/Former/Former  Current/Former/Current 
##                     769                     104                      85 
##   Current/Current/Never  Current/Current/Former Current/Current/Current 
##                     723                      65                      99 
##                    <NA> 
##                    1402
PATH_wave1=PATH_wave1[!is.na(PATH_wave1$UM_W1_cig_cigar1_ecig_27stat),]

1.2 Calculate survey metrics

# set options for survey analysis
options(survey.replicates.mse = TRUE)   # compute the mean square error for replicate weights
options(warn = -1)                      # suppress warnings
options(digits = 20)                    # set 20 digits to display for numerical output

# create a survey design object using replicate weights
svy_design = svrepdesign(
  id = ~PERSONID,                       # primary sampling unit (PSU) variable
  weights = ~R01_A_PWGT,                # sampling weights
  repweights = "R01_A_PWGT[1-100]+",    # replicate weights for PATH wave 1
  type = "Fay",                         # replication method
  rho = 0.3,                            # set the rho parameter for Fay's method
  data = PATH_wave1                     # your data set
  )

1.2.1 Overall survey metrics

# calculate overall survey metrics by grouping the data by tobacco use status combinations
# compute weighted counts, percentages with 95% confidence intervals, and unweighted counts
# add overall indicators for sex and age

metrics_all = as_survey_rep(svy_design) %>%   # use as_survey_rep for replicate weights
  # group by tobacco use status combinations, and ensures that all levels are included
  group_by(UM_W1_cig_cigar1_ecig_27stat, .drop = FALSE) %>% 
  # calculate the outputs for each group
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se","ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  # indicate that the metrics are for which group, here for the overall population
  mutate(sex = "Overall",age = "Overall") %>% ungroup()

metrics_all
## # A tibble: 27 × 10
##    UM_W1_cig_cigar1_ecig_27stat `weighted count` `weighted count_se`
##    <fct>                                   <dbl>               <dbl>
##  1 Never/Never/Never                  132831175.            1367042.
##  2 Never/Never/Former                    218596.              24553.
##  3 Never/Never/Current                   453316.              45260.
##  4 Never/Former/Never                   1990235.             123820.
##  5 Never/Former/Former                    84903.              23102.
##  6 Never/Former/Current                   23038.              11337.
##  7 Never/Current/Never                  1707233.              91458.
##  8 Never/Current/Former                   37719.              13190.
##  9 Never/Current/Current                 146509.              24344.
## 10 Former/Never/Never                  39264002.            1197085.
## # ℹ 17 more rows
## # ℹ 7 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>, sex <chr>, age <chr>

1.2.2 Grouped by age

# calculation is similar to the overall survey metrics
# but it additionally groups the data by age categories, providing outputs for each age group within each tobacco use status combination

metrics_by_age = as_survey_rep(svy_design) %>%
  group_by(UM_W1_A_agecat, UM_W1_cig_cigar1_ecig_27stat, .drop = FALSE) %>% 
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  mutate(sex = "Overall") %>% ungroup()
metrics_by_age <- rename(metrics_by_age, age = UM_W1_A_agecat)

metrics_by_age
## # A tibble: 108 × 10
##      age UM_W1_cig_cigar1_ecig_27stat `weighted count` `weighted count_se`
##    <int> <fct>                                   <dbl>               <dbl>
##  1     1 Never/Never/Never                   21200441.             203775.
##  2     1 Never/Never/Former                    107800.              22259.
##  3     1 Never/Never/Current                   203842.              32095.
##  4     1 Never/Former/Never                    556847.              28795.
##  5     1 Never/Former/Former                    45419.              12391.
##  6     1 Never/Former/Current                   10372.               4674.
##  7     1 Never/Current/Never                   616245.              59252.
##  8     1 Never/Current/Former                   21998.               7329.
##  9     1 Never/Current/Current                  59352.              15874.
## 10     1 Former/Never/Never                    976824.              64343.
## # ℹ 98 more rows
## # ℹ 6 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>, sex <chr>

1.2.3 Grouped by age and sex

# calculation is similar to the overall survey metrics
# but it additionally groups the data by age and sex, providing outputs for each combination of sex, age, and tobacco use status within each tobacco use status combination

metrics_by_sex_age = as_survey_rep(svy_design) %>%
  group_by(UM_W1_A_male_imp, UM_W1_A_agecat, UM_W1_cig_cigar1_ecig_27stat, .drop = FALSE) %>% 
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  ungroup()
metrics_by_sex_age <- rename(metrics_by_sex_age, sex = UM_W1_A_male_imp)
metrics_by_sex_age <- rename(metrics_by_sex_age, age = UM_W1_A_agecat)

metrics_by_sex_age
## # A tibble: 216 × 10
##      sex   age UM_W1_cig_cigar1_ecig_27stat `weighted count` `weighted count_se`
##    <int> <int> <fct>                                   <dbl>               <dbl>
##  1     0     1 Never/Never/Never                   11312305.             121360.
##  2     0     1 Never/Never/Former                     48967.              11601.
##  3     0     1 Never/Never/Current                    80183.              15832.
##  4     0     1 Never/Former/Never                    224587.              19852.
##  5     0     1 Never/Former/Former                     8944.               6628.
##  6     0     1 Never/Former/Current                       0                   0 
##  7     0     1 Never/Current/Never                   194000.              34322.
##  8     0     1 Never/Current/Former                    9437.               5015.
##  9     0     1 Never/Current/Current                  19447.               7056.
## 10     0     1 Former/Never/Never                    530845.              40526.
## # ℹ 206 more rows
## # ℹ 5 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>

1.3 Process Data

# combine survey metrics into a data frame
data <- rbind.data.frame(metrics_all, metrics_by_age, metrics_by_sex_age)

# select relevant columns and recode for readability
PATH_wave1_metrics <- data %>%
  dplyr::select(UM_W1_cig_cigar1_ecig_27stat, age, sex, 
                "weighted count", "unweighted_count", "weighted percent", 
                "weighted percent_se", "weighted percent_low", "weighted percent_upp") %>%
  # filters out rows with zero unweighted counts
  filter(unweighted_count > 0)  

# add wave identifier and rename columns
PATH_wave1_metrics = PATH_wave1_metrics %>% mutate(wave = 1)
PATH_wave1_metrics <- rename(PATH_wave1_metrics, cig_cigar_ecig = UM_W1_cig_cigar1_ecig_27stat)
PATH_wave1_metrics <- rename(PATH_wave1_metrics, "unweighted count" = unweighted_count)

# recode sex and age variables
PATH_wave1_metrics$sex[PATH_wave1_metrics$sex %in% 0] = "Female"
PATH_wave1_metrics$sex[PATH_wave1_metrics$sex %in% 1] = "Male"

PATH_wave1_metrics$age[PATH_wave1_metrics$age %in% 1] = "18-24"
PATH_wave1_metrics$age[PATH_wave1_metrics$age %in% 2] = "25-34"
PATH_wave1_metrics$age[PATH_wave1_metrics$age %in% 3] = "35-54"
PATH_wave1_metrics$age[PATH_wave1_metrics$age %in% 4] = "55+"

# export the processed data
write.csv(PATH_wave1_metrics,"PATH_wave1_metrics.csv",row.names = F)

2 NHIS

National Health Interview Survey (NHIS)

2.1 Prepare data

# remove all the objects present in the workspace
rm(list=ls())    

# load the data
# this sample data is a subset of NHIS 2023
NHIS_2023 <- read.csv("NHIS_2023_data.csv")

# tobacco product use status
## cigarette
## 1: current: current ever day/some day smoker
## 2: former: former smoker
## 3: never: never smoker
table(NHIS_2023$cig_status, useNA = "always")
## 
##     1     2     3  <NA> 
##  3135  7235 18111  1041
## e-cigarette
## 1: current e-cigarette user
## 2: used e-cigarette, not current user
## 3: never e-cigarette user
table(NHIS_2023$ecig_status, useNA = "always")
## 
##     1     2     3  <NA> 
##  1514  3518 23470  1020
# Age: 1: 18-24, 2: 25-34, 3: 35-44, 4: 45-54, 5: 55-64, 6:65+
table(NHIS_2023$agegrp, useNA = "always")
## 
##    1    2    3    4    5    6 <NA> 
## 1857 4253 4540 4078 5053 9741    0
# Sex: 0: Female, 1: Male
table(NHIS_2023$male, useNA = "always")
## 
##     0     1  <NA> 
## 16059 13457     6
# Filter out rows with missing value
NHIS_2023=NHIS_2023[!is.na(NHIS_2023$agegrp),] # age
NHIS_2023=NHIS_2023[!is.na(NHIS_2023$male),] # sex
# generate a combination of tobacco product use status for cigarettes and e-cigarettes
# replace the numeric codes with the corresponding descriptive labels
NHIS_2023 <- NHIS_2023 %>%
  mutate(
    cig_status = recode(cig_status, `1` = "current", `2` = "former", `3` = "never"),
    ecig_status = recode(ecig_status, `1` = "current", `2` = "former", `3` = "never")
  )

# creates a vector that contains all possible combinations
cig_ecig_name=paste0(rep(c("Never","Former","Current"),each=3),"/",c("Never","Former","Current"))

# loop through all combinations and assign values:
NHIS_2023$NHIS_9stat=NA
k=1
for(i in c("never","former","current")){
  for(j in c("never","former","current")){
    NHIS_2023$NHIS_9stat[NHIS_2023$cig_status %in% i & NHIS_2023$ecig_status %in% j]=cig_ecig_name[k]
    k=k+1
  }
}

# for example, the loop will generate and assign the combination "Current/Former" to the NHIS_9stat column for observations where cigarettes are "Current" and e-cigarettes are "Former".
NHIS_2023$NHIS_9stat=factor(NHIS_2023$NHIS_9stat, levels=unique(cig_ecig_name))
NHIS_2023=NHIS_2023[!is.na(NHIS_2023$NHIS_9stat),]
table(NHIS_2023$NHIS_9stat,useNA = "always")
## 
##     Never/Never    Never/Former   Never/Current    Former/Never   Former/Former 
##           16415            1299             389            5311            1238 
##  Former/Current   Current/Never  Current/Former Current/Current            <NA> 
##             682            1718             975             439               0

2.2 Calculate survey metrics

# set options for survey analysis
options(survey.lonely.psu = "adjust")   # adjusts for lonely primary sampling units (PSUs) by redistributing weights
options(warn = -1)                      # suppress warnings
options(digits = 20)                    # set 20 digits to display for numerical output

# create a survey design object using the svydesign function
svy_design <- svydesign(
  id = ~fpx,                          # primary sampling unit (PSU) variable
  strata = ~pstrat,                   # stratification variable
  weights = ~wtfa_sa,                 # sampling weights
  PSU = ~ppsu,                        # primary sampling unit (PSU) 
  data = NHIS_2023,                   # your data set
  nest = TRUE                         # specify that the strata and PSUs are nested
)

2.2.1 Overall survey metrics

# calculate overall survey metrics by grouping the data by tobacco use status combinations
# compute weighted counts, percentages with 95% confidence intervals, and unweighted counts
# add overall indicators for sex and age

metrics_all = as_survey(svy_design) %>%   # convert the survey design object to a survey design object
  # group by tobacco use status combinations, and ensures that all levels are included
  group_by(NHIS_9stat, .drop = FALSE) %>% 
  # calculate the outputs for each group
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se","ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  # indicate that the metrics are for which group, here for the overall population
  mutate(sex = "Overall",age = "Overall") %>% ungroup()

metrics_all
## # A tibble: 9 × 10
##   NHIS_9stat      `weighted count` `weighted count_se` `weighted percent`
##   <fct>                      <dbl>               <dbl>              <dbl>
## 1 Never/Never           146919837.            1045216.             0.591 
## 2 Never/Former           14270696.             460667.             0.0574
## 3 Never/Current           4743082.             285991.             0.0191
## 4 Former/Never           37902940.             561263.             0.152 
## 5 Former/Former          11011062.             357467.             0.0443
## 6 Former/Current          7000193.             311200.             0.0281
## 7 Current/Never          13837362.             386528.             0.0556
## 8 Current/Former          8585694.             320869.             0.0345
## 9 Current/Current         4511436.             248812.             0.0181
## # ℹ 6 more variables: `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>, sex <chr>, age <chr>

2.2.2 Grouped by age

# calculation is similar to the overall survey metrics
# but it additionally groups the data by age categories, providing outputs for each age group within each tobacco use status combination

metrics_by_age = as_survey(svy_design) %>%
  group_by(agegrp, NHIS_9stat, .drop = FALSE) %>% 
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  mutate(sex = "Overall") %>% ungroup()
metrics_by_age <- rename(metrics_by_age, age = agegrp)

metrics_by_age
## # A tibble: 54 × 10
##      age NHIS_9stat      `weighted count` `weighted count_se` `weighted percent`
##    <int> <fct>                      <dbl>               <dbl>              <dbl>
##  1     1 Never/Never            19712179.             638174.            0.660  
##  2     1 Never/Former            4913474.             318508.            0.165  
##  3     1 Never/Current           2558333.             232928.            0.0857 
##  4     1 Former/Never             202699.              66884.            0.00679
##  5     1 Former/Former            516129.              91968.            0.0173 
##  6     1 Former/Current           914901.             134160.            0.0306 
##  7     1 Current/Never            152710.              50947.            0.00511
##  8     1 Current/Former           362278.              86702.            0.0121 
##  9     1 Current/Current          530331.              95138.            0.0178 
## 10     2 Never/Never            24745127.             566621.            0.579  
## # ℹ 44 more rows
## # ℹ 5 more variables: `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>, sex <chr>

2.2.3 Grouped by age and sex

# calculation is similar to the overall survey metrics
# but it additionally groups the data by age and sex, providing outputs for each combination of sex, age, and tobacco use status within each tobacco use status combination

metrics_by_sex_age = as_survey(svy_design) %>%
  group_by(male, agegrp, NHIS_9stat, .drop = FALSE) %>% 
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  ungroup()
metrics_by_sex_age <- rename(metrics_by_sex_age, sex = male)
metrics_by_sex_age <- rename(metrics_by_sex_age, age = agegrp)

metrics_by_sex_age
## # A tibble: 108 × 10
##      sex   age NHIS_9stat      `weighted count` `weighted count_se`
##    <int> <int> <fct>                      <dbl>               <dbl>
##  1     0     1 Never/Never            10032119.             456161.
##  2     0     1 Never/Former            2503567.             227313.
##  3     0     1 Never/Current           1265160.             164763.
##  4     0     1 Former/Never              92241.              47705.
##  5     0     1 Former/Former            244164.              59582.
##  6     0     1 Former/Current           334887.              81472.
##  7     0     1 Current/Never             58439.              28240.
##  8     0     1 Current/Former           121394.              50683.
##  9     0     1 Current/Current          199541.              57610.
## 10     0     2 Never/Never            13863422.             420756.
## # ℹ 98 more rows
## # ℹ 5 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>

2.3 Process Data

# combine survey metrics into a data frame
NHIS_2023_metrics<-rbind.data.frame(metrics_all,metrics_by_age,metrics_by_sex_age)

# select relevant columns and recode for readability
NHIS_2023_metrics<-NHIS_2023_metrics %>%
  dplyr::select(NHIS_9stat,age,sex,"weighted count","unweighted_count","weighted percent","weighted percent_se","weighted percent_low","weighted percent_upp") %>%
  # filter to keep only rows where unweighted_count > 0
  filter(unweighted_count>0)

NHIS_2023_metrics<-rename(NHIS_2023_metrics,cig_ecig=NHIS_9stat)
NHIS_2023_metrics<-rename(NHIS_2023_metrics,"unweighted count"=unweighted_count)

# recode sex and age variables
NHIS_2023_metrics$sex[NHIS_2023_metrics$sex %in% 1]="Male"
NHIS_2023_metrics$sex[NHIS_2023_metrics$sex %in% 0]="Female"

NHIS_2023_metrics$age[NHIS_2023_metrics$age %in% 1]="18-24"
NHIS_2023_metrics$age[NHIS_2023_metrics$age %in% 2]="25-34"
NHIS_2023_metrics$age[NHIS_2023_metrics$age %in% 3]="35-44"
NHIS_2023_metrics$age[NHIS_2023_metrics$age %in% 4]="45-54"
NHIS_2023_metrics$age[NHIS_2023_metrics$age %in% 5]="55-64"
NHIS_2023_metrics$age[NHIS_2023_metrics$age %in% 6]="65+"

# export the processed data
write.csv(NHIS_2023_metrics,"NHIS_2023_metrics.csv",row.names = T)

3 NSDUH

National Survey on Drug Use and Health (NSDUH)

3.1 Prepare data

# remove all the objects present in the workspace
rm(list=ls())    

# load the data
# this sample data is a subset of NSDUH 2022
NSDUH_2022 <- read.csv("NSDUH_2022_data.csv")

# tobacco product use status
## cigarette
## current: smoked 100+ cigarettes and at least 1 day in the past 30 days 
## former: ever used but did not smoke at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(NSDUH_2022$UM_smkstat, useNA = "always")
## 
## current  former missing   never    <NA> 
##    7166   17709      59   34135       0
## cigar
## current: ever smoked and smoked at least 1 day in the past 30 days 
## former: ever used but did not smoke at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(NSDUH_2022$UM_cigarstat, useNA = "always")
## 
## current  former missing   never    <NA> 
##    2116   12893      46   44014       0
## e-cigarette
## current: ever used and vaped at least 1 day in the past 30 days
## former: ever used but did not vape at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(NSDUH_2022$UM_ecigstat, useNA = "always")
## 
## current  former missing   never    <NA> 
##    6663   10489     136   41781       0
# Age: 1: ~17, 2: 18-25, 3: 26-34, 4: 35-49, 5: 50-64, 6: 65+
table(NSDUH_2022$UM_agecat, useNA = "always")
## 
##     1     2     3     4     5     6  <NA> 
## 11969 14307  9645 12588  5369  5191     0
NSDUH_2022 <- NSDUH_2022[!(NSDUH_2022$UM_agecat == 1),] # exclude non-adults for this analysis

# Sex: 1: Male, 2: Female
table(NSDUH_2022$irsex, useNA = "always")
## 
##     1     2  <NA> 
## 20852 26248     0
# Filter out rows with missing value
NSDUH_2022 = NSDUH_2022[!is.na(NSDUH_2022$UM_agecat),]    # age
NSDUH_2022 = NSDUH_2022[!is.na(NSDUH_2022$irsex),]  # sex
# generate a combination of tobacco product use status for cigarettes, cigars, and e-cigarettes
# creates a vector that contains all possible combinations
cig_cigar_ecig_name=paste0(rep(c("Never","Former","Current"),each=9),"/",
                           rep(c("Never","Former","Current"),each=3),"/",
                           rep(c("Never","Former","Current")))

# loop through all combinations and assign values:
NSDUH_2022$UM_cig_cigar_ecig_27stat=NA
m=1
for(i in c("never","former","current")){
  for(j in c("never","former","current")){
    for(k in c("never","former","current")){
      NSDUH_2022$UM_cig_cigar_ecig_27stat[NSDUH_2022$UM_smkstat %in% i & NSDUH_2022$UM_cigarstat %in% j & NSDUH_2022$UM_ecigstat %in% k]=cig_cigar_ecig_name[m]
      m=m+1
    }
  }
}

# for example, the loop will generate and assign the combination "Current/Former/Never" to the UM_cig_cigar_ecig_27stat column for observations where cigarettes are "Current", cigars are "Former", and e-cigarettes are "Never".
NSDUH_2022$UM_cig_cigar_ecig_27stat=factor(NSDUH_2022$UM_cig_cigar_ecig_27stat,levels=unique(cig_cigar_ecig_name))
table(NSDUH_2022$UM_cig_cigar_ecig_27stat,useNA = "always")
## 
##       Never/Never/Never      Never/Never/Former     Never/Never/Current 
##                   18064                    1803                     694 
##      Never/Former/Never     Never/Former/Former    Never/Former/Current 
##                    1513                     424                     137 
##     Never/Current/Never    Never/Current/Former   Never/Current/Current 
##                     183                     124                      70 
##      Former/Never/Never     Former/Never/Former    Former/Never/Current 
##                    5388                    1667                    1322 
##     Former/Former/Never    Former/Former/Former   Former/Former/Current 
##                    4434                    2174                    1259 
##    Former/Current/Never   Former/Current/Former  Former/Current/Current 
##                     279                     250                     205 
##     Current/Never/Never    Current/Never/Former   Current/Never/Current 
##                    1596                     962                     856 
##    Current/Former/Never   Current/Former/Former  Current/Former/Current 
##                     556                    1216                     870 
##   Current/Current/Never  Current/Current/Former Current/Current/Current 
##                     222                     322                     366 
##                    <NA> 
##                     144
NSDUH_2022=NSDUH_2022[!is.na(NSDUH_2022$UM_cig_cigar_ecig_27stat),]

3.2 Calculate survey metrics

# set options for survey analysis
options(survey.lonely.psu = "adjust")   # adjusts for lonely primary sampling units (PSUs) by redistributing weights
options(warn = -1)                      # suppress warnings
options(digits = 20)                    # set 20 digits to display for numerical output

# Create a survey design object using the specified survey design parameters
svy_design <- svydesign(
  id = ~verep,                          # primary sampling unit (PSU) variable
  strata = ~VESTR_C,                    # variance stratum variable
  weights = ~ANALWT2_C,                 # person level sampling weights
  data = NSDUH_2022,                    # your data set
  nest = TRUE                           # specify that the strata and PSUs are nested
)

3.2.1 Overall survey metrics

# calculate overall survey metrics by grouping the data by tobacco use status combinations
# compute weighted counts, percentages with 95% confidence intervals, and unweighted counts
# add overall indicators for sex and age

metrics_all = as_survey(svy_design) %>%   # convert the survey design object to a survey design object
  # group by tobacco use status combinations, and ensures that all levels are included
  group_by(UM_cig_cigar_ecig_27stat, .drop = FALSE) %>% 
  # calculate the outputs for each group
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se","ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  # indicate that the metrics are for which group, here for the overall population
  mutate(sex = "Overall",age = "Overall") %>% ungroup()

metrics_all
## # A tibble: 27 × 10
##    UM_cig_cigar_ecig_27stat `weighted count` `weighted count_se`
##    <fct>                               <dbl>               <dbl>
##  1 Never/Never/Never               93247611.            2316513.
##  2 Never/Never/Former               5344267.             266282.
##  3 Never/Never/Current              1864098.             155029.
##  4 Never/Former/Never               8696002.             437408.
##  5 Never/Former/Former              1337608.              84561.
##  6 Never/Former/Current              427498.              48556.
##  7 Never/Current/Never               775007.             128852.
##  8 Never/Current/Former              470227.              71045.
##  9 Never/Current/Current             205011.              43686.
## 10 Former/Never/Never              39690739.             904709.
## # ℹ 17 more rows
## # ℹ 7 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>, sex <chr>, age <chr>

3.2.2 Grouped by age

# calculation is similar to the overall survey metrics
# but it additionally groups the data by age categories, providing outputs for each age group within each tobacco use status combination

metrics_by_age = as_survey(svy_design) %>%
  group_by(UM_agecat, UM_cig_cigar_ecig_27stat, .drop = FALSE) %>% 
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  mutate(sex = "Overall") %>% ungroup()
metrics_by_age <- rename(metrics_by_age, age = UM_agecat)

metrics_by_age
## # A tibble: 135 × 10
##      age UM_cig_cigar_ecig_27stat `weighted count` `weighted count_se`
##    <int> <fct>                               <dbl>               <dbl>
##  1     2 Never/Never/Never               15412150.             486915.
##  2     2 Never/Never/Former               3790277.             187332.
##  3     2 Never/Never/Current              1388855.             127701.
##  4     2 Never/Former/Never                515225.              54784.
##  5     2 Never/Former/Former               691484.              52346.
##  6     2 Never/Former/Current              263985.              41478.
##  7     2 Never/Current/Never                85657.              16462.
##  8     2 Never/Current/Former              199776.              32278.
##  9     2 Never/Current/Current             144875.              29943.
## 10     2 Former/Never/Never                809773.              65073.
## # ℹ 125 more rows
## # ℹ 6 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>, sex <chr>

3.2.3 Grouped by age and sex

# calculation is similar to the overall survey metrics
# but it additionally groups the data by age and sex, providing outputs for each combination of sex, age, and tobacco use status within each tobacco use status combination

metrics_by_sex_age = as_survey(svy_design) %>%
  group_by(irsex, UM_agecat, UM_cig_cigar_ecig_27stat, .drop = FALSE) %>% 
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  ungroup()
metrics_by_sex_age <- rename(metrics_by_sex_age, sex = irsex)
metrics_by_sex_age <- rename(metrics_by_sex_age, age = UM_agecat)

metrics_by_sex_age
## # A tibble: 270 × 10
##      sex   age UM_cig_cigar_ecig_27stat `weighted count` `weighted count_se`
##    <int> <int> <fct>                               <dbl>               <dbl>
##  1     1     2 Never/Never/Never                7532616.             294270.
##  2     1     2 Never/Never/Former               1487542.             123413.
##  3     1     2 Never/Never/Current               593860.              88077.
##  4     1     2 Never/Former/Never                335326.              46714.
##  5     1     2 Never/Former/Former               445339.              51011.
##  6     1     2 Never/Former/Current              182022.              34459.
##  7     1     2 Never/Current/Never                54385.              13976.
##  8     1     2 Never/Current/Former              146846.              27080.
##  9     1     2 Never/Current/Current              98044.              22981.
## 10     1     2 Former/Never/Never                392555.              48034.
## # ℹ 260 more rows
## # ℹ 5 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>

3.3 Process Data

# combine survey metrics into a data frame
NSDUH_2022_metrics<-rbind.data.frame(metrics_all,metrics_by_age,metrics_by_sex_age)

# select relevant columns and recode for readability
NSDUH_2022_metrics<-NSDUH_2022_metrics %>%
  dplyr::select(UM_cig_cigar_ecig_27stat,age,sex,"weighted count","unweighted_count","weighted percent","weighted percent_se","weighted percent_low","weighted percent_upp") %>%
  # filter to keep only rows where unweighted_count > 0
  filter(unweighted_count>0)

NSDUH_2022_metrics<-rename(NSDUH_2022_metrics,cig_cigar_ecig=UM_cig_cigar_ecig_27stat)
NSDUH_2022_metrics<-rename(NSDUH_2022_metrics,"unweighted count"=unweighted_count)

# recode sex and age variables
NSDUH_2022_metrics$sex[NSDUH_2022_metrics$sex %in% 1]="Male"
NSDUH_2022_metrics$sex[NSDUH_2022_metrics$sex %in% 2]="Female"

# NSDUH_2022_metrics$age[NSDUH_2022_metrics$age %in% 1]="~17"
NSDUH_2022_metrics$age[NSDUH_2022_metrics$age %in% 2]="18-25"
NSDUH_2022_metrics$age[NSDUH_2022_metrics$age %in% 3]="26-34"
NSDUH_2022_metrics$age[NSDUH_2022_metrics$age %in% 4]="35-49"
NSDUH_2022_metrics$age[NSDUH_2022_metrics$age %in% 5]="50-64"
NSDUH_2022_metrics$age[NSDUH_2022_metrics$age %in% 6]="65+"

# export the processed data
write.csv(NSDUH_2022_metrics,"NSDUH_2022_metrics.csv",row.names = T)

4 MTF

Monitoring The Future (MTF)

4.1 Prepare data

 # remove all the objects present in the workspace
rm(list=ls())   

# load the data
# this sample data is a subset of MTF 2022
MTF_2022 <- read.csv("MTF_2022_data.csv")

# tobacco product use status
## cigarette
## current: ever smoked cigarettes and at least 1 day in the past 30 days 
## former: ever used but did not smoke at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(MTF_2022$cig_st, useNA = "always")
## 
## current  former missing   never    <NA> 
##     591    2576    1648   26623       0
## e-cigarette
## current: ever used and vaped at least 1 day in the past 30 days
## former: ever used but did not vape at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(MTF_2022$ecig_st, useNA = "always")
## 
## current  former missing   never    <NA> 
##    4011    4040    2186   21201       0
# Sex: Male, Female
table(MTF_2022$sex, useNA = "always")
## 
##  Female    Male Unknown    <NA> 
##   13188   14214    4036       0
# grade
table(MTF_2022$grade, useNA = "always")
## 
##     8    10    12  <NA> 
##  9889 11950  9599     0
# Filter out rows with missing value
table(MTF_2022$sex, useNA="always")     # sex
## 
##  Female    Male Unknown    <NA> 
##   13188   14214    4036       0
MTF_2022=MTF_2022[MTF_2022$sex!="Unknown",]
table(MTF_2022$grade, useNA="always")   #grade
## 
##     8    10    12  <NA> 
##  8616 10311  8475     0
MTF_2022=MTF_2022[!is.na(MTF_2022$grade),]
# generate a combination of tobacco product use status for cigarettes and e-cigarettes
# creates a vector that contains all possible combinations
cig_ecig_name=paste0(rep(c("Never","Former","Current"),each=3),"/",c("Never","Former","Current"))

# loop through all combinations and assign values:
MTF_2022$MTF_9stat=NA
k=1
for(i in c("never","former","current")){
  for(j in c("never","former","current")){
    MTF_2022$MTF_9stat[MTF_2022$cig_st %in% i & MTF_2022$ecig_st %in% j]=cig_ecig_name[k]
    k=k+1
  }
}

# for example, the loop will generate and assign the combination "Current/Former" to the MTF_9stat column for observations where cigarettes are "Current" and e-cigarettes are "Former".
MTF_2022$MTF_9stat=factor(MTF_2022$MTF_9stat, levels=unique(cig_ecig_name))
MTF_2022=MTF_2022[!is.na(MTF_2022$MTF_9stat),]
table(MTF_2022$MTF_9stat,useNA = "always")
## 
##     Never/Never    Never/Former   Never/Current    Former/Never   Former/Former 
##           19376            3017            2134             407             682 
##  Former/Current   Current/Never  Current/Former Current/Current            <NA> 
##            1168              83              50             337               0

4.2 Calculate survey metrics

# set options for survey analysis
options(survey.lonely.psu = "adjust")   # adjusts for lonely primary sampling units (PSUs) by redistributing weights
options(warn = -1)                      # suppress warnings
options(digits = 20)                    # set 20 digits to display for numerical output

# Create a survey design object using the specified survey design parameters
svy_design <- svydesign(
  id = ~psu,                           # primary sampling unit (PSU) variable
  strata = ~strata,                    # variance stratum variable
  weights = ~weight,                   # person level sampling weights
  data = MTF_2022,                     # your data set
  nest = TRUE                          # specify that the strata and PSUs are nested
)

4.2.1 Overall survey metrics

# calculate overall survey metrics by grouping the data by tobacco use status combinations
# compute weighted counts, percentages with 95% confidence intervals, and unweighted counts
# add overall indicators for sex and age

metrics_all = as_survey(svy_design) %>%   # convert the survey design object to a survey design object
  # group by tobacco use status combinations, and ensures that all levels are included
  group_by(MTF_9stat, .drop = FALSE) %>% 
  # calculate the outputs for each group
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se","ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  # indicate that the metrics are for which group, here for the overall population
  mutate(sex = "Overall",grade = "Overall") %>% ungroup()

metrics_all
## # A tibble: 9 × 10
##   MTF_9stat       `weighted count` `weighted count_se` `weighted percent`
##   <fct>                      <dbl>               <dbl>              <dbl>
## 1 Never/Never              19317.               3119.             0.704  
## 2 Never/Former              3096.                495.             0.113  
## 3 Never/Current             2178.                327.             0.0793 
## 4 Former/Never               429.                 90.2            0.0156 
## 5 Former/Former              687.                145.             0.0250 
## 6 Former/Current            1224.                215.             0.0446 
## 7 Current/Never               77.2                21.0            0.00281
## 8 Current/Former              52.6                13.5            0.00191
## 9 Current/Current            394.                103.             0.0143 
## # ℹ 6 more variables: `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>, sex <chr>, grade <chr>

4.2.2 Grouped by sex

# calculation is similar to the overall survey metrics
# but it additionally groups the data by sex categories, providing outputs for each sex group within each tobacco use status combination

metrics_by_sex = as_survey(svy_design) %>%
  group_by(sex, MTF_9stat, .drop = FALSE) %>% 
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  mutate(grade = "Overall") %>% ungroup()
metrics_by_sex <- rename(metrics_by_sex, sex = sex) # rename column names if needed

metrics_by_sex
## # A tibble: 18 × 10
##    sex    MTF_9stat      `weighted count` `weighted count_se` `weighted percent`
##    <chr>  <fct>                     <dbl>               <dbl>              <dbl>
##  1 Female Never/Never              8903.              1427.              0.674  
##  2 Female Never/Former             1721.               310.              0.130  
##  3 Female Never/Current            1301.               209.              0.0986 
##  4 Female Former/Never              160.                32.2             0.0122 
##  5 Female Former/Former             324.                61.6             0.0245 
##  6 Female Former/Current            589.               107.              0.0446 
##  7 Female Current/Never              18.8                6.45            0.00142
##  8 Female Current/Former             20.7                5.73            0.00157
##  9 Female Current/Curre…            164.                27.2             0.0124 
## 10 Male   Never/Never             10413.              1699.              0.731  
## 11 Male   Never/Former             1375.               190.              0.0965 
## 12 Male   Never/Current             876.               122.              0.0615 
## 13 Male   Former/Never              269.                60.2             0.0189 
## 14 Male   Former/Former             363.                84.8             0.0255 
## 15 Male   Former/Current            635.               117.              0.0446 
## 16 Male   Current/Never              58.4               16.9             0.00410
## 17 Male   Current/Former             31.8               10.6             0.00223
## 18 Male   Current/Curre…            230.                77.7             0.0161 
## # ℹ 5 more variables: `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>, grade <chr>

4.2.3 Grouped by grade and sex

# calculation is similar to the overall survey metrics
# but it additionally groups the data by grade and sex, providing outputs for each combination of sex, grade, and tobacco use status within each tobacco use status combination

metrics_by_sex_grade = as_survey(svy_design) %>%
  group_by(sex, grade, MTF_9stat, .drop = FALSE) %>% 
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  ungroup()
metrics_by_sex_grade <- rename(metrics_by_sex_grade, sex = sex) # rename column names if needed
metrics_by_sex_grade <- rename(metrics_by_sex_grade, grade = grade) # rename column names if needed

metrics_by_sex_grade
## # A tibble: 54 × 10
##    sex   grade MTF_9stat `weighted count` `weighted count_se` `weighted percent`
##    <chr> <int> <fct>                <dbl>               <dbl>              <dbl>
##  1 Fema…     8 Never/Ne…         3299.                572.              0.798   
##  2 Fema…     8 Never/Fo…          384.                118.              0.0929  
##  3 Fema…     8 Never/Cu…          222.                 80.1             0.0536  
##  4 Fema…     8 Former/N…           43.4                10.1             0.0105  
##  5 Fema…     8 Former/F…           64.2                20.6             0.0155  
##  6 Fema…     8 Former/C…           97.4                31.4             0.0236  
##  7 Fema…     8 Current/…            0.716               0.716           0.000173
##  8 Fema…     8 Current/…            5.13                2.31            0.00124 
##  9 Fema…     8 Current/…           18.0                 8.37            0.00434 
## 10 Fema…    10 Never/Ne…         3226.                559.              0.661   
## # ℹ 44 more rows
## # ℹ 4 more variables: `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>

4.3 Process Data

# combine survey metrics into a data frame
MTF_2022_metrics<-rbind.data.frame(metrics_all,metrics_by_sex,metrics_by_sex_grade)

# select relevant columns and recode for readability
MTF_2022_metrics<-MTF_2022_metrics %>%
  dplyr::select(MTF_9stat,grade,sex,"weighted count","unweighted_count","weighted percent","weighted percent_se","weighted percent_low","weighted percent_upp") %>%
  # filter to keep only rows where unweighted_count > 0
  filter(unweighted_count>0)

MTF_2022_metrics<-rename(MTF_2022_metrics,cig_ecig=MTF_9stat)
MTF_2022_metrics<-rename(MTF_2022_metrics,"unweighted count"=unweighted_count)

# recode sex and grade variables if needed
# export the processed data
write.csv(MTF_2022_metrics,"MTF_2022_metrics.csv",row.names = T)

5 NYTS

National Youth Tobacco Survey (NYTS)

5.1 Prepare data

# remove all the objects present in the workspace
rm(list=ls())    

# load the data
# this sample data is a subset of NYTS 2023
NYTS_2023 <- read.csv("NYTS_2023_data.csv")

# tobacco product use status
## cigarette
## current: smoked 100+ cigarettes and at least 1 day in the past 30 days 
## former: not never nor current
## never: never used or smoked less than 100+ cigarettes
## missing: missing data
table(NYTS_2023$cigt, useNA = "always")
## 
## Current  Former   Never    <NA> 
##      80      42   20006       0
## e-cigarette
## current: ever used and vaped at least 1 day in the past 30 days
## former: not never nor current
## never: never used
## missing: missing data
table(NYTS_2023$ecig, useNA = "always")
## 
## Current  Former   Never    <NA> 
##    1382    1631   17115       0
# Age: 12-14, 15-17, 18+
table(NYTS_2023$age_cat, useNA = "always")
## 
## 12-14 15-17   18+  <NA> 
## 10517  8506  1105     0
# Sex: Male, Female
table(NYTS_2023$um_sex_lab, useNA = "always")
## 
## Female   Male   <NA> 
##   9859  10269      0
# Filter out rows with missing value
NYTS_2023 = NYTS_2023[!is.na(NYTS_2023$age_cat),]    # age
NYTS_2023 = NYTS_2023[!is.na(NYTS_2023$um_sex_lab),]  # sex
# generate a combination of tobacco product use status for cigarettes and e-cigarettes
# creates a vector that contains all possible combinations
cig_ecig_name=paste0(rep(c("Never","Former","Current"),each=3),"/",c("Never","Former","Current"))

# loop through all combinations and assign values:
NYTS_2023$NYTS_9stat=NA
k=1
for(i in c("Never","Former","Current")){
  for(j in c("Never","Former","Current")){
    NYTS_2023$NYTS_9stat[NYTS_2023$cigt %in% i & NYTS_2023$ecig %in% j]=cig_ecig_name[k]
    k=k+1
  }
}

# for example, the loop will generate and assign the combination "Current/Former" to the NYTS_9stat column for observations where cigarettes are "Current" and e-cigarettes are "Former".
NYTS_2023$NYTS_9stat=factor(NYTS_2023$NYTS_9stat, levels=unique(cig_ecig_name))
NYTS_2023=NYTS_2023[!is.na(NYTS_2023$NYTS_9stat),]
table(NYTS_2023$NYTS_9stat,useNA = "always")
## 
##     Never/Never    Never/Former   Never/Current    Former/Never   Former/Former 
##           17092            1620            1294               6               9 
##  Former/Current   Current/Never  Current/Former Current/Current            <NA> 
##              27              17               2              61               0

5.2 Calculate survey metrics

# set options for survey analysis
options(survey.lonely.psu = "adjust")   # adjusts for lonely primary sampling units (PSUs) by redistributing weights
options(warn = -1)                      # suppress warnings
options(digits = 20)                    # set 20 digits to display for numerical output

# Create a survey design object using the specified survey design parameters
svy_design <- svydesign(
  id = ~id,                          # primary sampling unit (PSU) variable
  PSU = ~psu,                        # primary sampling unit (PSU) 
  strata = ~strata,                  # variance stratum variable
  weights = ~weights,                # person-level sampling weights
  data = NYTS_2023,                  # your data set
  nest = TRUE                        # specify that the strata and PSUs are nested
)

5.2.1 Overall survey metrics

# calculate overall survey metrics by grouping the data by tobacco use status combinations
# compute weighted counts, percentages with 95% confidence intervals, and unweighted counts
# add overall indicators for sex and age

metrics_all = as_survey(svy_design) %>%   # convert the survey design object to a survey design object
  # group by tobacco use status combinations, and ensures that all levels are included
  group_by(NYTS_9stat, .drop = FALSE) %>% 
  # calculate the outputs for each group
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se","ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  # indicate that the metrics are for which group, here for the overall population
  mutate(sex = "Overall",age = "Overall") %>% ungroup()

metrics_all
## # A tibble: 9 × 10
##   NYTS_9stat      `weighted count` `weighted count_se` `weighted percent`
##   <fct>                      <dbl>               <dbl>              <dbl>
## 1 Never/Never            21811551.             280869.           0.836   
## 2 Never/Former            2320582.             111346.           0.0889  
## 3 Never/Current           1809811.             105831.           0.0694  
## 4 Former/Never               3467.               1857.           0.000133
## 5 Former/Former             10922.               4821.           0.000419
## 6 Former/Current            35906.              11027.           0.00138 
## 7 Current/Never             32199.              12233.           0.00123 
## 8 Current/Former             3330.               2587.           0.000128
## 9 Current/Current           60935.              11953.           0.00234 
## # ℹ 6 more variables: `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>, sex <chr>, age <chr>

5.2.2 Grouped by age

# calculation is similar to the overall survey metrics
# but it additionally groups the data by age categories, providing outputs for each age group within each tobacco use status combination

metrics_by_age = as_survey(svy_design) %>%
  group_by(age_cat, NYTS_9stat, .drop = FALSE) %>% 
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  mutate(sex = "Overall") %>% ungroup()
metrics_by_age <- rename(metrics_by_age, age = age_cat)

metrics_by_age
## # A tibble: 27 × 10
##    age   NYTS_9stat      `weighted count` `weighted count_se` `weighted percent`
##    <chr> <fct>                      <dbl>               <dbl>              <dbl>
##  1 12-14 Never/Never            10630294.             188545.           0.903   
##  2 12-14 Never/Former             571979.              46266.           0.0486  
##  3 12-14 Never/Current            523608.              42064.           0.0445  
##  4 12-14 Former/Never               1774.               1250.           0.000151
##  5 12-14 Former/Former              1710.               1193.           0.000145
##  6 12-14 Former/Current             9183.               7727.           0.000780
##  7 12-14 Current/Never             16172.               9967.           0.00137 
##  8 12-14 Current/Former             3330.               2587.           0.000283
##  9 12-14 Current/Current           16622.               7062.           0.00141 
## 10 15-17 Never/Never             9439850.             216608.           0.783   
## # ℹ 17 more rows
## # ℹ 5 more variables: `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>, sex <chr>

5.2.3 Grouped by age and sex

# calculation is similar to the overall survey metrics
# but it additionally groups the data by age and sex, providing outputs for each combination of sex, age, and tobacco use status within each tobacco use status combination

metrics_by_sex_age = as_survey(svy_design) %>%
  group_by(um_sex_lab, age_cat, NYTS_9stat, .drop = FALSE) %>% 
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  ungroup()
metrics_by_sex_age <- rename(metrics_by_sex_age, sex = um_sex_lab)
metrics_by_sex_age <- rename(metrics_by_sex_age, age = age_cat)

metrics_by_sex_age
## # A tibble: 54 × 10
##    sex    age   NYTS_9stat      `weighted count` `weighted count_se`
##    <chr>  <chr> <fct>                      <dbl>               <dbl>
##  1 Female 12-14 Never/Never             5212285.             137153.
##  2 Female 12-14 Never/Former             303541.              35919.
##  3 Female 12-14 Never/Current            300967.              29767.
##  4 Female 12-14 Former/Never                  0                   0 
##  5 Female 12-14 Former/Former               802.                775.
##  6 Female 12-14 Former/Current             1630.               1630.
##  7 Female 12-14 Current/Never              1646.               1122.
##  8 Female 12-14 Current/Former                0                   0 
##  9 Female 12-14 Current/Current            6258.               4948.
## 10 Female 15-17 Never/Never             4512586.             165435.
## # ℹ 44 more rows
## # ℹ 5 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>

5.3 Process Data

# combine survey metrics into a data frame
NYTS_2023_metrics<-rbind.data.frame(metrics_all,metrics_by_age,metrics_by_sex_age)

# select relevant columns and recode for readability
NYTS_2023_metrics<-NYTS_2023_metrics %>%
  dplyr::select(NYTS_9stat,age,sex,"weighted count","unweighted_count","weighted percent","weighted percent_se","weighted percent_low","weighted percent_upp") %>%
  # filter to keep only rows where unweighted_count > 0
  filter(unweighted_count>0)

NYTS_2023_metrics<-rename(NYTS_2023_metrics,cig_ecig=NYTS_9stat)
NYTS_2023_metrics<-rename(NYTS_2023_metrics,"unweighted count"=unweighted_count)

# recode sex and age variables if needed
# export the processed data
write.csv(NYTS_2023_metrics,"NYTS_2023_metrics.csv",row.names = T)

6 TUS-CPS

The Tobacco Use Supplement to the Current Population Survey (TUS-CPS)

6.1 Prepare data

rm(list=ls())    # remove all the objects present in the workspace

# load the data
# this data set is a subset of TUSCPS 2022
TUSCPS_2022 <- read.csv("TUSCPS_2022_data.csv")

# tobacco product use status
## cigarette
## 1: current: smoked 100+ cigarettes and at least 1 day in the past 30 days 
## 2: former: ever used but did not smoke at least 1 day in the past 30 days
## 3: never: never used
table(TUSCPS_2022$cig_status, useNA = "always")
## 
## current  former missing   never    <NA> 
##      87      83       3     249       0
## e-cigarette
## 1: current: ever used and vaped at least 1 day in the past 30 days
## 2: former: ever used but did not vape at least 1 day in the past 30 days
## 3: never: never used
table(TUSCPS_2022$ecig_status, useNA = "always")
## 
## current  former missing   never    <NA> 
##      14      50      10     348       0
# Age: 1: 18-24, 2: 25-34, 3: 35-44, 4: 45-54, 5: 55-64, 6:65+
table(TUSCPS_2022$Age, useNA = "always")
## 
##    1    2    3    4    5    6 <NA> 
##   25   90   79   71   60   97    0
# Sex:1: Male, 2: Female
table(TUSCPS_2022$Sex, useNA = "always")
## 
##    1    2 <NA> 
##  166  256    0
# Filter out rows with missing value
TUSCPS_2022 = TUSCPS_2022[!is.na(TUSCPS_2022$Age),]  # age
TUSCPS_2022 = TUSCPS_2022[!is.na(TUSCPS_2022$Sex),]  # sex
# generate a combination of tobacco product use status for cigarettes and e-cigarettes
# creates a vector that contains all possible combinations
cig_ecig_name=paste0(rep(c("Never","Former","Current"),each=3),"/",c("Never","Former","Current"))

# loop through all combinations and assign values:
TUSCPS_2022$TUSCPS_9stat=NA
k=1
for(i in c("never","former","current")){
  for(j in c("never","former","current")){
    TUSCPS_2022$TUSCPS_9stat[TUSCPS_2022$cig_status %in% i & TUSCPS_2022$ecig_status %in% j]=cig_ecig_name[k]
    k=k+1
  }
}

# for example, the loop will generate and assign the combination "Current/Former" to the TUSCPS_9stat column for observations where cigarettes are "Current" and e-cigarettes are "Former"
TUSCPS_2022$TUSCPS_9stat=factor(TUSCPS_2022$TUSCPS_9stat, levels=unique(cig_ecig_name))
TUSCPS_2022 = TUSCPS_2022[!is.na(TUSCPS_2022$TUSCPS_9stat),]
table(TUSCPS_2022$TUSCPS_9stat,useNA = "always")
## 
##     Never/Never    Never/Former   Never/Current    Former/Never   Former/Former 
##             231              13               1              65              13 
##  Former/Current   Current/Never  Current/Former Current/Current            <NA> 
##               3              51              24              10               0

6.2 Calculate survey metrics

# set options for survey analysis
options(survey.replicates.mse = TRUE)   # compute the mean square error for replicate weights
options(warn = -1)                      # suppress warnings
options(digits = 20)                    # set 20 digits to display for numerical output

# create a survey design object using replicate weights
svy_design = svydesign(
  id = ~1,                       # primary sampling unit (PSU) variable; 1 means no clustering, and each row is treated as an independent unit
  weights = ~PWSRWGT,            # sampling weight
  data = TUSCPS_2022             # your data set
)

6.2.1 Overall survey metrics

# calculate overall survey metrics by grouping the data by tobacco use status combinations
# compute weighted counts, percentages with 95% confidence intervals, and unweighted counts
# add overall indicators for sex and age

metrics_all = as_survey_rep(svy_design) %>%   # use as_survey_rep for replicate weights
  # group by tobacco use status combinations, and ensures that all levels are included
  group_by(TUSCPS_9stat, .drop = FALSE) %>% 
  # calculate the outputs for each group
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se","ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  # indicate that the metrics are for which group, here for the overall population
  mutate(sex = "Overall",age = "Overall") %>% ungroup()

metrics_all
## # A tibble: 9 × 10
##   TUSCPS_9stat    `weighted count` `weighted count_se` `weighted percent`
##   <fct>                      <dbl>               <dbl>              <dbl>
## 1 Never/Never             1486492.             108540.            0.574  
## 2 Never/Former              88446.              31291.            0.0341 
## 3 Never/Current             14738.              14738.            0.00569
## 4 Former/Never             376635.              57181.            0.145  
## 5 Former/Former             70903.              24565.            0.0274 
## 6 Former/Current            38429.              25465.            0.0148 
## 7 Current/Never            257866.              44528.            0.0995 
## 8 Current/Former           165959.              38210.            0.0640 
## 9 Current/Current           92278.              41550.            0.0356 
## # ℹ 6 more variables: `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>, sex <chr>, age <chr>

6.2.2 Grouped by age

# calculation is similar to the overall survey metrics
# but it additionally groups the data by age categories, providing outputs for each age group within each tobacco use status combination

metrics_by_age = as_survey_rep(svy_design) %>%
  group_by(Age, TUSCPS_9stat, .drop = FALSE) %>% 
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  mutate(sex = "Overall") %>% ungroup()
metrics_by_age <- rename(metrics_by_age, age = Age)

metrics_by_age
## # A tibble: 54 × 10
##      age TUSCPS_9stat    `weighted count` `weighted count_se` `weighted percent`
##    <int> <fct>                      <dbl>               <dbl>              <dbl>
##  1     1 Never/Never              117605.              42322.            0.507  
##  2     1 Never/Former              20664.              15332.            0.0891 
##  3     1 Never/Current             14738.              14738.            0.0635 
##  4     1 Former/Never               7872.               5629.            0.0339 
##  5     1 Former/Former              1897.               1897.            0.00818
##  6     1 Former/Current                0                   0             0      
##  7     1 Current/Never             29331.              17660.            0.126  
##  8     1 Current/Former             9962.               9962.            0.0430 
##  9     1 Current/Current           29856.              28482.            0.129  
## 10     2 Never/Never              408645.              69467.            0.636  
## # ℹ 44 more rows
## # ℹ 5 more variables: `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>, sex <chr>

6.2.3 Grouped by age and sex

# calculation is similar to the overall survey metrics
# but it additionally groups the data by age and sex, providing outputs for each combination of sex, age, and tobacco use status within each tobacco use status combination

metrics_by_sex_age = as_survey_rep(svy_design) %>%
  group_by(Sex, Age, TUSCPS_9stat, .drop = FALSE) %>% 
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  ungroup()
metrics_by_sex_age <- rename(metrics_by_sex_age, sex = Sex)
metrics_by_sex_age <- rename(metrics_by_sex_age, age = Age)

metrics_by_sex_age
## # A tibble: 108 × 10
##      sex   age TUSCPS_9stat    `weighted count` `weighted count_se`
##    <int> <int> <fct>                      <dbl>               <dbl>
##  1     1     1 Never/Never               85087.              38085.
##  2     1     1 Never/Former                  0                   0 
##  3     1     1 Never/Current                 0                   0 
##  4     1     1 Former/Never               4557.               4557.
##  5     1     1 Former/Former                 0                   0 
##  6     1     1 Former/Current                0                   0 
##  7     1     1 Current/Never              5580.               5580.
##  8     1     1 Current/Former             9962.               9962.
##  9     1     1 Current/Current           28451.              28451.
## 10     1     2 Never/Never              162327.              51167.
## # ℹ 98 more rows
## # ℹ 5 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>

6.3 Process Data

# combine survey metrics into a data frame
data <- rbind.data.frame(metrics_all, metrics_by_age, metrics_by_sex_age)

# select relevant columns and recode for readability
TUSCPS_2022_metrics <- data %>%
  dplyr::select(TUSCPS_9stat, age, sex, 
                "weighted count", "unweighted_count", "weighted percent", 
                "weighted percent_se", "weighted percent_low", "weighted percent_upp") %>%
  # filters out rows with zero unweighted counts
  filter(unweighted_count > 0)  

# rename columns
TUSCPS_2022_metrics <- rename(TUSCPS_2022_metrics, cig_ecig = TUSCPS_9stat)
TUSCPS_2022_metrics <- rename(TUSCPS_2022_metrics, "unweighted count" = unweighted_count)

# recode sex and age variables
TUSCPS_2022_metrics$sex[TUSCPS_2022_metrics$sex %in% 2] = "Female"
TUSCPS_2022_metrics$sex[TUSCPS_2022_metrics$sex %in% 1] = "Male"

TUSCPS_2022_metrics$age[TUSCPS_2022_metrics$age %in% 1] = "18-24"
TUSCPS_2022_metrics$age[TUSCPS_2022_metrics$age %in% 2] = "25-34"
TUSCPS_2022_metrics$age[TUSCPS_2022_metrics$age %in% 3] = "35-44"
TUSCPS_2022_metrics$age[TUSCPS_2022_metrics$age %in% 4] = "45-54"
TUSCPS_2022_metrics$age[TUSCPS_2022_metrics$age %in% 5] = "55-64"
TUSCPS_2022_metrics$age[TUSCPS_2022_metrics$age %in% 6] = "65+"

# export the processed data
write.csv(TUSCPS_2022_metrics,"TUSCPS_2022_metrics.csv",row.names = F)

7 YRBSS

Youth Risk Behavior Surveillance System (YRBSS)
Youth Risk Behavior Survey (YRBS)

7.1 Prepare data

# remove all the objects present in the workspace
rm(list=ls())    

# load the data
# this sample data is a subset of YRBS 2021
YRBS_2021 <- read.csv("YRBS_2021_data.csv")

# tobacco product use status
## cigarette
## current: ever smoked cigarettes and at least 1 day in the past 30 days 
## former: ever used but did not smoke at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(YRBS_2021$cig_st, useNA = "always")
## 
## current  former missing   never    <NA> 
##     646    1735    3536   11156       0
## e-cigarette
## current: ever used and vaped at least 1 day in the past 30 days
## former: ever used but did not vape at least 1 day in the past 30 days
## never: never used
## missing: missing data
table(YRBS_2021$ecig_st, useNA = "always")
## 
## current  former missing   never    <NA> 
##    2892    2262    1243   10676       0
# Age: ~12, 13, 14, 15, 16, 17, 18+, Missing
table(YRBS_2021$Q1, useNA = "always")
## 
## 12 years old or younger            13 years old            14 years old 
##                      29                      61                    3391 
##            15 years old            16 years old            17 years old 
##                    4410                    4257                    3894 
##   18 years old or older                 Missing                    <NA> 
##                    1011                      20                       0
# Sex: Male, Female, Missing
table(YRBS_2021$sex, useNA = "always")
## 
##  Female    Male Missing    <NA> 
##    8128    8765     180       0
# Filter out rows with missing value
YRBS_2021 = YRBS_2021[YRBS_2021$Q1 != "Missing",]    # age
YRBS_2021 = YRBS_2021[!is.na(YRBS_2021$Q1),]
table(YRBS_2021$Q1, useNA="always")
## 
## 12 years old or younger            13 years old            14 years old 
##                      29                      61                    3391 
##            15 years old            16 years old            17 years old 
##                    4410                    4257                    3894 
##   18 years old or older                    <NA> 
##                    1011                       0
YRBS_2021 = YRBS_2021[YRBS_2021$sex != "Missing",]  # sex
YRBS_2021 = YRBS_2021[!is.na(YRBS_2021$sex),]
table(YRBS_2021$sex, useNA="always")
## 
## Female   Male   <NA> 
##   8121   8755      0
# generate a combination of tobacco product use status for cigarettes and e-cigarettes
# creates a vector that contains all possible combinations
cig_ecig_name=paste0(rep(c("Never","Former","Current"),each=3),"/",c("Never","Former","Current"))

# loop through all combinations and assign values:
YRBS_2021$YRBS_9stat=NA
k=1
for(i in c("never","former","current")){
  for(j in c("never","former","current")){
    YRBS_2021$YRBS_9stat[YRBS_2021$cig_st %in% i & YRBS_2021$ecig_st %in% j]=cig_ecig_name[k]
    k=k+1
  }
}

# for example, the loop will generate and assign the combination "Current/Former" to the YRBS_9stat column for observations where cigarettes are "Current" and e-cigarettes are "Former".
YRBS_2021$YRBS_9stat=factor(YRBS_2021$YRBS_9stat, levels=unique(cig_ecig_name))
YRBS_2021=YRBS_2021[!is.na(YRBS_2021$YRBS_9stat),]
table(YRBS_2021$YRBS_9stat,useNA = "always")
## 
##     Never/Never    Never/Former   Never/Current    Former/Never   Former/Former 
##            8329            1203             956             246             511 
##  Former/Current   Current/Never  Current/Former Current/Current            <NA> 
##             754              28              25             538               0

7.2 Calculate survey metrics

# set options for survey analysis
options(survey.lonely.psu = "adjust")   # adjusts for lonely primary sampling units (PSUs) by redistributing weights
options(warn = -1)                      # suppress warnings
options(digits = 20)                    # set 20 digits to display for numerical output

# Create a survey design object using the specified survey design parameters
svy_design <- svydesign(
  id = ~PSU,                           # primary sampling unit (PSU) variable
  strata = ~STRATUM,                   # variance stratum variable
  weights = ~WEIGHT,                   # sampling weights
  data = YRBS_2021,                    # your data set
  nest = TRUE                          # specify that the strata and PSUs are nested
)

7.2.1 Overall survey metrics

# calculate overall survey metrics by grouping the data by tobacco use status combinations
# compute weighted counts, percentages with 95% confidence intervals, and unweighted counts
# add overall indicators for sex and age

metrics_all = as_survey(svy_design) %>%   # convert the survey design object to a survey design object
  # group by tobacco use status combinations, and ensures that all levels are included
  group_by(YRBS_9stat, .drop = FALSE) %>% 
  # calculate the outputs for each group
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se","ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  # indicate that the metrics are for which group, here for the overall population
  mutate(sex = "Overall",age = "Overall") %>% ungroup()

metrics_all
## # A tibble: 9 × 10
##   YRBS_9stat      `weighted count` `weighted count_se` `weighted percent`
##   <fct>                      <dbl>               <dbl>              <dbl>
## 1 Never/Never               8931.               745.              0.658  
## 2 Never/Former              1335.               128.              0.0983 
## 3 Never/Current             1118.               118.              0.0823 
## 4 Former/Never               234.                32.8             0.0172 
## 5 Former/Former              546.                42.6             0.0402 
## 6 Former/Current             828.                79.0             0.0610 
## 7 Current/Never               27.0                8.50            0.00199
## 8 Current/Former              34.3                7.10            0.00252
## 9 Current/Current            528.                58.6             0.0389 
## # ℹ 6 more variables: `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>, sex <chr>, age <chr>

7.2.2 Grouped by age

# calculation is similar to the overall survey metrics
# but it additionally groups the data by age categories, providing outputs for each age group within each tobacco use status combination

metrics_by_age = as_survey(svy_design) %>%
  group_by(Q1, YRBS_9stat, .drop = FALSE) %>% 
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  mutate(sex = "Overall") %>% ungroup()
metrics_by_age <- rename(metrics_by_age, age = Q1)

metrics_by_age
## # A tibble: 63 × 10
##    age        YRBS_9stat `weighted count` `weighted count_se` `weighted percent`
##    <chr>      <fct>                 <dbl>               <dbl>              <dbl>
##  1 12 years … Never/Nev…            2.87                1.68              0.553 
##  2 12 years … Never/For…            0                   0                 0     
##  3 12 years … Never/Cur…            0                   0                 0     
##  4 12 years … Former/Ne…            0                   0                 0     
##  5 12 years … Former/Fo…            0.410               0.410             0.0790
##  6 12 years … Former/Cu…            0                   0                 0     
##  7 12 years … Current/N…            0                   0                 0     
##  8 12 years … Current/F…            0                   0                 0     
##  9 12 years … Current/C…            1.91                1.13              0.368 
## 10 13 years … Never/Nev…           36.5                 8.16              0.815 
## # ℹ 53 more rows
## # ℹ 5 more variables: `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>, sex <chr>

7.2.3 Grouped by age and sex

# calculation is similar to the overall survey metrics
# but it additionally groups the data by age and sex, providing outputs for each combination of sex, age, and tobacco use status within each tobacco use status combination

metrics_by_sex_age = as_survey(svy_design) %>%
  group_by(sex, Q1, YRBS_9stat, .drop = FALSE) %>% 
  summarize("weighted count" = survey_total(),
            "weighted percent" = survey_mean(vartype = c("se", "ci"), level = 0.95),
            "unweighted_count" = unweighted(n())) %>%
  ungroup()
metrics_by_sex_age <- rename(metrics_by_sex_age, sex = sex)
metrics_by_sex_age <- rename(metrics_by_sex_age, age = Q1)

metrics_by_sex_age
## # A tibble: 126 × 10
##    sex    age                    YRBS_9stat `weighted count` `weighted count_se`
##    <chr>  <chr>                  <fct>                 <dbl>               <dbl>
##  1 Female 12 years old or young… Never/Nev…            0                   0    
##  2 Female 12 years old or young… Never/For…            0                   0    
##  3 Female 12 years old or young… Never/Cur…            0                   0    
##  4 Female 12 years old or young… Former/Ne…            0                   0    
##  5 Female 12 years old or young… Former/Fo…            0.410               0.410
##  6 Female 12 years old or young… Former/Cu…            0                   0    
##  7 Female 12 years old or young… Current/N…            0                   0    
##  8 Female 12 years old or young… Current/F…            0                   0    
##  9 Female 12 years old or young… Current/C…            1.16                1.05 
## 10 Female 13 years old           Never/Nev…           23.5                 6.68 
## # ℹ 116 more rows
## # ℹ 5 more variables: `weighted percent` <dbl>, `weighted percent_se` <dbl>,
## #   `weighted percent_low` <dbl>, `weighted percent_upp` <dbl>,
## #   unweighted_count <int>

7.3 Process Data

# combine survey metrics into a data frame
YRBS_2021_metrics<-rbind.data.frame(metrics_all,metrics_by_age,metrics_by_sex_age)

# select relevant columns and recode for readability
YRBS_2021_metrics<-YRBS_2021_metrics %>%
  dplyr::select(YRBS_9stat,age,sex,"weighted count","unweighted_count","weighted percent","weighted percent_se","weighted percent_low","weighted percent_upp") %>%
  # filter to keep only rows where unweighted_count > 0
  filter(unweighted_count>0)

YRBS_2021_metrics<-rename(YRBS_2021_metrics,cig_ecig=YRBS_9stat)
YRBS_2021_metrics<-rename(YRBS_2021_metrics,"unweighted count"=unweighted_count)

# recode sex and age variables if needed
# export the processed data
write.csv(YRBS_2021_metrics,"YRBS_2021_metrics.csv",row.names = T)