### Survey Weighted NHANES Logistic Regression
######################################################################################################################################################

## Script Sources: 
####https://stats.idre.ucla.edu/r/seminars/survey-data-analysis-with-r/
###  https://www.extendoffice.com/documents/excel/5168-excel-categorize-data-based-on-values.html
#### McConville K. Visualizing a categorical variable. In: ANALYZING SURVEY DATA IN R. Datacamp. 
###############https://stats.stackexchange.com/questions/475482/failed-to-obtain-standard-errors-of-predictors-using-svyglm-in-r-package-surv
##### https://cran.r-project.org/web/packages/survey/survey.pdf
#### https://www.analyticsvidhya.com/blog/2016/03/tutorial-powerful-packages-imputing-missing-values/
#### https://www.reddit.com/r/Rlanguage/comments/n3lrp3/warning_when_running_weighted_logistic_regression/
#####https://datascienceplus.com/handling-missing-data-with-mice-package-a-simple-approach/
#######http://juliejosse.com/wp-content/uploads/2018/06/DataAnalysisMissingR.html#inspect_the_imputed_values
########https://stefvanbuuren.name/fimd/ch-analysis.html
######https://www.statmethods.net/graphs/boxplot.html
###########https://r-survey.r-forge.r-project.org/survey/html/regTermTest.html
####https://github.com/cran/survey/blob/master/man/svyglm.Rd
######https://rstudio-pubs-static.s3.amazonaws.com/58200_229a4cf1bf6144eeb7992206ba45a6d5.html
#########https://strengejacke.github.io/sjPlot/articles/plot_model_estimates.html
#############https://cran.r-project.org/web/packages/ROCit/vignettes/my-vignette.html

library("ROCit")
library("aplpack")
library("tidyverse")
library("dbplyr")
library("ggplot2")
library("mice")
library("magrittr")
library("survey")
library("VIM")
library("sjlabelled")
library("sjmisc")
library("sjPlot")


#### This script is designed for all categorical variables##########################################
#######################################################################################################################################################
######################################################################################################################################

### Download  data from NHANES
### SPSS--> Excel--> Code data on Excel
### see https://www.extendoffice.com/documents/excel/5168-excel-categorize-data-based-on-values.html

####### Upload data from Excel#########################################################################################################

nhanes <- LRC2 %>% mutate_if(is.character,as.factor)
class(nhanes)
print(nhanes)

######### Summary of Data################
summary(nhanes)

###########Structure of the Data############
str(nhanes)

######## Head of structure################
head(nhanes)

############Tail of structure############
tail(nhanes)

######Determine Column Names###############

colnames(nhanes)


################################# remove Nas if necessary ################################################################################
DF <- na.omit(nhanes)
DF
str(DF)
summary(DF)

#########  Survey Data Equation Design###############################################
nhc <- svydesign(id=~SDMVPSU, weights=~WTSA2YR, strata=~SDMVSTRA, nest=TRUE, survey.lonely.psu = "adjust", data= DF)
nhc
summary(nhc)

##################Exploring Categorical Data################################################################################################################
###1) Create Contingency Tables

#Outcome Variable
taba <- svytable(~HSCRP, design = nhc)
taba

#Adjusting Variables

tabb <- svytable(~ BMI, design = nhc)
tabb
tabc<- svytable(~ Ethnicity, design = nhc)
tabc
tabd<- svytable(~ Age, design = nhc)
tabd
tabe<- svytable(~ Gender, design = nhc)
tabe
tabf<- svytable(~ Folate, design = nhc)
tabf

##Exposure Variables
tabg<- svytable(~ Serum_Zinc , design = nhc)
tabg
tabh<- svytable(~ Alcohol_Consumption , design = nhc)
tabh

###2) Construct Frequency Tables
tabab <- svytable(~HSCRP + BMI, design = nhc)
tabab

tabac <- svytable(~HSCRP + Ethnicity, design = nhc)
tabac

tabad <- svytable(~HSCRP + Age, design = nhc)
tabad

tabae<- svytable(~HSCRP + Gender, design = nhc)
tabae

tabaf<- svytable(~HSCRP + Folate, design = nhc)
tabaf

tabag<- svytable(~HSCRP + Serum_Zinc, design = nhc)
tabag

tabah <- svytable(~HSCRP + Alcohol_Consumption, design = nhc)
tabah

#### 3) Create Segmented Bar Graphs w/ conditional proportions

tabab_cond <- tabab %>% 
  as.data.frame() %>% 
  group_by(BMI) %>% 
  mutate(n_BMI = sum(Freq), Prop_HSCRP = Freq/n_BMI) %>% 
  ungroup()

tabac_cond <- tabac %>% 
  as.data.frame() %>% 
  group_by(Ethnicity) %>% 
  mutate(n_Ethnicity = sum(Freq), Prop_HSCRP = Freq/n_Ethnicity) %>% 
  ungroup()

tabad_cond <- tabad %>% 
  as.data.frame() %>% 
  group_by(Age) %>% 
  mutate(n_Age = sum(Freq), Prop_HSCRP = Freq/n_Age) %>% 
  ungroup()

tabae_cond <- tabae %>% 
  as.data.frame() %>% 
  group_by(Gender) %>% 
  mutate(n_Gender = sum(Freq), Prop_HSCRP = Freq/n_Gender) %>% 
  ungroup()

tabaf_cond <- tabaf %>% 
  as.data.frame() %>% 
  group_by(Folate) %>% 
  mutate(n_Folate = sum(Freq), Prop_HSCRP = Freq/n_Folate) %>% 
  ungroup()


tabag_cond <- tabag %>% 
  as.data.frame() %>% 
  group_by(Serum_Zinc) %>% 
  mutate(n_Serum_Zinc = sum(Freq), Prop_HSCRP = Freq/n_Serum_Zinc) %>% 
  ungroup()


tabah_cond <- tabah %>% 
  as.data.frame() %>% 
  group_by(Alcohol_Consumption) %>% 
  mutate(n_Alcohol_Consumption = sum(Freq), Prop_HSCRP = Freq/n_Alcohol_Consumption) %>% 
  ungroup()

ggplot(data = tabab_cond,
       mapping = aes(x = BMI, y = Prop_HSCRP, fill = HSCRP)) + 
  geom_col() + 
  coord_flip()

ggplot(data = tabac_cond,
       mapping = aes(x = Ethnicity, y = Prop_HSCRP, fill = HSCRP)) + 
  geom_col() + 
  coord_flip()

ggplot(data = tabad_cond,
       mapping = aes(x = Age, y = Prop_HSCRP, fill = HSCRP)) + 
  geom_col() + 
  coord_flip()

ggplot(data = tabae_cond,
       mapping = aes(x = Gender, y = Prop_HSCRP, fill = HSCRP)) + 
  geom_col() + 
  coord_flip()

ggplot(data = tabaf_cond,
       mapping = aes(x = Folate, y = Prop_HSCRP, fill = HSCRP)) + 
  geom_col() + 
  coord_flip()

ggplot(data = tabag_cond,
       mapping = aes(x = Serum_Zinc, y = Prop_HSCRP, fill = HSCRP)) + 
  geom_col() + 
  coord_flip()

ggplot(data = tabah_cond,
       mapping = aes(x = Alcohol_Consumption, y = Prop_HSCRP, fill = HSCRP)) + 
  geom_col() + 
  coord_flip()

##### 4) Estimate the Total for Combinations of X (Exposure, Adjusting) & Y (Outcome)
tab_totals1 <- svytotal(x= ~interaction(BMI, HSCRP),
                        design= nhc,
                        na.rm = TRUE)
tab_totals1

tab_totals2 <- svytotal(x= ~interaction(Ethnicity, HSCRP),
                        design= nhc,
                        na.rm = TRUE)
tab_totals2

tab_totals3 <- svytotal(x= ~interaction(Age, HSCRP),
                        design= nhc,
                        na.rm = TRUE)
tab_totals3

tab_totals4 <- svytotal(x= ~interaction(Gender, HSCRP),
                        design= nhc,
                        na.rm = TRUE)
tab_totals4



tab_totals5 <- svytotal(x= ~interaction(Folate, HSCRP),
                        design= nhc,
                        na.rm = TRUE)

tab_totals5


tab_totals6 <- svytotal(x= ~interaction(Serum_Zinc, HSCRP),
                        design= nhc,
                        na.rm = TRUE)
tab_totals6



tab_totals7 <- svytotal(x= ~interaction(Alcohol_Consumption, HSCRP),
                        design= nhc,
                        na.rm = TRUE)
tab_totals7


##### 5) Estimate Survey Means X (Exposure, Adjusting) & Y (Outcomes)

tab_mean1 <- svymean(x= ~interaction(BMI, HSCRP),
                     design= nhc,
                     na.rm = TRUE)
tab_mean1


tab_mean2 <- svymean(x= ~interaction(Ethnicity, HSCRP),
                     design= nhc,
                     na.rm = TRUE)
tab_mean2

tab_mean3 <- svymean(x= ~interaction(Age, HSCRP),
                     design= nhc,
                     na.rm = TRUE)
tab_mean3

tab_mean4 <- svymean(x= ~interaction(Gender, HSCRP),
                     design= nhc,
                     na.rm = TRUE)
tab_mean4


tab_mean5 <- svymean(x= ~interaction(Folate, HSCRP),
                     design= nhc,
                     na.rm = TRUE)

tab_mean5


tab_mean6 <- svymean(x= ~interaction(Serum_Zinc, HSCRP),
                     design= nhc,
                     na.rm = TRUE)
tab_mean6



tab_mean7 <- svymean(x= ~interaction(Alcohol_Consumption, HSCRP),
                     design= nhc,
                     na.rm = TRUE)
tab_mean7


########## 6) Test for significance via Chi squared, Individual Variables

chi1 <- svychisq(~BMI +HSCRP,
                 design= nhc,
                 statisitic= "Chisq")
chi1

chi2 <- svychisq(~ Ethnicity +HSCRP,
                 design= nhc,
                 statisitic= "Chisq")
chi2

chi3 <- svychisq(~ Age +HSCRP,
                 design= nhc,
                 statisitic= "Chisq")
chi3

chi4 <- svychisq(~ Gender +HSCRP,
                 design= nhc,
                 statisitic= "Chisq")
chi4

chi5 <- svychisq(~ Folate +HSCRP,
                 design= nhc,
                 statisitic= "Chisq")
chi5

chi6 <- svychisq(~ Serum_Zinc +HSCRP,
                 design= nhc,
                 statisitic= "Chisq")
chi6

chi7 <- svychisq(~ Alcohol_Consumption +HSCRP,
                 design= nhc,
                 statisitic= "Chisq")
chi7


#################################################################################################################################################################

############### Build Models, Raw weighted data, Logistic regression######################################################################################################################################

### Equation  
nhc <- svydesign(id=~SDMVPSU, weights=~WTSA2YR, strata=~SDMVSTRA, nest=TRUE, survey.lonely.psu = "adjust", data= DF)
str(nhc)   
#Summs
model1 <- svyglm(HSCRP ~ Serum_Zinc,design= nhc,rescale = TRUE, family = quasibinomial())
summary(model1, df= degf(nhc))
confint(model1)
summ(model1,confint=TRUE, n.sd=1, digits= 6)


#### Equation 
model3 <- svyglm(HSCRP ~ factor(Folate) + factor(Gender) + factor(Age) + factor(Ethnicity) + factor(BMI) + factor(Serum_Zinc) +
                   factor(Alcohol_Consumption), design= nhc,rescale = TRUE, family = quasibinomial())
### Summs
summary (model3, df= degf(nhc))
confint(model3, df= degf(nhc))
summ(model3,confint=TRUE, n.sd=1, digits= 6)

#### Equation
model4 <- svyglm(HSCRP ~ relevel(Folate,"Excessive") + relevel(Gender, "Female") + relevel(Age,"Elderly") + relevel(Ethnicity, "Asian") + relevel(BMI,"Normal Weight") + relevel(Serum_Zinc,"Deficient") +
                   relevel(Alcohol_Consumption,"Excessive Consumption"), design= nhc,rescale = TRUE, family = quasibinomial())

###Summs
summary (model4, df= degf(nhc))
confint(model4, df= degf(nhc))
summ(model4,confint=TRUE, n.sd=1, digits= 6)


### Equation 

model5 <- svyglm(HSCRP ~ factor(Folate) + factor(Gender) +  factor(Ethnicity) + factor(Serum_Zinc)*
                   factor(Alcohol_Consumption), design= nhc,rescale = TRUE, family = quasibinomial())
### Summs
summary (model5, df= degf(nhc))
confint(model5, df= degf(nhc))
summ(model5,confint=TRUE, n.sd=1, digits= 6)


### Equation


model6 <- svyglm(HSCRP ~ factor(Gender) +  factor(Age) + factor(Serum_Zinc)*
                   factor(Alcohol_Consumption), design= nhc,rescale = TRUE, family = quasibinomial())

### Summs
summary (model6, df= degf(nhc))
confint(model6, df= degf(nhc))
summ(model6,confint=TRUE, n.sd=1, digits= 6)

####### ROC  and Descriptive Stats on Separate script on separate script####################################