with(mtcars, {
  nokeepstats <- summary(mpg)
  keepstats <<- summary(mpg)
})
# keepstats is a global variable; nokeepstats is not

source("filename")  # run a set of R statements contained in a file

# text output
sink("filename", append=TRUE, split=TRUE)
# append=TRUE appends rather than overwrites;
# split=TRUE sends output to both the screen and the output file
sink()  # return the output to the screen alone

diabetes <- c("Type1", "Type2", "Type1", "Type1")
diabetes <- factor(diabetes)  # nominal variable
# stores this vector as (1, 2, 1, 1) and associates it with
# 1 = Type1, 2 = Type2 (levels are alphabetical by default)

status <- c("Poor", "Improved", "Excellent", "Poor")
status <- factor(status, ordered=TRUE,
                 levels=c("Poor", "Improved", "Excellent"))
# encodes the vector as (1, 2, 3, 1), with the level order given explicitly;
# labels= can also be used to attach labels to the levels

str(object)  # provides information on an object in R

# lists
g <- "My First List"
h <- c(25, 26, 18, 39)
j <- matrix(1:10, nrow=5)
k <- c("one", "two", "three")
mylist <- list(title=g, ages=h, j, k)
mylist[[1]]  # or mylist[["title"]] or mylist$title

numeric(0); character(0)  # vectors of length 0

# connections (see ?file for more information)
file()  # allows the user to access files
gzfile(); bzfile(); xzfile(); unz()  # let the user read compressed files
url()   # lets you access internet files

# load packages, installing any that are missing
ipak <- function(pkg){
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg))
    install.packages(new.pkg, dependencies=TRUE)
  sapply(pkg, require, character.only=TRUE)
}
packages <- c("plyr", "dplyr", "reshape2")
ipak(packages)

# download a file to load
if (!file.exists("data")) dir.create("data")  # create a directory for the data
fileUrl <- "https://adsfa"
download.file(fileUrl, destfile="./data/name", method="curl")
dataDownloaded <- date()
# current download methods are "internal", "wininet" (Windows only),
# "libcurl", "wget", and "curl"; the value "auto" chooses one automatically

# importing data from Excel with the gdata package
library(gdata)
dataset <- read.xls("file", sheet=1, header=TRUE)

rm()  # delete one or more objects

pdf("filename.pdf")  # save the graph as a PDF; or png("filename.png") for PNG
dev.off()            # shuts down the specified (by default the current) device

manager <- c(1, 2, 3, 4, 5)
date    <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09")
country <- c("US", "US", "UK", "UK", "UK")
gender  <- c("M", "F", "F", "M", "F")
age     <- c(32, 45, 25, 39, 99)
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)
leadership <- data.frame(manager, date, country, gender, age,
                         q1, q2, q3, q4, q5, stringsAsFactors=FALSE)

# within() is similar to with(), but allows you to modify the data frame
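# For example, a minimal sketch using the leadership data above
# (agecat is a new variable added here just for illustration):
leadership <- within(leadership, {
  age[age == 99] <- NA                       # 99 was used as a missing-value code
  agecat <- NA
  agecat[age > 75] <- "Elder"
  agecat[age >= 55 & age <= 75] <- "Middle Aged"
  agecat[age < 55] <- "Young"
})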
na.omit()  # deletes any rows with missing data

# date variables: the default input format is yyyy-mm-dd
strDates <- c("01/05/1965", "08/16/1975")
dates <- as.Date(strDates, "%m/%d/%Y")
Sys.Date()  # returns today's date
date()      # returns the current date and time
format(x, format="output_format")
today <- Sys.Date()
format(today, format="%B %d %Y")
format(today, format="%A")
# outputs dates in a specified format and extracts portions of dates
difftime()  # calculates a time interval and expresses it
            # as seconds, minutes, hours, days, or weeks
today <- Sys.Date()
dob <- as.Date("1956-10-12")
difftime(today, dob, units="weeks")  # Time difference of 2825 weeks

# sorting data (ascending by default)
newdata <- leadership[order(leadership$gender, leadership$age), ]

# merging datasets (inner join)
total <- merge(dataframeA, dataframeB, by="ID")

# NULL (undefined) isn't the same as NA (missing)

newdata <- subset(leadership, age >= 35 | age < 24,
                  select=c(q1, q2, q3, q4))
# select all rows with age >= 35 or age < 24, keeping variables q1 through q4

sample()  # takes a random sample (with or without replacement)
          # of size n from a dataset

# using SQL statements to manipulate data frames
library(sqldf)
newdf <- sqldf("select * from mtcars where carb=1 order by mpg",
               row.names=TRUE)

scale()  # standardizes the specified columns of a matrix or data frame
         # to a mean of 0 and a standard deviation of 1

library(MASS)
mvrnorm(n, mean, sigma)
# draws data from a multivariate normal distribution with a given
# mean vector and covariance matrix

# working with character variables
nchar(x)                # counts the number of characters in x
substr(x, start, stop)  # extracts or replaces substrings in a character vector
grep(pattern, x, ignore.case=FALSE, fixed=FALSE)
# searches for pattern in x; if fixed=FALSE, pattern is a regular expression,
# if fixed=TRUE, pattern is a literal text string; returns matching indices
sub(pattern, replacement, x, ignore.case=FALSE, fixed=FALSE)
# finds pattern in x and substitutes the replacement text;
# fixed= works as in grep()
gsub()  # replaces all matches of the pattern, not just the first
strsplit(x, split, fixed=FALSE)  # splits the elements of character vector x at split
y <- strsplit("abc", "")
unlist(y)[2]; sapply(y, "[", 2)
# both return the 2nd element of y ("[" is the selection function)

paste(..., sep="")  # concatenates strings, separated by the sep string
toupper(x); tolower(x)
cat(..., file="", sep=" ")
cut(x, n)  # divides continuous variable x into a factor with n levels

# aggregation and restructuring
aggregate(data[variable.names], by, FUN)
# where the first argument is the data object to be collapsed,
# by is a list of variables that will be crossed to form the new observations,
# and FUN is the scalar function used to calculate the summary statistics
# that make up the new observation values

install.packages("reshape")
library(reshape)
md <- melt(mydata, id=c("id", "time"))
melt(data)  # puts all the measured values in one column
cast(md, formula, FUN)

prop.table(mytable)  # turns a frequency table into proportions

# chi-square test of independence
chisq.test(table)
# Fisher's exact test of independence
fisher.test(table)
# Cochran-Mantel-Haenszel test
mantelhaen.test(table)
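# A minimal sketch of these tests, using the Arthritis data from the
# vcd package (assumes vcd is installed):
library(vcd)
mytable <- xtabs(~ Treatment + Improved, data=Arthritis)
prop.table(mytable, 1)  # row proportions
chisq.test(mytable)     # are treatment and improvement independent?
fisher.test(mytable)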
# The significance tests above evaluate whether sufficient evidence exists
# to reject a null hypothesis of independence between variables. If you can
# reject the null hypothesis, your interest turns naturally to measures of
# association that gauge the strength of the relationships present.
# The phi coefficient is a measure of association for two binary variables.
assocstats(table)  # phi, contingency coefficient, and Cramer's V (vcd package)

# The Pearson product-moment correlation assesses the degree of linear
# relationship between two quantitative variables. Spearman's rank-order
# correlation assesses the degree of relationship between two rank-ordered
# variables. Kendall's tau is also a nonparametric measure of rank correlation.
cor(x, y=NULL, use="everything",
    method=c("pearson", "kendall", "spearman"))

# A partial correlation is a correlation between two quantitative variables,
# controlling for one or more other quantitative variables.
library(ggm)
# partial correlation of population and murder rate, controlling
# for income, illiteracy rate, and HS graduation rate
# (assumes states comes from the built-in state.x77 matrix, e.g.
#  states <- state.x77[, 1:6])
pcor(c(1, 5, 2, 3, 6), cov(states))

# testing correlations for significance
cor.test(x, y, alternative=c("two.sided", "less", "greater"),
         method=c("pearson", "kendall", "spearman"),
         exact=NULL, conf.level=0.95)

# t-test
t.test(x, y=NULL, alternative=c("two.sided", "less", "greater"),
       mu=0, paired=FALSE, var.equal=FALSE, conf.level=0.95)

# nonparametric tests of group differences:
# if the two groups are independent, use the Wilcoxon rank-sum test
# (more popularly known as the Mann-Whitney U test) to assess whether
# the observations are sampled from the same probability distribution
wilcox.test(x, y=NULL, alternative=c("two.sided", "less", "greater"),
            mu=0, paired=FALSE, exact=NULL, correct=TRUE,
            conf.int=FALSE, conf.level=0.95)

# nonparametric equivalents of ANOVA
kruskal.test(y ~ A, data)
friedman.test(y ~ A | B, data)
# where y is the numeric outcome variable, A is a grouping variable,
# and B is a blocking variable that identifies matched observations

# linear regression
myfit <- lm(formula, data)
# :   denotes an interaction between predictor variables; a prediction of y
#     from x, z, and the interaction between x and z is coded y ~ x + z + x:z
# *   a shortcut for all possible interactions; y ~ x * z * w expands to
#     y ~ x + z + w + x:z + x:w + z:w + x:z:w
# ^   denotes interactions up to a specified degree; y ~ (x + z + w)^2
#     expands to y ~ x + z + w + x:z + x:w + z:w
# -   removes a variable from the equation; y ~ (x + z + w)^2 - x:w
#     expands to y ~ x + z + w + x:z + z:w
# -1  suppresses the intercept; y ~ x - 1 fits a regression of y on x
#     and forces the line through the origin at x = 0
# I() elements within the parentheses are interpreted arithmetically;
#     y ~ x + (z + w)^2 expands to y ~ x + z + w + z:w, whereas
#     y ~ x + I((z + w)^2) expands to y ~ x + h, where h is a new
#     variable created by squaring the sum of z and w

summary(); coefficients(); confint(); fitted(); residuals()
anova(); vcov(); AIC(); plot(); predict()

# multiple linear regression
cor()  # examine the relationships among the variables two at a time
library(car)
scatterplotMatrix(states, spread=FALSE, lty.smooth=2,
                  main="Scatter Plot Matrix")

# A significant interaction between two predictor variables tells you that
# the relationship between one predictor and the response variable depends
# on the level of the other predictor.
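# For instance, a minimal sketch with the built-in mtcars data:
fit <- lm(mpg ~ hp + wt + hp:wt, data=mtcars)
summary(fit)  # a significant hp:wt term means the mpg-hp slope varies with wt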
# regression diagnostics (plots)
fit <- lm(weight ~ height, data=women)
par(mfrow=c(2,2))
plot(fit)

# multicollinearity (variance inflation factor):
# as a general rule, sqrt(vif) > 2 indicates a multicollinearity problem
library(car)
vif(fit)

# Outliers are observations that aren't predicted well by the model;
# they have unusually large positive or negative residuals.
# Points in the Q-Q plot that lie outside the confidence band are
# considered outliers. A rough rule of thumb is that standardized
# residuals larger than 2 or smaller than -2 are worth attention.
library(car)
outlierTest(fit)
# reports the Bonferroni-adjusted p-value for the largest absolute
# studentized residual

# Observations with high leverage are outliers with regard to the other predictors.
hat.plot <- function(fit) {
  p <- length(coefficients(fit))
  n <- length(fitted(fit))
  plot(hatvalues(fit), main="Index Plot of Hat Values")
  abline(h=c(2,3)*p/n, col="red", lty=2)  # 2 and 3 times the average hat value p/n
  identify(1:n, hatvalues(fit), names(hatvalues(fit)))
}
hat.plot(fit)

# Influential observations have a disproportionate impact on the values of
# the model parameters; identify them with Cook's distance (the D statistic)
# and added-variable plots. Roughly speaking, Cook's D values greater than
# 4/(n-k-1), where n is the sample size and k is the number of predictor
# variables, indicate influential observations.
# (the following assumes the states regression, e.g.
states <- as.data.frame(state.x77[, c("Murder", "Population",
                                      "Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
# )
cutoff <- 4/(nrow(states) - length(fit$coefficients) - 2)
plot(fit, which=4, cook.levels=cutoff)
abline(h=cutoff, lty=2, col="red")

library(car)
influencePlot(fit, id.method="identify", main="Influence Plot",
              sub="Circle size is proportional to Cook's distance")

# Box-Cox transformation to normality
library(car)
summary(powerTransform(states$Murder))

# backward stepwise selection
library(MASS)
fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
stepAIC(fit1, direction="backward")

# relative importance: standardized regression coefficients
zstates <- as.data.frame(scale(states))
zfit <- lm(Murder ~ Population + Income + Illiteracy + Frost, data=zstates)
coef(zfit)

# relative weights
relweights <- function(fit, ...){
  R <- cor(fit$model)
  nvar <- ncol(R)
  rxx <- R[2:nvar, 2:nvar]        # predictor intercorrelations
  rxy <- R[2:nvar, 1]             # predictor-criterion correlations
  svd <- eigen(rxx)
  evec <- svd$vectors
  ev <- svd$values
  delta <- diag(sqrt(ev))
  lambda <- evec %*% delta %*% t(evec)
  lambdasq <- lambda ^ 2
  beta <- solve(lambda) %*% rxy
  rsquare <- colSums(beta ^ 2)
  rawwgt <- lambdasq %*% beta ^ 2
  import <- (rawwgt / rsquare) * 100
  lbls <- names(fit$model[2:nvar])
  rownames(import) <- lbls
  colnames(import) <- "Weights"
  barplot(t(import), names.arg=lbls,
          ylab="% of R-Square", xlab="Predictor Variables",
          main="Relative Importance of Predictor Variables",
          sub=paste("R-Square =", round(rsquare, digits=3)), ...)
  return(import)
}
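# Hypothetical usage, reusing the states regression fit from above:
relweights(fit, col="lightgrey")
# the plot shows each predictor's percentage contribution to R-square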
# ANOVA
# A factor is called a between-groups factor when patients are assigned to
# one and only one of its levels; it's called a within-groups factor when
# each patient is measured under every level.

# formula designs for ANOVA:
# one-way ANOVA:                            y ~ A
# one-way ANCOVA with one covariate:        y ~ x + A
# two-way factorial ANOVA:                  y ~ A * B
# two-way factorial ANCOVA, two covariates: y ~ x1 + x2 + A * B
# randomized block:                         y ~ B + A   (B is a blocking factor)
# one-way within-groups ANOVA:              y ~ A + Error(Subject/A)
# repeated measures with one within-groups factor (W)
#   and one between-groups factor (B):      y ~ B * W + Error(Subject/W)

# one-way ANOVA (cholesterol data from the multcomp package)
fit <- aov(response ~ trt, data=cholesterol)
summary(fit)

# plot group means with confidence intervals
library(gplots)
plotmeans(response ~ trt, data=cholesterol, xlab="Treatment",
          ylab="Response", main="Mean Plot\nwith 95% CI")

# multiple comparisons: Tukey HSD pairwise group comparisons
TukeyHSD(fit)
# if HH is loaded, TukeyHSD() will fail; in that case use detach("package:HH")
par(las=2)
par(mar=c(5,8,4,2))
plot(TukeyHSD(fit))

# or plot the comparisons as box plots with letter groupings
library(multcomp)
par(mar=c(5,4,6,2))
tuk <- glht(fit, linfct=mcp(trt="Tukey"))
plot(cld(tuk, level=.05), col="lightgrey")

# assessing test assumptions
# normality
library(car)
qqPlot(lm(response ~ trt, data=cholesterol), simulate=TRUE,
       main="Q-Q Plot", labels=FALSE)
# equality of variances
bartlett.test(response ~ trt, data=cholesterol)
# outliers
library(car)
outlierTest(fit)

# one-way ANCOVA (litter data from the multcomp package)
fit <- aov(weight ~ gesttime + dose, data=litter)
summary(fit)
# obtain adjusted group means, i.e. the group means after partialing
# out the effect of the covariate
library(effects)
effect("dose", fit)

# multiple comparisons employing user-supplied contrasts
library(multcomp)
contrast <- rbind("no drug vs. drug" = c(3, -1, -1, -1))
summary(glht(fit, linfct=mcp(dose=contrast)))

# standard ANCOVA designs assume homogeneity of regression slopes
library(multcomp)
fit2 <- aov(weight ~ gesttime*dose, data=litter)
summary(fit2)
# a significant interaction would imply that the relationship between
# gestation and birth weight depends on the level of the dose variable

# visualizing ANCOVA
library(HH)
ancova(weight ~ gesttime + dose, data=litter)

# two-way factorial ANOVA
attach(ToothGrowth)
fit <- aov(len ~ supp*dose)
summary(fit)
# visualizing the interaction
interaction.plot(dose, supp, len, type="b",
                 col=c("red","blue"), pch=c(16, 18),
                 main="Interaction between Dose and Supplement Type")
# or
library(gplots)
plotmeans(len ~ interaction(supp, dose, sep=" "),
          connect=list(c(1,3,5), c(2,4,6)),
          col=c("red", "darkgreen"),
          main="Interaction Plot with 95% CIs",
          xlab="Treatment and Dose Combination")
# or
library(HH)
interaction2wt(len ~ supp*dose)

# repeated measures ANOVA
w1b1 <- subset(CO2, Treatment=='chilled')
fit <- aov(uptake ~ conc*Type + Error(Plant/(conc)), w1b1)
summary(fit)
# visualizing the result
par(las=2)
par(mar=c(10,4,4,2))
with(w1b1, interaction.plot(conc, Type, uptake, type="b",
     col=c("red","blue"), pch=c(16,18),
     main="Interaction Plot for Plant Type and Concentration"))
boxplot(uptake ~ Type*conc, data=w1b1, col=c("gold", "green"),
        main="Chilled Quebec and Mississippi Plants",
        ylab="Carbon dioxide uptake rate (umol/m^2 sec)")

# multivariate analysis of variance (MANOVA)
library(MASS)
attach(UScereal)
shelf <- factor(shelf)  # treat shelf as a grouping factor
y <- cbind(calories, fat, sugars)
aggregate(y, by=list(shelf), FUN=mean)
fit <- manova(y ~ shelf)
summary(fit)
summary.aov(fit)  # univariate one-way ANOVAs for each outcome

# robust MANOVA
library(rrcov)
Wilks.test(y, shelf, method="mcd")
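# One way to check the multivariate-normality assumption behind the MANOVA
# above is a chi-square Q-Q plot of Mahalanobis distances (a sketch, reusing
# the y matrix defined above):
center <- colMeans(y)
n <- nrow(y); p <- ncol(y)
d <- mahalanobis(y, center, cov(y))
qqplot(qchisq(ppoints(n), df=p), d,
       main="Q-Q Plot Assessing Multivariate Normality",
       xlab="Chi-Square Quantiles", ylab="Mahalanobis D2")
abline(a=0, b=1)  # points far above the line suggest non-normality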
# power analysis
# In planning research, the researcher typically pays special attention to
# four quantities: sample size, significance level, power, and effect size.
# Sample size is the number of observations in each condition/group of the
# experimental design.
# The significance level (also referred to as alpha) is the probability of
# making a Type I error; it can be thought of as the probability of finding
# an effect that is not there.
# Power is one minus the probability of making a Type II error; it can be
# thought of as the probability of finding an effect that is there.
# Effect size is the magnitude of the effect under the alternative (research)
# hypothesis; its formula depends on the statistical methodology employed in
# the hypothesis testing.
# Given any three of these quantities, you can calculate the fourth.

# power of a t-test (equal sample sizes)
library(pwr)
pwr.t.test(n=, d=, sig.level=, power=, alternative=, type=)
# d is the effect size, defined as the standardized mean difference:
# d = (u1 - u2) / sigma
# if the sample sizes for the two groups are unequal:
pwr.t2n.test(n1=, n2=, d=, sig.level=, power=, alternative=)

# power analysis for a balanced one-way analysis of variance
pwr.anova.test(k=, n=, f=, sig.level=, power=)
# for a one-way ANOVA, effect size is measured by f:
# f = sqrt(sum(n/N * (ui - u)^2 / sigma^2))

# power analysis for a correlation test
pwr.r.test(n=, r=, sig.level=, power=, alternative=)
# r is the effect size (a linear correlation coefficient)

# power analysis for linear models
pwr.f2.test(u=, v=, f2=, sig.level=, power=)
# u and v are the numerator and denominator degrees of freedom;
# f2 is the effect size

# tests of proportions
pwr.2p.test(h=, n=, sig.level=, power=)
# h is the effect size: h = 2*asin(sqrt(p1)) - 2*asin(sqrt(p2)),
# i.e. h = ES.h(p1, p2)

# chi-square tests
pwr.chisq.test(w=, N=, df=, sig.level=, power=)
# w = ES.w2(P), where P is a two-way probability table

# permutation tests
# t-test equivalent (the grouping variable must be a factor)
library(coin)
score <- c(40, 57, 45, 55, 58, 57, 64, 55, 62, 65)
treatment <- factor(c(rep("A",5), rep("B",5)))
mydata <- data.frame(treatment, score)
oneway_test(score ~ treatment, data=mydata, distribution="exact")

# permutation tests with the lmPerm package:
# lmp() and aovp() are permutation versions of lm() and aov()
library(lmPerm)
set.seed(1234)
fit <- lmp(weight ~ height, data=women, perm="Prob")

# bootstrapping
# a function for obtaining the R-squared value
rsq <- function(formula, data, indices) {
  d <- data[indices, ]          # boot() passes resampled row indices here
  fit <- lm(formula, data=d)
  return(summary(fit)$r.square)
}
library(boot)
set.seed(1234)
results <- boot(data=mtcars, statistic=rsq, R=1000, formula=mpg~wt+disp)
boot.ci(results, type=c("perc", "bca"))

# generalized linear models:
# logistic regression (categorical dependent variable) and
# Poisson regression (count dependent variable)
# family generators and their default links:
# binomial(link="logit"), gaussian(link="identity"), Gamma(link="inverse"),
# inverse.gaussian(link="1/mu^2"), poisson(link="log"),
# quasi(link="identity", variance="constant"),
# quasibinomial(link="logit"), quasipoisson(link="log")

# logistic regression
glm(Y ~ X1 + X2 + X3, family=binomial(link="logit"), data=mydata)
data(Affairs, package="AER")
Affairs$ynaffair[Affairs$affairs > 0] <- 1
Affairs$ynaffair[Affairs$affairs == 0] <- 0
Affairs$ynaffair <- factor(Affairs$ynaffair, levels=c(0,1),
                           labels=c("No","Yes"))
fit.full <- glm(ynaffair ~ gender + age + yearsmarried + children +
                religiousness + education + occupation + rating,
                data=Affairs, family=binomial())
fit.reduced <- glm(ynaffair ~ age + yearsmarried + religiousness + rating,
                   data=Affairs, family=binomial())
anova(fit.reduced, fit.full, test="Chisq")

# Overdispersion occurs when the observed variance of the response variable
# is larger than what would be expected from a binomial distribution.
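# A sketch of an overdispersion check: refit with a quasibinomial family and
# test the estimated dispersion (reuses fit.full from the Affairs example):
fit.od <- glm(ynaffair ~ gender + age + yearsmarried + children +
              religiousness + education + occupation + rating,
              data=Affairs, family=quasibinomial())
pchisq(summary(fit.od)$dispersion * fit.full$df.residual,
       fit.full$df.residual, lower.tail=FALSE)
# a p-value below .05 suggests overdispersion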
# Poisson regression
glm(Y ~ X1 + X2 + X3, family=poisson(link="log"), data=mydata)
data(breslow.dat, package="robust")
fit <- glm(sumY ~ Base + Age + Trt, data=breslow.dat, family=poisson())
# also test for overdispersion
library(qcc)
qcc.overdispersion.test(breslow.dat$sumY, type="poisson")
# use family=quasipoisson() (or quasibinomial() for logit models)
# when overdispersion is present

# Poisson regression with varying time periods
fit <- glm(sumY ~ Base + Age + Trt, data=breslow.dat,
           offset=log(time), family=poisson())

# zero-inflated Poisson regression
zeroinfl()  # function in the pscl package

# PCA and factor analysis
# factor analysis requires 5-10 times as many subjects as variables;
# both PCA and EFA operate on correlation matrices

# parallel analysis
library(psych)
fa.parallel(USJudgeRatings[,-1], fa="pc", n.iter=100,
            show.legend=FALSE,
            main="Scree plot with parallel analysis")
abline(h=1)

# extracting principal components
library(psych)
pc <- principal(USJudgeRatings[,-1], nfactors=1)
pc

# PCA with varimax (orthogonal) rotation
rc <- principal(Harman23.cor$cov, nfactors=2, rotate="varimax")
rc
round(unclass(rc$weights), 2)  # component scoring coefficients

# deciding how many common factors to extract
library(psych)
covariances <- ability.cov$cov
correlations <- cov2cor(covariances)
fa.parallel(correlations, n.obs=112, fa="both", n.iter=100,
            main="Scree plots with parallel analysis")
# for EFA, the Kaiser-Harris criterion is the number of eigenvalues
# above 0 rather than above 1

# dealing with missing values
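# e.g., a few base-R idioms (a minimal sketch; my own toy example):
x <- c(1, 2, NA, 4)
is.na(x)                  # locate missing values
sum(x, na.rm=TRUE)        # drop NAs inside summary functions
df <- data.frame(a=c(1, NA, 3), b=c(4, 5, NA))
complete.cases(df)        # TRUE for rows with no missing values
na.omit(df)               # listwise deletion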