with(mtcars, {
  nokeepstats <- summary(mpg)
  keepstats <<- summary(mpg)
})
# keepstats is a global variable; nokeepstats is not

source("filename")  # run a set of R statements contained in a file

# text output
sink("filename", append=TRUE, split=TRUE)
# append=TRUE appends rather than overwrites;
# split=TRUE sends output to both the screen and the output file
sink()  # return the output to the screen alone

diabetes <- c("Type1", "Type2", "Type1", "Type1")
diabetes <- factor(diabetes)  # nominal variable
# stores this vector as (1, 2, 1, 1) and associates it with
# 1 = Type1, 2 = Type2 (levels are alphabetical by default)

status <- c("Poor", "Improved", "Excellent", "Poor")
status <- factor(status, ordered=TRUE,
                 levels=c("Poor", "Improved", "Excellent"))
# encodes the vector as (1, 2, 3, 1), with the level order given explicitly;
# labels= can also be used to attach labels to the levels

str(object)  # provides information on an object in R

# lists
g <- "My First List"
h <- c(25, 26, 18, 39)
j <- matrix(1:10, nrow=5)
k <- c("one", "two", "three")
mylist <- list(title=g, ages=h, j, k)
mylist[[1]]  # or mylist[["title"]] or mylist$title

numeric(0); character(0)  # vectors of length 0

# connections (see ?file for more information)
file()  # allows the user to access files
gzfile(); bzfile(); xzfile(); unz()  # let the user read compressed files
url()   # lets you access internet files

# load packages, installing any that are missing
ipak <- function(pkg){
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg))
    install.packages(new.pkg, dependencies=TRUE)
  sapply(pkg, require, character.only=TRUE)
}
packages <- c("plyr", "dplyr", "reshape2")
ipak(packages)

# download a file to load
if (!file.exists("data")) dir.create("data")  # create a directory for the data
fileUrl <- "https://adsfa"
download.file(fileUrl, destfile="./data/name", method="curl")
dataDownloaded <- date()
# current download methods are "internal", "wininet" (Windows only),
# "libcurl", "wget", and "curl"; the value "auto" chooses one automatically

# importing data from Excel with the gdata package
library(gdata)
dataset <- read.xls("file", sheet=1, header=TRUE)

rm()  # delete one or more objects

pdf("filename.pdf")  # save the graph as a PDF; or png("filename.png") for PNG
dev.off()            # shuts down the specified (by default the current) device

manager <- c(1, 2, 3, 4, 5)
date    <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09")
country <- c("US", "US", "UK", "UK", "UK")
gender  <- c("M", "F", "F", "M", "F")
age     <- c(32, 45, 25, 39, 99)
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)
leadership <- data.frame(manager, date, country, gender, age,
                         q1, q2, q3, q4, q5, stringsAsFactors=FALSE)

# within() is similar to with(), but allows you to modify the data frame
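# For example, a minimal sketch using the leadership data above
# (agecat is a new variable added here just for illustration):
leadership <- within(leadership, {
  age[age == 99] <- NA                       # 99 was used as a missing-value code
  agecat <- NA
  agecat[age > 75] <- "Elder"
  agecat[age >= 55 & age <= 75] <- "Middle Aged"
  agecat[age < 55] <- "Young"
})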
na.omit()  # deletes any rows with missing data

# date variables: the default input format is yyyy-mm-dd
strDates <- c("01/05/1965", "08/16/1975")
dates <- as.Date(strDates, "%m/%d/%Y")
Sys.Date()  # returns today's date
date()      # returns the current date and time
format(x, format="output_format")
today <- Sys.Date()
format(today, format="%B %d %Y")
format(today, format="%A")
# outputs dates in a specified format and extracts portions of dates
difftime()  # calculates a time interval and expresses it
            # as seconds, minutes, hours, days, or weeks
today <- Sys.Date()
dob <- as.Date("1956-10-12")
difftime(today, dob, units="weeks")  # Time difference of 2825 weeks

# sorting data (ascending by default)
newdata <- leadership[order(leadership$gender, leadership$age), ]

# merging datasets (inner join)
total <- merge(dataframeA, dataframeB, by="ID")

# NULL (undefined) isn't the same as NA (missing)

newdata <- subset(leadership, age >= 35 | age < 24,
                  select=c(q1, q2, q3, q4))
# select all rows with age >= 35 or age < 24, keeping variables q1 through q4

sample()  # takes a random sample (with or without replacement)
          # of size n from a dataset

# using SQL statements to manipulate data frames
library(sqldf)
newdf <- sqldf("select * from mtcars where carb=1 order by mpg",
               row.names=TRUE)

scale()  # standardizes the specified columns of a matrix or data frame
         # to a mean of 0 and a standard deviation of 1

library(MASS)
mvrnorm(n, mean, sigma)
# draws data from a multivariate normal distribution with a given
# mean vector and covariance matrix

# working with character variables
nchar(x)                # counts the number of characters in x
substr(x, start, stop)  # extracts or replaces substrings in a character vector
grep(pattern, x, ignore.case=FALSE, fixed=FALSE)
# searches for pattern in x; if fixed=FALSE, pattern is a regular expression,
# if fixed=TRUE, pattern is a literal text string; returns matching indices
sub(pattern, replacement, x, ignore.case=FALSE, fixed=FALSE)
# finds pattern in x and substitutes the replacement text;
# fixed= works as in grep()
gsub()  # replaces all matches of the pattern, not just the first
strsplit(x, split, fixed=FALSE)  # splits the elements of character vector x at split
y <- strsplit("abc", "")
unlist(y)[2]; sapply(y, "[", 2)
# both return the 2nd element of y ("[" is the selection function)

paste(..., sep="")  # concatenates strings, separated by the sep string
toupper(x); tolower(x)
cat(..., file="", sep=" ")
cut(x, n)  # divides continuous variable x into a factor with n levels

# aggregation and restructuring
aggregate(data[variable.names], by, FUN)
# where the first argument is the data object to be collapsed,
# by is a list of variables that will be crossed to form the new observations,
# and FUN is the scalar function used to calculate the summary statistics
# that make up the new observation values

install.packages("reshape")
library(reshape)
md <- melt(mydata, id=c("id", "time"))
melt(data)  # puts all the measured values in one column
cast(md, formula, FUN)

prop.table(mytable)  # turns a frequency table into proportions

# chi-square test of independence
chisq.test(table)
# Fisher's exact test of independence
fisher.test(table)
# Cochran-Mantel-Haenszel test
mantelhaen.test(table)
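# A minimal sketch of these tests, using the Arthritis data from the
# vcd package (assumes vcd is installed):
library(vcd)
mytable <- xtabs(~ Treatment + Improved, data=Arthritis)
prop.table(mytable, 1)  # row proportions
chisq.test(mytable)     # are treatment and improvement independent?
fisher.test(mytable)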
# The significance tests above evaluate whether sufficient evidence exists
# to reject a null hypothesis of independence between variables. If you can
# reject the null hypothesis, your interest turns naturally to measures of
# association that gauge the strength of the relationships present.
# The phi coefficient is a measure of association for two binary variables.
assocstats(table)  # phi, contingency coefficient, and Cramer's V (vcd package)

# The Pearson product-moment correlation assesses the degree of linear
# relationship between two quantitative variables. Spearman's rank-order
# correlation assesses the degree of relationship between two rank-ordered
# variables. Kendall's tau is also a nonparametric measure of rank correlation.
cor(x, y=NULL, use="everything",
    method=c("pearson", "kendall", "spearman"))

# A partial correlation is a correlation between two quantitative variables,
# controlling for one or more other quantitative variables.
library(ggm)
# partial correlation of population and murder rate, controlling
# for income, illiteracy rate, and HS graduation rate
# (assumes states comes from the built-in state.x77 matrix, e.g.
#  states <- state.x77[, 1:6])
pcor(c(1, 5, 2, 3, 6), cov(states))

# testing correlations for significance
cor.test(x, y, alternative=c("two.sided", "less", "greater"),
         method=c("pearson", "kendall", "spearman"),
         exact=NULL, conf.level=0.95)

# t-test
t.test(x, y=NULL, alternative=c("two.sided", "less", "greater"),
       mu=0, paired=FALSE, var.equal=FALSE, conf.level=0.95)

# nonparametric tests of group differences:
# if the two groups are independent, use the Wilcoxon rank-sum test
# (more popularly known as the Mann-Whitney U test) to assess whether
# the observations are sampled from the same probability distribution
wilcox.test(x, y=NULL, alternative=c("two.sided", "less", "greater"),
            mu=0, paired=FALSE, exact=NULL, correct=TRUE,
            conf.int=FALSE, conf.level=0.95)

# nonparametric equivalents of ANOVA
kruskal.test(y ~ A, data)
friedman.test(y ~ A | B, data)
# where y is the numeric outcome variable, A is a grouping variable,
# and B is a blocking variable that identifies matched observations

# linear regression
myfit <- lm(formula, data)
# :   denotes an interaction between predictor variables; a prediction of y
#     from x, z, and the interaction between x and z is coded y ~ x + z + x:z
# *   a shortcut for all possible interactions; y ~ x * z * w expands to
#     y ~ x + z + w + x:z + x:w + z:w + x:z:w
# ^   denotes interactions up to a specified degree; y ~ (x + z + w)^2
#     expands to y ~ x + z + w + x:z + x:w + z:w
# -   removes a variable from the equation; y ~ (x + z + w)^2 - x:w
#     expands to y ~ x + z + w + x:z + z:w
# -1  suppresses the intercept; y ~ x - 1 fits a regression of y on x
#     and forces the line through the origin at x = 0
# I() elements within the parentheses are interpreted arithmetically;
#     y ~ x + (z + w)^2 expands to y ~ x + z + w + z:w, whereas
#     y ~ x + I((z + w)^2) expands to y ~ x + h, where h is a new
#     variable created by squaring the sum of z and w

summary(); coefficients(); confint(); fitted(); residuals()
anova(); vcov(); AIC(); plot(); predict()

# multiple linear regression
cor()  # examine the relationships among the variables two at a time
library(car)
scatterplotMatrix(states, spread=FALSE, lty.smooth=2,
                  main="Scatter Plot Matrix")

# A significant interaction between two predictor variables tells you that
# the relationship between one predictor and the response variable depends
# on the level of the other predictor.
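# For instance, a minimal sketch with the built-in mtcars data:
fit <- lm(mpg ~ hp + wt + hp:wt, data=mtcars)
summary(fit)  # a significant hp:wt term means the mpg-hp slope varies with wt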
# regression diagnostics (plots)
fit <- lm(weight ~ height, data=women)
par(mfrow=c(2,2))
plot(fit)

# multicollinearity (variance inflation factor):
# as a general rule, sqrt(vif) > 2 indicates a multicollinearity problem
library(car)
vif(fit)

# Outliers are observations that aren't predicted well by the model;
# they have unusually large positive or negative residuals.
# Points in the Q-Q plot that lie outside the confidence band are
# considered outliers. A rough rule of thumb is that standardized
# residuals larger than 2 or smaller than -2 are worth attention.
library(car)
outlierTest(fit)
# reports the Bonferroni-adjusted p-value for the largest absolute
# studentized residual

# Observations with high leverage are outliers with regard to the other predictors.
hat.plot <- function(fit) {
  p <- length(coefficients(fit))
  n <- length(fitted(fit))
  plot(hatvalues(fit), main="Index Plot of Hat Values")
  abline(h=c(2,3)*p/n, col="red", lty=2)  # 2 and 3 times the average hat value p/n
  identify(1:n, hatvalues(fit), names(hatvalues(fit)))
}
hat.plot(fit)

# Influential observations have a disproportionate impact on the values of
# the model parameters; identify them with Cook's distance (the D statistic)
# and added-variable plots. Roughly speaking, Cook's D values greater than
# 4/(n-k-1), where n is the sample size and k is the number of predictor
# variables, indicate influential observations.
# (the following assumes the states regression, e.g.
states <- as.data.frame(state.x77[, c("Murder", "Population",
                                      "Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
# )
cutoff <- 4/(nrow(states) - length(fit$coefficients) - 2)
plot(fit, which=4, cook.levels=cutoff)
abline(h=cutoff, lty=2, col="red")

library(car)
influencePlot(fit, id.method="identify", main="Influence Plot",
              sub="Circle size is proportional to Cook's distance")

# Box-Cox transformation to normality
library(car)
summary(powerTransform(states$Murder))

# backward stepwise selection
library(MASS)
fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
stepAIC(fit1, direction="backward")

# relative importance: standardized regression coefficients
zstates <- as.data.frame(scale(states))
zfit <- lm(Murder ~ Population + Income + Illiteracy + Frost, data=zstates)
coef(zfit)

# relative weights
relweights <- function(fit, ...){
  R <- cor(fit$model)
  nvar <- ncol(R)
  rxx <- R[2:nvar, 2:nvar]        # predictor intercorrelations
  rxy <- R[2:nvar, 1]             # predictor-criterion correlations
  svd <- eigen(rxx)
  evec <- svd$vectors
  ev <- svd$values
  delta <- diag(sqrt(ev))
  lambda <- evec %*% delta %*% t(evec)
  lambdasq <- lambda ^ 2
  beta <- solve(lambda) %*% rxy
  rsquare <- colSums(beta ^ 2)
  rawwgt <- lambdasq %*% beta ^ 2
  import <- (rawwgt / rsquare) * 100
  lbls <- names(fit$model[2:nvar])
  rownames(import) <- lbls
  colnames(import) <- "Weights"
  barplot(t(import), names.arg=lbls,
          ylab="% of R-Square", xlab="Predictor Variables",
          main="Relative Importance of Predictor Variables",
          sub=paste("R-Square =", round(rsquare, digits=3)), ...)
  return(import)
}
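# Hypothetical usage, reusing the states regression fit from above:
relweights(fit, col="lightgrey")
# the plot shows each predictor's percentage contribution to R-square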
# ANOVA
# A factor is called a between-groups factor when patients are assigned to
# one and only one of its levels; it's called a within-groups factor when
# each patient is measured under every level.

# formula designs for ANOVA:
# one-way ANOVA:                            y ~ A
# one-way ANCOVA with one covariate:        y ~ x + A
# two-way factorial ANOVA:                  y ~ A * B
# two-way factorial ANCOVA, two covariates: y ~ x1 + x2 + A * B
# randomized block:                         y ~ B + A   (B is a blocking factor)
# one-way within-groups ANOVA:              y ~ A + Error(Subject/A)
# repeated measures with one within-groups factor (W)
#   and one between-groups factor (B):      y ~ B * W + Error(Subject/W)

# one-way ANOVA (cholesterol data from the multcomp package)
fit <- aov(response ~ trt, data=cholesterol)
summary(fit)

# plot group means with confidence intervals
library(gplots)
plotmeans(response ~ trt, data=cholesterol, xlab="Treatment",
          ylab="Response", main="Mean Plot\nwith 95% CI")

# multiple comparisons: Tukey HSD pairwise group comparisons
TukeyHSD(fit)
# if HH is loaded, TukeyHSD() will fail; in that case use detach("package:HH")
par(las=2)
par(mar=c(5,8,4,2))
plot(TukeyHSD(fit))

# or plot the comparisons as box plots with letter groupings
library(multcomp)
par(mar=c(5,4,6,2))
tuk <- glht(fit, linfct=mcp(trt="Tukey"))
plot(cld(tuk, level=.05), col="lightgrey")

# assessing test assumptions
# normality
library(car)
qqPlot(lm(response ~ trt, data=cholesterol), simulate=TRUE,
       main="Q-Q Plot", labels=FALSE)
# equality of variances
bartlett.test(response ~ trt, data=cholesterol)
# outliers
library(car)
outlierTest(fit)

# one-way ANCOVA (litter data from the multcomp package)
fit <- aov(weight ~ gesttime + dose, data=litter)
summary(fit)
# obtain adjusted group means, i.e. the group means after partialing
# out the effect of the covariate
library(effects)
effect("dose", fit)

# multiple comparisons employing user-supplied contrasts
library(multcomp)
contrast <- rbind("no drug vs. drug" = c(3, -1, -1, -1))
summary(glht(fit, linfct=mcp(dose=contrast)))

# standard ANCOVA designs assume homogeneity of regression slopes
library(multcomp)
fit2 <- aov(weight ~ gesttime*dose, data=litter)
summary(fit2)
# a significant interaction would imply that the relationship between
# gestation and birth weight depends on the level of the dose variable

# visualizing ANCOVA
library(HH)
ancova(weight ~ gesttime + dose, data=litter)

# two-way factorial ANOVA
attach(ToothGrowth)
fit <- aov(len ~ supp*dose)
summary(fit)
# visualizing the interaction
interaction.plot(dose, supp, len, type="b",
                 col=c("red","blue"), pch=c(16, 18),
                 main="Interaction between Dose and Supplement Type")
# or
library(gplots)
plotmeans(len ~ interaction(supp, dose, sep=" "),
          connect=list(c(1,3,5), c(2,4,6)),
          col=c("red", "darkgreen"),
          main="Interaction Plot with 95% CIs",
          xlab="Treatment and Dose Combination")
# or
library(HH)
interaction2wt(len ~ supp*dose)

# repeated measures ANOVA
w1b1 <- subset(CO2, Treatment=='chilled')
fit <- aov(uptake ~ conc*Type + Error(Plant/(conc)), w1b1)
summary(fit)
# visualizing the result
par(las=2)
par(mar=c(10,4,4,2))
with(w1b1, interaction.plot(conc, Type, uptake, type="b",
     col=c("red","blue"), pch=c(16,18),
     main="Interaction Plot for Plant Type and Concentration"))
boxplot(uptake ~ Type*conc, data=w1b1, col=c("gold", "green"),
        main="Chilled Quebec and Mississippi Plants",
        ylab="Carbon dioxide uptake rate (umol/m^2 sec)")

# multivariate analysis of variance (MANOVA)
library(MASS)
attach(UScereal)
shelf <- factor(shelf)  # treat shelf as a grouping factor
y <- cbind(calories, fat, sugars)
aggregate(y, by=list(shelf), FUN=mean)
fit <- manova(y ~ shelf)
summary(fit)
summary.aov(fit)  # univariate one-way ANOVAs for each outcome

# robust MANOVA
library(rrcov)
Wilks.test(y, shelf, method="mcd")
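# One way to check the multivariate-normality assumption behind the MANOVA
# above is a chi-square Q-Q plot of Mahalanobis distances (a sketch, reusing
# the y matrix defined above):
center <- colMeans(y)
n <- nrow(y); p <- ncol(y)
d <- mahalanobis(y, center, cov(y))
qqplot(qchisq(ppoints(n), df=p), d,
       main="Q-Q Plot Assessing Multivariate Normality",
       xlab="Chi-Square Quantiles", ylab="Mahalanobis D2")
abline(a=0, b=1)  # points far above the line suggest non-normality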
# power analysis
# In planning research, the researcher typically pays special attention to
# four quantities: sample size, significance level, power, and effect size.
# Sample size is the number of observations in each condition/group of the
# experimental design.
# The significance level (also referred to as alpha) is the probability of
# making a Type I error; it can be thought of as the probability of finding
# an effect that is not there.
# Power is one minus the probability of making a Type II error; it can be
# thought of as the probability of finding an effect that is there.
# Effect size is the magnitude of the effect under the alternative (research)
# hypothesis; its formula depends on the statistical methodology employed in
# the hypothesis testing.
# Given any three of these quantities, you can calculate the fourth.

# power of a t-test (equal sample sizes)
library(pwr)
pwr.t.test(n=, d=, sig.level=, power=, alternative=, type=)
# d is the effect size, defined as the standardized mean difference:
# d = (u1 - u2) / sigma
# if the sample sizes for the two groups are unequal:
pwr.t2n.test(n1=, n2=, d=, sig.level=, power=, alternative=)

# power analysis for a balanced one-way analysis of variance
pwr.anova.test(k=, n=, f=, sig.level=, power=)
# for a one-way ANOVA, effect size is measured by f:
# f = sqrt(sum(n/N * (ui - u)^2 / sigma^2))

# power analysis for a correlation test
pwr.r.test(n=, r=, sig.level=, power=, alternative=)
# r is the effect size (a linear correlation coefficient)

# power analysis for linear models
pwr.f2.test(u=, v=, f2=, sig.level=, power=)
# u and v are the numerator and denominator degrees of freedom;
# f2 is the effect size

# tests of proportions
pwr.2p.test(h=, n=, sig.level=, power=)
# h is the effect size: h = 2*asin(sqrt(p1)) - 2*asin(sqrt(p2)),
# i.e. h = ES.h(p1, p2)

# chi-square tests
pwr.chisq.test(w=, N=, df=, sig.level=, power=)
# w = ES.w2(P), where P is a two-way probability table

# permutation tests
# t-test equivalent (the grouping variable must be a factor)
library(coin)
score <- c(40, 57, 45, 55, 58, 57, 64, 55, 62, 65)
treatment <- factor(c(rep("A",5), rep("B",5)))
mydata <- data.frame(treatment, score)
oneway_test(score ~ treatment, data=mydata, distribution="exact")

# permutation tests with the lmPerm package:
# lmp() and aovp() are permutation versions of lm() and aov()
library(lmPerm)
set.seed(1234)
fit <- lmp(weight ~ height, data=women, perm="Prob")

# bootstrapping
# a function for obtaining the R-squared value
rsq <- function(formula, data, indices) {
  d <- data[indices, ]          # boot() passes resampled row indices here
  fit <- lm(formula, data=d)
  return(summary(fit)$r.square)
}
library(boot)
set.seed(1234)
results <- boot(data=mtcars, statistic=rsq, R=1000, formula=mpg~wt+disp)
boot.ci(results, type=c("perc", "bca"))

# generalized linear models:
# logistic regression (categorical dependent variable) and
# Poisson regression (count dependent variable)
# family generators and their default links:
# binomial(link="logit"), gaussian(link="identity"), Gamma(link="inverse"),
# inverse.gaussian(link="1/mu^2"), poisson(link="log"),
# quasi(link="identity", variance="constant"),
# quasibinomial(link="logit"), quasipoisson(link="log")

# logistic regression
glm(Y ~ X1 + X2 + X3, family=binomial(link="logit"), data=mydata)
data(Affairs, package="AER")
Affairs$ynaffair[Affairs$affairs > 0] <- 1
Affairs$ynaffair[Affairs$affairs == 0] <- 0
Affairs$ynaffair <- factor(Affairs$ynaffair, levels=c(0,1),
                           labels=c("No","Yes"))
fit.full <- glm(ynaffair ~ gender + age + yearsmarried + children +
                religiousness + education + occupation + rating,
                data=Affairs, family=binomial())
fit.reduced <- glm(ynaffair ~ age + yearsmarried + religiousness + rating,
                   data=Affairs, family=binomial())
anova(fit.reduced, fit.full, test="Chisq")

# Overdispersion occurs when the observed variance of the response variable
# is larger than what would be expected from a binomial distribution.
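# A sketch of an overdispersion check: refit with a quasibinomial family and
# test the estimated dispersion (reuses fit.full from the Affairs example):
fit.od <- glm(ynaffair ~ gender + age + yearsmarried + children +
              religiousness + education + occupation + rating,
              data=Affairs, family=quasibinomial())
pchisq(summary(fit.od)$dispersion * fit.full$df.residual,
       fit.full$df.residual, lower.tail=FALSE)
# a p-value below .05 suggests overdispersion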
# Poisson regression
glm(Y ~ X1 + X2 + X3, family=poisson(link="log"), data=mydata)
data(breslow.dat, package="robust")
fit <- glm(sumY ~ Base + Age + Trt, data=breslow.dat, family=poisson())
# also test for overdispersion
library(qcc)
qcc.overdispersion.test(breslow.dat$sumY, type="poisson")
# use family=quasipoisson() (or quasibinomial() for logit models)
# when overdispersion is present

# Poisson regression with varying time periods
fit <- glm(sumY ~ Base + Age + Trt, data=breslow.dat,
           offset=log(time), family=poisson())

# zero-inflated Poisson regression
zeroinfl()  # function in the pscl package

# PCA and factor analysis
# factor analysis requires 5-10 times as many subjects as variables;
# both PCA and EFA operate on correlation matrices

# parallel analysis
library(psych)
fa.parallel(USJudgeRatings[,-1], fa="pc", n.iter=100,
            show.legend=FALSE,
            main="Scree plot with parallel analysis")
abline(h=1)

# extracting principal components
library(psych)
pc <- principal(USJudgeRatings[,-1], nfactors=1)
pc

# PCA with varimax (orthogonal) rotation
rc <- principal(Harman23.cor$cov, nfactors=2, rotate="varimax")
rc
round(unclass(rc$weights), 2)  # component scoring coefficients

# deciding how many common factors to extract
library(psych)
covariances <- ability.cov$cov
correlations <- cov2cor(covariances)
fa.parallel(correlations, n.obs=112, fa="both", n.iter=100,
            main="Scree plots with parallel analysis")
# for EFA, the Kaiser-Harris criterion is the number of eigenvalues
# above 0 rather than above 1

# dealing with missing values
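# e.g., a few base-R idioms (a minimal sketch; my own toy example):
x <- c(1, 2, NA, 4)
is.na(x)                  # locate missing values
sum(x, na.rm=TRUE)        # drop NAs inside summary functions
df <- data.frame(a=c(1, NA, 3), b=c(4, 5, NA))
complete.cases(df)        # TRUE for rows with no missing values
na.omit(df)               # listwise deletion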