#R_in_action.R
with(mtcars, {
  nokeepstats <- summary(mpg)
  keepstats <<- summary(mpg)
})
#keepstats is a global variable, nokeepstats is not.

source("") #run a set of R statements contained in file
 
#text output
sink("filename", append = TRUE, split = TRUE) #append means not overwriting,
#split means send output to both screen and the output file
sink() #return the output to screen alone

diabetes <- c("Type1", "Type2", "Type1", "Type1")
diabetes <- factor(diabetes)
#nominal variable
#stored as the vector (1, 2, 1, 1), with 1 = "Type1" and 2 = "Type2" (levels assigned alphabetically)

status <- c("Poor", "Improved", "Excellent", "Poor")
status <- factor(status, ordered=TRUE, levels=c("Poor", "Improved", "Excellent"))
#encode the vector as (3,2,1,3) with specific order as levels
#labels for the levels can also be supplied

#str(object) provides information on an object in R

#list
g <- "My First List"
h <- c(25, 26, 18, 39)
j <- matrix(1:10, nrow=5)
k <- c("one", "two", "three")
mylist <- list(title=g, ages=h, j, k)
mylist[[1]] # or mylist[["title"]] or mylist$title

numeric(0) 
character(0)
#both create vectors of length 0

#use file() or ?file() for more information
file() #allows the user to access files
gzfile(); bzfile(); xzfile(); unz() #let the user read compressed files
url() #lets you access internet files

#install (if needed) and load a vector of packages
ipak <- function(pkg){
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg)) 
    install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE)
}
packages <- c("plyr","dplyr","reshape2")
ipak(packages)

#download the file to load
if(!file.exists("data")) {dir.create("data")} #create a folder for the data
fileUrl <- "https://adsfa"
download.file(fileUrl, destfile = "./data/name",method="curl")
dataDownloaded <- date()
#Current download methods are "internal", "wininet" (Windows only) 
#"libcurl", "wget" and "curl", and there is a value "auto"

#importing data from excel
#package gdata
library("gdata")
dataset <- read.xls("file", sheet=1, header=TRUE) #gdata's read.xls() takes sheet=, not sheetIndex=

rm()  #delete one or more objects


pdf("filename.pdf") #save the graph as a pdf
#or 
png("") #save in png
dev.off() #shuts down the specified (by default the current) device


manager <- c(1, 2, 3, 4, 5)
date <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09")
country <- c("US", "US", "UK", "UK", "UK")
gender <- c("M", "F", "F", "M", "F")
age <- c(32, 45, 25, 39, 99)
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)
leadership <- data.frame(manager, date, country, gender, age,
                         q1, q2, q3, q4, q5, stringsAsFactors=FALSE)

within()
with()
#The within() function is similar to the with() function,
#but returns a modified copy of the data frame (see the sketch below).
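#a minimal sketch of the difference, using the built-in mtcars data:
with(mtcars, mean(mpg)) #evaluates the expression; mtcars itself is unchanged
newmtcars <- within(mtcars, kpl <- mpg * 0.4251) #returns a copy with a new column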

na.omit() #deletes any rows with missing data
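#for example, listwise deletion on the leadership data defined above:
newdata <- na.omit(leadership) #drops manager 4, whose q4 and q5 are NA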

#date variable, The default format for inputting dates is yyyy-mm-dd
strDates <- c("01/05/1965", "08/16/1975")
dates <- as.Date(strDates, "%m/%d/%Y")

Sys.Date() #returns today’s date 
date() #returns the current date and time

format(x, format="output_format")
today <- Sys.Date()
format(today, format="%B %d %Y")
format(today, format="%A")
#format() outputs dates in a specified format and can extract portions of dates


difftime() 
#calculate a time interval and express it 
#as seconds, minutes, hours, days, or weeks
today <- Sys.Date()
dob   <- as.Date("1956-10-12")
difftime(today, dob, units="weeks")
#Time difference of 2825 weeks


#Sorting data
newdata <- leadership[order(leadership$gender, leadership$age), ]
#By default, the sorting order is ascending

total <- merge(dataframeA, dataframeB, by="ID")
#Merging datasets (inner join)
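#a small hypothetical example (dataframeA and dataframeB above are placeholders):
dfA <- data.frame(ID=1:3, x=c("a", "b", "c"))
dfB <- data.frame(ID=2:4, y=c(10, 20, 30))
merge(dfA, dfB, by="ID") #keeps only IDs 2 and 3, the rows common to both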

#NULL (undefined) isn’t the same as NA (missing)

newdata <- subset(leadership, age >= 35 | age < 24, 
                  select=c(q1, q2, q3, q4))
#select all rows that have a value of age >= 35 or age < 24,
#keeping the variables q1 through q4

sample() #enables you to take a random sample (with or without replacement) 
#of size n from a dataset
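#for example, drawing 3 of the 5 leadership rows without replacement:
mysample <- leadership[sample(1:nrow(leadership), 3, replace=FALSE), ]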


#Using SQL statements to manipulate data frames
#library sqldf
library(sqldf)
newdf <- sqldf("select * from mtcars where carb=1 order by mpg",
               row.names=TRUE)


scale() 
#standardizes the specified columns of a matrix or data frame to a mean of 0 
# and a standard deviation of 1
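#for example, standardizing two columns of mtcars; to rescale to an arbitrary
#mean M and standard deviation SD instead, use scale(x)*SD + M
zscores <- scale(mtcars[, c("mpg", "wt")])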

library(MASS)
mvrnorm(n, mu, Sigma)
#draws n observations from a multivariate normal distribution with the given
#mean vector mu and covariance matrix Sigma
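#a minimal sketch: 500 draws from a bivariate normal (mu and Sigma are made up)
library(MASS)
set.seed(1234)
mu <- c(0, 10)
Sigma <- matrix(c(1, 0.5, 0.5, 2), nrow=2) #covariance matrix
mydata <- mvrnorm(500, mu, Sigma)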


#edit text variable
nchar(x) #Counts the number of characters of x
substr(x, start, stop) #Extract or replace substrings in a character vector

grep(pattern, x, ignore.case=FALSE, fixed=FALSE) 
#Search for pattern in x. If fixed=FALSE, then pattern is a regular expression. 
#If fixed=TRUE, then pattern is a text string. Returns matching indices.

sub(pattern, replacement, x, ignore.case=FALSE, fixed=FALSE)
#Find pattern in x and substitute with replacement text. 
#If fixed=FALSE then pattern is 
#a regular expression. If fixed=TRUE then pattern is a text string.
gsub() #like sub(), but replaces all occurrences of the pattern

strsplit(x, split, fixed=FALSE)
#Split the elements of character vector x at split. 
y <- strsplit("abc", "")
unlist(y)[2]
sapply(y, "[", 2) #both return the 2nd element ("[" is the selection function)

paste(..., sep=" ") #concatenate strings, separated by sep (the default is a single space)

toupper(x)
tolower(x)
cat(... , file = "", sep = " ")

cut(x, n) #Divide continuous variable x into factor with n levels


#Aggregation and restructuring
aggregate(x, by, FUN)
#where x is the data object to be collapsed,
#by is a list of variables that will be crossed to form the new observations,
#and FUN is the scalar function used to calculate the summary statistics that
#make up the new observation values (see the example below)
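#for example, collapsing mtcars to group means by cylinders and gears:
aggdata <- aggregate(mtcars, by=list(cyl=mtcars$cyl, gear=mtcars$gear),
                     FUN=mean)
aggdata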


install.packages("reshape")
library(reshape)
md <- melt(mydata, id=c("id", "time"))
#melt() stacks each measured variable into a single value column (long format)

cast(md, formula, FUN)
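#a self-contained version of the calls above, with small hypothetical data:
mydata <- data.frame(id=c(1, 1, 2, 2), time=c(1, 2, 1, 2),
                     x1=c(5, 3, 6, 2), x2=c(6, 5, 1, 4))
md <- melt(mydata, id=c("id", "time")) #one row per id-time-variable combination
cast(md, id ~ variable, mean) #recast wide, averaging x1 and x2 over time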

prop.table(mytable) #turns a frequency table into proportions

#CHI-SQUARE TEST OF INDEPENDENCE
chisq.test(table)
# FISHER’S EXACT TEST for Independence
fisher.test(table)
#COCHRAN–MANTEL–HAENSZEL TEST
mantelhaen.test(table)
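#a minimal sketch with a two-way frequency table built from mtcars
#(mantelhaen.test() would instead need a three-way table):
mytable <- with(mtcars, table(am, cyl))
prop.table(mytable) #cell proportions; prop.table(mytable, 1) gives row proportions
chisq.test(mytable) #test of independence; expected counts are small here, so R
                    #will warn, and fisher.test(mytable) is the exact alternative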

#The significance tests in the previous section evaluated whether 
#or not sufficient evidence existed to reject a null hypothesis 
#of independence between variables. If you can reject the null hypothesis, 
#your interest turns naturally to measures of association in order to 
#gauge the strength of the relationships present

#In statistics, the phi coefficient is a measure of association for two binary variables.
assocstats(table) #from the vcd package; reports phi, contingency coefficient, and Cramer's V

#The Pearson product moment correlation assesses the degree of linear relationship 
#between two quantitative variables. Spearman’s Rank Order correlation coefficient
#assesses the degree of relationship between two rank-ordered variables. 
#Kendall’s Tau is also a nonparametric measure of rank correlation.
cor(x, y = NULL, use = "everything",
    method = c("pearson", "kendall", "spearman"))

#A partial correlation is a correlation between two quantitative variables, 
#controlling for one or more other quantitative variables.
library(ggm)
# partial correlation of population and murder rate, controlling
# for income, illiteracy rate, and HS graduation rate
pcor(c(1,5,2,3,6), cov(states))

#Testing correlations for significance
cor.test(x, y,
         alternative = c("two.sided", "less", "greater"),
         method = c("pearson", "kendall", "spearman"),
         exact = NULL, conf.level = 0.95)

#t-test
t.test(x, y = NULL,
       alternative = c("two.sided", "less", "greater"),
       mu = 0, paired = FALSE, var.equal = FALSE,
       conf.level = 0.95)
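#for example, comparing Southern and non-Southern US states on the
#probability of imprisonment (UScrime data from the MASS package):
library(MASS)
t.test(Prob ~ So, data=UScrime)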

#Nonparametric tests of group differences

#If the two groups are independent, you can use the Wilcoxon rank sum test 
#(more popularly known as the Mann–Whitney U test) to 
#assess whether the observations are sampled from the same probability distribution
wilcox.test(x, y = NULL,
            alternative = c("two.sided", "less", "greater"),
            mu = 0, paired = FALSE, exact = NULL, correct = TRUE,
            conf.int = FALSE, conf.level = 0.95)

#nonparametric test of ANOVA
kruskal.test(y ~ A, data)

friedman.test(y ~ A | B, data)
#where y is the numeric outcome variable, A is a grouping variable, 
#and B is a blocking variable that identifies matched observations.


#linear regression
myfit <- lm(formula, data)
# : Denotes an interaction between predictor variables. A prediction of y from x, z, 
# and the interaction between x and z would be coded y ~ x + z + x:z.

# * A shortcut for denoting all possible interactions. 
# The code y ~ x * z * w expands to y ~ x + z + w + x:z + x:w + z:w + x:z:w.

# ^ Denotes interactions up to a specified degree. The code y ~ (x + z + w)^2
# expands to y ~ x + z + w + x:z + x:w + z:w.

# - A minus sign removes a variable from the equation. 
#For example, y ~ (x + z + w)^2 - x:w expands to y ~ x + z + w + x:z + z:w.

# -1 Suppresses the intercept. For example, the formula y ~ x -1 
# fits a regression of y on x and forces the line through the origin.

# I() Elements within the parentheses are interpreted arithmetically. 
# For example, y ~ x + (z + w)^2 would expand to y ~ x + z + w + z:w. 
# In contrast, the code y ~ x + I((z + w)^2) would expand to y ~ x + h, 
# where h is a new variable created by squaring the sum of z and w.
summary()      #detailed results of the fitted model
coefficients() #model parameters (intercept and slopes)
confint()      #confidence intervals for the model parameters
fitted()       #predicted values
residuals()    #residuals
anova()        #ANOVA table, or comparison of two fitted models
vcov()         #covariance matrix of the model parameters
AIC()          #Akaike information criterion
plot()         #diagnostic plots
predict()      #predicted values for new data
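#for example:
fit <- lm(weight ~ height, data=women)
summary(fit)
confint(fit)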

#multilinear regression
cor() #examine the relationships among the variables two at a time
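#the regression examples below assume the states data frame from the book's setup:
states <- as.data.frame(state.x77[, c("Murder", "Population",
                                      "Illiteracy", "Income", "Frost")])
cor(states)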

library(car)
scatterplotMatrix(states, spread=FALSE, lty.smooth=2,
                  main="Scatter Plot Matrix")
#A significant interaction between two predictor variables tells 
#you that the relationship between one predictor and the response variable 
#depends on the level of the other predictor. 

#Regression diagnostics (plots)
fit <- lm(weight ~ height, data=women)
par(mfrow=c(2,2))
plot(fit)


#Multicollinearity problem (variance inflation factor)
#As a general rule, sqrt(vif) > 2 indicates a multicollinearity problem
library(car)
vif(fit)


#Outliers
#Outliers are observations that aren’t predicted well by the model.
#They have either unusually large positive or negative residuals
#Points in the Q-Q plot that lie outside the confidence band are considered outliers.
#A rough rule of thumb is that standardized residuals that are larger than 2 
#or less than –2 are worth attention.
library(car)
outlierTest(fit)
#reports the Bonferroni adjusted p-value for the largest absolute 
#studentized residual

#Observations that have high leverage are outliers with regard to the other predictors.
hat.plot <- function(fit) {
  p <- length(coefficients(fit))
  n <- length(fitted(fit))
  plot(hatvalues(fit), main="Index Plot of Hat Values")
  abline(h=c(2,3)*p/n, col="red", lty=2)
  identify(1:n, hatvalues(fit), names(hatvalues(fit)))
}
hat.plot(fit)

#Influential observations are observations that have a disproportionate 
#impact on the values of the model parameters.
#Cook’s distance, or D statistic and added variable plots. 
#Roughly speaking, Cook’s D values greater than 4/(n-k-1), 
#where n is the sample size and k is the number of predictor variables, 
#indicate influential observations. 
cutoff <- 4/(nrow(states)-length(fit$coefficients)-2)
plot(fit, which=4, cook.levels=cutoff)
abline(h=cutoff, lty=2, col="red")

library(car)
influencePlot(fit, id.method="identify", main="Influence Plot",
              sub="Circle size is proportional to Cook’s distance")


#Box–Cox transformation to normality
library(car)
summary(powerTransform(states$Murder))


#Backward stepwise selection
library(MASS)
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost,
          data=states)
stepAIC(fit, direction="backward")


#Relative importance
#Standardized regression coefficients
zstates <- as.data.frame(scale(states))
zfit <- lm(Murder~Population + Income + Illiteracy + Frost, data=zstates)
coef(zfit)
#relative weights
relweights <- function(fit,...){
  R <- cor(fit$model)
  nvar <- ncol(R)
  rxx <- R[2:nvar, 2:nvar]
  rxy <- R[2:nvar, 1]
  svd <- eigen(rxx)
  evec <- svd$vectors
  ev <- svd$values
  delta <- diag(sqrt(ev))
  lambda <- evec %*% delta %*% t(evec)
  lambdasq <- lambda ^ 2
  beta <- solve(lambda) %*% rxy
  rsquare <- colSums(beta ^ 2)
  rawwgt <- lambdasq %*% beta ^ 2
  import <- (rawwgt / rsquare) * 100
  lbls <- names(fit$model[2:nvar])
  rownames(import) <- lbls
  colnames(import) <- "Weights"
  barplot(t(import),names.arg=lbls,
          ylab="% of R-Square",
          xlab="Predictor Variables",
          main="Relative Importance of Predictor Variables",
          sub=paste("R-Square=", round(rsquare, digits=3)),
          ...)
  return(import)
}


#ANOVA

#It's called a between-groups factor because patients
#are assigned to one and only one group.
#It's called a within-groups factor because each patient is
#measured under both levels.

#design for anova
#One-way ANOVA y ~ A
#One-way ANCOVA with one covariate y ~ x + A
#Two-way Factorial ANOVA y ~ A * B
#Two-way Factorial ANCOVA with two covariates y ~ x1 + x2 + A * B
#Randomized Block y ~ B + A (where B is a blocking factor)
#One-way within-groups ANOVA y ~ A + Error(Subject/A)
#Repeated measures ANOVA with one within-groups
#factor (W) and one between-groups factor (B) y ~ B * W + Error(Subject/W)

fit <- aov(response ~ trt, data=cholesterol) #cholesterol data from the multcomp package
summary(fit)

#plot group means with confidence intervals
library(gplots)
plotmeans(response ~ trt, data=cholesterol, xlab="Treatment", ylab="Response",
          main="Mean Plot\nwith 95% CI")


#multiple comparisons
#Tukey HSD pairwise group comparisons
TukeyHSD(fit) #if HH is loaded, TukeyHSD() will fail. In that case, use detach("package:HH")
par(las=2)
par(mar=c(5,8,4,2))
plot(TukeyHSD(fit))

#multiple comparison plot (boxplots with grouping letters from cld())
library(multcomp)
par(mar=c(5,4,6,2))
tuk <- glht(fit, linfct=mcp(trt="Tukey"))
plot(cld(tuk, level=.05),col="lightgrey")


#Assessing test assumptions
#normality
library(car)
qqPlot(lm(response ~ trt, data=cholesterol),
         simulate=TRUE, main="Q-Q Plot", labels=FALSE)

#equality of variances
bartlett.test(response ~ trt, data=cholesterol)

#outliers
library(car)
outlierTest(fit)

#one way ancova
fit <- aov(weight ~ gesttime + dose, data=litter) #litter data from the multcomp package
summary(fit)
#obtain adjusted group means, that is, the group means after partialing out the effects of the covariate
library(effects)
effect("dose", fit)

#Multiple comparisons employing user-supplied contrasts
library(multcomp)
contrast <- rbind("no drug vs. drug" = c(3, -1, -1, -1))
summary(glht(fit, linfct=mcp(dose=contrast)))

#standard ANCOVA designs assume homogeneity of regression slopes.
library(multcomp)
fit2 <- aov(weight ~ gesttime*dose, data=litter)
summary(fit2)
#A significant interaction would imply that the relationship between gestation
#and birth weight depends on the level of the dose variable.

#Visualizing ancova
library(HH)
ancova(weight ~ gesttime + dose, data=litter)


#Two-way factorial ANOVA
attach(ToothGrowth)
fit <- aov(len ~ supp*dose)
summary(fit)
#visualizing interaction
interaction.plot(dose, supp, len, type="b",
                 col=c("red","blue"), pch=c(16, 18),
                 main = "Interaction between Dose and Supplement Type")
#or
library(gplots)
plotmeans(len ~ interaction(supp, dose, sep=" "),
          connect=list(c(1,3,5),c(2,4,6)),
          col=c("red", "darkgreen"),
          main = "Interaction Plot with 95% CIs",
          xlab="Treatment and Dose Combination")
#or
library(HH)
interaction2wt(len~supp*dose)


#Repeated measures ANOVA
w1b1 <- subset(CO2, Treatment=='chilled')
fit <- aov(uptake ~ conc*Type + Error(Plant/(conc)), w1b1)
summary(fit)
#visualizing the result
par(las=2)
par(mar=c(10,4,4,2))
with(w1b1, interaction.plot(conc,Type,uptake,
                              type="b", col=c("red","blue"), pch=c(16,18),
                              main="Interaction Plot for Plant Type and Concentration"))
boxplot(uptake ~ Type*conc, data=w1b1, col=(c("gold", "green")),
          main="Chilled Quebec and Mississippi Plants",
          ylab="Carbon dioxide uptake rate (umol/m^2 sec)")


#Multivariate analysis of variance (MANOVA)
library(MASS)
attach(UScereal)
y <- cbind(calories, fat, sugars)
aggregate(y, by=list(shelf), FUN=mean)
fit <- manova(y ~ shelf)
summary(fit)
summary.aov(fit)
#Robust MANOVA
library(rrcov)
Wilks.test(y,shelf,method="mcd")


#Power Analysis
#In planning research, the researcher typically pays special attention to four quantities:
#sample size, significance level, power, and effect size

#Sample size refers to the number of observations in each condition/group of the
#experimental design.

#The significance level (also referred to as alpha) is defined as the probability of
#making a Type I error. The significance level can also be thought of as the probability
#of finding an effect that is not there.

#Power is defined as one minus the probability of making a Type II error. Power
#can be thought of as the probability of finding an effect that is there.

#Effect size is the magnitude of the effect under the alternative (research) hypothesis.
#The formula for effect size depends on the statistical methodology employed
#in the hypothesis testing

#Given any three, you can calculate the fourth

#power of t-test (equal sample size)
library(pwr)
pwr.t.test(n=, d=, sig.level=, power=, alternative=,type=)
#d is the effect size defined as the standardized mean difference
#d=(u1-u2)/sigma
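#for example, the sample size per group needed to detect a large effect
#(d=0.8) with 90% power at the .05 level:
pwr.t.test(d=0.8, sig.level=0.05, power=0.9,
           type="two.sample", alternative="two.sided") #n is about 34 per group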

#If the sample sizes for the two groups are unequal
pwr.t2n.test(n1=, n2=, d=, sig.level=, power=, alternative=)

#power analysis of a balanced one-way analysis of variance
pwr.anova.test(k=, n=, f=, sig.level=, power=)
#For a one-way ANOVA, effect size is measured by f:
#f = sqrt( sum( (n_i/N) * (u_i - u)^2 ) / sigma^2 )
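#for example, a five-group ANOVA with a medium effect size (f=0.25):
pwr.anova.test(k=5, f=0.25, sig.level=0.05, power=0.8) #n is about 39 per group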

#power analysis of correlation test
pwr.r.test(n=, r=, sig.level=, power=, alternative=)
#r is the effect size (as measured by a linear correlation coefficient)

#power analysis of Linear models
pwr.f2.test(u=, v=, f2=, sig.level=, power=)
#u and v are the numerator and denominator degrees of freedom
#f2 is the effect size

#Tests of proportions
pwr.2p.test(h=, n=, sig.level=, power=)
#h is effect size, h = 2arcsin(sqrt(p1))-2arcsin(sqrt(p2))
#h = ES.h(p1, p2)

#Chi-square tests
pwr.chisq.test(w =, N = , df = , sig.level =, power = )
#w = ES.w2(P), P is a probability table.


#permutation test
#t-test (the grouping variable must be a factor)
library(coin)
score <- c(40, 57, 45, 55, 58, 57, 64, 55, 62, 65)
treatment <- factor(c(rep("A",5), rep("B",5)))
mydata <- data.frame(treatment, score)
oneway_test(score~treatment, data=mydata, distribution="exact")

#Permutation tests with the lmPerm package
#lmp() and aovp() are the permutation-test counterparts of lm() and aov()
library(lmPerm)
set.seed(1234)
fit <- lmp(weight~height, data=women, perm="Prob")


#Bootstrapping
#a function for obtaining the R-squared value
rsq <- function(formula, data, indices) {
  d <- data[indices,]
  fit <- lm(formula, data=d)
  return(summary(fit)$r.squared)
}
library(boot)
set.seed(1234)
results <- boot(data=mtcars, statistic=rsq,
                R=1000, formula=mpg~wt+disp)
boot.ci(results, type=c("perc", "bca"))


#Generalized linear models
#logistic regression (where the dependent variable is categorical) and 
#Poisson regression (where the dependent variable is a count variable)

#binomial  (link = "logit")
#gaussian  (link = "identity")
#Gamma     (link = "inverse")
#inverse.gaussian  (link = "1/mu^2")
#poisson   (link = "log")
#quasi     (link = "identity", variance = "constant")
#quasibinomial   (link = "logit")
#quasipoisson   (link = "log")

#logistic regression
glm(Y~X1+X2+X3, family=binomial(link="logit"), data=mydata)
data(Affairs, package="AER")
Affairs$ynaffair[Affairs$affairs  > 0] <- 1
Affairs$ynaffair[Affairs$affairs == 0] <- 0
Affairs$ynaffair <- factor(Affairs$ynaffair,
                             levels=c(0,1),
                             labels=c("No","Yes"))
fit.full <- glm(ynaffair ~ gender + age + yearsmarried + children +
                  religiousness + education + occupation +rating,
                data=Affairs,family=binomial())
fit.reduced <- glm(ynaffair ~ age + yearsmarried + religiousness +
                     rating, data=Affairs, family=binomial())
anova(fit.reduced, fit.full, test="Chisq")
#Overdispersion occurs when the observed variance of the response variable 
#is larger than what would be expected from a binomial distribution. 

#Poisson regression
glm(Y~X1+X2+X3, family=poisson(link="log"), data=mydata)
data(breslow.dat, package="robust")
fit <- glm(sumY ~ Base + Age + Trt, data=breslow.dat, family=poisson())
#also need to test overdispersion
library(qcc)
qcc.overdispersion.test(breslow.dat$sumY, type="poisson")
#use family="quasipoisson" when overdispersion is present (logit/poisson)

#POISSON REGRESSION WITH VARYING TIME PERIODS
fit <- glm(sumY ~ Base + Age + Trt, data=breslow.dat,
           offset= log(time), family=poisson())

#ZERO-INFLATED POISSON REGRESSION
zeroinfl() #function in the pscl package
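#a hedged sketch (the formula and zero-model are illustrative, not from the text);
#the part after | models the zero component, here an intercept only:
library(pscl)
fit.zip <- zeroinfl(sumY ~ Base + Trt | 1, data=breslow.dat, dist="poisson")
summary(fit.zip)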

#PCA and factor analysis
#factor analysis requires 5–10 times as many subjects as variables.

#both PCA and EFA are based on the correlation matrix

#parallel analysis 
library(psych)
fa.parallel(USJudgeRatings[,-1], fa="pc", n.iter=100,
            show.legend=FALSE, main="Scree plot with parallel analysis")
abline(h=1)

#Extracting principal components
library(psych)
pc <- principal(USJudgeRatings[,-1], nfactors=1)
pc


#PCA with varimax rotation (orthogonal rotation)
library(psych)
rc <- principal(Harman23.cor$cov, nfactors=2, rotate="varimax")
round(unclass(rc$weights), 2) #component scoring coefficients


#deciding how many common factors to extract
library(psych)
covariances <- ability.cov$cov
correlations <- cov2cor(covariances)
fa.parallel(correlations, n.obs = 112, fa="both", n.iter = 100,
            main="Scree plots with parallel analysis")


#For EFA, the Kaiser–Harris criterion is number of eigenvalues above 0
# rather than 1.



#deal with missing value