R_in_action.R
21.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
with(mtcars, {
nokeepstats <- summary(mpg)
keepstats <<- summary(mpg)
})
#keepstats is a global variable, nokeepstats is not.
source("") #run a set of R statements contained in file
#text output
sink("filename", append = TRUE, split = TRUE) #append means not overwriting,
#split means send output to both screen and the output file
sink() #return the output to screen alone
diabetes <- c("Type1", "Type2", "Type1", "Type1")
diabetes <- factor(diabetes)
#nominal veriable
#store this vector as (1,2,1,1) and associates it with 1 = type 1 (alphabetical)
status <- c("Poor", "Improved", "Excellent", "Poor")
status <- factor(status, ordered=TRUE, levels=c("Poor", "Improved", "Excellent"))
#encode the vector as (3,2,1,3) with specific order as levels
#also can add labels
#str(object) provides information on an object in R
#list
g <- "My First List"
h <- c(25, 26, 18, 39)
j <- matrix(1:10, nrow=5)
k <- c("one", "two", "three")
mylist <- list(title=g, ages=h, j, k)
mylist[[1]] # or mylist[["title"]] or mylist$title
numeric(0)
character(0)
#length of 0
#use file() or ?file() for more information
file() #allows the user to access files
gzfile() bzfile() xzfile() unz() #let user read compressed files
url() #lets you access internet files
#load package
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
packages <- c("plyr","dplyr","reshape2")
ipak(packages)
#download the file to load
if(!file.exists("data")) {dir.create("data")} #creat a file for data
fileUrl <- "https://adsfa"
download.file(fileUrl, destfile = "./data/name",method="curl")
dataDownloaded <- date()
#Current download methods are "internal", "wininet" (Windows only)
#"libcurl", "wget" and "curl", and there is a value "auto"
#importing data from excel
#package gdata
library("gdata")
dataset <- read.xls("file", sheetIndex=1, header = TRUE)
rm() #delete one or more objects
pdf("filename.pdf") #save the graph as a pdf
#or
png("") #save in png
dev.off() #shuts down the specified (by default the current) device
manager <- c(1, 2, 3, 4, 5)
date <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09")
country <- c("US", "US", "UK", "UK", "UK")
gender <- c("M", "F", "F", "M", "F")
age <- c(32, 45, 25, 39, 99)
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)
leadership <- data.frame(manager, date, country, gender, age,
q1, q2, q3, q4, q5, stringsAsFactors=FALSE)
within()
with()
#The within() function is similar to the with() function,
#but allows you to modify the data frame.
na.omit() #deletes any rows with missing data
#date variable, The default format for inputting dates is yyyy-mm-dd
strDates <- c("01/05/1965", "08/16/1975")
dates <- as.Date(strDates, "%m/%d/%Y")
Sys.Date() #returns today’s date
date() #returns the current date and time
format(x, format="output_format")
today <- Sys.Date()
format(today, format="%B %d %Y")
format(today, format="%A")
#output dates in a specified format, and to extract portions of dates
difftime()
#calculate a time interval and express it
#as seconds, minutes, hours, days, or weeks
today <- Sys.Date()
dob <- as.Date("1956-10-12")
difftime(today, dob, units="weeks")
#Time difference of 2825 weeks
#Sorting data
newdata <- leadership[order(gender, age),]
#By default, the sorting order is ascending
total <- merge(dataframeA, dataframeB, by="ID")
#Merging dataset (inner joint)
#NULL (undefined) isn’t the same as NA (missing)
newdata <- subset(leadership, age >= 35 | age < 24,
select=c(q1, q2, q3, q4))
#elect all rows that have a value of age >=35 or age < 24.
#You keep the variables q1 through q4
sample() #enables you to take a random sample (with or without replacement)
#of size n from a dataset
#Using SQL statements to manipulate data frames
#library sqldf
library(sqldf)
newdf <- sqldf("select * from mtcars where carb=1 order by mpg",
row.names=TRUE)
scale()
#standardizes the specified columns of a matrix or data frame to a mean of 0
# and a standard deviation of 1
library(MASS)
mvrnorm(n, mean, sigma)
#draw data from multivariate normal distribution with a given mean vector and
# covariance matrix
#edit text variable
nchar(x) #Counts the number of characters of x
substr(x, start, stop) #Extract or replace substrings in a character vector
grep(pattern, x, ignore.case=FALSE, fixed=FALSE)
#Search for pattern in x. If fixed=FALSE, then pattern is a regular expression.
#If fixed=TRUE, then pattern is a text string. Returns matching indices.
sub(pattern, replacement, x, ignore.case=FALSE, fixed=FALSE)
#Find pattern in x and substitute with replacement text.
#If fixed=FALSE then pattern is
#a regular expression. If fixed=TRUE then pattern is a text string.
gsub() #replace all the pattern
strsplit(x, split, fixed=FALSE)
#Split the elements of character vector x at split.
y <- strsplit("abc", "")
unlist(y)
sapply(y, "[", 2) #both return the 2nd element of y ([ means select elements)
paste(..., sep="") #Concatenate strings after using sep string to separate them.
toupper(x)
tolower(x)
cat(... , file = "", sep = " ")
cut(x, n) #Divide continuous variable x into factor with n levels
#Aggregation and restructuring
aggregate(data[variable.names], by, FUN)
#where x is the data object to be collapsed,
#by is a list of variables that will be crossed to form the new observations,
#and FUN is the scalar function used to calculate summary statistics that
#will make up the new observation values.
install.packages("reshape")
library(reshape)
md <- melt(mydata, id=(c("id", "time")))
melt(data) # put all the data in one col
cast(md, formula, FUN)
prop.table(mytable) #turn these frequencies table into proportions
#CHI-SQUARE TEST OF INDEPENDENCE
chisq.test(table)
# FISHER’S EXACT TEST for Independence
fisher.test(table)
#COCHRAN–MANTEL–HAENSZEL TEST
mantelhaen.test(table)
#The significance tests in the previous section evaluated whether
#or not sufficient evidence existed to reject a null hypothesis
#of independence between variables. If you can reject the null hypothesis,
#your interest turns naturally to measures of association in order to
#gauge the strength of the relationships present
#In statistics,the phi coefficienti s a measure of association for two binary variables.
assocstats(table)
#The Pearson product moment correlation assesses the degree of linear relationship
#between two quantitative variables. Spearman’s Rank Order correlation coefficient
#assesses the degree of relationship between two rank-ordered variables.
#Kendall’s Tau is also a nonparametric measure of rank correlation.
cor(x, y = NULL, use = "everything",
method = c("pearson", "kendall", "spearman"))
#A partial correlation is a correlation between two quantitative variables,
#controlling for one or more other quantitative variables.
library(ggm)
# partial correlation of population and murder rate, controlling
# for income, illiteracy rate, and HS graduation rate
pcor(c(1,5,2,3,6), cov(states))
#Testing correlations for significance
cor.test(x, y,
alternative = c("two.sided", "less", "greater"),
method = c("pearson", "kendall", "spearman"),
exact = NULL, conf.level = 0.95)
#t-test
t.test(x, y = NULL,
alternative = c("two.sided", "less", "greater"),
mu = 0, paired = FALSE, var.equal = FALSE,
conf.level = 0.95)
#Nonparametric tests of group differences
#If the two groups are independent, you can use the Wilcoxon rank sum test
#(more popularly known as the Mann–Whitney U test) to
#assess whether the observations are sampled from the same probability distribution
wilcox.test(x, y = NULL,
alternative = c("two.sided", "less", "greater"),
mu = 0, paired = FALSE, exact = NULL, correct = TRUE,
conf.int = FALSE, conf.level = 0.95)
#nonparametric test of ANOVA
kruskal.test(y ~ A, data)
friedman.test(y ~ A | B, data)
#where y is the numeric outcome variable, A is a grouping variable,
#and B is a blocking variable that identifies matched observations.
#linear regression
myfit <- lm(formula, data)
# : Denotes an interaction between predictor variables. A prediction of y from x, z,
# and the interaction between x and z would be coded y ~ x + z + x:z.
# * A shortcut for denoting all possible interactions.
# The code y ~ x * z * w expands to y ~ x + z + w + x:z + x:w + z:w + x:z:w.
# ^ Denotes interactions up to a specified degree. The code y ~ (x + z + w)^2
# expands to y ~ x + z + w + x:z + x:w + z:w.
# - A minus sign removes a variable from the equation.
#For example, y ~ (x + z + w)^2 – x:w expands to y ~ x + z + w + x:z + z:w.
# -1 Suppresses the intercept. For example, the formula y ~ x -1
# fits a regression of y on x, and forces the line through the origin at x=0.
# I() Elements within the parentheses are interpreted arithmetically.
# For example, y ~ x + (z + w)^2 would expand to y ~ x + z + w + z:w.
# In contrast, the code y ~ x + I((z + w)^2) would expand to y ~ x + h,
# where h is a new variable created by squaring the sum of z and w.
summary()
coefficients()
confint()
fitted()
residuals()
anova()
vcov()
AIC()
plot()
predict()
#multilinear regression
cor() #examine the relationships among the variables two at a time
library(car)
scatterplotMatrix(states, spread=FALSE, lty.smooth=2,
main="Scatter Plot Matrix")
#A significant interaction between two predictor variables tells
#you that the relationship between one predictor and the response variable
#depends on the level of the other predictor.
#Regression diagnostics(plot)
fit <- lm(weight ~ height, data=women)
par(mfrow=c(2,2))
plot(fit)
#Multicollinearity problem (variance inflation factor)
#As a general rule, sqrt(vif) > 2 indicates a multicollinearity problem
library(car)
vif(fit)
#Outliers
#Outliers are observations that aren’t predicted well by the model.
#They have either unusually large positive or negative residuals
#Points in the Q-Q plot that lie outside the confidence band are considered outliers.
#A rough rule of thumb is that standardized residuals that are larger than 2
#or less than –2 are worth attention.
library(car)
outlierTest(fit)
#reports the Bonferroni adjusted p-value for the largest absolute
#studentized residual
#Observations that have high leverage are outliers with regard to the other predictors.
hat.plot <- function(fit) {
p <- length(coefficients(fit))
n <- length(fitted(fit))
plot(hatvalues(fit), main="Index Plot of Hat Values")
abline(h=c(2,3)*p/n, col="red", lty=2)
identify(1:n, hatvalues(fit), names(hatvalues(fit)))
}
hat.plot(fit)
#Influential observations are observations that have a disproportionate
#impact on the values of the model parameters.
#Cook’s distance, or D statistic and added variable plots.
#Roughly speaking, Cook’s D values greater than 4/(n-k-1),
#where n is the sample size and k is the number of predictor variables,
#indicate influential observations.
cutoff <- 4/(nrow(states)-length(fit$coefficients)-2)
plot(fit, which=4, cook.levels=cutoff)
abline(h=cutoff, lty=2, col="red")
library(car)
influencePlot(fit, id.method="identify", main="Influence Plot",
sub="Circle size is proportional to Cook’s distance")
#Box–Cox transformation to normality
library(car)
summary(powerTransform(states$Murder))
#Backward stepwise selection
library(MASS)
fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost,
data=states)
stepAIC(fit, direction="backward")
#Relative importance
#Standardized regression coefficients
zstates <- as.data.frame(scale(states))
zfit <- lm(Murder~Population + Income + Illiteracy + Frost, data=zstates)
coef(zfit)
#relative weights
relweights <- function(fit,...){
R <- cor(fit$model)
nvar <- ncol(R)
rxx <- R[2:nvar, 2:nvar]
rxy <- R[2:nvar, 1]
svd <- eigen(rxx)
evec <- svd$vectors
ev <- svd$values
delta <- diag(sqrt(ev))
lambda <- evec %*% delta %*% t(evec)
lambdasq <- lambda ^ 2
beta <- solve(lambda) %*% rxy
rsquare <- colSums(beta ^ 2)
rawwgt <- lambdasq %*% beta ^ 2
import <- (rawwgt / rsquare) * 100
lbls <- names(fit$model[2:nvar])
rownames(import) <- lbls
colnames(import) <- "Weights"
barplot(t(import),names.arg=lbls,
ylab="% of R-Square",
xlab="Predictor Variables",
main="Relative Importance of Predictor Variables",
sub=paste("R-Square=", round(rsquare, digits=3)),
...)
return(import)
}
#ANOVA
#It's called a between-groups factor because patients
#are assigned to one and only one group.
#It's called a within-groups factor because each patient is
#measured under both levels.
#design for anova
#One-way ANOVA y ~ A
#One-way ANCOVA with one covariate y ~ x + A
#Two-way Factorial ANOVA y ~ A * B
#Two-way Factorial ANCOVA with two covariates y ~ x1 + x2 + A * B
#Randomized Block y ~ B + A (where B is a blocking factor)
#One-way within-groups ANOVA y ~ A + Error(Subject/A)
#Repeated measures ANOVA with one within-groups
#factor (W) and one between-groups factor (B) y ~ B * W + Error(Subject/W)
fit <- aov(response ~ trt)
summary(fit)
#plot group means with confidence intervals
library(gplots)
plotmeans(response ~ trt, xlab="Treatment", ylab="Response",
main="Mean Plot\nwith 95% CI")
#multiple comparisons
#Tukey HSD pairwise group comparisons
TukeyHSD(fit) #if HH is loaded, TukeyHSD() will fail. In that case, use detach("package:HH")
par(las=2)
par(mar=c(5,8,4,2))
plot(TukeyHSD(fit))
#bar plots
library(multcomp)
par(mar=c(5,4,6,2))
tuk <- glht(fit, linfct=mcp(trt="Tukey"))
plot(cld(tuk, level=.05),col="lightgrey")
#Assessing test assumptions
#normality
library(car)
qqPlot(lm(response ~ trt, data=cholesterol),
simulate=TRUE, main="Q-Q Plot", labels=FALSE)
#equality of variances
bartlett.test(response ~ trt, data=cholesterol)
#outliers
library(car)
outlierTest(fit)
#one way ancova
fit <- aov(weight ~ gesttime + dose)
summary(fit)
#obtain adjusted group means-that is, the group means obtained after partialing out the effects of the covariate
library(effects)
effect("dose", fit)
#Multiple comparisons employing user-supplied contrasts
library(multcomp)
contrast <- rbind("no drug vs. drug" = c(3, -1, -1, -1))
summary(glht(fit, linfct=mcp(dose=contrast)))
#standard ANCOVA designs assumes homogeneity of regression slopes.
library(multcomp)
fit2 <- aov(weight ~ gesttime*dose, data=litter)
summary(fit2)
#A significant interactionwould imply that the relationship between gestation
#and birth weight depends on the level of the dose variable.
#Visualizing ancova
library(HH)
ancova(weight ~ gesttime + dose, data=litter)
#Two-way factorial ANOVA
attach(ToothGrowth)
fit <- aov(len ~ supp*dose)
summary(fit)
#visualizing interaction
interaction.plot(dose, supp, len, type="b",
col=c("red","blue"), pch=c(16, 18),
main = "Interaction between Dose and Supplement Type")
#or
library(gplots)
plotmeans(len ~ interaction(supp, dose, sep=" "),
connect=list(c(1,3,5),c(2,4,6)),
col=c("red", "darkgreen"),
main = "Interaction Plot with 95% CIs",
xlab="Treatment and Dose Combination")
#or
library(HH)
interaction2wt(len~supp*dose)
#Repeated measures ANOVA
w1b1 <- subset(CO2, Treatment=='chilled')
fit <- aov(uptake ~ conc*Type + Error(Plant/(conc)), w1b1)
summary(fit)
#visualizing the result
par(las=2)
par(mar=c(10,4,4,2))
with(w1b1, interaction.plot(conc,Type,uptake,
type="b", col=c("red","blue"), pch=c(16,18),
main="Interaction Plot for Plant Type and Concentration"))
boxplot(uptake ~ Type*conc, data=w1b1, col=(c("gold", "green")),
main="Chilled Quebec and Mississippi Plants",
ylab="Carbon dioxide uptake rate (umol/m^2 sec)")
#Multivariate analysis of variance (MANOVA)
library(MASS)
attach(UScereal)
y <- cbind(calories, fat, sugars)
aggregate(y, by=list(shelf), FUN=mean)
fit <- manova(y ~ shelf)
summary(fit)
summary.aov(fit)
#Robust MANOVA
library(rrcov)
Wilks.test(y,shelf,method="mcd")
#Power Analysis
#In planning research, the researcher typically pays special attention to four quantities:
#sample size, significance level, power, and effect size
#Sample size refers to the number of observations in each condition/group of the
#experimental design.
#The significance level (also referred to as alpha) is defined as the probability of
#making a Type I error. The significance level can also be thought of as the probability
#of finding an effect that is not there.
#Power is defined as one minus the probability of making a Type II error. Power
#can be thought of as the probability of finding an effect that is there.
#Effect size is the magnitude of the effect under the alternate or research hypothesis.
#The formula for effect size depends on the statistical methodology employed
#in the hypothesis testing
#Given any three, you can calculate the fourth
#power of t-test (equal sample size)
library(pwr)
pwr.t.test(n=, d=, sig.level=, power=, alternative=,type=)
#d is the effect size defined as the standardized mean difference
#d=(u1-u2)/sigma
#If the sample sizes for the two groups are unequal
pwr.t2n.test(n1=, n2=, d=, sig.level=, power=, alternative=)
#power analysis of a balanced oneway analysis of variance
pwr.anova.test(k=, n=, f=, sig.level=, power=)
#For a one-way ANOVA, effect size is measured by f
#f=sqrt(sum(n/N*(ui-u)^2/sigma^2))
#power analysis of correlation test
pwr.r.test(n=, r=, sig.level=, power=, alternative=)
#r is the effect size (as measured by a linear correlation coefficient)
#power analysis of Linear models
pwr.f2.test(u=, v=, f2=, sig.level=, power=)
#u and v are the numerator and denominator degrees of freedom
#f2 is the effect size
#Tests of proportions
pwr.2p.test(h=, n=, sig.level=, power=)
#h is effect size, h = 2arcsin(sqrt(p1))-2arcsin(sqrt(p2))
#h = ES.h(p1, p2)
#Chi-square tests
pwr.chisq.test(w =, N = , df = , sig.level =, power = )
#w = ES.w2(P), P is a probability table.
#permutation test
#t-test (must be factor)
library(coin)
score <- c(40, 57, 45, 55, 58, 57, 64, 55, 62, 65)
treatment <- factor(c(rep("A",5), rep("B",5)))
mydata <- data.frame(treatment, score)
oneway_test(score~treatment, data=mydata, distribution="exact")
#Permutation tests with the lmPerm package
lmp() aovp() # lm() and aov()
library(lmPerm)
set.seed(1234)
fit <- lmp(weight~height, data=women, perm="Prob")
#Bootstrapping
#a function for obtaining the R-squared value
rsq <- function(formula, data, indices) {
d <- data[indices,]
fit <- lm(formula, data=d)
return(summary(fit)$r.square)
}
library(boot)
set.seed(1234)
results <- boot(data=mtcars, statistic=rsq,
R=1000, formula=mpg~wt+disp)
boot.ci(results, type=c("perc", "bca"))
#Generalized linear models
#logistic regression (where the dependent variable is categorical) and
#Poisson regression (where the dependent variable is a count variable)
#binomial (link = "logit")
#gaussian (link = "identity")
#gamma (link = "inverse")
#inverse.gaussian (link = "1/mu^2")
#poisson (link = "log")
#quasi (link = "identity", variance = "constant")
#quasibinomial (link = "logit")
#quasipoisson (link = "log")
#logistic regression
glm(Y~X1+X2+X3, family=binomial(link="logit"), data=mydata)
data(Affairs, package="AER")
Affairs$ynaffair[Affairs$affairs > 0] <- 1
Affairs$ynaffair[Affairs$affairs == 0] <- 0
Affairs$ynaffair <- factor(Affairs$ynaffair,
levels=c(0,1),
labels=c("No","Yes"))
fit.full <- glm(ynaffair ~ gender + age + yearsmarried + children +
religiousness + education + occupation +rating,
data=Affairs,family=binomial())
fit.reduced <- glm(ynaffair ~ age + yearsmarried + religiousness +
rating, data=Affairs, family=binomial())
anova(fit.reduced, fit.full, test="Chisq")
#Overdispersion occurs when the observed variance of the response variable
#is larger than what would be expected from a binomial distribution.
#poission regression
glm(Y~X1+X2+X3, family=poisson(link="log"), data=mydata)
data(breslow.dat, package="robust")
fit <- glm(sumY ~ Base + Age + Trt, data=breslow.dat, family=poisson())
#also need to test overdispersion
library(qcc)
qcc.overdispersion.test(breslow.dat$sumY, type="poisson")
#use family="quasipoisson" when overdispersion is present (logit/poisson)
#POISSON REGRESSION WITH VARYING TIME PERIODS
fit <- glm(sumY ~ Base + Age + Trt, data=breslow.dat,
offset= log(time), family=poisson())
#ZERO-INFLATED POISSON REGRESSION
zeroinfl() #function in the pscl package
#PCA and factor analysis
#factor analysis requires 5–10 times as many subjects as variables.
#pca & efa are from correlations
#parallel analysis
library(psych)
fa.parallel(USJudgeRatings[,-1], fa="pc", n.iter=100,
show.legend=FALSE, main="Scree plot with parallel analysis")
abline(h=1)
#Extracting principal components
library(psych)
pc <- principal(USJudgeRatings[,-1], nfactors=1)
pc
#pca with varimax rotation (orthogonal rotation)
rc <- principal(Harman23.cor$cov, nfactors=2, rotate="varimax")
library(psych)
rc <- principal(Harman23.cor$cov, nfactors=2, rotate="varimax")
round(unclass(rc$weights), 2)
#deciding how many common factors to extract
library(psych)
covariances <- ability.cov$cov
correlations <- cov2cor(covariances)
fa.parallel(correlations, n.obs = 112, fa="both", n.iter = 100,
main="Scree plots with parallel analysis")
#For EFA, the Kaiser–Harris criterion is number of eigenvalues above 0
# rather than 1.
#deal with missing value