Commit 978c0f7912211370ce279a8973091245c8a149a7

Authored by Jingan Qu
1 parent 658ed0df54
Exists in master

upload two R code file

Showing 3 changed files with 1555 additions and 791 deletions   Show diff stats
GPL22111.csv
File was created 1
2
3 #download the file to load
4 if (!file.exists("data")){dir.create("data")}
5 fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
6 download.file(fileUrl, destfile = "./data/cameras.csv",method = "curl")
7 dateDownloaded <- date()
8 if (!file.exists("data")){dir.create("data")}
9 fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
10 download.file(fileUrl, destfile = "./data/cameras.csv",method = "curl")
11 dateDownloaded <- date()
12
13 dataset <- read.csv("./data/cameras.csv")
14 #equal to
15 dataset <- read.table("./data/cameras.csv",sep=",",header = TRUE)
16
17
18 if (!file.exists("data")){dir.create("data")}
19 fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.xlsx?accessType=DOWNLOAD"
20 download.file(fileUrl, destfile = "./data/cameras.xlsx",method = "curl")
21 dateDownloaded <- date()
22
23 #read xlsx file
24 library(gdata)
25 dataset <- read.csv("./data/cameras.xlsx")
26
27 #data table
28 library(data.table)
29 DF <- data.frame(x=rnorm(9),y=rep(c("a","b","c"),each=3),z=rnorm(9))
30 DT <- data.table(x=rnorm(9),y=rep(c("a","b","c"),each=3),z=rnorm(9))
31 #subsetting rows
32 DT[2,]
33 DT[DT$y == 'a']
34 DT[c(2,3)]
35 #subsetting columns
36 DT[,list(x)]
37 #calculating values for variables
38 DT[,list(mean(x),sum(z))]
39 DT[,table(y)]
40 #adding new columns
41 DT[,w:=z^2]
42 DT2 <- DT
43 DT[,y:=2] #DT, DT2 the same
44 #multiple operations
45 DT[,m:={tmp<-(x+z);log2(tmp+5)}]
46 DT[,a:=x>0]
47 DT[,b:=mean(x+w),by=a]
48 #special variables
49 set.seed(123)
50 DT <- data.table(x=sample(letters[1:3],1E5,TRUE))
51 #keys
52 DT <- data.table(x=rep(c("a","b","c"),each=100),y=rnorm(300))
53 setkey(DT,x)
54 DT['a']
55 #joins
56 DT1 <- data.table(x=c('a','a','b','dt1'),y=1:4)
57 DT2 <- data.table(x=c('a','b','dt2'),z=5:7)
58 setkey(DT1,x); setkey(DT2,x)
59 merge(DT1, DT2)
60
61
62 #read XML into R
63 library(XML)
64 fileUrl <- "http://www.w3schools.com/xml/simple.xml"
65 doc <- xmlTreeParse(fileUrl, useInternal = TRUE)
66 rootNode <- xmlRoot(doc)
67 xmlName(rootNode)
68 names(rootNode)
69 #directly access parts of XML doc
70 rootNode[[1]]
71 rootNode[[1]][[1]]
72 #programmatically extract parts of the file
73 xmlSApply(rootNode,xmlValue)
74 xpathSApply(rootNode, "//name",xmlValue)
75 xpathSApply(rootNode, "//price",xmlValue)
76 #another example
77 fileUrl <- "http://espn.com/nfl/team/_/name/baltimore-ravens"
78 doc <- htmlTreeParse(fileUrl, useInternal = TRUE)
79 scores <- xpathSApply(doc,"li[@class='score']",xmlValue)
80 teams <- xpathSApply(doc,"li[@class='team-name']",xmlValue)
81
82 #Making a Proportional Stacked Bar Graph
83
84 #reading JSON
85 #javascript object notation
86 library(jsonlite)
87 jsonData <- fromJSON("https://api.github.com/users/jtleek/repos")
88 names(jsonData)
89 names(jsonData$owner)
90 jsonData$owner$login
91 #writing data frame to JSON
92 myjson <- toJSON(iris, pretty = TRUE)
93 cat(myjson)
94 #convert back to JSON
95 iris2 <- fromJSON(myjson)
96
97
98 #reading mySQL
99 library(RMySQL)
100 ucscDb <- dbConnect(MySQL(),user="genome",
101 host="genome-mysql.cse.ucsc.edu")
102 result <- dbGetQuery(ucscDb,"show databases;");dbDisconnect(ucscDb)
103
104 hg19 <- dbConnect(MySQL(),user="genome",db='hg19',
105 host="genome-mysql.cse.ucsc.edu")
106 allTables <- dbListTables(hg19)
107 length(allTables)
108 #get dimensions of a specific table
109 dbListFields(hg19,"affyU133Plus2")
110 dbGetQuery(hg19,"select count(*) from affyU133Plus2")
111 result <- dbGetQuery(hg19,"select * from affyU133Plus2")
112 #read from the table
113 affyData <- dbReadTable(hg19,"affyU133Plus2")
114 #query
115 affyMis <- dbGetQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3")
116 #equal to
117 query <- dbSendQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3")
118 affyMis <- fetch(query)
119 quantile(affyMis$misMatches)
120 dbClearResult(query)
121 #disconnect mySQL
122 dbDisconnect(hg19)
123
124
125 #reading HDF5
126 source("http://bioconductor.org/biocLite.R")
127 biocLite("rhdf5")
128 library(rhdf5)
129 created <- h5createFile("example.h5")
130 created <- h5createGroup("example.h5","foo")
131 created <- h5createGroup("example.h5","baa")
132 created <- h5createGroup("example.h5","foo/foobaa")
133 h5ls("example.h5")
134 #write to groups
135 A <- matrix(1:10, nr=5, nc =2)
136 h5write(A, "example.h5","foo/A")
137 B <- array(seq(0.1,2.0,by=0.1),dim=c(5,2,2))
138 attr(B,"scale") <- "liter"
139 h5write(B,"example.h5","foo/foobaa/B")
140 h5ls("example.h5")
141 #write a data set
142 df <- data.frame(1L:5L, seq(0,1,length.out = 5), c('ab','cde','fghi','a','s'),
143 stringsAsFactors = F)
144 h5write(df,"example.h5","df")
145 h5ls("example.h5")
146 #reading data
147 readA <- h5read("example.h5","foo/A")
148 readB <- h5read("example.h5","foo/foobaa/B")
149 readdf <- h5read("example.h5",'df')
150 readA
151
152
153 #reading data from APIs
154
155 #subsetting
156 set.seed(13435)
157 X <- data.frame("var1"=sample(1:5),"var2"=sample(6:10),"var3"=sample(11:15))
158 X <- X[sample(1:5),]
159 X$var2[c(1,3)] <- NA
160 #dealing with missing values
161 X[which(X$var2>8),] #which can ignore the NA values
162 #sorting川普
163 sort(X$var1)
164 sort(X$var1,decreasing = TRUE)
165 sort(X$var2,na.last = TRUE)
166 #ordering
167 X[order(X$var1),]
168 #ording by plyr
169 library(plyr)
170 arrange(X,var1)
171 arrange(X,desc(var1))
172
173
174 #creating new variables
175 #getting the data from the web
176 if (!file.exists("data")){dir.create("data")}
177 fileUrl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
178 download.file(fileUrl, destfile = "./data/restaurants.csv",method = "curl")
179 dateDownloaded <- date()
180 restData <- read.csv("./data/restaurants.csv")
181
182 #subsetting variables
183 restData$nearMe <- restData$neighborhood %in% c("Roland Park","Homeland")
184 table(restData$nearMe)
185 #creating binary variables
186 restData$zipWrong <- ifelse(restData$zipCode < 0, TRUE, FALSE)
187 with(restData,table(zipWrong,zipCode < 0))
188 #creating categorical variable
189 restData$zipGroup <- cut(restData$zipCode, breaks = quantile(restData$zipCode))
190 table(restData$zipGroup)
191 #easier cutting
192 library(Hmisc)
193 restData$zipGroup <- cut2(restData$zipCode,g=4)
194 table(restData$zipGroup)
195 #creating factor variables
196 restData$zcf <- factor(restData$zipCode)
197 #levels of factor
198 yesno <- sample(c("yes","no"),size = 10,replace = TRUE)
199 yesnofac <- factor(yesno,levels = c("yes","no"))
200 relevel(yesnofac,ref = "no")
201 #cutting produces factor variables
202 library(Hmisc)
203 restData$zipGroup <- cut2(restData$zipCode,g=4)
204 #using mutate function
205 library(Hmisc)
206 library(plyr)
207 restData2 <- mutate(restData,zipGroup=cut2(zipCode,g=4))
208 table(restData2$zipGroup)
209
210 #smmarizing data
211 colSums(is.na(restData))
212 all(colSums(is.na(restData)) == 0)
213 #values with specific character
214 table(restData$zipCode %in% c("21212","21213"))
215 #cross table
216 data("UCBAdmissions")
217 df1 <- as.data.frame(UCBAdmissions)
218 summary(df1)
219 xt <- xtabs(Freq ~Gender +Admit,data=df1) #sum
220 #flat tables
221 warpbreaks$replicate <- rep(1:9, len=54)
222 xt <- xtabs(breaks~.,data=warpbreaks)
223 ftable(xt)
224 #size of a data set
225 fakeData <- rnorm(1e5)
226 object.size(fakeData)
227 print(object.size(fakeData),units = "Mb")
228
229
230 #dplyr package
231 library(dplyr)
232 #select
233 chicago <- readRDS("./data/chicago.rds")
234 head(select(chicago, 1:5)) #select col
235 head(select(chicago, city:dptp))
236 head(select(chicago,-(city:dptp))) #not select
237 #equivalent
238 i <- match("city", names(chicago))
239 j <- match("dptp", names(chicago))
240 head(chicago[,-(i:j)])
241
242 #filter
243 chic <- filter(chicago, pm25tmean2 > 30)
244 head(select(chic,1:3,pm25tmean2))
245 chic <- filter(chicago, pm25tmean2 > 30 & tmpd > 80)
246 head(select(chic,1:3,pm25tmean2,tmpd))
247
248 #arrange (reorder rows while preserving corresponding order of other col)
249 chicago <- arrange(chicago, date)
250 head(select(chicago,date,pm25tmean2))
251 #descending order
252 chicago <- arrange(chicago, desc(date))
253 head(select(chicago,date,pm25tmean2))
254
255
256 #rename
257 chicago <- rename(chicago, dewpoint = dptp, pm25 = pm25tmean2)
258 head(select(chicago,1:5))
259
260 #mutate
261 chicago <- mutate(chicago,
262 pm25detrend = pm25 - mean(pm25,na.rm = TRUE))
263 head(select(chicago,pm25,pm25detrend))
264
265 #group_by (generate summary statistics by stratum)
266 chicago <- mutate(chicago,
267 tempcat=factor(1*(tmpd>80), labels = c("cold","hot")))
268 #1* just transfor to numeric (0,1)
269 hotcold <- group_by(chicago, tempcat)
270 summarize(hotcold, pm25 = mean(pm25, na.rm = TRUE),
271 o3 = max(o3tmean2, na.rm = TRUE),
272 no2 = median(no2tmean2, na.rm = TRUE))
273
274 chicago <- mutate(chicago,
275 year = as.POSIXlt(date)$year + 1900)
276 years <- group_by(chicago, year)
277 summarize(years, pm25 = mean(pm25, na.rm = T),
278 o3 = max(o3tmean2, na.rm = T),
279 no2 = median(no2tmean2, na.rm = T))
280
281 chicago %>% mutate(month = as.POSIXlt(date)$mon +1) %>% group_by(month) %>% summarize(pm25 = mean(pm25, na.rm = T),
282 o3 = max(o3tmean2, na.rm = T),
283 no2 = median(no2tmean2, na.rm = T))
284
285
286 if (!exists("./data")) {dir.create("./data")}
287 fileUrl1 <- "https://dl.dropboxusercontent.com/u/7710864/data/reviews-apr29.csv"
288 fileUrl2 <- "https://dl.dropboxusercontent.com/u/7710864/data/solutions-apr29.csv"
289
290 download.file(fileUrl1,destfile = "./data/reviews.csv",method = "curl")
291 download.file(fileUrl2,destfile = "./data/solutions.csv",method = "curl")
292
293 reviews <- read.csv("./data/reviews.csv")
294 solutions <- read.csv("./data/solutions.csv")
295
296 #merge data
297 mergedData <- merge(reviews, solutions, by.x = "solution_id", by.y="id",all=TRUE)
298
299 #join in the plyr package
300 library(plyr)
301 df1 <- data.frame(id = sample(1:10), x=rnorm(10))
302 df2 <- data.frame(id = sample(1:10), y = rnorm(10))
303 arrange(join(df1, df2),id)
304 #join multiple data frames
305 df1 <- data.frame(id = sample(1:10), x=rnorm(10))
306 df2 <- data.frame(id = sample(1:10), y = rnorm(10))
307 df3 <- data.frame(id = sample(1:10),z=rnorm(10))
308
309 dfList <- list(df1,df2,df3)
310 join_all(dfList)
311
312 #working with date
313 d1 <- date()
314 class(d1) #charactor
315
316 d2 <- Sys.Date()
317 class(d2) #date
318 #formatting dates
319 format(d2, "%a %b %d")
320
321 x <- c("1jan1960","2jan1960")
322 z <- as.Date(x, "%d%b%Y")
323 #converting
324 weekdays(d2)
325
326 #lubridate
327 library(lubridate)
328 ymd("20140108")
329 ymd("2011-08-03")
330
331 #editing text variables
332
333 if (!file.exists("data")){dir.create("data")}
334 fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
335 download.file(fileUrl, destfile = "./data/cameras.csv",method = "curl")
336
337 camera <- read.csv("./data/cameras.csv")
338 tolower(names(camera))
339
340 #fixing character vectors
341 splitNames <- strsplit(names(camera), "\\.")
342 splitNames[[6]][1]
343 firstElement <- function(x){x[1]}
344 sapply(splitNames,firstElement)
345
346 fileUrl1 <- "https://dl.dropboxusercontent.com/u/7710864/data/reviews-apr29.csv"
347 fileUrl2 <- "https://dl.dropboxusercontent.com/u/7710864/data/solutions-apr29.csv"
348
349 download.file(fileUrl1,destfile = "./data/reviews.csv",method = "curl")
350 download.file(fileUrl2,destfile = "./data/solutions.csv",method = "curl")
351
352 reviews <- read.csv("./data/reviews.csv")
353 solutions <- read.csv("./data/solutions.csv")
354 names(reviews)
355 #replacement
356 sub("_", "", names(reviews)) #just replace one
357 gsub() #replace all
358
359
360 #finding values
361 grep("Alameda", camera$intersection) #return location
362 grepl() #return T or F
363
364 grep("Alameda", camera$intersection, value = TRUE) #return exact element
365
366
367 #stringr library
368 library(stringr)
369 nchar("Jeffrey Leek") #count
370 substr("Jeffrey Leek",1,7)
371 paste("Jeffrey","Leek") #with blank
372 paste0("Jeffrey", "Leek") #without blank
373 str_trim("Jeff ") #remove the blanks (beginning and ending)
374
375
376 #reshaping
377 library(reshape2)
378 mtcars$carname <- rownames(mtcars)
379 carMelt <- melt(mtcars, id=c("carname","gear","cyl"), measure.vars = c("mpg","hp"))
380
381
382 #casting data frames
383 cylData <- dcast(carMelt,cyl~variable) #count
384 cylData <- dcast(carMelt,cyl~variable, mean)
385
386 #averaging values
387 head(InsectSprays)
388 tapply(InsectSprays$count,InsectSprays$spray,sum) #
389 #split
390 spIns <- split(InsectSprays$count, InsectSprays$spray)
391 sapply(spIns,sum)
392 #or
393 unlist(lapply(spIns,sum))
394
395 #plyr package
396 library(plyr)
397 ddply(InsectSprays, "spray",summarize,sum=sum(count)) #return dataframe
398
399
400
401
402
403
404
405
File was created 1
2 set.seed(1410)
3
4 dsmall <- diamonds[sample(nrow(diamonds), 100),]
5
6
7 #scatterplot
8 qplot(carat, price, data = diamonds)
9 qplot(log(carat), log(price), data = diamonds)
10
11 qplot(carat, x*y*z, data = diamonds)
12 qplot(carat, price, data = dsmall, colour = color)
13 qplot(carat, price, data = dsmall, shape = cut)
14
15 qplot(carat, price, data = diamonds, alpha = I(1/10))
16 qplot(carat, price, data = diamonds, alpha = I(1/100))
17 qplot(carat, price, data = diamonds, alpha = I(1/200))
18
19
20 #geom
21 qplot(carat, price, data = dsmall, geom = c("point", "smooth"))
22 qplot(carat, price, data = diamonds, geom = c("point", "smooth"))
23
24
25 qplot(color, price/carat, data=diamonds,geom = "jitter")
26 qplot(color, price/carat, data=diamonds,geom = "boxplot")
27 qplot(color, price / carat, data = diamonds, geom = "jitter",
28 alpha = I(1 / 50))
29
30 #offer size, colour, shape for scatterplot
31 #offer size, colour, fill for boxplot
32 qplot(color, price/carat, data=diamonds,geom = "boxplot",colour="red")
33
34
35 qplot(carat, data = diamonds, geom = "histogram")
36 qplot(carat, data = diamonds, geom = "density")
37
38
39 qplot(carat, data = diamonds, geom = "histogram", binwidth = 1,
40 xlim = c(0,3))
41 qplot(carat, data = diamonds, geom = "histogram", binwidth = 0.1,
42 xlim = c(0,3))
43 qplot(carat, data = diamonds, geom = "histogram", binwidth = 0.01,
44 xlim = c(0,3))
45
46 qplot(carat, data = diamonds, geom = "density", colour = color)
47 qplot(carat, data = diamonds, geom = "histogram", fill = color, bins = 50)
48
49 qplot(color, data = diamonds, geom = "bar")
50 qplot(color, data = diamonds, geom = "bar", weight = carat) +
51 scale_y_continuous("carat") #total weight
52
53
54 qplot(date, unemploy / pop, data = economics, geom = "line")
55 qplot(date, uempmed, data = economics, geom = "line")
56
57 #path
58 year <- function(x) as.POSIXlt(x)$year + 1900
59 qplot(unemploy / pop, uempmed, data = economics,
60 geom = c("point", "path"))
61 qplot(unemploy / pop, uempmed, data = economics,
62 geom = "path", colour = year(date)) + scale_area()
63
64 #faceting
65 # row var ∼ col var
66 qplot(carat, data = diamonds, facets = color ~ .,
67 geom = "histogram", binwidth = 0.1, xlim = c(0, 3))
68 qplot(carat, ..density.., data = diamonds, facets = color ~ .,
69 geom = "histogram", binwidth = 0.1, xlim = c(0, 3))
70 #Using ..density.. tells ggplot2 to map the density to the y-axis
71 #instead of the default use of count
72 #density plot makes it easier to compare distributions ignoring
73 #the relative abundance of diamonds within each colour
74
75
76 #see ?plotmath for more examples of using mathematical formulae
77
78 qplot(
79 carat, price, data = dsmall,
80 xlab = "Price ($)", ylab = "Weight (carats)",
81 main = "Price-weight relationship" )
82
83
84 qplot(
85 carat, price/carat, data = dsmall,
86 xlab = "Weight(carats)", ylab = "expression(frac(price,carat))",
87 main = "Small diamonds",
88 xlim = c(.2, 1))
89
90 qplot(carat, price, data = dsmall, log = "xy")
91
92 qplot(displ, hwy, data = mpg, colour = factor(cyl)) #factor
93
94 #ggplot
95 p <- ggplot(diamonds, aes(x = carat))
96 p <- p + layer(
97 geom = "bar",
98 geom_params = list(fill = "steelblue"),
99 stat = "bin",
100 stat_params = list(binwidth = 2)
101 )
102 #layer(geom, geom_params, stat, stat_params, data, mapping, position)
103 ggplot(msleep, aes(sleep_rem / sleep_total, awake)) +
104 geom_point()
105
106 #geom_XXX(mapping, data, ..., geom, position)
107 #stat_XXX(mapping, data, ..., stat, position)
108 p <- ggplot(mtcars, aes(mpg, wt)) +
109 geom_point()
110
111 p + geom_point(aes(colour = factor(cyl)))
112 p + geom_point(aes(y=disp))
113
114 p <- ggplot(mtcars, aes(mpg, wt))
115 p + geom_point(colour = "darkblue")
116 p + geom_point(aes(colour = "darkblue"))
117
118 p + geom_point(aes(colour = factor(cyl)))
119
120 p <- ggplot(Oxboys, aes(age, height, group = Subject)) + geom_line()
121
122 p + geom_smooth(aes(group = Subject), method = "lm", se = F)
123
124 p + geom_smooth(aes(group = 1), method = "lm", se = F)
125
126 boysbox <- ggplot(Oxboys, aes(Occasion, height)) + geom_boxplot()
127
128 boysbox + geom_line(aes(group=Subject), color=I(8))
129
130 xgrid <- with(df, seq(min(x), max(x), length=50))
131
132 p <- ggplot(diamonds, aes(carat)) +
133 geom_histogram(aes(y=..count..), binwidth = 0.1)
134 p <- ggplot(diamonds, aes(carat)) +
135 geom_histogram(aes(y=..density..), binwidth = 0.1)
136 qplot(carat, ..density.., data=diamonds, geom = "histogram",binwidth = 0.1)
137
138 d <- ggplot(diamonds,aes(carat)) + xlim(0, 3)
139 d + stat_bin(aes(ymax = ..count..), binwidth = 0.1, geom = "area")
140 d + stat_bin(aes(size = ..density..), binwidth = 0.1, geom = "point", position = "identity")
141 d + stat_bin(aes(y = 1, fill = ..count..), binwidth = 0.1,geom = "tile", position = "identity")
142
143 #graph cook book
144 library(gcookbook)
145 library(ggplot2)
146 #chapte 2 quickly exploring data
147
148 #creating a scatter plot
149 plot(mtcars$wt, mtcars$mpg)
150 qplot(mtcars$wt, mtcars$mpg)
151 qplot(wt, mpg, data=mtcars)
152 # This is equivalent to:
153 ggplot(mtcars, aes(x=wt, y=mpg)) + geom_point()
154
155 #creating line plot
156 plot(pressure$temperature, pressure$pressure, type="l")
157
158 qplot(pressure$temperature, pressure$pressure, geom="line")
159 qplot(temperature, pressure, data=pressure, geom="line")
160 # This is equivalent to:
161 ggplot(pressure, aes(x=temperature, y=pressure)) + geom_line()
162 # Lines and points together
163 qplot(temperature, pressure, data=pressure, geom=c("line", "point"))
164 # Equivalent to:
165 ggplot(pressure, aes(x=temperature, y=pressure)) + geom_line() + geom_point()
166
167
168 #creating bar graph
169 barplot(BOD$demand, names.arg=BOD$Time) #name
170
171 # Generate a table of countsqplot(supp, len, data=ToothGrowth, geom="boxplot")
172 # This is equivalent to:
173 ggplot(ToothGrowth, aes(x=supp, y=len)) + geom_boxplot()
174 barplot(table(mtcars$cyl))
175 library(ggplot2)
176 ggplot(BOD, aes(x=factor(Time), y=demand)) + geom_bar(stat="identity")
177 # Bar graph of counts
178 qplot(factor(cyl), data=mtcars)
179 # This is equivalent to:
180 ggplot(mtcars, aes(x=factor(cyl))) + geom_bar()
181
182 #creating histogram
183 # Specify approximate number of bins with breaks
184 hist(mtcars$mpg, breaks=10)
185 library(ggplot2)
186 qplot(mpg, data=mtcars, binwidth=1)
187 # This is equivalent to:
188 ggplot(mtcars, aes(x=mpg)) + geom_histogram(binwidth=1)
189
190 #creating box plot
191 # Formula syntax
192 boxplot(len ~ supp, data = ToothGrowth)
193 library(ggplot2)
194 qplot(supp, len, data=ToothGrowth, geom="boxplot")
195 # This is equivalent to:
196 ggplot(ToothGrowth, aes(x=supp, y=len)) + geom_boxplot()
197 ggplot(ToothGrowth, aes(x=interaction(supp, dose), y=len)) + geom_boxplot()
198
199 #plot a function curve
200 curve(x^3 - 5*x, from=-4, to=4)
201 # Plot a user-defined function
202 myfun <- function(xvar) {
203 1/(1 + exp(-xvar + 10))
204 }
205 curve(myfun(x), from=0, to=20)
206 # Add a line:
207 curve(1-myfun(x), add = TRUE, col = "red")
208 library(ggplot2)
209 # This is equivalent to:
210 ggplot(data.frame(x=c(0, 20)), aes(x=x)) + stat_function(fun=myfun, geom="line")
211
212
213 #chapter 3 Bar graphs
214 ggplot(pg_mean, aes(x=group, y=weight)) + geom_bar(stat="identity")
215
216 ggplot(pg_mean, aes(x=group, y=weight)) +
217 geom_bar(stat="identity", fill="lightblue", colour="black")
218
219
220 #group by a second variable
221 ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
222 geom_bar(position="dodge", colour="black",stat="identity") #default stat is count
223
224 #stat="identity" means using sum
225
226 ggplot(diamonds, aes(x=cut)) + geom_bar(stat = "count") #default stat is count
227
228
229 upc <- subset(uspopchange, rank(Change)>40)
230 ggplot(upc, aes(x=Abb, y=Change, fill=Region)) + geom_bar(stat="identity")
231
232 ggplot(upc, aes(x=reorder(Abb, Change), y=Change, fill=Region)) +
233 geom_bar(stat="identity", colour="black") +
234 scale_fill_manual(values=c("#669933", "#FFCC66")) +
235 xlab("State")
236
237 #using the reorder() function to reorder the levels of a factor based on
238 #the values of another variable
239
240
241 #Coloring Negative and Positive Bars Differently
242 csub <- subset(climate, Source=="Berkeley" & Year >= 1900)
243 csub$pos <- csub$Anomaly10y >= 0
244 ggplot(csub, aes(x=Year, y=Anomaly10y, fill=pos)) +
245 geom_bar(stat="identity", position="identity") #default position is stack
246
247 ggplot(csub, aes(x=Year, y=Anomaly10y, fill=pos)) +
248 geom_bar(stat="identity", position="identity", colour="black", size=0.25) +
249 scale_fill_manual(values=c("#CCEEFF", "#FFDDDD"), guide=FALSE)
250
251
252 #Adjusting Bar Width and Spacing
253 #default value is 0.9
254 ggplot(pg_mean, aes(x=group, y=weight)) + geom_bar(stat="identity", width=0.5)
255 #a grouped bar graph with narrow bars
256 ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
257 geom_bar(stat="identity", width=0.5, position="dodge")
258 #with some space between the bars
259 ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
260 geom_bar(stat="identity", width=0.6, position=position_dodge(0.8)) #space = 0.8-0.6
261 #To be more precise, the value of width in position_dodge() is the same as
262 #width in geom_bar()
263
264
265 #Making a Stacked Bar Graph
266 ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
267 geom_bar(stat="identity")
268 #One problem with the default output is that the stacking order is the opposite of the
269 #order of items in the legend
270 #reverse the legend
271 ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
272 geom_bar(stat="identity") +
273 guides(fill=guide_legend(reverse=TRUE))
274 #reverse the stacking order
275 libary(plyr) # Needed for desc()
276 ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar, order=desc(Cultivar))) +
277 geom_bar(stat="identity")
278 #get a different color palette
279 ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
280 geom_bar(stat="identity", colour="black") +
281 guides(fill=guide_legend(reverse=TRUE)) +
282 scale_fill_brewer(palette="Pastel1")
283
284
285 #Making a Proportional Stacked Bar Graph
286 library(gcookbook)
287 library(plyr)
288 ce <- ddply(cabbage_exp,"Date",transform,
289 percent_weight = Weight / sum(Weight)*100)
290
291 ggplot(ce, aes(x=Date, y= percent_weight, fill =Cultivar)) +
292 geom_bar(stat="identity")
293 #change a little bit
294 ggplot(ce, aes(x=Date, y= percent_weight, fill =Cultivar)) +
295 geom_bar(stat="identity", color="black") +
296 guides(fill=guide_legend(reverse = TRUE)) +
297 scale_fill_brewer(palette = "Pastel1")
298
299 #adding labels to a bar graph
300 # Below the top
301 ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) +
302 geom_bar(stat="identity") +
303 geom_text(aes(label=Weight), vjust=1.5, colour="white")
304
305 # Above the top
306 ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) +
307 geom_bar(stat="identity") +
308 geom_text(aes(label=Weight), vjust=-0.2)
309
310 #adjust y limits to be a little higher
311 ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) +
312 geom_bar(stat="identity") +
313 geom_text(aes(label=Weight), vjust=-0.2) +
314 ylim(0, max(cabbage_exp$Weight) * 1.05)
315
316 # Map y positions slightly above bar top - y range of plot will auto-adjust
317 ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) +
318 geom_bar(stat="identity") +
319 geom_text(aes(y=Weight+.1, label=Weight))
320
321 #dodging
322 ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
323 geom_bar(stat="identity", position="dodge") +
324 geom_text(aes(label=Weight), vjust=1.5, colour="white",
325 position=position_dodge(0.9), size=3)
326
327
328 #label on stack bar
329 library(plyr)
330 # Sort by the day and sex columns
331 ce <- arrange(cabbage_exp, Date, Cultivar)
332 #get the cumulative sum
333 ce <- ddply(ce, "Date", transform, label_y=cumsum(Weight))
334
335 ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) +
336 geom_bar(stat="identity") +
337 geom_text(aes(y=label_y, label=Weight), vjust=1.5, colour="white") +
338 guides(fill=guide_legend(reverse=TRUE))
339
340 #put the label in the center
341 ce <- arrange(cabbage_exp, Date, Cultivar)
342 # Calculate y position, placing it in the middle
343 ce <- ddply(ce, "Date", transform, label_y=cumsum(Weight)-0.5*Weight) #good
344 ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) +
345 geom_bar(stat="identity") +
346 geom_text(aes(y=label_y, label=Weight), colour="white")
347
348 #polish
349 ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) +
350 geom_bar(stat="identity") +
351 geom_text(aes(y=label_y, label=paste(format(Weight, nsmall=2), "kg")),
352 size=4) +
353 guides(fill=guide_legend(reverse=TRUE)) + scale_fill_brewer(palette="Pastel1")
354 #coord_flip()
355
356 #making a Cleveland Dot Plot
357 library(gcookbook)
358 tophit <- tophitters2001[1:25,]
359
360 ggplot(tophit, aes(x=avg, y=name)) + geom_point()
361
362 ggplot(tophit, aes(x=avg, y=reorder(name, avg))) +
363 geom_point(size=3) + # Use a larger dot
364 theme_bw() +
365 theme(panel.grid.major.x = element_blank(),
366 panel.grid.minor.x = element_blank(),
367 panel.grid.major.y = element_line(colour="grey60", linetype="dashed"))
368
369 ggplot(tophit, aes(x=reorder(name, avg), y=avg)) +
370 geom_point(size=3) + # Use a larger dot
371 theme_bw() +
372 theme(axis.text.x = element_text(angle=60, hjust=1),
373 panel.grid.major.y = element_blank(),
374 panel.grid.minor.y = element_blank(),
375 panel.grid.major.x = element_line(colour="grey60", linetype="dashed"))
376
377 #reorder onlu by one variable
378 #order by two variables
379 # Get the names, sorted first by lg, then by avg
380 nameorder <- tophit$name[order(tophit$lg, tophit$avg)]
381 # Turn name into a factor, with levels in the order of nameorder
382 tophit$name <- factor(tophit$name, levels=nameorder)
383
384
385 ggplot(tophit, aes(x=avg, y=name)) +
386 geom_segment(aes(yend=name), xend=0, colour="grey50") +
387 geom_point(size=3, aes(colour=lg)) +
388 scale_colour_brewer(palette="Set1", limits=c("NL","AL")) + #limits order the posion
389 theme_bw() +
390 theme(panel.grid.major.y = element_blank(), # No horizontal grid lines
391 legend.position=c(1, 0.55), # Put legend inside plot area
392 legend.justification=c(1, 0.5))
393
394 ggplot(tophit, aes(x=avg, y=name)) +
395 geom_segment(aes(yend=name), xend=0, colour="grey50") +
396 geom_point(size=3, aes(colour=lg)) +
397 scale_colour_brewer(palette="Set1", limits=c("NL","AL"), guide=FALSE) +
398 theme_bw() +
399 theme(panel.grid.major.y = element_blank()) +
400 facet_grid(lg ~ ., scales="free_y", space="free_y")
401
402
403 #Making a Basic Line Graph
404 ggplot(BOD, aes(x=Time, y=demand)) + geom_line()
405
406 d1 <- BOD
407 d1$Time <- factor(d1$Time)
408 ggplot(d1, aes(x=Time, y=demand,group=1)) +geom_line()
409 #add y limit
410 ggplot(BOD, aes(x=Time, y=demand)) + geom_line() + ylim(0, max(BOD$demand))
411 ggplot(BOD, aes(x=Time, y=demand)) + geom_line() + expand_limits(y=0) #same
412
413 #adding points to a line graph
414 ggplot(BOD, aes(x=Time, y=demand)) + geom_line() + geom_point()
415
416 library(gcookbook)
417 ggplot(worldpop, aes(x=Year, y=Population)) +
418 geom_line() +
419 geom_point()
420
421 #with a log y-axis
422 ggplot(worldpop, aes(x=Year, y= Population)) +
423 geom_line() +
424 geom_point() +
425 scale_y_log10()
426
427 #making a line graph with multiple lines
428 #summarize the data
429 library(plyr)
430 tg <- ddply(ToothGrowth, c("supp","dose"),summarise, length=mean(len))
431
432 #map supp to colour
433 ggplot(tg, aes(x=dose,y=length,colour=supp)) + geom_line()
434
435 #map supp to linetype
436 ggplot(tg, aes(x=dose, y=length, linetype=supp)) + geom_line()
437
438 #factor + group (line point together)
439 ggplot(tg, aes(x=factor(dose), y=length,colour=supp,group=supp)) + geom_line()
440 # Make the points a little larger
441 ggplot(tg, aes(x=dose, y=length, shape=supp)) + geom_line() + geom_point(size=4)
442 # Also use a point with a color fill
443 ggplot(tg, aes(x=dose, y=length, fill=supp)) + geom_line() + geom_point(size=4, shape=21)
444 #dodge them
445 ggplot(tg, aes(x=dose, y=length, shape=supp)) +
446 geom_line(position=position_dodge(0.2)) + # Dodge lines by 0.2
447 geom_point(position=position_dodge(0.2), size=4) # Dodge points by 0.2
448
449 #change the apperance of lines
450 ggplot(BOD, aes(x=Time, y=demand)) +
451 geom_line(linetype = "dashed", size = 1, color = "blue")
452
453 # Load plyr so we can use ddply() to create the example data set
454 library(plyr)
455 # Summarize the ToothGrowth data
456 tg <- ddply(ToothGrowth, c("supp", "dose"), summarise, length=mean(len))
457 ggplot(tg, aes(x=dose, y=length, colour=supp)) +
458 geom_line() +
459 scale_colour_brewer(palette="Set1")
460 # If both lines have the same properties, you need to specify a variable to # use for grouping
461 ggplot(tg, aes(x=dose, y=length, group=supp)) +
462 geom_line(colour="darkgreen", size=1.5)
463 # Since supp is mapped to colour, it will automatically be used for grouping
464 ggplot(tg, aes(x=dose, y=length, colour=supp)) +
465 geom_line(linetype="dashed") +
466 geom_point(shape=22, size=3, fill="white")
467
468 #change the appearance of points
469 ggplot(BOD, aes(x=Time, y=demand)) +
470 geom_line() +
471 geom_point(size=4, shape=22, colour="darkred", fill="pink")
472 #The default shape for points is a solid circle,
473 #the default size is 2, and the default colour is "black"
474
475 # Load plyr so we can use ddply() to create the example data set
476 library(plyr)
477 # Summarize the ToothGrowth data
478 tg <- ddply(ToothGrowth, c("supp", "dose"), summarise, length=mean(len))
479 # Save the position_dodge specification because we'll use it multiple times
480 pd <- position_dodge(0.2)
481 ggplot(tg, aes(x=dose, y=length, fill=supp)) +
482 geom_line(position=pd) +
483 geom_point(shape=21, size=3, position=pd) +
484 scale_fill_manual(values=c("black","white"))
485
486
487 #making a graph with a shaded area
488 sunspotyear <- data.frame(
489 Year = as.numeric(time(sunspot.year)),
490 Sunspots = as.numeric(sunspot.year)
491 )
492 ggplot(sunspotyear, aes(x=Year, y=Sunspots)) + geom_area()
493 #change area appearance (with outline)
494 ggplot(sunspotyear, aes(x=Year, y=Sunspots)) +
495 geom_area(colour="black", fill="blue", alpha=.2)
496 #no bottom line
497 ggplot(sunspotyear, aes(x=Year, y=Sunspots)) +
498 geom_area(fill="blue", alpha=.2) +
499 geom_line()
500
501 #making a stacked area graph
502 library(gcookbook)
503 ggplot(uspopage, aes(x=Year, y= Thousands,fill=AgeGroup)) +
504 geom_area()
505
506 #reverse the legend order
507 ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup)) +
508 geom_area(colour="black", size=.2, alpha=.4) +
509 scale_fill_brewer(palette="Blues") +
510 guides(fill=guide_legend(reverse=TRUE))
511 #or
512 ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup)) +
513 geom_area(colour="black", size=.2, alpha=.4) +
514 scale_fill_brewer(palette="Blues", breaks=rev(levels(uspopage$AgeGroup)))
515
516
517 #or change the order of fill
518 library(plyr) # For the desc() function
519 ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup, order=desc(AgeGroup))) +
520 geom_area(colour="black", size=.2, alpha=.4) +
521 scale_fill_brewer(palette="Blues")
522
523
524 #remove the left and right sides
525 ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup, order=desc(AgeGroup))) +
526 geom_area(colour=NA, alpha=.4) +
527 scale_fill_brewer(palette="Blues") +
528 geom_line(position="stack", size=.2) #default position is "identity"
529
530
531 #making a proportional stacked area graph
532 #first calculate the proportions
533 library(gcookbook)
534 library(plyr)
535 #convert to percent
536 p <- ddply(uspopage,"Year",transform,
537 Percent = Thousands/sum(Thousands)*100)
538 ggplot(p,aes(x=Year, y= Percent, fill = AgeGroup)) +
539 geom_area(colour = "black",size=.2,alpha = .4) +
540 scale_fill_brewer(palette="Blues",breaks=rev(levels(p$AgeGroup)))
541
542 #adding a confidence region
543 library(gcookbook)
544 clim <- subset(climate, Source == "Berkeley",
545 select = c("Year","Anomaly10y","Unc10y"))
546 #the same
547 library(dplyr)
548 clim <- climate %>% filter(Source == "Berkeley") %>% select(Year, Anomaly10y, Unc10y)
549 #shaded region
550 ggplot(clim, aes(x=Year,y= Anomaly10y)) +
551 geom_ribbon(aes(ymin=Anomaly10y-Unc10y, ymax=Anomaly10y+Unc10y),
552 alpha=0.2) +
553 geom_line()
554 #we need know the 95% confidence interval ahead
555
556 # With a dotted line for upper and lower bounds
557 ggplot(clim, aes(x=Year, y=Anomaly10y)) +
558 geom_line(aes(y=Anomaly10y-Unc10y), colour="grey50", linetype="dotted") +
559 geom_line(aes(y=Anomaly10y+Unc10y), colour="grey50", linetype="dotted") +
560 geom_line()
561
562
563
564 ####scatter plots
565 #making a basic scatter plot
566 ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point()
567 #the default solid circles (shape #16) is hollow ones (#21)
568 ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point(shape=21)
569 ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point(size=1.5)
570 #default value of size is 2
571
572 #grouping data points by a variable using shape or color
573 library(gcookbook)
574 #mapping color
575 ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) + geom_point()
576 #mapping shape
577 ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex)) + geom_point() +geom_line(group=sex)
578
579
580 ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex, colour=sex)) +
581 geom_point()
582
583 ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex, colour=sex)) +
584 geom_point() +
585 scale_shape_manual(values=c(1,2)) +
586 scale_colour_brewer(palette="Set1")
587
588 #Using different point shapes
589 library(gcookbook)
590 ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point(shape=3)
591
592 # Use slightly larger points and use a shape scale with custom values
593 ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex)) +
594 geom_point(size=3) + scale_shape_manual(values=c(1, 4))
595 #Some of the point shapes (1–14) have just an outline,
596 #some (15–20) are solid, and
597 #some (21–25) have an outline and fill that can be controlled separately
598
599 #shape represent one variable and
600 #the fill (empty or solid) represent another variable
601 # Make a copy of the data
602 hw <- heightweight
603 # Categorize into <100 and >=100 groups
604 hw$weightGroup <- cut(hw$weightLb, breaks=c(-Inf, 100, Inf),
605 labels=c("< 100", ">= 100"))
606 ggplot(hw, aes(x=ageYear, y=heightIn, shape=sex, fill=weightGroup)) +
607 geom_point(size=2.5) + scale_shape_manual(values=c(21, 24)) +
608 scale_fill_manual(values=c(NA, "black"),
609 guide=guide_legend(override.aes=list(shape=21)))
610
611 #mapping a continuous variable to color or size
612 library(gcookbook)
613 ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=weightLb)) + geom_point()
614 ggplot(heightweight, aes(x=ageYear, y=heightIn, size=weightLb)) + geom_point()
615
616 #fill gradient to go from black to white
617 #make the points larger
618 ggplot(heightweight, aes(x=weightLb, y=heightIn, fill=ageYear)) +
619 geom_point(shape=21, size=2.5) +
620 scale_fill_gradient(low="black", high="white")
621 # Using guide_legend() will result in a discrete legend instead of a colorbar
622 ggplot(heightweight, aes(x=weightLb, y=heightIn, fill=ageYear)) +
623 geom_point(shape=21, size=2.5) +
624 scale_fill_gradient(low="black", high="white", breaks=12:17,
625 guide=guide_legend())
626
627 ggplot(heightweight, aes(x=ageYear, y=heightIn, size=weightLb, colour=sex)) +
628 geom_point(alpha=.5) +
629 scale_colour_brewer(palette="Set1") +
630 scale_size_area() # Make area proportional to numeric value
631
632 #dealing with overplotting
633 sp <- ggplot(diamonds, aes(x=carat, y=price))
634 sp + geom_point(alpha=0.1)
635 sp + geom_point(alpha=.01)
636 #bin the points into rectangle
637 sp + stat_bin2d()
638 sp + stat_bin2d(bins=50) +
639 scale_fill_gradient(low="lightblue", high="red", limits=c(0, 6000))
640 library(hexbin)
641 sp + stat_binhex() +
642 scale_fill_gradient(low="lightblue", high="red",
643 limits=c(0, 8000))
644 sp + stat_binhex() +
645 scale_fill_gradient(low="lightblue", high="red",
646 breaks=c(0, 250, 500, 1000, 2000, 4000, 6000),
647 limits=c(0, 6000))
648
649 #one discrete axis and one continuous axis
650 sp1 <- ggplot(ChickWeight, aes(x=Time, y=weight))
651 sp1 + geom_point()
652 sp1 + geom_point(position="jitter")
653 # Could also use geom_jitter(), which is equivalent
654 sp1 + geom_point(position=position_jitter(width=.5, height=0))
655 #using boxplot
656 sp1 + geom_boxplot(aes(group=Time))
657
658 #adding fitted regression model lines
659 library(gcookbook)
660 sp <- ggplot(heightweight, aes(x=ageYear, y= heightIn))
661 sp + geom_point() + stat_smooth(method = lm)
662 #99% confidence region
663 sp + geom_point() + stat_smooth(method = lm, level = 0.99)
664 #No confidence region
665 sp + geom_point() + stat_smooth(method=lm, se=F)
666
667 sp + geom_point(color="grey60") +
668 stat_smooth(method = lm, se=F, color="black")
669
670 #the default method is locally weighted polynomial curve
671 sp + geom_point(color="grey60") + stat_smooth(method=loess)
672
673 #with logistic regressio curve
674 library(MASS) # For the data set
675 b <- biopsy
676 b$classn[b$class=="benign"] <- 0
677 b$classn[b$class=="malignant"] <- 1
678 ggplot(b, aes(x=V1,y=classn)) +
679 geom_point(position=position_jitter(width = .3,height=.06),
680 alpha=.4, shape=21,size=1.5) +
681 stat_smooth(method=glm, method.args = list(family="binomial"))
682 #parameter is no longer used
683
684 #group points with fitted line
685 sps <- ggplot(heightweight, aes(x=ageYear, y=heightIn, color = sex)) +
686 geom_point() +
687 scale_color_brewer(palette = "Set1")
688 sps + geom_smooth()
689
690 sps + geom_smooth(method = lm, se = F, fullrange= T)
691
692
693 #adding fitted lines from an existing model
694 library(gcookbook)
695 model <- lm(heightIn~ ageYear + I(ageYear^2), data = heightweight)
696 xmin <- min(heightweight$ageYear)
697 xmax <- max(heightweight$ageYear)
698
699 predicted <- data.frame(ageYear = seq(xmin,xmax,length.out = 200))
700 predicted$heightIn <- predict(model, predicted)
701
702 sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
703 geom_point(color = "grey40")
704 sp + geom_line(data=predicted,size=1)
705
706 # Given a model, predict values of yvar from xvar
707 # This supports one predictor and one predicted variable
708 # xrange: If NULL, determine the x range from the model object. If a vector with # two numbers, use those as the min and max of the prediction range.
709 # samples: Number of samples across the x range.
710 # ...: Further arguments to be passed to predict()
711 predictvals <- function(model, xvar, yvar, xrange=NULL, samples = 100,...){
712 if (is.null(xrange)) {
713 if (any(class(model) %in% c("lm","glm")))
714 xrange <- range(model$model[[xvar]])
715 else if (any(class(model) %in% "loess"))
716 xrange <- range(model$x)
717 }
718 newdata <- data.frame(x=seq(xrange[1],xrange[2],length.out = samples))
719 names(newdata) <- xvar
720 newdata[yvar] <- predict(model,newdata = newdata,...)
721 newdata
722 }
723
724 modlinear <- lm(heightIn ~ ageYear, heightweight)
725 modloess <- loess(heightIn ~ ageYear, heightweight)
726 #predict
727 lm_predicted <- predictvals(modlinear, "ageYear", "heightIn")
728 loess_predicted <- predictvals(modloess, "ageYear", "heightIn")
729 #multiple fitted lines
730 sp + geom_line(data=lm_predicted, colour="red", size=.8) +
731 geom_line(data=loess_predicted, colour="blue", size=.8)
732
733 library(MASS) # For the data set b <- biopsy
734 b$classn[b$class=="benign"] <- 0
735 b$classn[b$class=="malignant"] <- 1
736 fitlogistic <- glm(classn~V1,data = b, family = binomial)
737
738 #get predicted values
739 glm_predicted <- predictvals(fitlogistic, "V1","classn",type="response")
740 ggplot(b,aes(x=V1, y=classn)) +
741 geom_point(position = position_jitter(width = .3,height=.08),alpha=.4,
742 shape=21,size=1.5) +
743 geom_line(data=glm_predicted, color="#1177FF",size=1)
744
745 #adding fitted lines from multiple exsiting models
746 make_model <- function(data) { lm(heightIn ~ ageYear, data)}
747 library(gcookbook)
748 library(plyr)
749 models <- dlply(heightweight,"sex",.fun=make_model)
750 #For each subset of a data frame, apply function then combine results into a list.
751 pred <- ldply(models,.fun = predictvals,xvar="ageYear",yvar="heightIn")
752 #For each element of a list, apply function then combine results into a data frame
753 ggplot(heightweight, aes(x=ageYear, y=heightIn, color=sex)) +
754 geom_point() +
755 geom_line(data=pred)
756 #extend the range to the same range across all groups
757 pred <- ldply(models, .fun=predictvals, xvar="ageYear", yvar="heightIn",
758 xrange=range(heightweight$ageYear))
759 ggplot(heightweight, aes(x=ageYear, y=heightIn, color=sex)) +
760 geom_point() +
761 geom_line(data=pred)
762
763 #adding annotations with model coefficients
764 library(gcookbook)
765 model <- lm(heightIn~ ageYear, heightweight)
766 summary(model)
767 #add r square value
768 pred <- predictvals(model, "ageYear", "heightIn")
769 sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
770 geom_point() +
771 geom_line(data=pred)
772 sp + annotate("text", label="r^2=0.42", x=16.5, y=52)
773 #using r's math expression syntax
774 sp + annotate("text", label="r^2=0.42",parse=TRUE, x=16.5, y=52)
775
776 #create a string that when parsed, returns a valid expression
777 eqn <- as.character(as.expression(
778 substitute(italic(y) == a + b * italic(x) * "," ~~ italic(r)^2 ~ "=" ~ r2,
779 list(a = format(coef(model)[1], digits=3),
780 b = format(coef(model)[2], digits=3),
781 r2 = format(summary(model)$r.squared, digits=2)
782 ))))
783 sp + annotate("text", label=eqn, parse=TRUE,x=Inf,y=-Inf,hjust=1.1,vjust=-.5)
784
785
786 #adding marginal rugs to a scatter plot
787 ggplot(faithful, aes(x=eruptions, y=waiting)) + geom_point() + geom_rug()
788 #specify the size
789 ggplot(faithful, aes(x=eruptions, y = waiting)) +
790 geom_point() +
791 geom_rug(position = "jitter",size=.2)
792
793 #label points in a scatter plot
794 library(gcookbook)
795 s <- subset(countries, Year==2009 & healthexp > 2000)
796 sp <- ggplot(s, aes(x=healthexp, y=infmortality)) + geom_point()
797 sp + annotate("text",x=4350,y=5.4,label="Cannada") +
798 annotate("text",x=7400,y=6.8,label="USA")
799 #automatically add the labels
800 sp + geom_text(aes(label=Name), size=4)
801 sp + geom_text(aes(label=Name), size=4,vjust=0)
802 sp + geom_text(aes(y=infmortality + 0.1, label = Name),size=4,vjust=0)
803 sp + geom_text(aes(label = Name), size=4,hjust=0)
804 sp + geom_text(aes(x=healthexp+100, label=Name), size=4, hjust=0)
805
806 #label some of the points automatically
807 cdat <- subset(countries, Year == 2009 & healthexp > 2009)
808 cdat$Name1 <- cdat$Name
809 idx <- cdat$Name1 %in% c("Canada", "Ireland", "United Kingdom", "United States",
810 "New Zealand", "Iceland", "Japan", "Luxembourg",
811 "Netherlands", "Switzerland")
812 cdat$Name1[!idx] <- NA
813 ggplot(cdat, aes(x=healthexp, y=infmortality)) +
814 geom_point() +
815 geom_text(aes(x=healthexp+100, label=Name1),size=4,hjust=0) +
816 xlim(2000,10000)
817
818 #creating a balloon plot
819 library(gcookbook) # For the data set
820 cdat <- subset(countries, Year==2009 &
821 Name %in% c("Canada", "Ireland", "United Kingdom", "United States",
822 "New Zealand", "Iceland", "Japan", "Luxembourg",
823 "Netherlands", "Switzerland"))
824 p <- ggplot(cdat, aes(x=healthexp, y=infmortality,size=GDP)) +
825 geom_point(shape=21,color="black",fill="cornsilk")
826 #GDP mapped to redius
827 p#GDP mapped to area instead, and larger circles
828 p + scale_size_area(max_size=15)
829
830 #add up counts for male and female
831 hec <- HairEyeColor[,,"Male"] + HairEyeColor[,,"Female"]
832
833 #convert to long format
834 library(reshape2)
835 hec <- melt(hec, value.name="count")
836 ggplot(hec, aes(x=Eye,y=Hair)) +
837 geom_point(aes(size=count),shape=21,color="black",fill="cornsilk") +
838 scale_size_area(max_size = 20,guid=F) +
839 geom_text(aes(y=as.numeric(Hair)-sqrt(count)/22,label=count),
840 color="grey60",size=4)
841
842 #making a scatter plot matrix
843 library(gcookbook)
844 c2009 <- subset(countries, Year==2009,
845 select=c(Name, GDP, laborrate, healthexp, infmortality))
846 pairs(c2009[,2:5])
847
848 panel.cor <- function(x, y, digits=2, prefix="", cex.cor, ...) {
849 usr <- par("usr")
850 on.exit(par(usr))
851 par(usr = c(0, 1, 0, 1))
852 r <- abs(cor(x, y, use="complete.obs"))
853 txt <- format(c(r, 0.123456789), digits=digits)[1]
854 txt <- paste(prefix, txt, sep="")
855 if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
856 text(0.5, 0.5, txt, cex = cex.cor * (1 + r) / 2)
857 }
858
859 panel.hist <- function(x, ...) {
860 usr <- par("usr")
861 on.exit(par(usr))
862 par(usr = c(usr[1:2], 0, 1.5) )
863 h <- hist(x, plot = FALSE)
864 breaks <- h$breaks
865 nB <- length(breaks)
866 y <- h$counts
867 y <- y/max(y)
868 rect(breaks[-nB], 0, breaks[-1], y, col="white", ...)
869 }
870
871 pairs(c2009[,2:5], upper.panel = panel.cor,
872 diag.panel = panel.hist,
873 lower.panel = panel.smooth)
874
875 panel.lm <- function (x, y, col = par("col"), bg = NA, pch = par("pch"),
876 cex = 1, col.smooth = "black", ...) {
877 points(x, y, pch = pch, col = col, bg = bg, cex = cex)
878 abline(stats::lm(y ~ x), col = col.smooth, ...)
879 }
880
881 pairs(c2009[,2:5], pch=".",
882 upper.panel = panel.cor,
883 diag.panel = panel.hist,
884 lower.panel = panel.lm)
885
886
887 #########summarized data distributions
888
889 #making a basic histogram
890 ggplot(faithful, aes(x=waiting)) + geom_histogram()
891
892 # Store the values in a simple vector
893 w <- faithful$waiting
894 ggplot(NULL, aes(x=w)) + geom_histogram()
895
896 #set the width of each bin to 5
897 ggplot(faithful, aes(x=waiting)) +
898 geom_histogram(binwidth = 5, fill ="white",color="black")
899
900 #divide the x range into 15 bins
901 binsize <- diff(range(faithful$waiting))/15
902 ggplot(faithful,aes(x=waiting)) +
903 geom_histogram(binwidth = binsize,fill="white",color="black")
904
905 #change the origin
906 h <- ggplot(faithful, aes(x=waiting)) # Save the base object for reuse
907 h + geom_histogram(binwidth=8, fill="white", colour="black", origin=31)
908 h + geom_histogram(binwidth=8, fill="white", colour="black", origin=35)
909
910 #the same with bar
911 h + geom_bar(stat="bin")
912
913 #making multiple histograms from grouped data
914 library(MASS)
915 #use smoke as the faceting variable
916 ggplot(birthwt, aes(x=bwt)) +
917 geom_histogram(fill="white", color="black") +
918 facet_grid(smoke~.)
919 #change the label
920 bir <- birthwt
921 #convert smoke to a factor
922 bir$smoke <- factor(bir$smoke)
923 library(plyr)
924 bir$smoke <- revalue(bir$smoke, c("0"="No Smoke","1"="Smoke"))
925 #same result
926 bir$smoke <- factor(bir$smoke,labels=c("No Smoke","Smoke"))
927
928 ggplot(bir,aes(x=bwt)) +
929 geom_histogram(fill="white",color="black") +
930 facet_grid(smoke~.)
931
932
933 #when size are different
934 ggplot(bir, aes(x=bwt)) +
935 geom_histogram(fill="white", color="black") +
936 facet_grid(race~.)
937 #resize independently
938 ggplot(bir, aes(x=bwt)) +
939 geom_histogram(fill="white",color="black") +
940 facet_grid(race~.,scales = "free") #scales is set "free"
941 #another approach is to map the grouping variable to fill
942 ggplot(bir, aes(x=bwt,fill=smoke)) +
943 geom_histogram(position="identity",alpha=0.4)
944 #without position="identity", ggplot will stack the histogram bars on top of each other
945
946 #making a density curve
947 ggplot(faithful, aes(x=waiting)) + geom_density()
948
949 #remove the side and bottom
950 ggplot(faithful, aes(x=waiting)) +
951 geom_line(stat="density") +
952 expand_limits(y=0)
953
954 #the larger the bandwidth, the more smoothing there is
955 ggplot(faithful,aes(x=waiting)) +
956 geom_line(stat="density",adjust=.25,color="red") +
957 geom_line(stat="density") +
958 geom_line(stat="density",adjust=2,color="blue")
959
960 #set the area color
961 ggplot(faithful,aes(x=waiting)) +
962 geom_density(fill="blue",alpha=.2) +
963 xlim(35,105)
964
965 ggplot(faithful,aes(x=waiting)) +
966 geom_density(fill="blue",alpha=.2,color=NA) +
967 geom_line(stat="density") +
968 xlim(35,105)
969
970 #compare the theoretical and obsearved distributions
971 ggplot(faithful, aes(x=waiting, y=..density..)) +
972 geom_histogram(fill="cornsilk", colour="grey60", size=.2) +
973 geom_density() +
974 xlim(35, 105)
975
976 #making multiple density curves from grouped data
977 library(MASS)
978 bir <- birthwt
979 bir$smoke <- factor(bir$smoke)
980 ggplot(bir,aes(x=bwt,color=smoke)) + geom_density()
981 #mapping smoke to fill
982 ggplot(bir,aes(x=bwt,fill=smoke)) + geom_density(alpha=.3)
983
984 ggplot(bir, aes(x=bwt)) + geom_density() + facet_grid(smoke~.)
985
986 library(plyr) # For the revalue function
987 birthwt1 <- birthwt
988 birthwt1$smoke <- factor(birthwt1$smoke)
989 birthwt1$smoke <- revalue(birthwt1$smoke, c("0"="No Smoke", "1"="Smoke"))
990 ggplot(birthwt1, aes(x=bwt)) + geom_density() + facet_grid(smoke ~ .)
991
992 ggplot(birthwt1, aes(x=bwt, y=..density..)) +
993 geom_histogram(binwidth=200, fill="cornsilk", colour="grey60", size=.2) +
994 geom_density() +
995 facet_grid(smoke ~ .)
996
997 #making a frequency polygon
998 ggplot(faithful, aes(x=waiting)) + geom_freqpoly()
999 ggplot(faithful, aes(x=waiting)) + geom_freqpoly(binwidth=4)
1000
1001 #use 15 bins
1002 binsize <- diff(range(faithful$waiting))/15
1003 ggplot(faithful, aes(x=waiting)) + geom_freqpoly(binwidth=binsize)
1004
1005 #making a basic box plot
1006 library(MASS)
1007 ggplot(birthwt, aes(x=factor(race), y=bwt)) + geom_boxplot()
1008 # Use factor() to convert numeric variable to discrete
1009
1010 #change width
1011 ggplot(birthwt,aes(x=factor(race),y=bwt)) + geom_boxplot(width=.5)
1012 #default size is 2, default shape is 16
1013 ggplot(birthwt,aes(x=factor(race),y=bwt)) +
1014 geom_boxplot(outlier.size = 1.5,outlier.shape = 21)
1015
1016 ggplot(birthwt,aes(x=1,y=bwt)) + geom_boxplot() +
1017 scale_x_continuous(breaks = NULL) +
1018 theme(axis.title.x=element_blank())
1019
1020 #adding natches to a box plot
1021 library(MASS)
1022 ggplot(birthwt,aes(x=factor(race),y=bwt)) +
1023 geom_boxplot(notch = TRUE)
1024
1025 #adding means to a box plot
1026 library(MASS)
1027 ggplot(birthwt, aes(factor(race),y=bwt)) +
1028 geom_boxplot() +
1029 stat_summary(fun.y = "mean",geom = "point",shape=23,size=3,fill="white")
1030
1031 #making a violin plot
1032 library(gcookbook)
1033 p <- ggplot(heightweight,aes(x=sex,y=heightIn))
1034 p + geom_violin()
1035 #with narrows boxplot
1036 p + geom_violin() +
1037 geom_boxplot(width=.1,fill="black",outlier.colour = NA) +
1038 stat_summary(fun.y = median,geom="point",fill="white",shape=21,size=2.5)
1039 #keep the tails
1040 p + geom_violin(trim=FALSE)
1041
1042 #making a dot plot
1043 library(gcookbook)
1044 c2009 <- subset(countries, Year==2009 & healthexp > 2000)
1045 p <- ggplot(c2009,aes(x=infmortality))
1046 p + geom_dotplot()
1047 p + geom_dotplot(binwidth=.25) + geom_rug() +
1048 scale_y_continuous(breaks=NULL) +
1049 theme(axis.title.y=element_blank())
1050
1051 #making multiple dot plots for grouped data
1052 ggplot(heightweight, aes(x=sex, y=heightIn)) +
1053 geom_dotplot(binaxis="y", binwidth=.5, stackdir="center")
1054
1055
1056 #making a desity plot of two dimensional data
1057 p <- ggplot(faithful,aes(x=eruptions,y=waiting))
1058 p + geom_point() + stat_density2d()
1059 #map the height of the density to color
1060 p + stat_density2d(aes(color=..level..))
1061 #map denstity estimate to fill color
1062 p + stat_density2d(aes(fill=..density..),geom="raster",contour = F)
1063 #with points and map desity estimate to alpha
1064 p + geom_point() +
1065 stat_density2d(aes(alpha=..density..),geom="tile",contour=F)
1066 #tile and raster look like the same
1067
1068 #marginal rug added to a scatter plot
1069 p + stat_density2d(aes(fill=..density..),geom="raster",
1070 contour=F,h=c(.5,5))
1071
1072
1073
1074 #############annotations
1075
1076 #adding text annotations
1077 p <- ggplot(faithful,aes(x=eruptions,y=waiting)) + geom_point()
1078 p + annotate(geom="text",x=3,y=48,label = "Group 1") +
1079 annotate("text",x=4.5,y=66,label = "Group 2")
1080 p + annotate("text",x=3,y=48,label="Group 1",family="serif",
1081 fontface="italic",color="darkred",size=3) +
1082 annotate("text",x=4.5,y=66,label="Group 2",family="serif",
1083 fontface="italic",color="darkred",size=3)
1084 annotate("text",x=3,y=48,label="Group 1",family="serif",
1085 fontface="italic",color="darkred",size=3)
1086
1087 p + annotate("text", x=3, y=48, label="Group 1", alpha=.1) + # Normal
1088 geom_text(x=4.5, y=66, label="Group 2", alpha=.1) # Overplotted
1089
1090 #for continuous axes, use Inf and -Inf to place text annotations at the
1091 #edge of the plotting area
1092 p + annotate("text",x=-Inf,y=Inf,label="Upper left",hjust=-.2,vjust=2) +
1093 annotate("text",x=mean(range(faithful$eruptions)),y=-Inf,vjust=-.4,label="Bottom middle")
1094 #for hjust and vjust position is minus the value
1095
1096 #using mathematical expression in annotations
1097 #a normal curve
1098 p <- ggplot(data.frame(x=c(-3,3)), aes(x=x)) + stat_function(fun=dnorm)
1099 p + annotate("text",x=2,y=0.3,parse=TRUE,
1100 label="frac(1,sqrt(2*pi))*e^{-x^2/2}")
1101 #use single quotes within double quotes (or vice versa) to mark the plain-text parts
1102 #put a * operator between them
1103 p + annotate("text",x=0,y=.05,parse=TRUE,size=4,
1104 label="'Function: '*y==frac(1,sqrt(2*pi))*e^{-x^2/2}")
1105 #?plotmath for many examples of mathematical expressions, and
1106 #?demo(plot math) for graphical examples of mathematical expressions
1107
1108 #adding lines
1109 library(gcookbook)
1110 p <- ggplot(heightweight,aes(x=ageYear,y=heightIn,color=sex)) + geom_point()
1111 #adding horizongtal and vertical lines
1112 p + geom_hline(yintercept = 60) + geom_vline(xintercept = 14)
1113 p + geom_abline(intercept = 37.4, slope = 1.75)
1114
1115 library(plyr)
1116 hw_means <- ddply(heightweight,"sex",summarise,heightIn=mean(heightIn))
1117 p + geom_hline(aes(yintercept=heightIn,color=sex),data=hw_means,
1118 linetype="dashed",size=1)
1119
1120 #for discrete variable
1121 pg <- ggplot(PlantGrowth,aes(x=group,y=weight)) + geom_point()
1122 pg + geom_vline(xintercept = 2)
1123 pg + geom_vline(xintercept = which(levels(PlantGrowth$group) == "ctrl"))
1124
1125 #adding line segments and arrows
1126 library(gcookbook)
1127 p <- ggplot(subset(climate, Source=="Berkeley"), aes(x=Year, y=Anomaly10y)) +
1128 geom_line()
1129 p + annotate("segment", x=1950, xend=1980, y=-.25, yend=-.25)
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152