Commit 978c0f7912211370ce279a8973091245c8a149a7
1 parent
658ed0df54
Exists in
master
upload two R code file
Showing
3 changed files
with
1555 additions
and
791 deletions
Show diff stats
GPL22111.csv
data_cleaning.R
File was created | 1 | ||
2 | |||
3 | #download the file to load | ||
4 | if (!file.exists("data")){dir.create("data")} | ||
5 | fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD" | ||
6 | download.file(fileUrl, destfile = "./data/cameras.csv",method = "curl") | ||
7 | dateDownloaded <- date() | ||
8 | if (!file.exists("data")){dir.create("data")} | ||
9 | fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD" | ||
10 | download.file(fileUrl, destfile = "./data/cameras.csv",method = "curl") | ||
11 | dateDownloaded <- date() | ||
12 | |||
13 | dataset <- read.csv("./data/cameras.csv") | ||
14 | #equal to | ||
15 | dataset <- read.table("./data/cameras.csv",sep=",",header = TRUE) | ||
16 | |||
17 | |||
18 | if (!file.exists("data")){dir.create("data")} | ||
19 | fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.xlsx?accessType=DOWNLOAD" | ||
20 | download.file(fileUrl, destfile = "./data/cameras.xlsx",method = "curl") | ||
21 | dateDownloaded <- date() | ||
22 | |||
23 | #read xlsx file | ||
24 | library(gdata) | ||
25 | dataset <- read.csv("./data/cameras.xlsx") | ||
26 | |||
27 | #data table | ||
28 | library(data.table) | ||
29 | DF <- data.frame(x=rnorm(9),y=rep(c("a","b","c"),each=3),z=rnorm(9)) | ||
30 | DT <- data.table(x=rnorm(9),y=rep(c("a","b","c"),each=3),z=rnorm(9)) | ||
31 | #subsetting rows | ||
32 | DT[2,] | ||
33 | DT[DT$y == 'a'] | ||
34 | DT[c(2,3)] | ||
35 | #subsetting columns | ||
36 | DT[,list(x)] | ||
37 | #calculating values for variables | ||
38 | DT[,list(mean(x),sum(z))] | ||
39 | DT[,table(y)] | ||
40 | #adding new columns | ||
41 | DT[,w:=z^2] | ||
42 | DT2 <- DT | ||
43 | DT[,y:=2] #DT, DT2 the same | ||
44 | #multiple operations | ||
45 | DT[,m:={tmp<-(x+z);log2(tmp+5)}] | ||
46 | DT[,a:=x>0] | ||
47 | DT[,b:=mean(x+w),by=a] | ||
48 | #special variables | ||
49 | set.seed(123) | ||
50 | DT <- data.table(x=sample(letters[1:3],1E5,TRUE)) | ||
51 | #keys | ||
52 | DT <- data.table(x=rep(c("a","b","c"),each=100),y=rnorm(300)) | ||
53 | setkey(DT,x) | ||
54 | DT['a'] | ||
55 | #joins | ||
56 | DT1 <- data.table(x=c('a','a','b','dt1'),y=1:4) | ||
57 | DT2 <- data.table(x=c('a','b','dt2'),z=5:7) | ||
58 | setkey(DT1,x); setkey(DT2,x) | ||
59 | merge(DT1, DT2) | ||
60 | |||
61 | |||
62 | #read XML into R | ||
63 | library(XML) | ||
64 | fileUrl <- "http://www.w3schools.com/xml/simple.xml" | ||
65 | doc <- xmlTreeParse(fileUrl, useInternal = TRUE) | ||
66 | rootNode <- xmlRoot(doc) | ||
67 | xmlName(rootNode) | ||
68 | names(rootNode) | ||
69 | #directly access parts of XML doc | ||
70 | rootNode[[1]] | ||
71 | rootNode[[1]][[1]] | ||
72 | #programmatically extract parts of the file | ||
73 | xmlSApply(rootNode,xmlValue) | ||
74 | xpathSApply(rootNode, "//name",xmlValue) | ||
75 | xpathSApply(rootNode, "//price",xmlValue) | ||
76 | #another example | ||
77 | fileUrl <- "http://espn.com/nfl/team/_/name/baltimore-ravens" | ||
78 | doc <- htmlTreeParse(fileUrl, useInternal = TRUE) | ||
79 | scores <- xpathSApply(doc,"li[@class='score']",xmlValue) | ||
80 | teams <- xpathSApply(doc,"li[@class='team-name']",xmlValue) | ||
81 | |||
82 | #Making a Proportional Stacked Bar Graph | ||
83 | |||
84 | #reading JSON | ||
85 | #javascript object notation | ||
86 | library(jsonlite) | ||
87 | jsonData <- fromJSON("https://api.github.com/users/jtleek/repos") | ||
88 | names(jsonData) | ||
89 | names(jsonData$owner) | ||
90 | jsonData$owner$login | ||
91 | #writing data frame to JSON | ||
92 | myjson <- toJSON(iris, pretty = TRUE) | ||
93 | cat(myjson) | ||
94 | #convert back to JSON | ||
95 | iris2 <- fromJSON(myjson) | ||
96 | |||
97 | |||
98 | #reading mySQL | ||
99 | library(RMySQL) | ||
100 | ucscDb <- dbConnect(MySQL(),user="genome", | ||
101 | host="genome-mysql.cse.ucsc.edu") | ||
102 | result <- dbGetQuery(ucscDb,"show databases;");dbDisconnect(ucscDb) | ||
103 | |||
104 | hg19 <- dbConnect(MySQL(),user="genome",db='hg19', | ||
105 | host="genome-mysql.cse.ucsc.edu") | ||
106 | allTables <- dbListTables(hg19) | ||
107 | length(allTables) | ||
108 | #get dimensions of a specific table | ||
109 | dbListFields(hg19,"affyU133Plus2") | ||
110 | dbGetQuery(hg19,"select count(*) from affyU133Plus2") | ||
111 | result <- dbGetQuery(hg19,"select * from affyU133Plus2") | ||
112 | #read from the table | ||
113 | affyData <- dbReadTable(hg19,"affyU133Plus2") | ||
114 | #query | ||
115 | affyMis <- dbGetQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3") | ||
116 | #equal to | ||
117 | query <- dbSendQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3") | ||
118 | affyMis <- fetch(query) | ||
119 | quantile(affyMis$misMatches) | ||
120 | dbClearResult(query) | ||
121 | #disconnect mySQL | ||
122 | dbDisconnect(hg19) | ||
123 | |||
124 | |||
125 | #reading HDF5 | ||
126 | source("http://bioconductor.org/biocLite.R") | ||
127 | biocLite("rhdf5") | ||
128 | library(rhdf5) | ||
129 | created <- h5createFile("example.h5") | ||
130 | created <- h5createGroup("example.h5","foo") | ||
131 | created <- h5createGroup("example.h5","baa") | ||
132 | created <- h5createGroup("example.h5","foo/foobaa") | ||
133 | h5ls("example.h5") | ||
134 | #write to groups | ||
135 | A <- matrix(1:10, nr=5, nc =2) | ||
136 | h5write(A, "example.h5","foo/A") | ||
137 | B <- array(seq(0.1,2.0,by=0.1),dim=c(5,2,2)) | ||
138 | attr(B,"scale") <- "liter" | ||
139 | h5write(B,"example.h5","foo/foobaa/B") | ||
140 | h5ls("example.h5") | ||
141 | #write a data set | ||
142 | df <- data.frame(1L:5L, seq(0,1,length.out = 5), c('ab','cde','fghi','a','s'), | ||
143 | stringsAsFactors = F) | ||
144 | h5write(df,"example.h5","df") | ||
145 | h5ls("example.h5") | ||
146 | #reading data | ||
147 | readA <- h5read("example.h5","foo/A") | ||
148 | readB <- h5read("example.h5","foo/foobaa/B") | ||
149 | readdf <- h5read("example.h5",'df') | ||
150 | readA | ||
151 | |||
152 | |||
153 | #reading data from APIs | ||
154 | |||
155 | #subsetting | ||
156 | set.seed(13435) | ||
157 | X <- data.frame("var1"=sample(1:5),"var2"=sample(6:10),"var3"=sample(11:15)) | ||
158 | X <- X[sample(1:5),] | ||
159 | X$var2[c(1,3)] <- NA | ||
160 | #dealing with missing values | ||
161 | X[which(X$var2>8),] #which can ignore the NA values | ||
162 | #sorting川普 | ||
163 | sort(X$var1) | ||
164 | sort(X$var1,decreasing = TRUE) | ||
165 | sort(X$var2,na.last = TRUE) | ||
166 | #ordering | ||
167 | X[order(X$var1),] | ||
168 | #ording by plyr | ||
169 | library(plyr) | ||
170 | arrange(X,var1) | ||
171 | arrange(X,desc(var1)) | ||
172 | |||
173 | |||
174 | #creating new variables | ||
175 | #getting the data from the web | ||
176 | if (!file.exists("data")){dir.create("data")} | ||
177 | fileUrl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD" | ||
178 | download.file(fileUrl, destfile = "./data/restaurants.csv",method = "curl") | ||
179 | dateDownloaded <- date() | ||
180 | restData <- read.csv("./data/restaurants.csv") | ||
181 | |||
182 | #subsetting variables | ||
183 | restData$nearMe <- restData$neighborhood %in% c("Roland Park","Homeland") | ||
184 | table(restData$nearMe) | ||
185 | #creating binary variables | ||
186 | restData$zipWrong <- ifelse(restData$zipCode < 0, TRUE, FALSE) | ||
187 | with(restData,table(zipWrong,zipCode < 0)) | ||
188 | #creating categorical variable | ||
189 | restData$zipGroup <- cut(restData$zipCode, breaks = quantile(restData$zipCode)) | ||
190 | table(restData$zipGroup) | ||
191 | #easier cutting | ||
192 | library(Hmisc) | ||
193 | restData$zipGroup <- cut2(restData$zipCode,g=4) | ||
194 | table(restData$zipGroup) | ||
195 | #creating factor variables | ||
196 | restData$zcf <- factor(restData$zipCode) | ||
197 | #levels of factor | ||
198 | yesno <- sample(c("yes","no"),size = 10,replace = TRUE) | ||
199 | yesnofac <- factor(yesno,levels = c("yes","no")) | ||
200 | relevel(yesnofac,ref = "no") | ||
201 | #cutting produces factor variables | ||
202 | library(Hmisc) | ||
203 | restData$zipGroup <- cut2(restData$zipCode,g=4) | ||
204 | #using mutate function | ||
205 | library(Hmisc) | ||
206 | library(plyr) | ||
207 | restData2 <- mutate(restData,zipGroup=cut2(zipCode,g=4)) | ||
208 | table(restData2$zipGroup) | ||
209 | |||
210 | #smmarizing data | ||
211 | colSums(is.na(restData)) | ||
212 | all(colSums(is.na(restData)) == 0) | ||
213 | #values with specific character | ||
214 | table(restData$zipCode %in% c("21212","21213")) | ||
215 | #cross table | ||
216 | data("UCBAdmissions") | ||
217 | df1 <- as.data.frame(UCBAdmissions) | ||
218 | summary(df1) | ||
219 | xt <- xtabs(Freq ~Gender +Admit,data=df1) #sum | ||
220 | #flat tables | ||
221 | warpbreaks$replicate <- rep(1:9, len=54) | ||
222 | xt <- xtabs(breaks~.,data=warpbreaks) | ||
223 | ftable(xt) | ||
224 | #size of a data set | ||
225 | fakeData <- rnorm(1e5) | ||
226 | object.size(fakeData) | ||
227 | print(object.size(fakeData),units = "Mb") | ||
228 | |||
229 | |||
230 | #dplyr package | ||
231 | library(dplyr) | ||
232 | #select | ||
233 | chicago <- readRDS("./data/chicago.rds") | ||
234 | head(select(chicago, 1:5)) #select col | ||
235 | head(select(chicago, city:dptp)) | ||
236 | head(select(chicago,-(city:dptp))) #not select | ||
237 | #equivalent | ||
238 | i <- match("city", names(chicago)) | ||
239 | j <- match("dptp", names(chicago)) | ||
240 | head(chicago[,-(i:j)]) | ||
241 | |||
242 | #filter | ||
243 | chic <- filter(chicago, pm25tmean2 > 30) | ||
244 | head(select(chic,1:3,pm25tmean2)) | ||
245 | chic <- filter(chicago, pm25tmean2 > 30 & tmpd > 80) | ||
246 | head(select(chic,1:3,pm25tmean2,tmpd)) | ||
247 | |||
248 | #arrange (reorder rows while preserving corresponding order of other col) | ||
249 | chicago <- arrange(chicago, date) | ||
250 | head(select(chicago,date,pm25tmean2)) | ||
251 | #descending order | ||
252 | chicago <- arrange(chicago, desc(date)) | ||
253 | head(select(chicago,date,pm25tmean2)) | ||
254 | |||
255 | |||
256 | #rename | ||
257 | chicago <- rename(chicago, dewpoint = dptp, pm25 = pm25tmean2) | ||
258 | head(select(chicago,1:5)) | ||
259 | |||
260 | #mutate | ||
261 | chicago <- mutate(chicago, | ||
262 | pm25detrend = pm25 - mean(pm25,na.rm = TRUE)) | ||
263 | head(select(chicago,pm25,pm25detrend)) | ||
264 | |||
265 | #group_by (generate summary statistics by stratum) | ||
266 | chicago <- mutate(chicago, | ||
267 | tempcat=factor(1*(tmpd>80), labels = c("cold","hot"))) | ||
268 | #1* just transfor to numeric (0,1) | ||
269 | hotcold <- group_by(chicago, tempcat) | ||
270 | summarize(hotcold, pm25 = mean(pm25, na.rm = TRUE), | ||
271 | o3 = max(o3tmean2, na.rm = TRUE), | ||
272 | no2 = median(no2tmean2, na.rm = TRUE)) | ||
273 | |||
274 | chicago <- mutate(chicago, | ||
275 | year = as.POSIXlt(date)$year + 1900) | ||
276 | years <- group_by(chicago, year) | ||
277 | summarize(years, pm25 = mean(pm25, na.rm = T), | ||
278 | o3 = max(o3tmean2, na.rm = T), | ||
279 | no2 = median(no2tmean2, na.rm = T)) | ||
280 | |||
281 | chicago %>% mutate(month = as.POSIXlt(date)$mon +1) %>% group_by(month) %>% summarize(pm25 = mean(pm25, na.rm = T), | ||
282 | o3 = max(o3tmean2, na.rm = T), | ||
283 | no2 = median(no2tmean2, na.rm = T)) | ||
284 | |||
285 | |||
286 | if (!exists("./data")) {dir.create("./data")} | ||
287 | fileUrl1 <- "https://dl.dropboxusercontent.com/u/7710864/data/reviews-apr29.csv" | ||
288 | fileUrl2 <- "https://dl.dropboxusercontent.com/u/7710864/data/solutions-apr29.csv" | ||
289 | |||
290 | download.file(fileUrl1,destfile = "./data/reviews.csv",method = "curl") | ||
291 | download.file(fileUrl2,destfile = "./data/solutions.csv",method = "curl") | ||
292 | |||
293 | reviews <- read.csv("./data/reviews.csv") | ||
294 | solutions <- read.csv("./data/solutions.csv") | ||
295 | |||
296 | #merge data | ||
297 | mergedData <- merge(reviews, solutions, by.x = "solution_id", by.y="id",all=TRUE) | ||
298 | |||
299 | #join in the plyr package | ||
300 | library(plyr) | ||
301 | df1 <- data.frame(id = sample(1:10), x=rnorm(10)) | ||
302 | df2 <- data.frame(id = sample(1:10), y = rnorm(10)) | ||
303 | arrange(join(df1, df2),id) | ||
304 | #join multiple data frames | ||
305 | df1 <- data.frame(id = sample(1:10), x=rnorm(10)) | ||
306 | df2 <- data.frame(id = sample(1:10), y = rnorm(10)) | ||
307 | df3 <- data.frame(id = sample(1:10),z=rnorm(10)) | ||
308 | |||
309 | dfList <- list(df1,df2,df3) | ||
310 | join_all(dfList) | ||
311 | |||
312 | #working with date | ||
313 | d1 <- date() | ||
314 | class(d1) #charactor | ||
315 | |||
316 | d2 <- Sys.Date() | ||
317 | class(d2) #date | ||
318 | #formatting dates | ||
319 | format(d2, "%a %b %d") | ||
320 | |||
321 | x <- c("1jan1960","2jan1960") | ||
322 | z <- as.Date(x, "%d%b%Y") | ||
323 | #converting | ||
324 | weekdays(d2) | ||
325 | |||
326 | #lubridate | ||
327 | library(lubridate) | ||
328 | ymd("20140108") | ||
329 | ymd("2011-08-03") | ||
330 | |||
331 | #editing text variables | ||
332 | |||
333 | if (!file.exists("data")){dir.create("data")} | ||
334 | fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD" | ||
335 | download.file(fileUrl, destfile = "./data/cameras.csv",method = "curl") | ||
336 | |||
337 | camera <- read.csv("./data/cameras.csv") | ||
338 | tolower(names(camera)) | ||
339 | |||
340 | #fixing character vectors | ||
341 | splitNames <- strsplit(names(camera), "\\.") | ||
342 | splitNames[[6]][1] | ||
343 | firstElement <- function(x){x[1]} | ||
344 | sapply(splitNames,firstElement) | ||
345 | |||
346 | fileUrl1 <- "https://dl.dropboxusercontent.com/u/7710864/data/reviews-apr29.csv" | ||
347 | fileUrl2 <- "https://dl.dropboxusercontent.com/u/7710864/data/solutions-apr29.csv" | ||
348 | |||
349 | download.file(fileUrl1,destfile = "./data/reviews.csv",method = "curl") | ||
350 | download.file(fileUrl2,destfile = "./data/solutions.csv",method = "curl") | ||
351 | |||
352 | reviews <- read.csv("./data/reviews.csv") | ||
353 | solutions <- read.csv("./data/solutions.csv") | ||
354 | names(reviews) | ||
355 | #replacement | ||
356 | sub("_", "", names(reviews)) #just replace one | ||
357 | gsub() #replace all | ||
358 | |||
359 | |||
360 | #finding values | ||
361 | grep("Alameda", camera$intersection) #return location | ||
362 | grepl() #return T or F | ||
363 | |||
364 | grep("Alameda", camera$intersection, value = TRUE) #return exact element | ||
365 | |||
366 | |||
367 | #stringr library | ||
368 | library(stringr) | ||
369 | nchar("Jeffrey Leek") #count | ||
370 | substr("Jeffrey Leek",1,7) | ||
371 | paste("Jeffrey","Leek") #with blank | ||
372 | paste0("Jeffrey", "Leek") #without blank | ||
373 | str_trim("Jeff ") #remove the blanks (beginning and ending) | ||
374 | |||
375 | |||
376 | #reshaping | ||
377 | library(reshape2) | ||
378 | mtcars$carname <- rownames(mtcars) | ||
379 | carMelt <- melt(mtcars, id=c("carname","gear","cyl"), measure.vars = c("mpg","hp")) | ||
380 | |||
381 | |||
382 | #casting data frames | ||
383 | cylData <- dcast(carMelt,cyl~variable) #count | ||
384 | cylData <- dcast(carMelt,cyl~variable, mean) | ||
385 | |||
386 | #averaging values | ||
387 | head(InsectSprays) | ||
388 | tapply(InsectSprays$count,InsectSprays$spray,sum) # | ||
389 | #split | ||
390 | spIns <- split(InsectSprays$count, InsectSprays$spray) | ||
391 | sapply(spIns,sum) | ||
392 | #or | ||
393 | unlist(lapply(spIns,sum)) | ||
394 | |||
395 | #plyr package | ||
396 | library(plyr) | ||
397 | ddply(InsectSprays, "spray",summarize,sum=sum(count)) #return dataframe | ||
398 | |||
399 | |||
400 | |||
401 | |||
402 | |||
403 | |||
404 | |||
405 |
ggplot2.R
File was created | 1 | ||
2 | set.seed(1410) | ||
3 | |||
4 | dsmall <- diamonds[sample(nrow(diamonds), 100),] | ||
5 | |||
6 | |||
7 | #scatterplot | ||
8 | qplot(carat, price, data = diamonds) | ||
9 | qplot(log(carat), log(price), data = diamonds) | ||
10 | |||
11 | qplot(carat, x*y*z, data = diamonds) | ||
12 | qplot(carat, price, data = dsmall, colour = color) | ||
13 | qplot(carat, price, data = dsmall, shape = cut) | ||
14 | |||
15 | qplot(carat, price, data = diamonds, alpha = I(1/10)) | ||
16 | qplot(carat, price, data = diamonds, alpha = I(1/100)) | ||
17 | qplot(carat, price, data = diamonds, alpha = I(1/200)) | ||
18 | |||
19 | |||
20 | #geom | ||
21 | qplot(carat, price, data = dsmall, geom = c("point", "smooth")) | ||
22 | qplot(carat, price, data = diamonds, geom = c("point", "smooth")) | ||
23 | |||
24 | |||
25 | qplot(color, price/carat, data=diamonds,geom = "jitter") | ||
26 | qplot(color, price/carat, data=diamonds,geom = "boxplot") | ||
27 | qplot(color, price / carat, data = diamonds, geom = "jitter", | ||
28 | alpha = I(1 / 50)) | ||
29 | |||
30 | #offer size, colour, shape for scatterplot | ||
31 | #offer size, colour, fill for boxplot | ||
32 | qplot(color, price/carat, data=diamonds,geom = "boxplot",colour="red") | ||
33 | |||
34 | |||
35 | qplot(carat, data = diamonds, geom = "histogram") | ||
36 | qplot(carat, data = diamonds, geom = "density") | ||
37 | |||
38 | |||
39 | qplot(carat, data = diamonds, geom = "histogram", binwidth = 1, | ||
40 | xlim = c(0,3)) | ||
41 | qplot(carat, data = diamonds, geom = "histogram", binwidth = 0.1, | ||
42 | xlim = c(0,3)) | ||
43 | qplot(carat, data = diamonds, geom = "histogram", binwidth = 0.01, | ||
44 | xlim = c(0,3)) | ||
45 | |||
46 | qplot(carat, data = diamonds, geom = "density", colour = color) | ||
47 | qplot(carat, data = diamonds, geom = "histogram", fill = color, bins = 50) | ||
48 | |||
49 | qplot(color, data = diamonds, geom = "bar") | ||
50 | qplot(color, data = diamonds, geom = "bar", weight = carat) + | ||
51 | scale_y_continuous("carat") #total weight | ||
52 | |||
53 | |||
54 | qplot(date, unemploy / pop, data = economics, geom = "line") | ||
55 | qplot(date, uempmed, data = economics, geom = "line") | ||
56 | |||
57 | #path | ||
58 | year <- function(x) as.POSIXlt(x)$year + 1900 | ||
59 | qplot(unemploy / pop, uempmed, data = economics, | ||
60 | geom = c("point", "path")) | ||
61 | qplot(unemploy / pop, uempmed, data = economics, | ||
62 | geom = "path", colour = year(date)) + scale_area() | ||
63 | |||
64 | #faceting | ||
65 | # row var ∼ col var | ||
66 | qplot(carat, data = diamonds, facets = color ~ ., | ||
67 | geom = "histogram", binwidth = 0.1, xlim = c(0, 3)) | ||
68 | qplot(carat, ..density.., data = diamonds, facets = color ~ ., | ||
69 | geom = "histogram", binwidth = 0.1, xlim = c(0, 3)) | ||
70 | #Using ..density.. tells ggplot2 to map the density to the y-axis | ||
71 | #instead of the default use of count | ||
72 | #density plot makes it easier to compare distributions ignoring | ||
73 | #the relative abundance of diamonds within each colour | ||
74 | |||
75 | |||
76 | #see ?plotmath for more examples of using mathematical formulae | ||
77 | |||
78 | qplot( | ||
79 | carat, price, data = dsmall, | ||
80 | xlab = "Price ($)", ylab = "Weight (carats)", | ||
81 | main = "Price-weight relationship" ) | ||
82 | |||
83 | |||
84 | qplot( | ||
85 | carat, price/carat, data = dsmall, | ||
86 | xlab = "Weight(carats)", ylab = "expression(frac(price,carat))", | ||
87 | main = "Small diamonds", | ||
88 | xlim = c(.2, 1)) | ||
89 | |||
90 | qplot(carat, price, data = dsmall, log = "xy") | ||
91 | |||
92 | qplot(displ, hwy, data = mpg, colour = factor(cyl)) #factor | ||
93 | |||
94 | #ggplot | ||
95 | p <- ggplot(diamonds, aes(x = carat)) | ||
96 | p <- p + layer( | ||
97 | geom = "bar", | ||
98 | geom_params = list(fill = "steelblue"), | ||
99 | stat = "bin", | ||
100 | stat_params = list(binwidth = 2) | ||
101 | ) | ||
102 | #layer(geom, geom_params, stat, stat_params, data, mapping, position) | ||
103 | ggplot(msleep, aes(sleep_rem / sleep_total, awake)) + | ||
104 | geom_point() | ||
105 | |||
106 | #geom_XXX(mapping, data, ..., geom, position) | ||
107 | #stat_XXX(mapping, data, ..., stat, position) | ||
108 | p <- ggplot(mtcars, aes(mpg, wt)) + | ||
109 | geom_point() | ||
110 | |||
111 | p + geom_point(aes(colour = factor(cyl))) | ||
112 | p + geom_point(aes(y=disp)) | ||
113 | |||
114 | p <- ggplot(mtcars, aes(mpg, wt)) | ||
115 | p + geom_point(colour = "darkblue") | ||
116 | p + geom_point(aes(colour = "darkblue")) | ||
117 | |||
118 | p + geom_point(aes(colour = factor(cyl))) | ||
119 | |||
120 | p <- ggplot(Oxboys, aes(age, height, group = Subject)) + geom_line() | ||
121 | |||
122 | p + geom_smooth(aes(group = Subject), method = "lm", se = F) | ||
123 | |||
124 | p + geom_smooth(aes(group = 1), method = "lm", se = F) | ||
125 | |||
126 | boysbox <- ggplot(Oxboys, aes(Occasion, height)) + geom_boxplot() | ||
127 | |||
128 | boysbox + geom_line(aes(group=Subject), color=I(8)) | ||
129 | |||
130 | xgrid <- with(df, seq(min(x), max(x), length=50)) | ||
131 | |||
132 | p <- ggplot(diamonds, aes(carat)) + | ||
133 | geom_histogram(aes(y=..count..), binwidth = 0.1) | ||
134 | p <- ggplot(diamonds, aes(carat)) + | ||
135 | geom_histogram(aes(y=..density..), binwidth = 0.1) | ||
136 | qplot(carat, ..density.., data=diamonds, geom = "histogram",binwidth = 0.1) | ||
137 | |||
138 | d <- ggplot(diamonds,aes(carat)) + xlim(0, 3) | ||
139 | d + stat_bin(aes(ymax = ..count..), binwidth = 0.1, geom = "area") | ||
140 | d + stat_bin(aes(size = ..density..), binwidth = 0.1, geom = "point", position = "identity") | ||
141 | d + stat_bin(aes(y = 1, fill = ..count..), binwidth = 0.1,geom = "tile", position = "identity") | ||
142 | |||
143 | #graph cook book | ||
144 | library(gcookbook) | ||
145 | library(ggplot2) | ||
146 | #chapte 2 quickly exploring data | ||
147 | |||
148 | #creating a scatter plot | ||
149 | plot(mtcars$wt, mtcars$mpg) | ||
150 | qplot(mtcars$wt, mtcars$mpg) | ||
151 | qplot(wt, mpg, data=mtcars) | ||
152 | # This is equivalent to: | ||
153 | ggplot(mtcars, aes(x=wt, y=mpg)) + geom_point() | ||
154 | |||
155 | #creating line plot | ||
156 | plot(pressure$temperature, pressure$pressure, type="l") | ||
157 | |||
158 | qplot(pressure$temperature, pressure$pressure, geom="line") | ||
159 | qplot(temperature, pressure, data=pressure, geom="line") | ||
160 | # This is equivalent to: | ||
161 | ggplot(pressure, aes(x=temperature, y=pressure)) + geom_line() | ||
162 | # Lines and points together | ||
163 | qplot(temperature, pressure, data=pressure, geom=c("line", "point")) | ||
164 | # Equivalent to: | ||
165 | ggplot(pressure, aes(x=temperature, y=pressure)) + geom_line() + geom_point() | ||
166 | |||
167 | |||
168 | #creating bar graph | ||
169 | barplot(BOD$demand, names.arg=BOD$Time) #name | ||
170 | |||
171 | # Generate a table of countsqplot(supp, len, data=ToothGrowth, geom="boxplot") | ||
172 | # This is equivalent to: | ||
173 | ggplot(ToothGrowth, aes(x=supp, y=len)) + geom_boxplot() | ||
174 | barplot(table(mtcars$cyl)) | ||
175 | library(ggplot2) | ||
176 | ggplot(BOD, aes(x=factor(Time), y=demand)) + geom_bar(stat="identity") | ||
177 | # Bar graph of counts | ||
178 | qplot(factor(cyl), data=mtcars) | ||
179 | # This is equivalent to: | ||
180 | ggplot(mtcars, aes(x=factor(cyl))) + geom_bar() | ||
181 | |||
182 | #creating histogram | ||
183 | # Specify approximate number of bins with breaks | ||
184 | hist(mtcars$mpg, breaks=10) | ||
185 | library(ggplot2) | ||
186 | qplot(mpg, data=mtcars, binwidth=1) | ||
187 | # This is equivalent to: | ||
188 | ggplot(mtcars, aes(x=mpg)) + geom_histogram(binwidth=1) | ||
189 | |||
190 | #creating box plot | ||
191 | # Formula syntax | ||
192 | boxplot(len ~ supp, data = ToothGrowth) | ||
193 | library(ggplot2) | ||
194 | qplot(supp, len, data=ToothGrowth, geom="boxplot") | ||
195 | # This is equivalent to: | ||
196 | ggplot(ToothGrowth, aes(x=supp, y=len)) + geom_boxplot() | ||
197 | ggplot(ToothGrowth, aes(x=interaction(supp, dose), y=len)) + geom_boxplot() | ||
198 | |||
199 | #plot a function curve | ||
200 | curve(x^3 - 5*x, from=-4, to=4) | ||
201 | # Plot a user-defined function | ||
202 | myfun <- function(xvar) { | ||
203 | 1/(1 + exp(-xvar + 10)) | ||
204 | } | ||
205 | curve(myfun(x), from=0, to=20) | ||
206 | # Add a line: | ||
207 | curve(1-myfun(x), add = TRUE, col = "red") | ||
208 | library(ggplot2) | ||
209 | # This is equivalent to: | ||
210 | ggplot(data.frame(x=c(0, 20)), aes(x=x)) + stat_function(fun=myfun, geom="line") | ||
211 | |||
212 | |||
213 | #chapter 3 Bar graphs | ||
214 | ggplot(pg_mean, aes(x=group, y=weight)) + geom_bar(stat="identity") | ||
215 | |||
216 | ggplot(pg_mean, aes(x=group, y=weight)) + | ||
217 | geom_bar(stat="identity", fill="lightblue", colour="black") | ||
218 | |||
219 | |||
220 | #group by a second variable | ||
221 | ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) + | ||
222 | geom_bar(position="dodge", colour="black",stat="identity") #default stat is count | ||
223 | |||
224 | #stat="identity" means using sum | ||
225 | |||
226 | ggplot(diamonds, aes(x=cut)) + geom_bar(stat = "count") #default stat is count | ||
227 | |||
228 | |||
229 | upc <- subset(uspopchange, rank(Change)>40) | ||
230 | ggplot(upc, aes(x=Abb, y=Change, fill=Region)) + geom_bar(stat="identity") | ||
231 | |||
232 | ggplot(upc, aes(x=reorder(Abb, Change), y=Change, fill=Region)) + | ||
233 | geom_bar(stat="identity", colour="black") + | ||
234 | scale_fill_manual(values=c("#669933", "#FFCC66")) + | ||
235 | xlab("State") | ||
236 | |||
237 | #using the reorder() function to reorder the levels of a factor based on | ||
238 | #the values of another variable | ||
239 | |||
240 | |||
241 | #Coloring Negative and Positive Bars Differently | ||
242 | csub <- subset(climate, Source=="Berkeley" & Year >= 1900) | ||
243 | csub$pos <- csub$Anomaly10y >= 0 | ||
244 | ggplot(csub, aes(x=Year, y=Anomaly10y, fill=pos)) + | ||
245 | geom_bar(stat="identity", position="identity") #default position is stack | ||
246 | |||
247 | ggplot(csub, aes(x=Year, y=Anomaly10y, fill=pos)) + | ||
248 | geom_bar(stat="identity", position="identity", colour="black", size=0.25) + | ||
249 | scale_fill_manual(values=c("#CCEEFF", "#FFDDDD"), guide=FALSE) | ||
250 | |||
251 | |||
252 | #Adjusting Bar Width and Spacing | ||
253 | #default value is 0.9 | ||
254 | ggplot(pg_mean, aes(x=group, y=weight)) + geom_bar(stat="identity", width=0.5) | ||
255 | #a grouped bar graph with narrow bars | ||
256 | ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) + | ||
257 | geom_bar(stat="identity", width=0.5, position="dodge") | ||
258 | #with some space between the bars | ||
259 | ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) + | ||
260 | geom_bar(stat="identity", width=0.6, position=position_dodge(0.8)) #space = 0.8-0.6 | ||
261 | #To be more precise, the value of width in position_dodge() is the same as | ||
262 | #width in geom_bar() | ||
263 | |||
264 | |||
265 | #Making a Stacked Bar Graph | ||
266 | ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) + | ||
267 | geom_bar(stat="identity") | ||
268 | #One problem with the default output is that the stacking order is the opposite of the | ||
269 | #order of items in the legend | ||
270 | #reverse the legend | ||
271 | ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) + | ||
272 | geom_bar(stat="identity") + | ||
273 | guides(fill=guide_legend(reverse=TRUE)) | ||
274 | #reverse the stacking order | ||
275 | libary(plyr) # Needed for desc() | ||
276 | ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar, order=desc(Cultivar))) + | ||
277 | geom_bar(stat="identity") | ||
278 | #get a different color palette | ||
279 | ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) + | ||
280 | geom_bar(stat="identity", colour="black") + | ||
281 | guides(fill=guide_legend(reverse=TRUE)) + | ||
282 | scale_fill_brewer(palette="Pastel1") | ||
283 | |||
284 | |||
285 | #Making a Proportional Stacked Bar Graph | ||
286 | library(gcookbook) | ||
287 | library(plyr) | ||
288 | ce <- ddply(cabbage_exp,"Date",transform, | ||
289 | percent_weight = Weight / sum(Weight)*100) | ||
290 | |||
291 | ggplot(ce, aes(x=Date, y= percent_weight, fill =Cultivar)) + | ||
292 | geom_bar(stat="identity") | ||
293 | #change a little bit | ||
294 | ggplot(ce, aes(x=Date, y= percent_weight, fill =Cultivar)) + | ||
295 | geom_bar(stat="identity", color="black") + | ||
296 | guides(fill=guide_legend(reverse = TRUE)) + | ||
297 | scale_fill_brewer(palette = "Pastel1") | ||
298 | |||
299 | #adding labels to a bar graph | ||
300 | # Below the top | ||
301 | ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) + | ||
302 | geom_bar(stat="identity") + | ||
303 | geom_text(aes(label=Weight), vjust=1.5, colour="white") | ||
304 | |||
305 | # Above the top | ||
306 | ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) + | ||
307 | geom_bar(stat="identity") + | ||
308 | geom_text(aes(label=Weight), vjust=-0.2) | ||
309 | |||
310 | #adjust y limits to be a little higher | ||
311 | ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) + | ||
312 | geom_bar(stat="identity") + | ||
313 | geom_text(aes(label=Weight), vjust=-0.2) + | ||
314 | ylim(0, max(cabbage_exp$Weight) * 1.05) | ||
315 | |||
316 | # Map y positions slightly above bar top - y range of plot will auto-adjust | ||
317 | ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) + | ||
318 | geom_bar(stat="identity") + | ||
319 | geom_text(aes(y=Weight+.1, label=Weight)) | ||
320 | |||
321 | #dodging | ||
322 | ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) + | ||
323 | geom_bar(stat="identity", position="dodge") + | ||
324 | geom_text(aes(label=Weight), vjust=1.5, colour="white", | ||
325 | position=position_dodge(0.9), size=3) | ||
326 | |||
327 | |||
328 | #label on stack bar | ||
329 | library(plyr) | ||
330 | # Sort by the day and sex columns | ||
331 | ce <- arrange(cabbage_exp, Date, Cultivar) | ||
332 | #get the cumulative sum | ||
333 | ce <- ddply(ce, "Date", transform, label_y=cumsum(Weight)) | ||
334 | |||
335 | ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) + | ||
336 | geom_bar(stat="identity") + | ||
337 | geom_text(aes(y=label_y, label=Weight), vjust=1.5, colour="white") + | ||
338 | guides(fill=guide_legend(reverse=TRUE)) | ||
339 | |||
340 | #put the label in the center | ||
341 | ce <- arrange(cabbage_exp, Date, Cultivar) | ||
342 | # Calculate y position, placing it in the middle | ||
343 | ce <- ddply(ce, "Date", transform, label_y=cumsum(Weight)-0.5*Weight) #good | ||
344 | ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) + | ||
345 | geom_bar(stat="identity") + | ||
346 | geom_text(aes(y=label_y, label=Weight), colour="white") | ||
347 | |||
348 | #polish | ||
349 | ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) + | ||
350 | geom_bar(stat="identity") + | ||
351 | geom_text(aes(y=label_y, label=paste(format(Weight, nsmall=2), "kg")), | ||
352 | size=4) + | ||
353 | guides(fill=guide_legend(reverse=TRUE)) + scale_fill_brewer(palette="Pastel1") | ||
354 | #coord_flip() | ||
355 | |||
356 | #making a Cleveland Dot Plot | ||
357 | library(gcookbook) | ||
358 | tophit <- tophitters2001[1:25,] | ||
359 | |||
360 | ggplot(tophit, aes(x=avg, y=name)) + geom_point() | ||
361 | |||
362 | ggplot(tophit, aes(x=avg, y=reorder(name, avg))) + | ||
363 | geom_point(size=3) + # Use a larger dot | ||
364 | theme_bw() + | ||
365 | theme(panel.grid.major.x = element_blank(), | ||
366 | panel.grid.minor.x = element_blank(), | ||
367 | panel.grid.major.y = element_line(colour="grey60", linetype="dashed")) | ||
368 | |||
369 | ggplot(tophit, aes(x=reorder(name, avg), y=avg)) + | ||
370 | geom_point(size=3) + # Use a larger dot | ||
371 | theme_bw() + | ||
372 | theme(axis.text.x = element_text(angle=60, hjust=1), | ||
373 | panel.grid.major.y = element_blank(), | ||
374 | panel.grid.minor.y = element_blank(), | ||
375 | panel.grid.major.x = element_line(colour="grey60", linetype="dashed")) | ||
376 | |||
377 | #reorder onlu by one variable | ||
378 | #order by two variables | ||
379 | # Get the names, sorted first by lg, then by avg | ||
380 | nameorder <- tophit$name[order(tophit$lg, tophit$avg)] | ||
381 | # Turn name into a factor, with levels in the order of nameorder | ||
382 | tophit$name <- factor(tophit$name, levels=nameorder) | ||
383 | |||
384 | |||
385 | ggplot(tophit, aes(x=avg, y=name)) + | ||
386 | geom_segment(aes(yend=name), xend=0, colour="grey50") + | ||
387 | geom_point(size=3, aes(colour=lg)) + | ||
388 | scale_colour_brewer(palette="Set1", limits=c("NL","AL")) + #limits order the posion | ||
389 | theme_bw() + | ||
390 | theme(panel.grid.major.y = element_blank(), # No horizontal grid lines | ||
391 | legend.position=c(1, 0.55), # Put legend inside plot area | ||
392 | legend.justification=c(1, 0.5)) | ||
393 | |||
394 | ggplot(tophit, aes(x=avg, y=name)) + | ||
395 | geom_segment(aes(yend=name), xend=0, colour="grey50") + | ||
396 | geom_point(size=3, aes(colour=lg)) + | ||
397 | scale_colour_brewer(palette="Set1", limits=c("NL","AL"), guide=FALSE) + | ||
398 | theme_bw() + | ||
399 | theme(panel.grid.major.y = element_blank()) + | ||
400 | facet_grid(lg ~ ., scales="free_y", space="free_y") | ||
401 | |||
402 | |||
403 | #Making a Basic Line Graph | ||
404 | ggplot(BOD, aes(x=Time, y=demand)) + geom_line() | ||
405 | |||
406 | d1 <- BOD | ||
407 | d1$Time <- factor(d1$Time) | ||
408 | ggplot(d1, aes(x=Time, y=demand,group=1)) +geom_line() | ||
409 | #add y limit | ||
410 | ggplot(BOD, aes(x=Time, y=demand)) + geom_line() + ylim(0, max(BOD$demand)) | ||
411 | ggplot(BOD, aes(x=Time, y=demand)) + geom_line() + expand_limits(y=0) #same | ||
412 | |||
413 | #adding points to a line graph | ||
414 | ggplot(BOD, aes(x=Time, y=demand)) + geom_line() + geom_point() | ||
415 | |||
416 | library(gcookbook) | ||
417 | ggplot(worldpop, aes(x=Year, y=Population)) + | ||
418 | geom_line() + | ||
419 | geom_point() | ||
420 | |||
421 | #with a log y-axis | ||
422 | ggplot(worldpop, aes(x=Year, y= Population)) + | ||
423 | geom_line() + | ||
424 | geom_point() + | ||
425 | scale_y_log10() | ||
426 | |||
427 | #making a line graph with multiple lines | ||
428 | #summarize the data | ||
429 | library(plyr) | ||
430 | tg <- ddply(ToothGrowth, c("supp","dose"),summarise, length=mean(len)) | ||
431 | |||
432 | #map supp to colour | ||
433 | ggplot(tg, aes(x=dose,y=length,colour=supp)) + geom_line() | ||
434 | |||
435 | #map supp to linetype | ||
436 | ggplot(tg, aes(x=dose, y=length, linetype=supp)) + geom_line() | ||
437 | |||
438 | #factor + group (line point together) | ||
439 | ggplot(tg, aes(x=factor(dose), y=length,colour=supp,group=supp)) + geom_line() | ||
440 | # Make the points a little larger | ||
441 | ggplot(tg, aes(x=dose, y=length, shape=supp)) + geom_line() + geom_point(size=4) | ||
442 | # Also use a point with a color fill | ||
443 | ggplot(tg, aes(x=dose, y=length, fill=supp)) + geom_line() + geom_point(size=4, shape=21) | ||
444 | #dodge them | ||
445 | ggplot(tg, aes(x=dose, y=length, shape=supp)) + | ||
446 | geom_line(position=position_dodge(0.2)) + # Dodge lines by 0.2 | ||
447 | geom_point(position=position_dodge(0.2), size=4) # Dodge points by 0.2 | ||
448 | |||
449 | #change the apperance of lines | ||
450 | ggplot(BOD, aes(x=Time, y=demand)) + | ||
451 | geom_line(linetype = "dashed", size = 1, color = "blue") | ||
452 | |||
453 | # Load plyr so we can use ddply() to create the example data set | ||
454 | library(plyr) | ||
455 | # Summarize the ToothGrowth data | ||
456 | tg <- ddply(ToothGrowth, c("supp", "dose"), summarise, length=mean(len)) | ||
457 | ggplot(tg, aes(x=dose, y=length, colour=supp)) + | ||
458 | geom_line() + | ||
459 | scale_colour_brewer(palette="Set1") | ||
460 | # If both lines have the same properties, you need to specify a variable to # use for grouping | ||
461 | ggplot(tg, aes(x=dose, y=length, group=supp)) + | ||
462 | geom_line(colour="darkgreen", size=1.5) | ||
463 | # Since supp is mapped to colour, it will automatically be used for grouping | ||
464 | ggplot(tg, aes(x=dose, y=length, colour=supp)) + | ||
465 | geom_line(linetype="dashed") + | ||
466 | geom_point(shape=22, size=3, fill="white") | ||
467 | |||
468 | #change the appearance of points | ||
469 | ggplot(BOD, aes(x=Time, y=demand)) + | ||
470 | geom_line() + | ||
471 | geom_point(size=4, shape=22, colour="darkred", fill="pink") | ||
472 | #The default shape for points is a solid circle, | ||
473 | #the default size is 2, and the default colour is "black" | ||
474 | |||
475 | # Load plyr so we can use ddply() to create the example data set | ||
476 | library(plyr) | ||
477 | # Summarize the ToothGrowth data | ||
478 | tg <- ddply(ToothGrowth, c("supp", "dose"), summarise, length=mean(len)) | ||
479 | # Save the position_dodge specification because we'll use it multiple times | ||
480 | pd <- position_dodge(0.2) | ||
481 | ggplot(tg, aes(x=dose, y=length, fill=supp)) + | ||
482 | geom_line(position=pd) + | ||
483 | geom_point(shape=21, size=3, position=pd) + | ||
484 | scale_fill_manual(values=c("black","white")) | ||
485 | |||
486 | |||
487 | #making a graph with a shaded area | ||
488 | sunspotyear <- data.frame( | ||
489 | Year = as.numeric(time(sunspot.year)), | ||
490 | Sunspots = as.numeric(sunspot.year) | ||
491 | ) | ||
492 | ggplot(sunspotyear, aes(x=Year, y=Sunspots)) + geom_area() | ||
493 | #change area appearance (with outline) | ||
494 | ggplot(sunspotyear, aes(x=Year, y=Sunspots)) + | ||
495 | geom_area(colour="black", fill="blue", alpha=.2) | ||
496 | #no bottom line | ||
497 | ggplot(sunspotyear, aes(x=Year, y=Sunspots)) + | ||
498 | geom_area(fill="blue", alpha=.2) + | ||
499 | geom_line() | ||
500 | |||
501 | #making a stacked area graph | ||
502 | library(gcookbook) | ||
503 | ggplot(uspopage, aes(x=Year, y= Thousands,fill=AgeGroup)) + | ||
504 | geom_area() | ||
505 | |||
506 | #reverse the legend order | ||
507 | ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup)) + | ||
508 | geom_area(colour="black", size=.2, alpha=.4) + | ||
509 | scale_fill_brewer(palette="Blues") + | ||
510 | guides(fill=guide_legend(reverse=TRUE)) | ||
511 | #or | ||
512 | ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup)) + | ||
513 | geom_area(colour="black", size=.2, alpha=.4) + | ||
514 | scale_fill_brewer(palette="Blues", breaks=rev(levels(uspopage$AgeGroup))) | ||
515 | |||
516 | |||
517 | #or change the order of fill | ||
518 | library(plyr) # For the desc() function | ||
519 | ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup, order=desc(AgeGroup))) + | ||
520 | geom_area(colour="black", size=.2, alpha=.4) + | ||
521 | scale_fill_brewer(palette="Blues") | ||
522 | |||
523 | |||
524 | #remove the left and right sides | ||
525 | ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup, order=desc(AgeGroup))) + | ||
526 | geom_area(colour=NA, alpha=.4) + | ||
527 | scale_fill_brewer(palette="Blues") + | ||
528 | geom_line(position="stack", size=.2) #default position is "identity" | ||
529 | |||
530 | |||
531 | #making a proportional stacked area graph | ||
532 | #first calculate the proportions | ||
533 | library(gcookbook) | ||
534 | library(plyr) | ||
535 | #convert to percent | ||
536 | p <- ddply(uspopage,"Year",transform, | ||
537 | Percent = Thousands/sum(Thousands)*100) | ||
538 | ggplot(p,aes(x=Year, y= Percent, fill = AgeGroup)) + | ||
539 | geom_area(colour = "black",size=.2,alpha = .4) + | ||
540 | scale_fill_brewer(palette="Blues",breaks=rev(levels(p$AgeGroup))) | ||
541 | |||
542 | #adding a confidence region | ||
543 | library(gcookbook) | ||
544 | clim <- subset(climate, Source == "Berkeley", | ||
545 | select = c("Year","Anomaly10y","Unc10y")) | ||
546 | #the same | ||
547 | library(dplyr) | ||
548 | clim <- climate %>% filter(Source == "Berkeley") %>% select(Year, Anomaly10y, Unc10y) | ||
549 | #shaded region | ||
550 | ggplot(clim, aes(x=Year,y= Anomaly10y)) + | ||
551 | geom_ribbon(aes(ymin=Anomaly10y-Unc10y, ymax=Anomaly10y+Unc10y), | ||
552 | alpha=0.2) + | ||
553 | geom_line() | ||
554 | #we need know the 95% confidence interval ahead | ||
555 | |||
556 | # With a dotted line for upper and lower bounds | ||
557 | ggplot(clim, aes(x=Year, y=Anomaly10y)) + | ||
558 | geom_line(aes(y=Anomaly10y-Unc10y), colour="grey50", linetype="dotted") + | ||
559 | geom_line(aes(y=Anomaly10y+Unc10y), colour="grey50", linetype="dotted") + | ||
560 | geom_line() | ||
561 | |||
562 | |||
563 | |||
564 | ####scatter plots | ||
565 | #making a basic scatter plot | ||
566 | ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point() | ||
567 | #the default solid circles (shape #16) is hollow ones (#21) | ||
568 | ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point(shape=21) | ||
569 | ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point(size=1.5) | ||
570 | #default value of size is 2 | ||
571 | |||
572 | #grouping data points by a variable using shape or color | ||
573 | library(gcookbook) | ||
574 | #mapping color | ||
575 | ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) + geom_point() | ||
576 | #mapping shape | ||
577 | ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex)) + geom_point() +geom_line(group=sex) | ||
578 | |||
579 | |||
580 | ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex, colour=sex)) + | ||
581 | geom_point() | ||
582 | |||
583 | ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex, colour=sex)) + | ||
584 | geom_point() + | ||
585 | scale_shape_manual(values=c(1,2)) + | ||
586 | scale_colour_brewer(palette="Set1") | ||
587 | |||
588 | #Using different point shapes | ||
589 | library(gcookbook) | ||
590 | ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point(shape=3) | ||
591 | |||
592 | # Use slightly larger points and use a shape scale with custom values | ||
593 | ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex)) + | ||
594 | geom_point(size=3) + scale_shape_manual(values=c(1, 4)) | ||
595 | #Some of the point shapes (1–14) have just an outline, | ||
596 | #some (15–20) are solid, and | ||
597 | #some (21–25) have an outline and fill that can be controlled separately | ||
598 | |||
599 | #shape represent one variable and | ||
600 | #the fill (empty or solid) represent another variable | ||
601 | # Make a copy of the data | ||
602 | hw <- heightweight | ||
603 | # Categorize into <100 and >=100 groups | ||
604 | hw$weightGroup <- cut(hw$weightLb, breaks=c(-Inf, 100, Inf), | ||
605 | labels=c("< 100", ">= 100")) | ||
606 | ggplot(hw, aes(x=ageYear, y=heightIn, shape=sex, fill=weightGroup)) + | ||
607 | geom_point(size=2.5) + scale_shape_manual(values=c(21, 24)) + | ||
608 | scale_fill_manual(values=c(NA, "black"), | ||
609 | guide=guide_legend(override.aes=list(shape=21))) | ||
610 | |||
611 | #mapping a continuous variable to color or size | ||
612 | library(gcookbook) | ||
613 | ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=weightLb)) + geom_point() | ||
614 | ggplot(heightweight, aes(x=ageYear, y=heightIn, size=weightLb)) + geom_point() | ||
615 | |||
616 | #fill gradient to go from black to white | ||
617 | #make the points larger | ||
618 | ggplot(heightweight, aes(x=weightLb, y=heightIn, fill=ageYear)) + | ||
619 | geom_point(shape=21, size=2.5) + | ||
620 | scale_fill_gradient(low="black", high="white") | ||
621 | # Using guide_legend() will result in a discrete legend instead of a colorbar | ||
622 | ggplot(heightweight, aes(x=weightLb, y=heightIn, fill=ageYear)) + | ||
623 | geom_point(shape=21, size=2.5) + | ||
624 | scale_fill_gradient(low="black", high="white", breaks=12:17, | ||
625 | guide=guide_legend()) | ||
626 | |||
627 | ggplot(heightweight, aes(x=ageYear, y=heightIn, size=weightLb, colour=sex)) + | ||
628 | geom_point(alpha=.5) + | ||
629 | scale_colour_brewer(palette="Set1") + | ||
630 | scale_size_area() # Make area proportional to numeric value | ||
631 | |||
632 | #dealing with overplotting | ||
633 | sp <- ggplot(diamonds, aes(x=carat, y=price)) | ||
634 | sp + geom_point(alpha=0.1) | ||
635 | sp + geom_point(alpha=.01) | ||
636 | #bin the points into rectangle | ||
637 | sp + stat_bin2d() | ||
638 | sp + stat_bin2d(bins=50) + | ||
639 | scale_fill_gradient(low="lightblue", high="red", limits=c(0, 6000)) | ||
640 | library(hexbin) | ||
641 | sp + stat_binhex() + | ||
642 | scale_fill_gradient(low="lightblue", high="red", | ||
643 | limits=c(0, 8000)) | ||
644 | sp + stat_binhex() + | ||
645 | scale_fill_gradient(low="lightblue", high="red", | ||
646 | breaks=c(0, 250, 500, 1000, 2000, 4000, 6000), | ||
647 | limits=c(0, 6000)) | ||
648 | |||
649 | #one discrete axis and one continuous axis | ||
650 | sp1 <- ggplot(ChickWeight, aes(x=Time, y=weight)) | ||
651 | sp1 + geom_point() | ||
652 | sp1 + geom_point(position="jitter") | ||
653 | # Could also use geom_jitter(), which is equivalent | ||
654 | sp1 + geom_point(position=position_jitter(width=.5, height=0)) | ||
655 | #using boxplot | ||
656 | sp1 + geom_boxplot(aes(group=Time)) | ||
657 | |||
658 | #adding fitted regression model lines | ||
659 | library(gcookbook) | ||
660 | sp <- ggplot(heightweight, aes(x=ageYear, y= heightIn)) | ||
661 | sp + geom_point() + stat_smooth(method = lm) | ||
662 | #99% confidence region | ||
663 | sp + geom_point() + stat_smooth(method = lm, level = 0.99) | ||
664 | #No confidence region | ||
665 | sp + geom_point() + stat_smooth(method=lm, se=F) | ||
666 | |||
667 | sp + geom_point(color="grey60") + | ||
668 | stat_smooth(method = lm, se=F, color="black") | ||
669 | |||
670 | #the default method is locally weighted polynomial curve | ||
671 | sp + geom_point(color="grey60") + stat_smooth(method=loess) | ||
672 | |||
673 | #with logistic regressio curve | ||
674 | library(MASS) # For the data set | ||
675 | b <- biopsy | ||
676 | b$classn[b$class=="benign"] <- 0 | ||
677 | b$classn[b$class=="malignant"] <- 1 | ||
678 | ggplot(b, aes(x=V1,y=classn)) + | ||
679 | geom_point(position=position_jitter(width = .3,height=.06), | ||
680 | alpha=.4, shape=21,size=1.5) + | ||
681 | stat_smooth(method=glm, method.args = list(family="binomial")) | ||
682 | #parameter is no longer used | ||
683 | |||
684 | #group points with fitted line | ||
685 | sps <- ggplot(heightweight, aes(x=ageYear, y=heightIn, color = sex)) + | ||
686 | geom_point() + | ||
687 | scale_color_brewer(palette = "Set1") | ||
688 | sps + geom_smooth() | ||
689 | |||
690 | sps + geom_smooth(method = lm, se = F, fullrange= T) | ||
691 | |||
692 | |||
693 | #adding fitted lines from an existing model | ||
694 | library(gcookbook) | ||
695 | model <- lm(heightIn~ ageYear + I(ageYear^2), data = heightweight) | ||
696 | xmin <- min(heightweight$ageYear) | ||
697 | xmax <- max(heightweight$ageYear) | ||
698 | |||
699 | predicted <- data.frame(ageYear = seq(xmin,xmax,length.out = 200)) | ||
700 | predicted$heightIn <- predict(model, predicted) | ||
701 | |||
702 | sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn)) + | ||
703 | geom_point(color = "grey40") | ||
704 | sp + geom_line(data=predicted,size=1) | ||
705 | |||
706 | # Given a model, predict values of yvar from xvar | ||
707 | # This supports one predictor and one predicted variable | ||
708 | # xrange: If NULL, determine the x range from the model object. If a vector with # two numbers, use those as the min and max of the prediction range. | ||
709 | # samples: Number of samples across the x range. | ||
710 | # ...: Further arguments to be passed to predict() | ||
711 | predictvals <- function(model, xvar, yvar, xrange=NULL, samples = 100,...){ | ||
712 | if (is.null(xrange)) { | ||
713 | if (any(class(model) %in% c("lm","glm"))) | ||
714 | xrange <- range(model$model[[xvar]]) | ||
715 | else if (any(class(model) %in% "loess")) | ||
716 | xrange <- range(model$x) | ||
717 | } | ||
718 | newdata <- data.frame(x=seq(xrange[1],xrange[2],length.out = samples)) | ||
719 | names(newdata) <- xvar | ||
720 | newdata[yvar] <- predict(model,newdata = newdata,...) | ||
721 | newdata | ||
722 | } | ||
723 | |||
724 | modlinear <- lm(heightIn ~ ageYear, heightweight) | ||
725 | modloess <- loess(heightIn ~ ageYear, heightweight) | ||
726 | #predict | ||
727 | lm_predicted <- predictvals(modlinear, "ageYear", "heightIn") | ||
728 | loess_predicted <- predictvals(modloess, "ageYear", "heightIn") | ||
729 | #multiple fitted lines | ||
730 | sp + geom_line(data=lm_predicted, colour="red", size=.8) + | ||
731 | geom_line(data=loess_predicted, colour="blue", size=.8) | ||
732 | |||
733 | library(MASS) # For the data set b <- biopsy | ||
734 | b$classn[b$class=="benign"] <- 0 | ||
735 | b$classn[b$class=="malignant"] <- 1 | ||
736 | fitlogistic <- glm(classn~V1,data = b, family = binomial) | ||
737 | |||
738 | #get predicted values | ||
739 | glm_predicted <- predictvals(fitlogistic, "V1","classn",type="response") | ||
740 | ggplot(b,aes(x=V1, y=classn)) + | ||
741 | geom_point(position = position_jitter(width = .3,height=.08),alpha=.4, | ||
742 | shape=21,size=1.5) + | ||
743 | geom_line(data=glm_predicted, color="#1177FF",size=1) | ||
744 | |||
745 | #adding fitted lines from multiple exsiting models | ||
746 | make_model <- function(data) { lm(heightIn ~ ageYear, data)} | ||
747 | library(gcookbook) | ||
748 | library(plyr) | ||
749 | models <- dlply(heightweight,"sex",.fun=make_model) | ||
750 | #For each subset of a data frame, apply function then combine results into a list. | ||
751 | pred <- ldply(models,.fun = predictvals,xvar="ageYear",yvar="heightIn") | ||
752 | #For each element of a list, apply function then combine results into a data frame | ||
753 | ggplot(heightweight, aes(x=ageYear, y=heightIn, color=sex)) + | ||
754 | geom_point() + | ||
755 | geom_line(data=pred) | ||
756 | #extend the range to the same range across all groups | ||
757 | pred <- ldply(models, .fun=predictvals, xvar="ageYear", yvar="heightIn", | ||
758 | xrange=range(heightweight$ageYear)) | ||
759 | ggplot(heightweight, aes(x=ageYear, y=heightIn, color=sex)) + | ||
760 | geom_point() + | ||
761 | geom_line(data=pred) | ||
762 | |||
763 | #adding annotations with model coefficients | ||
764 | library(gcookbook) | ||
765 | model <- lm(heightIn~ ageYear, heightweight) | ||
766 | summary(model) | ||
767 | #add r square value | ||
768 | pred <- predictvals(model, "ageYear", "heightIn") | ||
769 | sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn)) + | ||
770 | geom_point() + | ||
771 | geom_line(data=pred) | ||
772 | sp + annotate("text", label="r^2=0.42", x=16.5, y=52) | ||
773 | #using r's math expression syntax | ||
774 | sp + annotate("text", label="r^2=0.42",parse=TRUE, x=16.5, y=52) | ||
775 | |||
776 | #create a string that when parsed, returns a valid expression | ||
777 | eqn <- as.character(as.expression( | ||
778 | substitute(italic(y) == a + b * italic(x) * "," ~~ italic(r)^2 ~ "=" ~ r2, | ||
779 | list(a = format(coef(model)[1], digits=3), | ||
780 | b = format(coef(model)[2], digits=3), | ||
781 | r2 = format(summary(model)$r.squared, digits=2) | ||
782 | )))) | ||
783 | sp + annotate("text", label=eqn, parse=TRUE,x=Inf,y=-Inf,hjust=1.1,vjust=-.5) | ||
784 | |||
785 | |||
786 | #adding marginal rugs to a scatter plot | ||
787 | ggplot(faithful, aes(x=eruptions, y=waiting)) + geom_point() + geom_rug() | ||
788 | #specify the size | ||
789 | ggplot(faithful, aes(x=eruptions, y = waiting)) + | ||
790 | geom_point() + | ||
791 | geom_rug(position = "jitter",size=.2) | ||
792 | |||
793 | #label points in a scatter plot | ||
794 | library(gcookbook) | ||
795 | s <- subset(countries, Year==2009 & healthexp > 2000) | ||
796 | sp <- ggplot(s, aes(x=healthexp, y=infmortality)) + geom_point() | ||
797 | sp + annotate("text",x=4350,y=5.4,label="Cannada") + | ||
798 | annotate("text",x=7400,y=6.8,label="USA") | ||
799 | #automatically add the labels | ||
800 | sp + geom_text(aes(label=Name), size=4) | ||
801 | sp + geom_text(aes(label=Name), size=4,vjust=0) | ||
802 | sp + geom_text(aes(y=infmortality + 0.1, label = Name),size=4,vjust=0) | ||
803 | sp + geom_text(aes(label = Name), size=4,hjust=0) | ||
804 | sp + geom_text(aes(x=healthexp+100, label=Name), size=4, hjust=0) | ||
805 | |||
806 | #label some of the points automatically | ||
807 | cdat <- subset(countries, Year == 2009 & healthexp > 2009) | ||
808 | cdat$Name1 <- cdat$Name | ||
809 | idx <- cdat$Name1 %in% c("Canada", "Ireland", "United Kingdom", "United States", | ||
810 | "New Zealand", "Iceland", "Japan", "Luxembourg", | ||
811 | "Netherlands", "Switzerland") | ||
812 | cdat$Name1[!idx] <- NA | ||
813 | ggplot(cdat, aes(x=healthexp, y=infmortality)) + | ||
814 | geom_point() + | ||
815 | geom_text(aes(x=healthexp+100, label=Name1),size=4,hjust=0) + | ||
816 | xlim(2000,10000) | ||
817 | |||
818 | #creating a balloon plot | ||
819 | library(gcookbook) # For the data set | ||
820 | cdat <- subset(countries, Year==2009 & | ||
821 | Name %in% c("Canada", "Ireland", "United Kingdom", "United States", | ||
822 | "New Zealand", "Iceland", "Japan", "Luxembourg", | ||
823 | "Netherlands", "Switzerland")) | ||
824 | p <- ggplot(cdat, aes(x=healthexp, y=infmortality,size=GDP)) + | ||
825 | geom_point(shape=21,color="black",fill="cornsilk") | ||
826 | #GDP mapped to redius | ||
827 | p#GDP mapped to area instead, and larger circles | ||
828 | p + scale_size_area(max_size=15) | ||
829 | |||
830 | #add up counts for male and female | ||
831 | hec <- HairEyeColor[,,"Male"] + HairEyeColor[,,"Female"] | ||
832 | |||
833 | #convert to long format | ||
834 | library(reshape2) | ||
835 | hec <- melt(hec, value.name="count") | ||
836 | ggplot(hec, aes(x=Eye,y=Hair)) + | ||
837 | geom_point(aes(size=count),shape=21,color="black",fill="cornsilk") + | ||
838 | scale_size_area(max_size = 20,guid=F) + | ||
839 | geom_text(aes(y=as.numeric(Hair)-sqrt(count)/22,label=count), | ||
840 | color="grey60",size=4) | ||
841 | |||
842 | #making a scatter plot matrix | ||
843 | library(gcookbook) | ||
844 | c2009 <- subset(countries, Year==2009, | ||
845 | select=c(Name, GDP, laborrate, healthexp, infmortality)) | ||
846 | pairs(c2009[,2:5]) | ||
847 | |||
848 | panel.cor <- function(x, y, digits=2, prefix="", cex.cor, ...) { | ||
849 | usr <- par("usr") | ||
850 | on.exit(par(usr)) | ||
851 | par(usr = c(0, 1, 0, 1)) | ||
852 | r <- abs(cor(x, y, use="complete.obs")) | ||
853 | txt <- format(c(r, 0.123456789), digits=digits)[1] | ||
854 | txt <- paste(prefix, txt, sep="") | ||
855 | if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt) | ||
856 | text(0.5, 0.5, txt, cex = cex.cor * (1 + r) / 2) | ||
857 | } | ||
858 | |||
859 | panel.hist <- function(x, ...) { | ||
860 | usr <- par("usr") | ||
861 | on.exit(par(usr)) | ||
862 | par(usr = c(usr[1:2], 0, 1.5) ) | ||
863 | h <- hist(x, plot = FALSE) | ||
864 | breaks <- h$breaks | ||
865 | nB <- length(breaks) | ||
866 | y <- h$counts | ||
867 | y <- y/max(y) | ||
868 | rect(breaks[-nB], 0, breaks[-1], y, col="white", ...) | ||
869 | } | ||
870 | |||
871 | pairs(c2009[,2:5], upper.panel = panel.cor, | ||
872 | diag.panel = panel.hist, | ||
873 | lower.panel = panel.smooth) | ||
874 | |||
875 | panel.lm <- function (x, y, col = par("col"), bg = NA, pch = par("pch"), | ||
876 | cex = 1, col.smooth = "black", ...) { | ||
877 | points(x, y, pch = pch, col = col, bg = bg, cex = cex) | ||
878 | abline(stats::lm(y ~ x), col = col.smooth, ...) | ||
879 | } | ||
880 | |||
881 | pairs(c2009[,2:5], pch=".", | ||
882 | upper.panel = panel.cor, | ||
883 | diag.panel = panel.hist, | ||
884 | lower.panel = panel.lm) | ||
885 | |||
886 | |||
887 | #########summarized data distributions | ||
888 | |||
889 | #making a basic histogram | ||
890 | ggplot(faithful, aes(x=waiting)) + geom_histogram() | ||
891 | |||
892 | # Store the values in a simple vector | ||
893 | w <- faithful$waiting | ||
894 | ggplot(NULL, aes(x=w)) + geom_histogram() | ||
895 | |||
896 | #set the width of each bin to 5 | ||
897 | ggplot(faithful, aes(x=waiting)) + | ||
898 | geom_histogram(binwidth = 5, fill ="white",color="black") | ||
899 | |||
900 | #divide the x range into 15 bins | ||
901 | binsize <- diff(range(faithful$waiting))/15 | ||
902 | ggplot(faithful,aes(x=waiting)) + | ||
903 | geom_histogram(binwidth = binsize,fill="white",color="black") | ||
904 | |||
905 | #change the origin | ||
906 | h <- ggplot(faithful, aes(x=waiting)) # Save the base object for reuse | ||
907 | h + geom_histogram(binwidth=8, fill="white", colour="black", origin=31) | ||
908 | h + geom_histogram(binwidth=8, fill="white", colour="black", origin=35) | ||
909 | |||
910 | #the same with bar | ||
911 | h + geom_bar(stat="bin") | ||
912 | |||
913 | #making multiple histograms from grouped data | ||
914 | library(MASS) | ||
915 | #use smoke as the faceting variable | ||
916 | ggplot(birthwt, aes(x=bwt)) + | ||
917 | geom_histogram(fill="white", color="black") + | ||
918 | facet_grid(smoke~.) | ||
919 | #change the label | ||
920 | bir <- birthwt | ||
921 | #convert smoke to a factor | ||
922 | bir$smoke <- factor(bir$smoke) | ||
923 | library(plyr) | ||
924 | bir$smoke <- revalue(bir$smoke, c("0"="No Smoke","1"="Smoke")) | ||
925 | #same result | ||
926 | bir$smoke <- factor(bir$smoke,labels=c("No Smoke","Smoke")) | ||
927 | |||
928 | ggplot(bir,aes(x=bwt)) + | ||
929 | geom_histogram(fill="white",color="black") + | ||
930 | facet_grid(smoke~.) | ||
931 | |||
932 | |||
933 | #when size are different | ||
934 | ggplot(bir, aes(x=bwt)) + | ||
935 | geom_histogram(fill="white", color="black") + | ||
936 | facet_grid(race~.) | ||
937 | #resize independently | ||
938 | ggplot(bir, aes(x=bwt)) + | ||
939 | geom_histogram(fill="white",color="black") + | ||
940 | facet_grid(race~.,scales = "free") #scales is set "free" | ||
941 | #another approach is to map the grouping variable to fill | ||
942 | ggplot(bir, aes(x=bwt,fill=smoke)) + | ||
943 | geom_histogram(position="identity",alpha=0.4) | ||
944 | #without position="identity", ggplot will stack the histogram bars on top of each other | ||
945 | |||
946 | #making a density curve | ||
947 | ggplot(faithful, aes(x=waiting)) + geom_density() | ||
948 | |||
949 | #remove the side and bottom | ||
950 | ggplot(faithful, aes(x=waiting)) + | ||
951 | geom_line(stat="density") + | ||
952 | expand_limits(y=0) | ||
953 | |||
954 | #the larger the bandwidth, the more smoothing there is | ||
955 | ggplot(faithful,aes(x=waiting)) + | ||
956 | geom_line(stat="density",adjust=.25,color="red") + | ||
957 | geom_line(stat="density") + | ||
958 | geom_line(stat="density",adjust=2,color="blue") | ||
959 | |||
960 | #set the area color | ||
961 | ggplot(faithful,aes(x=waiting)) + | ||
962 | geom_density(fill="blue",alpha=.2) + | ||
963 | xlim(35,105) | ||
964 | |||
965 | ggplot(faithful,aes(x=waiting)) + | ||
966 | geom_density(fill="blue",alpha=.2,color=NA) + | ||
967 | geom_line(stat="density") + | ||
968 | xlim(35,105) | ||
969 | |||
970 | #compare the theoretical and obsearved distributions | ||
971 | ggplot(faithful, aes(x=waiting, y=..density..)) + | ||
972 | geom_histogram(fill="cornsilk", colour="grey60", size=.2) + | ||
973 | geom_density() + | ||
974 | xlim(35, 105) | ||
975 | |||
976 | #making multiple density curves from grouped data | ||
977 | library(MASS) | ||
978 | bir <- birthwt | ||
979 | bir$smoke <- factor(bir$smoke) | ||
980 | ggplot(bir,aes(x=bwt,color=smoke)) + geom_density() | ||
981 | #mapping smoke to fill | ||
982 | ggplot(bir,aes(x=bwt,fill=smoke)) + geom_density(alpha=.3) | ||
983 | |||
984 | ggplot(bir, aes(x=bwt)) + geom_density() + facet_grid(smoke~.) | ||
985 | |||
986 | library(plyr) # For the revalue function | ||
987 | birthwt1 <- birthwt | ||
988 | birthwt1$smoke <- factor(birthwt1$smoke) | ||
989 | birthwt1$smoke <- revalue(birthwt1$smoke, c("0"="No Smoke", "1"="Smoke")) | ||
990 | ggplot(birthwt1, aes(x=bwt)) + geom_density() + facet_grid(smoke ~ .) | ||
991 | |||
992 | ggplot(birthwt1, aes(x=bwt, y=..density..)) + | ||
993 | geom_histogram(binwidth=200, fill="cornsilk", colour="grey60", size=.2) + | ||
994 | geom_density() + | ||
995 | facet_grid(smoke ~ .) | ||
996 | |||
997 | #making a frequency polygon | ||
998 | ggplot(faithful, aes(x=waiting)) + geom_freqpoly() | ||
999 | ggplot(faithful, aes(x=waiting)) + geom_freqpoly(binwidth=4) | ||
1000 | |||
1001 | #use 15 bins | ||
1002 | binsize <- diff(range(faithful$waiting))/15 | ||
1003 | ggplot(faithful, aes(x=waiting)) + geom_freqpoly(binwidth=binsize) | ||
1004 | |||
1005 | #making a basic box plot | ||
1006 | library(MASS) | ||
1007 | ggplot(birthwt, aes(x=factor(race), y=bwt)) + geom_boxplot() | ||
1008 | # Use factor() to convert numeric variable to discrete | ||
1009 | |||
1010 | #change width | ||
1011 | ggplot(birthwt,aes(x=factor(race),y=bwt)) + geom_boxplot(width=.5) | ||
1012 | #default size is 2, default shape is 16 | ||
1013 | ggplot(birthwt,aes(x=factor(race),y=bwt)) + | ||
1014 | geom_boxplot(outlier.size = 1.5,outlier.shape = 21) | ||
1015 | |||
1016 | ggplot(birthwt,aes(x=1,y=bwt)) + geom_boxplot() + | ||
1017 | scale_x_continuous(breaks = NULL) + | ||
1018 | theme(axis.title.x=element_blank()) | ||
1019 | |||
1020 | #adding natches to a box plot | ||
1021 | library(MASS) | ||
1022 | ggplot(birthwt,aes(x=factor(race),y=bwt)) + | ||
1023 | geom_boxplot(notch = TRUE) | ||
1024 | |||
1025 | #adding means to a box plot | ||
1026 | library(MASS) | ||
1027 | ggplot(birthwt, aes(factor(race),y=bwt)) + | ||
1028 | geom_boxplot() + | ||
1029 | stat_summary(fun.y = "mean",geom = "point",shape=23,size=3,fill="white") | ||
1030 | |||
1031 | #making a violin plot | ||
1032 | library(gcookbook) | ||
1033 | p <- ggplot(heightweight,aes(x=sex,y=heightIn)) | ||
1034 | p + geom_violin() | ||
1035 | #with narrows boxplot | ||
1036 | p + geom_violin() + | ||
1037 | geom_boxplot(width=.1,fill="black",outlier.colour = NA) + | ||
1038 | stat_summary(fun.y = median,geom="point",fill="white",shape=21,size=2.5) | ||
1039 | #keep the tails | ||
1040 | p + geom_violin(trim=FALSE) | ||
1041 | |||
1042 | #making a dot plot | ||
1043 | library(gcookbook) | ||
1044 | c2009 <- subset(countries, Year==2009 & healthexp > 2000) | ||
1045 | p <- ggplot(c2009,aes(x=infmortality)) | ||
1046 | p + geom_dotplot() | ||
1047 | p + geom_dotplot(binwidth=.25) + geom_rug() + | ||
1048 | scale_y_continuous(breaks=NULL) + | ||
1049 | theme(axis.title.y=element_blank()) | ||
1050 | |||
1051 | #making multiple dot plots for grouped data | ||
1052 | ggplot(heightweight, aes(x=sex, y=heightIn)) + | ||
1053 | geom_dotplot(binaxis="y", binwidth=.5, stackdir="center") | ||
1054 | |||
1055 | |||
1056 | #making a desity plot of two dimensional data | ||
1057 | p <- ggplot(faithful,aes(x=eruptions,y=waiting)) | ||
1058 | p + geom_point() + stat_density2d() | ||
1059 | #map the height of the density to color | ||
1060 | p + stat_density2d(aes(color=..level..)) | ||
1061 | #map denstity estimate to fill color | ||
1062 | p + stat_density2d(aes(fill=..density..),geom="raster",contour = F) | ||
1063 | #with points and map desity estimate to alpha | ||
1064 | p + geom_point() + | ||
1065 | stat_density2d(aes(alpha=..density..),geom="tile",contour=F) | ||
1066 | #tile and raster look like the same | ||
1067 | |||
1068 | #marginal rug added to a scatter plot | ||
1069 | p + stat_density2d(aes(fill=..density..),geom="raster", | ||
1070 | contour=F,h=c(.5,5)) | ||
1071 | |||
1072 | |||
1073 | |||
1074 | #############annotations | ||
1075 | |||
1076 | #adding text annotations | ||
1077 | p <- ggplot(faithful,aes(x=eruptions,y=waiting)) + geom_point() | ||
1078 | p + annotate(geom="text",x=3,y=48,label = "Group 1") + | ||
1079 | annotate("text",x=4.5,y=66,label = "Group 2") | ||
1080 | p + annotate("text",x=3,y=48,label="Group 1",family="serif", | ||
1081 | fontface="italic",color="darkred",size=3) + | ||
1082 | annotate("text",x=4.5,y=66,label="Group 2",family="serif", | ||
1083 | fontface="italic",color="darkred",size=3) | ||
1084 | annotate("text",x=3,y=48,label="Group 1",family="serif", | ||
1085 | fontface="italic",color="darkred",size=3) | ||
1086 | |||
1087 | p + annotate("text", x=3, y=48, label="Group 1", alpha=.1) + # Normal | ||
1088 | geom_text(x=4.5, y=66, label="Group 2", alpha=.1) # Overplotted | ||
1089 | |||
1090 | #for continuous axes, use Inf and -Inf to place text annotations at the | ||
1091 | #edge of the plotting area | ||
1092 | p + annotate("text",x=-Inf,y=Inf,label="Upper left",hjust=-.2,vjust=2) + | ||
1093 | annotate("text",x=mean(range(faithful$eruptions)),y=-Inf,vjust=-.4,label="Bottom middle") | ||
1094 | #for hjust and vjust position is minus the value | ||
1095 | |||
1096 | #using mathematical expression in annotations | ||
1097 | #a normal curve | ||
1098 | p <- ggplot(data.frame(x=c(-3,3)), aes(x=x)) + stat_function(fun=dnorm) | ||
1099 | p + annotate("text",x=2,y=0.3,parse=TRUE, | ||
1100 | label="frac(1,sqrt(2*pi))*e^{-x^2/2}") | ||
1101 | #use single quotes within double quotes (or vice versa) to mark the plain-text parts | ||
1102 | #put a * operator between them | ||
1103 | p + annotate("text",x=0,y=.05,parse=TRUE,size=4, | ||
1104 | label="'Function: '*y==frac(1,sqrt(2*pi))*e^{-x^2/2}") | ||
1105 | #?plotmath for many examples of mathematical expressions, and | ||
1106 | #?demo(plot math) for graphical examples of mathematical expressions | ||
1107 | |||
1108 | #adding lines | ||
1109 | library(gcookbook) | ||
1110 | p <- ggplot(heightweight,aes(x=ageYear,y=heightIn,color=sex)) + geom_point() | ||
1111 | #adding horizongtal and vertical lines | ||
1112 | p + geom_hline(yintercept = 60) + geom_vline(xintercept = 14) | ||
1113 | p + geom_abline(intercept = 37.4, slope = 1.75) | ||
1114 | |||
1115 | library(plyr) | ||
1116 | hw_means <- ddply(heightweight,"sex",summarise,heightIn=mean(heightIn)) | ||
1117 | p + geom_hline(aes(yintercept=heightIn,color=sex),data=hw_means, | ||
1118 | linetype="dashed",size=1) | ||
1119 | |||
1120 | #for discrete variable | ||
1121 | pg <- ggplot(PlantGrowth,aes(x=group,y=weight)) + geom_point() | ||
1122 | pg + geom_vline(xintercept = 2) | ||
1123 | pg + geom_vline(xintercept = which(levels(PlantGrowth$group) == "ctrl")) | ||
1124 | |||
1125 | #adding line segments and arrows | ||
1126 | library(gcookbook) | ||
1127 | p <- ggplot(subset(climate, Source=="Berkeley"), aes(x=Year, y=Anomaly10y)) + | ||
1128 | geom_line() | ||
1129 | p + annotate("segment", x=1950, xend=1980, y=-.25, yend=-.25) | ||
1130 | |||
1131 | |||
1132 | |||
1133 | |||
1134 | |||
1135 | |||
1136 | |||
1137 | |||
1138 | |||
1139 | |||
1140 | |||
1141 | |||
1142 | |||
1143 | |||
1144 | |||
1145 | |||
1146 | |||
1147 | |||
1148 | |||
1149 | |||
1150 | |||
1151 | |||
1152 |