Jingan Qu / r_code

Browse Code »

Commit 978c0f7912211370ce279a8973091245c8a149a7

Authored by Jingan Qu 2016-11-08 16:30:34 -0500

1 parent 658ed0df54

Exists in master

upload two R code file

Showing 3 changed files with 1555 additions and 791 deletions Show diff stats

GPL22111.csv

View file @ 978c0f7

data_cleaning.R

Diff comments View file @ 978c0f7

File was created	1
	2
	3	#download the file to load
	4	if (!file.exists("data")){dir.create("data")}
	5	fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
	6	download.file(fileUrl, destfile = "./data/cameras.csv",method = "curl")
	7	dateDownloaded <- date()
	8	if (!file.exists("data")){dir.create("data")}
	9	fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
	10	download.file(fileUrl, destfile = "./data/cameras.csv",method = "curl")
	11	dateDownloaded <- date()
	12
	13	dataset <- read.csv("./data/cameras.csv")
	14	#equal to
	15	dataset <- read.table("./data/cameras.csv",sep=",",header = TRUE)
	16
	17
	18	if (!file.exists("data")){dir.create("data")}
	19	fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.xlsx?accessType=DOWNLOAD"
	20	download.file(fileUrl, destfile = "./data/cameras.xlsx",method = "curl")
	21	dateDownloaded <- date()
	22
	23	#read xlsx file
	24	library(gdata)
	25	dataset <- read.csv("./data/cameras.xlsx")
	26
	27	#data table
	28	library(data.table)
	29	DF <- data.frame(x=rnorm(9),y=rep(c("a","b","c"),each=3),z=rnorm(9))
	30	DT <- data.table(x=rnorm(9),y=rep(c("a","b","c"),each=3),z=rnorm(9))
	31	#subsetting rows
	32	DT[2,]
	33	DT[DT$y == 'a']
	34	DT[c(2,3)]
	35	#subsetting columns
	36	DT[,list(x)]
	37	#calculating values for variables
	38	DT[,list(mean(x),sum(z))]
	39	DT[,table(y)]
	40	#adding new columns
	41	DT[,w:=z^2]
	42	DT2 <- DT
	43	DT[,y:=2] #DT, DT2 the same
	44	#multiple operations
	45	DT[,m:={tmp<-(x+z);log2(tmp+5)}]
	46	DT[,a:=x>0]
	47	DT[,b:=mean(x+w),by=a]
	48	#special variables
	49	set.seed(123)
	50	DT <- data.table(x=sample(letters[1:3],1E5,TRUE))
	51	#keys
	52	DT <- data.table(x=rep(c("a","b","c"),each=100),y=rnorm(300))
	53	setkey(DT,x)
	54	DT['a']
	55	#joins
	56	DT1 <- data.table(x=c('a','a','b','dt1'),y=1:4)
	57	DT2 <- data.table(x=c('a','b','dt2'),z=5:7)
	58	setkey(DT1,x); setkey(DT2,x)
	59	merge(DT1, DT2)
	60
	61
	62	#read XML into R
	63	library(XML)
	64	fileUrl <- "http://www.w3schools.com/xml/simple.xml"
	65	doc <- xmlTreeParse(fileUrl, useInternal = TRUE)
	66	rootNode <- xmlRoot(doc)
	67	xmlName(rootNode)
	68	names(rootNode)
	69	#directly access parts of XML doc
	70	rootNode[[1]]
	71	rootNode[[1]][[1]]
	72	#programmatically extract parts of the file
	73	xmlSApply(rootNode,xmlValue)
	74	xpathSApply(rootNode, "//name",xmlValue)
	75	xpathSApply(rootNode, "//price",xmlValue)
	76	#another example
	77	fileUrl <- "http://espn.com/nfl/team/_/name/baltimore-ravens"
	78	doc <- htmlTreeParse(fileUrl, useInternal = TRUE)
	79	scores <- xpathSApply(doc,"li[@class='score']",xmlValue)
	80	teams <- xpathSApply(doc,"li[@class='team-name']",xmlValue)
	81
	82	#Making a Proportional Stacked Bar Graph
	83
	84	#reading JSON
	85	#javascript object notation
	86	library(jsonlite)
	87	jsonData <- fromJSON("https://api.github.com/users/jtleek/repos")
	88	names(jsonData)
	89	names(jsonData$owner)
	90	jsonData$owner$login
	91	#writing data frame to JSON
	92	myjson <- toJSON(iris, pretty = TRUE)
	93	cat(myjson)
	94	#convert back to JSON
	95	iris2 <- fromJSON(myjson)
	96
	97
	98	#reading mySQL
	99	library(RMySQL)
	100	ucscDb <- dbConnect(MySQL(),user="genome",
	101	host="genome-mysql.cse.ucsc.edu")
	102	result <- dbGetQuery(ucscDb,"show databases;");dbDisconnect(ucscDb)
	103
	104	hg19 <- dbConnect(MySQL(),user="genome",db='hg19',
	105	host="genome-mysql.cse.ucsc.edu")
	106	allTables <- dbListTables(hg19)
	107	length(allTables)
	108	#get dimensions of a specific table
	109	dbListFields(hg19,"affyU133Plus2")
	110	dbGetQuery(hg19,"select count(*) from affyU133Plus2")
	111	result <- dbGetQuery(hg19,"select * from affyU133Plus2")
	112	#read from the table
	113	affyData <- dbReadTable(hg19,"affyU133Plus2")
	114	#query
	115	affyMis <- dbGetQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3")
	116	#equal to
	117	query <- dbSendQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3")
	118	affyMis <- fetch(query)
	119	quantile(affyMis$misMatches)
	120	dbClearResult(query)
	121	#disconnect mySQL
	122	dbDisconnect(hg19)
	123
	124
	125	#reading HDF5
	126	source("http://bioconductor.org/biocLite.R")
	127	biocLite("rhdf5")
	128	library(rhdf5)
	129	created <- h5createFile("example.h5")
	130	created <- h5createGroup("example.h5","foo")
	131	created <- h5createGroup("example.h5","baa")
	132	created <- h5createGroup("example.h5","foo/foobaa")
	133	h5ls("example.h5")
	134	#write to groups
	135	A <- matrix(1:10, nr=5, nc =2)
	136	h5write(A, "example.h5","foo/A")
	137	B <- array(seq(0.1,2.0,by=0.1),dim=c(5,2,2))
	138	attr(B,"scale") <- "liter"
	139	h5write(B,"example.h5","foo/foobaa/B")
	140	h5ls("example.h5")
	141	#write a data set
	142	df <- data.frame(1L:5L, seq(0,1,length.out = 5), c('ab','cde','fghi','a','s'),
	143	stringsAsFactors = F)
	144	h5write(df,"example.h5","df")
	145	h5ls("example.h5")
	146	#reading data
	147	readA <- h5read("example.h5","foo/A")
	148	readB <- h5read("example.h5","foo/foobaa/B")
	149	readdf <- h5read("example.h5",'df')
	150	readA
	151
	152
	153	#reading data from APIs
	154
	155	#subsetting
	156	set.seed(13435)
	157	X <- data.frame("var1"=sample(1:5),"var2"=sample(6:10),"var3"=sample(11:15))
	158	X <- X[sample(1:5),]
	159	X$var2[c(1,3)] <- NA
	160	#dealing with missing values
	161	X[which(X$var2>8),] #which can ignore the NA values
	162	#sorting川普
	163	sort(X$var1)
	164	sort(X$var1,decreasing = TRUE)
	165	sort(X$var2,na.last = TRUE)
	166	#ordering
	167	X[order(X$var1),]
	168	#ording by plyr
	169	library(plyr)
	170	arrange(X,var1)
	171	arrange(X,desc(var1))
	172
	173
	174	#creating new variables
	175	#getting the data from the web
	176	if (!file.exists("data")){dir.create("data")}
	177	fileUrl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
	178	download.file(fileUrl, destfile = "./data/restaurants.csv",method = "curl")
	179	dateDownloaded <- date()
	180	restData <- read.csv("./data/restaurants.csv")
	181
	182	#subsetting variables
	183	restData$nearMe <- restData$neighborhood %in% c("Roland Park","Homeland")
	184	table(restData$nearMe)
	185	#creating binary variables
	186	restData$zipWrong <- ifelse(restData$zipCode < 0, TRUE, FALSE)
	187	with(restData,table(zipWrong,zipCode < 0))
	188	#creating categorical variable
	189	restData$zipGroup <- cut(restData$zipCode, breaks = quantile(restData$zipCode))
	190	table(restData$zipGroup)
	191	#easier cutting
	192	library(Hmisc)
	193	restData$zipGroup <- cut2(restData$zipCode,g=4)
	194	table(restData$zipGroup)
	195	#creating factor variables
	196	restData$zcf <- factor(restData$zipCode)
	197	#levels of factor
	198	yesno <- sample(c("yes","no"),size = 10,replace = TRUE)
	199	yesnofac <- factor(yesno,levels = c("yes","no"))
	200	relevel(yesnofac,ref = "no")
	201	#cutting produces factor variables
	202	library(Hmisc)
	203	restData$zipGroup <- cut2(restData$zipCode,g=4)
	204	#using mutate function
	205	library(Hmisc)
	206	library(plyr)
	207	restData2 <- mutate(restData,zipGroup=cut2(zipCode,g=4))
	208	table(restData2$zipGroup)
	209
	210	#smmarizing data
	211	colSums(is.na(restData))
	212	all(colSums(is.na(restData)) == 0)
	213	#values with specific character
	214	table(restData$zipCode %in% c("21212","21213"))
	215	#cross table
	216	data("UCBAdmissions")
	217	df1 <- as.data.frame(UCBAdmissions)
	218	summary(df1)
	219	xt <- xtabs(Freq ~Gender +Admit,data=df1) #sum
	220	#flat tables
	221	warpbreaks$replicate <- rep(1:9, len=54)
	222	xt <- xtabs(breaks~.,data=warpbreaks)
	223	ftable(xt)
	224	#size of a data set
	225	fakeData <- rnorm(1e5)
	226	object.size(fakeData)
	227	print(object.size(fakeData),units = "Mb")
	228
	229
	230	#dplyr package
	231	library(dplyr)
	232	#select
	233	chicago <- readRDS("./data/chicago.rds")
	234	head(select(chicago, 1:5)) #select col
	235	head(select(chicago, city:dptp))
	236	head(select(chicago,-(city:dptp))) #not select
	237	#equivalent
	238	i <- match("city", names(chicago))
	239	j <- match("dptp", names(chicago))
	240	head(chicago[,-(i:j)])
	241
	242	#filter
	243	chic <- filter(chicago, pm25tmean2 > 30)
	244	head(select(chic,1:3,pm25tmean2))
	245	chic <- filter(chicago, pm25tmean2 > 30 & tmpd > 80)
	246	head(select(chic,1:3,pm25tmean2,tmpd))
	247
	248	#arrange (reorder rows while preserving corresponding order of other col)
	249	chicago <- arrange(chicago, date)
	250	head(select(chicago,date,pm25tmean2))
	251	#descending order
	252	chicago <- arrange(chicago, desc(date))
	253	head(select(chicago,date,pm25tmean2))
	254
	255
	256	#rename
	257	chicago <- rename(chicago, dewpoint = dptp, pm25 = pm25tmean2)
	258	head(select(chicago,1:5))
	259
	260	#mutate
	261	chicago <- mutate(chicago,
	262	pm25detrend = pm25 - mean(pm25,na.rm = TRUE))
	263	head(select(chicago,pm25,pm25detrend))
	264
	265	#group_by (generate summary statistics by stratum)
	266	chicago <- mutate(chicago,
	267	tempcat=factor(1*(tmpd>80), labels = c("cold","hot")))
	268	#1* just transfor to numeric (0,1)
	269	hotcold <- group_by(chicago, tempcat)
	270	summarize(hotcold, pm25 = mean(pm25, na.rm = TRUE),
	271	o3 = max(o3tmean2, na.rm = TRUE),
	272	no2 = median(no2tmean2, na.rm = TRUE))
	273
	274	chicago <- mutate(chicago,
	275	year = as.POSIXlt(date)$year + 1900)
	276	years <- group_by(chicago, year)
	277	summarize(years, pm25 = mean(pm25, na.rm = T),
	278	o3 = max(o3tmean2, na.rm = T),
	279	no2 = median(no2tmean2, na.rm = T))
	280
	281	chicago %>% mutate(month = as.POSIXlt(date)$mon +1) %>% group_by(month) %>% summarize(pm25 = mean(pm25, na.rm = T),
	282	o3 = max(o3tmean2, na.rm = T),
	283	no2 = median(no2tmean2, na.rm = T))
	284
	285
	286	if (!exists("./data")) {dir.create("./data")}
	287	fileUrl1 <- "https://dl.dropboxusercontent.com/u/7710864/data/reviews-apr29.csv"
	288	fileUrl2 <- "https://dl.dropboxusercontent.com/u/7710864/data/solutions-apr29.csv"
	289
	290	download.file(fileUrl1,destfile = "./data/reviews.csv",method = "curl")
	291	download.file(fileUrl2,destfile = "./data/solutions.csv",method = "curl")
	292
	293	reviews <- read.csv("./data/reviews.csv")
	294	solutions <- read.csv("./data/solutions.csv")
	295
	296	#merge data
	297	mergedData <- merge(reviews, solutions, by.x = "solution_id", by.y="id",all=TRUE)
	298
	299	#join in the plyr package
	300	library(plyr)
	301	df1 <- data.frame(id = sample(1:10), x=rnorm(10))
	302	df2 <- data.frame(id = sample(1:10), y = rnorm(10))
	303	arrange(join(df1, df2),id)
	304	#join multiple data frames
	305	df1 <- data.frame(id = sample(1:10), x=rnorm(10))
	306	df2 <- data.frame(id = sample(1:10), y = rnorm(10))
	307	df3 <- data.frame(id = sample(1:10),z=rnorm(10))
	308
	309	dfList <- list(df1,df2,df3)
	310	join_all(dfList)
	311
	312	#working with date
	313	d1 <- date()
	314	class(d1) #charactor
	315
	316	d2 <- Sys.Date()
	317	class(d2) #date
	318	#formatting dates
	319	format(d2, "%a %b %d")
	320
	321	x <- c("1jan1960","2jan1960")
	322	z <- as.Date(x, "%d%b%Y")
	323	#converting
	324	weekdays(d2)
	325
	326	#lubridate
	327	library(lubridate)
	328	ymd("20140108")
	329	ymd("2011-08-03")
	330
	331	#editing text variables
	332
	333	if (!file.exists("data")){dir.create("data")}
	334	fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
	335	download.file(fileUrl, destfile = "./data/cameras.csv",method = "curl")
	336
	337	camera <- read.csv("./data/cameras.csv")
	338	tolower(names(camera))
	339
	340	#fixing character vectors
	341	splitNames <- strsplit(names(camera), "\\.")
	342	splitNames[[6]][1]
	343	firstElement <- function(x){x[1]}
	344	sapply(splitNames,firstElement)
	345
	346	fileUrl1 <- "https://dl.dropboxusercontent.com/u/7710864/data/reviews-apr29.csv"
	347	fileUrl2 <- "https://dl.dropboxusercontent.com/u/7710864/data/solutions-apr29.csv"
	348
	349	download.file(fileUrl1,destfile = "./data/reviews.csv",method = "curl")
	350	download.file(fileUrl2,destfile = "./data/solutions.csv",method = "curl")
	351
	352	reviews <- read.csv("./data/reviews.csv")
	353	solutions <- read.csv("./data/solutions.csv")
	354	names(reviews)
	355	#replacement
	356	sub("_", "", names(reviews)) #just replace one
	357	gsub() #replace all
	358
	359
	360	#finding values
	361	grep("Alameda", camera$intersection) #return location
	362	grepl() #return T or F
	363
	364	grep("Alameda", camera$intersection, value = TRUE) #return exact element
	365
	366
	367	#stringr library
	368	library(stringr)
	369	nchar("Jeffrey Leek") #count
	370	substr("Jeffrey Leek",1,7)
	371	paste("Jeffrey","Leek") #with blank
	372	paste0("Jeffrey", "Leek") #without blank
	373	str_trim("Jeff ") #remove the blanks (beginning and ending)
	374
	375
	376	#reshaping
	377	library(reshape2)
	378	mtcars$carname <- rownames(mtcars)
	379	carMelt <- melt(mtcars, id=c("carname","gear","cyl"), measure.vars = c("mpg","hp"))
	380
	381
	382	#casting data frames
	383	cylData <- dcast(carMelt,cyl~variable) #count
	384	cylData <- dcast(carMelt,cyl~variable, mean)
	385
	386	#averaging values
	387	head(InsectSprays)
	388	tapply(InsectSprays$count,InsectSprays$spray,sum) #
	389	#split
	390	spIns <- split(InsectSprays$count, InsectSprays$spray)
	391	sapply(spIns,sum)
	392	#or
	393	unlist(lapply(spIns,sum))
	394
	395	#plyr package
	396	library(plyr)
	397	ddply(InsectSprays, "spray",summarize,sum=sum(count)) #return dataframe
	398
	399
	400
	401
	402
	403
	404
	405

ggplot2.R

Diff comments View file @ 978c0f7

File was created	1
	2	set.seed(1410)
	3
	4	dsmall <- diamonds[sample(nrow(diamonds), 100),]
	5
	6
	7	#scatterplot
	8	qplot(carat, price, data = diamonds)
	9	qplot(log(carat), log(price), data = diamonds)
	10
	11	qplot(carat, xyz, data = diamonds)
	12	qplot(carat, price, data = dsmall, colour = color)
	13	qplot(carat, price, data = dsmall, shape = cut)
	14
	15	qplot(carat, price, data = diamonds, alpha = I(1/10))
	16	qplot(carat, price, data = diamonds, alpha = I(1/100))
	17	qplot(carat, price, data = diamonds, alpha = I(1/200))
	18
	19
	20	#geom
	21	qplot(carat, price, data = dsmall, geom = c("point", "smooth"))
	22	qplot(carat, price, data = diamonds, geom = c("point", "smooth"))
	23
	24
	25	qplot(color, price/carat, data=diamonds,geom = "jitter")
	26	qplot(color, price/carat, data=diamonds,geom = "boxplot")
	27	qplot(color, price / carat, data = diamonds, geom = "jitter",
	28	alpha = I(1 / 50))
	29
	30	#offer size, colour, shape for scatterplot
	31	#offer size, colour, fill for boxplot
	32	qplot(color, price/carat, data=diamonds,geom = "boxplot",colour="red")
	33
	34
	35	qplot(carat, data = diamonds, geom = "histogram")
	36	qplot(carat, data = diamonds, geom = "density")
	37
	38
	39	qplot(carat, data = diamonds, geom = "histogram", binwidth = 1,
	40	xlim = c(0,3))
	41	qplot(carat, data = diamonds, geom = "histogram", binwidth = 0.1,
	42	xlim = c(0,3))
	43	qplot(carat, data = diamonds, geom = "histogram", binwidth = 0.01,
	44	xlim = c(0,3))
	45
	46	qplot(carat, data = diamonds, geom = "density", colour = color)
	47	qplot(carat, data = diamonds, geom = "histogram", fill = color, bins = 50)
	48
	49	qplot(color, data = diamonds, geom = "bar")
	50	qplot(color, data = diamonds, geom = "bar", weight = carat) +
	51	scale_y_continuous("carat") #total weight
	52
	53
	54	qplot(date, unemploy / pop, data = economics, geom = "line")
	55	qplot(date, uempmed, data = economics, geom = "line")
	56
	57	#path
	58	year <- function(x) as.POSIXlt(x)$year + 1900
	59	qplot(unemploy / pop, uempmed, data = economics,
	60	geom = c("point", "path"))
	61	qplot(unemploy / pop, uempmed, data = economics,
	62	geom = "path", colour = year(date)) + scale_area()
	63
	64	#faceting
	65	# row var ∼ col var
	66	qplot(carat, data = diamonds, facets = color ~ .,
	67	geom = "histogram", binwidth = 0.1, xlim = c(0, 3))
	68	qplot(carat, ..density.., data = diamonds, facets = color ~ .,
	69	geom = "histogram", binwidth = 0.1, xlim = c(0, 3))
	70	#Using ..density.. tells ggplot2 to map the density to the y-axis
	71	#instead of the default use of count
	72	#density plot makes it easier to compare distributions ignoring
	73	#the relative abundance of diamonds within each colour
	74
	75
	76	#see ?plotmath for more examples of using mathematical formulae
	77
	78	qplot(
	79	carat, price, data = dsmall,
	80	xlab = "Price ($)", ylab = "Weight (carats)",
	81	main = "Price-weight relationship" )
	82
	83
	84	qplot(
	85	carat, price/carat, data = dsmall,
	86	xlab = "Weight(carats)", ylab = "expression(frac(price,carat))",
	87	main = "Small diamonds",
	88	xlim = c(.2, 1))
	89
	90	qplot(carat, price, data = dsmall, log = "xy")
	91
	92	qplot(displ, hwy, data = mpg, colour = factor(cyl)) #factor
	93
	94	#ggplot
	95	p <- ggplot(diamonds, aes(x = carat))
	96	p <- p + layer(
	97	geom = "bar",
	98	geom_params = list(fill = "steelblue"),
	99	stat = "bin",
	100	stat_params = list(binwidth = 2)
	101	)
	102	#layer(geom, geom_params, stat, stat_params, data, mapping, position)
	103	ggplot(msleep, aes(sleep_rem / sleep_total, awake)) +
	104	geom_point()
	105
	106	#geom_XXX(mapping, data, ..., geom, position)
	107	#stat_XXX(mapping, data, ..., stat, position)
	108	p <- ggplot(mtcars, aes(mpg, wt)) +
	109	geom_point()
	110
	111	p + geom_point(aes(colour = factor(cyl)))
	112	p + geom_point(aes(y=disp))
	113
	114	p <- ggplot(mtcars, aes(mpg, wt))
	115	p + geom_point(colour = "darkblue")
	116	p + geom_point(aes(colour = "darkblue"))
	117
	118	p + geom_point(aes(colour = factor(cyl)))
	119
	120	p <- ggplot(Oxboys, aes(age, height, group = Subject)) + geom_line()
	121
	122	p + geom_smooth(aes(group = Subject), method = "lm", se = F)
	123
	124	p + geom_smooth(aes(group = 1), method = "lm", se = F)
	125
	126	boysbox <- ggplot(Oxboys, aes(Occasion, height)) + geom_boxplot()
	127
	128	boysbox + geom_line(aes(group=Subject), color=I(8))
	129
	130	xgrid <- with(df, seq(min(x), max(x), length=50))
	131
	132	p <- ggplot(diamonds, aes(carat)) +
	133	geom_histogram(aes(y=..count..), binwidth = 0.1)
	134	p <- ggplot(diamonds, aes(carat)) +
	135	geom_histogram(aes(y=..density..), binwidth = 0.1)
	136	qplot(carat, ..density.., data=diamonds, geom = "histogram",binwidth = 0.1)
	137
	138	d <- ggplot(diamonds,aes(carat)) + xlim(0, 3)
	139	d + stat_bin(aes(ymax = ..count..), binwidth = 0.1, geom = "area")
	140	d + stat_bin(aes(size = ..density..), binwidth = 0.1, geom = "point", position = "identity")
	141	d + stat_bin(aes(y = 1, fill = ..count..), binwidth = 0.1,geom = "tile", position = "identity")
	142
	143	#graph cook book
	144	library(gcookbook)
	145	library(ggplot2)
	146	#chapte 2 quickly exploring data
	147
	148	#creating a scatter plot
	149	plot(mtcars$wt, mtcars$mpg)
	150	qplot(mtcars$wt, mtcars$mpg)
	151	qplot(wt, mpg, data=mtcars)
	152	# This is equivalent to:
	153	ggplot(mtcars, aes(x=wt, y=mpg)) + geom_point()
	154
	155	#creating line plot
	156	plot(pressure$temperature, pressure$pressure, type="l")
	157
	158	qplot(pressure$temperature, pressure$pressure, geom="line")
	159	qplot(temperature, pressure, data=pressure, geom="line")
	160	# This is equivalent to:
	161	ggplot(pressure, aes(x=temperature, y=pressure)) + geom_line()
	162	# Lines and points together
	163	qplot(temperature, pressure, data=pressure, geom=c("line", "point"))
	164	# Equivalent to:
	165	ggplot(pressure, aes(x=temperature, y=pressure)) + geom_line() + geom_point()
	166
	167
	168	#creating bar graph
	169	barplot(BOD$demand, names.arg=BOD$Time) #name
	170
	171	# Generate a table of countsqplot(supp, len, data=ToothGrowth, geom="boxplot")
	172	# This is equivalent to:
	173	ggplot(ToothGrowth, aes(x=supp, y=len)) + geom_boxplot()
	174	barplot(table(mtcars$cyl))
	175	library(ggplot2)
	176	ggplot(BOD, aes(x=factor(Time), y=demand)) + geom_bar(stat="identity")
	177	# Bar graph of counts
	178	qplot(factor(cyl), data=mtcars)
	179	# This is equivalent to:
	180	ggplot(mtcars, aes(x=factor(cyl))) + geom_bar()
	181
	182	#creating histogram
	183	# Specify approximate number of bins with breaks
	184	hist(mtcars$mpg, breaks=10)
	185	library(ggplot2)
	186	qplot(mpg, data=mtcars, binwidth=1)
	187	# This is equivalent to:
	188	ggplot(mtcars, aes(x=mpg)) + geom_histogram(binwidth=1)
	189
	190	#creating box plot
	191	# Formula syntax
	192	boxplot(len ~ supp, data = ToothGrowth)
	193	library(ggplot2)
	194	qplot(supp, len, data=ToothGrowth, geom="boxplot")
	195	# This is equivalent to:
	196	ggplot(ToothGrowth, aes(x=supp, y=len)) + geom_boxplot()
	197	ggplot(ToothGrowth, aes(x=interaction(supp, dose), y=len)) + geom_boxplot()
	198
	199	#plot a function curve
	200	curve(x^3 - 5*x, from=-4, to=4)
	201	# Plot a user-defined function
	202	myfun <- function(xvar) {
	203	1/(1 + exp(-xvar + 10))
	204	}
	205	curve(myfun(x), from=0, to=20)
	206	# Add a line:
	207	curve(1-myfun(x), add = TRUE, col = "red")
	208	library(ggplot2)
	209	# This is equivalent to:
	210	ggplot(data.frame(x=c(0, 20)), aes(x=x)) + stat_function(fun=myfun, geom="line")
	211
	212
	213	#chapter 3 Bar graphs
	214	ggplot(pg_mean, aes(x=group, y=weight)) + geom_bar(stat="identity")
	215
	216	ggplot(pg_mean, aes(x=group, y=weight)) +
	217	geom_bar(stat="identity", fill="lightblue", colour="black")
	218
	219
	220	#group by a second variable
	221	ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
	222	geom_bar(position="dodge", colour="black",stat="identity") #default stat is count
	223
	224	#stat="identity" means using sum
	225
	226	ggplot(diamonds, aes(x=cut)) + geom_bar(stat = "count") #default stat is count
	227
	228
	229	upc <- subset(uspopchange, rank(Change)>40)
	230	ggplot(upc, aes(x=Abb, y=Change, fill=Region)) + geom_bar(stat="identity")
	231
	232	ggplot(upc, aes(x=reorder(Abb, Change), y=Change, fill=Region)) +
	233	geom_bar(stat="identity", colour="black") +
	234	scale_fill_manual(values=c("#669933", "#FFCC66")) +
	235	xlab("State")
	236
	237	#using the reorder() function to reorder the levels of a factor based on
	238	#the values of another variable
	239
	240
	241	#Coloring Negative and Positive Bars Differently
	242	csub <- subset(climate, Source=="Berkeley" & Year >= 1900)
	243	csub$pos <- csub$Anomaly10y >= 0
	244	ggplot(csub, aes(x=Year, y=Anomaly10y, fill=pos)) +
	245	geom_bar(stat="identity", position="identity") #default position is stack
	246
	247	ggplot(csub, aes(x=Year, y=Anomaly10y, fill=pos)) +
	248	geom_bar(stat="identity", position="identity", colour="black", size=0.25) +
	249	scale_fill_manual(values=c("#CCEEFF", "#FFDDDD"), guide=FALSE)
	250
	251
	252	#Adjusting Bar Width and Spacing
	253	#default value is 0.9
	254	ggplot(pg_mean, aes(x=group, y=weight)) + geom_bar(stat="identity", width=0.5)
	255	#a grouped bar graph with narrow bars
	256	ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
	257	geom_bar(stat="identity", width=0.5, position="dodge")
	258	#with some space between the bars
	259	ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
	260	geom_bar(stat="identity", width=0.6, position=position_dodge(0.8)) #space = 0.8-0.6
	261	#To be more precise, the value of width in position_dodge() is the same as
	262	#width in geom_bar()
	263
	264
	265	#Making a Stacked Bar Graph
	266	ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
	267	geom_bar(stat="identity")
	268	#One problem with the default output is that the stacking order is the opposite of the
	269	#order of items in the legend
	270	#reverse the legend
	271	ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
	272	geom_bar(stat="identity") +
	273	guides(fill=guide_legend(reverse=TRUE))
	274	#reverse the stacking order
	275	libary(plyr) # Needed for desc()
	276	ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar, order=desc(Cultivar))) +
	277	geom_bar(stat="identity")
	278	#get a different color palette
	279	ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
	280	geom_bar(stat="identity", colour="black") +
	281	guides(fill=guide_legend(reverse=TRUE)) +
	282	scale_fill_brewer(palette="Pastel1")
	283
	284
	285	#Making a Proportional Stacked Bar Graph
	286	library(gcookbook)
	287	library(plyr)
	288	ce <- ddply(cabbage_exp,"Date",transform,
	289	percent_weight = Weight / sum(Weight)*100)
	290
	291	ggplot(ce, aes(x=Date, y= percent_weight, fill =Cultivar)) +
	292	geom_bar(stat="identity")
	293	#change a little bit
	294	ggplot(ce, aes(x=Date, y= percent_weight, fill =Cultivar)) +
	295	geom_bar(stat="identity", color="black") +
	296	guides(fill=guide_legend(reverse = TRUE)) +
	297	scale_fill_brewer(palette = "Pastel1")
	298
	299	#adding labels to a bar graph
	300	# Below the top
	301	ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) +
	302	geom_bar(stat="identity") +
	303	geom_text(aes(label=Weight), vjust=1.5, colour="white")
	304
	305	# Above the top
	306	ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) +
	307	geom_bar(stat="identity") +
	308	geom_text(aes(label=Weight), vjust=-0.2)
	309
	310	#adjust y limits to be a little higher
	311	ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) +
	312	geom_bar(stat="identity") +
	313	geom_text(aes(label=Weight), vjust=-0.2) +
	314	ylim(0, max(cabbage_exp$Weight) * 1.05)
	315
	316	# Map y positions slightly above bar top - y range of plot will auto-adjust
	317	ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) +
	318	geom_bar(stat="identity") +
	319	geom_text(aes(y=Weight+.1, label=Weight))
	320
	321	#dodging
	322	ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
	323	geom_bar(stat="identity", position="dodge") +
	324	geom_text(aes(label=Weight), vjust=1.5, colour="white",
	325	position=position_dodge(0.9), size=3)
	326
	327
	328	#label on stack bar
	329	library(plyr)
	330	# Sort by the day and sex columns
	331	ce <- arrange(cabbage_exp, Date, Cultivar)
	332	#get the cumulative sum
	333	ce <- ddply(ce, "Date", transform, label_y=cumsum(Weight))
	334
	335	ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) +
	336	geom_bar(stat="identity") +
	337	geom_text(aes(y=label_y, label=Weight), vjust=1.5, colour="white") +
	338	guides(fill=guide_legend(reverse=TRUE))
	339
	340	#put the label in the center
	341	ce <- arrange(cabbage_exp, Date, Cultivar)
	342	# Calculate y position, placing it in the middle
	343	ce <- ddply(ce, "Date", transform, label_y=cumsum(Weight)-0.5*Weight) #good
	344	ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) +
	345	geom_bar(stat="identity") +
	346	geom_text(aes(y=label_y, label=Weight), colour="white")
	347
	348	#polish
	349	ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) +
	350	geom_bar(stat="identity") +
	351	geom_text(aes(y=label_y, label=paste(format(Weight, nsmall=2), "kg")),
	352	size=4) +
	353	guides(fill=guide_legend(reverse=TRUE)) + scale_fill_brewer(palette="Pastel1")
	354	#coord_flip()
	355
	356	#making a Cleveland Dot Plot
	357	library(gcookbook)
	358	tophit <- tophitters2001[1:25,]
	359
	360	ggplot(tophit, aes(x=avg, y=name)) + geom_point()
	361
	362	ggplot(tophit, aes(x=avg, y=reorder(name, avg))) +
	363	geom_point(size=3) + # Use a larger dot
	364	theme_bw() +
	365	theme(panel.grid.major.x = element_blank(),
	366	panel.grid.minor.x = element_blank(),
	367	panel.grid.major.y = element_line(colour="grey60", linetype="dashed"))
	368
	369	ggplot(tophit, aes(x=reorder(name, avg), y=avg)) +
	370	geom_point(size=3) + # Use a larger dot
	371	theme_bw() +
	372	theme(axis.text.x = element_text(angle=60, hjust=1),
	373	panel.grid.major.y = element_blank(),
	374	panel.grid.minor.y = element_blank(),
	375	panel.grid.major.x = element_line(colour="grey60", linetype="dashed"))
	376
	377	#reorder onlu by one variable
	378	#order by two variables
	379	# Get the names, sorted first by lg, then by avg
	380	nameorder <- tophit$name[order(tophit$lg, tophit$avg)]
	381	# Turn name into a factor, with levels in the order of nameorder
	382	tophit$name <- factor(tophit$name, levels=nameorder)
	383
	384
	385	ggplot(tophit, aes(x=avg, y=name)) +
	386	geom_segment(aes(yend=name), xend=0, colour="grey50") +
	387	geom_point(size=3, aes(colour=lg)) +
	388	scale_colour_brewer(palette="Set1", limits=c("NL","AL")) + #limits order the posion
	389	theme_bw() +
	390	theme(panel.grid.major.y = element_blank(), # No horizontal grid lines
	391	legend.position=c(1, 0.55), # Put legend inside plot area
	392	legend.justification=c(1, 0.5))
	393
	394	ggplot(tophit, aes(x=avg, y=name)) +
	395	geom_segment(aes(yend=name), xend=0, colour="grey50") +
	396	geom_point(size=3, aes(colour=lg)) +
	397	scale_colour_brewer(palette="Set1", limits=c("NL","AL"), guide=FALSE) +
	398	theme_bw() +
	399	theme(panel.grid.major.y = element_blank()) +
	400	facet_grid(lg ~ ., scales="free_y", space="free_y")
	401
	402
	403	#Making a Basic Line Graph
	404	ggplot(BOD, aes(x=Time, y=demand)) + geom_line()
	405
	406	d1 <- BOD
	407	d1$Time <- factor(d1$Time)
	408	ggplot(d1, aes(x=Time, y=demand,group=1)) +geom_line()
	409	#add y limit
	410	ggplot(BOD, aes(x=Time, y=demand)) + geom_line() + ylim(0, max(BOD$demand))
	411	ggplot(BOD, aes(x=Time, y=demand)) + geom_line() + expand_limits(y=0) #same
	412
	413	#adding points to a line graph
	414	ggplot(BOD, aes(x=Time, y=demand)) + geom_line() + geom_point()
	415
	416	library(gcookbook)
	417	ggplot(worldpop, aes(x=Year, y=Population)) +
	418	geom_line() +
	419	geom_point()
	420
	421	#with a log y-axis
	422	ggplot(worldpop, aes(x=Year, y= Population)) +
	423	geom_line() +
	424	geom_point() +
	425	scale_y_log10()
	426
	427	#making a line graph with multiple lines
	428	#summarize the data
	429	library(plyr)
	430	tg <- ddply(ToothGrowth, c("supp","dose"),summarise, length=mean(len))
	431
	432	#map supp to colour
	433	ggplot(tg, aes(x=dose,y=length,colour=supp)) + geom_line()
	434
	435	#map supp to linetype
	436	ggplot(tg, aes(x=dose, y=length, linetype=supp)) + geom_line()
	437
	438	#factor + group (line point together)
	439	ggplot(tg, aes(x=factor(dose), y=length,colour=supp,group=supp)) + geom_line()
	440	# Make the points a little larger
	441	ggplot(tg, aes(x=dose, y=length, shape=supp)) + geom_line() + geom_point(size=4)
	442	# Also use a point with a color fill
	443	ggplot(tg, aes(x=dose, y=length, fill=supp)) + geom_line() + geom_point(size=4, shape=21)
	444	#dodge them
	445	ggplot(tg, aes(x=dose, y=length, shape=supp)) +
	446	geom_line(position=position_dodge(0.2)) + # Dodge lines by 0.2
	447	geom_point(position=position_dodge(0.2), size=4) # Dodge points by 0.2
	448
	449	#change the apperance of lines
	450	ggplot(BOD, aes(x=Time, y=demand)) +
	451	geom_line(linetype = "dashed", size = 1, color = "blue")
	452
	453	# Load plyr so we can use ddply() to create the example data set
	454	library(plyr)
	455	# Summarize the ToothGrowth data
	456	tg <- ddply(ToothGrowth, c("supp", "dose"), summarise, length=mean(len))
	457	ggplot(tg, aes(x=dose, y=length, colour=supp)) +
	458	geom_line() +
	459	scale_colour_brewer(palette="Set1")
	460	# If both lines have the same properties, you need to specify a variable to # use for grouping
	461	ggplot(tg, aes(x=dose, y=length, group=supp)) +
	462	geom_line(colour="darkgreen", size=1.5)
	463	# Since supp is mapped to colour, it will automatically be used for grouping
	464	ggplot(tg, aes(x=dose, y=length, colour=supp)) +
	465	geom_line(linetype="dashed") +
	466	geom_point(shape=22, size=3, fill="white")
	467
	468	#change the appearance of points
	469	ggplot(BOD, aes(x=Time, y=demand)) +
	470	geom_line() +
	471	geom_point(size=4, shape=22, colour="darkred", fill="pink")
	472	#The default shape for points is a solid circle,
	473	#the default size is 2, and the default colour is "black"
	474
	475	# Load plyr so we can use ddply() to create the example data set
	476	library(plyr)
	477	# Summarize the ToothGrowth data
	478	tg <- ddply(ToothGrowth, c("supp", "dose"), summarise, length=mean(len))
	479	# Save the position_dodge specification because we'll use it multiple times
	480	pd <- position_dodge(0.2)
	481	ggplot(tg, aes(x=dose, y=length, fill=supp)) +
	482	geom_line(position=pd) +
	483	geom_point(shape=21, size=3, position=pd) +
	484	scale_fill_manual(values=c("black","white"))
	485
	486
	487	#making a graph with a shaded area
	488	sunspotyear <- data.frame(
	489	Year = as.numeric(time(sunspot.year)),
	490	Sunspots = as.numeric(sunspot.year)
	491	)
	492	ggplot(sunspotyear, aes(x=Year, y=Sunspots)) + geom_area()
	493	#change area appearance (with outline)
	494	ggplot(sunspotyear, aes(x=Year, y=Sunspots)) +
	495	geom_area(colour="black", fill="blue", alpha=.2)
	496	#no bottom line
	497	ggplot(sunspotyear, aes(x=Year, y=Sunspots)) +
	498	geom_area(fill="blue", alpha=.2) +
	499	geom_line()
	500
	501	#making a stacked area graph
	502	library(gcookbook)
	503	ggplot(uspopage, aes(x=Year, y= Thousands,fill=AgeGroup)) +
	504	geom_area()
	505
	506	#reverse the legend order
	507	ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup)) +
	508	geom_area(colour="black", size=.2, alpha=.4) +
	509	scale_fill_brewer(palette="Blues") +
	510	guides(fill=guide_legend(reverse=TRUE))
	511	#or
	512	ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup)) +
	513	geom_area(colour="black", size=.2, alpha=.4) +
	514	scale_fill_brewer(palette="Blues", breaks=rev(levels(uspopage$AgeGroup)))
	515
	516
	517	#or change the order of fill
	518	library(plyr) # For the desc() function
	519	ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup, order=desc(AgeGroup))) +
	520	geom_area(colour="black", size=.2, alpha=.4) +
	521	scale_fill_brewer(palette="Blues")
	522
	523
	524	#remove the left and right sides
	525	ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup, order=desc(AgeGroup))) +
	526	geom_area(colour=NA, alpha=.4) +
	527	scale_fill_brewer(palette="Blues") +
	528	geom_line(position="stack", size=.2) #default position is "identity"
	529
	530
	531	#making a proportional stacked area graph
	532	#first calculate the proportions
	533	library(gcookbook)
	534	library(plyr)
	535	#convert to percent
	536	p <- ddply(uspopage,"Year",transform,
	537	Percent = Thousands/sum(Thousands)*100)
	538	ggplot(p,aes(x=Year, y= Percent, fill = AgeGroup)) +
	539	geom_area(colour = "black",size=.2,alpha = .4) +
	540	scale_fill_brewer(palette="Blues",breaks=rev(levels(p$AgeGroup)))
	541
	542	#adding a confidence region
	543	library(gcookbook)
	544	clim <- subset(climate, Source == "Berkeley",
	545	select = c("Year","Anomaly10y","Unc10y"))
	546	#the same
	547	library(dplyr)
	548	clim <- climate %>% filter(Source == "Berkeley") %>% select(Year, Anomaly10y, Unc10y)
	549	#shaded region
	550	ggplot(clim, aes(x=Year,y= Anomaly10y)) +
	551	geom_ribbon(aes(ymin=Anomaly10y-Unc10y, ymax=Anomaly10y+Unc10y),
	552	alpha=0.2) +
	553	geom_line()
	554	#we need know the 95% confidence interval ahead
	555
	556	# With a dotted line for upper and lower bounds
	557	ggplot(clim, aes(x=Year, y=Anomaly10y)) +
	558	geom_line(aes(y=Anomaly10y-Unc10y), colour="grey50", linetype="dotted") +
	559	geom_line(aes(y=Anomaly10y+Unc10y), colour="grey50", linetype="dotted") +
	560	geom_line()
	561
	562
	563
	564	####scatter plots
	565	#making a basic scatter plot
	566	ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point()
	567	#the default solid circles (shape #16) is hollow ones (#21)
	568	ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point(shape=21)
	569	ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point(size=1.5)
	570	#default value of size is 2
	571
	572	#grouping data points by a variable using shape or color
	573	library(gcookbook)
	574	#mapping color
	575	ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) + geom_point()
	576	#mapping shape
	577	ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex)) + geom_point() +geom_line(group=sex)
	578
	579
	580	ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex, colour=sex)) +
	581	geom_point()
	582
	583	ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex, colour=sex)) +
	584	geom_point() +
	585	scale_shape_manual(values=c(1,2)) +
	586	scale_colour_brewer(palette="Set1")
	587
	588	#Using different point shapes
	589	library(gcookbook)
	590	ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point(shape=3)
	591
	592	# Use slightly larger points and use a shape scale with custom values
	593	ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex)) +
	594	geom_point(size=3) + scale_shape_manual(values=c(1, 4))
	595	#Some of the point shapes (1–14) have just an outline,
	596	#some (15–20) are solid, and
	597	#some (21–25) have an outline and fill that can be controlled separately
	598
	599	#shape represent one variable and
	600	#the fill (empty or solid) represent another variable
	601	# Make a copy of the data
	602	hw <- heightweight
	603	# Categorize into <100 and >=100 groups
	604	hw$weightGroup <- cut(hw$weightLb, breaks=c(-Inf, 100, Inf),
	605	labels=c("< 100", ">= 100"))
	606	ggplot(hw, aes(x=ageYear, y=heightIn, shape=sex, fill=weightGroup)) +
	607	geom_point(size=2.5) + scale_shape_manual(values=c(21, 24)) +
	608	scale_fill_manual(values=c(NA, "black"),
	609	guide=guide_legend(override.aes=list(shape=21)))
	610
	611	#mapping a continuous variable to color or size
	612	library(gcookbook)
	613	ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=weightLb)) + geom_point()
	614	ggplot(heightweight, aes(x=ageYear, y=heightIn, size=weightLb)) + geom_point()
	615
	616	#fill gradient to go from black to white
	617	#make the points larger
	618	ggplot(heightweight, aes(x=weightLb, y=heightIn, fill=ageYear)) +
	619	geom_point(shape=21, size=2.5) +
	620	scale_fill_gradient(low="black", high="white")
	621	# Using guide_legend() will result in a discrete legend instead of a colorbar
	622	ggplot(heightweight, aes(x=weightLb, y=heightIn, fill=ageYear)) +
	623	geom_point(shape=21, size=2.5) +
	624	scale_fill_gradient(low="black", high="white", breaks=12:17,
	625	guide=guide_legend())
	626
	627	ggplot(heightweight, aes(x=ageYear, y=heightIn, size=weightLb, colour=sex)) +
	628	geom_point(alpha=.5) +
	629	scale_colour_brewer(palette="Set1") +
	630	scale_size_area() # Make area proportional to numeric value
	631
	632	#dealing with overplotting
	633	sp <- ggplot(diamonds, aes(x=carat, y=price))
	634	sp + geom_point(alpha=0.1)
	635	sp + geom_point(alpha=.01)
	636	#bin the points into rectangle
	637	sp + stat_bin2d()
	638	sp + stat_bin2d(bins=50) +
	639	scale_fill_gradient(low="lightblue", high="red", limits=c(0, 6000))
	640	library(hexbin)
	641	sp + stat_binhex() +
	642	scale_fill_gradient(low="lightblue", high="red",
	643	limits=c(0, 8000))
	644	sp + stat_binhex() +
	645	scale_fill_gradient(low="lightblue", high="red",
	646	breaks=c(0, 250, 500, 1000, 2000, 4000, 6000),
	647	limits=c(0, 6000))
	648
	649	#one discrete axis and one continuous axis
	650	sp1 <- ggplot(ChickWeight, aes(x=Time, y=weight))
	651	sp1 + geom_point()
	652	sp1 + geom_point(position="jitter")
	653	# Could also use geom_jitter(), which is equivalent
	654	sp1 + geom_point(position=position_jitter(width=.5, height=0))
	655	#using boxplot
	656	sp1 + geom_boxplot(aes(group=Time))
	657
	658	#adding fitted regression model lines
	659	library(gcookbook)
	660	sp <- ggplot(heightweight, aes(x=ageYear, y= heightIn))
	661	sp + geom_point() + stat_smooth(method = lm)
	662	#99% confidence region
	663	sp + geom_point() + stat_smooth(method = lm, level = 0.99)
	664	#No confidence region
	665	sp + geom_point() + stat_smooth(method=lm, se=F)
	666
	667	sp + geom_point(color="grey60") +
	668	stat_smooth(method = lm, se=F, color="black")
	669
	670	#the default method is locally weighted polynomial curve
	671	sp + geom_point(color="grey60") + stat_smooth(method=loess)
	672
	673	#with logistic regressio curve
	674	library(MASS) # For the data set
	675	b <- biopsy
	676	b$classn[b$class=="benign"] <- 0
	677	b$classn[b$class=="malignant"] <- 1
	678	ggplot(b, aes(x=V1,y=classn)) +
	679	geom_point(position=position_jitter(width = .3,height=.06),
	680	alpha=.4, shape=21,size=1.5) +
	681	stat_smooth(method=glm, method.args = list(family="binomial"))
	682	#parameter is no longer used
	683
	684	#group points with fitted line
	685	sps <- ggplot(heightweight, aes(x=ageYear, y=heightIn, color = sex)) +
	686	geom_point() +
	687	scale_color_brewer(palette = "Set1")
	688	sps + geom_smooth()
	689
	690	sps + geom_smooth(method = lm, se = F, fullrange= T)
	691
	692
	693	#adding fitted lines from an existing model
	694	library(gcookbook)
	695	model <- lm(heightIn~ ageYear + I(ageYear^2), data = heightweight)
	696	xmin <- min(heightweight$ageYear)
	697	xmax <- max(heightweight$ageYear)
	698
	699	predicted <- data.frame(ageYear = seq(xmin,xmax,length.out = 200))
	700	predicted$heightIn <- predict(model, predicted)
	701
	702	sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
	703	geom_point(color = "grey40")
	704	sp + geom_line(data=predicted,size=1)
	705
	706	# Given a model, predict values of yvar from xvar
	707	# This supports one predictor and one predicted variable
	708	# xrange: If NULL, determine the x range from the model object. If a vector with # two numbers, use those as the min and max of the prediction range.
	709	# samples: Number of samples across the x range.
	710	# ...: Further arguments to be passed to predict()
	711	predictvals <- function(model, xvar, yvar, xrange=NULL, samples = 100,...){
	712	if (is.null(xrange)) {
	713	if (any(class(model) %in% c("lm","glm")))
	714	xrange <- range(model$model[[xvar]])
	715	else if (any(class(model) %in% "loess"))
	716	xrange <- range(model$x)
	717	}
	718	newdata <- data.frame(x=seq(xrange[1],xrange[2],length.out = samples))
	719	names(newdata) <- xvar
	720	newdata[yvar] <- predict(model,newdata = newdata,...)
	721	newdata
	722	}
	723
	724	modlinear <- lm(heightIn ~ ageYear, heightweight)
	725	modloess <- loess(heightIn ~ ageYear, heightweight)
	726	#predict
	727	lm_predicted <- predictvals(modlinear, "ageYear", "heightIn")
	728	loess_predicted <- predictvals(modloess, "ageYear", "heightIn")
	729	#multiple fitted lines
	730	sp + geom_line(data=lm_predicted, colour="red", size=.8) +
	731	geom_line(data=loess_predicted, colour="blue", size=.8)
	732
	733	library(MASS) # For the data set b <- biopsy
	734	b$classn[b$class=="benign"] <- 0
	735	b$classn[b$class=="malignant"] <- 1
	736	fitlogistic <- glm(classn~V1,data = b, family = binomial)
	737
	738	#get predicted values
	739	glm_predicted <- predictvals(fitlogistic, "V1","classn",type="response")
	740	ggplot(b,aes(x=V1, y=classn)) +
	741	geom_point(position = position_jitter(width = .3,height=.08),alpha=.4,
	742	shape=21,size=1.5) +
	743	geom_line(data=glm_predicted, color="#1177FF",size=1)
	744
	745	#adding fitted lines from multiple exsiting models
	746	make_model <- function(data) { lm(heightIn ~ ageYear, data)}
	747	library(gcookbook)
	748	library(plyr)
	749	models <- dlply(heightweight,"sex",.fun=make_model)
	750	#For each subset of a data frame, apply function then combine results into a list.
	751	pred <- ldply(models,.fun = predictvals,xvar="ageYear",yvar="heightIn")
	752	#For each element of a list, apply function then combine results into a data frame
	753	ggplot(heightweight, aes(x=ageYear, y=heightIn, color=sex)) +
	754	geom_point() +
	755	geom_line(data=pred)
	756	#extend the range to the same range across all groups
	757	pred <- ldply(models, .fun=predictvals, xvar="ageYear", yvar="heightIn",
	758	xrange=range(heightweight$ageYear))
	759	ggplot(heightweight, aes(x=ageYear, y=heightIn, color=sex)) +
	760	geom_point() +
	761	geom_line(data=pred)
	762
	763	#adding annotations with model coefficients
	764	library(gcookbook)
	765	model <- lm(heightIn~ ageYear, heightweight)
	766	summary(model)
	767	#add r square value
	768	pred <- predictvals(model, "ageYear", "heightIn")
	769	sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
	770	geom_point() +
	771	geom_line(data=pred)
	772	sp + annotate("text", label="r^2=0.42", x=16.5, y=52)
	773	#using r's math expression syntax
	774	sp + annotate("text", label="r^2=0.42",parse=TRUE, x=16.5, y=52)
	775
	776	#create a string that when parsed, returns a valid expression
	777	eqn <- as.character(as.expression(
	778	substitute(italic(y) == a + b * italic(x) * "," ~~ italic(r)^2 ~ "=" ~ r2,
	779	list(a = format(coef(model)[1], digits=3),
	780	b = format(coef(model)[2], digits=3),
	781	r2 = format(summary(model)$r.squared, digits=2)
	782	))))
	783	sp + annotate("text", label=eqn, parse=TRUE,x=Inf,y=-Inf,hjust=1.1,vjust=-.5)
	784
	785
	786	#adding marginal rugs to a scatter plot
	787	ggplot(faithful, aes(x=eruptions, y=waiting)) + geom_point() + geom_rug()
	788	#specify the size
	789	ggplot(faithful, aes(x=eruptions, y = waiting)) +
	790	geom_point() +
	791	geom_rug(position = "jitter",size=.2)
	792
	793	#label points in a scatter plot
	794	library(gcookbook)
	795	s <- subset(countries, Year==2009 & healthexp > 2000)
	796	sp <- ggplot(s, aes(x=healthexp, y=infmortality)) + geom_point()
	797	sp + annotate("text",x=4350,y=5.4,label="Cannada") +
	798	annotate("text",x=7400,y=6.8,label="USA")
	799	#automatically add the labels
	800	sp + geom_text(aes(label=Name), size=4)
	801	sp + geom_text(aes(label=Name), size=4,vjust=0)
	802	sp + geom_text(aes(y=infmortality + 0.1, label = Name),size=4,vjust=0)
	803	sp + geom_text(aes(label = Name), size=4,hjust=0)
	804	sp + geom_text(aes(x=healthexp+100, label=Name), size=4, hjust=0)
	805
	806	#label some of the points automatically
	807	cdat <- subset(countries, Year == 2009 & healthexp > 2009)
	808	cdat$Name1 <- cdat$Name
	809	idx <- cdat$Name1 %in% c("Canada", "Ireland", "United Kingdom", "United States",
	810	"New Zealand", "Iceland", "Japan", "Luxembourg",
	811	"Netherlands", "Switzerland")
	812	cdat$Name1[!idx] <- NA
	813	ggplot(cdat, aes(x=healthexp, y=infmortality)) +
	814	geom_point() +
	815	geom_text(aes(x=healthexp+100, label=Name1),size=4,hjust=0) +
	816	xlim(2000,10000)
	817
	818	#creating a balloon plot
	819	library(gcookbook) # For the data set
	820	cdat <- subset(countries, Year==2009 &
	821	Name %in% c("Canada", "Ireland", "United Kingdom", "United States",
	822	"New Zealand", "Iceland", "Japan", "Luxembourg",
	823	"Netherlands", "Switzerland"))
	824	p <- ggplot(cdat, aes(x=healthexp, y=infmortality,size=GDP)) +
	825	geom_point(shape=21,color="black",fill="cornsilk")
	826	#GDP mapped to redius
	827	p#GDP mapped to area instead, and larger circles
	828	p + scale_size_area(max_size=15)
	829
	830	#add up counts for male and female
	831	hec <- HairEyeColor[,,"Male"] + HairEyeColor[,,"Female"]
	832
	833	#convert to long format
	834	library(reshape2)
	835	hec <- melt(hec, value.name="count")
	836	ggplot(hec, aes(x=Eye,y=Hair)) +
	837	geom_point(aes(size=count),shape=21,color="black",fill="cornsilk") +
	838	scale_size_area(max_size = 20,guid=F) +
	839	geom_text(aes(y=as.numeric(Hair)-sqrt(count)/22,label=count),
	840	color="grey60",size=4)
	841
	842	#making a scatter plot matrix
	843	library(gcookbook)
	844	c2009 <- subset(countries, Year==2009,
	845	select=c(Name, GDP, laborrate, healthexp, infmortality))
	846	pairs(c2009[,2:5])
	847
	848	panel.cor <- function(x, y, digits=2, prefix="", cex.cor, ...) {
	849	usr <- par("usr")
	850	on.exit(par(usr))
	851	par(usr = c(0, 1, 0, 1))
	852	r <- abs(cor(x, y, use="complete.obs"))
	853	txt <- format(c(r, 0.123456789), digits=digits)[1]
	854	txt <- paste(prefix, txt, sep="")
	855	if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
	856	text(0.5, 0.5, txt, cex = cex.cor * (1 + r) / 2)
	857	}
	858
	859	panel.hist <- function(x, ...) {
	860	usr <- par("usr")
	861	on.exit(par(usr))
	862	par(usr = c(usr[1:2], 0, 1.5) )
	863	h <- hist(x, plot = FALSE)
	864	breaks <- h$breaks
	865	nB <- length(breaks)
	866	y <- h$counts
	867	y <- y/max(y)
	868	rect(breaks[-nB], 0, breaks[-1], y, col="white", ...)
	869	}
	870
	871	pairs(c2009[,2:5], upper.panel = panel.cor,
	872	diag.panel = panel.hist,
	873	lower.panel = panel.smooth)
	874
	875	panel.lm <- function (x, y, col = par("col"), bg = NA, pch = par("pch"),
	876	cex = 1, col.smooth = "black", ...) {
	877	points(x, y, pch = pch, col = col, bg = bg, cex = cex)
	878	abline(stats::lm(y ~ x), col = col.smooth, ...)
	879	}
	880
	881	pairs(c2009[,2:5], pch=".",
	882	upper.panel = panel.cor,
	883	diag.panel = panel.hist,
	884	lower.panel = panel.lm)
	885
	886
	887	#########summarized data distributions
	888
	889	#making a basic histogram
	890	ggplot(faithful, aes(x=waiting)) + geom_histogram()
	891
	892	# Store the values in a simple vector
	893	w <- faithful$waiting
	894	ggplot(NULL, aes(x=w)) + geom_histogram()
	895
	896	#set the width of each bin to 5
	897	ggplot(faithful, aes(x=waiting)) +
	898	geom_histogram(binwidth = 5, fill ="white",color="black")
	899
	900	#divide the x range into 15 bins
	901	binsize <- diff(range(faithful$waiting))/15
	902	ggplot(faithful,aes(x=waiting)) +
	903	geom_histogram(binwidth = binsize,fill="white",color="black")
	904
	905	#change the origin
	906	h <- ggplot(faithful, aes(x=waiting)) # Save the base object for reuse
	907	h + geom_histogram(binwidth=8, fill="white", colour="black", origin=31)
	908	h + geom_histogram(binwidth=8, fill="white", colour="black", origin=35)
	909
	910	#the same with bar
	911	h + geom_bar(stat="bin")
	912
	913	#making multiple histograms from grouped data
	914	library(MASS)
	915	#use smoke as the faceting variable
	916	ggplot(birthwt, aes(x=bwt)) +
	917	geom_histogram(fill="white", color="black") +
	918	facet_grid(smoke~.)
	919	#change the label
	920	bir <- birthwt
	921	#convert smoke to a factor
	922	bir$smoke <- factor(bir$smoke)
	923	library(plyr)
	924	bir$smoke <- revalue(bir$smoke, c("0"="No Smoke","1"="Smoke"))
	925	#same result
	926	bir$smoke <- factor(bir$smoke,labels=c("No Smoke","Smoke"))
	927
	928	ggplot(bir,aes(x=bwt)) +
	929	geom_histogram(fill="white",color="black") +
	930	facet_grid(smoke~.)
	931
	932
	933	#when size are different
	934	ggplot(bir, aes(x=bwt)) +
	935	geom_histogram(fill="white", color="black") +
	936	facet_grid(race~.)
	937	#resize independently
	938	ggplot(bir, aes(x=bwt)) +
	939	geom_histogram(fill="white",color="black") +
	940	facet_grid(race~.,scales = "free") #scales is set "free"
	941	#another approach is to map the grouping variable to fill
	942	ggplot(bir, aes(x=bwt,fill=smoke)) +
	943	geom_histogram(position="identity",alpha=0.4)
	944	#without position="identity", ggplot will stack the histogram bars on top of each other
	945
	946	#making a density curve
	947	ggplot(faithful, aes(x=waiting)) + geom_density()
	948
	949	#remove the side and bottom
	950	ggplot(faithful, aes(x=waiting)) +
	951	geom_line(stat="density") +
	952	expand_limits(y=0)
	953
	954	#the larger the bandwidth, the more smoothing there is
	955	ggplot(faithful,aes(x=waiting)) +
	956	geom_line(stat="density",adjust=.25,color="red") +
	957	geom_line(stat="density") +
	958	geom_line(stat="density",adjust=2,color="blue")
	959
	960	#set the area color
	961	ggplot(faithful,aes(x=waiting)) +
	962	geom_density(fill="blue",alpha=.2) +
	963	xlim(35,105)
	964
	965	ggplot(faithful,aes(x=waiting)) +
	966	geom_density(fill="blue",alpha=.2,color=NA) +
	967	geom_line(stat="density") +
	968	xlim(35,105)
	969
	970	#compare the theoretical and obsearved distributions
	971	ggplot(faithful, aes(x=waiting, y=..density..)) +
	972	geom_histogram(fill="cornsilk", colour="grey60", size=.2) +
	973	geom_density() +
	974	xlim(35, 105)
	975
	976	#making multiple density curves from grouped data
	977	library(MASS)
	978	bir <- birthwt
	979	bir$smoke <- factor(bir$smoke)
	980	ggplot(bir,aes(x=bwt,color=smoke)) + geom_density()
	981	#mapping smoke to fill
	982	ggplot(bir,aes(x=bwt,fill=smoke)) + geom_density(alpha=.3)
	983
	984	ggplot(bir, aes(x=bwt)) + geom_density() + facet_grid(smoke~.)
	985
	986	library(plyr) # For the revalue function
	987	birthwt1 <- birthwt
	988	birthwt1$smoke <- factor(birthwt1$smoke)
	989	birthwt1$smoke <- revalue(birthwt1$smoke, c("0"="No Smoke", "1"="Smoke"))
	990	ggplot(birthwt1, aes(x=bwt)) + geom_density() + facet_grid(smoke ~ .)
	991
	992	ggplot(birthwt1, aes(x=bwt, y=..density..)) +
	993	geom_histogram(binwidth=200, fill="cornsilk", colour="grey60", size=.2) +
	994	geom_density() +
	995	facet_grid(smoke ~ .)
	996
	997	#making a frequency polygon
	998	ggplot(faithful, aes(x=waiting)) + geom_freqpoly()
	999	ggplot(faithful, aes(x=waiting)) + geom_freqpoly(binwidth=4)
	1000
	1001	#use 15 bins
	1002	binsize <- diff(range(faithful$waiting))/15
	1003	ggplot(faithful, aes(x=waiting)) + geom_freqpoly(binwidth=binsize)
	1004
	1005	#making a basic box plot
	1006	library(MASS)
	1007	ggplot(birthwt, aes(x=factor(race), y=bwt)) + geom_boxplot()
	1008	# Use factor() to convert numeric variable to discrete
	1009
	1010	#change width
	1011	ggplot(birthwt,aes(x=factor(race),y=bwt)) + geom_boxplot(width=.5)
	1012	#default size is 2, default shape is 16
	1013	ggplot(birthwt,aes(x=factor(race),y=bwt)) +
	1014	geom_boxplot(outlier.size = 1.5,outlier.shape = 21)
	1015
	1016	ggplot(birthwt,aes(x=1,y=bwt)) + geom_boxplot() +
	1017	scale_x_continuous(breaks = NULL) +
	1018	theme(axis.title.x=element_blank())
	1019
	1020	#adding natches to a box plot
	1021	library(MASS)
	1022	ggplot(birthwt,aes(x=factor(race),y=bwt)) +
	1023	geom_boxplot(notch = TRUE)
	1024
	1025	#adding means to a box plot
	1026	library(MASS)
	1027	ggplot(birthwt, aes(factor(race),y=bwt)) +
	1028	geom_boxplot() +
	1029	stat_summary(fun.y = "mean",geom = "point",shape=23,size=3,fill="white")
	1030
	1031	#making a violin plot
	1032	library(gcookbook)
	1033	p <- ggplot(heightweight,aes(x=sex,y=heightIn))
	1034	p + geom_violin()
	1035	#with narrows boxplot
	1036	p + geom_violin() +
	1037	geom_boxplot(width=.1,fill="black",outlier.colour = NA) +
	1038	stat_summary(fun.y = median,geom="point",fill="white",shape=21,size=2.5)
	1039	#keep the tails
	1040	p + geom_violin(trim=FALSE)
	1041
	1042	#making a dot plot
	1043	library(gcookbook)
	1044	c2009 <- subset(countries, Year==2009 & healthexp > 2000)
	1045	p <- ggplot(c2009,aes(x=infmortality))
	1046	p + geom_dotplot()
	1047	p + geom_dotplot(binwidth=.25) + geom_rug() +
	1048	scale_y_continuous(breaks=NULL) +
	1049	theme(axis.title.y=element_blank())
	1050
	1051	#making multiple dot plots for grouped data
	1052	ggplot(heightweight, aes(x=sex, y=heightIn)) +
	1053	geom_dotplot(binaxis="y", binwidth=.5, stackdir="center")
	1054
	1055
	1056	#making a desity plot of two dimensional data
	1057	p <- ggplot(faithful,aes(x=eruptions,y=waiting))
	1058	p + geom_point() + stat_density2d()
	1059	#map the height of the density to color
	1060	p + stat_density2d(aes(color=..level..))
	1061	#map denstity estimate to fill color
	1062	p + stat_density2d(aes(fill=..density..),geom="raster",contour = F)
	1063	#with points and map desity estimate to alpha
	1064	p + geom_point() +
	1065	stat_density2d(aes(alpha=..density..),geom="tile",contour=F)
	1066	#tile and raster look like the same
	1067
	1068	#marginal rug added to a scatter plot
	1069	p + stat_density2d(aes(fill=..density..),geom="raster",
	1070	contour=F,h=c(.5,5))
	1071
	1072
	1073
	1074	#############annotations
	1075
	1076	#adding text annotations
	1077	p <- ggplot(faithful,aes(x=eruptions,y=waiting)) + geom_point()
	1078	p + annotate(geom="text",x=3,y=48,label = "Group 1") +
	1079	annotate("text",x=4.5,y=66,label = "Group 2")
	1080	p + annotate("text",x=3,y=48,label="Group 1",family="serif",
	1081	fontface="italic",color="darkred",size=3) +
	1082	annotate("text",x=4.5,y=66,label="Group 2",family="serif",
	1083	fontface="italic",color="darkred",size=3)
	1084	annotate("text",x=3,y=48,label="Group 1",family="serif",
	1085	fontface="italic",color="darkred",size=3)
	1086
	1087	p + annotate("text", x=3, y=48, label="Group 1", alpha=.1) + # Normal
	1088	geom_text(x=4.5, y=66, label="Group 2", alpha=.1) # Overplotted
	1089
	1090	#for continuous axes, use Inf and -Inf to place text annotations at the
	1091	#edge of the plotting area
	1092	p + annotate("text",x=-Inf,y=Inf,label="Upper left",hjust=-.2,vjust=2) +
	1093	annotate("text",x=mean(range(faithful$eruptions)),y=-Inf,vjust=-.4,label="Bottom middle")
	1094	#for hjust and vjust position is minus the value
	1095
	1096	#using mathematical expression in annotations
	1097	#a normal curve
	1098	p <- ggplot(data.frame(x=c(-3,3)), aes(x=x)) + stat_function(fun=dnorm)
	1099	p + annotate("text",x=2,y=0.3,parse=TRUE,
	1100	label="frac(1,sqrt(2pi))e^{-x^2/2}")
	1101	#use single quotes within double quotes (or vice versa) to mark the plain-text parts
	1102	#put a * operator between them
	1103	p + annotate("text",x=0,y=.05,parse=TRUE,size=4,
	1104	label="'Function: 'y==frac(1,sqrt(2pi))*e^{-x^2/2}")
	1105	#?plotmath for many examples of mathematical expressions, and
	1106	#?demo(plot math) for graphical examples of mathematical expressions
	1107
	1108	#adding lines
	1109	library(gcookbook)
	1110	p <- ggplot(heightweight,aes(x=ageYear,y=heightIn,color=sex)) + geom_point()
	1111	#adding horizongtal and vertical lines
	1112	p + geom_hline(yintercept = 60) + geom_vline(xintercept = 14)
	1113	p + geom_abline(intercept = 37.4, slope = 1.75)
	1114
	1115	library(plyr)
	1116	hw_means <- ddply(heightweight,"sex",summarise,heightIn=mean(heightIn))
	1117	p + geom_hline(aes(yintercept=heightIn,color=sex),data=hw_means,
	1118	linetype="dashed",size=1)
	1119
	1120	#for discrete variable
	1121	pg <- ggplot(PlantGrowth,aes(x=group,y=weight)) + geom_point()
	1122	pg + geom_vline(xintercept = 2)
	1123	pg + geom_vline(xintercept = which(levels(PlantGrowth$group) == "ctrl"))
	1124
	1125	#adding line segments and arrows
	1126	library(gcookbook)
	1127	p <- ggplot(subset(climate, Source=="Berkeley"), aes(x=Year, y=Anomaly10y)) +
	1128	geom_line()
	1129	p + annotate("segment", x=1950, xend=1980, y=-.25, yend=-.25)
	1130
	1131
	1132
	1133
	1134
	1135
	1136
	1137
	1138
	1139
	1140
	1141
	1142
	1143
	1144
	1145
	1146
	1147
	1148
	1149
	1150
	1151
	1152