set.seed(1410) dsmall <- diamonds[sample(nrow(diamonds), 100),] #scatterplot qplot(carat, price, data = diamonds) qplot(log(carat), log(price), data = diamonds) qplot(carat, x*y*z, data = diamonds) qplot(carat, price, data = dsmall, colour = color) qplot(carat, price, data = dsmall, shape = cut) qplot(carat, price, data = diamonds, alpha = I(1/10)) qplot(carat, price, data = diamonds, alpha = I(1/100)) qplot(carat, price, data = diamonds, alpha = I(1/200)) #geom qplot(carat, price, data = dsmall, geom = c("point", "smooth")) qplot(carat, price, data = diamonds, geom = c("point", "smooth")) qplot(color, price/carat, data=diamonds,geom = "jitter") qplot(color, price/carat, data=diamonds,geom = "boxplot") qplot(color, price / carat, data = diamonds, geom = "jitter", alpha = I(1 / 50)) #offer size, colour, shape for scatterplot #offer size, colour, fill for boxplot qplot(color, price/carat, data=diamonds,geom = "boxplot",colour="red") qplot(carat, data = diamonds, geom = "histogram") qplot(carat, data = diamonds, geom = "density") qplot(carat, data = diamonds, geom = "histogram", binwidth = 1, xlim = c(0,3)) qplot(carat, data = diamonds, geom = "histogram", binwidth = 0.1, xlim = c(0,3)) qplot(carat, data = diamonds, geom = "histogram", binwidth = 0.01, xlim = c(0,3)) qplot(carat, data = diamonds, geom = "density", colour = color) qplot(carat, data = diamonds, geom = "histogram", fill = color, bins = 50) qplot(color, data = diamonds, geom = "bar") qplot(color, data = diamonds, geom = "bar", weight = carat) + scale_y_continuous("carat") #total weight qplot(date, unemploy / pop, data = economics, geom = "line") qplot(date, uempmed, data = economics, geom = "line") #path year <- function(x) as.POSIXlt(x)$year + 1900 qplot(unemploy / pop, uempmed, data = economics, geom = c("point", "path")) qplot(unemploy / pop, uempmed, data = economics, geom = "path", colour = year(date)) + scale_area() #faceting # row var ∼ col var qplot(carat, data = diamonds, facets = color ~ ., geom = "histogram", binwidth = 0.1, xlim = c(0, 3)) qplot(carat, ..density.., data = diamonds, facets = color ~ ., geom = "histogram", binwidth = 0.1, xlim = c(0, 3)) #Using ..density.. tells ggplot2 to map the density to the y-axis #instead of the default use of count #density plot makes it easier to compare distributions ignoring #the relative abundance of diamonds within each colour #see ?plotmath for more examples of using mathematical formulae qplot( carat, price, data = dsmall, xlab = "Price ($)", ylab = "Weight (carats)", main = "Price-weight relationship" ) qplot( carat, price/carat, data = dsmall, xlab = "Weight(carats)", ylab = "expression(frac(price,carat))", main = "Small diamonds", xlim = c(.2, 1)) qplot(carat, price, data = dsmall, log = "xy") qplot(displ, hwy, data = mpg, colour = factor(cyl)) #factor #ggplot p <- ggplot(diamonds, aes(x = carat)) p <- p + layer( geom = "bar", geom_params = list(fill = "steelblue"), stat = "bin", stat_params = list(binwidth = 2) ) #layer(geom, geom_params, stat, stat_params, data, mapping, position) ggplot(msleep, aes(sleep_rem / sleep_total, awake)) + geom_point() #geom_XXX(mapping, data, ..., geom, position) #stat_XXX(mapping, data, ..., stat, position) p <- ggplot(mtcars, aes(mpg, wt)) + geom_point() p + geom_point(aes(colour = factor(cyl))) p + geom_point(aes(y=disp)) p <- ggplot(mtcars, aes(mpg, wt)) p + geom_point(colour = "darkblue") p + geom_point(aes(colour = "darkblue")) p + geom_point(aes(colour = factor(cyl))) p <- ggplot(Oxboys, aes(age, height, group = Subject)) + geom_line() p + geom_smooth(aes(group = Subject), method = "lm", se = F) p + geom_smooth(aes(group = 1), method = "lm", se = F) boysbox <- ggplot(Oxboys, aes(Occasion, height)) + geom_boxplot() boysbox + geom_line(aes(group=Subject), color=I(8)) xgrid <- with(df, seq(min(x), max(x), length=50)) p <- ggplot(diamonds, aes(carat)) + geom_histogram(aes(y=..count..), binwidth = 0.1) p <- ggplot(diamonds, aes(carat)) + geom_histogram(aes(y=..density..), binwidth = 0.1) qplot(carat, ..density.., data=diamonds, geom = "histogram",binwidth = 0.1) d <- ggplot(diamonds,aes(carat)) + xlim(0, 3) d + stat_bin(aes(ymax = ..count..), binwidth = 0.1, geom = "area") d + stat_bin(aes(size = ..density..), binwidth = 0.1, geom = "point", position = "identity") d + stat_bin(aes(y = 1, fill = ..count..), binwidth = 0.1,geom = "tile", position = "identity") #graph cook book library(gcookbook) library(ggplot2) #chapte 2 quickly exploring data #creating a scatter plot plot(mtcars$wt, mtcars$mpg) qplot(mtcars$wt, mtcars$mpg) qplot(wt, mpg, data=mtcars) # This is equivalent to: ggplot(mtcars, aes(x=wt, y=mpg)) + geom_point() #creating line plot plot(pressure$temperature, pressure$pressure, type="l") qplot(pressure$temperature, pressure$pressure, geom="line") qplot(temperature, pressure, data=pressure, geom="line") # This is equivalent to: ggplot(pressure, aes(x=temperature, y=pressure)) + geom_line() # Lines and points together qplot(temperature, pressure, data=pressure, geom=c("line", "point")) # Equivalent to: ggplot(pressure, aes(x=temperature, y=pressure)) + geom_line() + geom_point() #creating bar graph barplot(BOD$demand, names.arg=BOD$Time) #name # Generate a table of countsqplot(supp, len, data=ToothGrowth, geom="boxplot") # This is equivalent to: ggplot(ToothGrowth, aes(x=supp, y=len)) + geom_boxplot() barplot(table(mtcars$cyl)) library(ggplot2) ggplot(BOD, aes(x=factor(Time), y=demand)) + geom_bar(stat="identity") # Bar graph of counts qplot(factor(cyl), data=mtcars) # This is equivalent to: ggplot(mtcars, aes(x=factor(cyl))) + geom_bar() #creating histogram # Specify approximate number of bins with breaks hist(mtcars$mpg, breaks=10) library(ggplot2) qplot(mpg, data=mtcars, binwidth=1) # This is equivalent to: ggplot(mtcars, aes(x=mpg)) + geom_histogram(binwidth=1) #creating box plot # Formula syntax boxplot(len ~ supp, data = ToothGrowth) library(ggplot2) qplot(supp, len, data=ToothGrowth, geom="boxplot") # This is equivalent to: ggplot(ToothGrowth, aes(x=supp, y=len)) + geom_boxplot() ggplot(ToothGrowth, aes(x=interaction(supp, dose), y=len)) + geom_boxplot() #plot a function curve curve(x^3 - 5*x, from=-4, to=4) # Plot a user-defined function myfun <- function(xvar) { 1/(1 + exp(-xvar + 10)) } curve(myfun(x), from=0, to=20) # Add a line: curve(1-myfun(x), add = TRUE, col = "red") library(ggplot2) # This is equivalent to: ggplot(data.frame(x=c(0, 20)), aes(x=x)) + stat_function(fun=myfun, geom="line") #chapter 3 Bar graphs ggplot(pg_mean, aes(x=group, y=weight)) + geom_bar(stat="identity") ggplot(pg_mean, aes(x=group, y=weight)) + geom_bar(stat="identity", fill="lightblue", colour="black") #group by a second variable ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) + geom_bar(position="dodge", colour="black",stat="identity") #default stat is count #stat="identity" means using sum ggplot(diamonds, aes(x=cut)) + geom_bar(stat = "count") #default stat is count upc <- subset(uspopchange, rank(Change)>40) ggplot(upc, aes(x=Abb, y=Change, fill=Region)) + geom_bar(stat="identity") ggplot(upc, aes(x=reorder(Abb, Change), y=Change, fill=Region)) + geom_bar(stat="identity", colour="black") + scale_fill_manual(values=c("#669933", "#FFCC66")) + xlab("State") #using the reorder() function to reorder the levels of a factor based on #the values of another variable #Coloring Negative and Positive Bars Differently csub <- subset(climate, Source=="Berkeley" & Year >= 1900) csub$pos <- csub$Anomaly10y >= 0 ggplot(csub, aes(x=Year, y=Anomaly10y, fill=pos)) + geom_bar(stat="identity", position="identity") #default position is stack ggplot(csub, aes(x=Year, y=Anomaly10y, fill=pos)) + geom_bar(stat="identity", position="identity", colour="black", size=0.25) + scale_fill_manual(values=c("#CCEEFF", "#FFDDDD"), guide=FALSE) #Adjusting Bar Width and Spacing #default value is 0.9 ggplot(pg_mean, aes(x=group, y=weight)) + geom_bar(stat="identity", width=0.5) #a grouped bar graph with narrow bars ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) + geom_bar(stat="identity", width=0.5, position="dodge") #with some space between the bars ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) + geom_bar(stat="identity", width=0.6, position=position_dodge(0.8)) #space = 0.8-0.6 #To be more precise, the value of width in position_dodge() is the same as #width in geom_bar() #Making a Stacked Bar Graph ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) + geom_bar(stat="identity") #One problem with the default output is that the stacking order is the opposite of the #order of items in the legend #reverse the legend ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) + geom_bar(stat="identity") + guides(fill=guide_legend(reverse=TRUE)) #reverse the stacking order libary(plyr) # Needed for desc() ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar, order=desc(Cultivar))) + geom_bar(stat="identity") #get a different color palette ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) + geom_bar(stat="identity", colour="black") + guides(fill=guide_legend(reverse=TRUE)) + scale_fill_brewer(palette="Pastel1") #Making a Proportional Stacked Bar Graph library(gcookbook) library(plyr) ce <- ddply(cabbage_exp,"Date",transform, percent_weight = Weight / sum(Weight)*100) ggplot(ce, aes(x=Date, y= percent_weight, fill =Cultivar)) + geom_bar(stat="identity") #change a little bit ggplot(ce, aes(x=Date, y= percent_weight, fill =Cultivar)) + geom_bar(stat="identity", color="black") + guides(fill=guide_legend(reverse = TRUE)) + scale_fill_brewer(palette = "Pastel1") #adding labels to a bar graph # Below the top ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) + geom_bar(stat="identity") + geom_text(aes(label=Weight), vjust=1.5, colour="white") # Above the top ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) + geom_bar(stat="identity") + geom_text(aes(label=Weight), vjust=-0.2) #adjust y limits to be a little higher ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) + geom_bar(stat="identity") + geom_text(aes(label=Weight), vjust=-0.2) + ylim(0, max(cabbage_exp$Weight) * 1.05) # Map y positions slightly above bar top - y range of plot will auto-adjust ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) + geom_bar(stat="identity") + geom_text(aes(y=Weight+.1, label=Weight)) #dodging ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) + geom_bar(stat="identity", position="dodge") + geom_text(aes(label=Weight), vjust=1.5, colour="white", position=position_dodge(0.9), size=3) #label on stack bar library(plyr) # Sort by the day and sex columns ce <- arrange(cabbage_exp, Date, Cultivar) #get the cumulative sum ce <- ddply(ce, "Date", transform, label_y=cumsum(Weight)) ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) + geom_bar(stat="identity") + geom_text(aes(y=label_y, label=Weight), vjust=1.5, colour="white") + guides(fill=guide_legend(reverse=TRUE)) #put the label in the center ce <- arrange(cabbage_exp, Date, Cultivar) # Calculate y position, placing it in the middle ce <- ddply(ce, "Date", transform, label_y=cumsum(Weight)-0.5*Weight) #good ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) + geom_bar(stat="identity") + geom_text(aes(y=label_y, label=Weight), colour="white") #polish ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) + geom_bar(stat="identity") + geom_text(aes(y=label_y, label=paste(format(Weight, nsmall=2), "kg")), size=4) + guides(fill=guide_legend(reverse=TRUE)) + scale_fill_brewer(palette="Pastel1") #coord_flip() #making a Cleveland Dot Plot library(gcookbook) tophit <- tophitters2001[1:25,] ggplot(tophit, aes(x=avg, y=name)) + geom_point() ggplot(tophit, aes(x=avg, y=reorder(name, avg))) + geom_point(size=3) + # Use a larger dot theme_bw() + theme(panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), panel.grid.major.y = element_line(colour="grey60", linetype="dashed")) ggplot(tophit, aes(x=reorder(name, avg), y=avg)) + geom_point(size=3) + # Use a larger dot theme_bw() + theme(axis.text.x = element_text(angle=60, hjust=1), panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank(), panel.grid.major.x = element_line(colour="grey60", linetype="dashed")) #reorder onlu by one variable #order by two variables # Get the names, sorted first by lg, then by avg nameorder <- tophit$name[order(tophit$lg, tophit$avg)] # Turn name into a factor, with levels in the order of nameorder tophit$name <- factor(tophit$name, levels=nameorder) ggplot(tophit, aes(x=avg, y=name)) + geom_segment(aes(yend=name), xend=0, colour="grey50") + geom_point(size=3, aes(colour=lg)) + scale_colour_brewer(palette="Set1", limits=c("NL","AL")) + #limits order the posion theme_bw() + theme(panel.grid.major.y = element_blank(), # No horizontal grid lines legend.position=c(1, 0.55), # Put legend inside plot area legend.justification=c(1, 0.5)) ggplot(tophit, aes(x=avg, y=name)) + geom_segment(aes(yend=name), xend=0, colour="grey50") + geom_point(size=3, aes(colour=lg)) + scale_colour_brewer(palette="Set1", limits=c("NL","AL"), guide=FALSE) + theme_bw() + theme(panel.grid.major.y = element_blank()) + facet_grid(lg ~ ., scales="free_y", space="free_y") #Making a Basic Line Graph ggplot(BOD, aes(x=Time, y=demand)) + geom_line() d1 <- BOD d1$Time <- factor(d1$Time) ggplot(d1, aes(x=Time, y=demand,group=1)) +geom_line() #add y limit ggplot(BOD, aes(x=Time, y=demand)) + geom_line() + ylim(0, max(BOD$demand)) ggplot(BOD, aes(x=Time, y=demand)) + geom_line() + expand_limits(y=0) #same #adding points to a line graph ggplot(BOD, aes(x=Time, y=demand)) + geom_line() + geom_point() library(gcookbook) ggplot(worldpop, aes(x=Year, y=Population)) + geom_line() + geom_point() #with a log y-axis ggplot(worldpop, aes(x=Year, y= Population)) + geom_line() + geom_point() + scale_y_log10() #making a line graph with multiple lines #summarize the data library(plyr) tg <- ddply(ToothGrowth, c("supp","dose"),summarise, length=mean(len)) #map supp to colour ggplot(tg, aes(x=dose,y=length,colour=supp)) + geom_line() #map supp to linetype ggplot(tg, aes(x=dose, y=length, linetype=supp)) + geom_line() #factor + group (line point together) ggplot(tg, aes(x=factor(dose), y=length,colour=supp,group=supp)) + geom_line() # Make the points a little larger ggplot(tg, aes(x=dose, y=length, shape=supp)) + geom_line() + geom_point(size=4) # Also use a point with a color fill ggplot(tg, aes(x=dose, y=length, fill=supp)) + geom_line() + geom_point(size=4, shape=21) #dodge them ggplot(tg, aes(x=dose, y=length, shape=supp)) + geom_line(position=position_dodge(0.2)) + # Dodge lines by 0.2 geom_point(position=position_dodge(0.2), size=4) # Dodge points by 0.2 #change the apperance of lines ggplot(BOD, aes(x=Time, y=demand)) + geom_line(linetype = "dashed", size = 1, color = "blue") # Load plyr so we can use ddply() to create the example data set library(plyr) # Summarize the ToothGrowth data tg <- ddply(ToothGrowth, c("supp", "dose"), summarise, length=mean(len)) ggplot(tg, aes(x=dose, y=length, colour=supp)) + geom_line() + scale_colour_brewer(palette="Set1") # If both lines have the same properties, you need to specify a variable to # use for grouping ggplot(tg, aes(x=dose, y=length, group=supp)) + geom_line(colour="darkgreen", size=1.5) # Since supp is mapped to colour, it will automatically be used for grouping ggplot(tg, aes(x=dose, y=length, colour=supp)) + geom_line(linetype="dashed") + geom_point(shape=22, size=3, fill="white") #change the appearance of points ggplot(BOD, aes(x=Time, y=demand)) + geom_line() + geom_point(size=4, shape=22, colour="darkred", fill="pink") #The default shape for points is a solid circle, #the default size is 2, and the default colour is "black" # Load plyr so we can use ddply() to create the example data set library(plyr) # Summarize the ToothGrowth data tg <- ddply(ToothGrowth, c("supp", "dose"), summarise, length=mean(len)) # Save the position_dodge specification because we'll use it multiple times pd <- position_dodge(0.2) ggplot(tg, aes(x=dose, y=length, fill=supp)) + geom_line(position=pd) + geom_point(shape=21, size=3, position=pd) + scale_fill_manual(values=c("black","white")) #making a graph with a shaded area sunspotyear <- data.frame( Year = as.numeric(time(sunspot.year)), Sunspots = as.numeric(sunspot.year) ) ggplot(sunspotyear, aes(x=Year, y=Sunspots)) + geom_area() #change area appearance (with outline) ggplot(sunspotyear, aes(x=Year, y=Sunspots)) + geom_area(colour="black", fill="blue", alpha=.2) #no bottom line ggplot(sunspotyear, aes(x=Year, y=Sunspots)) + geom_area(fill="blue", alpha=.2) + geom_line() #making a stacked area graph library(gcookbook) ggplot(uspopage, aes(x=Year, y= Thousands,fill=AgeGroup)) + geom_area() #reverse the legend order ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup)) + geom_area(colour="black", size=.2, alpha=.4) + scale_fill_brewer(palette="Blues") + guides(fill=guide_legend(reverse=TRUE)) #or ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup)) + geom_area(colour="black", size=.2, alpha=.4) + scale_fill_brewer(palette="Blues", breaks=rev(levels(uspopage$AgeGroup))) #or change the order of fill library(plyr) # For the desc() function ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup, order=desc(AgeGroup))) + geom_area(colour="black", size=.2, alpha=.4) + scale_fill_brewer(palette="Blues") #remove the left and right sides ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup, order=desc(AgeGroup))) + geom_area(colour=NA, alpha=.4) + scale_fill_brewer(palette="Blues") + geom_line(position="stack", size=.2) #default position is "identity" #making a proportional stacked area graph #first calculate the proportions library(gcookbook) library(plyr) #convert to percent p <- ddply(uspopage,"Year",transform, Percent = Thousands/sum(Thousands)*100) ggplot(p,aes(x=Year, y= Percent, fill = AgeGroup)) + geom_area(colour = "black",size=.2,alpha = .4) + scale_fill_brewer(palette="Blues",breaks=rev(levels(p$AgeGroup))) #adding a confidence region library(gcookbook) clim <- subset(climate, Source == "Berkeley", select = c("Year","Anomaly10y","Unc10y")) #the same library(dplyr) clim <- climate %>% filter(Source == "Berkeley") %>% select(Year, Anomaly10y, Unc10y) #shaded region ggplot(clim, aes(x=Year,y= Anomaly10y)) + geom_ribbon(aes(ymin=Anomaly10y-Unc10y, ymax=Anomaly10y+Unc10y), alpha=0.2) + geom_line() #we need know the 95% confidence interval ahead # With a dotted line for upper and lower bounds ggplot(clim, aes(x=Year, y=Anomaly10y)) + geom_line(aes(y=Anomaly10y-Unc10y), colour="grey50", linetype="dotted") + geom_line(aes(y=Anomaly10y+Unc10y), colour="grey50", linetype="dotted") + geom_line() ####scatter plots #making a basic scatter plot ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point() #the default solid circles (shape #16) is hollow ones (#21) ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point(shape=21) ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point(size=1.5) #default value of size is 2 #grouping data points by a variable using shape or color library(gcookbook) #mapping color ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) + geom_point() #mapping shape ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex)) + geom_point() +geom_line(group=sex) ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex, colour=sex)) + geom_point() ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex, colour=sex)) + geom_point() + scale_shape_manual(values=c(1,2)) + scale_colour_brewer(palette="Set1") #Using different point shapes library(gcookbook) ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point(shape=3) # Use slightly larger points and use a shape scale with custom values ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex)) + geom_point(size=3) + scale_shape_manual(values=c(1, 4)) #Some of the point shapes (1–14) have just an outline, #some (15–20) are solid, and #some (21–25) have an outline and fill that can be controlled separately #shape represent one variable and #the fill (empty or solid) represent another variable # Make a copy of the data hw <- heightweight # Categorize into <100 and >=100 groups hw$weightGroup <- cut(hw$weightLb, breaks=c(-Inf, 100, Inf), labels=c("< 100", ">= 100")) ggplot(hw, aes(x=ageYear, y=heightIn, shape=sex, fill=weightGroup)) + geom_point(size=2.5) + scale_shape_manual(values=c(21, 24)) + scale_fill_manual(values=c(NA, "black"), guide=guide_legend(override.aes=list(shape=21))) #mapping a continuous variable to color or size library(gcookbook) ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=weightLb)) + geom_point() ggplot(heightweight, aes(x=ageYear, y=heightIn, size=weightLb)) + geom_point() #fill gradient to go from black to white #make the points larger ggplot(heightweight, aes(x=weightLb, y=heightIn, fill=ageYear)) + geom_point(shape=21, size=2.5) + scale_fill_gradient(low="black", high="white") # Using guide_legend() will result in a discrete legend instead of a colorbar ggplot(heightweight, aes(x=weightLb, y=heightIn, fill=ageYear)) + geom_point(shape=21, size=2.5) + scale_fill_gradient(low="black", high="white", breaks=12:17, guide=guide_legend()) ggplot(heightweight, aes(x=ageYear, y=heightIn, size=weightLb, colour=sex)) + geom_point(alpha=.5) + scale_colour_brewer(palette="Set1") + scale_size_area() # Make area proportional to numeric value #dealing with overplotting sp <- ggplot(diamonds, aes(x=carat, y=price)) sp + geom_point(alpha=0.1) sp + geom_point(alpha=.01) #bin the points into rectangle sp + stat_bin2d() sp + stat_bin2d(bins=50) + scale_fill_gradient(low="lightblue", high="red", limits=c(0, 6000)) library(hexbin) sp + stat_binhex() + scale_fill_gradient(low="lightblue", high="red", limits=c(0, 8000)) sp + stat_binhex() + scale_fill_gradient(low="lightblue", high="red", breaks=c(0, 250, 500, 1000, 2000, 4000, 6000), limits=c(0, 6000)) #one discrete axis and one continuous axis sp1 <- ggplot(ChickWeight, aes(x=Time, y=weight)) sp1 + geom_point() sp1 + geom_point(position="jitter") # Could also use geom_jitter(), which is equivalent sp1 + geom_point(position=position_jitter(width=.5, height=0)) #using boxplot sp1 + geom_boxplot(aes(group=Time)) #adding fitted regression model lines library(gcookbook) sp <- ggplot(heightweight, aes(x=ageYear, y= heightIn)) sp + geom_point() + stat_smooth(method = lm) #99% confidence region sp + geom_point() + stat_smooth(method = lm, level = 0.99) #No confidence region sp + geom_point() + stat_smooth(method=lm, se=F) sp + geom_point(color="grey60") + stat_smooth(method = lm, se=F, color="black") #the default method is locally weighted polynomial curve sp + geom_point(color="grey60") + stat_smooth(method=loess) #with logistic regressio curve library(MASS) # For the data set b <- biopsy b$classn[b$class=="benign"] <- 0 b$classn[b$class=="malignant"] <- 1 ggplot(b, aes(x=V1,y=classn)) + geom_point(position=position_jitter(width = .3,height=.06), alpha=.4, shape=21,size=1.5) + stat_smooth(method=glm, method.args = list(family="binomial")) #parameter is no longer used #group points with fitted line sps <- ggplot(heightweight, aes(x=ageYear, y=heightIn, color = sex)) + geom_point() + scale_color_brewer(palette = "Set1") sps + geom_smooth() sps + geom_smooth(method = lm, se = F, fullrange= T) #adding fitted lines from an existing model library(gcookbook) model <- lm(heightIn~ ageYear + I(ageYear^2), data = heightweight) xmin <- min(heightweight$ageYear) xmax <- max(heightweight$ageYear) predicted <- data.frame(ageYear = seq(xmin,xmax,length.out = 200)) predicted$heightIn <- predict(model, predicted) sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point(color = "grey40") sp + geom_line(data=predicted,size=1) # Given a model, predict values of yvar from xvar # This supports one predictor and one predicted variable # xrange: If NULL, determine the x range from the model object. If a vector with # two numbers, use those as the min and max of the prediction range. # samples: Number of samples across the x range. # ...: Further arguments to be passed to predict() predictvals <- function(model, xvar, yvar, xrange=NULL, samples = 100,...){ if (is.null(xrange)) { if (any(class(model) %in% c("lm","glm"))) xrange <- range(model$model[[xvar]]) else if (any(class(model) %in% "loess")) xrange <- range(model$x) } newdata <- data.frame(x=seq(xrange[1],xrange[2],length.out = samples)) names(newdata) <- xvar newdata[yvar] <- predict(model,newdata = newdata,...) newdata } modlinear <- lm(heightIn ~ ageYear, heightweight) modloess <- loess(heightIn ~ ageYear, heightweight) #predict lm_predicted <- predictvals(modlinear, "ageYear", "heightIn") loess_predicted <- predictvals(modloess, "ageYear", "heightIn") #multiple fitted lines sp + geom_line(data=lm_predicted, colour="red", size=.8) + geom_line(data=loess_predicted, colour="blue", size=.8) library(MASS) # For the data set b <- biopsy b$classn[b$class=="benign"] <- 0 b$classn[b$class=="malignant"] <- 1 fitlogistic <- glm(classn~V1,data = b, family = binomial) #get predicted values glm_predicted <- predictvals(fitlogistic, "V1","classn",type="response") ggplot(b,aes(x=V1, y=classn)) + geom_point(position = position_jitter(width = .3,height=.08),alpha=.4, shape=21,size=1.5) + geom_line(data=glm_predicted, color="#1177FF",size=1) #adding fitted lines from multiple exsiting models make_model <- function(data) { lm(heightIn ~ ageYear, data)} library(gcookbook) library(plyr) models <- dlply(heightweight,"sex",.fun=make_model) #For each subset of a data frame, apply function then combine results into a list. pred <- ldply(models,.fun = predictvals,xvar="ageYear",yvar="heightIn") #For each element of a list, apply function then combine results into a data frame ggplot(heightweight, aes(x=ageYear, y=heightIn, color=sex)) + geom_point() + geom_line(data=pred) #extend the range to the same range across all groups pred <- ldply(models, .fun=predictvals, xvar="ageYear", yvar="heightIn", xrange=range(heightweight$ageYear)) ggplot(heightweight, aes(x=ageYear, y=heightIn, color=sex)) + geom_point() + geom_line(data=pred) #adding annotations with model coefficients library(gcookbook) model <- lm(heightIn~ ageYear, heightweight) summary(model) #add r square value pred <- predictvals(model, "ageYear", "heightIn") sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point() + geom_line(data=pred) sp + annotate("text", label="r^2=0.42", x=16.5, y=52) #using r's math expression syntax sp + annotate("text", label="r^2=0.42",parse=TRUE, x=16.5, y=52) #create a string that when parsed, returns a valid expression eqn <- as.character(as.expression( substitute(italic(y) == a + b * italic(x) * "," ~~ italic(r)^2 ~ "=" ~ r2, list(a = format(coef(model)[1], digits=3), b = format(coef(model)[2], digits=3), r2 = format(summary(model)$r.squared, digits=2) )))) sp + annotate("text", label=eqn, parse=TRUE,x=Inf,y=-Inf,hjust=1.1,vjust=-.5) #adding marginal rugs to a scatter plot ggplot(faithful, aes(x=eruptions, y=waiting)) + geom_point() + geom_rug() #specify the size ggplot(faithful, aes(x=eruptions, y = waiting)) + geom_point() + geom_rug(position = "jitter",size=.2) #label points in a scatter plot library(gcookbook) s <- subset(countries, Year==2009 & healthexp > 2000) sp <- ggplot(s, aes(x=healthexp, y=infmortality)) + geom_point() sp + annotate("text",x=4350,y=5.4,label="Cannada") + annotate("text",x=7400,y=6.8,label="USA") #automatically add the labels sp + geom_text(aes(label=Name), size=4) sp + geom_text(aes(label=Name), size=4,vjust=0) sp + geom_text(aes(y=infmortality + 0.1, label = Name),size=4,vjust=0) sp + geom_text(aes(label = Name), size=4,hjust=0) sp + geom_text(aes(x=healthexp+100, label=Name), size=4, hjust=0) #label some of the points automatically cdat <- subset(countries, Year == 2009 & healthexp > 2009) cdat$Name1 <- cdat$Name idx <- cdat$Name1 %in% c("Canada", "Ireland", "United Kingdom", "United States", "New Zealand", "Iceland", "Japan", "Luxembourg", "Netherlands", "Switzerland") cdat$Name1[!idx] <- NA ggplot(cdat, aes(x=healthexp, y=infmortality)) + geom_point() + geom_text(aes(x=healthexp+100, label=Name1),size=4,hjust=0) + xlim(2000,10000) #creating a balloon plot library(gcookbook) # For the data set cdat <- subset(countries, Year==2009 & Name %in% c("Canada", "Ireland", "United Kingdom", "United States", "New Zealand", "Iceland", "Japan", "Luxembourg", "Netherlands", "Switzerland")) p <- ggplot(cdat, aes(x=healthexp, y=infmortality,size=GDP)) + geom_point(shape=21,color="black",fill="cornsilk") #GDP mapped to redius p#GDP mapped to area instead, and larger circles p + scale_size_area(max_size=15) #add up counts for male and female hec <- HairEyeColor[,,"Male"] + HairEyeColor[,,"Female"] #convert to long format library(reshape2) hec <- melt(hec, value.name="count") ggplot(hec, aes(x=Eye,y=Hair)) + geom_point(aes(size=count),shape=21,color="black",fill="cornsilk") + scale_size_area(max_size = 20,guid=F) + geom_text(aes(y=as.numeric(Hair)-sqrt(count)/22,label=count), color="grey60",size=4) #making a scatter plot matrix library(gcookbook) c2009 <- subset(countries, Year==2009, select=c(Name, GDP, laborrate, healthexp, infmortality)) pairs(c2009[,2:5]) panel.cor <- function(x, y, digits=2, prefix="", cex.cor, ...) { usr <- par("usr") on.exit(par(usr)) par(usr = c(0, 1, 0, 1)) r <- abs(cor(x, y, use="complete.obs")) txt <- format(c(r, 0.123456789), digits=digits)[1] txt <- paste(prefix, txt, sep="") if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt) text(0.5, 0.5, txt, cex = cex.cor * (1 + r) / 2) } panel.hist <- function(x, ...) { usr <- par("usr") on.exit(par(usr)) par(usr = c(usr[1:2], 0, 1.5) ) h <- hist(x, plot = FALSE) breaks <- h$breaks nB <- length(breaks) y <- h$counts y <- y/max(y) rect(breaks[-nB], 0, breaks[-1], y, col="white", ...) } pairs(c2009[,2:5], upper.panel = panel.cor, diag.panel = panel.hist, lower.panel = panel.smooth) panel.lm <- function (x, y, col = par("col"), bg = NA, pch = par("pch"), cex = 1, col.smooth = "black", ...) { points(x, y, pch = pch, col = col, bg = bg, cex = cex) abline(stats::lm(y ~ x), col = col.smooth, ...) } pairs(c2009[,2:5], pch=".", upper.panel = panel.cor, diag.panel = panel.hist, lower.panel = panel.lm) #########summarized data distributions #making a basic histogram ggplot(faithful, aes(x=waiting)) + geom_histogram() # Store the values in a simple vector w <- faithful$waiting ggplot(NULL, aes(x=w)) + geom_histogram() #set the width of each bin to 5 ggplot(faithful, aes(x=waiting)) + geom_histogram(binwidth = 5, fill ="white",color="black") #divide the x range into 15 bins binsize <- diff(range(faithful$waiting))/15 ggplot(faithful,aes(x=waiting)) + geom_histogram(binwidth = binsize,fill="white",color="black") #change the origin h <- ggplot(faithful, aes(x=waiting)) # Save the base object for reuse h + geom_histogram(binwidth=8, fill="white", colour="black", origin=31) h + geom_histogram(binwidth=8, fill="white", colour="black", origin=35) #the same with bar h + geom_bar(stat="bin") #making multiple histograms from grouped data library(MASS) #use smoke as the faceting variable ggplot(birthwt, aes(x=bwt)) + geom_histogram(fill="white", color="black") + facet_grid(smoke~.) #change the label bir <- birthwt #convert smoke to a factor bir$smoke <- factor(bir$smoke) library(plyr) bir$smoke <- revalue(bir$smoke, c("0"="No Smoke","1"="Smoke")) #same result bir$smoke <- factor(bir$smoke,labels=c("No Smoke","Smoke")) ggplot(bir,aes(x=bwt)) + geom_histogram(fill="white",color="black") + facet_grid(smoke~.) #when size are different ggplot(bir, aes(x=bwt)) + geom_histogram(fill="white", color="black") + facet_grid(race~.) #resize independently ggplot(bir, aes(x=bwt)) + geom_histogram(fill="white",color="black") + facet_grid(race~.,scales = "free") #scales is set "free" #another approach is to map the grouping variable to fill ggplot(bir, aes(x=bwt,fill=smoke)) + geom_histogram(position="identity",alpha=0.4) #without position="identity", ggplot will stack the histogram bars on top of each other #making a density curve ggplot(faithful, aes(x=waiting)) + geom_density() #remove the side and bottom ggplot(faithful, aes(x=waiting)) + geom_line(stat="density") + expand_limits(y=0) #the larger the bandwidth, the more smoothing there is ggplot(faithful,aes(x=waiting)) + geom_line(stat="density",adjust=.25,color="red") + geom_line(stat="density") + geom_line(stat="density",adjust=2,color="blue") #set the area color ggplot(faithful,aes(x=waiting)) + geom_density(fill="blue",alpha=.2) + xlim(35,105) ggplot(faithful,aes(x=waiting)) + geom_density(fill="blue",alpha=.2,color=NA) + geom_line(stat="density") + xlim(35,105) #compare the theoretical and obsearved distributions ggplot(faithful, aes(x=waiting, y=..density..)) + geom_histogram(fill="cornsilk", colour="grey60", size=.2) + geom_density() + xlim(35, 105) #making multiple density curves from grouped data library(MASS) bir <- birthwt bir$smoke <- factor(bir$smoke) ggplot(bir,aes(x=bwt,color=smoke)) + geom_density() #mapping smoke to fill ggplot(bir,aes(x=bwt,fill=smoke)) + geom_density(alpha=.3) ggplot(bir, aes(x=bwt)) + geom_density() + facet_grid(smoke~.) library(plyr) # For the revalue function birthwt1 <- birthwt birthwt1$smoke <- factor(birthwt1$smoke) birthwt1$smoke <- revalue(birthwt1$smoke, c("0"="No Smoke", "1"="Smoke")) ggplot(birthwt1, aes(x=bwt)) + geom_density() + facet_grid(smoke ~ .) ggplot(birthwt1, aes(x=bwt, y=..density..)) + geom_histogram(binwidth=200, fill="cornsilk", colour="grey60", size=.2) + geom_density() + facet_grid(smoke ~ .) #making a frequency polygon ggplot(faithful, aes(x=waiting)) + geom_freqpoly() ggplot(faithful, aes(x=waiting)) + geom_freqpoly(binwidth=4) #use 15 bins binsize <- diff(range(faithful$waiting))/15 ggplot(faithful, aes(x=waiting)) + geom_freqpoly(binwidth=binsize) #making a basic box plot library(MASS) ggplot(birthwt, aes(x=factor(race), y=bwt)) + geom_boxplot() # Use factor() to convert numeric variable to discrete #change width ggplot(birthwt,aes(x=factor(race),y=bwt)) + geom_boxplot(width=.5) #default size is 2, default shape is 16 ggplot(birthwt,aes(x=factor(race),y=bwt)) + geom_boxplot(outlier.size = 1.5,outlier.shape = 21) ggplot(birthwt,aes(x=1,y=bwt)) + geom_boxplot() + scale_x_continuous(breaks = NULL) + theme(axis.title.x=element_blank()) #adding natches to a box plot library(MASS) ggplot(birthwt,aes(x=factor(race),y=bwt)) + geom_boxplot(notch = TRUE) #adding means to a box plot library(MASS) ggplot(birthwt, aes(factor(race),y=bwt)) + geom_boxplot() + stat_summary(fun.y = "mean",geom = "point",shape=23,size=3,fill="white") #making a violin plot library(gcookbook) p <- ggplot(heightweight,aes(x=sex,y=heightIn)) p + geom_violin() #with narrows boxplot p + geom_violin() + geom_boxplot(width=.1,fill="black",outlier.colour = NA) + stat_summary(fun.y = median,geom="point",fill="white",shape=21,size=2.5) #keep the tails p + geom_violin(trim=FALSE) #making a dot plot library(gcookbook) c2009 <- subset(countries, Year==2009 & healthexp > 2000) p <- ggplot(c2009,aes(x=infmortality)) p + geom_dotplot() p + geom_dotplot(binwidth=.25) + geom_rug() + scale_y_continuous(breaks=NULL) + theme(axis.title.y=element_blank()) #making multiple dot plots for grouped data ggplot(heightweight, aes(x=sex, y=heightIn)) + geom_dotplot(binaxis="y", binwidth=.5, stackdir="center") #making a desity plot of two dimensional data p <- ggplot(faithful,aes(x=eruptions,y=waiting)) p + geom_point() + stat_density2d() #map the height of the density to color p + stat_density2d(aes(color=..level..)) #map denstity estimate to fill color p + stat_density2d(aes(fill=..density..),geom="raster",contour = F) #with points and map desity estimate to alpha p + geom_point() + stat_density2d(aes(alpha=..density..),geom="tile",contour=F) #tile and raster look like the same #marginal rug added to a scatter plot p + stat_density2d(aes(fill=..density..),geom="raster", contour=F,h=c(.5,5)) #############annotations #adding text annotations p <- ggplot(faithful,aes(x=eruptions,y=waiting)) + geom_point() p + annotate(geom="text",x=3,y=48,label = "Group 1") + annotate("text",x=4.5,y=66,label = "Group 2") p + annotate("text",x=3,y=48,label="Group 1",family="serif", fontface="italic",color="darkred",size=3) + annotate("text",x=4.5,y=66,label="Group 2",family="serif", fontface="italic",color="darkred",size=3) annotate("text",x=3,y=48,label="Group 1",family="serif", fontface="italic",color="darkred",size=3) p + annotate("text", x=3, y=48, label="Group 1", alpha=.1) + # Normal geom_text(x=4.5, y=66, label="Group 2", alpha=.1) # Overplotted #for continuous axes, use Inf and -Inf to place text annotations at the #edge of the plotting area p + annotate("text",x=-Inf,y=Inf,label="Upper left",hjust=-.2,vjust=2) + annotate("text",x=mean(range(faithful$eruptions)),y=-Inf,vjust=-.4,label="Bottom middle") #for hjust and vjust position is minus the value #using mathematical expression in annotations #a normal curve p <- ggplot(data.frame(x=c(-3,3)), aes(x=x)) + stat_function(fun=dnorm) p + annotate("text",x=2,y=0.3,parse=TRUE, label="frac(1,sqrt(2*pi))*e^{-x^2/2}") #use single quotes within double quotes (or vice versa) to mark the plain-text parts #put a * operator between them p + annotate("text",x=0,y=.05,parse=TRUE,size=4, label="'Function: '*y==frac(1,sqrt(2*pi))*e^{-x^2/2}") #?plotmath for many examples of mathematical expressions, and #?demo(plot math) for graphical examples of mathematical expressions #adding lines library(gcookbook) p <- ggplot(heightweight,aes(x=ageYear,y=heightIn,color=sex)) + geom_point() #adding horizongtal and vertical lines p + geom_hline(yintercept = 60) + geom_vline(xintercept = 14) p + geom_abline(intercept = 37.4, slope = 1.75) library(plyr) hw_means <- ddply(heightweight,"sex",summarise,heightIn=mean(heightIn)) p + geom_hline(aes(yintercept=heightIn,color=sex),data=hw_means, linetype="dashed",size=1) #for discrete variable pg <- ggplot(PlantGrowth,aes(x=group,y=weight)) + geom_point() pg + geom_vline(xintercept = 2) pg + geom_vline(xintercept = which(levels(PlantGrowth$group) == "ctrl")) #adding line segments and arrows library(gcookbook) p <- ggplot(subset(climate, Source=="Berkeley"), aes(x=Year, y=Anomaly10y)) + geom_line() p + annotate("segment", x=1950, xend=1980, y=-.25, yend=-.25)