set.seed(1410)

dsmall <- diamonds[sample(nrow(diamonds), 100),]


#scatterplot
qplot(carat, price, data = diamonds)
qplot(log(carat), log(price), data = diamonds)

qplot(carat, x*y*z, data = diamonds)
qplot(carat, price, data = dsmall, colour = color)
qplot(carat, price, data = dsmall, shape = cut)

qplot(carat, price, data = diamonds, alpha = I(1/10))
qplot(carat, price, data = diamonds, alpha = I(1/100))
qplot(carat, price, data = diamonds, alpha = I(1/200))


#geom
qplot(carat, price, data = dsmall, geom = c("point", "smooth"))
qplot(carat, price, data = diamonds, geom = c("point", "smooth"))


qplot(color, price/carat, data=diamonds,geom = "jitter")
qplot(color, price/carat, data=diamonds,geom = "boxplot")
qplot(color, price / carat, data = diamonds, geom = "jitter",
      alpha = I(1 / 50))

#offer size, colour, shape for scatterplot
#offer size, colour, fill for boxplot
qplot(color, price/carat, data=diamonds,geom = "boxplot",colour="red")


qplot(carat, data = diamonds, geom = "histogram")
qplot(carat, data = diamonds, geom = "density")


qplot(carat, data = diamonds, geom = "histogram", binwidth = 1,
      xlim = c(0,3))
qplot(carat, data = diamonds, geom = "histogram", binwidth = 0.1,
      xlim = c(0,3))
qplot(carat, data = diamonds, geom = "histogram", binwidth = 0.01,
      xlim = c(0,3))

qplot(carat, data = diamonds, geom = "density", colour = color)
qplot(carat, data = diamonds, geom = "histogram", fill = color, bins = 50)

qplot(color, data = diamonds, geom = "bar")
qplot(color, data = diamonds, geom = "bar", weight = carat) +
  scale_y_continuous("carat") #total weight


qplot(date, unemploy / pop, data = economics, geom = "line")
qplot(date, uempmed, data = economics, geom = "line")

#path
year <- function(x) as.POSIXlt(x)$year + 1900
qplot(unemploy / pop, uempmed, data = economics,
      geom = c("point", "path"))
qplot(unemploy / pop, uempmed, data = economics,
      geom = "path", colour = year(date)) + scale_area()

#faceting
# row var ∼ col var
qplot(carat, data = diamonds, facets = color ~ .,
      geom = "histogram", binwidth = 0.1, xlim = c(0, 3))
qplot(carat, ..density.., data = diamonds, facets = color ~ .,
      geom = "histogram", binwidth = 0.1, xlim = c(0, 3))
#Using ..density.. tells ggplot2 to map the density to the y-axis 
#instead of the default use of count
#density plot makes it easier to compare distributions ignoring 
#the relative abundance of diamonds within each colour


#see ?plotmath for more examples of using mathematical formulae

qplot(
  carat, price, data = dsmall,
  xlab = "Price ($)", ylab = "Weight (carats)",
  main = "Price-weight relationship" )


qplot(
  carat, price/carat, data = dsmall,
  xlab = "Weight(carats)", ylab = "expression(frac(price,carat))",
  main = "Small diamonds",
  xlim = c(.2, 1))

qplot(carat, price, data = dsmall, log = "xy")

qplot(displ, hwy, data = mpg, colour = factor(cyl)) #factor

#ggplot
p <- ggplot(diamonds, aes(x = carat))
p <- p + layer(
  geom = "bar",
  geom_params = list(fill = "steelblue"),
  stat = "bin",
  stat_params = list(binwidth = 2)
)
#layer(geom, geom_params, stat, stat_params, data, mapping, position)
ggplot(msleep, aes(sleep_rem / sleep_total, awake)) +
  geom_point()

#geom_XXX(mapping, data, ..., geom, position)
#stat_XXX(mapping, data, ..., stat, position)
p <- ggplot(mtcars, aes(mpg, wt)) +
  geom_point()

p + geom_point(aes(colour = factor(cyl)))
p + geom_point(aes(y=disp))

p <- ggplot(mtcars, aes(mpg, wt))
p + geom_point(colour = "darkblue")
p + geom_point(aes(colour = "darkblue"))

p + geom_point(aes(colour = factor(cyl)))

p <- ggplot(Oxboys, aes(age, height, group = Subject)) + geom_line()

p + geom_smooth(aes(group = Subject), method = "lm", se = F)

p + geom_smooth(aes(group = 1), method = "lm", se = F)

boysbox <- ggplot(Oxboys, aes(Occasion, height)) + geom_boxplot()

boysbox + geom_line(aes(group=Subject), color=I(8))

xgrid <- with(df, seq(min(x), max(x), length=50))

p <- ggplot(diamonds, aes(carat)) +
  geom_histogram(aes(y=..count..), binwidth = 0.1)
p <- ggplot(diamonds, aes(carat)) +
  geom_histogram(aes(y=..density..), binwidth = 0.1)
qplot(carat, ..density.., data=diamonds, geom = "histogram",binwidth = 0.1)

d <- ggplot(diamonds,aes(carat)) + xlim(0, 3)
d + stat_bin(aes(ymax = ..count..), binwidth = 0.1, geom = "area")
d + stat_bin(aes(size = ..density..), binwidth = 0.1, geom = "point", position = "identity")
d + stat_bin(aes(y = 1, fill = ..count..), binwidth = 0.1,geom = "tile", position = "identity")

#graph cook book
library(gcookbook)
library(ggplot2)
#chapte 2 quickly exploring data

#creating a scatter plot
plot(mtcars$wt, mtcars$mpg)
qplot(mtcars$wt, mtcars$mpg)
qplot(wt, mpg, data=mtcars)
# This is equivalent to:
ggplot(mtcars, aes(x=wt, y=mpg)) + geom_point()

#creating line plot
plot(pressure$temperature, pressure$pressure, type="l")

qplot(pressure$temperature, pressure$pressure, geom="line")
qplot(temperature, pressure, data=pressure, geom="line")
# This is equivalent to:
ggplot(pressure, aes(x=temperature, y=pressure)) + geom_line()
# Lines and points together
qplot(temperature, pressure, data=pressure, geom=c("line", "point"))
# Equivalent to:
ggplot(pressure, aes(x=temperature, y=pressure)) + geom_line() + geom_point()


#creating bar graph
barplot(BOD$demand, names.arg=BOD$Time) #name

# Generate a table of countsqplot(supp, len, data=ToothGrowth, geom="boxplot")
# This is equivalent to:
ggplot(ToothGrowth, aes(x=supp, y=len)) + geom_boxplot()
barplot(table(mtcars$cyl))
library(ggplot2)
ggplot(BOD, aes(x=factor(Time), y=demand)) + geom_bar(stat="identity")
# Bar graph of counts
qplot(factor(cyl), data=mtcars)
# This is equivalent to:
ggplot(mtcars, aes(x=factor(cyl))) + geom_bar()

#creating histogram
# Specify approximate number of bins with breaks
hist(mtcars$mpg, breaks=10)
library(ggplot2)
qplot(mpg, data=mtcars, binwidth=1)
# This is equivalent to:
ggplot(mtcars, aes(x=mpg)) + geom_histogram(binwidth=1)

#creating box plot
# Formula syntax
boxplot(len ~ supp, data = ToothGrowth)
library(ggplot2)
qplot(supp, len, data=ToothGrowth, geom="boxplot")
# This is equivalent to:
ggplot(ToothGrowth, aes(x=supp, y=len)) + geom_boxplot()
ggplot(ToothGrowth, aes(x=interaction(supp, dose), y=len)) + geom_boxplot()

#plot a function curve
curve(x^3 - 5*x, from=-4, to=4)
# Plot a user-defined function
myfun <- function(xvar) {
  1/(1 + exp(-xvar + 10))
}
curve(myfun(x), from=0, to=20)
# Add a line:
curve(1-myfun(x), add = TRUE, col = "red")
library(ggplot2)
# This is equivalent to:
ggplot(data.frame(x=c(0, 20)), aes(x=x)) + stat_function(fun=myfun, geom="line")


#chapter 3 Bar graphs
ggplot(pg_mean, aes(x=group, y=weight)) + geom_bar(stat="identity")

ggplot(pg_mean, aes(x=group, y=weight)) +
  geom_bar(stat="identity", fill="lightblue", colour="black")


#group by a second variable
ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
  geom_bar(position="dodge", colour="black",stat="identity") #default stat is count

#stat="identity" means using sum

ggplot(diamonds, aes(x=cut)) + geom_bar(stat = "count") #default stat is count


upc <- subset(uspopchange, rank(Change)>40)
ggplot(upc, aes(x=Abb, y=Change, fill=Region)) + geom_bar(stat="identity")

ggplot(upc, aes(x=reorder(Abb, Change), y=Change, fill=Region)) +
  geom_bar(stat="identity", colour="black") +
  scale_fill_manual(values=c("#669933", "#FFCC66")) +
  xlab("State")

#using the reorder() function to reorder the levels of a factor based on
#the values of another variable


#Coloring Negative and Positive Bars Differently
csub <- subset(climate, Source=="Berkeley" & Year >= 1900)
csub$pos <- csub$Anomaly10y >= 0
ggplot(csub, aes(x=Year, y=Anomaly10y, fill=pos)) +
  geom_bar(stat="identity", position="identity") #default position is stack

ggplot(csub, aes(x=Year, y=Anomaly10y, fill=pos)) +
  geom_bar(stat="identity", position="identity", colour="black", size=0.25) +
  scale_fill_manual(values=c("#CCEEFF", "#FFDDDD"), guide=FALSE)


#Adjusting Bar Width and Spacing
#default value is 0.9
ggplot(pg_mean, aes(x=group, y=weight)) + geom_bar(stat="identity", width=0.5)
#a grouped bar graph with narrow bars
ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
  geom_bar(stat="identity", width=0.5, position="dodge")
#with some space between the bars
ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
  geom_bar(stat="identity", width=0.6, position=position_dodge(0.8)) #space = 0.8-0.6
#To be more precise, the value of width in position_dodge() is the same as
#width in geom_bar()


#Making a Stacked Bar Graph
ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
  geom_bar(stat="identity")
#One problem with the default output is that the stacking order is the opposite of the
#order of items in the legend
#reverse the legend
ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
  geom_bar(stat="identity") +
  guides(fill=guide_legend(reverse=TRUE))
#reverse the stacking order
libary(plyr) # Needed for desc()
ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar, order=desc(Cultivar))) +
  geom_bar(stat="identity")
#get a different color palette
ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
  geom_bar(stat="identity", colour="black") +
  guides(fill=guide_legend(reverse=TRUE)) +
  scale_fill_brewer(palette="Pastel1")


#Making a Proportional Stacked Bar Graph
library(gcookbook)
library(plyr)
ce <- ddply(cabbage_exp,"Date",transform,
            percent_weight = Weight / sum(Weight)*100)

ggplot(ce, aes(x=Date, y= percent_weight, fill =Cultivar)) +
  geom_bar(stat="identity")
#change a little bit
ggplot(ce, aes(x=Date, y= percent_weight, fill =Cultivar)) +
  geom_bar(stat="identity", color="black") +
  guides(fill=guide_legend(reverse = TRUE)) +
  scale_fill_brewer(palette = "Pastel1")

#adding labels to a bar graph
# Below the top
ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) +
  geom_bar(stat="identity") +
  geom_text(aes(label=Weight), vjust=1.5, colour="white")

# Above the top
ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) +
  geom_bar(stat="identity") +
  geom_text(aes(label=Weight), vjust=-0.2)

#adjust y limits to be a little higher
ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) +
  geom_bar(stat="identity") +
  geom_text(aes(label=Weight), vjust=-0.2) +
  ylim(0, max(cabbage_exp$Weight) * 1.05)

# Map y positions slightly above bar top - y range of plot will auto-adjust
ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) +
  geom_bar(stat="identity") +
  geom_text(aes(y=Weight+.1, label=Weight))

#dodging
ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) +
  geom_bar(stat="identity", position="dodge") +
  geom_text(aes(label=Weight), vjust=1.5, colour="white",
            position=position_dodge(0.9), size=3)


#label on stack bar
library(plyr)
# Sort by the day and sex columns
ce <- arrange(cabbage_exp, Date, Cultivar)
#get the cumulative sum
ce <- ddply(ce, "Date", transform, label_y=cumsum(Weight))

ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) +
  geom_bar(stat="identity") +
  geom_text(aes(y=label_y, label=Weight), vjust=1.5, colour="white") +
  guides(fill=guide_legend(reverse=TRUE))

#put the label in the center
ce <- arrange(cabbage_exp, Date, Cultivar)
# Calculate y position, placing it in the middle
ce <- ddply(ce, "Date", transform, label_y=cumsum(Weight)-0.5*Weight) #good
ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) +
  geom_bar(stat="identity") +
  geom_text(aes(y=label_y, label=Weight), colour="white")

#polish
ggplot(ce, aes(x=Date, y=Weight, fill=Cultivar)) +
  geom_bar(stat="identity") +
  geom_text(aes(y=label_y, label=paste(format(Weight, nsmall=2), "kg")),
            size=4) +
  guides(fill=guide_legend(reverse=TRUE)) + scale_fill_brewer(palette="Pastel1")
#coord_flip()

#making a Cleveland Dot Plot
library(gcookbook)
tophit <- tophitters2001[1:25,] 

ggplot(tophit, aes(x=avg, y=name)) + geom_point()

ggplot(tophit, aes(x=avg, y=reorder(name, avg))) + 
  geom_point(size=3) + # Use a larger dot 
  theme_bw() +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid.major.y = element_line(colour="grey60", linetype="dashed"))

ggplot(tophit, aes(x=reorder(name, avg), y=avg)) + 
  geom_point(size=3) + # Use a larger dot 
  theme_bw() +
  theme(axis.text.x = element_text(angle=60, hjust=1),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank(),
        panel.grid.major.x = element_line(colour="grey60", linetype="dashed"))

#reorder onlu by one variable
#order by two variables
# Get the names, sorted first by lg, then by avg
nameorder <- tophit$name[order(tophit$lg, tophit$avg)]
# Turn name into a factor, with levels in the order of nameorder
tophit$name <- factor(tophit$name, levels=nameorder)


ggplot(tophit, aes(x=avg, y=name)) +
  geom_segment(aes(yend=name), xend=0, colour="grey50") + 
  geom_point(size=3, aes(colour=lg)) + 
  scale_colour_brewer(palette="Set1", limits=c("NL","AL")) + #limits order the posion
  theme_bw() +
  theme(panel.grid.major.y = element_blank(), # No horizontal grid lines
        legend.position=c(1, 0.55), # Put legend inside plot area 
        legend.justification=c(1, 0.5))
        
ggplot(tophit, aes(x=avg, y=name)) +
  geom_segment(aes(yend=name), xend=0, colour="grey50") +
  geom_point(size=3, aes(colour=lg)) +
  scale_colour_brewer(palette="Set1", limits=c("NL","AL"), guide=FALSE) + 
  theme_bw() +
  theme(panel.grid.major.y = element_blank()) +
  facet_grid(lg ~ ., scales="free_y", space="free_y")


#Making a Basic Line Graph
ggplot(BOD, aes(x=Time, y=demand)) + geom_line()

d1 <- BOD
d1$Time <- factor(d1$Time)
ggplot(d1, aes(x=Time, y=demand,group=1)) +geom_line()
#add y limit
ggplot(BOD, aes(x=Time, y=demand)) + geom_line() + ylim(0, max(BOD$demand))
ggplot(BOD, aes(x=Time, y=demand)) + geom_line() + expand_limits(y=0) #same

#adding points to a line graph
ggplot(BOD, aes(x=Time, y=demand)) + geom_line() + geom_point()

library(gcookbook)
ggplot(worldpop, aes(x=Year, y=Population)) +
  geom_line() +
  geom_point()

#with a log y-axis
ggplot(worldpop, aes(x=Year, y= Population)) +
  geom_line() +
  geom_point() +
  scale_y_log10()

#making a line graph with multiple lines
#summarize the data
library(plyr)
tg <- ddply(ToothGrowth, c("supp","dose"),summarise, length=mean(len))

#map supp to colour
ggplot(tg, aes(x=dose,y=length,colour=supp)) + geom_line()

#map supp to linetype
ggplot(tg, aes(x=dose, y=length, linetype=supp)) + geom_line()

#factor + group (line point together)
ggplot(tg, aes(x=factor(dose), y=length,colour=supp,group=supp)) + geom_line()
# Make the points a little larger
ggplot(tg, aes(x=dose, y=length, shape=supp)) + geom_line() + geom_point(size=4) 
# Also use a point with a color fill
ggplot(tg, aes(x=dose, y=length, fill=supp)) + geom_line() + geom_point(size=4, shape=21) 
#dodge them
ggplot(tg, aes(x=dose, y=length, shape=supp)) + 
  geom_line(position=position_dodge(0.2)) + # Dodge lines by 0.2 
  geom_point(position=position_dodge(0.2), size=4) # Dodge points by 0.2

#change the apperance of lines
ggplot(BOD, aes(x=Time, y=demand)) +
  geom_line(linetype = "dashed", size = 1, color = "blue")

# Load plyr so we can use ddply() to create the example data set
library(plyr)
# Summarize the ToothGrowth data
tg <- ddply(ToothGrowth, c("supp", "dose"), summarise, length=mean(len))
ggplot(tg, aes(x=dose, y=length, colour=supp)) +
  geom_line() +
  scale_colour_brewer(palette="Set1")
# If both lines have the same properties, you need to specify a variable to # use for grouping
ggplot(tg, aes(x=dose, y=length, group=supp)) +
  geom_line(colour="darkgreen", size=1.5)
# Since supp is mapped to colour, it will automatically be used for grouping
ggplot(tg, aes(x=dose, y=length, colour=supp)) +
  geom_line(linetype="dashed") +
  geom_point(shape=22, size=3, fill="white")

#change the appearance of points
ggplot(BOD, aes(x=Time, y=demand)) +
  geom_line() +
  geom_point(size=4, shape=22, colour="darkred", fill="pink")
#The default shape for points is a solid circle, 
#the default size is 2, and the default colour is "black"

# Load plyr so we can use ddply() to create the example data set
library(plyr)
# Summarize the ToothGrowth data
tg <- ddply(ToothGrowth, c("supp", "dose"), summarise, length=mean(len))
# Save the position_dodge specification because we'll use it multiple times
pd <- position_dodge(0.2)
ggplot(tg, aes(x=dose, y=length, fill=supp)) +
  geom_line(position=pd) +
  geom_point(shape=21, size=3, position=pd) +
  scale_fill_manual(values=c("black","white"))


#making a graph with a shaded area
sunspotyear <- data.frame(
  Year     = as.numeric(time(sunspot.year)),
  Sunspots = as.numeric(sunspot.year)
)
ggplot(sunspotyear, aes(x=Year, y=Sunspots)) + geom_area()
#change area appearance (with outline)
ggplot(sunspotyear, aes(x=Year, y=Sunspots)) +
  geom_area(colour="black", fill="blue", alpha=.2)
#no bottom line
ggplot(sunspotyear, aes(x=Year, y=Sunspots)) +
  geom_area(fill="blue", alpha=.2) +
  geom_line()

#making a stacked area graph
library(gcookbook)
ggplot(uspopage, aes(x=Year, y= Thousands,fill=AgeGroup)) +
  geom_area()

#reverse the legend order
ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup)) +
  geom_area(colour="black", size=.2, alpha=.4) +
  scale_fill_brewer(palette="Blues") +
  guides(fill=guide_legend(reverse=TRUE))
#or
ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup)) +
  geom_area(colour="black", size=.2, alpha=.4) +
  scale_fill_brewer(palette="Blues", breaks=rev(levels(uspopage$AgeGroup)))


#or change the order of fill
library(plyr) # For the desc() function
ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup, order=desc(AgeGroup))) +
  geom_area(colour="black", size=.2, alpha=.4) +
  scale_fill_brewer(palette="Blues")


#remove the left and right sides
ggplot(uspopage, aes(x=Year, y=Thousands, fill=AgeGroup, order=desc(AgeGroup))) + 
  geom_area(colour=NA, alpha=.4) +
  scale_fill_brewer(palette="Blues") +
  geom_line(position="stack", size=.2) #default position is "identity"


#making a proportional stacked area graph
#first calculate the proportions
library(gcookbook)
library(plyr)
#convert to percent
p <- ddply(uspopage,"Year",transform, 
           Percent = Thousands/sum(Thousands)*100)
ggplot(p,aes(x=Year, y= Percent, fill = AgeGroup)) +
  geom_area(colour = "black",size=.2,alpha = .4) +
  scale_fill_brewer(palette="Blues",breaks=rev(levels(p$AgeGroup)))

#adding a confidence region
library(gcookbook)
clim <- subset(climate, Source == "Berkeley",
               select = c("Year","Anomaly10y","Unc10y"))
#the same
library(dplyr)
clim <- climate %>% filter(Source == "Berkeley") %>% select(Year, Anomaly10y, Unc10y)
#shaded region
ggplot(clim, aes(x=Year,y= Anomaly10y)) +
  geom_ribbon(aes(ymin=Anomaly10y-Unc10y, ymax=Anomaly10y+Unc10y),
              alpha=0.2) +
  geom_line()
#we need know the 95% confidence interval ahead

# With a dotted line for upper and lower bounds
ggplot(clim, aes(x=Year, y=Anomaly10y)) +
  geom_line(aes(y=Anomaly10y-Unc10y), colour="grey50", linetype="dotted") +
  geom_line(aes(y=Anomaly10y+Unc10y), colour="grey50", linetype="dotted") +
  geom_line()


####scatter plots
#making a basic scatter plot
ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point()
#the default solid circles (shape #16) is hollow ones (#21)
ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point(shape=21)
ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point(size=1.5)
#default value of size is 2

#grouping data points by a variable using shape or color
library(gcookbook)
#mapping color
ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) + geom_point()
#mapping shape
ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex)) + geom_point() +geom_line(group=sex)


ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex, colour=sex)) +
  geom_point()

ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex, colour=sex)) +
  geom_point() +
  scale_shape_manual(values=c(1,2)) +
  scale_colour_brewer(palette="Set1")

#Using different point shapes
library(gcookbook)
ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point(shape=3)

# Use slightly larger points and use a shape scale with custom values
ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex)) +
  geom_point(size=3) + scale_shape_manual(values=c(1, 4))
#Some of the point shapes (1–14) have just an outline, 
#some (15–20) are solid, and 
#some (21–25) have an outline and fill that can be controlled separately

#shape represent one variable and 
#the fill (empty or solid) represent another variable
# Make a copy of the data
hw <- heightweight
# Categorize into <100 and >=100 groups
hw$weightGroup <- cut(hw$weightLb, breaks=c(-Inf, 100, Inf),
                      labels=c("< 100", ">= 100"))
ggplot(hw, aes(x=ageYear, y=heightIn, shape=sex, fill=weightGroup)) +
  geom_point(size=2.5) + scale_shape_manual(values=c(21, 24)) + 
  scale_fill_manual(values=c(NA, "black"),
                    guide=guide_legend(override.aes=list(shape=21)))

#mapping a continuous variable to color or size
library(gcookbook)
ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=weightLb)) + geom_point()
ggplot(heightweight, aes(x=ageYear, y=heightIn, size=weightLb)) + geom_point()

#fill gradient to go from black to white
#make the points larger
ggplot(heightweight, aes(x=weightLb, y=heightIn, fill=ageYear)) +
  geom_point(shape=21, size=2.5) +
  scale_fill_gradient(low="black", high="white")
# Using guide_legend() will result in a discrete legend instead of a colorbar
ggplot(heightweight, aes(x=weightLb, y=heightIn, fill=ageYear)) +
  geom_point(shape=21, size=2.5) +
  scale_fill_gradient(low="black", high="white", breaks=12:17,
                      guide=guide_legend())

ggplot(heightweight, aes(x=ageYear, y=heightIn, size=weightLb, colour=sex)) + 
  geom_point(alpha=.5) +
  scale_colour_brewer(palette="Set1") +
  scale_size_area()  # Make area proportional to numeric value

#dealing with overplotting
sp <- ggplot(diamonds, aes(x=carat, y=price))
sp + geom_point(alpha=0.1)
sp + geom_point(alpha=.01)
#bin the points into rectangle
sp + stat_bin2d()
sp + stat_bin2d(bins=50) +
  scale_fill_gradient(low="lightblue", high="red", limits=c(0, 6000))
library(hexbin)
sp + stat_binhex() +
  scale_fill_gradient(low="lightblue", high="red",
                      limits=c(0, 8000))
sp + stat_binhex() +
  scale_fill_gradient(low="lightblue", high="red",
                      breaks=c(0, 250, 500, 1000, 2000, 4000, 6000),
                      limits=c(0, 6000))

#one discrete axis and one continuous axis
sp1 <- ggplot(ChickWeight, aes(x=Time, y=weight))
sp1 + geom_point()
sp1 + geom_point(position="jitter")
# Could also use geom_jitter(), which is equivalent
sp1 + geom_point(position=position_jitter(width=.5, height=0))
#using boxplot
sp1 + geom_boxplot(aes(group=Time))

#adding fitted regression model lines
library(gcookbook)
sp <- ggplot(heightweight, aes(x=ageYear, y= heightIn))
sp + geom_point() + stat_smooth(method = lm)
#99% confidence region
sp + geom_point() + stat_smooth(method = lm, level = 0.99)
#No confidence region
sp + geom_point() + stat_smooth(method=lm, se=F)

sp + geom_point(color="grey60") + 
  stat_smooth(method = lm, se=F, color="black")

#the default method is locally weighted polynomial curve
sp + geom_point(color="grey60") + stat_smooth(method=loess)

#with logistic regressio curve
library(MASS) # For the data set
b <- biopsy
b$classn[b$class=="benign"]    <- 0
b$classn[b$class=="malignant"] <- 1
ggplot(b, aes(x=V1,y=classn)) +
  geom_point(position=position_jitter(width = .3,height=.06),
             alpha=.4, shape=21,size=1.5) +
  stat_smooth(method=glm, method.args = list(family="binomial"))
#parameter is no longer used

#group points with fitted line
sps <- ggplot(heightweight, aes(x=ageYear, y=heightIn, color = sex)) +
  geom_point() +
  scale_color_brewer(palette = "Set1")
sps + geom_smooth()

sps + geom_smooth(method = lm, se = F, fullrange= T)


#adding fitted lines from an existing model 
library(gcookbook)
model <- lm(heightIn~ ageYear + I(ageYear^2), data = heightweight)
xmin <- min(heightweight$ageYear)
xmax <- max(heightweight$ageYear)

predicted <- data.frame(ageYear = seq(xmin,xmax,length.out = 200))
predicted$heightIn <- predict(model, predicted)

sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
  geom_point(color = "grey40")
sp + geom_line(data=predicted,size=1)

# Given a model, predict values of yvar from xvar
# This supports one predictor and one predicted variable
# xrange: If NULL, determine the x range from the model object. If a vector with # two numbers, use those as the min and max of the prediction range.
# samples: Number of samples across the x range.
# ...: Further arguments to be passed to predict()
predictvals <- function(model, xvar, yvar, xrange=NULL, samples = 100,...){
  if (is.null(xrange)) {
    if (any(class(model) %in% c("lm","glm"))) 
      xrange <- range(model$model[[xvar]])
    else if (any(class(model) %in% "loess"))
      xrange <- range(model$x)
  }
  newdata <- data.frame(x=seq(xrange[1],xrange[2],length.out = samples))
  names(newdata) <- xvar
  newdata[yvar] <- predict(model,newdata = newdata,...)
  newdata
}

modlinear <- lm(heightIn ~ ageYear, heightweight)
modloess  <- loess(heightIn ~ ageYear, heightweight)
#predict
lm_predicted    <- predictvals(modlinear, "ageYear", "heightIn")
loess_predicted <- predictvals(modloess, "ageYear", "heightIn")
#multiple fitted lines
sp + geom_line(data=lm_predicted, colour="red", size=.8) +
  geom_line(data=loess_predicted, colour="blue", size=.8)

library(MASS) # For the data set b <- biopsy
b$classn[b$class=="benign"]    <- 0
b$classn[b$class=="malignant"] <- 1
fitlogistic <- glm(classn~V1,data = b, family = binomial)

#get predicted values
glm_predicted <- predictvals(fitlogistic, "V1","classn",type="response")
ggplot(b,aes(x=V1, y=classn)) +
  geom_point(position = position_jitter(width = .3,height=.08),alpha=.4,
             shape=21,size=1.5) +
  geom_line(data=glm_predicted, color="#1177FF",size=1)
  
#adding fitted lines from multiple exsiting models
make_model <- function(data) { lm(heightIn ~ ageYear, data)}
library(gcookbook)
library(plyr)
models <- dlply(heightweight,"sex",.fun=make_model)
#For each subset of a data frame, apply function then combine results into a list.
pred <- ldply(models,.fun = predictvals,xvar="ageYear",yvar="heightIn")
#For each element of a list, apply function then combine results into a data frame
ggplot(heightweight, aes(x=ageYear, y=heightIn, color=sex)) +
  geom_point() +
  geom_line(data=pred)
#extend the range to the same range across all groups
pred <- ldply(models, .fun=predictvals, xvar="ageYear", yvar="heightIn",
                  xrange=range(heightweight$ageYear))
ggplot(heightweight, aes(x=ageYear, y=heightIn, color=sex)) +
  geom_point() +
  geom_line(data=pred)

#adding annotations with model coefficients
library(gcookbook)
model <- lm(heightIn~ ageYear, heightweight)
summary(model)
#add r square value
pred <- predictvals(model, "ageYear", "heightIn")
sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
  geom_point() +
  geom_line(data=pred)
sp + annotate("text", label="r^2=0.42", x=16.5, y=52)
#using r's math expression syntax
sp + annotate("text", label="r^2=0.42",parse=TRUE, x=16.5, y=52)

#create a string that when parsed, returns a valid expression
eqn <- as.character(as.expression(
  substitute(italic(y) == a + b * italic(x) * "," ~~ italic(r)^2 ~ "=" ~ r2,
             list(a = format(coef(model)[1], digits=3),
                  b = format(coef(model)[2], digits=3),
                  r2 = format(summary(model)$r.squared, digits=2)
             ))))
sp + annotate("text", label=eqn, parse=TRUE,x=Inf,y=-Inf,hjust=1.1,vjust=-.5)


#adding marginal rugs to a scatter plot
ggplot(faithful, aes(x=eruptions, y=waiting)) + geom_point() + geom_rug()
#specify the size
ggplot(faithful, aes(x=eruptions, y = waiting)) +
  geom_point() +
  geom_rug(position = "jitter",size=.2)

#label points in a scatter plot
library(gcookbook)
s <- subset(countries, Year==2009 & healthexp > 2000)
sp <- ggplot(s, aes(x=healthexp, y=infmortality)) + geom_point()
sp + annotate("text",x=4350,y=5.4,label="Cannada") +
  annotate("text",x=7400,y=6.8,label="USA")
#automatically add the labels
sp + geom_text(aes(label=Name), size=4)
sp + geom_text(aes(label=Name), size=4,vjust=0)
sp + geom_text(aes(y=infmortality + 0.1, label = Name),size=4,vjust=0)
sp + geom_text(aes(label = Name), size=4,hjust=0)
sp + geom_text(aes(x=healthexp+100, label=Name), size=4, hjust=0)

#label some of the points automatically
cdat <- subset(countries, Year == 2009 & healthexp > 2009)
cdat$Name1 <- cdat$Name
idx <- cdat$Name1 %in% c("Canada", "Ireland", "United Kingdom", "United States",
                         "New Zealand", "Iceland", "Japan", "Luxembourg",
                         "Netherlands", "Switzerland")
cdat$Name1[!idx] <- NA
ggplot(cdat, aes(x=healthexp, y=infmortality)) +
  geom_point() +
  geom_text(aes(x=healthexp+100, label=Name1),size=4,hjust=0) +
  xlim(2000,10000)

#creating a balloon plot
library(gcookbook) # For the data set
cdat <- subset(countries, Year==2009 &
                 Name %in% c("Canada", "Ireland", "United Kingdom", "United States",
                             "New Zealand", "Iceland", "Japan", "Luxembourg",
                             "Netherlands", "Switzerland"))
p <- ggplot(cdat, aes(x=healthexp, y=infmortality,size=GDP)) +
  geom_point(shape=21,color="black",fill="cornsilk")
#GDP mapped to redius
p#GDP mapped to area instead, and larger circles
p + scale_size_area(max_size=15)

#add up counts for male and female
hec <- HairEyeColor[,,"Male"] + HairEyeColor[,,"Female"]

#convert to long format
library(reshape2)
hec <- melt(hec, value.name="count")
ggplot(hec, aes(x=Eye,y=Hair)) +
  geom_point(aes(size=count),shape=21,color="black",fill="cornsilk") +
  scale_size_area(max_size = 20,guid=F) +
  geom_text(aes(y=as.numeric(Hair)-sqrt(count)/22,label=count),
            color="grey60",size=4)

#making a scatter plot matrix
library(gcookbook)
c2009 <- subset(countries, Year==2009,
                select=c(Name, GDP, laborrate, healthexp, infmortality))
pairs(c2009[,2:5])

panel.cor <- function(x, y, digits=2, prefix="", cex.cor, ...) { 
  usr <- par("usr")
  on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y, use="complete.obs"))
  txt <- format(c(r, 0.123456789), digits=digits)[1] 
  txt <- paste(prefix, txt, sep="") 
  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt) 
  text(0.5, 0.5, txt, cex = cex.cor * (1 + r) / 2)
}

panel.hist <- function(x, ...) { 
  usr <- par("usr") 
  on.exit(par(usr))
  par(usr = c(usr[1:2], 0, 1.5) ) 
  h <- hist(x, plot = FALSE) 
  breaks <- h$breaks
  nB <- length(breaks)
  y <- h$counts
  y <- y/max(y)
  rect(breaks[-nB], 0, breaks[-1], y, col="white", ...)
}

pairs(c2009[,2:5], upper.panel = panel.cor,
      diag.panel  = panel.hist,
      lower.panel = panel.smooth)

panel.lm <- function (x, y, col = par("col"), bg = NA, pch = par("pch"), 
                      cex = 1, col.smooth = "black", ...) {
  points(x, y, pch = pch, col = col, bg = bg, cex = cex)
  abline(stats::lm(y ~ x),  col = col.smooth, ...)
}

pairs(c2009[,2:5], pch=".",
      upper.panel = panel.cor,
      diag.panel  = panel.hist,
      lower.panel = panel.lm)


#########summarized data distributions

#making a basic histogram
ggplot(faithful, aes(x=waiting)) + geom_histogram()

# Store the values in a simple vector
w <- faithful$waiting
ggplot(NULL, aes(x=w)) + geom_histogram()

#set the width of each bin to 5
ggplot(faithful, aes(x=waiting)) +
  geom_histogram(binwidth = 5, fill ="white",color="black")

#divide the x range into 15 bins
binsize <- diff(range(faithful$waiting))/15
ggplot(faithful,aes(x=waiting)) +
  geom_histogram(binwidth = binsize,fill="white",color="black")

#change the origin
h <- ggplot(faithful, aes(x=waiting)) # Save the base object for reuse 
h + geom_histogram(binwidth=8, fill="white", colour="black", origin=31) 
h + geom_histogram(binwidth=8, fill="white", colour="black", origin=35)

#the same with bar
h + geom_bar(stat="bin")

#making multiple histograms from grouped data
library(MASS)
#use smoke as the faceting variable
ggplot(birthwt, aes(x=bwt)) +
  geom_histogram(fill="white", color="black") +
  facet_grid(smoke~.)
#change the label
bir <- birthwt
#convert smoke to a factor
bir$smoke <- factor(bir$smoke)
library(plyr)
bir$smoke <- revalue(bir$smoke, c("0"="No Smoke","1"="Smoke"))
#same result
bir$smoke <- factor(bir$smoke,labels=c("No Smoke","Smoke"))

ggplot(bir,aes(x=bwt)) +
  geom_histogram(fill="white",color="black") +
  facet_grid(smoke~.)


#when size are different
ggplot(bir, aes(x=bwt)) +
  geom_histogram(fill="white", color="black") +
  facet_grid(race~.)
#resize independently
ggplot(bir, aes(x=bwt)) +
  geom_histogram(fill="white",color="black") +
  facet_grid(race~.,scales = "free") #scales is set "free"
#another approach is to map the grouping variable to fill
ggplot(bir, aes(x=bwt,fill=smoke)) +
  geom_histogram(position="identity",alpha=0.4) 
#without position="identity", ggplot will stack the histogram bars on top of each other

#making a density curve
ggplot(faithful, aes(x=waiting)) + geom_density()

#remove the side and bottom
ggplot(faithful, aes(x=waiting)) +
  geom_line(stat="density") +
  expand_limits(y=0)

#the larger the bandwidth, the more smoothing there is
ggplot(faithful,aes(x=waiting)) +
  geom_line(stat="density",adjust=.25,color="red") +
  geom_line(stat="density") +
  geom_line(stat="density",adjust=2,color="blue")
  
#set the area color
ggplot(faithful,aes(x=waiting)) +
  geom_density(fill="blue",alpha=.2) +
  xlim(35,105)

ggplot(faithful,aes(x=waiting)) +
  geom_density(fill="blue",alpha=.2,color=NA) +
  geom_line(stat="density") +
  xlim(35,105)

#compare the theoretical and obsearved distributions
ggplot(faithful, aes(x=waiting, y=..density..)) +
  geom_histogram(fill="cornsilk", colour="grey60", size=.2) +
  geom_density() +
  xlim(35, 105)

#making multiple density curves from grouped data
library(MASS)
bir <- birthwt
bir$smoke <- factor(bir$smoke)
ggplot(bir,aes(x=bwt,color=smoke)) + geom_density()
#mapping smoke to fill
ggplot(bir,aes(x=bwt,fill=smoke)) + geom_density(alpha=.3)

ggplot(bir, aes(x=bwt)) + geom_density() + facet_grid(smoke~.)

library(plyr) # For the revalue function
birthwt1 <- birthwt
birthwt1$smoke <- factor(birthwt1$smoke)
birthwt1$smoke <- revalue(birthwt1$smoke, c("0"="No Smoke", "1"="Smoke"))
ggplot(birthwt1, aes(x=bwt)) + geom_density() + facet_grid(smoke ~ .)

ggplot(birthwt1, aes(x=bwt, y=..density..)) +
  geom_histogram(binwidth=200, fill="cornsilk", colour="grey60", size=.2) +
  geom_density() +
  facet_grid(smoke ~ .)

#making a frequency polygon
ggplot(faithful, aes(x=waiting)) + geom_freqpoly()
ggplot(faithful, aes(x=waiting)) + geom_freqpoly(binwidth=4)

#use 15 bins
binsize <- diff(range(faithful$waiting))/15
ggplot(faithful, aes(x=waiting)) + geom_freqpoly(binwidth=binsize)

#making a basic box plot
library(MASS)
ggplot(birthwt, aes(x=factor(race), y=bwt)) + geom_boxplot()
# Use factor() to convert numeric variable to discrete

#change width
ggplot(birthwt,aes(x=factor(race),y=bwt)) + geom_boxplot(width=.5)
#default size is 2, default shape is 16
ggplot(birthwt,aes(x=factor(race),y=bwt)) +
  geom_boxplot(outlier.size = 1.5,outlier.shape = 21)

ggplot(birthwt,aes(x=1,y=bwt)) + geom_boxplot() + 
  scale_x_continuous(breaks = NULL) +
  theme(axis.title.x=element_blank())

#adding natches to a box plot
library(MASS)
ggplot(birthwt,aes(x=factor(race),y=bwt)) +
  geom_boxplot(notch = TRUE)

#adding means to a box plot
library(MASS)
ggplot(birthwt, aes(factor(race),y=bwt)) +
  geom_boxplot() +
  stat_summary(fun.y = "mean",geom = "point",shape=23,size=3,fill="white")

#making a violin plot
library(gcookbook)
p <- ggplot(heightweight,aes(x=sex,y=heightIn))
p + geom_violin()
#with narrows boxplot
p + geom_violin() +
  geom_boxplot(width=.1,fill="black",outlier.colour = NA) +
  stat_summary(fun.y = median,geom="point",fill="white",shape=21,size=2.5)
#keep the tails
p + geom_violin(trim=FALSE) 

#making a dot plot
library(gcookbook)
c2009 <- subset(countries, Year==2009 & healthexp > 2000)
p <- ggplot(c2009,aes(x=infmortality))
p + geom_dotplot() 
p + geom_dotplot(binwidth=.25) + geom_rug() +
  scale_y_continuous(breaks=NULL) +
  theme(axis.title.y=element_blank())

#making multiple dot plots for grouped data
ggplot(heightweight, aes(x=sex, y=heightIn)) +
  geom_dotplot(binaxis="y", binwidth=.5, stackdir="center")


#making a desity plot of two dimensional data
p <- ggplot(faithful,aes(x=eruptions,y=waiting))
p + geom_point() + stat_density2d()
#map the height of the density to color
p + stat_density2d(aes(color=..level..))
#map denstity estimate to fill color
p + stat_density2d(aes(fill=..density..),geom="raster",contour = F)
#with points and map desity estimate to alpha
p + geom_point() +
  stat_density2d(aes(alpha=..density..),geom="tile",contour=F)
#tile and raster look like the same

#marginal rug added to a scatter plot
p + stat_density2d(aes(fill=..density..),geom="raster",
                   contour=F,h=c(.5,5))


#############annotations

#adding text annotations
p <- ggplot(faithful,aes(x=eruptions,y=waiting)) + geom_point()
p + annotate(geom="text",x=3,y=48,label = "Group 1") +
  annotate("text",x=4.5,y=66,label = "Group 2")
p + annotate("text",x=3,y=48,label="Group 1",family="serif",
             fontface="italic",color="darkred",size=3) +
  annotate("text",x=4.5,y=66,label="Group 2",family="serif",
           fontface="italic",color="darkred",size=3)
annotate("text",x=3,y=48,label="Group 1",family="serif",
         fontface="italic",color="darkred",size=3)

p + annotate("text", x=3, y=48, label="Group 1", alpha=.1) + # Normal
  geom_text(x=4.5, y=66, label="Group 2", alpha=.1)          # Overplotted

#for continuous axes, use Inf and -Inf to place text annotations at the
#edge of the plotting area
p + annotate("text",x=-Inf,y=Inf,label="Upper left",hjust=-.2,vjust=2) +
  annotate("text",x=mean(range(faithful$eruptions)),y=-Inf,vjust=-.4,label="Bottom middle")
#for hjust and vjust position is minus the value 

#using mathematical expression in annotations
#a normal curve
p <- ggplot(data.frame(x=c(-3,3)), aes(x=x)) + stat_function(fun=dnorm)
p + annotate("text",x=2,y=0.3,parse=TRUE,
             label="frac(1,sqrt(2*pi))*e^{-x^2/2}")
#use single quotes within double quotes (or vice versa) to mark the plain-text parts
#put a * operator between them
p + annotate("text",x=0,y=.05,parse=TRUE,size=4,
             label="'Function: '*y==frac(1,sqrt(2*pi))*e^{-x^2/2}")
#?plotmath for many examples of mathematical expressions, and 
#?demo(plot math) for graphical examples of mathematical expressions

#adding lines
library(gcookbook)
p <- ggplot(heightweight,aes(x=ageYear,y=heightIn,color=sex)) + geom_point()
#adding horizongtal and vertical lines
p + geom_hline(yintercept = 60) + geom_vline(xintercept = 14)
p + geom_abline(intercept = 37.4, slope = 1.75)

library(plyr)
hw_means <- ddply(heightweight,"sex",summarise,heightIn=mean(heightIn))
p + geom_hline(aes(yintercept=heightIn,color=sex),data=hw_means,
               linetype="dashed",size=1)

#for discrete variable
pg <- ggplot(PlantGrowth,aes(x=group,y=weight)) + geom_point()
pg + geom_vline(xintercept = 2)
pg + geom_vline(xintercept = which(levels(PlantGrowth$group) == "ctrl"))

#adding line segments and arrows
library(gcookbook)
p <- ggplot(subset(climate, Source=="Berkeley"), aes(x=Year, y=Anomaly10y)) +
  geom_line()
p + annotate("segment", x=1950, xend=1980, y=-.25, yend=-.25)