#download the file to load
if (!file.exists("data")) {dir.create("data")}
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl, destfile = "./data/cameras.csv", method = "curl")
dateDownloaded <- date()

dataset <- read.csv("./data/cameras.csv")
#equal to
dataset <- read.table("./data/cameras.csv", sep = ",", header = TRUE)

#read xlsx file
if (!file.exists("data")) {dir.create("data")}
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.xlsx?accessType=DOWNLOAD"
download.file(fileUrl, destfile = "./data/cameras.xlsx", method = "curl")
dateDownloaded <- date()
library(gdata)
dataset <- read.xls("./data/cameras.xlsx", sheet = 1)  #read.csv cannot parse xlsx; gdata's read.xls can

#data.table
library(data.table)
DF <- data.frame(x = rnorm(9), y = rep(c("a", "b", "c"), each = 3), z = rnorm(9))
DT <- data.table(x = rnorm(9), y = rep(c("a", "b", "c"), each = 3), z = rnorm(9))
#subsetting rows
DT[2, ]
DT[DT$y == "a"]
DT[c(2, 3)]
#subsetting columns
DT[, list(x)]
#calculating values for variables
DT[, list(mean(x), sum(z))]
DT[, table(y)]
#adding new columns
DT[, w := z^2]
DT2 <- DT
DT[, y := 2]
#caution: := modifies by reference, so DT and DT2 are the same object and both change;
#use DT2 <- copy(DT) for an independent copy
#multiple operations
DT[, m := {tmp <- (x + z); log2(tmp + 5)}]
DT[, a := x > 0]
DT[, b := mean(x + w), by = a]
#special variables
set.seed(123)
DT <- data.table(x = sample(letters[1:3], 1E5, TRUE))
DT[, .N, by = x]  #.N counts the rows within each group
#keys
DT <- data.table(x = rep(c("a", "b", "c"), each = 100), y = rnorm(300))
setkey(DT, x)
DT["a"]
#joins
DT1 <- data.table(x = c("a", "a", "b", "dt1"), y = 1:4)
DT2 <- data.table(x = c("a", "b", "dt2"), z = 5:7)
setkey(DT1, x)
setkey(DT2, x)
merge(DT1, DT2)

#read XML into R
library(XML)
fileUrl <- "http://www.w3schools.com/xml/simple.xml"
doc <- xmlTreeParse(fileUrl, useInternal = TRUE)
rootNode <- xmlRoot(doc)
xmlName(rootNode)
names(rootNode)
#directly access parts of the XML doc
rootNode[[1]]
rootNode[[1]][[1]]
#programmatically extract parts of the file
xmlSApply(rootNode, xmlValue)
xpathSApply(rootNode, "//name", xmlValue)
xpathSApply(rootNode, "//price", xmlValue)
#another example
fileUrl <- "http://espn.com/nfl/team/_/name/baltimore-ravens"
doc <- htmlTreeParse(fileUrl, useInternal = TRUE)
scores <- xpathSApply(doc, "//li[@class='score']", xmlValue)  #XPath needs the leading //
teams <- xpathSApply(doc, "//li[@class='team-name']", xmlValue)

#reading JSON (JavaScript Object Notation)
library(jsonlite)
jsonData <- fromJSON("https://api.github.com/users/jtleek/repos")
names(jsonData)
names(jsonData$owner)
jsonData$owner$login
#writing a data frame to JSON
myjson <- toJSON(iris, pretty = TRUE)
cat(myjson)
#convert back to a data frame
iris2 <- fromJSON(myjson)

#reading from MySQL
library(RMySQL)
ucscDb <- dbConnect(MySQL(), user = "genome", host = "genome-mysql.cse.ucsc.edu")
result <- dbGetQuery(ucscDb, "show databases;")
dbDisconnect(ucscDb)
hg19 <- dbConnect(MySQL(), user = "genome", db = "hg19", host = "genome-mysql.cse.ucsc.edu")
allTables <- dbListTables(hg19)
length(allTables)
#get the dimensions of a specific table
dbListFields(hg19, "affyU133Plus2")
dbGetQuery(hg19, "select count(*) from affyU133Plus2")
result <- dbGetQuery(hg19, "select * from affyU133Plus2")
#read from the table
affyData <- dbReadTable(hg19, "affyU133Plus2")
#query
affyMis <- dbGetQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3")
#equal to
query <- dbSendQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3")
affyMis <- fetch(query)
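#fetch() also accepts an n argument, so a large result set can be pulled in chunks
#instead of all at once; a minimal sketch, assuming the result set still has rows
#pending (affyMisSmall is an illustrative name, not from the original notes):
affyMisSmall <- fetch(query, n = 10)  #retrieve only the next 10 rows of the result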
quantile(affyMis$misMatches)
dbClearResult(query)
#disconnect from MySQL
dbDisconnect(hg19)

#reading HDF5
source("http://bioconductor.org/biocLite.R")
biocLite("rhdf5")
library(rhdf5)
created <- h5createFile("example.h5")
created <- h5createGroup("example.h5", "foo")
created <- h5createGroup("example.h5", "baa")
created <- h5createGroup("example.h5", "foo/foobaa")
h5ls("example.h5")
#write to groups
A <- matrix(1:10, nrow = 5, ncol = 2)
h5write(A, "example.h5", "foo/A")
B <- array(seq(0.1, 2.0, by = 0.1), dim = c(5, 2, 2))
attr(B, "scale") <- "liter"
h5write(B, "example.h5", "foo/foobaa/B")
h5ls("example.h5")
#write a data set
df <- data.frame(1L:5L, seq(0, 1, length.out = 5), c("ab", "cde", "fghi", "a", "s"),
                 stringsAsFactors = FALSE)
h5write(df, "example.h5", "df")
h5ls("example.h5")
#reading data
readA <- h5read("example.h5", "foo/A")
readB <- h5read("example.h5", "foo/foobaa/B")
readdf <- h5read("example.h5", "df")
readA

#reading data from APIs

#subsetting
set.seed(13435)
X <- data.frame("var1" = sample(1:5), "var2" = sample(6:10), "var3" = sample(11:15))
X <- X[sample(1:5), ]
X$var2[c(1, 3)] <- NA
#dealing with missing values
X[which(X$var2 > 8), ]  #which() drops the NA values a plain logical subset would keep
#sorting
sort(X$var1)
sort(X$var1, decreasing = TRUE)
sort(X$var2, na.last = TRUE)
#ordering
X[order(X$var1), ]
#ordering with plyr
library(plyr)
arrange(X, var1)
arrange(X, desc(var1))

#creating new variables
#getting the data from the web
if (!file.exists("data")) {dir.create("data")}
fileUrl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl, destfile = "./data/restaurants.csv", method = "curl")
dateDownloaded <- date()
restData <- read.csv("./data/restaurants.csv")
#subsetting variables
restData$nearMe <- restData$neighborhood %in% c("Roland Park", "Homeland")
table(restData$nearMe)
#creating binary variables
restData$zipWrong <- ifelse(restData$zipCode < 0, TRUE, FALSE)
with(restData, table(zipWrong, zipCode < 0))
#creating categorical variables
restData$zipGroup <- cut(restData$zipCode, breaks = quantile(restData$zipCode))
table(restData$zipGroup)
#easier cutting
library(Hmisc)
restData$zipGroup <- cut2(restData$zipCode, g = 4)
table(restData$zipGroup)
#creating factor variables
restData$zcf <- factor(restData$zipCode)
#levels of a factor
yesno <- sample(c("yes", "no"), size = 10, replace = TRUE)
yesnofac <- factor(yesno, levels = c("yes", "no"))
relevel(yesnofac, ref = "no")
#cutting produces factor variables
library(Hmisc)
restData$zipGroup <- cut2(restData$zipCode, g = 4)
#using the mutate function
library(Hmisc)
library(plyr)
restData2 <- mutate(restData, zipGroup = cut2(zipCode, g = 4))
table(restData2$zipGroup)

#summarizing data
colSums(is.na(restData))
all(colSums(is.na(restData)) == 0)
#values with specific characteristics
table(restData$zipCode %in% c("21212", "21213"))
#cross tables
data("UCBAdmissions")
df1 <- as.data.frame(UCBAdmissions)
summary(df1)
xt <- xtabs(Freq ~ Gender + Admit, data = df1)  #sums Freq within each Gender x Admit cell
#flat tables
warpbreaks$replicate <- rep(1:9, length.out = 54)
xt <- xtabs(breaks ~ ., data = warpbreaks)
ftable(xt)
#size of a data set
fakeData <- rnorm(1e5)
object.size(fakeData)
print(object.size(fakeData), units = "Mb")

#dplyr package
library(dplyr)
#select
chicago <- readRDS("./data/chicago.rds")
head(select(chicago, 1:5))  #select columns by position
head(select(chicago, city:dptp))
head(select(chicago, -(city:dptp)))  #drop columns
#equivalent in base R
i <- match("city", names(chicago))
j <- match("dptp", names(chicago))
head(chicago[, -(i:j)])
#filter
chic <- filter(chicago, pm25tmean2 > 30)
head(select(chic, 1:3, pm25tmean2))
chic <- filter(chicago, pm25tmean2 > 30 & tmpd > 80)
head(select(chic, 1:3, pm25tmean2, tmpd))
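#filter() is also handy for dropping missing values before summarizing;
#a small illustrative sketch, not part of the original notes:
chic <- filter(chicago, !is.na(pm25tmean2))
sum(is.na(chic$pm25tmean2))  #0: every row with a missing pm25 reading is gone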
#arrange (reorder rows while preserving the corresponding order of the other columns)
chicago <- arrange(chicago, date)
head(select(chicago, date, pm25tmean2))
#descending order
chicago <- arrange(chicago, desc(date))
head(select(chicago, date, pm25tmean2))
#rename
chicago <- rename(chicago, dewpoint = dptp, pm25 = pm25tmean2)
head(select(chicago, 1:5))
#mutate
chicago <- mutate(chicago, pm25detrend = pm25 - mean(pm25, na.rm = TRUE))
head(select(chicago, pm25, pm25detrend))
#group_by (generate summary statistics by stratum)
chicago <- mutate(chicago, tempcat = factor(1 * (tmpd > 80), labels = c("cold", "hot")))
#the 1* just converts the logical to numeric (0/1)
hotcold <- group_by(chicago, tempcat)
summarize(hotcold, pm25 = mean(pm25, na.rm = TRUE),
          o3 = max(o3tmean2, na.rm = TRUE),
          no2 = median(no2tmean2, na.rm = TRUE))
chicago <- mutate(chicago, year = as.POSIXlt(date)$year + 1900)
years <- group_by(chicago, year)
summarize(years, pm25 = mean(pm25, na.rm = TRUE),
          o3 = max(o3tmean2, na.rm = TRUE),
          no2 = median(no2tmean2, na.rm = TRUE))
#the same with the %>% pipe
chicago %>%
  mutate(month = as.POSIXlt(date)$mon + 1) %>%
  group_by(month) %>%
  summarize(pm25 = mean(pm25, na.rm = TRUE),
            o3 = max(o3tmean2, na.rm = TRUE),
            no2 = median(no2tmean2, na.rm = TRUE))

#merging data
if (!file.exists("data")) {dir.create("data")}
fileUrl1 <- "https://dl.dropboxusercontent.com/u/7710864/data/reviews-apr29.csv"
fileUrl2 <- "https://dl.dropboxusercontent.com/u/7710864/data/solutions-apr29.csv"
download.file(fileUrl1, destfile = "./data/reviews.csv", method = "curl")
download.file(fileUrl2, destfile = "./data/solutions.csv", method = "curl")
reviews <- read.csv("./data/reviews.csv")
solutions <- read.csv("./data/solutions.csv")
#merge data
mergedData <- merge(reviews, solutions, by.x = "solution_id", by.y = "id", all = TRUE)
#join in the plyr package
library(plyr)
df1 <- data.frame(id = sample(1:10), x = rnorm(10))
df2 <- data.frame(id = sample(1:10), y = rnorm(10))
arrange(join(df1, df2), id)
#join multiple data frames
df1 <- data.frame(id = sample(1:10), x = rnorm(10))
df2 <- data.frame(id = sample(1:10), y = rnorm(10))
df3 <- data.frame(id = sample(1:10), z = rnorm(10))
dfList <- list(df1, df2, df3)
join_all(dfList)

#working with dates
d1 <- date()
class(d1)  #character
d2 <- Sys.Date()
class(d2)  #Date
#formatting dates
format(d2, "%a %b %d")
x <- c("1jan1960", "2jan1960")
z <- as.Date(x, "%d%b%Y")
#converting
weekdays(d2)
#lubridate
library(lubridate)
ymd("20140108")
ymd("2011-08-03")

#editing text variables
if (!file.exists("data")) {dir.create("data")}
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl, destfile = "./data/cameras.csv", method = "curl")
camera <- read.csv("./data/cameras.csv")
tolower(names(camera))
#fixing character vectors
splitNames <- strsplit(names(camera), "\\.")
splitNames[[6]][1]
firstElement <- function(x) {x[1]}
sapply(splitNames, firstElement)
fileUrl1 <- "https://dl.dropboxusercontent.com/u/7710864/data/reviews-apr29.csv"
fileUrl2 <- "https://dl.dropboxusercontent.com/u/7710864/data/solutions-apr29.csv"
download.file(fileUrl1, destfile = "./data/reviews.csv", method = "curl")
download.file(fileUrl2, destfile = "./data/solutions.csv", method = "curl")
reviews <- read.csv("./data/reviews.csv")
solutions <- read.csv("./data/solutions.csv")
names(reviews)
#replacement
sub("_", "", names(reviews))  #replaces only the first match
gsub("_", "", names(reviews))  #replaces all matches
#finding values
grep("Alameda", camera$intersection)  #returns positions
grepl("Alameda", camera$intersection)  #returns TRUE/FALSE
grep("Alameda", camera$intersection, value = TRUE)  #returns the matching elements
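#grepl() is convenient for subsetting rows directly; a small illustrative sketch
#using the same camera data (alamedaCams is a made-up name):
alamedaCams <- camera[grepl("Alameda", camera$intersection), ]  #keep only matching rows
nrow(alamedaCams)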
#stringr
library(stringr)
nchar("Jeffrey Leek")  #count characters
substr("Jeffrey Leek", 1, 7)
paste("Jeffrey", "Leek")  #joined with a space
paste0("Jeffrey", "Leek")  #joined without a space
str_trim("Jeff ")  #remove leading and trailing whitespace

#reshaping
library(reshape2)
mtcars$carname <- rownames(mtcars)
carMelt <- melt(mtcars, id = c("carname", "gear", "cyl"), measure.vars = c("mpg", "hp"))
#casting data frames
cylData <- dcast(carMelt, cyl ~ variable)  #defaults to counting values per cell
cylData <- dcast(carMelt, cyl ~ variable, mean)  #average the values instead
#averaging values
head(InsectSprays)
tapply(InsectSprays$count, InsectSprays$spray, sum)  #sum of counts within each spray
#split
spIns <- split(InsectSprays$count, InsectSprays$spray)
sapply(spIns, sum)
#or
unlist(lapply(spIns, sum))
#plyr package
library(plyr)
ddply(InsectSprays, "spray", summarize, sum = sum(count))  #returns a data frame
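#the same per-group sum can be written with dplyr (loaded earlier in these notes);
#an equivalent sketch:
library(dplyr)
InsectSprays %>%
  group_by(spray) %>%
  dplyr::summarize(sum = sum(count))  #dplyr:: avoids masking by plyr::summarize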