#download the file to load
if (!file.exists("data")) {dir.create("data")}
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl, destfile = "./data/cameras.csv", method = "curl")
dateDownloaded <- date()

dataset <- read.csv("./data/cameras.csv")
#equal to
dataset <- read.table("./data/cameras.csv", sep = ",", header = TRUE)

#read xlsx file
if (!file.exists("data")) {dir.create("data")}
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.xlsx?accessType=DOWNLOAD"
download.file(fileUrl, destfile = "./data/cameras.xlsx", method = "curl")
dateDownloaded <- date()
library(gdata)
dataset <- read.xls("./data/cameras.xlsx", sheet = 1)  #read.csv cannot parse xlsx; gdata's read.xls can

#data.table
library(data.table)
DF <- data.frame(x = rnorm(9), y = rep(c("a", "b", "c"), each = 3), z = rnorm(9))
DT <- data.table(x = rnorm(9), y = rep(c("a", "b", "c"), each = 3), z = rnorm(9))
#subsetting rows
DT[2, ]
DT[DT$y == "a"]
DT[c(2, 3)]
#subsetting columns
DT[, list(x)]
#calculating values for variables
DT[, list(mean(x), sum(z))]
DT[, table(y)]
#adding new columns
DT[, w := z^2]
DT2 <- DT
DT[, y := 2]
#caution: := modifies by reference, so DT and DT2 are the same object and both change;
#use DT2 <- copy(DT) for an independent copy
#multiple operations
DT[, m := {tmp <- (x + z); log2(tmp + 5)}]
DT[, a := x > 0]
DT[, b := mean(x + w), by = a]
#special variables
set.seed(123)
DT <- data.table(x = sample(letters[1:3], 1E5, TRUE))
DT[, .N, by = x]  #.N counts the rows within each group
#keys
DT <- data.table(x = rep(c("a", "b", "c"), each = 100), y = rnorm(300))
setkey(DT, x)
DT["a"]
#joins
DT1 <- data.table(x = c("a", "a", "b", "dt1"), y = 1:4)
DT2 <- data.table(x = c("a", "b", "dt2"), z = 5:7)
setkey(DT1, x)
setkey(DT2, x)
merge(DT1, DT2)

#read XML into R
library(XML)
fileUrl <- "http://www.w3schools.com/xml/simple.xml"
doc <- xmlTreeParse(fileUrl, useInternal = TRUE)
rootNode <- xmlRoot(doc)
xmlName(rootNode)
names(rootNode)
#directly access parts of the XML doc
rootNode[[1]]
rootNode[[1]][[1]]
#programmatically extract parts of the file
xmlSApply(rootNode, xmlValue)
xpathSApply(rootNode, "//name", xmlValue)
xpathSApply(rootNode, "//price", xmlValue)
#another example
fileUrl <- "http://espn.com/nfl/team/_/name/baltimore-ravens"
doc <- htmlTreeParse(fileUrl, useInternal = TRUE)
scores <- xpathSApply(doc, "//li[@class='score']", xmlValue)  #XPath needs the leading //
teams <- xpathSApply(doc, "//li[@class='team-name']", xmlValue)

#reading JSON (JavaScript Object Notation)
library(jsonlite)
jsonData <- fromJSON("https://api.github.com/users/jtleek/repos")
names(jsonData)
names(jsonData$owner)
jsonData$owner$login
#writing a data frame to JSON
myjson <- toJSON(iris, pretty = TRUE)
cat(myjson)
#convert back to a data frame
iris2 <- fromJSON(myjson)

#reading from MySQL
library(RMySQL)
ucscDb <- dbConnect(MySQL(), user = "genome", host = "genome-mysql.cse.ucsc.edu")
result <- dbGetQuery(ucscDb, "show databases;")
dbDisconnect(ucscDb)
hg19 <- dbConnect(MySQL(), user = "genome", db = "hg19", host = "genome-mysql.cse.ucsc.edu")
allTables <- dbListTables(hg19)
length(allTables)
#get the dimensions of a specific table
dbListFields(hg19, "affyU133Plus2")
dbGetQuery(hg19, "select count(*) from affyU133Plus2")
result <- dbGetQuery(hg19, "select * from affyU133Plus2")
#read from the table
affyData <- dbReadTable(hg19, "affyU133Plus2")
#query
affyMis <- dbGetQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3")
#equal to
query <- dbSendQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3")
affyMis <- fetch(query)
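#fetch() also accepts an n argument, so a large result set can be pulled in chunks
#instead of all at once; a minimal sketch, assuming the result set still has rows
#pending (affyMisSmall is an illustrative name, not from the original notes):
affyMisSmall <- fetch(query, n = 10)  #retrieve only the next 10 rows of the result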
quantile(affyMis$misMatches)
dbClearResult(query)
#disconnect from MySQL
dbDisconnect(hg19)

#reading HDF5
source("http://bioconductor.org/biocLite.R")
biocLite("rhdf5")
library(rhdf5)
created <- h5createFile("example.h5")
created <- h5createGroup("example.h5", "foo")
created <- h5createGroup("example.h5", "baa")
created <- h5createGroup("example.h5", "foo/foobaa")
h5ls("example.h5")
#write to groups
A <- matrix(1:10, nrow = 5, ncol = 2)
h5write(A, "example.h5", "foo/A")
B <- array(seq(0.1, 2.0, by = 0.1), dim = c(5, 2, 2))
attr(B, "scale") <- "liter"
h5write(B, "example.h5", "foo/foobaa/B")
h5ls("example.h5")
#write a data set
df <- data.frame(1L:5L, seq(0, 1, length.out = 5), c("ab", "cde", "fghi", "a", "s"),
                 stringsAsFactors = FALSE)
h5write(df, "example.h5", "df")
h5ls("example.h5")
#reading data
readA <- h5read("example.h5", "foo/A")
readB <- h5read("example.h5", "foo/foobaa/B")
readdf <- h5read("example.h5", "df")
readA

#reading data from APIs

#subsetting
set.seed(13435)
X <- data.frame("var1" = sample(1:5), "var2" = sample(6:10), "var3" = sample(11:15))
X <- X[sample(1:5), ]
X$var2[c(1, 3)] <- NA
#dealing with missing values
X[which(X$var2 > 8), ]  #which() drops the NA values a plain logical subset would keep
#sorting
sort(X$var1)
sort(X$var1, decreasing = TRUE)
sort(X$var2, na.last = TRUE)
#ordering
X[order(X$var1), ]
#ordering with plyr
library(plyr)
arrange(X, var1)
arrange(X, desc(var1))

#creating new variables
#getting the data from the web
if (!file.exists("data")) {dir.create("data")}
fileUrl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl, destfile = "./data/restaurants.csv", method = "curl")
dateDownloaded <- date()
restData <- read.csv("./data/restaurants.csv")
#subsetting variables
restData$nearMe <- restData$neighborhood %in% c("Roland Park", "Homeland")
table(restData$nearMe)
#creating binary variables
restData$zipWrong <- ifelse(restData$zipCode < 0, TRUE, FALSE)
with(restData, table(zipWrong, zipCode < 0))
#creating categorical variables
restData$zipGroup <- cut(restData$zipCode, breaks = quantile(restData$zipCode))
table(restData$zipGroup)
#easier cutting
library(Hmisc)
restData$zipGroup <- cut2(restData$zipCode, g = 4)
table(restData$zipGroup)
#creating factor variables
restData$zcf <- factor(restData$zipCode)
#levels of a factor
yesno <- sample(c("yes", "no"), size = 10, replace = TRUE)
yesnofac <- factor(yesno, levels = c("yes", "no"))
relevel(yesnofac, ref = "no")
#cutting produces factor variables
library(Hmisc)
restData$zipGroup <- cut2(restData$zipCode, g = 4)
#using the mutate function
library(Hmisc)
library(plyr)
restData2 <- mutate(restData, zipGroup = cut2(zipCode, g = 4))
table(restData2$zipGroup)

#summarizing data
colSums(is.na(restData))
all(colSums(is.na(restData)) == 0)
#values with specific characteristics
table(restData$zipCode %in% c("21212", "21213"))
#cross tables
data("UCBAdmissions")
df1 <- as.data.frame(UCBAdmissions)
summary(df1)
xt <- xtabs(Freq ~ Gender + Admit, data = df1)  #sums Freq within each Gender x Admit cell
#flat tables
warpbreaks$replicate <- rep(1:9, length.out = 54)
xt <- xtabs(breaks ~ ., data = warpbreaks)
ftable(xt)
#size of a data set
fakeData <- rnorm(1e5)
object.size(fakeData)
print(object.size(fakeData), units = "Mb")

#dplyr package
library(dplyr)
#select
chicago <- readRDS("./data/chicago.rds")
head(select(chicago, 1:5))  #select columns by position
head(select(chicago, city:dptp))
head(select(chicago, -(city:dptp)))  #drop columns
#equivalent in base R
i <- match("city", names(chicago))
j <- match("dptp", names(chicago))
head(chicago[, -(i:j)])
#filter
chic <- filter(chicago, pm25tmean2 > 30)
head(select(chic, 1:3, pm25tmean2))
chic <- filter(chicago, pm25tmean2 > 30 & tmpd > 80)
head(select(chic, 1:3, pm25tmean2, tmpd))
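#filter() is also handy for dropping missing values before summarizing;
#a small illustrative sketch, not part of the original notes:
chic <- filter(chicago, !is.na(pm25tmean2))
sum(is.na(chic$pm25tmean2))  #0: every row with a missing pm25 reading is gone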
#arrange (reorder rows while preserving the corresponding order of the other columns)
chicago <- arrange(chicago, date)
head(select(chicago, date, pm25tmean2))
#descending order
chicago <- arrange(chicago, desc(date))
head(select(chicago, date, pm25tmean2))
#rename
chicago <- rename(chicago, dewpoint = dptp, pm25 = pm25tmean2)
head(select(chicago, 1:5))
#mutate
chicago <- mutate(chicago, pm25detrend = pm25 - mean(pm25, na.rm = TRUE))
head(select(chicago, pm25, pm25detrend))
#group_by (generate summary statistics by stratum)
chicago <- mutate(chicago, tempcat = factor(1 * (tmpd > 80), labels = c("cold", "hot")))
#the 1* just converts the logical to numeric (0/1)
hotcold <- group_by(chicago, tempcat)
summarize(hotcold, pm25 = mean(pm25, na.rm = TRUE),
          o3 = max(o3tmean2, na.rm = TRUE),
          no2 = median(no2tmean2, na.rm = TRUE))
chicago <- mutate(chicago, year = as.POSIXlt(date)$year + 1900)
years <- group_by(chicago, year)
summarize(years, pm25 = mean(pm25, na.rm = TRUE),
          o3 = max(o3tmean2, na.rm = TRUE),
          no2 = median(no2tmean2, na.rm = TRUE))
#the same with the %>% pipe
chicago %>%
  mutate(month = as.POSIXlt(date)$mon + 1) %>%
  group_by(month) %>%
  summarize(pm25 = mean(pm25, na.rm = TRUE),
            o3 = max(o3tmean2, na.rm = TRUE),
            no2 = median(no2tmean2, na.rm = TRUE))

#merging data
if (!file.exists("data")) {dir.create("data")}
fileUrl1 <- "https://dl.dropboxusercontent.com/u/7710864/data/reviews-apr29.csv"
fileUrl2 <- "https://dl.dropboxusercontent.com/u/7710864/data/solutions-apr29.csv"
download.file(fileUrl1, destfile = "./data/reviews.csv", method = "curl")
download.file(fileUrl2, destfile = "./data/solutions.csv", method = "curl")
reviews <- read.csv("./data/reviews.csv")
solutions <- read.csv("./data/solutions.csv")
#merge data
mergedData <- merge(reviews, solutions, by.x = "solution_id", by.y = "id", all = TRUE)
#join in the plyr package
library(plyr)
df1 <- data.frame(id = sample(1:10), x = rnorm(10))
df2 <- data.frame(id = sample(1:10), y = rnorm(10))
arrange(join(df1, df2), id)
#join multiple data frames
df1 <- data.frame(id = sample(1:10), x = rnorm(10))
df2 <- data.frame(id = sample(1:10), y = rnorm(10))
df3 <- data.frame(id = sample(1:10), z = rnorm(10))
dfList <- list(df1, df2, df3)
join_all(dfList)

#working with dates
d1 <- date()
class(d1)  #character
d2 <- Sys.Date()
class(d2)  #Date
#formatting dates
format(d2, "%a %b %d")
x <- c("1jan1960", "2jan1960")
z <- as.Date(x, "%d%b%Y")
#converting
weekdays(d2)
#lubridate
library(lubridate)
ymd("20140108")
ymd("2011-08-03")

#editing text variables
if (!file.exists("data")) {dir.create("data")}
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl, destfile = "./data/cameras.csv", method = "curl")
camera <- read.csv("./data/cameras.csv")
tolower(names(camera))
#fixing character vectors
splitNames <- strsplit(names(camera), "\\.")
splitNames[[6]][1]
firstElement <- function(x) {x[1]}
sapply(splitNames, firstElement)
fileUrl1 <- "https://dl.dropboxusercontent.com/u/7710864/data/reviews-apr29.csv"
fileUrl2 <- "https://dl.dropboxusercontent.com/u/7710864/data/solutions-apr29.csv"
download.file(fileUrl1, destfile = "./data/reviews.csv", method = "curl")
download.file(fileUrl2, destfile = "./data/solutions.csv", method = "curl")
reviews <- read.csv("./data/reviews.csv")
solutions <- read.csv("./data/solutions.csv")
names(reviews)
#replacement
sub("_", "", names(reviews))  #replaces only the first match
gsub("_", "", names(reviews))  #replaces all matches
#finding values
grep("Alameda", camera$intersection)  #returns positions
grepl("Alameda", camera$intersection)  #returns TRUE/FALSE
grep("Alameda", camera$intersection, value = TRUE)  #returns the matching elements
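#grepl() is convenient for subsetting rows directly; a small illustrative sketch
#using the same camera data (alamedaCams is a made-up name):
alamedaCams <- camera[grepl("Alameda", camera$intersection), ]  #keep only matching rows
nrow(alamedaCams)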
#stringr
library(stringr)
nchar("Jeffrey Leek")  #count characters
substr("Jeffrey Leek", 1, 7)
paste("Jeffrey", "Leek")  #joined with a space
paste0("Jeffrey", "Leek")  #joined without a space
str_trim("Jeff ")  #remove leading and trailing whitespace

#reshaping
library(reshape2)
mtcars$carname <- rownames(mtcars)
carMelt <- melt(mtcars, id = c("carname", "gear", "cyl"), measure.vars = c("mpg", "hp"))
#casting data frames
cylData <- dcast(carMelt, cyl ~ variable)  #defaults to counting values per cell
cylData <- dcast(carMelt, cyl ~ variable, mean)  #average the values instead
#averaging values
head(InsectSprays)
tapply(InsectSprays$count, InsectSprays$spray, sum)  #sum of counts within each spray
#split
spIns <- split(InsectSprays$count, InsectSprays$spray)
sapply(spIns, sum)
#or
unlist(lapply(spIns, sum))
#plyr package
library(plyr)
ddply(InsectSprays, "spray", summarize, sum = sum(count))  #returns a data frame
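#the same per-group sum can be written with dplyr (loaded earlier in these notes);
#an equivalent sketch:
library(dplyr)
InsectSprays %>%
  group_by(spray) %>%
  dplyr::summarize(sum = sum(count))  #dplyr:: avoids masking by plyr::summarize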