Efrain Gonzalez / Cleaning and Fixing Data with R

Blame view

RCleanDscret.R 12.5 KB

234f89c9a Efrain Gonzalez Outputs raw and d...	1	##Posted 6/15/2017
c2b2c096e Efrain Gonzalez Update to amount ...	2	options(digits = 11)
234f89c9a Efrain Gonzalez Outputs raw and d...	3
0eb342056 Efrain Gonzalez This code combine...	4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26	#Libraries required to run the code library(pryr) library(MASS) library(dplyr) library(tidyr) library(readr) library(stringr) #Necessary Functions #1#Function for handling the changing of row names and column names chngrownm <- function(mat){ row <- dim(mat)[1] col <- dim(mat)[2] j <- 1 x <- 1 p <- 1 a <- 1 b <- 1 g <- 1 for(j in 1:col){ if("!Sample_source_name_ch1"==mat[1,j]){ colnames(mat)[j] <- "Brain_Region"
805474e1e Efrain Gonzalez Most recent updat...	27	} else if("!Sample_title" == mat[1,j]){
0eb342056 Efrain Gonzalez This code combine...	28	colnames(mat)[j] <- "Title"
805474e1e Efrain Gonzalez Most recent updat...	29	} else if("!Sample_geo_accession" == mat[1,j]){
0eb342056 Efrain Gonzalez This code combine...	30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47	colnames(mat)[j] <- "ID_REF" } else{ if(grepl("Sex\|gender\|Gender\|sex",mat[2,j])==TRUE){ colnames(mat)[j] <- paste0("Sex",x) x = x + 1 } if(grepl("postmorteminterval\|PMI\|pmi",mat[2,j])==TRUE){ colnames(mat)[j] <- paste0("PMI",p) p = p + 1 } if(grepl("age\|Age\|AGE",mat[2,j])==TRUE){ colnames(mat)[j] <- paste0("Age",a) a = a + 1 } if(grepl("braak\|b&b",mat[2,j])==TRUE){ colnames(mat)[j] <- paste0("Braak",b) b = b + 1 }
51d31a335 Efrain Gonzalez Update (UNTESTED)	48	if(grepl("group\|disease\|control\|AD\|normal\|diagnosis\|Alzheimer\|Control\|Normal",mat[2,j])==TRUE){
0eb342056 Efrain Gonzalez This code combine...	49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102	colnames(mat)[j] <- paste0("Group",g) g = g + 1 } } j = j + 1 } mat } #2#Function for reorganizing information within the columns cinfo <- function(mat){ col <- dim(mat)[2] j <-2 for(j in 2:col){ if(grepl("Group",colnames(mat)[j]) == TRUE){ mat[,j] <- gsub(".+:\\s\|\\s.+;.+","",mat[,j]) } if(grepl("Age",colnames(mat)[j])==TRUE){ mat[,j] <- gsub("\\D","",mat[,j])%>% as.integer() } if(grepl("Sex",colnames(mat)[j])==TRUE){ mat[,j] <- gsub(".+:\\s","",mat[,j]) } if(grepl("PMI",colnames(mat)[j])==TRUE){ mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>% as.numeric() } if(grepl("Braak",colnames(mat)[j])==TRUE){ mat[,j]<-gsub(".+:\\s","",mat[,j])%>% as.roman()%>% as.integer() } j=j+1 } mat } #3#Function for labeling the gene IDs without names NAFIXING <- function(GIDNAM){ row <- dim(GIDNAM)[1] i <- 1 for(i in 1:row){ if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE\|\|is.na(GIDNAM[i,2])==TRUE){ GIDNAM[i,2] <- GIDNAM[i,1] } i <- i + 1 } GIDNAM } #4#Function for changing the gene ID to gene name cgeneID <- function(GeneName,DATA){
805474e1e Efrain Gonzalez Most recent updat...	103 104 105 106 107 108 109 110 111 112 113 114	nj <- t(GeneName) nq <- t(DATA) colGene <- dim(nj)[2] colDATA <- dim(nq)[2] j <- 1 for(j in 1:colDATA){ #where is that gene id located within the GPL file chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,]) if(is.na(sum(chngreq))==FALSE){ if(sum(chngreq) > 0){ nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j]) }
0eb342056 Efrain Gonzalez This code combine...	115	}
805474e1e Efrain Gonzalez Most recent updat...	116	j <- j + 1
0eb342056 Efrain Gonzalez This code combine...	117	}
805474e1e Efrain Gonzalez Most recent updat...	118	nq
0eb342056 Efrain Gonzalez This code combine...	119	}
805474e1e Efrain Gonzalez Most recent updat...	120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137	#cgeneID <- function(GeneName,DATA){ # colGene <- dim(GeneName)[2] # j <- 1 # for(j in 1:colGene){ # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) # if(is.na(sum(chngsreq))==FALSE){ # if(sum(chngsreq) > 0){ # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) # } # } # #if(sum(chngsreq) > 0){ # ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq]) # #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) # #} # j = j+1 # } # DATA #}
0eb342056 Efrain Gonzalez This code combine...	138 139 140 141 142 143 144	#5#Function for adjusting the gene names gcnames <- function(DiData,usecol=1){ nuruns <- dim(DiData)[2] i = 1 nwnam <- rep("0",length.out=nuruns) for(i in 1:nuruns){
62170e5f6 Efrain Gonzalez Update to gcnames...	145 146	if(length(strsplit(colnames(DiData)[i],"///\|//")[[1]]) >= usecol){ nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///\|//")[[1]][usecol])
0eb342056 Efrain Gonzalez This code combine...	147	} else{
62170e5f6 Efrain Gonzalez Update to gcnames...	148	nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///\|//")[[1]][1])
0eb342056 Efrain Gonzalez This code combine...	149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169	} } nwnam } #6# Function for discretizing the data dndat <- function(NDATA){ rownd <- dim(NDATA)[1] colnd <- dim(NDATA)[2] DDATA <- matrix(0,nrow=rownd,ncol=colnd) colnames(DDATA) <- colnames(NDATA) i <- 1 for(i in 1:rownd){ j <- 1 for(j in 1:colnd){ if(is.na(NDATA[i,j])==FALSE){ if(NDATA[i,j] < -1){ DDATA[i,j]=0L
805474e1e Efrain Gonzalez Most recent updat...	170	} else if(NDATA[i,j] > 1){
0eb342056 Efrain Gonzalez This code combine...	171	DDATA[i,j]=2L
f2cad6d27 Efrain Gonzalez Update to dndat f...	172	} else if(-1 <= NDATA[i,j] && NDATA[i,j] <= 1){
0eb342056 Efrain Gonzalez This code combine...	173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214	DDATA[i,j]=1L } } else{ DDATA[i,j] = NDATA[i,j] } j = j + 1 } i = i + 1 } DDATA } #The Rest of this code will be used every time you want to change a data set #Getting the series matrix file print("Choose the series matrix file that you want to Analyze") alz <- file.choose() #Getting the GPL file print("Choose the GPL file that correlates with the above series matrix file") genena <- file.choose() #Find out if it is a soft GPL file or not soft <- strsplit(genena,"[\\\|/]") %>% .[[1]] %>% .[length(.)] %>% grepl("soft\|annot",.) #Working with the wordy part of the document alzword <- alz %>% read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% filter(grepl("!Sample",X1))%>% filter(!grepl("!Sample_contact",X1)) ##Changing row names and column names: ALZWORD <- t(alzword) rownames(ALZWORD)=NULL colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) ALZWORD <- chngrownm(ALZWORD)[-1,] ALZWORD <- ALZWORD%>%
b2053f56b Efrain Gonzalez Updating handling...	215	as.data.frame(.,stringsAsFactors = FALSE)%>%
0eb342056 Efrain Gonzalez This code combine...	216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237	dplyr::select(-starts_with("col")) ##Reorganizing information within the columns ALZWORDF <- cinfo(ALZWORD) #Working with Actual Data part of file alzdat <- alz %>% read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) ALZDAT <- t(alzdat[,-1]) rownames(ALZDAT)=NULL ##Is there a clean version of the GPL file available? gplnum <- strsplit(genena,"[\\\|/]") %>% .[[1]] %>% .[length(.)] %>% gsub("\\D","",.) clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) if(clfileex >= 1){ #use the clean version geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
805474e1e Efrain Gonzalez Most recent updat...	238	} else if(clfileex == 0){
0eb342056 Efrain Gonzalez This code combine...	239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259	##Lets Create a clean version ##Gene ID to Gene Name if(soft == TRUE){ #Check to see if there is already a file containing information on soft files fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) if(fileex == 1){ #Check to see if this GPL soft file has been used before IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% .$GPL_FILE_NUM%>% grepl(gplnum,.) %>% sum() if(IDF == 1){ IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% .$GPL_FILE_NUM%>% grep(gplnum,.) idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% .$LOC_ID %>% .[IDLOCAL] geneIDNam <- genena %>% read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
cb1063ceb Efrain Gonzalez Added "Gene symbo...	260	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$\|^Gene symbol$",colnames(.)))
805474e1e Efrain Gonzalez Most recent updat...	261	} else if(IDF == 0){
0eb342056 Efrain Gonzalez This code combine...	262 263 264 265 266 267 268 269 270 271	#No information on this particular GPL file idLOCGPL <- genena %>% read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% t(.) %>% grep("^ID\\s*$",.) %>% -1 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) geneIDNam <- genena %>% read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
cb1063ceb Efrain Gonzalez Added "Gene symbo...	272	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$\|^Gene symbol$",colnames(.)))
0eb342056 Efrain Gonzalez This code combine...	273	}
805474e1e Efrain Gonzalez Most recent updat...	274	} else if(fileex == 0){
0eb342056 Efrain Gonzalez This code combine...	275 276 277 278 279 280 281 282 283 284 285	#We must create a file that we can access for later use idLOCGPL <- genena %>% read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% t(.) %>% grep("^ID\\s*$",.) %>% -1 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) geneIDNam <- genena %>% read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
cb1063ceb Efrain Gonzalez Added "Gene symbo...	286	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$\|^Gene symbol$",colnames(.)))
0eb342056 Efrain Gonzalez This code combine...	287	}
805474e1e Efrain Gonzalez Most recent updat...	288	} else if(soft == FALSE){
0eb342056 Efrain Gonzalez This code combine...	289 290	geneIDNam <- genena %>% read_delim(delim="\t",comment = "#")%>%
cb1063ceb Efrain Gonzalez Added "Gene symbo...	291	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$\|^Gene symbol$",colnames(.)))
0eb342056 Efrain Gonzalez This code combine...	292 293 294 295 296 297 298 299 300 301 302 303 304 305 306	} ##Labeling the gene IDs without names geneIDNam <- NAFIXING(geneIDNam) ##remove the whitespace geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) ##Here is the clean version write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) } ##Changing the gene ID to gene name
805474e1e Efrain Gonzalez Most recent updat...	307	ALZDAT1 <- cgeneID(geneIDNam,alzdat)
0eb342056 Efrain Gonzalez This code combine...	308 309 310 311 312 313 314 315 316	colnames(ALZDAT) = ALZDAT1[1,] ##Adjusting the column names aka the gene names colnames(ALZDAT) <- gcnames(ALZDAT) #Full RAW Data Fullalzdwr <- ALZDAT %>%
db021299e Efrain Gonzalez Updated "stringsA...	317	as.data.frame(.,stringsAsFactors = FALSE) %>%
0eb342056 Efrain Gonzalez This code combine...	318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335	cbind(ALZWORDF,.) #Raw file is output nfnaex <- strsplit(alz,"[\\]") %>% .[[1]] %>% .[length(.)] %>% gsub("\\D","",.) %>% c("GSE",.,"aftexcel.txt") %>% paste(collapse = "") write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") #Now for the discretization part ##get the wordy part again rawword <- t(ALZWORDF) ##where is ID_REF located
689231363 Efrain Gonzalez Update error with...	336	hereim <- grep("ID_REF",rownames(rawword))
0eb342056 Efrain Gonzalez This code combine...	337 338 339 340 341 342	##Subject Names GSM... subjnam <- rawword[hereim,] ##Getting the names for the rows namedarows <- rownames(rawword)[-hereim] %>%
b2053f56b Efrain Gonzalez Updating handling...	343	as.data.frame(.,stringsAsFactors = FALSE)
0eb342056 Efrain Gonzalez This code combine...	344	RAWWORD <- rawword[-hereim,] %>%
b2053f56b Efrain Gonzalez Updating handling...	345	as.data.frame(.,stringsAsFactors = FALSE) %>%
0eb342056 Efrain Gonzalez This code combine...	346 347 348 349	bind_cols(namedarows,.) z <- 1 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) for(z in 1:dim(RAWWORD)[1]){
805474e1e Efrain Gonzalez Most recent updat...	350 351 352 353 354 355 356	if(sum(is.na(RAWWORD[z,])) > 0){ naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) } if(length(grep("NA",RAWWORD[z,])) > 0){ naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] } z <- z + 1
0eb342056 Efrain Gonzalez This code combine...	357 358 359 360 361 362 363 364	} colnames(naroww) <- "ROW_NAs" RAWWORD <- bind_cols(RAWWORD,naroww) roALZna <- t(ALZDAT) %>% rownames(.) %>%
b2053f56b Efrain Gonzalez Updating handling...	365	as.data.frame(.,stringsAsFactors = FALSE)
0eb342056 Efrain Gonzalez This code combine...	366 367 368	colnames(roALZna) <- "ID_REF" RAWDAT <- t(ALZDAT) %>%
b2053f56b Efrain Gonzalez Updating handling...	369	as.data.frame(.,stringsAsFactors = FALSE)
0eb342056 Efrain Gonzalez This code combine...	370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394	colnames(RAWDAT) <- NULL rownames(RAWDAT) <- NULL RAWDAT2 <- RAWDAT %>% cbind(roALZna,.) %>% dplyr::arrange(.,ID_REF) ##Editing the file for R processing RAWDATID <- RAWDAT2[,1] %>% as.matrix(.) RAWDATNUM <- RAWDAT2[,-1] %>% mapply(.,FUN = as.numeric) %>% t(.) ##Consolidating genes with the same name ###create empty matrix of size equal to tabRDATID tabRDATID <- table(RAWDATID) NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) j <- 1 for(j in 1:length(tabRDATID)){ ##Putting the ones without duplicates in their new homes if(tabRDATID[j] == 1){ NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
805474e1e Efrain Gonzalez Most recent updat...	395	} else if(tabRDATID[j] > 1){
0eb342056 Efrain Gonzalez This code combine...	396	##Averaging duplicates and putting them in their new homes
0eb342056 Efrain Gonzalez This code combine...	397 398 399 400 401 402 403 404 405 406 407	NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) } j <- j + 1 } ##Scaling the Data scrawdat <- NuRDATN%>% scale() attr(scrawdat,"scaled:center") <- NULL attr(scrawdat,"scaled:scale") <- NULL colnames(scrawdat) <- rownames(tabRDATID)
556b97bfa Efrain Gonzalez Update Now output...	408 409 410 411 412 413 414 415 416 417 418 419	#Outputting the Z-score file nfnzsc <- strsplit(alz,"[\\]") %>% .[[1]] %>% .[length(.)] %>% gsub("\\D","",.) %>% c("GSE",.,"zscore.txt") %>% paste(collapse = "") zscraw <- scrawdat %>% t()%>% as.data.frame(.,stringsAsFactors = FALSE) colnames(zscraw) <- subjnam write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = TRUE)
0eb342056 Efrain Gonzalez This code combine...	420 421 422 423	##Discretized the Data dialzdat <- scrawdat %>% dndat(.) %>% t()%>%
b2053f56b Efrain Gonzalez Updating handling...	424	as.data.frame(.,stringsAsFactors = FALSE)
0eb342056 Efrain Gonzalez This code combine...	425 426 427	colnames(dialzdat) <- rownames(RAWDATNUM) ##setting "ID_REF" as a new variable
b2053f56b Efrain Gonzalez Updating handling...	428	geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)
0eb342056 Efrain Gonzalez This code combine...	429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464	colnames(geneNAM) <- "ID_REF" rownames(dialzdat) <- NULL dialzdat <-bind_cols(geneNAM,dialzdat) ##NAs in a column x <- 2 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) nacol[1,1] = "COL_NAs" for(x in 2:dim(dialzdat)[2]){ nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) x <- x + 1 } colnames(nacol) <- colnames(dialzdat) dialzdat<-bind_rows(dialzdat,nacol) ##NAs in a row y <- 1 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) for(y in 1:dim(dialzdat)[1]){ narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) y <- y + 1 } colnames(narowd) <- "ROW_NAs" dialzdat <- bind_cols(dialzdat,narowd) colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam colnames(RAWWORD) <- colnames(dialzdat) ##converting to character so that the clinical can be brought together with discrete data k <- 2 for(k in 2:dim(dialzdat)[2]-1){ dialzdat[,k] <- as.character(dialzdat[,k]) k <- k + 1 } #The End the full data Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) #Produces Discrete file
689231363 Efrain Gonzalez Update error with...	465	nfnaex2 <- strsplit(alz,"[\\\|/]") %>%
0eb342056 Efrain Gonzalez This code combine...	466 467 468 469 470	.[[1]] %>% .[length(.)] %>% gsub("\\D","",.) %>% c("GSE",.,"dscrt.txt") %>% paste(collapse = "")
689231363 Efrain Gonzalez Update error with...	471	write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
0eb342056 Efrain Gonzalez This code combine...	472