Blame view
RCleanDscret.R
12.5 KB
234f89c9a Outputs raw and d... |
1 |
##Posted 6/15/2017 |
c2b2c096e Update to amount ... |
2 |
options(digits = 11) |
234f89c9a Outputs raw and d... |
3 |
|
0eb342056 This code combine... |
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
#Libraries required to run the code library(pryr) library(MASS) library(dplyr) library(tidyr) library(readr) library(stringr) #Necessary Functions #1#Function for handling the changing of row names and column names chngrownm <- function(mat){ row <- dim(mat)[1] col <- dim(mat)[2] j <- 1 x <- 1 p <- 1 a <- 1 b <- 1 g <- 1 for(j in 1:col){ if("!Sample_source_name_ch1"==mat[1,j]){ colnames(mat)[j] <- "Brain_Region" |
805474e1e Most recent updat... |
27 |
} else if("!Sample_title" == mat[1,j]){ |
0eb342056 This code combine... |
28 |
colnames(mat)[j] <- "Title" |
805474e1e Most recent updat... |
29 |
} else if("!Sample_geo_accession" == mat[1,j]){ |
0eb342056 This code combine... |
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
colnames(mat)[j] <- "ID_REF" } else{ if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){ colnames(mat)[j] <- paste0("Sex",x) x = x + 1 } if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){ colnames(mat)[j] <- paste0("PMI",p) p = p + 1 } if(grepl("age|Age|AGE",mat[2,j])==TRUE){ colnames(mat)[j] <- paste0("Age",a) a = a + 1 } if(grepl("braak|b&b",mat[2,j])==TRUE){ colnames(mat)[j] <- paste0("Braak",b) b = b + 1 } |
51d31a335 Update (UNTESTED) |
48 |
if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,j])==TRUE){ |
0eb342056 This code combine... |
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
colnames(mat)[j] <- paste0("Group",g) g = g + 1 } } j = j + 1 } mat } #2#Function for reorganizing information within the columns cinfo <- function(mat){ col <- dim(mat)[2] j <-2 for(j in 2:col){ if(grepl("Group",colnames(mat)[j]) == TRUE){ mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j]) } if(grepl("Age",colnames(mat)[j])==TRUE){ mat[,j] <- gsub("\\D","",mat[,j])%>% as.integer() } if(grepl("Sex",colnames(mat)[j])==TRUE){ mat[,j] <- gsub(".+:\\s","",mat[,j]) } if(grepl("PMI",colnames(mat)[j])==TRUE){ mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>% as.numeric() } if(grepl("Braak",colnames(mat)[j])==TRUE){ mat[,j]<-gsub(".+:\\s","",mat[,j])%>% as.roman()%>% as.integer() } j=j+1 } mat } #3#Function for labeling the gene IDs without names NAFIXING <- function(GIDNAM){ row <- dim(GIDNAM)[1] i <- 1 for(i in 1:row){ if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){ GIDNAM[i,2] <- GIDNAM[i,1] } i <- i + 1 } GIDNAM } #4#Function for changing the gene ID to gene name cgeneID <- function(GeneName,DATA){ |
805474e1e Most recent updat... |
103 104 105 106 107 108 109 110 111 112 113 114 |
nj <- t(GeneName) nq <- t(DATA) colGene <- dim(nj)[2] colDATA <- dim(nq)[2] j <- 1 for(j in 1:colDATA){ #where is that gene id located within the GPL file chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,]) if(is.na(sum(chngreq))==FALSE){ if(sum(chngreq) > 0){ nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j]) } |
0eb342056 This code combine... |
115 |
} |
805474e1e Most recent updat... |
116 |
j <- j + 1 |
0eb342056 This code combine... |
117 |
} |
805474e1e Most recent updat... |
118 |
nq |
0eb342056 This code combine... |
119 |
} |
805474e1e Most recent updat... |
120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
#cgeneID <- function(GeneName,DATA){ # colGene <- dim(GeneName)[2] # j <- 1 # for(j in 1:colGene){ # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) # if(is.na(sum(chngsreq))==FALSE){ # if(sum(chngsreq) > 0){ # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) # } # } # #if(sum(chngsreq) > 0){ # ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq]) # #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) # #} # j = j+1 # } # DATA #} |
0eb342056 This code combine... |
138 139 140 141 142 143 144 |
#5#Function for adjusting the gene names gcnames <- function(DiData,usecol=1){ nuruns <- dim(DiData)[2] i = 1 nwnam <- rep("0",length.out=nuruns) for(i in 1:nuruns){ |
62170e5f6 Update to gcnames... |
145 146 |
if(length(strsplit(colnames(DiData)[i],"///|//")[[1]]) >= usecol){ nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///|//")[[1]][usecol]) |
0eb342056 This code combine... |
147 |
} else{ |
62170e5f6 Update to gcnames... |
148 |
nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///|//")[[1]][1]) |
0eb342056 This code combine... |
149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
} } nwnam } #6# Function for discretizing the data dndat <- function(NDATA){ rownd <- dim(NDATA)[1] colnd <- dim(NDATA)[2] DDATA <- matrix(0,nrow=rownd,ncol=colnd) colnames(DDATA) <- colnames(NDATA) i <- 1 for(i in 1:rownd){ j <- 1 for(j in 1:colnd){ if(is.na(NDATA[i,j])==FALSE){ if(NDATA[i,j] < -1){ DDATA[i,j]=0L |
805474e1e Most recent updat... |
170 |
} else if(NDATA[i,j] > 1){ |
0eb342056 This code combine... |
171 |
DDATA[i,j]=2L |
f2cad6d27 Update to dndat f... |
172 |
} else if(-1 <= NDATA[i,j] && NDATA[i,j] <= 1){ |
0eb342056 This code combine... |
173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 |
DDATA[i,j]=1L } } else{ DDATA[i,j] = NDATA[i,j] } j = j + 1 } i = i + 1 } DDATA } #The Rest of this code will be used every time you want to change a data set #Getting the series matrix file print("Choose the series matrix file that you want to Analyze") alz <- file.choose() #Getting the GPL file print("Choose the GPL file that correlates with the above series matrix file") genena <- file.choose() #Find out if it is a soft GPL file or not soft <- strsplit(genena,"[\\|/]") %>% .[[1]] %>% .[length(.)] %>% grepl("soft|annot",.) #Working with the wordy part of the document alzword <- alz %>% read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% filter(grepl("!Sample",X1))%>% filter(!grepl("!Sample_contact",X1)) ##Changing row names and column names: ALZWORD <- t(alzword) rownames(ALZWORD)=NULL colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) ALZWORD <- chngrownm(ALZWORD)[-1,] ALZWORD <- ALZWORD%>% |
b2053f56b Updating handling... |
215 |
as.data.frame(.,stringsAsFactors = FALSE)%>% |
0eb342056 This code combine... |
216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 |
dplyr::select(-starts_with("col")) ##Reorganizing information within the columns ALZWORDF <- cinfo(ALZWORD) #Working with Actual Data part of file alzdat <- alz %>% read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) ALZDAT <- t(alzdat[,-1]) rownames(ALZDAT)=NULL ##Is there a clean version of the GPL file available? gplnum <- strsplit(genena,"[\\|/]") %>% .[[1]] %>% .[length(.)] %>% gsub("\\D","",.) clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) if(clfileex >= 1){ #use the clean version geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") |
805474e1e Most recent updat... |
238 |
} else if(clfileex == 0){ |
0eb342056 This code combine... |
239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 |
##Lets Create a clean version ##Gene ID to Gene Name if(soft == TRUE){ #Check to see if there is already a file containing information on soft files fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) if(fileex == 1){ #Check to see if this GPL soft file has been used before IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% .$GPL_FILE_NUM%>% grepl(gplnum,.) %>% sum() if(IDF == 1){ IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% .$GPL_FILE_NUM%>% grep(gplnum,.) idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% .$LOC_ID %>% .[IDLOCAL] geneIDNam <- genena %>% read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% |
cb1063ceb Added "Gene symbo... |
260 |
dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.))) |
805474e1e Most recent updat... |
261 |
} else if(IDF == 0){ |
0eb342056 This code combine... |
262 263 264 265 266 267 268 269 270 271 |
#No information on this particular GPL file idLOCGPL <- genena %>% read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% t(.) %>% grep("^ID\\s*$",.) %>% -1 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) geneIDNam <- genena %>% read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% |
cb1063ceb Added "Gene symbo... |
272 |
dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.))) |
0eb342056 This code combine... |
273 |
} |
805474e1e Most recent updat... |
274 |
} else if(fileex == 0){ |
0eb342056 This code combine... |
275 276 277 278 279 280 281 282 283 284 285 |
#We must create a file that we can access for later use idLOCGPL <- genena %>% read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% t(.) %>% grep("^ID\\s*$",.) %>% -1 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) geneIDNam <- genena %>% read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% |
cb1063ceb Added "Gene symbo... |
286 |
dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.))) |
0eb342056 This code combine... |
287 |
} |
805474e1e Most recent updat... |
288 |
} else if(soft == FALSE){ |
0eb342056 This code combine... |
289 290 |
geneIDNam <- genena %>% read_delim(delim="\t",comment = "#")%>% |
cb1063ceb Added "Gene symbo... |
291 |
dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.))) |
0eb342056 This code combine... |
292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 |
} ##Labeling the gene IDs without names geneIDNam <- NAFIXING(geneIDNam) ##remove the whitespace geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) ##Here is the clean version write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) } ##Changing the gene ID to gene name |
805474e1e Most recent updat... |
307 |
ALZDAT1 <- cgeneID(geneIDNam,alzdat) |
0eb342056 This code combine... |
308 309 310 311 312 313 314 315 316 |
colnames(ALZDAT) = ALZDAT1[1,] ##Adjusting the column names aka the gene names colnames(ALZDAT) <- gcnames(ALZDAT) #Full RAW Data Fullalzdwr <- ALZDAT %>% |
db021299e Updated "stringsA... |
317 |
as.data.frame(.,stringsAsFactors = FALSE) %>% |
0eb342056 This code combine... |
318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 |
cbind(ALZWORDF,.) #Raw file is output nfnaex <- strsplit(alz,"[\\]") %>% .[[1]] %>% .[length(.)] %>% gsub("\\D","",.) %>% c("GSE",.,"aftexcel.txt") %>% paste(collapse = "") write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") #Now for the discretization part ##get the wordy part again rawword <- t(ALZWORDF) ##where is ID_REF located |
689231363 Update error with... |
336 |
hereim <- grep("ID_REF",rownames(rawword)) |
0eb342056 This code combine... |
337 338 339 340 341 342 |
##Subject Names GSM... subjnam <- rawword[hereim,] ##Getting the names for the rows namedarows <- rownames(rawword)[-hereim] %>% |
b2053f56b Updating handling... |
343 |
as.data.frame(.,stringsAsFactors = FALSE) |
0eb342056 This code combine... |
344 |
RAWWORD <- rawword[-hereim,] %>% |
b2053f56b Updating handling... |
345 |
as.data.frame(.,stringsAsFactors = FALSE) %>% |
0eb342056 This code combine... |
346 347 348 349 |
bind_cols(namedarows,.) z <- 1 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) for(z in 1:dim(RAWWORD)[1]){ |
805474e1e Most recent updat... |
350 351 352 353 354 355 356 |
if(sum(is.na(RAWWORD[z,])) > 0){ naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) } if(length(grep("NA",RAWWORD[z,])) > 0){ naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] } z <- z + 1 |
0eb342056 This code combine... |
357 358 359 360 361 362 363 364 |
} colnames(naroww) <- "ROW_NAs" RAWWORD <- bind_cols(RAWWORD,naroww) roALZna <- t(ALZDAT) %>% rownames(.) %>% |
b2053f56b Updating handling... |
365 |
as.data.frame(.,stringsAsFactors = FALSE) |
0eb342056 This code combine... |
366 367 368 |
colnames(roALZna) <- "ID_REF" RAWDAT <- t(ALZDAT) %>% |
b2053f56b Updating handling... |
369 |
as.data.frame(.,stringsAsFactors = FALSE) |
0eb342056 This code combine... |
370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 |
colnames(RAWDAT) <- NULL rownames(RAWDAT) <- NULL RAWDAT2 <- RAWDAT %>% cbind(roALZna,.) %>% dplyr::arrange(.,ID_REF) ##Editing the file for R processing RAWDATID <- RAWDAT2[,1] %>% as.matrix(.) RAWDATNUM <- RAWDAT2[,-1] %>% mapply(.,FUN = as.numeric) %>% t(.) ##Consolidating genes with the same name ###create empty matrix of size equal to tabRDATID tabRDATID <- table(RAWDATID) NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) j <- 1 for(j in 1:length(tabRDATID)){ ##Putting the ones without duplicates in their new homes if(tabRDATID[j] == 1){ NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] |
805474e1e Most recent updat... |
395 |
} else if(tabRDATID[j] > 1){ |
0eb342056 This code combine... |
396 |
##Averaging duplicates and putting them in their new homes |
0eb342056 This code combine... |
397 398 399 400 401 402 403 404 405 406 407 |
NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) } j <- j + 1 } ##Scaling the Data scrawdat <- NuRDATN%>% scale() attr(scrawdat,"scaled:center") <- NULL attr(scrawdat,"scaled:scale") <- NULL colnames(scrawdat) <- rownames(tabRDATID) |
556b97bfa Update Now output... |
408 409 410 411 412 413 414 415 416 417 418 419 |
#Outputting the Z-score file nfnzsc <- strsplit(alz,"[\\]") %>% .[[1]] %>% .[length(.)] %>% gsub("\\D","",.) %>% c("GSE",.,"zscore.txt") %>% paste(collapse = "") zscraw <- scrawdat %>% t()%>% as.data.frame(.,stringsAsFactors = FALSE) colnames(zscraw) <- subjnam write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = TRUE) |
0eb342056 This code combine... |
420 421 422 423 |
##Discretized the Data dialzdat <- scrawdat %>% dndat(.) %>% t()%>% |
b2053f56b Updating handling... |
424 |
as.data.frame(.,stringsAsFactors = FALSE) |
0eb342056 This code combine... |
425 426 427 |
colnames(dialzdat) <- rownames(RAWDATNUM) ##setting "ID_REF" as a new variable |
b2053f56b Updating handling... |
428 |
geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE) |
0eb342056 This code combine... |
429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 |
colnames(geneNAM) <- "ID_REF" rownames(dialzdat) <- NULL dialzdat <-bind_cols(geneNAM,dialzdat) ##NAs in a column x <- 2 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) nacol[1,1] = "COL_NAs" for(x in 2:dim(dialzdat)[2]){ nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) x <- x + 1 } colnames(nacol) <- colnames(dialzdat) dialzdat<-bind_rows(dialzdat,nacol) ##NAs in a row y <- 1 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) for(y in 1:dim(dialzdat)[1]){ narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) y <- y + 1 } colnames(narowd) <- "ROW_NAs" dialzdat <- bind_cols(dialzdat,narowd) colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam colnames(RAWWORD) <- colnames(dialzdat) ##converting to character so that the clinical can be brought together with discrete data k <- 2 for(k in 2:dim(dialzdat)[2]-1){ dialzdat[,k] <- as.character(dialzdat[,k]) k <- k + 1 } #The End the full data Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) #Produces Discrete file |
689231363 Update error with... |
465 |
nfnaex2 <- strsplit(alz,"[\\|/]") %>% |
0eb342056 This code combine... |
466 467 468 469 470 |
.[[1]] %>% .[length(.)] %>% gsub("\\D","",.) %>% c("GSE",.,"dscrt.txt") %>% paste(collapse = "") |
689231363 Update error with... |
471 |
write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) |
0eb342056 This code combine... |
472 |