Commit adfed316993d072e749a8cd85434fc667c054f22
1 parent
689231363c
Exists in
master
An automated version of the RCleanDscret.R
Working on outputting more insightful errors and warnings. (UNTESTED)
Showing
1 changed file
with
752 additions
and
0 deletions
Show diff stats
RAutoClDs.R
| ... | ... | @@ -0,0 +1,752 @@ |
| 1 | +#Efrain H. Gonzalez | |
| 2 | +#6/19/2017 | |
| 3 | +#Libraries required to run the code | |
| 4 | +library(pryr) | |
| 5 | +library(MASS) | |
| 6 | +library(dplyr) | |
| 7 | +library(tidyr) | |
| 8 | +library(readr) | |
| 9 | +library(stringr) | |
| 10 | + | |
| 11 | + | |
| 12 | +#Necessary Functions | |
#1# Function for handling the changing of row names and column names
#
# Renames the columns of a transposed GEO "!Sample" table.
#
# mat: character matrix with column names already set. Row 1 holds the GEO
#      field names (e.g. "!Sample_title"); row 2 holds the first sample's
#      values, which are scanned for keywords.
#
# Returns `mat` with these columns renamed:
#   "!Sample_source_name_ch1" -> "Brain_Region"
#   "!Sample_title"           -> "Title"
#   "!Sample_geo_accession"   -> "ID_REF"
#   any other column whose row-2 value matches a keyword pattern ->
#   "Sex<n>", "PMI<n>", "Age<n>", "Braak<n>" or "Group<n>", where <n> counts
#   the matches seen so far for that label.
# Columns that match nothing keep their existing name.
chngrownm <- function(mat) {
  # Keyword patterns, tested in this order; when several match the same
  # column the last matching label wins (each match still uses up a number).
  patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  counters <- c(Sex = 1L, PMI = 1L, Age = 1L, Braak = 1L, Group = 1L)

  for (j in seq_len(ncol(mat))) {
    field <- mat[1, j]
    # The three fixed fields are mutually exclusive with the keyword scan;
    # otherwise a title such as "AD case 1" would be relabelled "Group1".
    if (isTRUE(field == "!Sample_source_name_ch1")) {
      colnames(mat)[j] <- "Brain_Region"
    } else if (isTRUE(field == "!Sample_title")) {
      colnames(mat)[j] <- "Title"
    } else if (isTRUE(field == "!Sample_geo_accession")) {
      colnames(mat)[j] <- "ID_REF"
    } else {
      for (label in names(patterns)) {
        if (grepl(patterns[[label]], mat[2, j])) {
          colnames(mat)[j] <- paste0(label, counters[[label]])
          counters[[label]] <- counters[[label]] + 1L
        }
      }
    }
  }
  mat
}
| 59 | + | |
#2# Function for reorganizing information within the columns
#
# Cleans the clinical columns produced by chngrownm(), column by column,
# based on the column name. The first column is never touched.
#
# mat: data frame (or matrix) with named columns.
#
# Returns `mat` where, for every column from the second onward:
#   name contains "Group" -> text up to the last ": " is removed
#                            (as is a trailing " ...;..." part)
#   name contains "Age"   -> every non-digit is removed, then as.integer()
#   name contains "Sex"   -> text up to the last ": " is removed
#   name contains "PMI"   -> everything except digits and "." is removed,
#                            then as.numeric()
#   name contains "Braak" -> text up to the last ": " is removed, the rest is
#                            read as a roman numeral and converted to integer
cinfo <- function(mat) {
  n_col <- dim(mat)[2]
  # With fewer than two columns there is nothing to clean; `2:n_col` would
  # otherwise count down (2, 1) and index a column that does not exist.
  if (is.null(n_col) || n_col < 2L) {
    return(mat)
  }

  for (j in 2:n_col) {
    label <- colnames(mat)[j]
    if (grepl("Group", label)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", label)) {
      # NOTE(review): stripping every non-digit turns "85.5" into 855;
      # confirm whether decimal ages can occur in the input.
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", label)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", label)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", label)) {
      stage <- gsub(".+:\\s", "", mat[, j])
      mat[, j] <- as.integer(utils::as.roman(stage))
    }
  }
  mat
}
| 88 | + | |
#3# Function for labeling the gene IDs without names
#
# Fills in missing gene names with the gene ID from the same row.
#
# GIDNAM: data frame / tibble / matrix whose first column is the gene ID and
#         whose second column is the gene name.
#
# Returns `GIDNAM` where every second-column entry that is NA, or is the text
# "NA" optionally followed by whitespace, is replaced by the first-column
# value of that row. Factor columns are handled as character so that IDs that
# are not existing levels can be written. A table with zero rows is returned
# unchanged.
NAFIXING <- function(GIDNAM) {
  is_df <- is.data.frame(GIDNAM)
  ids <- if (is_df) GIDNAM[[1]] else GIDNAM[, 1]
  symbols <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]
  if (is.factor(ids)) {
    ids <- as.character(ids)
  }
  if (is.factor(symbols)) {
    symbols <- as.character(symbols)
  }

  unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)
  symbols[unnamed] <- ids[unnamed]

  if (is_df) {
    GIDNAM[[2]] <- symbols
  } else {
    GIDNAM[, 2] <- symbols
  }
  GIDNAM
}
| 101 | + | |
#4# Function for changing the gene ID to gene name
#
# Replaces the IDs in the first row of DATA with their gene names.
#
# GeneName: matrix with one column per gene; row 1 is the gene ID, row 2 the
#           gene name.
# DATA:     matrix whose first row holds gene IDs.
#
# Returns `DATA` with every first-row entry that is exactly equal to an ID in
# GeneName[1, ] replaced by the matching GeneName[2, ] entry. Entries with no
# matching ID, and NA entries, are left as they are. When an ID occurs more
# than once in GeneName, its first occurrence is used.
cgeneID <- function(GeneName, DATA) {
  if (dim(GeneName)[2] == 0L) {
    return(DATA)
  }
  ids <- GeneName[1, ]
  symbols <- GeneName[2, ]

  # Exact lookup instead of building one regular expression per ID: IDs that
  # contain regex metacharacters (".", "+", "(" ...) are compared literally,
  # and the whole mapping is a single pass.
  idx <- match(DATA[1, ], ids, incomparables = NA)
  found <- !is.na(idx)
  DATA[1, found] <- symbols[idx[found]]
  DATA
}
| 117 | + | |
#5# Function for adjusting the gene names
#
# Picks one gene name out of column names that list several names separated
# by "///".
#
# DiData: matrix or data frame with column names.
# usecol: which of the "///"-separated names to keep (default: the first).
#         When a column lists fewer than `usecol` names, its first name is
#         used instead.
#
# Returns a character vector with one whitespace-trimmed name per column of
# DiData (character(0) when DiData has no columns).
gcnames <- function(DiData, usecol = 1) {
  if (dim(DiData)[2] == 0L) {
    return(character(0))
  }
  gene_labels <- colnames(DiData)
  if (is.null(gene_labels)) {
    stop("gcnames(): 'DiData' must have column names", call. = FALSE)
  }

  vapply(
    strsplit(gene_labels, "///"),
    function(parts) {
      pick <- if (length(parts) >= usecol) usecol else 1
      trimws(parts[pick])
    },
    character(1),
    USE.NAMES = FALSE
  )
}
| 134 | + | |
#6# Function for discretizing the data
#
# Converts numeric values into three levels.
#
# NDATA: numeric matrix (the caller passes column-scaled data).
#
# Returns a numeric matrix of the same dimensions, carrying over the column
# names of NDATA, with
#   0  where the value is below -1
#   1  where the value is between -1 and 1, both ends included
#   2  where the value is above 1
#   NA where the value is missing (the missing value is copied through).
dndat <- function(NDATA) {
  vals <- as.matrix(NDATA)

  # Start everything in the middle level, then move the two tails. A value of
  # exactly 1 therefore stays in the middle level rather than dropping to 0.
  DDATA <- matrix(1, nrow = dim(NDATA)[1], ncol = dim(NDATA)[2])
  colnames(DDATA) <- colnames(NDATA)
  DDATA[which(vals < -1)] <- 0
  DDATA[which(vals > 1)] <- 2

  missing <- which(is.na(vals))
  DDATA[missing] <- vals[missing]
  DDATA
}
| 165 | + | |
| 166 | + | |
# Helper: digits found in the last component of a path
# (path separators recognised: "\", "|" and "/").
.theft_path_digits <- function(path) {
  parts <- strsplit(path, "[\\|/]")[[1]]
  gsub("\\D", "", parts[length(parts)])
}

# Helper: line offset of the "ID" header cell inside a SOFT GPL file, found by
# scanning the first 1000 non-"!" rows; used as `skip` when reading the table.
.theft_soft_header_offset <- function(genena) {
  probe <- read_delim(genena, delim = "\t", col_names = FALSE,
                      comment = "!", n_max = 1000)
  hits <- grep("^ID\\s*$", t(probe))
  if (length(hits) == 0L) {
    stop("No 'ID' header found in the first 1000 rows of '", genena, "'",
         call. = FALSE)
  }
  hits[1] - 1
}

# Helper: returns the gene ID -> gene name table for one GPL file. Reuses
# "Clean_GPL<num>.txt" when it exists, otherwise builds it from the GPL file
# and writes it to the working directory for later runs.
.theft_gene_map <- function(genena, gplnum, soft) {
  clean_file <- paste0("Clean_GPL", gplnum, ".txt")
  if (file.exists(clean_file)) {
    #use the clean version
    return(read_delim(clean_file, delim = "\t",
                      col_names = c("ID", "Symbol"), comment = "!"))
  }

  ##Lets Create a clean version
  if (soft) {
    # GPL_ID_LOC.txt remembers, per GPL number, where the data table starts.
    loc_file <- "GPL_ID_LOC.txt"
    if (file.exists(loc_file)) {
      known <- read_delim(loc_file, delim = "\t", col_names = TRUE)
      # Exact comparison: GPL57 must not be taken for GPL570.
      seen <- which(as.numeric(known$GPL_FILE_NUM) == as.numeric(gplnum))
      if (length(seen) > 0L) {
        offset <- known$LOC_ID[seen[1]]
      } else {
        #No information on this particular GPL file
        offset <- .theft_soft_header_offset(genena)
        cbind(as.integer(gplnum), as.integer(offset)) %>%
          cat(file = loc_file, sep = "\t", fill = TRUE, append = TRUE)
      }
    } else {
      #We must create a file that we can access for later use
      offset <- .theft_soft_header_offset(genena)
      Firstval <- cbind(as.integer(gplnum), as.integer(offset))
      colnames(Firstval) <- c("GPL_FILE_NUM", "LOC_ID")
      write.table(Firstval, file = loc_file, sep = "\t",
                  row.names = FALSE, col.names = TRUE)
    }
    geneIDNam <- genena %>%
      read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = offset) %>%
      dplyr::select(., ID, grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$", colnames(.)))
  } else {
    geneIDNam <- genena %>%
      read_delim(delim = "\t", comment = "#") %>%
      dplyr::select(., ID, grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$", colnames(.)))
  }

  ##Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ##remove the whitespace
  geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1, ]), str_trim(t(geneIDNam)[2, ])))

  ##Here is the clean version
  write.table(geneIDNam, file = clean_file, sep = "\t",
              row.names = FALSE, col.names = FALSE)
  geneIDNam
}

# Helper: cleans one GSE series-matrix file `alz` and writes
# "GSE<digits>aftexcel.txt" (raw clinical + expression data) and
# "GSE<digits>dscrt.txt" (clinical + discretized data) to the working
# directory. Returns the two file names invisibly.
.theft_clean_file <- function(alz) {
  #Working with the wordy part of the document
  alzword <- alz %>%
    read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
    dplyr::filter(grepl("!Sample", X1)) %>%
    dplyr::filter(!grepl("!Sample_contact", X1))

  #Getting the GPL file
  platform_id <- str_trim(alzword$X2[grep("_platform_id", alzword$X1)])
  if (length(platform_id) == 0L) {
    stop("No '_platform_id' line found in '", alz, "'", call. = FALSE)
  }
  dir_files <- list.files()
  genena <- dir_files[grep(paste0("^", platform_id[1]), dir_files)]
  if (length(genena) == 0L) {
    stop("No file starting with '", platform_id[1],
         "' found in the working directory (needed for '", alz, "')",
         call. = FALSE)
  }
  genena <- genena[1]
  genena_parts <- strsplit(genena, "[\\|/]")[[1]]

  #Find out if it is a soft GPL file or not
  soft <- grepl("soft", genena_parts[length(genena_parts)])

  ##Changing row names and column names:
  ALZWORD <- t(alzword)
  rownames(ALZWORD) <- NULL
  colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
  ALZWORD <- chngrownm(ALZWORD)[-1, ]
  ALZWORD <- ALZWORD %>%
    as.data.frame() %>%
    dplyr::select(-starts_with("col"))

  ##Reorganizing information within the columns and final clinical data
  ALZWORDF <- cinfo(ALZWORD)

  #Working with Actual Data part of file
  alzdat <- alz %>%
    read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = 1)
  ALZDAT <- t(alzdat[, -1])
  rownames(ALZDAT) <- NULL

  ##Gene ID to gene name table (clean version reused or created)
  gplnum <- .theft_path_digits(genena)
  geneIDNam <- .theft_gene_map(genena, gplnum, soft)

  ##Changing the gene ID to gene name
  ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
  colnames(ALZDAT) <- ALZDAT1[1, ]

  ##Adjusting the column names aka the gene names
  colnames(ALZDAT) <- gcnames(ALZDAT)

  #Full RAW Data
  Fullalzdwr <- ALZDAT %>%
    as.data.frame() %>%
    cbind(ALZWORDF, .)

  #Raw file is output
  gse_digits <- .theft_path_digits(alz)
  nfnaex <- paste0("GSE", gse_digits, "aftexcel.txt")
  write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

  #Now for the discretization part
  ##get the wordy part again
  rawword <- t(ALZWORDF)

  ##where is ID_REF located
  hereim <- grep("ID_REF", rownames(rawword))
  if (length(hereim) == 0L) {
    stop("No '!Sample_geo_accession' (ID_REF) row found in '", alz, "'",
         call. = FALSE)
  }

  ##Subject Names GSM...
  subjnam <- rawword[hereim, ]

  ##Getting the names for the rows
  namedarows <- rownames(rawword)[-hereim] %>%
    as.data.frame()
  RAWWORD <- rawword[-hereim, ] %>%
    as.data.frame() %>%
    bind_cols(namedarows, .)
  naroww <- as.data.frame(rep(0, dim(RAWWORD)[1]), stringsAsFactors = FALSE)
  for (z in seq_len(dim(RAWWORD)[1])) {
    naroww[z, 1] <- as.integer(sum(is.na(RAWWORD[z, ])))
  }
  colnames(naroww) <- "ROW_NAs"
  RAWWORD <- bind_cols(RAWWORD, naroww)

  roALZna <- t(ALZDAT) %>%
    rownames(.) %>%
    as.data.frame(.)
  colnames(roALZna) <- "ID_REF"

  RAWDAT <- t(ALZDAT) %>%
    as.data.frame(.)
  colnames(RAWDAT) <- NULL
  rownames(RAWDAT) <- NULL

  RAWDAT2 <- RAWDAT %>%
    cbind(roALZna, .) %>%
    dplyr::arrange(., ID_REF)

  ##Editing the file for R processing
  RAWDATID <- RAWDAT2[, 1] %>%
    as.matrix(.)

  RAWDATNUM <- RAWDAT2[, -1] %>%
    mapply(., FUN = as.numeric) %>%
    t(.)

  ##Consolidating genes with the same name
  ###create empty matrix of size equal to tabRDATID
  tabRDATID <- table(RAWDATID)
  NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
  for (j in seq_along(tabRDATID)) {
    same_gene <- which(RAWDATID == rownames(tabRDATID)[j])
    ##Putting the ones without duplicates in their new homes
    if (tabRDATID[j] == 1) {
      NuRDATN[, j] <- RAWDATNUM[, same_gene]
    }
    ##Averaging duplicates and putting them in their new homes
    if (tabRDATID[j] > 1) {
      # drop = FALSE keeps a matrix even when there is a single subject row
      NuRDATN[, j] <- rowMeans(RAWDATNUM[, same_gene, drop = FALSE], na.rm = TRUE)
    }
  }

  ##Scaling the Data
  scrawdat <- scale(NuRDATN)
  attr(scrawdat, "scaled:center") <- NULL
  attr(scrawdat, "scaled:scale") <- NULL
  colnames(scrawdat) <- rownames(tabRDATID)

  ##Discretized the Data
  dialzdat <- scrawdat %>%
    dndat(.) %>%
    t() %>%
    as.data.frame(.)
  colnames(dialzdat) <- rownames(RAWDATNUM)

  ##setting "ID_REF" as a new variable
  geneNAM <- as.data.frame(as.matrix(rownames(dialzdat), ncol = 1))
  colnames(geneNAM) <- "ID_REF"
  rownames(dialzdat) <- NULL
  dialzdat <- bind_cols(geneNAM, dialzdat)

  ##NAs in a column
  nacol <- as.data.frame(t(rep(0, dim(dialzdat)[2])), stringsAsFactors = FALSE)
  nacol[1, 1] <- "COL_NAs"
  for (x in 2:dim(dialzdat)[2]) {
    nacol[1, x] <- as.integer(sum(is.na(dialzdat[, x])))
  }
  colnames(nacol) <- colnames(dialzdat)
  dialzdat <- bind_rows(dialzdat, nacol)

  ##NAs in a row
  narowd <- as.data.frame(rep(0, dim(dialzdat)[1]), stringsAsFactors = FALSE)
  for (y in seq_len(dim(dialzdat)[1])) {
    narowd[y, 1] <- as.integer(sum(is.na(dialzdat[y, ])))
  }
  colnames(narowd) <- "ROW_NAs"
  dialzdat <- bind_cols(dialzdat, narowd)
  colnames(dialzdat)[2:(dim(dialzdat)[2] - 1)] <- subjnam
  colnames(RAWWORD) <- colnames(dialzdat)

  ##converting to character so that the clinical can be brought together with discrete data
  # Every column except the trailing ROW_NAs is converted (the ID_REF column
  # included), which is what the former `2:dim(dialzdat)[2]-1` evaluated to.
  for (k in seq_len(dim(dialzdat)[2] - 1)) {
    dialzdat[[k]] <- as.character(dialzdat[[k]])
  }

  #The End the full data
  Dscrtalzdw <- bind_rows(RAWWORD, dialzdat)

  #Produces Discrete file
  nfnaex2 <- paste0("GSE", gse_digits, "dscrt.txt")
  write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",
              col.names = TRUE, row.names = FALSE)

  invisible(c(nfnaex, nfnaex2))
}

#MajorFunction#This is the function that does everything else
#
# Interactive entry point. Looks in the current working directory for files
# named "GSE....txt.gz", asks whether to clean all of them or a selection,
# and runs the cleaning pipeline on each chosen file. The matching GPL
# annotation file must be in the same directory. Output files are written to
# the working directory. Takes no arguments; returns NULL invisibly.
THEFT <- function() {
  wd <- getwd()
  dir_files <- list.files()
  gse_files <- dir_files[grep("^GSE.+\\.txt\\.gz$", dir_files)]

  answer <- menu(
    choices = c("Yes", "No"),
    title = gsub("wd", wd, "Do you want to clean all data files in the directory wd?",
                 fixed = TRUE)
  )

  # menu() returns 0 when the user exits without choosing
  if (answer == 0L) {
    cat("Nothing done\n")
    return(invisible(NULL))
  }

  if (answer == 1L) {
    #ALL DATA FILES WILL BE CLEANED
    to_clean <- gse_files
    if (length(to_clean) == 0L) {
      warning("No GSE*.txt.gz files were found in ", wd,
              " and so no cleaning will be performed")
      return(invisible(NULL))
    }
  } else {
    #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
    to_clean <- select.list(choices = gse_files, multiple = TRUE,
                            title = "Choose the file/files you want to analyze:")
    if (length(to_clean) == 0L) {
      #Spit out a warning
      warning("You did not select any files and so no cleaning will be performed")
      return(invisible(NULL))
    }
  }

  for (alz in to_clean) {
    # Re-signal failures with the name of the file being processed
    tryCatch(
      .theft_clean_file(alz),
      error = function(e) {
        stop("Failed to clean '", alz, "': ", conditionMessage(e), call. = FALSE)
      }
    )
  }
  invisible(NULL)
}

#The Rest of this code will be used every time you want to change a data set
# THEFT() relies on menu(), which only works in an interactive session.
if (interactive()) {
  THEFT()
} else {
  message("THEFT() needs an interactive R session; run THEFT() from the console.")
}
| 0 | 753 | \ No newline at end of file |