Commit adfed316993d072e749a8cd85434fc667c054f22

Authored by Efrain Gonzalez
1 parent 689231363c
Exists in master

An automated version of the RCleanDscret.R

Working on outputting more insightful errors and warnings. (UNTESTED)
Showing 1 changed file with 752 additions and 0 deletions   Show diff stats
... ... @@ -0,0 +1,752 @@
  1 +#Efrain H. Gonzalez
  2 +#6/19/2017
  3 +#Libraries required to run the code
  4 +library(pryr)
  5 +library(MASS)
  6 +library(dplyr)
  7 +library(tidyr)
  8 +library(readr)
  9 +library(stringr)
  10 +
  11 +
  12 +#Necessary Functions
  13 +#1#Function for handling the changing of row names and column names
  14 +chngrownm <- function(mat){
  15 + row <- dim(mat)[1]
  16 + col <- dim(mat)[2]
  17 + j <- 1
  18 + x <- 1
  19 + p <- 1
  20 + a <- 1
  21 + b <- 1
  22 + g <- 1
  23 + for(j in 1:col){
  24 + if("!Sample_source_name_ch1"==mat[1,j]){
  25 + colnames(mat)[j] <- "Brain_Region"
  26 + }
  27 + if("!Sample_title" == mat[1,j]){
  28 + colnames(mat)[j] <- "Title"
  29 + }
  30 + if("!Sample_geo_accession" == mat[1,j]){
  31 + colnames(mat)[j] <- "ID_REF"
  32 + } else{
  33 + if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){
  34 + colnames(mat)[j] <- paste0("Sex",x)
  35 + x = x + 1
  36 + }
  37 + if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){
  38 + colnames(mat)[j] <- paste0("PMI",p)
  39 + p = p + 1
  40 + }
  41 + if(grepl("age|Age|AGE",mat[2,j])==TRUE){
  42 + colnames(mat)[j] <- paste0("Age",a)
  43 + a = a + 1
  44 + }
  45 + if(grepl("braak|b&b",mat[2,j])==TRUE){
  46 + colnames(mat)[j] <- paste0("Braak",b)
  47 + b = b + 1
  48 + }
  49 + if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,j])==TRUE){
  50 + colnames(mat)[j] <- paste0("Group",g)
  51 + g = g + 1
  52 + }
  53 +
  54 + }
  55 + j = j + 1
  56 + }
  57 + mat
  58 +}
  59 +
  60 +#2#Function for reorganizing information within the columns
  61 +cinfo <- function(mat){
  62 + col <- dim(mat)[2]
  63 + j <-2
  64 + for(j in 2:col){
  65 + if(grepl("Group",colnames(mat)[j]) == TRUE){
  66 + mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
  67 + }
  68 + if(grepl("Age",colnames(mat)[j])==TRUE){
  69 + mat[,j] <- gsub("\\D","",mat[,j])%>%
  70 + as.integer()
  71 + }
  72 + if(grepl("Sex",colnames(mat)[j])==TRUE){
  73 + mat[,j] <- gsub(".+:\\s","",mat[,j])
  74 + }
  75 + if(grepl("PMI",colnames(mat)[j])==TRUE){
  76 + mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
  77 + as.numeric()
  78 + }
  79 + if(grepl("Braak",colnames(mat)[j])==TRUE){
  80 + mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
  81 + as.roman()%>%
  82 + as.integer()
  83 + }
  84 + j=j+1
  85 + }
  86 + mat
  87 +}
  88 +
  89 +#3#Function for labeling the gene IDs without names
  90 +NAFIXING <- function(GIDNAM){
  91 + row <- dim(GIDNAM)[1]
  92 + i <- 1
  93 + for(i in 1:row){
  94 + if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){
  95 + GIDNAM[i,2] <- GIDNAM[i,1]
  96 + }
  97 + i <- i + 1
  98 + }
  99 + GIDNAM
  100 +}
  101 +
  102 +#4#Function for changing the gene ID to gene name
  103 +cgeneID <- function(GeneName,DATA){
  104 + colGene <- dim(GeneName)[2]
  105 + j <- 1
  106 + for(j in 1:colGene){
  107 + chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
  108 + if(is.na(sum(chngsreq))==FALSE){
  109 + if(sum(chngsreq) > 0){
  110 + DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
  111 + }
  112 + }
  113 + j = j+1
  114 + }
  115 + DATA
  116 +}
  117 +
  118 +#5#Function for adjusting the gene names
  119 +gcnames <- function(DiData,usecol=1){
  120 + nuruns <- dim(DiData)[2]
  121 + i = 1
  122 + nwnam <- rep("0",length.out=nuruns)
  123 + for(i in 1:nuruns){
  124 + if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){
  125 + nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][usecol])
  126 + } else{
  127 + nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][1])
  128 + }
  129 +
  130 + }
  131 + nwnam
  132 +
  133 +}
  134 +
  135 +#6# Function for discretizing the data
  136 +dndat <- function(NDATA){
  137 + rownd <- dim(NDATA)[1]
  138 + colnd <- dim(NDATA)[2]
  139 + DDATA <- matrix(0,nrow=rownd,ncol=colnd)
  140 + colnames(DDATA) <- colnames(NDATA)
  141 + i <- 1
  142 + for(i in 1:rownd){
  143 + j <- 1
  144 + for(j in 1:colnd){
  145 + if(is.na(NDATA[i,j])==FALSE){
  146 +
  147 + if(NDATA[i,j] < -1){
  148 + DDATA[i,j]=0L
  149 + }
  150 + if(NDATA[i,j] > 1){
  151 + DDATA[i,j]=2L
  152 + }
  153 + if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
  154 + DDATA[i,j]=1L
  155 + }
  156 + } else{
  157 + DDATA[i,j] = NDATA[i,j]
  158 + }
  159 + j = j + 1
  160 + }
  161 + i = i + 1
  162 + }
  163 + DDATA
  164 +}
  165 +
  166 +
  167 +#MajorFunction#This is the function that does everything else
  168 +THEFT <- function(){
  169 + #Set working directory based on the directory of the series matrix file Currently only works for windows
  170 + wd <- getwd()
  171 + #list.files()
  172 + #gsub("wd",wd,"Do you want to clean all data files in the directory wd?")
  173 + numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
  174 + GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())
  175 +
  176 + #ALL DATA FILES WILL BE CLEANED
  177 + if(numDAT == 1){
  178 + #indexing the data files
  179 + n <- 1
  180 + for(n in 1: length(GSEfileloc)){
  181 + alz <- list.files()[GSEfileloc[n]]
  182 +
  183 + #Working with the wordy part of the document
  184 + alzword <- alz %>%
  185 + read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
  186 + filter(grepl("!Sample",X1))%>%
  187 + filter(!grepl("!Sample_contact",X1))
  188 +
  189 + #Getting the GPL file
  190 + genena <- grep("_platform_id",alzword$X1) %>%
  191 + alzword$X2[.] %>%
  192 + str_trim(.) %>%
  193 + paste0("^",.) %>%
  194 + grep(.,list.files()) %>%
  195 + list.files()[.]
  196 +
  197 + #Find out if it is a soft GPL file or not
  198 + soft <- strsplit(genena,"[\\|/]") %>%
  199 + .[[1]] %>%
  200 + .[length(.)] %>%
  201 + grepl("soft",.)
  202 +
  203 + ##Changing row names and column names:
  204 + ALZWORD <- t(alzword)
  205 + rownames(ALZWORD)=NULL
  206 + colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
  207 + ALZWORD <- chngrownm(ALZWORD)[-1,]
  208 + ALZWORD <- ALZWORD%>%
  209 + as.data.frame()%>%
  210 + dplyr::select(-starts_with("col"))
  211 +
  212 + ##Reorganizing information within the columns and final clinical data
  213 + ALZWORDF <- cinfo(ALZWORD)
  214 +
  215 +
  216 + #Working with Actual Data part of file
  217 + alzdat <- alz %>%
  218 + read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
  219 + ALZDAT <- t(alzdat[,-1])
  220 + rownames(ALZDAT)=NULL
  221 +
  222 + ##Is there a clean version of the GPL file available?
  223 + gplnum <- strsplit(genena,"[\\|/]") %>%
  224 + .[[1]] %>%
  225 + .[length(.)] %>%
  226 + gsub("\\D","",.)
  227 + clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
  228 + if(clfileex >= 1){
  229 + #use the clean version
  230 + geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
  231 + read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
  232 +
  233 + }
  234 + if(clfileex == 0){
  235 + ##Lets Create a clean version
  236 +
  237 + ##Gene ID to Gene Name
  238 + if(soft == TRUE){
  239 + #Check to see if there is already a file containing information on soft files
  240 + fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
  241 + if(fileex == 1){
  242 + #Check to see if this GPL soft file has been used before
  243 + IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
  244 + .$GPL_FILE_NUM%>%
  245 + grepl(gplnum,.) %>%
  246 + sum()
  247 + if(IDF == 1){
  248 + IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
  249 + .$GPL_FILE_NUM%>%
  250 + grep(gplnum,.)
  251 + idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
  252 + .$LOC_ID %>%
  253 + .[IDLOCAL]
  254 + geneIDNam <- genena %>%
  255 + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
  256 + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
  257 + }
  258 + if(IDF == 0){
  259 + #No information on this particular GPL file
  260 + idLOCGPL <- genena %>%
  261 + read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
  262 + t(.) %>%
  263 + grep("^ID\\s*$",.) %>%
  264 + -1
  265 + cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
  266 + cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
  267 + geneIDNam <- genena %>%
  268 + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
  269 + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
  270 + }
  271 + }
  272 + if(fileex == 0){
  273 + #We must create a file that we can access for later use
  274 + idLOCGPL <- genena %>%
  275 + read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
  276 + t(.) %>%
  277 + grep("^ID\\s*$",.) %>%
  278 + -1
  279 + Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
  280 + colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
  281 + write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
  282 + geneIDNam <- genena %>%
  283 + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
  284 + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
  285 + }
  286 + }
  287 + if(soft == FALSE){
  288 + geneIDNam <- genena %>%
  289 + read_delim(delim="\t",comment = "#")%>%
  290 + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
  291 + }
  292 +
  293 + ##Labeling the gene IDs without names
  294 + geneIDNam <- NAFIXING(geneIDNam)
  295 +
  296 + ##remove the whitespace
  297 + geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
  298 +
  299 + ##Here is the clean version
  300 + write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
  301 + }
  302 +
  303 +
  304 +
  305 + ##Changing the gene ID to gene name
  306 + ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
  307 + colnames(ALZDAT) = ALZDAT1[1,]
  308 +
  309 +
  310 + ##Adjusting the column names aka the gene names
  311 + colnames(ALZDAT) <- gcnames(ALZDAT)
  312 +
  313 +
  314 + #Full RAW Data
  315 + Fullalzdwr <- ALZDAT %>%
  316 + as.data.frame() %>%
  317 + cbind(ALZWORDF,.)
  318 +
  319 + #Raw file is output
  320 + nfnaex <- strsplit(alz,"[\\]") %>%
  321 + .[[1]] %>%
  322 + .[length(.)] %>%
  323 + gsub("\\D","",.) %>%
  324 + c("GSE",.,"aftexcel.txt") %>%
  325 + paste(collapse = "")
  326 + write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
  327 +
  328 +
  329 +
  330 + #Now for the discretization part
  331 + ##get the wordy part again
  332 + rawword <- t(ALZWORDF)
  333 +
  334 + ##where is ID_REF located
  335 + hereim <- grep("ID_REF",rownames(rawword))
  336 +
  337 + ##Subject Names GSM...
  338 + subjnam <- rawword[hereim,]
  339 +
  340 + ##Getting the names for the rows
  341 + namedarows <- rownames(rawword)[-hereim] %>%
  342 + as.data.frame()
  343 + RAWWORD <- rawword[-hereim,] %>%
  344 + as.data.frame() %>%
  345 + bind_cols(namedarows,.)
  346 + z <- 1
  347 + naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
  348 + for(z in 1:dim(RAWWORD)[1]){
  349 + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
  350 + z <- z + 1
  351 + }
  352 +
  353 + colnames(naroww) <- "ROW_NAs"
  354 + RAWWORD <- bind_cols(RAWWORD,naroww)
  355 +
  356 +
  357 + roALZna <- t(ALZDAT) %>%
  358 + rownames(.) %>%
  359 + as.data.frame(.)
  360 + colnames(roALZna) <- "ID_REF"
  361 +
  362 + RAWDAT <- t(ALZDAT) %>%
  363 + as.data.frame(.)
  364 + colnames(RAWDAT) <- NULL
  365 + rownames(RAWDAT) <- NULL
  366 +
  367 + RAWDAT2 <- RAWDAT %>%
  368 + cbind(roALZna,.) %>%
  369 + dplyr::arrange(.,ID_REF)
  370 +
  371 + ##Editing the file for R processing
  372 + RAWDATID <- RAWDAT2[,1] %>%
  373 + as.matrix(.)
  374 +
  375 + RAWDATNUM <- RAWDAT2[,-1] %>%
  376 + mapply(.,FUN = as.numeric) %>%
  377 + t(.)
  378 +
  379 + ##Consolidating genes with the same name
  380 + ###create empty matrix of size equal to tabRDATID
  381 + tabRDATID <- table(RAWDATID)
  382 + NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
  383 + j <- 1
  384 + for(j in 1:length(tabRDATID)){
  385 + ##Putting the ones without duplicates in their new homes
  386 + if(tabRDATID[j] == 1){
  387 + NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
  388 + }
  389 + ##Averaging duplicates and putting them in their new homes
  390 + if(tabRDATID[j] > 1){
  391 + NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
  392 + }
  393 + j <- j + 1
  394 + }
  395 +
  396 + ##Scaling the Data
  397 + scrawdat <- NuRDATN%>%
  398 + scale()
  399 + attr(scrawdat,"scaled:center") <- NULL
  400 + attr(scrawdat,"scaled:scale") <- NULL
  401 + colnames(scrawdat) <- rownames(tabRDATID)
  402 +
  403 + ##Discretized the Data
  404 + dialzdat <- scrawdat %>%
  405 + dndat(.) %>%
  406 + t()%>%
  407 + as.data.frame(.)
  408 + colnames(dialzdat) <- rownames(RAWDATNUM)
  409 +
  410 + ##setting "ID_REF" as a new variable
  411 + geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
  412 + colnames(geneNAM) <- "ID_REF"
  413 + rownames(dialzdat) <- NULL
  414 + dialzdat <-bind_cols(geneNAM,dialzdat)
  415 +
  416 + ##NAs in a column
  417 + x <- 2
  418 + nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
  419 + nacol[1,1] = "COL_NAs"
  420 + for(x in 2:dim(dialzdat)[2]){
  421 + nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
  422 + x <- x + 1
  423 + }
  424 + colnames(nacol) <- colnames(dialzdat)
  425 + dialzdat <- bind_rows(dialzdat,nacol)
  426 +
  427 + ##NAs in a row
  428 + y <- 1
  429 + narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
  430 + for(y in 1:dim(dialzdat)[1]){
  431 + narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
  432 + y <- y + 1
  433 + }
  434 + colnames(narowd) <- "ROW_NAs"
  435 + dialzdat <- bind_cols(dialzdat,narowd)
  436 + colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
  437 + colnames(RAWWORD) <- colnames(dialzdat)
  438 + ##converting to character so that the clinical can be brought together with discrete data
  439 + k <- 2
  440 + for(k in 2:dim(dialzdat)[2]-1){
  441 + dialzdat[,k] <- as.character(dialzdat[,k])
  442 + k <- k + 1
  443 + }
  444 + #The End the full data
  445 + Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
  446 +
  447 + #Produces Discrete file
  448 + nfnaex2 <- strsplit(alz,"[\\|/]") %>%
  449 + .[[1]] %>%
  450 + .[length(.)] %>%
  451 + gsub("\\D","",.) %>%
  452 + c("GSE",.,"dscrt.txt") %>%
  453 + paste(collapse = "")
  454 + write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
  455 + n <- n +1
  456 + }
  457 + }
  458 +
  459 + #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
  460 + if(numDAT == 2){
  461 + #All the files you want to analyze
  462 + ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
  463 + if(length(ANDIS) == 0){
  464 + #Spit out a warning
  465 + warning("You did not select any files and so no cleaning will be performed")
  466 + } else{
  467 + #indexing the data files
  468 + n <- 1
  469 + for(n in 1: length(ANDIS)){
  470 + alz <- ANDIS[n]
  471 +
  472 + #Working with the wordy part of the document
  473 + alzword <- alz %>%
  474 + read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
  475 + filter(grepl("!Sample",X1))%>%
  476 + filter(!grepl("!Sample_contact",X1))
  477 +
  478 + #Getting the GPL file
  479 + genena <- grep("_platform_id",alzword$X1) %>%
  480 + alzword$X2[.] %>%
  481 + str_trim(.) %>%
  482 + paste0("^",.) %>%
  483 + grep(.,list.files()) %>%
  484 + list.files()[.]
  485 +
  486 + #Find out if it is a soft GPL file or not
  487 + soft <- strsplit(genena,"[\\|/]") %>%
  488 + .[[1]] %>%
  489 + .[length(.)] %>%
  490 + grepl("soft",.)
  491 +
  492 + ##Changing row names and column names:
  493 + ALZWORD <- t(alzword)
  494 + rownames(ALZWORD)=NULL
  495 + colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
  496 + ALZWORD <- chngrownm(ALZWORD)[-1,]
  497 + ALZWORD <- ALZWORD%>%
  498 + as.data.frame()%>%
  499 + dplyr::select(-starts_with("col"))
  500 +
  501 + ##Reorganizing information within the columns and final clinical data
  502 + ALZWORDF <- cinfo(ALZWORD)
  503 +
  504 +
  505 + #Working with Actual Data part of file
  506 + alzdat <- alz %>%
  507 + read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
  508 + ALZDAT <- t(alzdat[,-1])
  509 + rownames(ALZDAT)=NULL
  510 +
  511 + ##Is there a clean version of the GPL file available?
  512 + gplnum <- strsplit(genena,"[\\|/]") %>%
  513 + .[[1]] %>%
  514 + .[length(.)] %>%
  515 + gsub("\\D","",.)
  516 + clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
  517 + if(clfileex >= 1){
  518 + #use the clean version
  519 + geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
  520 + read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
  521 +
  522 + }
  523 + if(clfileex == 0){
  524 + ##Lets Create a clean version
  525 +
  526 + ##Gene ID to Gene Name
  527 + if(soft == TRUE){
  528 + #Check to see if there is already a file containing information on soft files
  529 + fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
  530 + if(fileex == 1){
  531 + #Check to see if this GPL soft file has been used before
  532 + IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
  533 + .$GPL_FILE_NUM%>%
  534 + grepl(gplnum,.) %>%
  535 + sum()
  536 + if(IDF == 1){
  537 + IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
  538 + .$GPL_FILE_NUM%>%
  539 + grep(gplnum,.)
  540 + idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
  541 + .$LOC_ID %>%
  542 + .[IDLOCAL]
  543 + geneIDNam <- genena %>%
  544 + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
  545 + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
  546 + }
  547 + if(IDF == 0){
  548 + #No information on this particular GPL file
  549 + idLOCGPL <- genena %>%
  550 + read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
  551 + t(.) %>%
  552 + grep("^ID\\s*$",.) %>%
  553 + -1
  554 + cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
  555 + cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
  556 + geneIDNam <- genena %>%
  557 + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
  558 + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
  559 + }
  560 + }
  561 + if(fileex == 0){
  562 + #We must create a file that we can access for later use
  563 + idLOCGPL <- genena %>%
  564 + read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
  565 + t(.) %>%
  566 + grep("^ID\\s*$",.) %>%
  567 + -1
  568 + Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
  569 + colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
  570 + write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
  571 + geneIDNam <- genena %>%
  572 + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
  573 + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
  574 + }
  575 + }
  576 + if(soft == FALSE){
  577 + geneIDNam <- genena %>%
  578 + read_delim(delim="\t",comment = "#")%>%
  579 + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
  580 + }
  581 +
  582 + ##Labeling the gene IDs without names
  583 + geneIDNam <- NAFIXING(geneIDNam)
  584 +
  585 + ##remove the whitespace
  586 + geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
  587 +
  588 + ##Here is the clean version
  589 + write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
  590 + }
  591 +
  592 +
  593 +
  594 + ##Changing the gene ID to gene name
  595 + ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
  596 + colnames(ALZDAT) = ALZDAT1[1,]
  597 +
  598 +
  599 + ##Adjusting the column names aka the gene names
  600 + colnames(ALZDAT) <- gcnames(ALZDAT)
  601 +
  602 +
  603 + #Full RAW Data
  604 + Fullalzdwr <- ALZDAT %>%
  605 + as.data.frame() %>%
  606 + cbind(ALZWORDF,.)
  607 +
  608 + #Raw file is output
  609 + nfnaex <- strsplit(alz,"[\\]") %>%
  610 + .[[1]] %>%
  611 + .[length(.)] %>%
  612 + gsub("\\D","",.) %>%
  613 + c("GSE",.,"aftexcel.txt") %>%
  614 + paste(collapse = "")
  615 + write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
  616 +
  617 +
  618 +
  619 + #Now for the discretization part
  620 + ##get the wordy part again
  621 + rawword <- t(ALZWORDF)
  622 +
  623 + ##where is ID_REF located
  624 + hereim <- grep("ID_REF",rownames(rawword))
  625 +
  626 + ##Subject Names GSM...
  627 + subjnam <- rawword[hereim,]
  628 +
  629 + ##Getting the names for the rows
  630 + namedarows <- rownames(rawword)[-hereim] %>%
  631 + as.data.frame()
  632 + RAWWORD <- rawword[-hereim,] %>%
  633 + as.data.frame() %>%
  634 + bind_cols(namedarows,.)
  635 + z <- 1
  636 + naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
  637 + for(z in 1:dim(RAWWORD)[1]){
  638 + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
  639 + z <- z + 1
  640 + }
  641 +
  642 + colnames(naroww) <- "ROW_NAs"
  643 + RAWWORD <- bind_cols(RAWWORD,naroww)
  644 +
  645 +
  646 + roALZna <- t(ALZDAT) %>%
  647 + rownames(.) %>%
  648 + as.data.frame(.)
  649 + colnames(roALZna) <- "ID_REF"
  650 +
  651 + RAWDAT <- t(ALZDAT) %>%
  652 + as.data.frame(.)
  653 + colnames(RAWDAT) <- NULL
  654 + rownames(RAWDAT) <- NULL
  655 +
  656 + RAWDAT2 <- RAWDAT %>%
  657 + cbind(roALZna,.) %>%
  658 + dplyr::arrange(.,ID_REF)
  659 +
  660 + ##Editing the file for R processing
  661 + RAWDATID <- RAWDAT2[,1] %>%
  662 + as.matrix(.)
  663 +
  664 + RAWDATNUM <- RAWDAT2[,-1] %>%
  665 + mapply(.,FUN = as.numeric) %>%
  666 + t(.)
  667 +
  668 + ##Consolidating genes with the same name
  669 + ###create empty matrix of size equal to tabRDATID
  670 + tabRDATID <- table(RAWDATID)
  671 + NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
  672 + j <- 1
  673 + for(j in 1:length(tabRDATID)){
  674 + ##Putting the ones without duplicates in their new homes
  675 + if(tabRDATID[j] == 1){
  676 + NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
  677 + }
  678 + ##Averaging duplicates and putting them in their new homes
  679 + if(tabRDATID[j] > 1){
  680 + NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
  681 + }
  682 + j <- j + 1
  683 + }
  684 +
  685 + ##Scaling the Data
  686 + scrawdat <- NuRDATN%>%
  687 + scale()
  688 + attr(scrawdat,"scaled:center") <- NULL
  689 + attr(scrawdat,"scaled:scale") <- NULL
  690 + colnames(scrawdat) <- rownames(tabRDATID)
  691 +
  692 + ##Discretized the Data
  693 + dialzdat <- scrawdat %>%
  694 + dndat(.) %>%
  695 + t()%>%
  696 + as.data.frame(.)
  697 + colnames(dialzdat) <- rownames(RAWDATNUM)
  698 +
  699 + ##setting "ID_REF" as a new variable
  700 + geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
  701 + colnames(geneNAM) <- "ID_REF"
  702 + rownames(dialzdat) <- NULL
  703 + dialzdat <-bind_cols(geneNAM,dialzdat)
  704 +
  705 + ##NAs in a column
  706 + x <- 2
  707 + nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
  708 + nacol[1,1] = "COL_NAs"
  709 + for(x in 2:dim(dialzdat)[2]){
  710 + nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
  711 + x <- x + 1
  712 + }
  713 + colnames(nacol) <- colnames(dialzdat)
  714 + dialzdat <- bind_rows(dialzdat,nacol)
  715 +
  716 + ##NAs in a row
  717 + y <- 1
  718 + narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
  719 + for(y in 1:dim(dialzdat)[1]){
  720 + narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
  721 + y <- y + 1
  722 + }
  723 + colnames(narowd) <- "ROW_NAs"
  724 + dialzdat <- bind_cols(dialzdat,narowd)
  725 + colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
  726 + colnames(RAWWORD) <- colnames(dialzdat)
  727 + ##converting to character so that the clinical can be brought together with discrete data
  728 + k <- 2
  729 + for(k in 2:dim(dialzdat)[2]-1){
  730 + dialzdat[,k] <- as.character(dialzdat[,k])
  731 + k <- k + 1
  732 + }
  733 + #The End the full data
  734 + Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
  735 +
  736 + #Produces Discrete file
  737 + nfnaex2 <- strsplit(alz,"[\\|/]") %>%
  738 + .[[1]] %>%
  739 + .[length(.)] %>%
  740 + gsub("\\D","",.) %>%
  741 + c("GSE",.,"dscrt.txt") %>%
  742 + paste(collapse = "")
  743 + write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
  744 +
  745 +
  746 + n <- n + 1
  747 + }
  748 + }
  749 + }
  750 +}
  751 +#The Rest of this code will be used every time you want to change a data set
  752 +THEFT()
0 753 \ No newline at end of file