Commit 0eb3420561162679cbaa6e877e4bb7621c5d694b

Authored by Efrain Gonzalez
1 parent 2167ed7633
Exists in master

This code combines the cleaning and discretizing processes. (UNTESTED)

Two files are output one with raw data and the other with discretized data.
Showing 1 changed file with 445 additions and 0 deletions   Show diff stats
... ... @@ -0,0 +1,445 @@
  1 +#Libraries required to run the code
  2 +library(pryr)
  3 +library(MASS)
  4 +library(dplyr)
  5 +library(tidyr)
  6 +library(readr)
  7 +library(stringr)
  8 +
  9 +
  10 +#Necessary Functions
  11 +#1#Function for handling the changing of row names and column names
  12 +chngrownm <- function(mat){
  13 + row <- dim(mat)[1]
  14 + col <- dim(mat)[2]
  15 + j <- 1
  16 + x <- 1
  17 + p <- 1
  18 + a <- 1
  19 + b <- 1
  20 + g <- 1
  21 + for(j in 1:col){
  22 + if("!Sample_source_name_ch1"==mat[1,j]){
  23 + colnames(mat)[j] <- "Brain_Region"
  24 + }
  25 + if("!Sample_title" == mat[1,j]){
  26 + colnames(mat)[j] <- "Title"
  27 + }
  28 + if("!Sample_geo_accession" == mat[1,j]){
  29 + colnames(mat)[j] <- "ID_REF"
  30 + } else{
  31 + if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){
  32 + colnames(mat)[j] <- paste0("Sex",x)
  33 + x = x + 1
  34 + }
  35 + if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){
  36 + colnames(mat)[j] <- paste0("PMI",p)
  37 + p = p + 1
  38 + }
  39 + if(grepl("age|Age|AGE",mat[2,j])==TRUE){
  40 + colnames(mat)[j] <- paste0("Age",a)
  41 + a = a + 1
  42 + }
  43 + if(grepl("braak|b&b",mat[2,j])==TRUE){
  44 + colnames(mat)[j] <- paste0("Braak",b)
  45 + b = b + 1
  46 + }
  47 + if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",mat[2,j])==TRUE){
  48 + colnames(mat)[j] <- paste0("Group",g)
  49 + g = g + 1
  50 + }
  51 +
  52 + }
  53 + j = j + 1
  54 + }
  55 + mat
  56 +}
  57 +
  58 +#2#Function for reorganizing information within the columns
  59 +cinfo <- function(mat){
  60 + col <- dim(mat)[2]
  61 + j <-2
  62 + for(j in 2:col){
  63 + if(grepl("Group",colnames(mat)[j]) == TRUE){
  64 + mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
  65 + }
  66 + if(grepl("Age",colnames(mat)[j])==TRUE){
  67 + mat[,j] <- gsub("\\D","",mat[,j])%>%
  68 + as.integer()
  69 + }
  70 + if(grepl("Sex",colnames(mat)[j])==TRUE){
  71 + mat[,j] <- gsub(".+:\\s","",mat[,j])
  72 + }
  73 + if(grepl("PMI",colnames(mat)[j])==TRUE){
  74 + mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
  75 + as.numeric()
  76 + }
  77 + if(grepl("Braak",colnames(mat)[j])==TRUE){
  78 + mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
  79 + as.roman()%>%
  80 + as.integer()
  81 + }
  82 + j=j+1
  83 + }
  84 + mat
  85 +}
  86 +
  87 +#3#Function for labeling the gene IDs without names
  88 +NAFIXING <- function(GIDNAM){
  89 + row <- dim(GIDNAM)[1]
  90 + i <- 1
  91 + for(i in 1:row){
  92 + if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){
  93 + GIDNAM[i,2] <- GIDNAM[i,1]
  94 + }
  95 + i <- i + 1
  96 + }
  97 + GIDNAM
  98 +}
  99 +
  100 +#4#Function for changing the gene ID to gene name
  101 +cgeneID <- function(GeneName,DATA){
  102 + colGene <- dim(GeneName)[2]
  103 + j <- 1
  104 + for(j in 1:colGene){
  105 + chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
  106 + if(is.na(sum(chngsreq))==FALSE){
  107 + if(sum(chngsreq) > 0){
  108 + DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
  109 + }
  110 + }
  111 + #if(sum(chngsreq) > 0){
  112 + ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
  113 + #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
  114 + #}
  115 + j = j+1
  116 + }
  117 + DATA
  118 +}
  119 +
  120 +#5#Function for adjusting the gene names
  121 +gcnames <- function(DiData,usecol=1){
  122 + nuruns <- dim(DiData)[2]
  123 + i = 1
  124 + nwnam <- rep("0",length.out=nuruns)
  125 + for(i in 1:nuruns){
  126 + if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){
  127 + nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][usecol])
  128 + } else{
  129 + nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][1])
  130 + }
  131 +
  132 + }
  133 + nwnam
  134 +
  135 +}
  136 +
  137 +#6# Function for discretizing the data
  138 +dndat <- function(NDATA){
  139 + rownd <- dim(NDATA)[1]
  140 + colnd <- dim(NDATA)[2]
  141 + DDATA <- matrix(0,nrow=rownd,ncol=colnd)
  142 + colnames(DDATA) <- colnames(NDATA)
  143 + i <- 1
  144 + for(i in 1:rownd){
  145 + j <- 1
  146 + for(j in 1:colnd){
  147 + if(is.na(NDATA[i,j])==FALSE){
  148 +
  149 + if(NDATA[i,j] < -1){
  150 + DDATA[i,j]=0L
  151 + }
  152 + if(NDATA[i,j] > 1){
  153 + DDATA[i,j]=2L
  154 + }
  155 + if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
  156 + DDATA[i,j]=1L
  157 + }
  158 + } else{
  159 + DDATA[i,j] = NDATA[i,j]
  160 + }
  161 + j = j + 1
  162 + }
  163 + i = i + 1
  164 + }
  165 + DDATA
  166 +}
  167 +
  168 +
  169 +#The Rest of this code will be used every time you want to change a data set
  170 +
  171 +#Getting the series matrix file
  172 +print("Choose the series matrix file that you want to Analyze")
  173 +alz <- file.choose()
  174 +
  175 +#Getting the GPL file
  176 +print("Choose the GPL file that correlates with the above series matrix file")
  177 +genena <- file.choose()
  178 +
  179 +
  180 +#Find out if it is a soft GPL file or not
  181 +soft <- strsplit(genena,"[\\|/]") %>%
  182 + .[[1]] %>%
  183 + .[length(.)] %>%
  184 + grepl("soft|annot",.)
  185 +
  186 +#Working with the wordy part of the document
  187 +alzword <- alz %>%
  188 + read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
  189 + filter(grepl("!Sample",X1))%>%
  190 + filter(!grepl("!Sample_contact",X1))
  191 +
  192 +##Changing row names and column names:
  193 +ALZWORD <- t(alzword)
  194 +rownames(ALZWORD)=NULL
  195 +colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
  196 +ALZWORD <- chngrownm(ALZWORD)[-1,]
  197 +ALZWORD <- ALZWORD%>%
  198 + as.data.frame()%>%
  199 + dplyr::select(-starts_with("col"))
  200 +
  201 +##Reorganizing information within the columns
  202 +ALZWORDF <- cinfo(ALZWORD)
  203 +
  204 +
  205 +#Working with Actual Data part of file
  206 +alzdat <- alz %>%
  207 + read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
  208 +ALZDAT <- t(alzdat[,-1])
  209 +rownames(ALZDAT)=NULL
  210 +
  211 +##Is there a clean version of the GPL file available?
  212 +gplnum <- strsplit(genena,"[\\|/]") %>%
  213 + .[[1]] %>%
  214 + .[length(.)] %>%
  215 + gsub("\\D","",.)
  216 +clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
  217 +if(clfileex >= 1){
  218 +#use the clean version
  219 +geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
  220 + read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
  221 +
  222 +}
  223 +if(clfileex == 0){
  224 +##Lets Create a clean version
  225 +
  226 +##Gene ID to Gene Name
  227 + if(soft == TRUE){
  228 + #Check to see if there is already a file containing information on soft files
  229 + fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
  230 + if(fileex == 1){
  231 + #Check to see if this GPL soft file has been used before
  232 + IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
  233 + .$GPL_FILE_NUM%>%
  234 + grepl(gplnum,.) %>%
  235 + sum()
  236 + if(IDF == 1){
  237 + IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
  238 + .$GPL_FILE_NUM%>%
  239 + grep(gplnum,.)
  240 + idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
  241 + .$LOC_ID %>%
  242 + .[IDLOCAL]
  243 + geneIDNam <- genena %>%
  244 + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
  245 + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
  246 + }
  247 + if(IDF == 0){
  248 + #No information on this particular GPL file
  249 + idLOCGPL <- genena %>%
  250 + read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
  251 + t(.) %>%
  252 + grep("^ID\\s*$",.) %>%
  253 + -1
  254 + cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
  255 + cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
  256 + geneIDNam <- genena %>%
  257 + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
  258 + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
  259 + }
  260 + }
  261 + if(fileex == 0){
  262 + #We must create a file that we can access for later use
  263 + idLOCGPL <- genena %>%
  264 + read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
  265 + t(.) %>%
  266 + grep("^ID\\s*$",.) %>%
  267 + -1
  268 + Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
  269 + colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
  270 + write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
  271 + geneIDNam <- genena %>%
  272 + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
  273 + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
  274 + }
  275 + }
  276 + if(soft == FALSE){
  277 + geneIDNam <- genena %>%
  278 + read_delim(delim="\t",comment = "#")%>%
  279 + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
  280 + }
  281 +
  282 + ##Labeling the gene IDs without names
  283 + geneIDNam <- NAFIXING(geneIDNam)
  284 +
  285 + ##remove the whitespace
  286 + geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
  287 +
  288 + ##Here is the clean version
  289 + write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
  290 +}
  291 +
  292 +
  293 +
  294 +##Changing the gene ID to gene name
  295 +ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
  296 +colnames(ALZDAT) = ALZDAT1[1,]
  297 +
  298 +
  299 +##Adjusting the column names aka the gene names
  300 +colnames(ALZDAT) <- gcnames(ALZDAT)
  301 +
  302 +
  303 +#Full RAW Data
  304 +Fullalzdwr <- ALZDAT %>%
  305 + as.data.frame() %>%
  306 + cbind(ALZWORDF,.)
  307 +
  308 +
  309 +#Raw file is output
  310 +nfnaex <- strsplit(alz,"[\\]") %>%
  311 + .[[1]] %>%
  312 + .[length(.)] %>%
  313 + gsub("\\D","",.) %>%
  314 + c("GSE",.,"aftexcel.txt") %>%
  315 + paste(collapse = "")
  316 +write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
  317 +
  318 +
  319 +#Now for the discretization part
  320 +##get the wordy part again
  321 +rawword <- t(ALZWORDF)
  322 +
  323 +##where is ID_REF located
  324 +hereim <- grep("ID_REF",rawword[,1])
  325 +
  326 +##Subject Names GSM...
  327 +subjnam <- rawword[hereim,]
  328 +
  329 +##Getting the names for the rows
  330 +namedarows <- rownames(rawword)[-hereim] %>%
  331 + as.data.frame()
  332 +RAWWORD <- rawword[-hereim,] %>%
  333 + as.data.frame() %>%
  334 + bind_cols(namedarows,.)
  335 +z <- 1
  336 +naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
  337 +for(z in 1:dim(RAWWORD)[1]){
  338 + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
  339 + z <- z + 1
  340 +}
  341 +
  342 +colnames(naroww) <- "ROW_NAs"
  343 +RAWWORD <- bind_cols(RAWWORD,naroww)
  344 +
  345 +
  346 +roALZna <- t(ALZDAT) %>%
  347 + rownames(.) %>%
  348 + as.data.frame(.)
  349 +colnames(roALZna) <- "ID_REF"
  350 +
  351 +RAWDAT <- t(ALZDAT) %>%
  352 + as.data.frame(.)
  353 +colnames(RAWDAT) <- NULL
  354 +rownames(RAWDAT) <- NULL
  355 +
  356 +RAWDAT2 <- RAWDAT %>%
  357 + cbind(roALZna,.) %>%
  358 + dplyr::arrange(.,ID_REF)
  359 +
  360 +##Editing the file for R processing
  361 +RAWDATID <- RAWDAT2[,1] %>%
  362 + as.matrix(.)
  363 +
  364 +RAWDATNUM <- RAWDAT2[,-1] %>%
  365 + mapply(.,FUN = as.numeric) %>%
  366 + t(.)
  367 +
  368 +##Consolidating genes with the same name
  369 +###create empty matrix of size equal to tabRDATID
  370 +tabRDATID <- table(RAWDATID)
  371 +NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
  372 +j <- 1
  373 +for(j in 1:length(tabRDATID)){
  374 +
  375 + ##Putting the ones without duplicates in their new homes
  376 + if(tabRDATID[j] == 1){
  377 + NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
  378 + }
  379 + ##Averaging duplicates and putting them in their new homes
  380 + if(tabRDATID[j] > 1){
  381 + NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
  382 + }
  383 + j <- j + 1
  384 +}
  385 +
  386 +##Scaling the Data
  387 +scrawdat <- NuRDATN%>%
  388 + scale()
  389 +attr(scrawdat,"scaled:center") <- NULL
  390 +attr(scrawdat,"scaled:scale") <- NULL
  391 +colnames(scrawdat) <- rownames(tabRDATID)
  392 +
  393 +##Discretized the Data
  394 +dialzdat <- scrawdat %>%
  395 + dndat(.) %>%
  396 + t()%>%
  397 + as.data.frame(.)
  398 +colnames(dialzdat) <- rownames(RAWDATNUM)
  399 +
  400 +##setting "ID_REF" as a new variable
  401 +geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
  402 +colnames(geneNAM) <- "ID_REF"
  403 +rownames(dialzdat) <- NULL
  404 +dialzdat <-bind_cols(geneNAM,dialzdat)
  405 +
  406 +##NAs in a column
  407 +x <- 2
  408 +nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
  409 +nacol[1,1] = "COL_NAs"
  410 +for(x in 2:dim(dialzdat)[2]){
  411 + nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
  412 + x <- x + 1
  413 +}
  414 +colnames(nacol) <- colnames(dialzdat)
  415 +dialzdat<-bind_rows(dialzdat,nacol)
  416 +
  417 +##NAs in a row
  418 +y <- 1
  419 +narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
  420 +for(y in 1:dim(dialzdat)[1]){
  421 + narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
  422 + y <- y + 1
  423 +}
  424 +colnames(narowd) <- "ROW_NAs"
  425 +dialzdat <- bind_cols(dialzdat,narowd)
  426 +colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
  427 +colnames(RAWWORD) <- colnames(dialzdat)
  428 +##converting to character so that the clinical can be brought together with discrete data
  429 +k <- 2
  430 +for(k in 2:dim(dialzdat)[2]-1){
  431 + dialzdat[,k] <- as.character(dialzdat[,k])
  432 + k <- k + 1
  433 +}
  434 +#The End the full data
  435 +Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
  436 +
  437 +#Produces Discrete file
  438 +nfnaex <- strsplit(rawdat,"[\\|/]") %>%
  439 + .[[1]] %>%
  440 + .[length(.)] %>%
  441 + gsub("\\D","",.) %>%
  442 + c("GSE",.,"dscrt.txt") %>%
  443 + paste(collapse = "")
  444 +write.table(Dscrtalzdw, file = nfnaex, sep = "\t",col.names = TRUE,row.names = FALSE)
  445 +