# RPostClean.R
#
# Takes the cleaned expression data and discretizes it.
#
# Provenance (from the original git patch that introduced this file):
#   Commit:  788834dd790ca6d024f704be6cb685756857a503
#   Author:  Efrain Gonzalez
#   Date:    Mon, 12 Jun 2017 13:18:49 -0400
#   Subject: This code takes the clean data and discretizes it
#
# Steps performed by this script:
#   1. Ask for a tab-delimited input file and read it (first line skipped).
#   2. Split rows into clinical rows (first column matches Group, Age,
#      Region, PMI, Title, Sex or Braak) and data rows (everything else).
#      The first data row supplies the column names.
#   3. Convert the data rows to numbers, average rows that share an ID_REF,
#      scale each ID_REF across samples, and discretize to 0 / 1 / 2.
#   4. Append NA counts per column (a "COL_NAs" row) and per row (a
#      "ROW_NAs" column), stack the clinical rows on top, and write the
#      result to "GSE<digits from the input file name>dscrt.txt" in the
#      current working directory.

# Required Libraries
library(MASS)
library(dplyr)
library(tidyr)
library(readr)
library(stringr)


# Necessary Functions

# 1 # Function for discretizing the data
#
# Maps every cell of a numeric matrix (or numeric data frame) to a bin:
#   value <  -1        -> 0
#   -1 <= value < 1    -> 1
#   value >= 1         -> 2
# Missing cells (NA / NaN) are copied through unchanged.
#
# The original cell-by-cell version tested `> 1` for the upper bin, so a
# value of exactly 1 matched no branch and silently kept the initial 0.
# The bins are now left-closed throughout, so 1 falls in the upper bin.
#
# Args:
#   NDATA: numeric matrix or data frame.
# Returns:
#   A double matrix with the same dimensions and column names as NDATA
#   (row names are not carried over).
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)
  DDATA <- matrix(NA_real_, nrow = nrow(values), ncol = ncol(values))
  colnames(DDATA) <- colnames(NDATA)

  # which() drops NA comparisons, so missing cells are never indexed here.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values >= -1 & values < 1)] <- 1
  DDATA[which(values >= 1)] <- 2

  # Copy missing cells verbatim so NaN stays NaN and NA stays NA.
  missing_cells <- is.na(values)
  DDATA[missing_cells] <- values[missing_cells]

  DDATA
}


# Bringing in the file
clinical_pattern <- "Group|Age|Region|PMI|Title|Sex|Braak"

rawdat <- file.choose()
# Read once and split, instead of parsing the same file twice.
raw_table <- read_delim(rawdat, delim = "\t", col_names = FALSE, skip = 1)
is_clinical <- grepl(clinical_pattern, raw_table$X1)

RAWDAT <- raw_table[!is_clinical, ]
# The first non-clinical row holds the column names.
header <- as.character(unlist(RAWDAT[1, ], use.names = FALSE))
if (!"ID_REF" %in% header) {
  stop(
    "The first non-clinical row must be a header containing 'ID_REF'.",
    call. = FALSE
  )
}
names(RAWDAT) <- header

# Just the clinical data
RAWWORD <- raw_table[is_clinical, ]
names(RAWWORD) <- header
# Add col of NAs to clinical data
RAWWORD$ROW_NAs <- as.numeric(rowSums(is.na(RAWWORD)))


## Getting back to the data
# Drop the header row and sort by identifier.
RAWDAT2 <- RAWDAT[-1, ] %>%
  dplyr::arrange(ID_REF)

## Editing the file for R processing
RAWDATID <- as.matrix(RAWDAT2[, 1])

# Coerce every sample column to numeric (non-numeric text becomes NA with a
# warning), then transpose so rows are samples and columns are input rows.
sample_ids <- names(RAWDAT2)[-1]
gene_by_sample <- matrix(
  unlist(lapply(RAWDAT2[-1], as.numeric), use.names = FALSE),
  nrow = nrow(RAWDAT2),
  ncol = length(sample_ids),
  dimnames = list(NULL, sample_ids)
)
RAWDATNUM <- t(gene_by_sample)

## Consolidating genes with the same name
tabRDATID <- table(RAWDATID)
# Column positions in RAWDATNUM for each distinct ID, computed in one pass.
# split() orders its groups by sorted ID, the same order table() uses.
id_columns <- split(seq_len(nrow(RAWDATID)), RAWDATID[, 1])
NuRDATN <- matrix(0, nrow = nrow(RAWDATNUM), ncol = length(id_columns))
for (g in seq_along(id_columns)) {
  cols <- id_columns[[g]]
  NuRDATN[, g] <- if (length(cols) == 1L) {
    # Putting the ones without duplicates in their new homes
    RAWDATNUM[, cols]
  } else {
    # Averaging duplicates and putting them in their new homes.
    # drop = FALSE keeps a matrix even when there is only one sample.
    rowMeans(RAWDATNUM[, cols, drop = FALSE], na.rm = TRUE)
  }
}


# Scaling the Data
# scale() centers and scales each column, i.e. each ID across samples.
scrawdat <- scale(NuRDATN)
attr(scrawdat, "scaled:center") <- NULL
attr(scrawdat, "scaled:scale") <- NULL
colnames(scrawdat) <- names(id_columns)


# Discretized the Data
dialzdat <- scrawdat %>%
  dndat() %>%
  t() %>%
  as.data.frame()
colnames(dialzdat) <- rownames(RAWDATNUM)

# gene names, stored as a character column called "ID_REF"
genena <- data.frame(ID_REF = rownames(dialzdat), stringsAsFactors = FALSE)
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(genena, dialzdat)

# NAs in a column
# One extra row labelled "COL_NAs" holding the NA count of every sample
# column; the leading 0 is a placeholder that the label then overwrites.
col_na_counts <- unname(colSums(is.na(dialzdat[-1])))
nacol <- as.data.frame(t(c(0, col_na_counts)), stringsAsFactors = FALSE)
nacol[1, 1] <- "COL_NAs"
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

# NAs in a row
# Counted after the COL_NAs row was appended, so that row gets a count too.
dialzdat$ROW_NAs <- as.numeric(rowSums(is.na(dialzdat)))

# converting to character so that the clinical can be brought together with
# discrete data.
# The original loop range `2:dim(dialzdat)[2]-1` parses as `(2:n) - 1`,
# i.e. columns 1 .. n-1. ID_REF is already character, so converting every
# column except the first (ID_REF) and last (ROW_NAs) gives the same result.
value_cols <- seq_len(ncol(dialzdat))[-c(1L, ncol(dialzdat))]
dialzdat[value_cols] <- lapply(dialzdat[value_cols], as.character)


# The full data: clinical rows on top, discretized data underneath
Fullalzdw <- bind_rows(RAWWORD, dialzdat)

# Create the file
# Output name is "GSE" + the digits found in the input file name +
# "dscrt.txt"; it is written to the current working directory.
path_parts <- strsplit(rawdat, "[\\|/]")[[1]]
series_digits <- gsub("\\D", "", path_parts[length(path_parts)])
nfnaex <- paste0("GSE", series_digits, "dscrt.txt")
write.table(
  Fullalzdw,
  file = nfnaex,
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE
)