Commit 788834dd790ca6d024f704be6cb685756857a503

Authored by Efrain Gonzalez
1 parent 69cbaf694d
Exists in master

This code takes the clean data and discretizes it

Showing 1 changed file with 157 additions and 0 deletions   Show diff stats
... ... @@ -0,0 +1,157 @@
  1 +#For Reading Raw Data from the created file
  2 +
  3 +#Required Libraries
  4 +library(MASS)
  5 +library(dplyr)
  6 +library(tidyr)
  7 +library(readr)
  8 +library(stringr)
  9 +
  10 +
  11 +#Necessary Functions
  12 +
  13 +#1# Function for discretizing the data
  14 +dndat <- function(NDATA){
  15 + rownd <- dim(NDATA)[1]
  16 + colnd <- dim(NDATA)[2]
  17 + DDATA <- matrix(0,nrow=rownd,ncol=colnd)
  18 + colnames(DDATA) <- colnames(NDATA)
  19 + i = 1
  20 + for(i in 1:rownd){
  21 + for(j in 1:colnd){
  22 + if(is.na(NDATA[i,j])==FALSE){
  23 +
  24 + if(NDATA[i,j] < -1){
  25 + DDATA[i,j]=0L
  26 + }
  27 + if(NDATA[i,j] > 1){
  28 + DDATA[i,j]=2L
  29 + }
  30 + if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
  31 + DDATA[i,j]=1L
  32 + }
  33 + } else{
  34 + DDATA[i,j] = NDATA[i,j]
  35 + }
  36 + j = j + 1
  37 + }
  38 + i = i + 1
  39 + }
  40 + DDATA
  41 +}
  42 +
  43 +
  44 +#Bringing in the file
  45 +rawdat <- file.choose()
  46 +RAWDAT <- rawdat %>%
  47 + read_delim(delim ="\t",col_names = FALSE,skip=1) %>%
  48 + filter(.,!grepl("Group|Age|Region|PMI|Title|Sex|Braak",X1))
  49 +attributes(RAWDAT)$names <- RAWDAT[1,]
  50 +
  51 +#Just the clinical data
  52 +RAWWORD <- rawdat %>%
  53 + read_delim(delim ="\t",col_names = FALSE,skip=1) %>%
  54 + filter(.,grepl("Group|Age|Region|PMI|Title|Sex|Braak",X1))
  55 +attributes(RAWWORD)$names <- RAWDAT[1,]
  56 +#Add col of NAs to clinical data
  57 +z <- 1
  58 +naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
  59 +for(z in 1:dim(RAWWORD)[1]){
  60 + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
  61 + z <- z + 1
  62 +}
  63 +colnames(naroww) <- "ROW_NAs"
  64 +RAWWORD <- bind_cols(RAWWORD,naroww)
  65 +
  66 +
  67 +##Getting back to the data
  68 +RAWDAT2 <- RAWDAT[-1,] %>%
  69 + dplyr::arrange(.,ID_REF)
  70 +
  71 +##Editing the file for R processing
  72 +RAWDATID <- RAWDAT2[,1] %>%
  73 + as.matrix(.)
  74 +RAWDATNUM <- RAWDAT2[,-1] %>%
  75 + mapply(.,FUN = as.numeric) %>%
  76 + t(.)
  77 +
  78 +##Consolidating genes with the same name
  79 +tabRDATID <- table(RAWDATID)
  80 +NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
  81 +j <- 1
  82 +for(j in 1:length(tabRDATID)){
  83 + ##Putting the ones without duplicates in their new homes
  84 + if(tabRDATID[j] == 1){
  85 + NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
  86 + }
  87 + ##Averaging duplicates and putting them in their new homes
  88 + if(tabRDATID[j] > 1){
  89 + NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
  90 + }
  91 + j <- j + 1
  92 +}
  93 +
  94 +
  95 +#Scaling the Data
  96 +scrawdat <- NuRDATN%>%
  97 + scale()
  98 +attr(scrawdat,"scaled:center") <- NULL
  99 +attr(scrawdat,"scaled:scale") <- NULL
  100 +colnames(scrawdat) <- rownames(tabRDATID)
  101 +
  102 +
  103 +#Discretized the Data
  104 +dialzdat <- scrawdat %>%
  105 + dndat(.) %>%
  106 + t()%>%
  107 + as.data.frame(.)
  108 +colnames(dialzdat) <- rownames(RAWDATNUM)
  109 +
  110 +#gene names
  111 +genena <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
  112 +#setting "ID_REF" as a new variable
  113 +colnames(genena) <- "ID_REF"
  114 +rownames(dialzdat) <- NULL
  115 +dialzdat <-bind_cols(genena,dialzdat)
  116 +
  117 +#NAs in a column
  118 +x <- 2
  119 +nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
  120 +nacol[1,1] = "COL_NAs"
  121 +for(x in 2:dim(dialzdat)[2]){
  122 + nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
  123 + x <- x + 1
  124 +}
  125 +colnames(nacol) <- colnames(dialzdat)
  126 +dialzdat<-bind_rows(dialzdat,nacol)
  127 +
  128 +#NAs in a row
  129 +y <- 1
  130 +narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
  131 +for(y in 1:dim(dialzdat)[1]){
  132 + narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
  133 + y <- y + 1
  134 +}
  135 +colnames(narowd) <- "ROW_NAs"
  136 +dialzdat <- bind_cols(dialzdat,narowd)
  137 +
  138 +#converting to character so that the clinical can be brought together with discrete data
  139 +k <- 2
  140 +for(k in 2:dim(dialzdat)[2]-1){
  141 + dialzdat[,k] <- as.character(dialzdat[,k])
  142 + k <- k + 1
  143 +}
  144 +
  145 +
  146 +#The End the full data we seem to have found Carmen
  147 +Fullalzdw <- bind_rows(RAWWORD,dialzdat)
  148 +
  149 +#Create the file
  150 +nfnaex <- strsplit(rawdat,"[\\|/]") %>%
  151 + .[[1]] %>%
  152 + .[length(.)] %>%
  153 + gsub("\\D","",.) %>%
  154 + c("GSE",.,"dscrt.txt") %>%
  155 + paste(collapse = "")
  156 +write.table(Fullalzdw, file = nfnaex, sep = "\t",col.names = TRUE,row.names = FALSE)
  157 +