# For reading raw data from the created file.
# This section loads the packages, defines the discretizing helper, reads the
# user-selected tab-delimited file, separates the clinical rows from the data
# rows, and builds the numeric matrix used by the rest of the script.

# Required libraries
library(MASS)
library(dplyr)
library(tidyr)
library(readr)
library(stringr)

# Necessary functions

#' Discretize a numeric matrix into three levels.
#'
#' Each non-missing value is mapped to a level by fixed cut-offs:
#'   * value < -1        -> 0
#'   * -1 <= value <= 1  -> 1
#'   * value > 1         -> 2
#' Missing values (NA / NaN) are copied through unchanged.
#'
#' A value of exactly 1 is assigned to the middle level. The previous
#' element-wise tests (`< -1`, `> 1`, `-1 <= x < 1`) matched none of the
#' branches for that value, so it silently kept the initial 0.
#'
#' @param NDATA A numeric matrix (or all-numeric data frame).
#' @return A double matrix with the same dimensions and column names as
#'   `NDATA`; row names are not carried over.
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)
  DDATA <- matrix(0, nrow = nrow(values), ncol = ncol(values))
  colnames(DDATA) <- colnames(NDATA)

  missing <- is.na(values)
  # Keep NA / NaN exactly as they came in.
  DDATA[missing] <- values[missing]
  # Comparisons against NA yield NA; `!missing &` turns those into FALSE so
  # the logical masks never contain NA.
  DDATA[!missing & values >= -1 & values <= 1] <- 1
  DDATA[!missing & values > 1] <- 2
  DDATA
}

# Bringing in the file (chosen interactively by the user)
rawdat <- file.choose()

# Rows whose first field matches one of these words are treated as clinical
# rows; every other row is treated as data.
clinical_pattern <- "Group|Age|Region|PMI|Title|Sex|Braak"

# Read the file once (first line skipped, no header row) and split it, rather
# than parsing the same file twice.
raw_table <- read_delim(rawdat, delim = "\t", col_names = FALSE, skip = 1)

RAWDAT <- raw_table %>%
  filter(!grepl(clinical_pattern, X1))

# The first remaining data row holds the column labels. They are written to
# the "names" attribute directly, as the original script did.
header <- as.character(unlist(RAWDAT[1, ], use.names = FALSE))
attr(RAWDAT, "names") <- header

# Just the clinical data
RAWWORD <- raw_table %>%
  filter(grepl(clinical_pattern, X1))
attr(RAWWORD, "names") <- header

# Add a column with the number of NAs in each clinical row. rowSums() also
# copes with zero clinical rows, where a 1:0 loop would create a spurious row.
naroww <- data.frame(ROW_NAs = unname(rowSums(is.na(RAWWORD))))
RAWWORD <- bind_cols(RAWWORD, naroww)

## Getting back to the data: drop the label row and sort by ID_REF
RAWDAT2 <- RAWDAT[-1, ] %>%
  dplyr::arrange(ID_REF)

## Editing the file for R processing
# Identifiers as a one-column character matrix.
RAWDATID <- as.matrix(RAWDAT2[, 1])

# Convert every remaining column to numeric and transpose, so that each row of
# RAWDATNUM corresponds to one input column and each column to one input row.
# cbind() over the converted columns always yields a matrix, even when there
# is only a single data row (mapply() would collapse that case to a vector and
# the transpose would then have the wrong orientation).
numeric_cols <- lapply(RAWDAT2[, -1], as.numeric)
RAWDATNUM <- t(do.call(cbind, numeric_cols))
## Consolidating genes with the same name
# One output column per distinct identifier in RAWDATID; rows stay as in
# RAWDATNUM.
tabRDATID <- table(RAWDATID)
gene_ids <- names(tabRDATID)
NuRDATN <- matrix(0, nrow = nrow(RAWDATNUM), ncol = length(tabRDATID))
for (g in seq_along(gene_ids)) {
  members <- which(RAWDATID == gene_ids[g])
  if (length(members) == 1L) {
    ## Putting the ones without duplicates in their new homes (copied as-is,
    ## so an NA stays NA rather than becoming NaN)
    NuRDATN[, g] <- RAWDATNUM[, members]
  } else {
    ## Averaging duplicates and putting them in their new homes.
    ## drop = FALSE keeps a matrix even when RAWDATNUM has a single row.
    NuRDATN[, g] <- rowMeans(RAWDATNUM[, members, drop = FALSE], na.rm = TRUE)
  }
}

# Scaling the data (column-wise centring and scaling), then dropping the
# bookkeeping attributes that scale() attaches.
scrawdat <- scale(NuRDATN)
attr(scrawdat, "scaled:center") <- NULL
attr(scrawdat, "scaled:scale") <- NULL
colnames(scrawdat) <- gene_ids

# Discretize the data and flip it back so that each row is one identifier
dialzdat <- as.data.frame(t(dndat(scrawdat)))
colnames(dialzdat) <- rownames(RAWDATNUM)

# Gene names become a regular "ID_REF" column instead of row names
genena <- data.frame(ID_REF = rownames(dialzdat), stringsAsFactors = FALSE)
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(genena, dialzdat)

# NAs in a column: one extra row labelled "COL_NAs" holding, for every column
# except ID_REF, the number of missing values in that column.
col_na_counts <- colSums(is.na(dialzdat[, -1, drop = FALSE]))
nacol <- as.data.frame(t(c(0, unname(col_na_counts))), stringsAsFactors = FALSE)
nacol[1, 1] <- "COL_NAs"
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

# NAs in a row: one extra column "ROW_NAs" (computed after the COL_NAs row
# was appended, so that row is counted as well).
narowd <- data.frame(ROW_NAs = unname(rowSums(is.na(dialzdat))))
dialzdat <- bind_cols(dialzdat, narowd)

# Converting to character so that the clinical rows can be brought together
# with the discrete data. Every column except the trailing ROW_NAs is
# converted. The original range `2:dim(dialzdat)[2]-1` parses as
# `(2:n) - 1`, i.e. 1:(n - 1); that same range is spelled out explicitly here.
value_cols <- seq_len(ncol(dialzdat) - 1L)
dialzdat[value_cols] <- lapply(dialzdat[value_cols], as.character)

# The end: the full data (clinical rows on top, discretized rows below)
Fullalzdw <- bind_rows(RAWWORD, dialzdat)

# Create the file: "GSE" + the digits found in the chosen file's name +
# "dscrt.txt", written to the current working directory.
# Backslashes are normalised to "/" first so that basename() strips the
# directory part for Windows-style paths too; otherwise digits occurring in
# directory names could end up in the output name.
source_name <- basename(gsub("\\", "/", rawdat, fixed = TRUE))
nfnaex <- paste0("GSE", gsub("\\D", "", source_name), "dscrt.txt")
write.table(
  Fullalzdw,
  file = nfnaex,
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE
)