# RAutoClDs.R
# Efrain H. Gonzalez
# 6/19/2017
#
# Cleans GEO series-matrix files (GSE*.txt.gz) found in the working directory:
# the clinical rows are tidied, probe IDs are mapped to gene names using the
# matching GPL platform file, and two tab-separated files are written per
# series: "GSE<digits>aftexcel.txt" (raw) and "GSE<digits>dscrt.txt"
# (scaled and discretized).

# Libraries required to run the code
library(pryr)
library(MASS)
library(dplyr)
library(tidyr)
library(readr)
library(stringr)


# Necessary Functions ----

# 1 # Rename the columns of the transposed clinical matrix.
#
# mat: character matrix with column names already set. Row 1 holds the
#      series-matrix keys ("!Sample_title", ...), row 2 the first sample's
#      values.
# Returns `mat` with renamed columns:
#   * key "!Sample_source_name_ch1" -> "Brain_Region"
#   * key "!Sample_title"           -> "Title"
#   * key "!Sample_geo_accession"   -> "ID_REF"
#   * any other column is classified by the text in row 2 and named
#     Sex<k>, PMI<k>, Age<k>, Braak<k> or Group<k>. When several patterns
#     match, the last one in that order wins; every match still advances its
#     own counter.
# Columns that match nothing keep their name.
chngrownm <- function(mat) {
  keyed_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )
  content_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  counters <- c(Sex = 1, PMI = 1, Age = 1, Braak = 1, Group = 1)

  for (j in seq_len(ncol(mat))) {
    key <- mat[1, j]
    if (key %in% names(keyed_names)) {
      # Columns identified by their key are never re-classified by content.
      # (Previously only the ID_REF column was protected, so a title such as
      # "AD case 1" was renamed to Group1.)
      colnames(mat)[j] <- keyed_names[[key]]
      next
    }
    for (label in names(content_patterns)) {
      if (grepl(content_patterns[[label]], mat[2, j])) {
        colnames(mat)[j] <- paste0(label, counters[[label]])
        counters[[label]] <- counters[[label]] + 1
      }
    }
  }
  mat
}

# 2 # Reorganize the information within the clinical columns.
#
# mat: data frame whose columns were named by chngrownm().
# Columns 2..ncol are rewritten according to their name:
#   Group -> text before ": " and any trailing " ...;..." part removed
#   Age   -> all non-digits removed, then as.integer()
#   Sex   -> text up to ": " removed
#   PMI   -> everything except digits and "." removed, then as.numeric()
#   Braak -> text up to ": " removed, read as a roman numeral, as.integer()
# Column 1 is never touched. Returns the modified `mat`.
cinfo <- function(mat) {
  for (j in seq_len(ncol(mat))[-1]) {
    column_name <- colnames(mat)[j]
    if (grepl("Group", column_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", column_name)) {
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", column_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", column_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", column_name)) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}

# 3 # Label the gene IDs that have no name.
#
# GIDNAM: data frame / tibble; column 1 = probe ID, column 2 = gene name.
# Every entry of column 2 that is NA or the text "NA" (optionally followed by
# whitespace) is replaced by the ID from column 1. Returns the modified table.
NAFIXING <- function(GIDNAM) {
  ids <- GIDNAM[[1]]
  gene_names <- GIDNAM[[2]]
  if (is.factor(ids)) {
    ids <- as.character(ids)
  }
  if (is.factor(gene_names)) {
    gene_names <- as.character(gene_names)
  }
  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)
  # Whole-column replacement: works for data frames and tibbles alike and
  # lets base R coerce when ID and name columns have different types.
  gene_names[unnamed] <- ids[unnamed]
  GIDNAM[[2]] <- gene_names
  GIDNAM
}

# 4 # Change the gene IDs in the first row of DATA to gene names.
#
# GeneName: matrix with probe IDs in row 1 and gene names in row 2.
# DATA:     matrix whose first row holds probe IDs.
# Each ID in DATA[1, ] that equals an ID in GeneName[1, ] is replaced by the
# corresponding name (first occurrence wins). IDs are compared as exact
# strings, not as regular expressions, so characters such as "." or "+" in a
# probe ID cannot match other probes. Returns the modified DATA.
cgeneID <- function(GeneName, DATA) {
  ids <- DATA[1, ]
  hit <- match(ids, GeneName[1, ])
  found <- !is.na(hit) & !is.na(ids)
  DATA[1, found] <- GeneName[2, hit[found]]
  DATA
}

# 5 # Adjust the gene names.
#
# DiData: matrix / data frame whose column names are gene names, possibly
#         several separated by "///".
# usecol: which "///"-separated piece to keep (default 1); when a name has
#         fewer pieces, the first piece is used.
# Returns a character vector of trimmed names, one per column.
gcnames <- function(DiData, usecol = 1) {
  pieces <- strsplit(colnames(DiData), "///")
  vapply(
    pieces,
    function(parts) {
      pick <- if (length(parts) >= usecol) usecol else 1
      str_trim(parts[pick])
    },
    character(1)
  )
}

# 6 # Discretize the data.
#
# NDATA: numeric matrix (here: column-scaled expression values).
# Returns a numeric matrix of the same shape and column names with
#   0 for values < -1, 1 for values in [-1, 1], 2 for values > 1,
# and missing values left missing. (A value of exactly 1 used to fall through
# every test and end up as 0.)
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)
  DDATA <- matrix(1, nrow = dim(values)[1], ncol = dim(values)[2])
  colnames(DDATA) <- colnames(NDATA)
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2
  missing_at <- which(is.na(values))
  DDATA[missing_at] <- values[missing_at]
  DDATA
}


# Helpers used by THEFT ----

# Last piece of `path` after splitting on "\", "|" or "/".
.last_path_part <- function(path) {
  parts <- strsplit(path, "[\\|/]")[[1]]
  parts[length(parts)]
}

# Keep the ID column plus every column that looks like a gene-name column.
.select_gene_columns <- function(tbl) {
  dplyr::select(
    tbl,
    ID,
    grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$", colnames(tbl))
  )
}

# Position of the "ID" header cell in the first 1000 parsed rows of a soft
# GPL file, minus one; used as the `skip` value when reading the table.
.soft_header_offset <- function(genena) {
  head_rows <- read_delim(
    genena,
    delim = "\t", col_names = FALSE, comment = "!", n_max = 1000
  )
  grep("^ID\\s*$", t(head_rows)) - 1
}

# Number of lines to skip in the soft GPL file `genena`.
# The value is cached per platform in "GPL_ID_LOC.txt" (columns GPL_FILE_NUM,
# LOC_ID); the cache is created or appended to when the platform is new.
.soft_skip_lines <- function(genena, gplnum) {
  cache_file <- "GPL_ID_LOC.txt"

  if (!file.exists(cache_file)) {
    offset <- .soft_header_offset(genena)
    # NOTE(review): as.integer() yields NA for digit strings above
    # .Machine$integer.max; kept as in the original cache format.
    first_entry <- cbind(as.integer(gplnum), as.integer(offset))
    colnames(first_entry) <- c("GPL_FILE_NUM", "LOC_ID")
    write.table(
      first_entry,
      file = cache_file, sep = "\t", row.names = FALSE, col.names = TRUE
    )
    return(offset)
  }

  known <- read_delim(cache_file, delim = "\t", col_names = TRUE)
  # Exact numeric comparison: a substring test would confuse e.g. 570 and 5705
  # and could leave the gene table undefined.
  hit <- which(as.numeric(known$GPL_FILE_NUM) == as.numeric(gplnum))
  if (length(hit) > 0) {
    return(known$LOC_ID[hit[1]])
  }

  offset <- .soft_header_offset(genena)
  cat(
    cbind(as.integer(gplnum), as.integer(offset)),
    file = cache_file, sep = "\t", fill = TRUE, append = TRUE
  )
  offset
}

# Return the probe-ID -> gene-name table for a platform.
# Uses "Clean_GPL<gplnum>.txt" when it exists; otherwise builds it from the
# GPL file `genena` (soft or plain), writes the clean file and returns it.
.load_gene_map <- function(genena, soft, gplnum) {
  clean_file <- paste0("Clean_GPL", gplnum, ".txt")
  if (file.exists(clean_file)) {
    return(read_delim(
      clean_file,
      delim = "\t", col_names = c("ID", "Symbol"), comment = "!"
    ))
  }

  if (soft) {
    skip_lines <- .soft_skip_lines(genena, gplnum)
    geneIDNam <- read_delim(
      genena,
      delim = "\t", col_names = TRUE, comment = "!", skip = skip_lines
    )
  } else {
    geneIDNam <- read_delim(genena, delim = "\t", comment = "#")
  }
  geneIDNam <- .select_gene_columns(geneIDNam)

  # Label the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  # Remove the whitespace; the result is a two-column character matrix
  gene_rows <- t(geneIDNam)
  geneIDNam <- t(rbind(str_trim(gene_rows[1, ]), str_trim(gene_rows[2, ])))

  # Here is the clean version
  write.table(
    geneIDNam,
    file = clean_file, sep = "\t", row.names = FALSE, col.names = FALSE
  )
  geneIDNam
}

# Clean one series-matrix file `alz` and write its two output files.
.clean_series_file <- function(alz) {
  # Working with the wordy part of the document
  alzword <- alz %>%
    read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
    filter(grepl("!Sample", X1)) %>%
    filter(!grepl("!Sample_contact", X1))

  # Getting the GPL file: a file in the working directory whose name starts
  # with the platform id given in the series matrix
  platform_id <- str_trim(alzword$X2[grep("_platform_id", alzword$X1)])
  dir_files <- list.files()
  genena <- dir_files[grep(paste0("^", platform_id), dir_files)]
  if (length(genena) == 0) {
    stop(
      "No GPL file starting with '", paste(platform_id, collapse = ", "),
      "' was found for ", alz,
      call. = FALSE
    )
  }

  # Soft GPL file or not, and the digits identifying the platform
  gpl_name <- .last_path_part(genena)
  soft <- grepl("soft", gpl_name)
  gplnum <- gsub("\\D", "", gpl_name)

  # Changing row names and column names
  ALZWORD <- t(alzword)
  rownames(ALZWORD) <- NULL
  colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
  ALZWORD <- chngrownm(ALZWORD)[-1, ]
  ALZWORD <- ALZWORD %>%
    as.data.frame() %>%
    dplyr::select(-starts_with("col"))

  # Reorganizing information within the columns: final clinical data
  ALZWORDF <- cinfo(ALZWORD)

  # Working with the actual data part of the file
  alzdat <- alz %>%
    read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = 1)
  ALZDAT <- t(alzdat[, -1])
  rownames(ALZDAT) <- NULL

  # Gene ID to gene name
  geneIDNam <- .load_gene_map(genena, soft, gplnum)
  ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
  colnames(ALZDAT) <- ALZDAT1[1, ]

  # Adjusting the column names aka the gene names
  colnames(ALZDAT) <- gcnames(ALZDAT)

  # Full RAW data
  Fullalzdwr <- ALZDAT %>%
    as.data.frame() %>%
    cbind(ALZWORDF, .)

  # Raw file is output
  gse_digits <- gsub("\\D", "", .last_path_part(alz))
  write.table(
    t(Fullalzdwr),
    file = paste0("GSE", gse_digits, "aftexcel.txt"), sep = "\t"
  )

  # Now for the discretization part
  ## get the wordy part again
  rawword <- t(ALZWORDF)

  ## where is ID_REF located
  hereim <- grep("ID_REF", rownames(rawword))

  ## Subject names GSM...
  subjnam <- rawword[hereim, ]

  ## Getting the names for the rows
  namedarows <- rownames(rawword)[-hereim] %>%
    as.data.frame()
  RAWWORD <- rawword[-hereim, ] %>%
    as.data.frame() %>%
    bind_cols(namedarows, .)

  ## NAs per clinical row
  naroww <- data.frame(ROW_NAs = unname(rowSums(is.na(RAWWORD))))
  RAWWORD <- bind_cols(RAWWORD, naroww)

  roALZna <- t(ALZDAT) %>%
    rownames(.) %>%
    as.data.frame(.)
  colnames(roALZna) <- "ID_REF"

  RAWDAT <- t(ALZDAT) %>%
    as.data.frame(.)
  colnames(RAWDAT) <- NULL
  rownames(RAWDAT) <- NULL

  RAWDAT2 <- RAWDAT %>%
    cbind(roALZna, .) %>%
    dplyr::arrange(., ID_REF)

  ## Editing the file for R processing
  RAWDATID <- RAWDAT2[, 1] %>%
    as.matrix(.)

  RAWDATNUM <- RAWDAT2[, -1] %>%
    mapply(., FUN = as.numeric) %>%
    t(.)

  ## Consolidating genes with the same name: one output column per distinct
  ## name (sorted as a factor would sort them), duplicates are averaged
  gene_ids <- as.vector(RAWDATID)
  gene_groups <- split(seq_along(gene_ids), gene_ids)
  NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(gene_groups))
  for (j in seq_along(gene_groups)) {
    gene_cols <- RAWDATNUM[, gene_groups[[j]], drop = FALSE]
    NuRDATN[, j] <- if (ncol(gene_cols) == 1L) {
      gene_cols[, 1L]
    } else {
      rowMeans(gene_cols, na.rm = TRUE)
    }
  }

  ## Scaling the data
  scrawdat <- NuRDATN %>%
    scale()
  attr(scrawdat, "scaled:center") <- NULL
  attr(scrawdat, "scaled:scale") <- NULL
  colnames(scrawdat) <- names(gene_groups)

  ## Discretized the data
  dialzdat <- scrawdat %>%
    dndat(.) %>%
    t() %>%
    as.data.frame(.)
  colnames(dialzdat) <- rownames(RAWDATNUM)

  ## setting "ID_REF" as a new variable
  geneNAM <- as.data.frame(as.matrix(rownames(dialzdat), ncol = 1))
  colnames(geneNAM) <- "ID_REF"
  rownames(dialzdat) <- NULL
  dialzdat <- bind_cols(geneNAM, dialzdat)

  ## NAs in a column (extra row labelled "COL_NAs")
  na_per_col <- colSums(is.na(dialzdat[, -1, drop = FALSE]))
  nacol <- as.data.frame(
    t(c(0, unname(na_per_col))),
    stringsAsFactors = FALSE
  )
  nacol[1, 1] <- "COL_NAs"
  colnames(nacol) <- colnames(dialzdat)
  dialzdat <- bind_rows(dialzdat, nacol)

  ## NAs in a row (extra column "ROW_NAs")
  narowd <- data.frame(ROW_NAs = unname(rowSums(is.na(dialzdat))))
  dialzdat <- bind_cols(dialzdat, narowd)
  colnames(dialzdat)[2:(dim(dialzdat)[2] - 1)] <- subjnam
  colnames(RAWWORD) <- colnames(dialzdat)

  ## Converting to character so that the clinical rows can be brought
  ## together with the discrete data. Every column except the last (ROW_NAs)
  ## is converted; the original `2:n-1` evaluated to exactly this range.
  for (k in seq_len(dim(dialzdat)[2] - 1)) {
    dialzdat[[k]] <- as.character(dialzdat[[k]])
  }

  # The End: the full data
  Dscrtalzdw <- bind_rows(RAWWORD, dialzdat)

  # Produces Discrete file
  write.table(
    Dscrtalzdw,
    file = paste0("GSE", gse_digits, "dscrt.txt"),
    sep = "\t", col.names = TRUE, row.names = FALSE
  )
  invisible(NULL)
}


# MajorFunction # This is the function that does everything else.
#
# Works on the current working directory. Asks whether every "GSE*.txt.gz"
# file there should be cleaned ("Yes") or whether the files should be picked
# from a list ("No"); leaving the menu prints "Nothing done". Each chosen file
# is processed by .clean_series_file(). Called for its side effects; returns
# NULL invisibly.
THEFT <- function() {
  wd <- getwd()
  answer <- menu(
    choices = c("Yes", "No"),
    title = paste0(
      "Do you want to clean all data files in the directory ", wd, "?"
    )
  )
  if (answer == 0) {
    cat("Nothing done\n")
    return(invisible(NULL))
  }

  dir_files <- list.files()
  gse_files <- dir_files[grep("^GSE.+\\.txt\\.gz$", dir_files)]

  if (answer == 1) {
    # ALL DATA FILES WILL BE CLEANED
    targets <- gse_files
    if (length(targets) == 0) {
      warning("No GSE series matrix files (GSE*.txt.gz) were found in ", wd)
    }
  } else {
    # CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
    targets <- select.list(
      choices = gse_files,
      multiple = TRUE,
      title = "Choose the file/files you want to analyze:"
    )
    if (length(targets) == 0) {
      warning(
        "You did not select any files and so no cleaning will be performed"
      )
    }
  }

  for (alz in targets) {
    .clean_series_file(alz)
  }
  invisible(NULL)
}

# The rest of this code will be used every time you want to change a data set.
# menu() cannot be used non-interactively, so the prompt is only started in an
# interactive session.
if (interactive()) {
  THEFT()
} else {
  message("THEFT() needs an interactive session; call it from the R console.")
}