An automated version of the RCleanDscret.R

Working on outputting more insightful errors and warnings. (UNTESTED)

An automated version of the RCleanDscret.R
Working on outputting more insightful errors and warnings. (UNTESTED)
Efrain Gonzalez
1 parent 689231363c
Showing 1 changed file with 752 additions and 0 deletions Show diff stats
RAutoClDs.R
@@ -0,0 +1,752 @@
+#Efrain H. Gonzalez
+#6/19/2017
+#Libraries required to run the code
+library(pryr)
+library(MASS)
+library(dplyr)
+library(tidyr)
+library(readr)
+library(stringr)
+ 
+
+#Necessary Functions
+#1#Function for handling the changing of row names and column names
+# Rename the columns of a transposed GEO "!Sample_*" table.
+#
+# mat: character matrix with column names already set; row 1 holds the
+#      "!Sample_*" tag of each column and row 2 the first sample's value.
+# Returns mat with updated column names. Columns that match nothing keep
+# the name they came in with.
+#
+# The three known tags are exclusive with the keyword search: a title or
+# source name that merely contains e.g. "AD" or "control" keeps its
+# "Title"/"Brain_Region" name instead of being relabelled as a Group column.
+chngrownm <- function(mat){
+	tag_names <- c("!Sample_source_name_ch1" = "Brain_Region",
+		"!Sample_title" = "Title",
+		"!Sample_geo_accession" = "ID_REF")
+	# Keyword patterns are tried in this order against the first sample's
+	# value. When several match the same column the last one wins, and every
+	# match uses up one number of its label.
+	keywords <- c(Sex = "Sex|gender|Gender|sex",
+		PMI = "postmorteminterval|PMI|pmi",
+		Age = "age|Age|AGE",
+		Braak = "braak|b&b",
+		Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal")
+	counters <- c(Sex = 1, PMI = 1, Age = 1, Braak = 1, Group = 1)
+	# Without a second row there are no sample values to inspect
+	has_values <- dim(mat)[1] >= 2
+	for(j in seq_len(dim(mat)[2])){
+		tag <- mat[1,j]
+		if(!is.na(tag) && tag %in% names(tag_names)){
+			colnames(mat)[j] <- tag_names[[tag]]
+			next
+		}
+		if(!has_values){
+			next
+		}
+		for(label in names(keywords)){
+			if(grepl(keywords[[label]],mat[2,j])){
+				colnames(mat)[j] <- paste0(label,counters[[label]])
+				counters[[label]] <- counters[[label]] + 1
+			}
+		}
+	}
+	mat
+}
+
+#2#Function for reorganizing information within the columns
+# Strip the "label: " prefixes from the clinical columns and convert the
+# numeric ones.
+#
+# mat: data frame (or matrix) whose column names were set by chngrownm().
+#      The first column is never touched.
+# Returns mat with, for columns whose name contains
+#   "Group" - the text after "label: " (and before a trailing "; ..." part),
+#   "Age"   - the first number in the text, truncated to an integer,
+#   "Sex"   - the text after "label: ",
+#   "PMI"   - the digits and dots of the text as a number,
+#   "Braak" - the text after "label: " read as a roman (or arabic) numeral.
+cinfo <- function(mat){
+	# First number found in each string. Decimal ages such as "80.5" are
+	# truncated to 80 rather than collapsed to 805; strings without a digit
+	# give NA.
+	first_number <- function(x){
+		x <- as.character(x)
+		out <- rep(NA_integer_,length(x))
+		has_digit <- grepl("[0-9]",x)
+		found <- sub("^[^0-9]*([0-9]+(\\.[0-9]+)?).*$","\\1",x[has_digit])
+		out[has_digit] <- as.integer(as.numeric(found))
+		out
+	}
+	# seq_len()[-1] is empty for a single column, unlike 2:col
+	for(j in seq_len(dim(mat)[2])[-1]){
+		column <- colnames(mat)[j]
+		if(grepl("Group",column)){
+			mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
+		}
+		if(grepl("Age",column)){
+			mat[,j] <- first_number(mat[,j])
+		}
+		if(grepl("Sex",column)){
+			mat[,j] <- gsub(".+:\\s","",mat[,j])
+		}
+		if(grepl("PMI",column)){
+			mat[,j] <- as.numeric(gsub("[^0-9\\.]","",mat[,j]))
+		}
+		if(grepl("Braak",column)){
+			mat[,j] <- as.integer(as.roman(gsub(".+:\\s","",mat[,j])))
+		}
+	}
+	mat
+}
+
+#3#Function for labeling the gene IDs without names
+# Give every probe without a gene name its own ID as name.
+#
+# GIDNAM: two-column table (data frame, tibble or matrix); column 1 holds the
+#         IDs, column 2 the gene names.
+# Returns GIDNAM with column 2 filled from column 1 wherever the name is NA
+# or the text "NA" (optionally followed by whitespace). An empty table is
+# returned unchanged.
+NAFIXING <- function(GIDNAM){
+	if(dim(GIDNAM)[1] == 0){
+		return(GIDNAM)
+	}
+	is_df <- is.data.frame(GIDNAM)
+	ids <- if(is_df) GIDNAM[[1]] else GIDNAM[,1]
+	symbols <- if(is_df) GIDNAM[[2]] else GIDNAM[,2]
+	unnamed <- is.na(symbols) | grepl("^NA\\s*$",symbols)
+	if(any(unnamed)){
+		if(is_df){
+			# Work on a character copy so an ID is never lost to a factor
+			# level or numeric type mismatch
+			symbols <- as.character(symbols)
+			symbols[unnamed] <- as.character(ids[unnamed])
+			GIDNAM[[2]] <- symbols
+		} else{
+			GIDNAM[unnamed,2] <- GIDNAM[unnamed,1]
+		}
+	}
+	GIDNAM
+}
+
+#4#Function for changing the gene ID to gene name
+# Replace the probe IDs in the first row of DATA by their gene names.
+#
+# GeneName: matrix with the IDs in row 1 and the matching names in row 2.
+# DATA:     matrix whose first row holds the IDs to translate.
+# Returns DATA with every first-row entry that equals an ID in GeneName
+# replaced by that ID's name; all other entries are left as they are.
+#
+# IDs are compared as exact text through one match() lookup, so characters
+# such as "." or "+" in an ID are not read as regular expression syntax and
+# a name that happens to equal another ID is not translated a second time.
+# When an ID occurs more than once in GeneName its first name is used.
+cgeneID <- function(GeneName,DATA){
+	if(length(GeneName) == 0 || length(DATA) == 0){
+		return(DATA)
+	}
+	probe_ids <- as.character(DATA[1,])
+	lookup <- match(probe_ids,as.character(GeneName[1,]))
+	# A missing ID never counts as a match
+	found <- which(!is.na(lookup) & !is.na(probe_ids))
+	DATA[1,found] <- GeneName[2,lookup[found]]
+	DATA
+}
+
+#5#Function for adjusting the gene names
+# Shorten column names of the form "NAME1 /// NAME2 /// ..." to one name.
+#
+# DiData: matrix or data frame whose column names are to be shortened.
+# usecol: which "///"-separated part to keep (default 1). Names with fewer
+#         parts than usecol fall back to their first part.
+# Returns a character vector with one trimmed name per column (empty for an
+# object without columns). trimws() is the base counterpart of the
+# str_trim() call used here before.
+gcnames <- function(DiData,usecol=1){
+	stopifnot(length(usecol) == 1, !is.na(usecol), usecol >= 1)
+	gene_cols <- colnames(DiData)
+	if(is.null(gene_cols)){
+		if(dim(DiData)[2] == 0){
+			return(character(0))
+		}
+		stop("gcnames() needs an object with column names",call. = FALSE)
+	}
+	# Split every name once, then pick the requested part
+	pieces <- strsplit(gene_cols,"///")
+	vapply(pieces,function(parts){
+		pick <- if(length(parts) >= usecol) usecol else 1
+		trimws(parts[pick])
+	},character(1),USE.NAMES = FALSE)
+}
+
+#6# Function for discretizing the data 
+# Discretize scaled data into three levels.
+#
+# NDATA: numeric matrix (or data frame) of scaled values.
+# Returns a numeric matrix of the same dimensions and column names with
+#   0 for values below -1,
+#   1 for values from -1 to 1 (both ends included),
+#   2 for values above 1,
+# and missing values (NA/NaN) passed through unchanged.
+#
+# A value of exactly 1 used to match none of the three tests and was left at
+# the initial 0; it now falls in the middle level.
+dndat <- function(NDATA){
+	vals <- as.matrix(NDATA)
+	# Start everything in the middle level, then move the two tails
+	DDATA <- matrix(1,nrow = dim(vals)[1],ncol = dim(vals)[2])
+	colnames(DDATA) <- colnames(NDATA)
+	# which() drops the NA comparisons so they cannot be used as indices
+	DDATA[which(vals < -1)] <- 0
+	DDATA[which(vals > 1)] <- 2
+	gaps <- which(is.na(vals))
+	DDATA[gaps] <- vals[gaps]
+	DDATA
+}
+
+
+#MajorFunction#This is the function that does everything else
+#Helper# Last component of a path; splits on "\", "|" and "/".
+.theft_last_part <- function(path){
+	parts <- strsplit(path,"[\\|/]")[[1]]
+	parts[length(parts)]
+}
+
+#Helper# Output file name: "GSE" + the digits of the input file name + suffix.
+.theft_out_name <- function(alz,suffix){
+	paste0("GSE",gsub("\\D","",.theft_last_part(alz)),suffix)
+}
+
+#Helper# Number of raw lines in front of the "ID" header row of a SOFT file.
+# The count includes the "!" lines because read_delim() applies skip to raw
+# file lines before it drops comment lines (see the readr documentation).
+.theft_soft_skip <- function(genena,max_lines = 10000L){
+	header <- grep("^ID\\s*(\t|$)",readLines(genena,n = max_lines,warn = FALSE))
+	if(length(header) == 0){
+		stop("Could not find the 'ID' header row in the first ",max_lines," lines of ",genena,call. = FALSE)
+	}
+	header[1] - 1L
+}
+
+#Helper# Read the ID column and the first gene name column of a platform file.
+# For SOFT files the number of lines to skip is looked up in (or added to)
+# GPL_ID_LOC.txt, a tab-separated table with columns GPL_FILE_NUM and LOC_ID.
+.theft_read_platform <- function(genena,gplnum,soft){
+	if(soft){
+		cache <- "GPL_ID_LOC.txt"
+		have_cache <- file.exists(cache)
+		skip <- NULL
+		if(have_cache){
+			known <- read_delim(cache,delim = "\t",col_names = TRUE)
+			# Exact comparison: GPL96 must not pick up the entry of GPL9600
+			hit <- which(known$GPL_FILE_NUM == as.numeric(gplnum))
+			if(length(hit) > 0){
+				skip <- known$LOC_ID[hit[1]]
+			}
+		}
+		if(is.null(skip)){
+			skip <- .theft_soft_skip(genena)
+			entry <- data.frame(GPL_FILE_NUM = as.integer(gplnum),LOC_ID = as.integer(skip))
+			write.table(entry,file = cache,sep = "\t",row.names = FALSE,col.names = !have_cache,append = have_cache)
+		}
+		platform <- read_delim(genena,delim = "\t",col_names = TRUE,comment = "!",skip = skip)
+	} else{
+		platform <- read_delim(genena,delim = "\t",comment = "#")
+	}
+	id_col <- which(colnames(platform) == "ID")
+	name_col <- grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(platform))
+	if(length(id_col) == 0){
+		stop("The platform file ",genena," has no 'ID' column",call. = FALSE)
+	}
+	if(length(name_col) == 0){
+		stop("The platform file ",genena," has no gene name column (Symbol, ORF or gene_assignment)",call. = FALSE)
+	}
+	# Only the first gene name column is used further on
+	platform[,c(id_col[1],name_col[1])]
+}
+
+#Helper# Two-column ID/gene-name table for a platform file.
+# Reuses Clean_GPL<number>.txt when it exists, otherwise builds and writes it.
+.theft_gene_table <- function(genena){
+	gpl_file <- .theft_last_part(genena)
+	soft <- grepl("soft",gpl_file)
+	gplnum <- gsub("\\D","",gpl_file)
+	clean_file <- paste0("Clean_GPL",gplnum,".txt")
+	# file.exists() instead of a pattern search, so that Clean_GPL9600.txt is
+	# not mistaken for the clean version of GPL96
+	if(file.exists(clean_file)){
+		return(read_delim(clean_file,delim = "\t",col_names = c("ID","Symbol"),comment = "!"))
+	}
+	##Labeling the gene IDs without names
+	geneIDNam <- NAFIXING(.theft_read_platform(genena,gplnum,soft))
+	##remove the whitespace
+	astext <- t(geneIDNam)
+	geneIDNam <- cbind(str_trim(astext[1,]),str_trim(astext[2,]))
+	##Here is the clean version
+	write.table(geneIDNam,file = clean_file,sep = "\t",row.names = FALSE,col.names = FALSE)
+	geneIDNam
+}
+
+#Helper# Clean one series matrix file.
+# Writes GSE<number>aftexcel.txt (clinical information plus raw data) and
+# GSE<number>dscrt.txt (clinical information plus discretized data) to the
+# working directory. Uses chngrownm(), cinfo(), NAFIXING(), cgeneID(),
+# gcnames() and dndat() from this file.
+.theft_clean_file <- function(alz){
+	#Working with the wordy part of the document
+	alzword <- alz %>%
+		read_delim(delim = "\t",comment = "!Series",col_names = FALSE) %>%
+		dplyr::filter(grepl("!Sample",X1)) %>%
+		dplyr::filter(!grepl("!Sample_contact",X1))
+
+	#Getting the GPL file
+	platform_id <- str_trim(alzword$X2[grep("_platform_id",alzword$X1)])
+	if(length(platform_id) == 0 || is.na(platform_id[1])){
+		stop("No '!Sample_platform_id' row was found in ",alz,call. = FALSE)
+	}
+	all_files <- list.files()
+	# The platform ID must be followed by a non-digit, so GPL96 does not
+	# match the files of GPL9600
+	genena <- all_files[grep(paste0("^",platform_id[1],"([^0-9]|$)"),all_files)]
+	if(length(genena) == 0){
+		stop("No platform file starting with '",platform_id[1],"' was found in ",getwd()," (needed for ",alz,")",call. = FALSE)
+	}
+	if(length(genena) > 1){
+		warning("Several platform files match '",platform_id[1],"'; using ",genena[1],call. = FALSE)
+		genena <- genena[1]
+	}
+
+	##Changing row names and column names:
+	ALZWORD <- t(alzword)
+	rownames(ALZWORD) <- NULL
+	colnames(ALZWORD) <- colnames(ALZWORD,do.NULL = FALSE)
+	# drop = FALSE keeps the table shape when there is a single sample
+	ALZWORD <- chngrownm(ALZWORD)[-1,,drop = FALSE] %>%
+		as.data.frame() %>%
+		dplyr::select(-starts_with("col"))
+
+	##Reorganizing information within the columns and final clinical data
+	ALZWORDF <- cinfo(ALZWORD)
+
+	#Working with Actual Data part of file
+	alzdat <- read_delim(alz,delim = "\t",col_names = TRUE,comment = "!",skip = 1)
+	ALZDAT <- t(alzdat[,-1])
+	rownames(ALZDAT) <- NULL
+
+	##Changing the gene ID to gene name
+	geneIDNam <- .theft_gene_table(genena)
+	ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
+	colnames(ALZDAT) <- ALZDAT1[1,]
+
+	##Adjusting the column names aka the gene names
+	colnames(ALZDAT) <- gcnames(ALZDAT)
+
+	#Full RAW Data
+	Fullalzdwr <- cbind(ALZWORDF,as.data.frame(ALZDAT))
+
+	#Raw file is output
+	write.table(t(Fullalzdwr),file = .theft_out_name(alz,"aftexcel.txt"),sep = "\t")
+
+	#Now for the discretization part
+	##get the wordy part again
+	rawword <- t(ALZWORDF)
+
+	##where is ID_REF located
+	hereim <- grep("ID_REF",rownames(rawword))
+	if(length(hereim) == 0){
+		stop("No '!Sample_geo_accession' row was found in ",alz,", so the samples cannot be labelled",call. = FALSE)
+	}
+	hereim <- hereim[1]
+
+	##Subject Names GSM...
+	subjnam <- unname(as.character(rawword[hereim,]))
+
+	##Clinical rows: name, one column per subject, number of NAs in the row
+	clinical <- rawword[-hereim,,drop = FALSE]
+	clinical_names <- rownames(clinical)
+	rownames(clinical) <- NULL
+	RAWWORD <- data.frame(clinical_names,clinical,stringsAsFactors = FALSE)
+	RAWWORD <- bind_cols(RAWWORD,data.frame(ROW_NAs = as.numeric(rowSums(is.na(RAWWORD)))))
+
+	##Editing the file for R processing
+	gene_ids <- colnames(ALZDAT)
+	RAWDATNUM <- ALZDAT
+	storage.mode(RAWDATNUM) <- "double"
+
+	##Consolidating genes with the same name
+	# split() groups the column positions per gene name in sorted name order
+	# (the order table() gave before); columns without a name are left out
+	groups <- split(seq_along(gene_ids),gene_ids)
+	NuRDATN <- matrix(0,nrow = dim(RAWDATNUM)[1],ncol = length(groups))
+	for(j in seq_along(groups)){
+		cols <- groups[[j]]
+		if(length(cols) == 1){
+			##Putting the ones without duplicates in their new homes
+			NuRDATN[,j] <- RAWDATNUM[,cols]
+		} else{
+			##Averaging duplicates and putting them in their new homes
+			NuRDATN[,j] <- rowMeans(RAWDATNUM[,cols,drop = FALSE],na.rm = TRUE)
+		}
+	}
+
+	##Scaling the Data
+	scrawdat <- scale(NuRDATN)
+	attr(scrawdat,"scaled:center") <- NULL
+	attr(scrawdat,"scaled:scale") <- NULL
+	colnames(scrawdat) <- names(groups)
+
+	##Discretized the Data: one row per gene, one column per subject
+	discrete <- t(dndat(scrawdat))
+	dimnames(discrete) <- NULL
+	dialzdat <- as.data.frame(discrete)
+	colnames(dialzdat) <- subjnam
+
+	##setting "ID_REF" as a new variable
+	dialzdat <- bind_cols(data.frame(ID_REF = names(groups),stringsAsFactors = FALSE),dialzdat)
+
+	##NAs in a column
+	col_nas <- as.numeric(colSums(is.na(dialzdat[,-1,drop = FALSE])))
+	nacol <- as.data.frame(c(list("COL_NAs"),as.list(col_nas)),stringsAsFactors = FALSE)
+	colnames(nacol) <- colnames(dialzdat)
+	dialzdat <- bind_rows(dialzdat,nacol)
+
+	##NAs in a row
+	dialzdat <- bind_cols(dialzdat,data.frame(ROW_NAs = as.numeric(rowSums(is.na(dialzdat)))))
+	colnames(RAWWORD) <- colnames(dialzdat)
+
+	##converting to character so that the clinical can be brought together with discrete data
+	# Every column except the trailing ROW_NAs count. This is the set the old
+	# loop "2:dim(dialzdat)[2]-1" covered, because ":" binds tighter than "-"
+	text_cols <- seq_len(dim(dialzdat)[2] - 1)
+	dialzdat[text_cols] <- lapply(dialzdat[text_cols],as.character)
+	RAWWORD[text_cols] <- lapply(RAWWORD[text_cols],as.character)
+
+	#The End the full data
+	Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
+
+	#Produces Discrete file
+	write.table(Dscrtalzdw,file = .theft_out_name(alz,"dscrt.txt"),sep = "\t",col.names = TRUE,row.names = FALSE)
+	invisible(NULL)
+}
+
+#MajorFunction#This is the function that does everything else
+# Asks whether all series matrix files (GSE*.txt.gz) in the current working
+# directory should be cleaned or only a selection, then cleans each chosen
+# file with .theft_clean_file(). Takes no arguments and returns NULL
+# invisibly; its results are the files written to the working directory.
+# Needs an interactive session because it uses menu() and select.list().
+THEFT <- function(){
+	wd <- getwd()
+	picked <- menu(choices = c("Yes","No"),title = paste0("Do you want to clean all data files in the directory ",wd,"?"))
+	if(picked == 0){
+		# The menu was cancelled
+		cat("Nothing done\n")
+		return(invisible(NULL))
+	}
+	gse_files <- grep("^GSE.+\\.txt\\.gz$",list.files(),value = TRUE)
+	if(length(gse_files) == 0){
+		warning("No GSE series matrix files (GSE*.txt.gz) were found in ",wd," and so no cleaning will be performed")
+		return(invisible(NULL))
+	}
+	if(picked == 1){
+		#ALL DATA FILES WILL BE CLEANED
+		todo <- gse_files
+	} else{
+		#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
+		todo <- select.list(choices = gse_files,multiple = TRUE, title = "Choose the file/files you want to analyze:")
+		todo <- todo[nzchar(todo)]
+		if(length(todo) == 0){
+			#Spit out a warning
+			warning("You did not select any files and so no cleaning will be performed")
+			return(invisible(NULL))
+		}
+	}
+	for(alz in todo){
+		# Re-raise any failure with the name of the file that caused it
+		tryCatch(.theft_clean_file(alz),error = function(e){
+			stop("Cleaning of ",alz," failed: ",conditionMessage(e),call. = FALSE)
+		})
+		message("Finished cleaning ",alz)
+	}
+	invisible(NULL)
+}
+#The rest of this code runs every time you want to clean a data set.
+#THEFT() asks its questions through menu() and select.list(), which stop
+#with an error outside an interactive session, so it is only started there.
+if(interactive()){
+	THEFT()
+} else{
+	message("THEFT() needs an interactive R session; source this file interactively to clean data.")
+}
 \ No newline at end of file