#Efrain H. Gonzalez
#6/22/2017
options(digits = 11)
#Libraries required to run the code
library(pryr)
library(MASS)
library(dplyr)
library(tidyr)
library(readr)
library(stringr)
 

#Necessary Functions
#1#Function for handling the changing of row names and column names
chngrownm <- function(mat){
	row <- dim(mat)[1]
	col <- dim(mat)[2]
	e <- 1
	r <- 1
	a <- 1	
	h <- 1 
	g <- 1
	o <- 1
	for(e in 1:col){
		if("!Sample_source_name_ch1"==mat[1,e]){
			colnames(mat)[e] <- "Brain_Region"	
		} else if("!Sample_title" == mat[1,e]){
			colnames(mat)[e] <- "Title"
		} else if("!Sample_geo_accession" == mat[1,e]){
			colnames(mat)[e] <- "ID_REF"
		} else{
			if(grepl("Sex|gender|Gender|sex",mat[2,e])==TRUE){
				colnames(mat)[e] <- paste0("Sex",r)
				r = r + 1
			}
			if(grepl("postmorteminterval|PMI|pmi|interval",mat[2,e])==TRUE){
				colnames(mat)[e] <- paste0("PMI",a)
				a = a + 1
			}
			if(grepl("age|Age|AGE",mat[2,e])==TRUE){
				colnames(mat)[e] <- paste0("Age",h)
				h = h + 1
			 }
			if(grepl("braak|b&b",mat[2,e])==TRUE){
				colnames(mat)[e] <- paste0("Braak",g)
				g = g + 1
			}
			if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){
				colnames(mat)[e] <- paste0("Group",o)
				o = o + 1
			}
			
		}
		e = e + 1
	}
	mat
}			

#2#Function for reorganizing information within the columns
cinfo <- function(mat){
	col <- dim(mat)[2]
	j <-2
	for(j in 2:col){
		if(grepl("Group",colnames(mat)[j]) == TRUE){
			mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
		} else if(grepl("Age",colnames(mat)[j])==TRUE){
			mat[,j] <- gsub("\\D","",mat[,j])%>%
				as.integer()
		} else if(grepl("Sex",colnames(mat)[j])==TRUE){
			mat[,j] <- gsub(".+:\\s","",mat[,j])
		} else if(grepl("PMI",colnames(mat)[j])==TRUE){
			mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
				as.numeric() 
		} else if(grepl("Braak",colnames(mat)[j])==TRUE){
			mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
				as.roman()%>%
				as.integer()
		}
	j=j+1
	}
	mat
}

#3#Function for labeling the gene IDs without names
NAFIXING <- function(GIDNAM){
	row <- dim(GIDNAM)[1]
	i <- 1
	for(i in 1:row){
		if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){
			GIDNAM[i,2] <- GIDNAM[i,1]
		}
		i <- i + 1
	}
	GIDNAM
}	

#4#Function for changing the gene ID to gene name
cgeneID <- function(GeneName,DATA){
	nj <- t(GeneName)
	nq <- t(DATA)
	colGene <- dim(nj)[2]
	colDATA <- dim(nq)[2]
	j <- 1
	for(j in 1:colDATA){
		#where is that gene id located within the GPL file
		chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,])
		if(is.na(sum(chngreq))==FALSE){
			if(sum(chngreq) > 0){
			nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j])
			}
		}
		j <- j + 1
	}
	nq
}
#cgeneID <- function(GeneName,DATA){
#    colGene <- dim(GeneName)[2]
#     j <- 1
#     for(j in 1:colGene){
#	chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
#	if(is.na(sum(chngsreq))==FALSE){
#		if(sum(chngsreq) > 0){
#			DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
#		}
#	}
#	j = j+1
#	}
#	DATA
#}

#5#Function for adjusting the gene names
gcnames <- function(DiData,usecol=1){
	nuruns <- dim(DiData)[2]
	i = 1
	nwnam <- rep("0",length.out=nuruns)
	for(i in 1:nuruns){
		if(length(strsplit(colnames(DiData)[i],"///|//")[[1]]) >= usecol){
			nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///|//")[[1]][usecol])
		} else{
			nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///|//")[[1]][1])
		}
		
	}
	nwnam

}

#6# Function for discretizing the data 
dndat <- function(NDATA){
	rownd <- dim(NDATA)[1]
	colnd <- dim(NDATA)[2]
	DDATA <- matrix(0,nrow=rownd,ncol=colnd)
	colnames(DDATA) <- colnames(NDATA)
	i <- 1
	for(i in 1:rownd){
		j <- 1
		for(j in 1:colnd){
			if(is.na(NDATA[i,j])==FALSE){
			
				if(NDATA[i,j] < -1){
					DDATA[i,j]=0L
				} else if(NDATA[i,j] > 1){
					DDATA[i,j]=2L
				} else if(-1 <= NDATA[i,j] && NDATA[i,j] <= 1){
					DDATA[i,j]=1L
				}
			} else{
				DDATA[i,j] = NDATA[i,j]
			}
			j = j + 1			
		}
		i = i + 1
	}
	DDATA
}


#MajorFunction#This is the function that does everything else
THEFT <- function(){
	#Set working directory based on the directory of the series matrix file Currently only works for windows
	wd <- getwd()
	#list.files()
	#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")
	numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
	GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())
	GSEfloc <- list.files()[GSEfileloc]
	#ALL DATA FILES WILL BE CLEANED
	if(numDAT == 1){
		#indexing the data files
		n <- 1	
		for(n in 1: length(GSEfloc)){
			alz <- GSEfloc[n]
			
			#Working with the wordy part of the document
			alzword <- alz %>%
				read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
				filter(grepl("!Sample",X1))%>%
				filter(!grepl("!Sample_contact",X1))
			
			#Getting the GPL file
			genena <- grep("_platform_id",alzword$X1) %>%
				alzword$X2[.] %>%
				str_trim(.) %>%
				paste0("^",.,"\\D") %>%
				grep(.,list.files()) %>%
				list.files()[.]
			 
			#Find out if it is a soft GPL file or not
			#soft <- strsplit(genena,"[\\|/]") %>%
			#	.[[1]] %>%
			#	.[length(.)] %>%
			#	grepl("soft",.)
			soft <- TRUE
			##Changing row names and column names:
			ALZWORD <- t(alzword)
			rownames(ALZWORD)=NULL
			colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
			ALZWORD <- chngrownm(ALZWORD)[-1,]
			ALZWORD <- ALZWORD%>%
				as.data.frame(.,stringsAsFactors = FALSE)%>%
				dplyr::select(-starts_with("col"))
			
			##Reorganizing information within the columns and final clinical data
			ALZWORDF <- cinfo(ALZWORD)
			
						
			#Working with Actual Data part of file
			alzdat <- alz %>% 
				read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
			ALZDAT <- t(alzdat[,-1])
			rownames(ALZDAT)=NULL
			
			##Is there a clean version of the GPL file available?
			gplnum <- strsplit(genena,"[\\|/]") %>%
				.[[1]] %>%
				.[length(.)] %>%
				gsub("\\D","",.)
			clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
			if(clfileex >= 1){
			#use the clean version
			geneIDNam <-  paste0("Clean_GPL",gplnum,".txt") %>%
				read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
			
			} else if(clfileex == 0){
			##Lets Create a clean version
			 
			##Gene ID to Gene Name
				if(soft == TRUE){
					#Check to see if there is already a file containing information on soft files
					fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
					if(fileex == 1){
						#Check to see if this GPL soft file has been used before
						IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
							.$GPL_FILE_NUM%>%
							grepl(gplnum,.) %>%
							sum()
						if(IDF == 1){
							IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
								.$GPL_FILE_NUM%>%
								grep(gplnum,.)
							idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
									.$LOC_ID %>%
									.[IDLOCAL]
							geneIDNam <- genena %>%
								read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
								dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|GENE_SYMBOL_LIST|^GENE_SYMBOL$|^Gene symbol$|^Gene Symbol$",colnames(.)))
						} else if(IDF == 0){
							#No information on this particular GPL file
							idLOCGPL <- genena %>%
								read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
								t(.) %>%
								grep("^ID\\s*$",.) %>%
								-1
							cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
								cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
							geneIDNam <- genena %>%
								read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
								dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|GENE_SYMBOL_LIST|^GENE_SYMBOL$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^Gene Symbol$",colnames(.)))
						}
					} else if(fileex == 0){
						#We must create a file that we can access for later use
						idLOCGPL <- genena %>%
							read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
							t(.) %>%
							grep("^ID\\s*$",.) %>%
							-1
						Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
						colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
						write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
						geneIDNam <- genena %>%
							read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
							dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|GENE_SYMBOL_LIST|^GENE_SYMBOL$|^Gene symbol$|^Gene Symbol$",colnames(.)))
					}
	 			} else if(soft == FALSE){
					geneIDNam <- genena %>%
						read_delim(delim="\t",comment = "#")%>%
						dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|GENE_SYMBOL_LIST|^GENE_SYMBOL$|^Gene symbol$|^Gene Symbol$",colnames(.)))
				}
				
				##Labeling the gene IDs without names
				geneIDNam <- NAFIXING(geneIDNam)	
				
				##remove the whitespace
				geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
				
				##Here is the clean version
				write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
			}
			
			
			
			##Changing the gene ID to gene name
			ALZDAT1 <- cgeneID(geneIDNam,alzdat)
			colnames(ALZDAT) = ALZDAT1[1,]
						
			
			##Adjusting the column names aka the gene names
			colnames(ALZDAT) <- gcnames(ALZDAT)
			
			
			#Full RAW Data
			Fullalzdwr <- ALZDAT %>%
				as.data.frame(.,stringsAsFactors = FALSE) %>%
				cbind(ALZWORDF,.)
				
			#Raw file is output
			nfnaex <- strsplit(alz,"[\\]") %>%
				.[[1]] %>%
				.[length(.)] %>%
				gsub("\\D","",.) %>%
				c("GSE",.,"aftexcel.txt") %>%
				paste(collapse = "")
			write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
			
						
						
			#Now for the discretization part
			##get the wordy part again
			rawword <- t(ALZWORDF)
			
			##where is ID_REF located
			hereim <- grep("ID_REF",rownames(rawword))
			
			##Subject Names GSM...
			subjnam <- rawword[hereim,]
			
			##Getting the names for the rows
			namedarows <- rownames(rawword)[-hereim] %>%
				as.data.frame(.,stringsAsFactors = FALSE)
			RAWWORD <- rawword[-hereim,] %>%
				as.data.frame(.,stringsAsFactors = FALSE) %>%
				bind_cols(namedarows,.)
			z <- 1
			naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
			for(z in 1:dim(RAWWORD)[1]){
                if(sum(is.na(RAWWORD[z,])) > 0){
				    naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
				}
				if(length(grep("NA",RAWWORD[z,])) > 0){
                naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
                }
                z <- z + 1
            }
			
			colnames(naroww) <- "ROW_NAs"
			RAWWORD <- bind_cols(RAWWORD,naroww)
			
			
			roALZna <- t(ALZDAT) %>%
				rownames(.) %>%
				as.data.frame(.,stringsAsFactors = FALSE)
			colnames(roALZna) <- "ID_REF"
			
			RAWDAT <- t(ALZDAT) %>%
				as.data.frame(.,stringsAsFactors = FALSE)
			colnames(RAWDAT) <- NULL
			rownames(RAWDAT) <- NULL
			
			RAWDAT2 <- RAWDAT %>%
				cbind(roALZna,.) %>%
				dplyr::arrange(.,ID_REF)
			
			##Editing the file for R processing
			RAWDATID <- RAWDAT2[,1] %>%
				as.matrix(.)
			
			RAWDATNUM <- RAWDAT2[,-1] %>%
				mapply(.,FUN = as.numeric) %>%
				t(.)
			
			##Consolidating genes with the same name
			###create empty matrix of size equal  to tabRDATID
			tabRDATID <- table(RAWDATID)
			NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
			j <- 1
			for(j in 1:length(tabRDATID)){
				##Putting the ones without duplicates in their new homes
				if(tabRDATID[j] == 1){
					NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
				} else if(tabRDATID[j] > 1){
				    ##Averaging duplicates and putting them in their new homes
					NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
				}
				j <- j + 1
			}
			
			##Scaling the Data
			scrawdat <- NuRDATN%>%
				scale()
			attr(scrawdat,"scaled:center") <- NULL
			attr(scrawdat,"scaled:scale") <- NULL
			colnames(scrawdat) <- rownames(tabRDATID)
			
			#Outputting the Z-score file
			nfnzsc <- strsplit(alz,"[\\]") %>%
	            .[[1]] %>%
            	.[length(.)] %>%
            	gsub("\\D","",.) %>%
	            c("GSE",.,"zscore.txt") %>%
	            paste(collapse = "")
            zscraw <- scrawdat %>%
	            t()%>%
	            as.data.frame(.,stringsAsFactors = FALSE)
            colnames(zscraw) <- subjnam
            write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = TRUE)
			
			##Discretized the Data
			dialzdat <- scrawdat %>%
				dndat(.) %>%
				t()%>%
				as.data.frame(.,stringsAsFactors = FALSE)
			colnames(dialzdat) <- rownames(RAWDATNUM)
						
			##setting "ID_REF" as a new variable
			geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)
			colnames(geneNAM) <- "ID_REF"
			rownames(dialzdat) <- NULL
			dialzdat <-bind_cols(geneNAM,dialzdat)
			
			##NAs in a column
			x <- 2
			nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
			nacol[1,1] = "COL_NAs"
			for(x in 2:dim(dialzdat)[2]){
				nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
				x <- x + 1
			}
			colnames(nacol) <- colnames(dialzdat)
			dialzdat <- bind_rows(dialzdat,nacol)
			
			##NAs in a row
			y <- 1
			narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
			for(y in 1:dim(dialzdat)[1]){
				narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
				y <- y + 1
			}
			colnames(narowd) <- "ROW_NAs"
			dialzdat <- bind_cols(dialzdat,narowd)
			colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
			colnames(RAWWORD) <- colnames(dialzdat)
			##converting to character so that the clinical can be brought together with discrete data
			k <- 2
			for(k in 2:dim(dialzdat)[2]-1){
				dialzdat[,k] <- as.character(dialzdat[,k])
				k <- k + 1
			}
			#The End the full data
			Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
			
			#Produces Discrete file
			nfnaex2 <- strsplit(alz,"[\\|/]") %>%
				.[[1]] %>%
				.[length(.)] %>%
				gsub("\\D","",.) %>%
				c("GSE",.,"dscrt.txt") %>%
				paste(collapse = "")
			write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
			n <- n +1
		}
	} else if(numDAT == 2){
	#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN

		#All the files you want to analyze
		ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
		if(length(ANDIS) == 0){
			#Spit out a warning
			warning("You did not select any files and so no cleaning will be performed")
		} else{
			#indexing the data files
			n <- 1
			for(n in 1: length(ANDIS)){
				alz <- ANDIS[n]
				
				#Working with the wordy part of the document
				alzword <- alz %>%
					read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
					filter(grepl("!Sample",X1))%>%
					filter(!grepl("!Sample_contact",X1))
				
				#Getting the GPL file
				genena <- grep("_platform_id",alzword$X1) %>%
					alzword$X2[.] %>%
					str_trim(.) %>%
					paste0("^",.,"\\D") %>%
					grep(.,list.files()) %>%
					list.files()[.]
				 
				#Find out if it is a soft GPL file or not
				#soft <- strsplit(genena,"[\\|/]") %>%
				#	.[[1]] %>%
				#	.[length(.)] %>%
				#	grepl("soft",.)
				soft <- TRUE
				##Changing row names and column names:
				ALZWORD <- t(alzword)
				rownames(ALZWORD)=NULL
				colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
				ALZWORD <- chngrownm(ALZWORD)[-1,]
				ALZWORD <- ALZWORD%>%
					as.data.frame(.,stringsAsFactors = FALSE)%>%
					dplyr::select(-starts_with("col"))
				
				##Reorganizing information within the columns and final clinical data
				ALZWORDF <- cinfo(ALZWORD)
				
							
				#Working with Actual Data part of file
				alzdat <- alz %>% 
					read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
				ALZDAT <- t(alzdat[,-1])
				rownames(ALZDAT)=NULL
				
				##Is there a clean version of the GPL file available?
				gplnum <- strsplit(genena,"[\\|/]") %>%
					.[[1]] %>%
					.[length(.)] %>%
					gsub("\\D","",.)
				clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
				if(clfileex >= 1){
				#use the clean version
				geneIDNam <-  paste0("Clean_GPL",gplnum,".txt") %>%
					read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
				
				} else if(clfileex == 0){
				##Lets Create a clean version
				 
				##Gene ID to Gene Name
					if(soft == TRUE){
						#Check to see if there is already a file containing information on soft files
						fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
						if(fileex == 1){
							#Check to see if this GPL soft file has been used before
							IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
								.$GPL_FILE_NUM%>%
								grepl(gplnum,.) %>%
								sum()
							if(IDF == 1){
								IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
									.$GPL_FILE_NUM%>%
									grep(gplnum,.)
								idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
										.$LOC_ID %>%
										.[IDLOCAL]
								geneIDNam <- genena %>%
									read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
									dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|GENE_SYMBOL_LIST|^Gene symbol$|^GENE_SYMBOL$|^Gene Symbol$",colnames(.)))
							} else if(IDF == 0){
								#No information on this particular GPL file
								idLOCGPL <- genena %>%
									read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
									t(.) %>%
									grep("^ID\\s*$",.) %>%
									-1
								cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
									cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
								geneIDNam <- genena %>%
									read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
									dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|GENE_SYMBOL_LIST|^GENE_SYMBOL$|^Gene symbol$|^Gene Symbol$",colnames(.)))
							}
						} else if(fileex == 0){
							#We must create a file that we can access for later use
							idLOCGPL <- genena %>%
								read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
								t(.) %>%
								grep("^ID\\s*$",.) %>%
								-1
							Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
							colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
							write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
							geneIDNam <- genena %>%
								read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
								dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|GENE_SYMBOL_LIST|^GENE_SYMBOL$|^Gene Symbol$",colnames(.)))
						}
		 			} else if(soft == FALSE){
						geneIDNam <- genena %>%
							read_delim(delim="\t",comment = "#")%>%
							dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|GENE_SYMBOL_LIST|^GENE_SYMBOL$|^Gene symbol$|^Gene Symbol$",colnames(.)))
					}
					
					##Labeling the gene IDs without names
						geneIDNam <- NAFIXING(geneIDNam)	
					
					##remove the whitespace
					geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
					
					##Here is the clean version
					write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
				}
				
				
				
				##Changing the gene ID to gene name
				ALZDAT1 <- cgeneID(geneIDNam,alzdat)
				colnames(ALZDAT) = ALZDAT1[1,]
							
				
				##Adjusting the column names aka the gene names
				colnames(ALZDAT) <- gcnames(ALZDAT)
				
			
				#Full RAW Data
				Fullalzdwr <- ALZDAT %>%
					as.data.frame(.,stringsAsFactors = FALSE) %>%
					cbind(ALZWORDF,.)
					
				#Raw file is output
				nfnaex <- strsplit(alz,"[\\]") %>%
					.[[1]] %>%
					.[length(.)] %>%
					gsub("\\D","",.) %>%
					c("GSE",.,"aftexcel.txt") %>%
					paste(collapse = "")
				write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
				
	 						
						
				#Now for the discretization part
				##get the wordy part again
				rawword <- t(ALZWORDF)
				
				##where is ID_REF located
				hereim <- grep("ID_REF",rownames(rawword))
				
				##Subject Names GSM...
				subjnam <- rawword[hereim,]
				
				##Getting the names for the rows
				namedarows <- rownames(rawword)[-hereim] %>%
					as.data.frame(.,stringsAsFactors = FALSE)
				RAWWORD <- rawword[-hereim,] %>%
					as.data.frame(.,stringsAsFactors = FALSE) %>%
					bind_cols(namedarows,.)
				z <- 1
				naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
				for(z in 1:dim(RAWWORD)[1]){
                    if(sum(is.na(RAWWORD[z,])) > 0){
                        naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
                    }
					if(length(grep("NA",RAWWORD[z,])) > 0){
                        naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
                    }
                    z <- z + 1
                }
				
				colnames(naroww) <- "ROW_NAs"
				RAWWORD <- bind_cols(RAWWORD,naroww)
				
					
				roALZna <- t(ALZDAT) %>%
					rownames(.) %>%
					as.data.frame(.,stringsAsFactors = FALSE)
				colnames(roALZna) <- "ID_REF"
				
				RAWDAT <- t(ALZDAT) %>%
					as.data.frame(.,stringsAsFactors = FALSE)
				colnames(RAWDAT) <- NULL
				rownames(RAWDAT) <- NULL
				
				RAWDAT2 <- RAWDAT %>%
					cbind(roALZna,.) %>%
					dplyr::arrange(.,ID_REF)
				
				##Editing the file for R processing
				RAWDATID <- RAWDAT2[,1] %>%
					as.matrix(.)
				
				RAWDATNUM <- RAWDAT2[,-1] %>%
					mapply(.,FUN = as.numeric) %>%
					t(.)
				
				##Consolidating genes with the same name
				###create empty matrix of size equal  to tabRDATID
				tabRDATID <- table(RAWDATID)
				NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
				j <- 1
				for(j in 1:length(tabRDATID)){
					##Putting the ones without duplicates in their new homes
					if(tabRDATID[j] == 1){
						NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
					} else if(tabRDATID[j] > 1){
					##Averaging duplicates and putting them in their new homes
						NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
					}
					j <- j + 1
				}
				
				##Scaling the Data
				scrawdat <- NuRDATN%>%
					scale()
				attr(scrawdat,"scaled:center") <- NULL
				attr(scrawdat,"scaled:scale") <- NULL
				colnames(scrawdat) <- rownames(tabRDATID)
				
				#Outputting the Z-score file
			    nfnzsc <- strsplit(alz,"[\\]") %>%
                    .[[1]] %>%
                    .[length(.)] %>%
                    gsub("\\D","",.) %>%
                    c("GSE",.,"zscore.txt") %>%
                    paste(collapse = "")
                zscraw <- scrawdat %>%
                    t()%>%
                    as.data.frame(.,stringsAsFactors = FALSE)
                colnames(zscraw) <- subjnam
                write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = TRUE)
				
				##Discretized the Data
				dialzdat <- scrawdat %>%
					dndat(.) %>%
					t()%>%
					as.data.frame(.,stringsAsFactors = FALSE)
				colnames(dialzdat) <- rownames(RAWDATNUM)
							
				##setting "ID_REF" as a new variable
				geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)
				colnames(geneNAM) <- "ID_REF"
				rownames(dialzdat) <- NULL
				dialzdat <-bind_cols(geneNAM,dialzdat)
				
				##NAs in a column
				x <- 2
				nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
				nacol[1,1] = "COL_NAs"
				for(x in 2:dim(dialzdat)[2]){
					nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
					x <- x + 1
				}
				colnames(nacol) <- colnames(dialzdat)
				dialzdat <- bind_rows(dialzdat,nacol)
				
				##NAs in a row
				y <- 1
				narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
				for(y in 1:dim(dialzdat)[1]){
					narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
					y <- y + 1
				}
				colnames(narowd) <- "ROW_NAs"
				dialzdat <- bind_cols(dialzdat,narowd)
				colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
				colnames(RAWWORD) <- colnames(dialzdat)
				##converting to character so that the clinical can be brought together with discrete data
				k <- 2
				for(k in 2:dim(dialzdat)[2]-1){
					dialzdat[,k] <- as.character(dialzdat[,k])
					k <- k + 1
				}
				#The End the full data
				Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
				
				#Produces Discrete file
				nfnaex2 <- strsplit(alz,"[\\|/]") %>%
					.[[1]] %>%
					.[length(.)] %>%
					gsub("\\D","",.) %>%
					c("GSE",.,"dscrt.txt") %>%
					paste(collapse = "")
				write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
				
								
				n <- n + 1
			}
		}
	}
}
#The Rest of this code will be used every time you want to change a data set
THEFT()
