##Posted 6/15/2017
options(digits = 11)

#Libraries required to run the code
library(pryr)
library(MASS)
library(dplyr)
library(tidyr)
library(readr)
library(stringr)


#Necessary Functions
#1#Function for handling the changing of row names and column names
chngrownm <- function(mat){
	row <- dim(mat)[1]
	col <- dim(mat)[2]
	j <- 1
	x <- 1
	p <- 1	
	a <- 1 
	b <- 1
	g <- 1
	for(j in 1:col){
		if("!Sample_source_name_ch1"==mat[1,j]){
			colnames(mat)[j] <- "Brain_Region"	
		} else if("!Sample_title" == mat[1,j]){
			colnames(mat)[j] <- "Title"
		} else if("!Sample_geo_accession" == mat[1,j]){
			colnames(mat)[j] <- "ID_REF"
		} else{
			if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){
				colnames(mat)[j] <- paste0("Sex",x)
				x = x + 1
			}
			if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){
				colnames(mat)[j] <- paste0("PMI",p)
				p = p + 1
			}
			if(grepl("age|Age|AGE",mat[2,j])==TRUE){
				colnames(mat)[j] <- paste0("Age",a)
				a = a + 1
			 }
			if(grepl("braak|b&b",mat[2,j])==TRUE){
				colnames(mat)[j] <- paste0("Braak",b)
				b = b + 1
			}
			if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,j])==TRUE){
				colnames(mat)[j] <- paste0("Group",g)
				g = g + 1
			}
			
		}
		j = j + 1
	}
	mat
}			

#2#Function for reorganizing information within the columns
cinfo <- function(mat){
	col <- dim(mat)[2]
	j <-2
	for(j in 2:col){
		if(grepl("Group",colnames(mat)[j]) == TRUE){
			mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
		}
		if(grepl("Age",colnames(mat)[j])==TRUE){
			mat[,j] <- gsub("\\D","",mat[,j])%>%
				as.integer()
		}
		if(grepl("Sex",colnames(mat)[j])==TRUE){
			mat[,j] <- gsub(".+:\\s","",mat[,j])
		}
		if(grepl("PMI",colnames(mat)[j])==TRUE){
			mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
				as.numeric() 
		}
		if(grepl("Braak",colnames(mat)[j])==TRUE){
			mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
				as.roman()%>%
				as.integer()
		}
	j=j+1
	}
	mat
}

#3#Function for labeling the gene IDs without names
NAFIXING <- function(GIDNAM){
	row <- dim(GIDNAM)[1]
	i <- 1
	for(i in 1:row){
		if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){
			GIDNAM[i,2] <- GIDNAM[i,1]
		}
		i <- i + 1
	}
	GIDNAM
}	

#4#Function for changing the gene ID to gene name
cgeneID <- function(GeneName,DATA){
	nj <- t(GeneName)
	nq <- t(DATA)
	colGene <- dim(nj)[2]
	colDATA <- dim(nq)[2]
	j <- 1
	for(j in 1:colDATA){
		#where is that gene id located within the GPL file
		chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,])
		if(is.na(sum(chngreq))==FALSE){
			if(sum(chngreq) > 0){
			nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j])
			}
		}
		j <- j + 1
	}
	nq
}
#cgeneID <- function(GeneName,DATA){
#    colGene <- dim(GeneName)[2]
#     j <- 1
#     for(j in 1:colGene){
#	chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
#	if(is.na(sum(chngsreq))==FALSE){
#		if(sum(chngsreq) > 0){
#			DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
#		}
#	}
#		#if(sum(chngsreq) > 0){
#		##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
#		#DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
#		#}
#	j = j+1
#	}
#	DATA
#}

#5#Function for adjusting the gene names
gcnames <- function(DiData,usecol=1){
	nuruns <- dim(DiData)[2]
	i = 1
	nwnam <- rep("0",length.out=nuruns)
	for(i in 1:nuruns){
		if(length(strsplit(colnames(DiData)[i],"///|//")[[1]]) >= usecol){
			nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///|//")[[1]][usecol])
		} else{
			nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///|//")[[1]][1])
		}
		
	}
	nwnam

}

#6# Function for discretizing the data 
dndat <- function(NDATA){
	rownd <- dim(NDATA)[1]
	colnd <- dim(NDATA)[2]
	DDATA <- matrix(0,nrow=rownd,ncol=colnd)
	colnames(DDATA) <- colnames(NDATA)
	i <- 1
	for(i in 1:rownd){
		j <- 1
		for(j in 1:colnd){
			if(is.na(NDATA[i,j])==FALSE){
			
				if(NDATA[i,j] < -1){
					DDATA[i,j]=0L
				} else if(NDATA[i,j] > 1){
					DDATA[i,j]=2L
				} else if(-1 <= NDATA[i,j] && NDATA[i,j] <= 1){
					DDATA[i,j]=1L
				}
			} else{
				DDATA[i,j] = NDATA[i,j]
			}
			j = j + 1			
		}
		i = i + 1
	}
	DDATA
}


#The Rest of this code will be used every time you want to change a data set

#Getting the series matrix file
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

#Getting the GPL file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()


#Find out if it is a soft GPL file or not
soft <- strsplit(genena,"[\\|/]") %>%
	.[[1]] %>%
	.[length(.)] %>%
	grepl("soft|annot",.)

#Working with the wordy part of the document
alzword <- alz %>%
	read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
	filter(grepl("!Sample",X1))%>%
	filter(!grepl("!Sample_contact",X1))

##Changing row names and column names:
ALZWORD <- t(alzword)
rownames(ALZWORD)=NULL
colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
ALZWORD <- chngrownm(ALZWORD)[-1,]
ALZWORD <- ALZWORD%>%
	as.data.frame(.,stringsAsFactors = FALSE)%>%
	dplyr::select(-starts_with("col"))

##Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)


#Working with Actual Data part of file
alzdat <- alz %>% 
	read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
ALZDAT <- t(alzdat[,-1])
rownames(ALZDAT)=NULL

##Is there a clean version of the GPL file available?
gplnum <- strsplit(genena,"[\\|/]") %>%
	.[[1]] %>%
	.[length(.)] %>%
	gsub("\\D","",.)
clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
if(clfileex >= 1){
#use the clean version
geneIDNam <-  paste0("Clean_GPL",gplnum,".txt") %>%
	read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

} else if(clfileex == 0){
##Lets Create a clean version
 
##Gene ID to Gene Name
	if(soft == TRUE){
		#Check to see if there is already a file containing information on soft files
		fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
		if(fileex == 1){
			#Check to see if this GPL soft file has been used before
			IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
				.$GPL_FILE_NUM%>%
				grepl(gplnum,.) %>%
				sum()
			if(IDF == 1){
				IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
					.$GPL_FILE_NUM%>%
					grep(gplnum,.)
				idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
					.$LOC_ID %>%
					.[IDLOCAL]
				geneIDNam <- genena %>%
					read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
					dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$",colnames(.)))
			} else if(IDF == 0){
				#No information on this particular GPL file
				idLOCGPL <- genena %>%
					read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
					t(.) %>%
					grep("^ID\\s*$",.) %>%
					-1
				cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
					cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
				geneIDNam <- genena %>%
					read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
					dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$",colnames(.)))
			}
		} else if(fileex == 0){
			#We must create a file that we can access for later use
			idLOCGPL <- genena %>%
				read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
				t(.) %>%
				grep("^ID\\s*$",.) %>%
				-1
			Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
			colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
			write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
			geneIDNam <- genena %>%
				read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
				dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$",colnames(.)))
		}
	 } else if(soft == FALSE){
		geneIDNam <- genena %>%
		read_delim(delim="\t",comment = "#")%>%
		dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$",colnames(.)))
	}

	##Labeling the gene IDs without names
	geneIDNam <- NAFIXING(geneIDNam)	

	##remove the whitespace
	geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
	
	##Here is the clean version
	write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
}



##Changing the gene ID to gene name
ALZDAT1 <- cgeneID(geneIDNam,alzdat)
colnames(ALZDAT) = ALZDAT1[1,]


##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)


#Full RAW Data
Fullalzdwr <- ALZDAT %>%
	as.data.frame(.,stringsAsFactors = FALSE) %>%
	cbind(ALZWORDF,.)


#Raw file is output
nfnaex <- strsplit(alz,"[\\]") %>%
	.[[1]] %>%
	.[length(.)] %>%
	gsub("\\D","",.) %>%
	c("GSE",.,"aftexcel.txt") %>%
	paste(collapse = "")
write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")


#Now for the discretization part
##get the wordy part again
rawword <- t(ALZWORDF)

##where is ID_REF located
hereim <- grep("ID_REF",rownames(rawword))

##Subject Names GSM...
subjnam <- rawword[hereim,]

##Getting the names for the rows
namedarows <- rownames(rawword)[-hereim] %>%
	as.data.frame(.,stringsAsFactors = FALSE)
RAWWORD <- rawword[-hereim,] %>%
	as.data.frame(.,stringsAsFactors = FALSE) %>%
	bind_cols(namedarows,.)
z <- 1
naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
for(z in 1:dim(RAWWORD)[1]){
    if(sum(is.na(RAWWORD[z,])) > 0){
        naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
    }
	if(length(grep("NA",RAWWORD[z,])) > 0){
        naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
    }
    z <- z + 1
}

colnames(naroww) <- "ROW_NAs"
RAWWORD <- bind_cols(RAWWORD,naroww)


roALZna <- t(ALZDAT) %>%
	rownames(.) %>%
	as.data.frame(.,stringsAsFactors = FALSE)
colnames(roALZna) <- "ID_REF"

RAWDAT <- t(ALZDAT) %>%
	as.data.frame(.,stringsAsFactors = FALSE)
colnames(RAWDAT) <- NULL
rownames(RAWDAT) <- NULL

RAWDAT2 <- RAWDAT %>%
	cbind(roALZna,.) %>%
	dplyr::arrange(.,ID_REF)

##Editing the file for R processing
RAWDATID <- RAWDAT2[,1] %>%
	as.matrix(.)

RAWDATNUM <- RAWDAT2[,-1] %>%
	mapply(.,FUN = as.numeric) %>%
	t(.)

##Consolidating genes with the same name
###create empty matrix of size equal  to tabRDATID
tabRDATID <- table(RAWDATID)
NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
j <- 1
for(j in 1:length(tabRDATID)){

	##Putting the ones without duplicates in their new homes
	if(tabRDATID[j] == 1){
		NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
	} else if(tabRDATID[j] > 1){
	##Averaging duplicates and putting them in their new homes
		NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
	}
	j <- j + 1
}

##Scaling the Data
scrawdat <- NuRDATN%>%
	scale()
attr(scrawdat,"scaled:center") <- NULL
attr(scrawdat,"scaled:scale") <- NULL
colnames(scrawdat) <- rownames(tabRDATID)

#Outputting the Z-score file
nfnzsc <- strsplit(alz,"[\\]") %>%
    .[[1]] %>%
    .[length(.)] %>%
    gsub("\\D","",.) %>%
    c("GSE",.,"zscore.txt") %>%
    paste(collapse = "")
zscraw <- scrawdat %>%
    t()%>%
    as.data.frame(.,stringsAsFactors = FALSE)
colnames(zscraw) <- subjnam
write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = TRUE)

##Discretized the Data
dialzdat <- scrawdat %>%
	dndat(.) %>%
	t()%>%
	as.data.frame(.,stringsAsFactors = FALSE)
colnames(dialzdat) <- rownames(RAWDATNUM)

##setting "ID_REF" as a new variable
geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)
colnames(geneNAM) <- "ID_REF"
rownames(dialzdat) <- NULL
dialzdat <-bind_cols(geneNAM,dialzdat)

##NAs in a column
x <- 2
nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
nacol[1,1] = "COL_NAs"
for(x in 2:dim(dialzdat)[2]){
	nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
	x <- x + 1
}
colnames(nacol) <- colnames(dialzdat)
dialzdat<-bind_rows(dialzdat,nacol)

##NAs in a row
y <- 1
narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
for(y in 1:dim(dialzdat)[1]){
	narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
	y <- y + 1
}
colnames(narowd) <- "ROW_NAs"
dialzdat <- bind_cols(dialzdat,narowd)
colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
colnames(RAWWORD) <- colnames(dialzdat)
##converting to character so that the clinical can be brought together with discrete data
k <- 2
for(k in 2:dim(dialzdat)[2]-1){
	dialzdat[,k] <- as.character(dialzdat[,k])
	k <- k + 1
}
#The End the full data
Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

#Produces Discrete file
nfnaex2 <- strsplit(alz,"[\\|/]") %>%
	.[[1]] %>%
	.[length(.)] %>%
	gsub("\\D","",.) %>%
	c("GSE",.,"dscrt.txt") %>%
	paste(collapse = "")
write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)