#Efrain Gonzalez
#RNA Sequence normalization
#03/29/2018

#Keeping 11 digits
options(digits = 11)
#Libraries required to run the code
library(pryr)
library(MASS)
library(dplyr)
library(tidyr)
library(readr)
library(stringr)

#First we must join all HTSeq-count files together for a given data set
# in this joining make sure that only those genes that are common amongst all sets are
# brought together
#At the end of this step a file will be produced that contains only the genes that were
# common to each HTSeq-count file and the respective count information for that gene

RNATheft <- function() {
	#Get working directory based on the directory that contains the files of interest
	wd <- getwd()
	numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to join and clean all of the HTSeq-Count files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
	HTSeqfiles <- grep("_Count\\.txt$",list.files())
	HTSeqfloc <- list.files()[HTSeqfiles]
	
	#Please join all files
	if(numDAT == 1) {
		#joining all files based on gene information
		for(i in 1:length(HTSeqfloc)) {
			#Using the name of the file to label counts
			namefile <- strsplit(HTSeqfloc[i],".txt") %>%
				.[[1]] %>%
				.[length(.)]
			#Adding the information to finfile based on common genes
			if(i == 1) {
				finfile <- HTSeqfloc[1] %>%
					read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>%
					filter(.,!grepl("^__",.$`Gene Symbol`))
			} else {
				intermfile <- HTSeqfloc[i] %>%
					read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>%
					filter(.,!grepl("^__",.$`Gene Symbol`))
				finfile <- inner_join(finfile,intermfile,by = "Gene Symbol")
				
			}
		}
	} else if(numDAT == 2) {
		#Choose the data files to join and clean
		Chosenfil <- select.list(choices = HTSeqfloc,multiple = TRUE, title = "Choose the HTSeq files that you want to join and clean:")
		if(length(Chosenfil) == 0) {
			#Spit out a warning
			warning("You did not select any files and so no cleaning will be performed")
		} else {
			for(i in 1:length(Chosenfil)) {
				#Using the name of the file to label counts
				namefile <- strsplit(Chosenfil[i],".txt") %>%
					.[[1]] %>%
					.[length(.)]
				#Adding the information to finfile based on common genes
				if(i == 1) {
					finfile <- HTSeqfloc[1] %>%
						read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>%
						filter(.,!grepl("^__",.$`Gene Symbol`))
				} else {
					intermfile <- HTSeqfloc[i] %>%
						read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>%
						filter(.,!grepl("^__",.$`Gene Symbol`))
					finfile <- inner_join(finfile,intermfile,by = "Gene Symbol")
					
				}
			}
		}
		
	}
	finfile1 <- finfile
	finfile1$GeneVariance <- 0.000000
	finfile1$GeneCountSum <- 0L
	for(i in 1:dim(finfile1)[1]) {
		finfile1$GeneVariance[i] <- finfile1[i,-1] %>%
			.[-dim(.)[2]] %>%
			.[-dim(.)[2]] %>%
			as.vector(.,mode = "integer") %>%
			var(.)
		finfile1$GeneCountSum[i] <- finfile1[i,-1] %>%
			.[-dim(.)[2]] %>%
			.[-dim(.)[2]] %>%
			as.vector(.,mode = "integer") %>%
			sum(.)
	}
	#Rank from least variant to most variant
	finfile1 <- arrange(finfile1,finfile1$GeneVariance)

	#find only the ones with a nonzero variance
	finfile1 <- filter(finfile1,finfile1$GeneVariance > 0)	

	##What if instead I use the criteria to be the following
	## variance = [(1-(1/n))^2/(n-1)] + (1/n)^2
	## it will eliminate any that only have 1 column with 1 in it
	## testvar <- ((1-(1/(dim(finfile1)[2]-3)))^2)/(dim(finfile1)[2]-4) + (1/(dim(finfile1)[2]-3))^2
	
	#making sure that all values in each column are at least above zero
	finfile1 <- filter(finfile1,finfile1$GeneCountSum > dim(finfile1)[2]-3)
	
	##Your minimum variance genes are going to make up .1% of the total amount of genes
	if(dim(finfile1)[1] < 1000) {
		numofgenesvar <- 1
	} else {
		numofgenesvar <- round(.001 * dim(finfile1)[1])
	}
	lowestvargenes <- as.data.frame(finfile1[1:numofgenesvar,],stringsAsFactors = FALSE)
	write.table(lowestvargenes,file = "GenesUsedForVariance.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
	lowestvargenes
	estlowestvargenes <- lowestvargenes$GeneVariance %>%
		as.vector(.,mode = "double") %>%
		mean(.)
		
		
	overallmean <- finfile[,-1] %>%
		as.matrix(.,mode = "integer") %>%
		mean(.)
	normalcounts <- finfile[,-1] %>%
		as.matrix(.,mode = "double")
	normalcounts <- (normalcounts - overallmean)/sqrt(estlowestvargenes)
	normalcounts <- as.data.frame(normalcounts,stringsAsFactors = FALSE)
	normalcounts <- cbind(as.data.frame(finfile$`Gene Symbol`,stringsAsFactors = FALSE),normalcounts)
	colnames(normalcounts)[1] <- "Gene Symbol"
	write.table(normalcounts,file = "NormalizedCounts.txt",sep = "\t",row.names = FALSE, col.names = TRUE)
	normalcounts
}