For Count data

Efrain Gonzalez
1 parent d0434e8502
Showing 1 changed file with 131 additions and 0 deletions Show diff stats
RNormalizeCounts.txt
@@ -0,0 +1,131 @@
+#Efrain Gonzalez
+#RNA Sequence normalization
+#03/29/2018
+
+#Print numbers with up to 11 significant digits (this sets a session-wide option)
+options(digits = 11)
+#Libraries required to run the code
+library(pryr)
+library(MASS)
+library(dplyr)
+library(tidyr)
+library(readr)
+library(stringr)
+
+#First, all HTSeq-count files for a given data set are joined together,
+# keeping only the genes that are common to every file.
+#At the end of this step there is a single table that contains only the genes
+# common to each HTSeq-count file, together with the count of each gene in each file.
+
+#Interactively join, filter and normalize the HTSeq-count files in the working directory.
+#
+#Asks whether to use every file whose name ends in "_Count.txt" or only a
+# hand-picked subset, inner-joins the chosen files on gene symbol, and then
+# centers the counts on the overall mean count and scales them by the square
+# root of the mean variance of the least-variant genes (.1% of the genes that
+# pass filtering, but always at least one gene).
+#
+#Side effects: writes "GenesUsedForVariance.txt" and "NormalizedCounts.txt"
+# (tab separated) into the working directory.
+#
+#Returns the normalized counts as a data frame with a "Gene Symbol" column
+# followed by one column per count file. Returns invisible NULL when the user
+# cancels the menu or selects no files. Stops with an error when no count files
+# exist, when fewer than two files are used, or when no gene survives filtering.
+RNATheft <- function() {
+	#Read one HTSeq-count file, naming the count column after the file
+	# (extension removed, "s" appended) and dropping the "__" summary rows
+	readcounts <- function(countfile) {
+		#sub() with an anchored, escaped pattern removes only a literal ".txt" ending
+		countlabel <- paste0(sub("\\.txt$","",countfile),"s")
+		counts <- read_delim(countfile,delim = "\t",col_names = c("Gene Symbol",countlabel))
+		filter(counts,!grepl("^__",counts$`Gene Symbol`))
+	}
+
+	#The working directory is expected to contain the files of interest
+	wd <- getwd()
+	HTSeqfloc <- list.files(pattern = "_Count\\.txt$")
+	if(length(HTSeqfloc) == 0) {
+		stop("No files ending in _Count.txt were found in ",wd,call. = FALSE)
+	}
+
+	answer <- menu(choices = c("Yes","No"),title = paste0("Do you want to join and clean all of the HTSeq-Count files in the directory ",wd,"?"))
+	if(answer == 0) {
+		#menu() returns 0 when the user leaves the menu without choosing
+		cat("Nothing done\n")
+		return(invisible(NULL))
+	}
+	if(answer == 1) {
+		#Please join all files
+		Chosenfil <- HTSeqfloc
+	} else {
+		#Choose the data files to join and clean
+		Chosenfil <- select.list(choices = HTSeqfloc,multiple = TRUE, title = "Choose the HTSeq files that you want to join and clean:")
+		if(length(Chosenfil) == 0) {
+			warning("You did not select any files and so no cleaning will be performed")
+			return(invisible(NULL))
+		}
+	}
+	#The variance of a single value is NA, so one file cannot be normalized
+	if(length(Chosenfil) < 2) {
+		stop("At least two HTSeq-Count files are needed to estimate the variance of each gene",call. = FALSE)
+	}
+
+	#Join the chosen files so that only genes present in every file are kept
+	finfile <- Reduce(function(joined,nextfile) inner_join(joined,nextfile,by = "Gene Symbol"),lapply(Chosenfil,readcounts))
+	if(nrow(finfile) == 0) {
+		stop("No genes are common to all of the chosen files",call. = FALSE)
+	}
+
+	#Variance and total count of every gene across the sample columns
+	countmatrix <- as.matrix(finfile[,-1])
+	numsamples <- ncol(countmatrix)
+	finfile1 <- finfile
+	finfile1$GeneVariance <- apply(countmatrix,1,var)
+	finfile1$GeneCountSum <- rowSums(countmatrix)
+
+	#Rank from least variant to most variant
+	finfile1 <- arrange(finfile1,GeneVariance)
+
+	##What if instead I use the criteria to be the following
+	## variance = [(1-(1/n))^2/(n-1)] + (1/n)^2
+	## it will eliminate any that only have 1 column with 1 in it
+	## testvar <- ((1-(1/numsamples))^2)/(numsamples-1) + (1/numsamples)^2
+
+	#Keep only the genes with a nonzero variance whose total count is larger
+	# than the number of samples
+	finfile1 <- filter(finfile1,GeneVariance > 0,GeneCountSum > numsamples)
+	if(nrow(finfile1) == 0) {
+		stop("No genes passed the variance and count filters",call. = FALSE)
+	}
+
+	##Your minimum variance genes are going to make up .1% of the total amount of genes
+	## (a single gene when fewer than 1000 genes are left)
+	numofgenesvar <- max(1,round(.001 * nrow(finfile1)))
+	lowestvargenes <- as.data.frame(finfile1[seq_len(numofgenesvar),],stringsAsFactors = FALSE)
+	write.table(lowestvargenes,file = "GenesUsedForVariance.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
+	estlowestvargenes <- mean(lowestvargenes$GeneVariance)
+
+	#Center every count on the mean of all counts and scale by the square root
+	# of the variance estimated from the least-variant genes
+	normalcounts <- (countmatrix - mean(countmatrix))/sqrt(estlowestvargenes)
+	normalcounts <- as.data.frame(normalcounts,stringsAsFactors = FALSE)
+	normalcounts <- cbind(as.data.frame(finfile$`Gene Symbol`,stringsAsFactors = FALSE),normalcounts)
+	colnames(normalcounts)[1] <- "Gene Symbol"
+	write.table(normalcounts,file = "NormalizedCounts.txt",sep = "\t",row.names = FALSE, col.names = TRUE)
+	normalcounts
+}
 \ No newline at end of file
...	...	@@ -0,0 +1,131 @@
	1	+#Efrain Gonzalez
	2	+#RNA Sequence normalization
	3	+#03/29/2018
	4	+
	5	+#Keeping 11 digits
	6	+options(digits = 11)
	7	+#Libraries required to run the code
	8	+library(pryr)
	9	+library(MASS)
	10	+library(dplyr)
	11	+library(tidyr)
	12	+library(readr)
	13	+library(stringr)
	14	+
	15	+#First we must join all HTSeq-count files together for a given data set
	16	+# in this joining make sure that only those genes that are common amongst all sets are
	17	+# brought together
	18	+#At the end of this step a file will be produced that contains only the genes that were
	19	+# common to each HTSeq-count file and the respective count information for that gene
	20	+
	21	+RNATheft <- function() {
	22	+ #RNATheft: interactively joins the HTSeq count files found in the working directory (the directory that contains the files of interest), writes GenesUsedForVariance.txt and NormalizedCounts.txt, and returns the normalized counts
	23	+ wd <- getwd()
	24	+ numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to join and clean all of the HTSeq-Count files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
	25	+ HTSeqfiles <- grep("_Count\\.txt$",list.files())
	26	+ HTSeqfloc <- list.files()[HTSeqfiles]
	27	+
	28	+ #User answered Yes (numDAT is 1): join all files; NOTE(review): if the menu is cancelled, switch() appears to leave numDAT NULL and this if() would fail - confirm
	29	+ if(numDAT == 1) {
	30	+ #Join all matching files on the Gene Symbol column
	31	+ for(i in 1:length(HTSeqfloc)) {
	32	+ #Use the file name without its .txt ending to label the count column; NOTE(review): ".txt" is used as a regular expression here, so the dot matches any character
	33	+ namefile <- strsplit(HTSeqfloc[i],".txt") %>%
	34	+ .[[1]] %>%
	35	+ .[length(.)]
	36	+ #Read the file, drop the rows whose symbol starts with "__", and keep only the genes already present in finfile
	37	+ if(i == 1) {
	38	+ finfile <- HTSeqfloc[1] %>%
	39	+ read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>%
	40	+ filter(.,!grepl("^__",.$`Gene Symbol`))
	41	+ } else {
	42	+ intermfile <- HTSeqfloc[i] %>%
	43	+ read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>%
	44	+ filter(.,!grepl("^__",.$`Gene Symbol`))
	45	+ finfile <- inner_join(finfile,intermfile,by = "Gene Symbol")
	46	+
	47	+ }
	48	+ }
	49	+ } else if(numDAT == 2) {
	50	+ #User answered No (numDAT is 2): choose the data files to join and clean
	51	+ Chosenfil <- select.list(choices = HTSeqfloc,multiple = TRUE, title = "Choose the HTSeq files that you want to join and clean:")
	52	+ if(length(Chosenfil) == 0) {
	53	+ #Only a warning is raised; NOTE(review): finfile is never created on this path, so the code after this if/else looks like it will fail - confirm
	54	+ warning("You did not select any files and so no cleaning will be performed")
	55	+ } else {
	56	+ for(i in 1:length(Chosenfil)) {
	57	+ #Use the name of the chosen file (without its .txt ending) to label the count column
	58	+ namefile <- strsplit(Chosenfil[i],".txt") %>%
	59	+ .[[1]] %>%
	60	+ .[length(.)]
	61	+ #Adding the information to finfile based on common genes; NOTE(review): the reads below index HTSeqfloc, not Chosenfil, so the file that is read may not be the chosen one - confirm
	62	+ if(i == 1) {
	63	+ finfile <- HTSeqfloc[1] %>%
	64	+ read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>%
	65	+ filter(.,!grepl("^__",.$`Gene Symbol`))
	66	+ } else {
	67	+ intermfile <- HTSeqfloc[i] %>%
	68	+ read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>%
	69	+ filter(.,!grepl("^__",.$`Gene Symbol`))
	70	+ finfile <- inner_join(finfile,intermfile,by = "Gene Symbol")
	71	+
	72	+ }
	73	+ }
	74	+ }
	75	+
	76	+ }
	77	+ finfile1 <- finfile
	78	+ finfile1$GeneVariance <- 0.000000
	79	+ finfile1$GeneCountSum <- 0L
	80	+ for(i in 1:dim(finfile1)[1]) { #one gene (row) at a time: the first column and the last two columns are dropped so only the count columns remain
	81	+ finfile1$GeneVariance[i] <- finfile1[i,-1] %>%
	82	+ .[-dim(.)[2]] %>%
	83	+ .[-dim(.)[2]] %>%
	84	+ as.vector(.,mode = "integer") %>%
	85	+ var(.)
	86	+ finfile1$GeneCountSum[i] <- finfile1[i,-1] %>%
	87	+ .[-dim(.)[2]] %>%
	88	+ .[-dim(.)[2]] %>%
	89	+ as.vector(.,mode = "integer") %>%
	90	+ sum(.)
	91	+ }
	92	+ #Rank from least variant to most variant
	93	+ finfile1 <- arrange(finfile1,finfile1$GeneVariance)
	94	+
	95	+ #find only the ones with a nonzero variance
	96	+ finfile1 <- filter(finfile1,finfile1$GeneVariance > 0)
	97	+
	98	+ ##What if instead I use the criteria to be the following
	99	+ ## variance = [(1-(1/n))^2/(n-1)] + (1/n)^2
	100	+ ## it will eliminate any that only have 1 column with 1 in it
	101	+ ## testvar <- ((1-(1/(dim(finfile1)[2]-3)))^2)/(dim(finfile1)[2]-4) + (1/(dim(finfile1)[2]-3))^2
	102	+
	103	+ #Keep genes whose total count is larger than the number of count columns (all columns minus Gene Symbol, GeneVariance and GeneCountSum); NOTE(review): a total above that number does not by itself guarantee that every column is above zero
	104	+ finfile1 <- filter(finfile1,finfile1$GeneCountSum > dim(finfile1)[2]-3)
	105	+
	106	+ ##Your minimum variance genes are going to make up .1% of the total amount of genes (a single gene when fewer than 1000 genes are left)
	107	+ if(dim(finfile1)[1] < 1000) {
	108	+ numofgenesvar <- 1
	109	+ } else {
	110	+ numofgenesvar <- round(.001 * dim(finfile1)[1])
	111	+ }
	112	+ lowestvargenes <- as.data.frame(finfile1[1:numofgenesvar,],stringsAsFactors = FALSE)
	113	+ write.table(lowestvargenes,file = "GenesUsedForVariance.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
	114	+ lowestvargenes #this value is neither stored nor returned
	115	+ estlowestvargenes <- lowestvargenes$GeneVariance %>%
	116	+ as.vector(.,mode = "double") %>%
	117	+ mean(.)
	118	+
	119	+ #Subtract the mean of all counts from every count and divide by the square root of the mean variance of the lowest variance genes
	120	+ overallmean <- finfile[,-1] %>%
	121	+ as.matrix(.,mode = "integer") %>%
	122	+ mean(.)
	123	+ normalcounts <- finfile[,-1] %>%
	124	+ as.matrix(.,mode = "double")
	125	+ normalcounts <- (normalcounts - overallmean)/sqrt(estlowestvargenes)
	126	+ normalcounts <- as.data.frame(normalcounts,stringsAsFactors = FALSE)
	127	+ normalcounts <- cbind(as.data.frame(finfile$`Gene Symbol`,stringsAsFactors = FALSE),normalcounts)
	128	+ colnames(normalcounts)[1] <- "Gene Symbol"
	129	+ write.table(normalcounts,file = "NormalizedCounts.txt",sep = "\t",row.names = FALSE, col.names = TRUE)
	130	+ normalcounts
	131	+}
0	132	\ No newline at end of file