Commit 7360830df3c63ce651ca2ed6d6e28654b3b04852

Authored by Efrain Gonzalez
1 parent d0434e8502
Exists in master

For Count data

Showing 1 changed file with 131 additions and 0 deletions   Show diff stats
RNormalizeCounts.txt
File was created 1 #Efrain Gonzalez
2 #RNA Sequence normalization
3 #03/29/2018
4
5 #Keeping 11 digits
6 options(digits = 11)
7 #Libraries required to run the code
8 library(pryr)
9 library(MASS)
10 library(dplyr)
11 library(tidyr)
12 library(readr)
13 library(stringr)
14
15 #First we must join all HTSeq-count files together for a given data set
16 # in this joining make sure that only those genes that are common amongst all sets are
17 # brought together
18 #At the end of this step a file will be produced that contains only the genes that were
19 # common to each HTSeq-count file and the respective count information for that gene
20
21 RNATheft <- function() {
22 #Get working directory based on the directory that contains the files of interest
23 wd <- getwd()
24 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to join and clean all of the HTSeq-Count files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
25 HTSeqfiles <- grep("_Count\\.txt$",list.files())
26 HTSeqfloc <- list.files()[HTSeqfiles]
27
28 #Please join all files
29 if(numDAT == 1) {
30 #joining all files based on gene information
31 for(i in 1:length(HTSeqfloc)) {
32 #Using the name of the file to label counts
33 namefile <- strsplit(HTSeqfloc[i],".txt") %>%
34 .[[1]] %>%
35 .[length(.)]
36 #Adding the information to finfile based on common genes
37 if(i == 1) {
38 finfile <- HTSeqfloc[1] %>%
39 read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>%
40 filter(.,!grepl("^__",.$`Gene Symbol`))
41 } else {
42 intermfile <- HTSeqfloc[i] %>%
43 read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>%
44 filter(.,!grepl("^__",.$`Gene Symbol`))
45 finfile <- inner_join(finfile,intermfile,by = "Gene Symbol")
46
47 }
48 }
49 } else if(numDAT == 2) {
50 #Choose the data files to join and clean
51 Chosenfil <- select.list(choices = HTSeqfloc,multiple = TRUE, title = "Choose the HTSeq files that you want to join and clean:")
52 if(length(Chosenfil) == 0) {
53 #Spit out a warning
54 warning("You did not select any files and so no cleaning will be performed")
55 } else {
56 for(i in 1:length(Chosenfil)) {
57 #Using the name of the file to label counts
58 namefile <- strsplit(Chosenfil[i],".txt") %>%
59 .[[1]] %>%
60 .[length(.)]
61 #Adding the information to finfile based on common genes
62 if(i == 1) {
63 finfile <- HTSeqfloc[1] %>%
64 read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>%
65 filter(.,!grepl("^__",.$`Gene Symbol`))
66 } else {
67 intermfile <- HTSeqfloc[i] %>%
68 read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>%
69 filter(.,!grepl("^__",.$`Gene Symbol`))
70 finfile <- inner_join(finfile,intermfile,by = "Gene Symbol")
71
72 }
73 }
74 }
75
76 }
77 finfile1 <- finfile
78 finfile1$GeneVariance <- 0.000000
79 finfile1$GeneCountSum <- 0L
80 for(i in 1:dim(finfile1)[1]) {
81 finfile1$GeneVariance[i] <- finfile1[i,-1] %>%
82 .[-dim(.)[2]] %>%
83 .[-dim(.)[2]] %>%
84 as.vector(.,mode = "integer") %>%
85 var(.)
86 finfile1$GeneCountSum[i] <- finfile1[i,-1] %>%
87 .[-dim(.)[2]] %>%
88 .[-dim(.)[2]] %>%
89 as.vector(.,mode = "integer") %>%
90 sum(.)
91 }
92 #Rank from least variant to most variant
93 finfile1 <- arrange(finfile1,finfile1$GeneVariance)
94
95 #find only the ones with a nonzero variance
96 finfile1 <- filter(finfile1,finfile1$GeneVariance > 0)
97
98 ##What if instead I use the criteria to be the following
99 ## variance = [(1-(1/n))^2/(n-1)] + (1/n)^2
100 ## it will eliminate any that only have 1 column with 1 in it
101 ## testvar <- ((1-(1/(dim(finfile1)[2]-3)))^2)/(dim(finfile1)[2]-4) + (1/(dim(finfile1)[2]-3))^2
102
103 #making sure that all values in each column are at least above zero
104 finfile1 <- filter(finfile1,finfile1$GeneCountSum > dim(finfile1)[2]-3)
105
106 ##Your minimum variance genes are going to make up .1% of the total amount of genes
107 if(dim(finfile1)[1] < 1000) {
108 numofgenesvar <- 1
109 } else {
110 numofgenesvar <- round(.001 * dim(finfile1)[1])
111 }
112 lowestvargenes <- as.data.frame(finfile1[1:numofgenesvar,],stringsAsFactors = FALSE)
113 write.table(lowestvargenes,file = "GenesUsedForVariance.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
114 lowestvargenes
115 estlowestvargenes <- lowestvargenes$GeneVariance %>%
116 as.vector(.,mode = "double") %>%
117 mean(.)
118
119
120 overallmean <- finfile[,-1] %>%
121 as.matrix(.,mode = "integer") %>%
122 mean(.)
123 normalcounts <- finfile[,-1] %>%
124 as.matrix(.,mode = "double")
125 normalcounts <- (normalcounts - overallmean)/sqrt(estlowestvargenes)
126 normalcounts <- as.data.frame(normalcounts,stringsAsFactors = FALSE)
127 normalcounts <- cbind(as.data.frame(finfile$`Gene Symbol`,stringsAsFactors = FALSE),normalcounts)
128 colnames(normalcounts)[1] <- "Gene Symbol"
129 write.table(normalcounts,file = "NormalizedCounts.txt",sep = "\t",row.names = FALSE, col.names = TRUE)
130 normalcounts
131 }