Commit 7360830df3c63ce651ca2ed6d6e28654b3b04852
1 parent
d0434e8502
Exists in
master
For Count data
Showing
1 changed file
with
131 additions
and
0 deletions
Show diff stats
RNormalizeCounts.txt
... | ... | @@ -0,0 +1,131 @@ |
1 | +#Efrain Gonzalez | |
2 | +#RNA Sequence normalization | |
3 | +#03/29/2018 | |
4 | + | |
5 | +#Keeping 11 digits | |
6 | +options(digits = 11) | |
7 | +#Libraries required to run the code | |
8 | +library(pryr) | |
9 | +library(MASS) | |
10 | +library(dplyr) | |
11 | +library(tidyr) | |
12 | +library(readr) | |
13 | +library(stringr) | |
14 | + | |
15 | +#First, join all of the HTSeq-count files for a given data set. | |
16 | +# During the join, keep only the genes that are common to every | |
17 | +# file. | |
18 | +#This step produces a table that contains only the genes common to every | |
19 | +# HTSeq-count file, together with each file's count for that gene. | |
20 | + | |
#Read one HTSeq-count file into a two column table.
# path: name of a tab delimited count file that has no header row
#Returns the table produced by read_delim() with the columns "Gene Symbol" and
# "<file name without its .txt ending>s", without the rows whose gene symbol
# starts with "__" (presumably the summary rows written by HTSeq-count; verify
# against your files).
.RNATheftReadCounts <- function(path) {
  #Strip only a literal, trailing ".txt"; an unescaped ".txt" pattern would
  # also match any character followed by "txt" in the middle of the name
  countlabel <- paste0(sub("\\.txt$", "", path), "s")
  counts <- read_delim(path, delim = "\t",
                       col_names = c("Gene Symbol", countlabel))
  filter(counts, !grepl("^__", counts$`Gene Symbol`))
}

#Join HTSeq-count files on their common genes and normalize the counts.
#
#Interactive: asks whether every file in the working directory whose name ends
# in "_Count.txt" should be used ("Yes") or whether the files should be picked
# from a list ("No").
#The chosen files are inner joined on "Gene Symbol". For every gene the variance
# and the sum of its counts across files are computed. Genes with a nonzero
# variance and a count sum greater than the number of files are ranked by
# variance; the lowest 0.1% of them (a single gene when fewer than 1000 remain)
# are written to "GenesUsedForVariance.txt" and the mean of their variances is
# used as the scale. Every count of the joined table is then transformed as
#   (count - mean of all counts) / sqrt(mean variance of the selected genes)
# and the result is written to "NormalizedCounts.txt".
#
#Returns the data frame of normalized counts (first column "Gene Symbol"), or
# invisible NULL when the menu is dismissed or no file is selected.
#Stops with an error when no count files are found, when fewer than two files
# are used (a variance needs at least two values), when the files share no
# gene, or when no gene passes the variance/count filters.
RNATheft <- function() {
  #Get working directory based on the directory that contains the files of interest
  wd <- getwd()
  HTSeqfloc <- list.files(pattern = "_Count\\.txt$")
  if (length(HTSeqfloc) == 0) {
    stop("No HTSeq-Count files (names ending in _Count.txt) were found in ",
         wd, call. = FALSE)
  }

  #paste0() instead of gsub(): backslashes in a path would otherwise be
  # interpreted as escapes in the replacement string
  answer <- menu(choices = c("Yes", "No"),
                 title = paste0("Do you want to join and clean all of the HTSeq-Count files in the directory ", wd, "?"))
  if (answer == 0) {
    #menu() returns 0 when the user exits without choosing
    cat("Nothing done\n")
    return(invisible(NULL))
  }

  if (answer == 1) {
    #Please join all files
    Chosenfil <- HTSeqfloc
  } else {
    #Choose the data files to join and clean
    Chosenfil <- select.list(choices = HTSeqfloc, multiple = TRUE,
                             title = "Choose the HTSeq files that you want to join and clean:")
    if (length(Chosenfil) == 0) {
      #Spit out a warning and leave; there is nothing to join
      warning("You did not select any files and so no cleaning will be performed")
      return(invisible(NULL))
    }
  }
  if (length(Chosenfil) < 2) {
    stop("At least two HTSeq-Count files are needed to estimate gene variances",
         call. = FALSE)
  }

  #Joining the chosen files (and only those) based on common genes
  finfile <- Reduce(
    function(joined, nextfile) inner_join(joined, nextfile, by = "Gene Symbol"),
    lapply(Chosenfil, .RNATheftReadCounts)
  )
  if (nrow(finfile) == 0) {
    stop("The chosen HTSeq-Count files have no genes in common", call. = FALSE)
  }

  #Counts only, one row per gene and one column per file
  counts <- as.matrix(finfile[, -1])
  numsamples <- ncol(counts)

  #Per gene variance and sum, computed on the whole matrix at once
  genesums <- rowSums(counts)
  #Keep the sums as integers when they are whole numbers that fit; otherwise
  # leave them as doubles instead of letting an integer sum overflow to NA
  if (isTRUE(all(genesums == trunc(genesums) &
                 abs(genesums) <= .Machine$integer.max))) {
    genesums <- as.integer(genesums)
  }
  finfile1 <- finfile
  finfile1$GeneVariance <- apply(counts, 1, var)
  finfile1$GeneCountSum <- genesums

  #Rank from least variant to most variant
  finfile1 <- arrange(finfile1, finfile1$GeneVariance)

  #Keep only the genes with a nonzero variance whose counts sum to more than
  # the number of files (an average of more than one count per file);
  # which() also drops genes whose variance or sum is NA
  keep <- finfile1$GeneVariance > 0 & finfile1$GeneCountSum > numsamples
  finfile1 <- finfile1[which(keep), ]
  if (nrow(finfile1) == 0) {
    stop("No gene has a nonzero variance and a count sum above the number of files",
         call. = FALSE)
  }

  ##What if instead I use the criteria to be the following
  ## variance = [(1-(1/n))^2/(n-1)] + (1/n)^2
  ## it will eliminate any that only have 1 column with 1 in it

  ##Your minimum variance genes are going to make up .1% of the total amount of genes
  if (nrow(finfile1) < 1000) {
    numofgenesvar <- 1
  } else {
    numofgenesvar <- round(.001 * nrow(finfile1))
  }
  lowestvargenes <- as.data.frame(finfile1[seq_len(numofgenesvar), ],
                                  stringsAsFactors = FALSE)
  write.table(lowestvargenes, file = "GenesUsedForVariance.txt", sep = "\t",
              row.names = FALSE, col.names = TRUE)
  estlowestvargenes <- mean(lowestvargenes$GeneVariance)

  #Center on the mean of every count in the joined table (all common genes,
  # not only the filtered ones) and scale by the estimated standard deviation
  overallmean <- mean(counts)
  normalcounts <- (counts - overallmean) / sqrt(estlowestvargenes)
  normalcounts <- as.data.frame(normalcounts, stringsAsFactors = FALSE)
  normalcounts <- cbind(as.data.frame(finfile$`Gene Symbol`, stringsAsFactors = FALSE),
                        normalcounts)
  colnames(normalcounts)[1] <- "Gene Symbol"
  write.table(normalcounts, file = "NormalizedCounts.txt", sep = "\t",
              row.names = FALSE, col.names = TRUE)
  normalcounts
}
0 | 132 | \ No newline at end of file |