Commit 7360830df3c63ce651ca2ed6d6e28654b3b04852
1 parent
d0434e8502
Exists in
master
For Count data
Showing
1 changed file
with
131 additions
and
0 deletions
Show diff stats
RNormalizeCounts.txt
File was created | 1 | #Efrain Gonzalez | |
2 | #RNA Sequence normalization | ||
3 | #03/29/2018 | ||
4 | |||
5 | #Keeping 11 digits | ||
6 | options(digits = 11) | ||
7 | #Libraries required to run the code | ||
8 | library(pryr) | ||
9 | library(MASS) | ||
10 | library(dplyr) | ||
11 | library(tidyr) | ||
12 | library(readr) | ||
13 | library(stringr) | ||
14 | |||
15 | #First we must join all HTSeq-count files together for a given data set | ||
16 | # in this joining make sure that only those genes that are common amongst all sets are | ||
17 | # brought together | ||
18 | #At the end of this step a file will be produced that contains only the genes that were | ||
19 | # common to each HTSeq-count file and the respective count information for that gene | ||
20 | |||
21 | RNATheft <- function() { | ||
22 | #Get working directory based on the directory that contains the files of interest | ||
23 | wd <- getwd() | ||
24 | numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to join and clean all of the HTSeq-Count files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L) | ||
25 | HTSeqfiles <- grep("_Count\\.txt$",list.files()) | ||
26 | HTSeqfloc <- list.files()[HTSeqfiles] | ||
27 | |||
28 | #Please join all files | ||
29 | if(numDAT == 1) { | ||
30 | #joining all files based on gene information | ||
31 | for(i in 1:length(HTSeqfloc)) { | ||
32 | #Using the name of the file to label counts | ||
33 | namefile <- strsplit(HTSeqfloc[i],".txt") %>% | ||
34 | .[[1]] %>% | ||
35 | .[length(.)] | ||
36 | #Adding the information to finfile based on common genes | ||
37 | if(i == 1) { | ||
38 | finfile <- HTSeqfloc[1] %>% | ||
39 | read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>% | ||
40 | filter(.,!grepl("^__",.$`Gene Symbol`)) | ||
41 | } else { | ||
42 | intermfile <- HTSeqfloc[i] %>% | ||
43 | read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>% | ||
44 | filter(.,!grepl("^__",.$`Gene Symbol`)) | ||
45 | finfile <- inner_join(finfile,intermfile,by = "Gene Symbol") | ||
46 | |||
47 | } | ||
48 | } | ||
49 | } else if(numDAT == 2) { | ||
50 | #Choose the data files to join and clean | ||
51 | Chosenfil <- select.list(choices = HTSeqfloc,multiple = TRUE, title = "Choose the HTSeq files that you want to join and clean:") | ||
52 | if(length(Chosenfil) == 0) { | ||
53 | #Spit out a warning | ||
54 | warning("You did not select any files and so no cleaning will be performed") | ||
55 | } else { | ||
56 | for(i in 1:length(Chosenfil)) { | ||
57 | #Using the name of the file to label counts | ||
58 | namefile <- strsplit(Chosenfil[i],".txt") %>% | ||
59 | .[[1]] %>% | ||
60 | .[length(.)] | ||
61 | #Adding the information to finfile based on common genes | ||
62 | if(i == 1) { | ||
63 | finfile <- HTSeqfloc[1] %>% | ||
64 | read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>% | ||
65 | filter(.,!grepl("^__",.$`Gene Symbol`)) | ||
66 | } else { | ||
67 | intermfile <- HTSeqfloc[i] %>% | ||
68 | read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>% | ||
69 | filter(.,!grepl("^__",.$`Gene Symbol`)) | ||
70 | finfile <- inner_join(finfile,intermfile,by = "Gene Symbol") | ||
71 | |||
72 | } | ||
73 | } | ||
74 | } | ||
75 | |||
76 | } | ||
77 | finfile1 <- finfile | ||
78 | finfile1$GeneVariance <- 0.000000 | ||
79 | finfile1$GeneCountSum <- 0L | ||
80 | for(i in 1:dim(finfile1)[1]) { | ||
81 | finfile1$GeneVariance[i] <- finfile1[i,-1] %>% | ||
82 | .[-dim(.)[2]] %>% | ||
83 | .[-dim(.)[2]] %>% | ||
84 | as.vector(.,mode = "integer") %>% | ||
85 | var(.) | ||
86 | finfile1$GeneCountSum[i] <- finfile1[i,-1] %>% | ||
87 | .[-dim(.)[2]] %>% | ||
88 | .[-dim(.)[2]] %>% | ||
89 | as.vector(.,mode = "integer") %>% | ||
90 | sum(.) | ||
91 | } | ||
92 | #Rank from least variant to most variant | ||
93 | finfile1 <- arrange(finfile1,finfile1$GeneVariance) | ||
94 | |||
95 | #find only the ones with a nonzero variance | ||
96 | finfile1 <- filter(finfile1,finfile1$GeneVariance > 0) | ||
97 | |||
98 | ##What if instead I use the criteria to be the following | ||
99 | ## variance = [(1-(1/n))^2/(n-1)] + (1/n)^2 | ||
100 | ## it will eliminate any that only have 1 column with 1 in it | ||
101 | ## testvar <- ((1-(1/(dim(finfile1)[2]-3)))^2)/(dim(finfile1)[2]-4) + (1/(dim(finfile1)[2]-3))^2 | ||
102 | |||
103 | #making sure that all values in each column are at least above zero | ||
104 | finfile1 <- filter(finfile1,finfile1$GeneCountSum > dim(finfile1)[2]-3) | ||
105 | |||
106 | ##Your minimum variance genes are going to make up .1% of the total amount of genes | ||
107 | if(dim(finfile1)[1] < 1000) { | ||
108 | numofgenesvar <- 1 | ||
109 | } else { | ||
110 | numofgenesvar <- round(.001 * dim(finfile1)[1]) | ||
111 | } | ||
112 | lowestvargenes <- as.data.frame(finfile1[1:numofgenesvar,],stringsAsFactors = FALSE) | ||
113 | write.table(lowestvargenes,file = "GenesUsedForVariance.txt", sep = "\t",row.names = FALSE, col.names = TRUE) | ||
114 | lowestvargenes | ||
115 | estlowestvargenes <- lowestvargenes$GeneVariance %>% | ||
116 | as.vector(.,mode = "double") %>% | ||
117 | mean(.) | ||
118 | |||
119 | |||
120 | overallmean <- finfile[,-1] %>% | ||
121 | as.matrix(.,mode = "integer") %>% | ||
122 | mean(.) | ||
123 | normalcounts <- finfile[,-1] %>% | ||
124 | as.matrix(.,mode = "double") | ||
125 | normalcounts <- (normalcounts - overallmean)/sqrt(estlowestvargenes) | ||
126 | normalcounts <- as.data.frame(normalcounts,stringsAsFactors = FALSE) | ||
127 | normalcounts <- cbind(as.data.frame(finfile$`Gene Symbol`,stringsAsFactors = FALSE),normalcounts) | ||
128 | colnames(normalcounts)[1] <- "Gene Symbol" | ||
129 | write.table(normalcounts,file = "NormalizedCounts.txt",sep = "\t",row.names = FALSE, col.names = TRUE) | ||
130 | normalcounts | ||
131 | } |