Commit 7360830df3c63ce651ca2ed6d6e28654b3b04852
1 parent
d0434e8502
Exists in
master
For Count data
Showing
1 changed file
with
131 additions
and
0 deletions
Show diff stats
RNormalizeCounts.txt
| File was created | 1 | #Efrain Gonzalez | |
| 2 | #RNA Sequence normalization | ||
| 3 | #03/29/2018 | ||
| 4 | |||
| 5 | #Keeping 11 digits | ||
| 6 | options(digits = 11) | ||
| 7 | #Libraries required to run the code | ||
| 8 | library(pryr) | ||
| 9 | library(MASS) | ||
| 10 | library(dplyr) | ||
| 11 | library(tidyr) | ||
| 12 | library(readr) | ||
| 13 | library(stringr) | ||
| 14 | |||
| 15 | #First we must join all HTSeq-count files together for a given data set | ||
| 16 | # in this joining make sure that only those genes that are common amongst all sets are | ||
| 17 | # brought together | ||
| 18 | #At the end of this step a file will be produced that contains only the genes that were | ||
| 19 | # common to each HTSeq-count file and the respective count information for that gene | ||
| 20 | |||
#' Join HTSeq-count files on common genes and normalize the counts
#'
#' Interactively asks whether to use every file in the working directory whose
#' name ends in "_Count.txt", or a hand-picked subset of them. The chosen files
#' are read as tab-delimited, header-less two-column tables (gene, count), rows
#' whose gene starts with "__" are dropped, and the tables are inner-joined on
#' "Gene Symbol" so that only genes present in every file remain.
#'
#' The per-gene variance and count sum are computed across samples. Genes with
#' zero variance, or whose count sum does not exceed the number of samples, are
#' discarded; the lowest-variance 0.1% of the remaining genes (one gene when
#' fewer than 1000 remain) are written to "GenesUsedForVariance.txt" and the
#' mean of their variances is used as the scale estimate. All joined counts are
#' then normalized as (count - overall mean) / sqrt(scale estimate).
#'
#' @return A data frame with a "Gene Symbol" column followed by one normalized
#'   column per input file; the same table is written to
#'   "NormalizedCounts.txt". Returns `NULL` invisibly when the menu is exited
#'   or no files are selected.
RNATheft <- function() {
  # The files of interest are looked up in the current working directory
  wd <- getwd()
  prompt <- paste0(
    "Do you want to join and clean all of the HTSeq-Count files ",
    "in the directory ", wd, "?"
  )
  choice <- menu(choices = c("Yes", "No"), title = prompt)

  # menu() yields 0 when the user exits without choosing
  if (choice == 0L) {
    cat("Nothing done\n")
    return(invisible(NULL))
  }

  count_files <- list.files(pattern = "_Count\\.txt$")

  if (choice == 1L) {
    # Join every HTSeq-count file in the directory
    chosen <- count_files
    if (length(chosen) == 0L) {
      stop("No files ending in '_Count.txt' were found in ", wd, call. = FALSE)
    }
  } else {
    # Let the user choose the data files to join and clean
    chosen <- select.list(
      choices = count_files,
      multiple = TRUE,
      title = "Choose the HTSeq files that you want to join and clean:"
    )
    if (length(chosen) == 0L) {
      warning("You did not select any files and so no cleaning will be performed")
      return(invisible(NULL))
    }
  }

  # Reads one count file; the count column is labelled with the file name
  # (minus its ".txt" extension) followed by "s"
  read_counts <- function(path) {
    sample_label <- paste0(sub("\\.txt$", "", basename(path)), "s")
    counts_tbl <- read_delim(
      path,
      delim = "\t",
      col_names = c("Gene Symbol", sample_label)
    )
    filter(counts_tbl, !grepl("^__", .data[["Gene Symbol"]]))
  }

  # Keep only the genes that are common to every chosen file
  finfile <- Reduce(
    function(joined, nxt) inner_join(joined, nxt, by = "Gene Symbol"),
    lapply(chosen, read_counts)
  )

  # Numeric matrix of counts: one row per gene, one column per sample
  counts <- as.matrix(finfile[, -1])
  storage.mode(counts) <- "double"
  n_samples <- ncol(counts)
  if (n_samples < 2L) {
    stop(
      "At least two count files are needed to estimate per-gene variance",
      call. = FALSE
    )
  }

  finfile1 <- finfile
  finfile1$GeneVariance <- unname(apply(counts, 1L, var))
  finfile1$GeneCountSum <- unname(rowSums(counts))

  # Rank from least variant to most variant
  finfile1 <- arrange(finfile1, .data$GeneVariance)

  # Keep only the genes with a nonzero variance
  finfile1 <- filter(finfile1, .data$GeneVariance > 0)

  # Keep only the genes whose total count exceeds the number of samples
  finfile1 <- filter(finfile1, .data$GeneCountSum > n_samples)

  n_genes <- nrow(finfile1)
  if (n_genes == 0L) {
    stop(
      "No genes remain after the variance and count-sum filters",
      call. = FALSE
    )
  }

  # The minimum variance genes make up .1% of the remaining genes
  # (a single gene when fewer than 1000 remain)
  if (n_genes < 1000) {
    numofgenesvar <- 1
  } else {
    numofgenesvar <- round(.001 * n_genes)
  }
  lowestvargenes <- as.data.frame(
    finfile1[seq_len(numofgenesvar), ],
    stringsAsFactors = FALSE
  )
  write.table(
    lowestvargenes,
    file = "GenesUsedForVariance.txt",
    sep = "\t",
    row.names = FALSE,
    col.names = TRUE
  )
  estlowestvargenes <- mean(
    as.vector(lowestvargenes$GeneVariance, mode = "double")
  )

  # Center every joined count on the overall mean and scale by the estimated
  # standard deviation of the lowest-variance genes
  overallmean <- mean(counts)
  normalcounts <- (counts - overallmean) / sqrt(estlowestvargenes)
  normalcounts <- as.data.frame(normalcounts, stringsAsFactors = FALSE)
  normalcounts <- cbind(
    as.data.frame(finfile$`Gene Symbol`, stringsAsFactors = FALSE),
    normalcounts
  )
  colnames(normalcounts)[1] <- "Gene Symbol"
  write.table(
    normalcounts,
    file = "NormalizedCounts.txt",
    sep = "\t",
    row.names = FALSE,
    col.names = TRUE
  )
  normalcounts
}