Commit 7360830df3c63ce651ca2ed6d6e28654b3b04852
1 parent
d0434e8502
Exists in
master
For Count data
Showing
1 changed file
with
131 additions
and
0 deletions
Show diff stats
RNormalizeCounts.txt
| ... | ... | @@ -0,0 +1,131 @@ |
| 1 | +#Efrain Gonzalez | |
| 2 | +#RNA Sequence normalization | |
| 3 | +#03/29/2018 | |
| 4 | + | |
| 5 | +#Keeping 11 digits | |
| 6 | +options(digits = 11) | |
| 7 | +#Libraries required to run the code | |
| 8 | +library(pryr) | |
| 9 | +library(MASS) | |
| 10 | +library(dplyr) | |
| 11 | +library(tidyr) | |
| 12 | +library(readr) | |
| 13 | +library(stringr) | |
| 14 | + | |
| 15 | +#First we must join all HTSeq-count files together for a given data set | |
| 16 | +# in this joining make sure that only those genes that are common amongst all sets are | |
| 17 | +# brought together | |
| 18 | +#At the end of this step a file will be produced that contains only the genes that were | |
| 19 | +# common to each HTSeq-count file and the respective count information for that gene | |
| 20 | + | |
| 21 | +RNATheft <- function() { | |
| 22 | + #Get working directory based on the directory that contains the files of interest | |
| 23 | + wd <- getwd() | |
| 24 | + numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to join and clean all of the HTSeq-Count files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L) | |
| 25 | + HTSeqfiles <- grep("_Count\\.txt$",list.files()) | |
| 26 | + HTSeqfloc <- list.files()[HTSeqfiles] | |
| 27 | + | |
| 28 | + #Please join all files | |
| 29 | + if(numDAT == 1) { | |
| 30 | + #joining all files based on gene information | |
| 31 | + for(i in 1:length(HTSeqfloc)) { | |
| 32 | + #Using the name of the file to label counts | |
| 33 | + namefile <- strsplit(HTSeqfloc[i],".txt") %>% | |
| 34 | + .[[1]] %>% | |
| 35 | + .[length(.)] | |
| 36 | + #Adding the information to finfile based on common genes | |
| 37 | + if(i == 1) { | |
| 38 | + finfile <- HTSeqfloc[1] %>% | |
| 39 | + read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>% | |
| 40 | + filter(.,!grepl("^__",.$`Gene Symbol`)) | |
| 41 | + } else { | |
| 42 | + intermfile <- HTSeqfloc[i] %>% | |
| 43 | + read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>% | |
| 44 | + filter(.,!grepl("^__",.$`Gene Symbol`)) | |
| 45 | + finfile <- inner_join(finfile,intermfile,by = "Gene Symbol") | |
| 46 | + | |
| 47 | + } | |
| 48 | + } | |
| 49 | + } else if(numDAT == 2) { | |
| 50 | + #Choose the data files to join and clean | |
| 51 | + Chosenfil <- select.list(choices = HTSeqfloc,multiple = TRUE, title = "Choose the HTSeq files that you want to join and clean:") | |
| 52 | + if(length(Chosenfil) == 0) { | |
| 53 | + #Spit out a warning | |
| 54 | + warning("You did not select any files and so no cleaning will be performed") | |
| 55 | + } else { | |
| 56 | + for(i in 1:length(Chosenfil)) { | |
| 57 | + #Using the name of the file to label counts | |
| 58 | + namefile <- strsplit(Chosenfil[i],".txt") %>% | |
| 59 | + .[[1]] %>% | |
| 60 | + .[length(.)] | |
| 61 | + #Adding the information to finfile based on common genes | |
| 62 | + if(i == 1) { | |
| 63 | + finfile <- HTSeqfloc[1] %>% | |
| 64 | + read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>% | |
| 65 | + filter(.,!grepl("^__",.$`Gene Symbol`)) | |
| 66 | + } else { | |
| 67 | + intermfile <- HTSeqfloc[i] %>% | |
| 68 | + read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>% | |
| 69 | + filter(.,!grepl("^__",.$`Gene Symbol`)) | |
| 70 | + finfile <- inner_join(finfile,intermfile,by = "Gene Symbol") | |
| 71 | + | |
| 72 | + } | |
| 73 | + } | |
| 74 | + } | |
| 75 | + | |
| 76 | + } | |
| 77 | + finfile1 <- finfile | |
| 78 | + finfile1$GeneVariance <- 0.000000 | |
| 79 | + finfile1$GeneCountSum <- 0L | |
| 80 | + for(i in 1:dim(finfile1)[1]) { | |
| 81 | + finfile1$GeneVariance[i] <- finfile1[i,-1] %>% | |
| 82 | + .[-dim(.)[2]] %>% | |
| 83 | + .[-dim(.)[2]] %>% | |
| 84 | + as.vector(.,mode = "integer") %>% | |
| 85 | + var(.) | |
| 86 | + finfile1$GeneCountSum[i] <- finfile1[i,-1] %>% | |
| 87 | + .[-dim(.)[2]] %>% | |
| 88 | + .[-dim(.)[2]] %>% | |
| 89 | + as.vector(.,mode = "integer") %>% | |
| 90 | + sum(.) | |
| 91 | + } | |
| 92 | + #Rank from least variant to most variant | |
| 93 | + finfile1 <- arrange(finfile1,finfile1$GeneVariance) | |
| 94 | + | |
| 95 | + #find only the ones with a nonzero variance | |
| 96 | + finfile1 <- filter(finfile1,finfile1$GeneVariance > 0) | |
| 97 | + | |
| 98 | + ##What if instead I use the criteria to be the following | |
| 99 | + ## variance = [(1-(1/n))^2/(n-1)] + (1/n)^2 | |
| 100 | + ## it will eliminate any that only have 1 column with 1 in it | |
| 101 | + ## testvar <- ((1-(1/(dim(finfile1)[2]-3)))^2)/(dim(finfile1)[2]-4) + (1/(dim(finfile1)[2]-3))^2 | |
| 102 | + | |
| 103 | + #making sure that all values in each column are at least above zero | |
| 104 | + finfile1 <- filter(finfile1,finfile1$GeneCountSum > dim(finfile1)[2]-3) | |
| 105 | + | |
| 106 | + ##Your minimum variance genes are going to make up .1% of the total amount of genes | |
| 107 | + if(dim(finfile1)[1] < 1000) { | |
| 108 | + numofgenesvar <- 1 | |
| 109 | + } else { | |
| 110 | + numofgenesvar <- round(.001 * dim(finfile1)[1]) | |
| 111 | + } | |
| 112 | + lowestvargenes <- as.data.frame(finfile1[1:numofgenesvar,],stringsAsFactors = FALSE) | |
| 113 | + write.table(lowestvargenes,file = "GenesUsedForVariance.txt", sep = "\t",row.names = FALSE, col.names = TRUE) | |
| 114 | + lowestvargenes | |
| 115 | + estlowestvargenes <- lowestvargenes$GeneVariance %>% | |
| 116 | + as.vector(.,mode = "double") %>% | |
| 117 | + mean(.) | |
| 118 | + | |
| 119 | + | |
| 120 | + overallmean <- finfile[,-1] %>% | |
| 121 | + as.matrix(.,mode = "integer") %>% | |
| 122 | + mean(.) | |
| 123 | + normalcounts <- finfile[,-1] %>% | |
| 124 | + as.matrix(.,mode = "double") | |
| 125 | + normalcounts <- (normalcounts - overallmean)/sqrt(estlowestvargenes) | |
| 126 | + normalcounts <- as.data.frame(normalcounts,stringsAsFactors = FALSE) | |
| 127 | + normalcounts <- cbind(as.data.frame(finfile$`Gene Symbol`,stringsAsFactors = FALSE),normalcounts) | |
| 128 | + colnames(normalcounts)[1] <- "Gene Symbol" | |
| 129 | + write.table(normalcounts,file = "NormalizedCounts.txt",sep = "\t",row.names = FALSE, col.names = TRUE) | |
| 130 | + normalcounts | |
| 131 | +} | |
| 0 | 132 | \ No newline at end of file |