Commit 7360830df3c63ce651ca2ed6d6e28654b3b04852

Authored by Efrain Gonzalez
1 parent d0434e8502
Exists in master

For Count data

Showing 1 changed file with 131 additions and 0 deletions   Show diff stats
RNormalizeCounts.txt
... ... @@ -0,0 +1,131 @@
  1 +#Efrain Gonzalez
  2 +#RNA Sequence normalization
  3 +#03/29/2018
  4 +
  5 +#Keeping 11 digits
  6 +options(digits = 11)
  7 +#Libraries required to run the code
  8 +library(pryr)
  9 +library(MASS)
  10 +library(dplyr)
  11 +library(tidyr)
  12 +library(readr)
  13 +library(stringr)
  14 +
  15 +#First we must join all HTSeq-count files together for a given data set
  16 +# in this joining make sure that only those genes that are common amongst all sets are
  17 +# brought together
  18 +#At the end of this step a file will be produced that contains only the genes that were
  19 +# common to each HTSeq-count file and the respective count information for that gene
  20 +
# Read one HTSeq-count file into a two-column table (gene symbol, counts).
.read_htseq_counts <- function(path) {
  # Label the count column with the file name minus its ".txt" extension plus
  # a trailing "s" (e.g. "A_Count.txt" -> "A_Counts"). The extension is
  # stripped with an anchored, escaped pattern so that a "txt" elsewhere in
  # the name cannot be mistaken for it.
  count_label <- paste0(sub("\\.txt$", "", path), "s")
  path %>%
    read_delim(delim = "\t", col_names = c("Gene Symbol", count_label)) %>%
    # Drop rows whose gene symbol starts with "__" (presumably HTSeq-count's
    # special summary counters rather than genes -- confirm against the data).
    filter(!grepl("^__", `Gene Symbol`))
}

# Inner-join the count tables of all given files on "Gene Symbol", so only
# genes present in every file are kept.
.join_htseq_counts <- function(paths) {
  Reduce(
    function(joined, nxt) inner_join(joined, nxt, by = "Gene Symbol"),
    lapply(paths, .read_htseq_counts)
  )
}

# Join the HTSeq-count files of the working directory and normalize them.
#
# Interactively asks whether every "*_Count.txt" file in the working directory
# should be used ("Yes") or whether the files should be picked by hand ("No").
# The selected files are inner-joined on their gene symbol, then every count is
# centered on the overall mean count and divided by the square root of the
# mean variance of the least-variant genes.
#
# Side effects (both written to the working directory, tab separated):
#   GenesUsedForVariance.txt  the least-variant genes used for the estimate
#   NormalizedCounts.txt      the normalized counts for all common genes
#
# Returns: a data.frame with a "Gene Symbol" column followed by one normalized
#   column per input file. Returns NULL invisibly when the user leaves the menu
#   or selects no files.
# Errors: when no count files exist, fewer than two files are used (a per-gene
#   variance needs at least two samples), the files share no genes, or no gene
#   passes the variance/count filters.
RNATheft <- function() {
  # Work relative to the directory that contains the files of interest
  wd <- getwd()
  answer <- menu(
    choices = c("Yes", "No"),
    # Built with paste0() rather than gsub() so that characters in the path
    # are never interpreted as replacement escapes.
    title = paste0(
      "Do you want to join and clean all of the HTSeq-Count files in the ",
      "directory ", wd, "?"
    )
  )
  # menu() returns 0 when the user exits without choosing
  if (answer == 0L) {
    cat("Nothing done\n")
    return(invisible(NULL))
  }

  HTSeqfloc <- list.files(pattern = "_Count\\.txt$")
  if (length(HTSeqfloc) == 0L) {
    stop("No files ending in '_Count.txt' were found in ", wd, call. = FALSE)
  }

  if (answer == 1L) {
    # Please join all files
    chosen <- HTSeqfloc
  } else {
    # Choose the data files to join and clean
    chosen <- select.list(
      choices = HTSeqfloc,
      multiple = TRUE,
      title = "Choose the HTSeq files that you want to join and clean:"
    )
    if (length(chosen) == 0) {
      warning("You did not select any files and so no cleaning will be performed")
      return(invisible(NULL))
    }
  }
  if (length(chosen) < 2L) {
    stop(
      "At least two HTSeq-count files are needed to estimate gene variance",
      call. = FALSE
    )
  }

  # Read exactly the files that were chosen and keep only the common genes
  finfile <- .join_htseq_counts(chosen)
  if (nrow(finfile) == 0L) {
    stop("The selected files have no genes in common", call. = FALSE)
  }

  # Everything except the first ("Gene Symbol") column is a count column
  count_matrix <- as.matrix(finfile[, -1])
  n_samples <- ncol(count_matrix)

  # Per-gene variance and total count across samples, computed on the matrix
  # in one pass instead of indexing the table row by row.
  finfile1 <- finfile
  finfile1$GeneVariance <- apply(count_matrix, 1L, var)
  finfile1$GeneCountSum <- rowSums(count_matrix)

  finfile1 <- finfile1 %>%
    # Rank from least variant to most variant
    arrange(GeneVariance) %>%
    # Keep only genes with a nonzero variance whose summed count exceeds the
    # number of samples (i.e. more than one count per sample on average).
    filter(GeneVariance > 0, GeneCountSum > n_samples)
  if (nrow(finfile1) == 0L) {
    stop("No genes passed the variance and count filters", call. = FALSE)
  }

  # The minimum variance genes make up .1% of the remaining genes, with a
  # floor of a single gene when fewer than 1000 genes remain.
  if (nrow(finfile1) < 1000) {
    numofgenesvar <- 1
  } else {
    numofgenesvar <- round(.001 * nrow(finfile1))
  }
  lowestvargenes <- as.data.frame(
    finfile1[seq_len(numofgenesvar), ],
    stringsAsFactors = FALSE
  )
  write.table(
    lowestvargenes,
    file = "GenesUsedForVariance.txt",
    sep = "\t",
    row.names = FALSE,
    col.names = TRUE
  )
  estlowestvargenes <- mean(lowestvargenes$GeneVariance)

  # Center on the mean of every count of every common gene (unfiltered table)
  # and scale by the standard deviation estimated from the least-variant genes.
  overallmean <- mean(count_matrix)
  normalized <- (count_matrix - overallmean) / sqrt(estlowestvargenes)

  normalcounts <- cbind(
    data.frame(finfile[["Gene Symbol"]], stringsAsFactors = FALSE),
    as.data.frame(normalized, stringsAsFactors = FALSE)
  )
  colnames(normalcounts)[1] <- "Gene Symbol"
  write.table(
    normalcounts,
    file = "NormalizedCounts.txt",
    sep = "\t",
    row.names = FALSE,
    col.names = TRUE
  )
  normalcounts
}
0 132 \ No newline at end of file