#Efrain Gonzalez #RNA Sequence normalization #03/29/2018 #Keeping 11 digits options(digits = 11) #Libraries required to run the code library(pryr) library(MASS) library(dplyr) library(tidyr) library(readr) library(stringr) #First we must join all HTSeq-count files together for a given data set # in this joining make sure that only those genes that are common amongst all sets are # brought together #At the end of this step a file will be produced that contains only the genes that were # common to each HTSeq-count file and the respective count information for that gene RNATheft <- function() { #Get working directory based on the directory that contains the files of interest wd <- getwd() numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to join and clean all of the HTSeq-Count files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L) HTSeqfiles <- grep("_Count\\.txt$",list.files()) HTSeqfloc <- list.files()[HTSeqfiles] #Please join all files if(numDAT == 1) { #joining all files based on gene information for(i in 1:length(HTSeqfloc)) { #Using the name of the file to label counts namefile <- strsplit(HTSeqfloc[i],".txt") %>% .[[1]] %>% .[length(.)] #Adding the information to finfile based on common genes if(i == 1) { finfile <- HTSeqfloc[1] %>% read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>% filter(.,!grepl("^__",.$`Gene Symbol`)) } else { intermfile <- HTSeqfloc[i] %>% read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>% filter(.,!grepl("^__",.$`Gene Symbol`)) finfile <- inner_join(finfile,intermfile,by = "Gene Symbol") } } } else if(numDAT == 2) { #Choose the data files to join and clean Chosenfil <- select.list(choices = HTSeqfloc,multiple = TRUE, title = "Choose the HTSeq files that you want to join and clean:") if(length(Chosenfil) == 0) { #Spit out a warning warning("You did not select any files and so no cleaning will be performed") } else { for(i in 1:length(Chosenfil)) { #Using the name of the file to label counts namefile <- strsplit(Chosenfil[i],".txt") %>% .[[1]] %>% .[length(.)] #Adding the information to finfile based on common genes if(i == 1) { finfile <- HTSeqfloc[1] %>% read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>% filter(.,!grepl("^__",.$`Gene Symbol`)) } else { intermfile <- HTSeqfloc[i] %>% read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>% filter(.,!grepl("^__",.$`Gene Symbol`)) finfile <- inner_join(finfile,intermfile,by = "Gene Symbol") } } } } finfile1 <- finfile finfile1$GeneVariance <- 0.000000 finfile1$GeneCountSum <- 0L for(i in 1:dim(finfile1)[1]) { finfile1$GeneVariance[i] <- finfile1[i,-1] %>% .[-dim(.)[2]] %>% .[-dim(.)[2]] %>% as.vector(.,mode = "integer") %>% var(.) finfile1$GeneCountSum[i] <- finfile1[i,-1] %>% .[-dim(.)[2]] %>% .[-dim(.)[2]] %>% as.vector(.,mode = "integer") %>% sum(.) } #Rank from least variant to most variant finfile1 <- arrange(finfile1,finfile1$GeneVariance) #find only the ones with a nonzero variance finfile1 <- filter(finfile1,finfile1$GeneVariance > 0) ##What if instead I use the criteria to be the following ## variance = [(1-(1/n))^2/(n-1)] + (1/n)^2 ## it will eliminate any that only have 1 column with 1 in it ## testvar <- ((1-(1/(dim(finfile1)[2]-3)))^2)/(dim(finfile1)[2]-4) + (1/(dim(finfile1)[2]-3))^2 #making sure that all values in each column are at least above zero finfile1 <- filter(finfile1,finfile1$GeneCountSum > dim(finfile1)[2]-3) ##Your minimum variance genes are going to make up .1% of the total amount of genes if(dim(finfile1)[1] < 1000) { numofgenesvar <- 1 } else { numofgenesvar <- round(.001 * dim(finfile1)[1]) } lowestvargenes <- as.data.frame(finfile1[1:numofgenesvar,],stringsAsFactors = FALSE) write.table(lowestvargenes,file = "GenesUsedForVariance.txt", sep = "\t",row.names = FALSE, col.names = TRUE) lowestvargenes estlowestvargenes <- lowestvargenes$GeneVariance %>% as.vector(.,mode = "double") %>% mean(.) overallmean <- finfile[,-1] %>% as.matrix(.,mode = "integer") %>% mean(.) normalcounts <- finfile[,-1] %>% as.matrix(.,mode = "double") normalcounts <- (normalcounts - overallmean)/sqrt(estlowestvargenes) normalcounts <- as.data.frame(normalcounts,stringsAsFactors = FALSE) normalcounts <- cbind(as.data.frame(finfile$`Gene Symbol`,stringsAsFactors = FALSE),normalcounts) colnames(normalcounts)[1] <- "Gene Symbol" write.table(normalcounts,file = "NormalizedCounts.txt",sep = "\t",row.names = FALSE, col.names = TRUE) normalcounts }