diff --git a/RNormalizeCounts.txt b/RNormalizeCounts.txt new file mode 100644 index 0000000..677f6e8 --- /dev/null +++ b/RNormalizeCounts.txt @@ -0,0 +1,131 @@ +#Efrain Gonzalez +#RNA Sequence normalization +#03/29/2018 + +#Keeping 11 digits +options(digits = 11) +#Libraries required to run the code +library(pryr) +library(MASS) +library(dplyr) +library(tidyr) +library(readr) +library(stringr) + +#First we must join all HTSeq-count files together for a given data set +# in this joining make sure that only those genes that are common amongst all sets are +# brought together +#At the end of this step a file will be produced that contains only the genes that were +# common to each HTSeq-count file and the respective count information for that gene + +RNATheft <- function() { + #Get working directory based on the directory that contains the files of interest + wd <- getwd() + numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to join and clean all of the HTSeq-Count files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L) + HTSeqfiles <- grep("_Count\\.txt$",list.files()) + HTSeqfloc <- list.files()[HTSeqfiles] + + #Please join all files + if(numDAT == 1) { + #joining all files based on gene information + for(i in 1:length(HTSeqfloc)) { + #Using the name of the file to label counts + namefile <- strsplit(HTSeqfloc[i],".txt") %>% + .[[1]] %>% + .[length(.)] + #Adding the information to finfile based on common genes + if(i == 1) { + finfile <- HTSeqfloc[1] %>% + read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>% + filter(.,!grepl("^__",.$`Gene Symbol`)) + } else { + intermfile <- HTSeqfloc[i] %>% + read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>% + filter(.,!grepl("^__",.$`Gene Symbol`)) + finfile <- inner_join(finfile,intermfile,by = "Gene Symbol") + + } + } + } else if(numDAT == 2) { + #Choose the data files to join and clean + Chosenfil <- select.list(choices = HTSeqfloc,multiple = TRUE, title = "Choose the HTSeq files that you want to join and clean:") + if(length(Chosenfil) == 0) { + #Spit out a warning + warning("You did not select any files and so no cleaning will be performed") + } else { + for(i in 1:length(Chosenfil)) { + #Using the name of the file to label counts + namefile <- strsplit(Chosenfil[i],".txt") %>% + .[[1]] %>% + .[length(.)] + #Adding the information to finfile based on common genes + if(i == 1) { + finfile <- HTSeqfloc[1] %>% + read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>% + filter(.,!grepl("^__",.$`Gene Symbol`)) + } else { + intermfile <- HTSeqfloc[i] %>% + read_delim(delim = "\t",col_names = c("Gene Symbol",paste0(namefile,"s"))) %>% + filter(.,!grepl("^__",.$`Gene Symbol`)) + finfile <- inner_join(finfile,intermfile,by = "Gene Symbol") + + } + } + } + + } + finfile1 <- finfile + finfile1$GeneVariance <- 0.000000 + finfile1$GeneCountSum <- 0L + for(i in 1:dim(finfile1)[1]) { + finfile1$GeneVariance[i] <- finfile1[i,-1] %>% + .[-dim(.)[2]] %>% + .[-dim(.)[2]] %>% + as.vector(.,mode = "integer") %>% + var(.) + finfile1$GeneCountSum[i] <- finfile1[i,-1] %>% + .[-dim(.)[2]] %>% + .[-dim(.)[2]] %>% + as.vector(.,mode = "integer") %>% + sum(.) + } + #Rank from least variant to most variant + finfile1 <- arrange(finfile1,finfile1$GeneVariance) + + #find only the ones with a nonzero variance + finfile1 <- filter(finfile1,finfile1$GeneVariance > 0) + + ##What if instead I use the criteria to be the following + ## variance = [(1-(1/n))^2/(n-1)] + (1/n)^2 + ## it will eliminate any that only have 1 column with 1 in it + ## testvar <- ((1-(1/(dim(finfile1)[2]-3)))^2)/(dim(finfile1)[2]-4) + (1/(dim(finfile1)[2]-3))^2 + + #making sure that all values in each column are at least above zero + finfile1 <- filter(finfile1,finfile1$GeneCountSum > dim(finfile1)[2]-3) + + ##Your minimum variance genes are going to make up .1% of the total amount of genes + if(dim(finfile1)[1] < 1000) { + numofgenesvar <- 1 + } else { + numofgenesvar <- round(.001 * dim(finfile1)[1]) + } + lowestvargenes <- as.data.frame(finfile1[1:numofgenesvar,],stringsAsFactors = FALSE) + write.table(lowestvargenes,file = "GenesUsedForVariance.txt", sep = "\t",row.names = FALSE, col.names = TRUE) + lowestvargenes + estlowestvargenes <- lowestvargenes$GeneVariance %>% + as.vector(.,mode = "double") %>% + mean(.) + + + overallmean <- finfile[,-1] %>% + as.matrix(.,mode = "integer") %>% + mean(.) + normalcounts <- finfile[,-1] %>% + as.matrix(.,mode = "double") + normalcounts <- (normalcounts - overallmean)/sqrt(estlowestvargenes) + normalcounts <- as.data.frame(normalcounts,stringsAsFactors = FALSE) + normalcounts <- cbind(as.data.frame(finfile$`Gene Symbol`,stringsAsFactors = FALSE),normalcounts) + colnames(normalcounts)[1] <- "Gene Symbol" + write.table(normalcounts,file = "NormalizedCounts.txt",sep = "\t",row.names = FALSE, col.names = TRUE) + normalcounts +} \ No newline at end of file