Efrain Gonzalez / Cleaning and Fixing Data with R

Browse Code »

Commit 7360830df3c63ce651ca2ed6d6e28654b3b04852

Authored by Efrain Gonzalez 2018-03-30 16:38:10 -0400

1 parent d0434e8502

Exists in master

For Count data

Showing 1 changed file with 131 additions and 0 deletions Show diff stats

RNormalizeCounts.txt

Diff comments View file @ 7360830

File was created	1	#Efrain Gonzalez
	2	#RNA Sequence normalization
	3	#03/29/2018
	4
	5	#Keeping 11 digits
	6	options(digits = 11)
	7	#Libraries required to run the code
	8	library(pryr)
	9	library(MASS)
	10	library(dplyr)
	11	library(tidyr)
	12	library(readr)
	13	library(stringr)
	14
	15	#First we must join all HTSeq-count files together for a given data set
	16	# in this joining make sure that only those genes that are common amongst all sets are
	17	# brought together
	18	#At the end of this step a file will be produced that contains only the genes that were
	19	# common to each HTSeq-count file and the respective count information for that gene
	20
	#Join, filter and normalize the HTSeq-count files in the working directory
	#
	# Interactively asks whether every file whose name ends in "_Count.txt" should
	# be used ("Yes") or whether the files should be picked from a list ("No").
	# The chosen files are read as tab-delimited, header-less two-column tables
	# (gene symbol, count), rows whose gene symbol starts with "__" are dropped,
	# and the tables are inner-joined on "Gene Symbol" so that only genes present
	# in every file remain.
	#
	# A reference variance is then estimated from the least variable genes (the
	# lowest 0.1% by variance, at least one gene) among the genes with nonzero
	# variance and a total count greater than the number of samples. Every count
	# is shifted by the overall mean count and divided by the square root of
	# that reference variance.
	#
	# Side effects: writes "GenesUsedForVariance.txt" and "NormalizedCounts.txt"
	# (tab separated) into the working directory.
	#
	# Returns: a data.frame with a "Gene Symbol" column followed by one column of
	# normalized counts per input file; invisible NULL when the menu is exited
	# or no file is selected.
	RNATheft <- function() {
	  wd <- getwd()
	  # paste0() instead of gsub(): the path must not be treated as a regex
	  # replacement string.
	  choice <- menu(
	    choices = c("Yes", "No"),
	    title = paste0(
	      "Do you want to join and clean all of the HTSeq-Count files in the ",
	      "directory ", wd, "?"
	    )
	  )
	  # menu() returns 0 when the user exits without choosing.
	  if (choice == 0L) {
	    cat("Nothing done\n")
	    return(invisible(NULL))
	  }

	  count_files <- grep("_Count\\.txt$", list.files(), value = TRUE)
	  if (length(count_files) == 0L) {
	    stop("No files ending in '_Count.txt' were found in ", wd, call. = FALSE)
	  }

	  if (choice == 2L) {
	    #Choose the data files to join and clean
	    count_files <- select.list(
	      choices = count_files,
	      multiple = TRUE,
	      title = "Choose the HTSeq files that you want to join and clean:"
	    )
	    if (length(count_files) == 0L) {
	      warning("You did not select any files and so no cleaning will be performed")
	      return(invisible(NULL))
	    }
	  }

	  # The sample variance is undefined for a single sample.
	  if (length(count_files) < 2L) {
	    stop(
	      "At least two HTSeq-count files are needed to estimate gene variance",
	      call. = FALSE
	    )
	  }

	  # Read one count file; the count column is named after the file
	  # (extension removed, "s" appended) and the "__" summary rows are dropped.
	  read_counts <- function(path) {
	    sample_col <- paste0(sub("\\.txt$", "", path), "s")
	    counts <- read_delim(
	      path,
	      delim = "\t",
	      col_names = c("Gene Symbol", sample_col)
	    )
	    counts[!grepl("^__", counts[["Gene Symbol"]]), ]
	  }

	  #joining all files based on gene information (common genes only)
	  finfile <- Reduce(
	    function(joined, next_table) {
	      inner_join(joined, next_table, by = "Gene Symbol")
	    },
	    lapply(count_files, read_counts)
	  )
	  if (nrow(finfile) == 0L) {
	    stop("The selected files have no genes in common", call. = FALSE)
	  }

	  # Per-gene variance and total count, computed on integer-coerced counts
	  # with matrix operations instead of a row-by-row loop.
	  int_counts <- as.matrix(finfile[, -1])
	  storage.mode(int_counts) <- "integer"
	  finfile1 <- finfile
	  finfile1$GeneVariance <- unname(apply(int_counts, 1, var))
	  # rowSums() returns doubles, so large totals cannot overflow integers.
	  finfile1$GeneCountSum <- unname(rowSums(int_counts))

	  #Rank from least variant to most variant
	  finfile1 <- arrange(finfile1, GeneVariance)

	  #find only the ones with a nonzero variance
	  finfile1 <- filter(finfile1, GeneVariance > 0)

	  #keep genes whose total count exceeds the number of samples
	  # (i.e. more than one count per sample on average)
	  finfile1 <- filter(finfile1, GeneCountSum > length(count_files))
	  if (nrow(finfile1) == 0L) {
	    stop(
	      "No gene has a nonzero variance and a total count above the number ",
	      "of samples",
	      call. = FALSE
	    )
	  }

	  ##Your minimum variance genes are going to make up .1% of the total amount
	  ## of genes, and at least one gene is always used
	  numofgenesvar <- max(1, round(0.001 * nrow(finfile1)))
	  lowestvargenes <- as.data.frame(
	    finfile1[seq_len(numofgenesvar), ],
	    stringsAsFactors = FALSE
	  )
	  write.table(
	    lowestvargenes,
	    file = "GenesUsedForVariance.txt",
	    sep = "\t",
	    row.names = FALSE,
	    col.names = TRUE
	  )
	  estlowestvargenes <- mean(lowestvargenes$GeneVariance)

	  # Normalization uses every joined gene (finfile), not only the filtered
	  # ones: subtract the overall mean count and scale by the reference
	  # standard deviation.
	  raw_counts <- as.matrix(finfile[, -1])
	  normalcounts <- (raw_counts - mean(raw_counts)) / sqrt(estlowestvargenes)
	  normalcounts <- as.data.frame(normalcounts, stringsAsFactors = FALSE)
	  gene_column <- data.frame(
	    `Gene Symbol` = finfile[["Gene Symbol"]],
	    check.names = FALSE,
	    stringsAsFactors = FALSE
	  )
	  normalcounts <- cbind(gene_column, normalcounts)
	  write.table(
	    normalcounts,
	    file = "NormalizedCounts.txt",
	    sep = "\t",
	    row.names = FALSE,
	    col.names = TRUE
	  )
	  normalcounts
	}