Efrain Gonzalez / Cleaning and Fixing Data with R

Browse Code »

Commit 788834dd790ca6d024f704be6cb685756857a503

Authored by Efrain Gonzalez 2017-06-12 13:18:49 -0400

1 parent 69cbaf694d

Exists in master

This code takes the clean data and discretizes it

Showing 1 changed file with 157 additions and 0 deletions Show diff stats

RPostClean.R

Diff comments View file @ 788834d

File was created	1	#For Reading Raw Data from the created file
	2
	3	#Required Libraries
	4	library(MASS)
	5	library(dplyr)
	6	library(tidyr)
	7	library(readr)
	8	library(stringr)
	9
	10
	11	#Necessary Functions
	12
	13	#1# Function for discretizing the data
	14	dndat <- function(NDATA){
	15	rownd <- dim(NDATA)[1]
	16	colnd <- dim(NDATA)[2]
	17	DDATA <- matrix(0,nrow=rownd,ncol=colnd)
	18	colnames(DDATA) <- colnames(NDATA)
	19	i = 1
	20	for(i in 1:rownd){
	21	for(j in 1:colnd){
	22	if(is.na(NDATA[i,j])==FALSE){
	23
	24	if(NDATA[i,j] < -1){
	25	DDATA[i,j]=0L
	26	}
	27	if(NDATA[i,j] > 1){
	28	DDATA[i,j]=2L
	29	}
	30	if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
	31	DDATA[i,j]=1L
	32	}
	33	} else{
	34	DDATA[i,j] = NDATA[i,j]
	35	}
	36	j = j + 1
	37	}
	38	i = i + 1
	39	}
	40	DDATA
	41	}
	42
	43
	44	#Bringing in the file
	45	rawdat <- file.choose()
	46	RAWDAT <- rawdat %>%
	47	read_delim(delim ="\t",col_names = FALSE,skip=1) %>%
	48	filter(.,!grepl("Group\|Age\|Region\|PMI\|Title\|Sex\|Braak",X1))
	49	attributes(RAWDAT)$names <- RAWDAT[1,]
	50
	51	#Just the clinical data
	52	RAWWORD <- rawdat %>%
	53	read_delim(delim ="\t",col_names = FALSE,skip=1) %>%
	54	filter(.,grepl("Group\|Age\|Region\|PMI\|Title\|Sex\|Braak",X1))
	55	attributes(RAWWORD)$names <- RAWDAT[1,]
	56	#Add col of NAs to clinical data
	57	z <- 1
	58	naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
	59	for(z in 1:dim(RAWWORD)[1]){
	60	naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
	61	z <- z + 1
	62	}
	63	colnames(naroww) <- "ROW_NAs"
	64	RAWWORD <- bind_cols(RAWWORD,naroww)
	65
	66
	67	##Getting back to the data
	68	RAWDAT2 <- RAWDAT[-1,] %>%
	69	dplyr::arrange(.,ID_REF)
	70
	71	##Editing the file for R processing
	72	RAWDATID <- RAWDAT2[,1] %>%
	73	as.matrix(.)
	74	RAWDATNUM <- RAWDAT2[,-1] %>%
	75	mapply(.,FUN = as.numeric) %>%
	76	t(.)
	77
	78	##Consolidating genes with the same name
	79	tabRDATID <- table(RAWDATID)
	80	NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
	81	j <- 1
	82	for(j in 1:length(tabRDATID)){
	83	##Putting the ones without duplicates in their new homes
	84	if(tabRDATID[j] == 1){
	85	NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
	86	}
	87	##Averaging duplicates and putting them in their new homes
	88	if(tabRDATID[j] > 1){
	89	NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
	90	}
	91	j <- j + 1
	92	}
	93
	94
	95	#Scaling the Data
	96	scrawdat <- NuRDATN%>%
	97	scale()
	98	attr(scrawdat,"scaled:center") <- NULL
	99	attr(scrawdat,"scaled:scale") <- NULL
	100	colnames(scrawdat) <- rownames(tabRDATID)
	101
	102
	103	#Discretized the Data
	104	dialzdat <- scrawdat %>%
	105	dndat(.) %>%
	106	t()%>%
	107	as.data.frame(.)
	108	colnames(dialzdat) <- rownames(RAWDATNUM)
	109
	110	#gene names
	111	genena <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
	112	#setting "ID_REF" as a new variable
	113	colnames(genena) <- "ID_REF"
	114	rownames(dialzdat) <- NULL
	115	dialzdat <-bind_cols(genena,dialzdat)
	116
	117	#NAs in a column
	118	x <- 2
	119	nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
	120	nacol[1,1] = "COL_NAs"
	121	for(x in 2:dim(dialzdat)[2]){
	122	nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
	123	x <- x + 1
	124	}
	125	colnames(nacol) <- colnames(dialzdat)
	126	dialzdat<-bind_rows(dialzdat,nacol)
	127
	128	#NAs in a row
	129	y <- 1
	130	narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
	131	for(y in 1:dim(dialzdat)[1]){
	132	narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
	133	y <- y + 1
	134	}
	135	colnames(narowd) <- "ROW_NAs"
	136	dialzdat <- bind_cols(dialzdat,narowd)
	137
	138	#converting to character so that the clinical can be brought together with discrete data
	139	k <- 2
	140	for(k in 2:dim(dialzdat)[2]-1){
	141	dialzdat[,k] <- as.character(dialzdat[,k])
	142	k <- k + 1
	143	}
	144
	145
	146	#The End the full data we seem to have found Carmen
	147	Fullalzdw <- bind_rows(RAWWORD,dialzdat)
	148
	149	#Create the file
	150	nfnaex <- strsplit(rawdat,"[\\\|/]") %>%
	151	.[[1]] %>%
	152	.[length(.)] %>%
	153	gsub("\\D","",.) %>%
	154	c("GSE",.,"dscrt.txt") %>%
	155	paste(collapse = "")
	156	write.table(Fullalzdw, file = nfnaex, sep = "\t",col.names = TRUE,row.names = FALSE)
	157
	158