Efrain Gonzalez / Cleaning and Fixing Data with R

Commit 0eb3420561162679cbaa6e877e4bb7621c5d694b

Authored by Efrain Gonzalez 2017-06-15 21:05:56 -0400

Exists in master

This code combines the cleaning and discretizing processes. (UNTESTED)

Two files are output: one with the raw data and the other with the discretized data.

Showing 1 changed file with 445 additions and 0 deletions Show diff stats

RCleanDscret.R

Diff comments View file @ 0eb3420

File was created	1	#Libraries required to run the code
	2	library(pryr)
	3	library(MASS)
	4	library(dplyr)
	5	library(tidyr)
	6	library(readr)
	7	library(stringr)
	8
	9
	10	#Necessary Functions
	11	#1#Function for handling the changing of row names and column names
	# Rename the columns of a transposed GEO sample-annotation matrix.
	#
	# Args:
	#   mat: character matrix that already has column names. Row 1 holds the GEO
	#        field label of each column (e.g. "!Sample_title"); row 2 holds the
	#        first sample's value (e.g. "age: 80").
	#
	# Returns:
	#   `mat` with only its column names changed:
	#     * label "!Sample_source_name_ch1" -> "Brain_Region"
	#     * label "!Sample_title"           -> "Title"
	#     * label "!Sample_geo_accession"   -> "ID_REF"
	#     * otherwise the row-2 value is matched against the keyword patterns
	#       below and the column becomes "Sex<n>", "PMI<n>", "Age<n>",
	#       "Braak<n>" or "Group<n>", numbered per kind from 1.
	#   Columns that match nothing keep their current name.
	chngrownm <- function(mat) {
	  header_labels <- c(
	    "!Sample_source_name_ch1" = "Brain_Region",
	    "!Sample_title" = "Title",
	    "!Sample_geo_accession" = "ID_REF"
	  )
	  # Order matters: when a value matches several patterns the LAST one wins
	  # (e.g. "braak stage: IV" contains "age" but is a Braak column).
	  value_patterns <- c(
	    Sex = "Sex|gender|Gender|sex",
	    PMI = "postmorteminterval|PMI|pmi",
	    Age = "age|Age|AGE",
	    Braak = "braak|b&b",
	    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control"
	  )
	  next_index <- rep(1L, length(value_patterns))
	  names(next_index) <- names(value_patterns)

	  for (j in seq_len(ncol(mat))) {
	    label <- as.character(mat[1, j])

	    # A recognised GEO label fixes the name; the sample value is not
	    # inspected, so a title such as "AD case 1" stays "Title".
	    if (!is.na(label) && label %in% names(header_labels)) {
	      colnames(mat)[j] <- header_labels[[label]]
	      next
	    }

	    matched <- vapply(
	      value_patterns,
	      function(pattern) grepl(pattern, mat[2, j]),
	      logical(1)
	    )
	    if (any(matched)) {
	      kind <- names(value_patterns)[max(which(matched))]
	      colnames(mat)[j] <- paste0(kind, next_index[[kind]])
	      # Only the kind actually assigned consumes a number.
	      next_index[[kind]] <- next_index[[kind]] + 1L
	    }
	  }
	  mat
	}
	57
	58	#2#Function for reorganizing information within the columns
	# Reduce the annotation columns to their bare values, chosen by column name.
	#
	# Args:
	#   mat: data frame (or matrix) whose column names contain "Group", "Age",
	#        "Sex", "PMI" or "Braak" for the columns to clean. Column 1 is never
	#        touched.
	#
	# Returns:
	#   `mat` with, for every column from the second onwards:
	#     * "Group": text up to "<colon><space>" removed, and also any
	#                "<space>...;..." tail
	#     * "Age"  : first number found after the last colon, truncated to integer
	#                ("age: 85.5" -> 85L; NA when there is no number)
	#     * "Sex"  : text up to "<colon><space>" removed
	#     * "PMI"  : everything except digits and dots removed, as numeric
	#     * "Braak": text up to "<colon><space>" removed, read as a roman numeral
	#                and converted to integer
	cinfo <- function(mat) {
	  # First numeric token of the value part, as a whole number. Stripping all
	  # non-digits instead would turn "85.5" into 855.
	  whole_number <- function(x) {
	    value <- sub("^.*:\\s*", "", as.character(x))
	    hit <- regexpr("[0-9]+(\\.[0-9]+)?", value)
	    found <- !is.na(hit) & hit > 0
	    first <- hit[found]
	    last <- first + attr(hit, "match.length")[found] - 1L
	    out <- rep(NA_integer_, length(value))
	    out[found] <- as.integer(as.numeric(substring(value[found], first, last)))
	    out
	  }

	  # seq_len(n)[-1] is empty for a one-column input, unlike 2:n.
	  for (j in seq_len(ncol(mat))[-1]) {
	    column_name <- colnames(mat)[j]
	    if (grepl("Group", column_name)) {
	      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
	    }
	    if (grepl("Age", column_name)) {
	      mat[, j] <- whole_number(mat[, j])
	    }
	    if (grepl("Sex", column_name)) {
	      mat[, j] <- gsub(".+:\\s", "", mat[, j])
	    }
	    if (grepl("PMI", column_name)) {
	      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
	    }
	    if (grepl("Braak", column_name)) {
	      mat[, j] <- as.integer(utils::as.roman(gsub(".+:\\s", "", mat[, j])))
	    }
	  }
	  mat
	}
	86
	87	#3#Function for labeling the gene IDs without names
	# Give every probe without a gene name its own ID as the name.
	#
	# Args:
	#   GIDNAM: two-column table (data frame, tibble or matrix); column 1 holds
	#           the probe ID, column 2 the gene name.
	#
	# Returns:
	#   `GIDNAM` where each column-2 entry that is NA, or is the text "NA"
	#   optionally followed by whitespace, is replaced by the column-1 entry of
	#   the same row. An input with no rows is returned unchanged.
	NAFIXING <- function(GIDNAM) {
	  ids <- GIDNAM[, 1, drop = TRUE]
	  gene_names <- GIDNAM[, 2, drop = TRUE]

	  # Keep the name column's type when IDs were parsed as numbers.
	  if (is.character(gene_names)) {
	    ids <- as.character(ids)
	  }

	  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)
	  if (any(unnamed)) {
	    GIDNAM[unnamed, 2] <- ids[unnamed]
	  }
	  GIDNAM
	}
	99
	100	#4#Function for changing the gene ID to gene name
	# Replace the probe IDs in the first row of DATA by their gene names.
	#
	# Args:
	#   GeneName: matrix with probe IDs in row 1 and the matching gene names in
	#             row 2 (one column per probe).
	#   DATA:     matrix whose first row holds probe IDs.
	#
	# Returns:
	#   `DATA` where every first-row entry equal to an ID in `GeneName[1, ]` is
	#   replaced by the name in `GeneName[2, ]`. Entries without a match are left
	#   alone; when an ID occurs more than once in `GeneName` the first wins.
	#
	# IDs are compared as plain text with match(), so characters such as "." in
	# an ID are not read as regular-expression syntax, and a freshly written
	# name is never renamed again by a later ID.
	cgeneID <- function(GeneName, DATA) {
	  known_ids <- as.character(GeneName[1, ])
	  new_names <- GeneName[2, ]
	  current <- as.character(unlist(DATA[1, ], use.names = FALSE))

	  hit <- match(current, known_ids)
	  hit[is.na(current)] <- NA_integer_  # a missing ID never matches anything
	  found <- !is.na(hit)

	  if (any(found)) {
	    DATA[1, found] <- new_names[hit[found]]
	  }
	  DATA
	}
	119
	120	#5#Function for adjusting the gene names
	# Pick one gene name per column out of "///"-separated column names.
	#
	# Args:
	#   DiData: matrix or data frame whose column names may list several names
	#           separated by "///".
	#   usecol: which of the listed names to keep (default 1, the first). Must
	#           be a single number >= 1.
	#
	# Returns:
	#   Character vector with one entry per column: the `usecol`-th name with
	#   surrounding whitespace removed, or the first name when the column lists
	#   fewer than `usecol` names. An empty or missing column name gives NA; an
	#   input without columns gives character(0).
	gcnames <- function(DiData, usecol = 1) {
	  if (length(usecol) != 1 || is.na(usecol) || usecol < 1) {
	    stop("'usecol' must be a single number >= 1", call. = FALSE)
	  }

	  current <- colnames(DiData)
	  if (is.null(current)) {
	    current <- rep(NA_character_, NCOL(DiData))
	  }

	  listed <- strsplit(current, "///")
	  chosen <- vapply(
	    listed,
	    function(parts) if (length(parts) >= usecol) parts[usecol] else parts[1],
	    character(1)
	  )
	  # "[\\h\\v]" trims every horizontal/vertical whitespace character, not
	  # only space, tab, CR and LF.
	  trimws(chosen, whitespace = "[\\h\\v]")
	}
	136
	137	#6# Function for discretizing the data
	# Discretize a numeric table into three levels.
	#
	# Args:
	#   NDATA: numeric matrix (or all-numeric data frame).
	#
	# Returns:
	#   Numeric matrix of the same dimensions and column names (row names are
	#   not carried over) holding
	#     0 where the value is below -1,
	#     1 where the value lies in [-1, 1],
	#     2 where the value is above 1,
	#   and the original missing value where the input is NA/NaN.
	#   The upper bound is inclusive: a value of exactly 1 is level 1.
	dndat <- function(NDATA) {
	  values <- as.matrix(NDATA)

	  # Start everything at the middle level, then overwrite the two tails.
	  DDATA <- matrix(1, nrow = nrow(values), ncol = ncol(values))
	  colnames(DDATA) <- colnames(values)

	  # which() drops the NA comparisons so they cannot be used as indices.
	  DDATA[which(values < -1)] <- 0
	  DDATA[which(values > 1)] <- 2

	  missing_value <- is.na(values)
	  DDATA[missing_value] <- values[missing_value]
	  DDATA
	}
	167
	168
	# The rest of this script is run once for every data set to be cleaned.

	# Ask for the series matrix file.
	print("Choose the series matrix file that you want to Analyze")
	alz <- file.choose()

	# Ask for the GPL (platform annotation) file.
	print("Choose the GPL file that correlates with the above series matrix file")
	genena <- file.choose()

	# `soft` is TRUE when the GPL file name (last path component, path split on
	# "\", "|" or "/") contains "soft" or "annot".
	gpl_path_parts <- strsplit(genena, "[\\|/]")[[1]]
	soft <- grepl("soft|annot", gpl_path_parts[length(gpl_path_parts)])
	185
	# Working with the wordy (sample annotation) part of the series matrix.
	# Text starting at "!Series" is read as a comment; only rows whose first
	# field contains "!Sample" are kept, and the "!Sample_contact" rows are
	# dropped again.
	alzword <- read_delim(alz, delim = "\t", comment = "!Series", col_names = FALSE) %>%
	  filter(grepl("!Sample", X1), !grepl("!Sample_contact", X1))

	## Changing row names and column names:
	# After transposing, row 1 holds the GEO field labels and every later row
	# is one sample. Columns get the placeholder names "col1", "col2", ...
	ALZWORD <- t(alzword)
	rownames(ALZWORD) <- NULL
	colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)

	# Rename the recognised columns, then drop the label row. drop = FALSE keeps
	# the matrix shape when the series holds a single sample.
	ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]

	# Columns still carrying a placeholder name were not recognised: remove them.
	ALZWORD <- ALZWORD %>%
	  as.data.frame() %>%
	  dplyr::select(-starts_with("col"))

	## Reorganizing information within the columns
	ALZWORDF <- cinfo(ALZWORD)
	203
	204
	# Expression table of the series matrix: the first line is skipped, text
	# after "!" is read as a comment and the next remaining line supplies the
	# column names.
	alzdat <- read_delim(alz, delim = "\t", col_names = TRUE, comment = "!", skip = 1)

	# Drop the first column and transpose; the row names are discarded.
	ALZDAT <- t(alzdat[, -1])
	rownames(ALZDAT) <- NULL
	210
	## Is there a clean version of the GPL file available?
	# Platform number: every digit in the GPL file name (last path component).
	gpl_path_parts <- strsplit(genena, "[\\|/]")[[1]]
	gplnum <- gsub("\\D", "", gpl_path_parts[length(gpl_path_parts)])
	clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")

	# Columns of the GPL table that may carry the gene name. Trailing whitespace
	# after "ORF" / "gene_assignment" is optional.
	gene_name_columns <- "Symbol|^ORF\\s*$|^gene_assignment\\s*$"

	# Position of the "ID" cell in the first 1000 parsed rows of a soft file,
	# minus one. The result is used below as the `skip` argument when the file
	# is read again.
	find_gpl_id_row <- function(path) {
	  first_rows <- read_delim(path, delim = "\t", col_names = FALSE,
	                           comment = "!", n_max = 1000)
	  located <- grep("^ID\\s*$", t(first_rows)) - 1
	  if (length(located) == 0) {
	    stop("No 'ID' header found in the first 1000 rows of ", path, call. = FALSE)
	  }
	  located[1]
	}

	if (file.exists(clean_gpl_file)) {
	  # Use the clean version. The exact file name is tested, so "Clean_GPL57"
	  # is not mistaken for "Clean_GPL570.txt".
	  geneIDNam <- read_delim(clean_gpl_file, delim = "\t",
	                          col_names = c("ID", "Symbol"), comment = "!")
	} else {
	  ## Let's create a clean version

	  ## Gene ID to Gene Name
	  if (isTRUE(soft)) {
	    # GPL_ID_LOC.txt remembers, per platform number, the skip value found
	    # for its soft file.
	    gpl_loc_file <- "GPL_ID_LOC.txt"
	    loc_file_known <- file.exists(gpl_loc_file)
	    idlocgpl <- numeric(0)

	    if (loc_file_known) {
	      known_locs <- read_delim(gpl_loc_file, delim = "\t", col_names = TRUE)
	      # Exact comparison of the platform number, not a substring search.
	      idlocgpl <- known_locs$LOC_ID[
	        which(known_locs$GPL_FILE_NUM == as.numeric(gplnum))
	      ]
	    }

	    if (length(idlocgpl) == 0) {
	      # No information on this GPL file yet: locate the header and store it.
	      idlocgpl <- find_gpl_id_row(genena)
	      if (loc_file_known) {
	        cat(as.integer(gplnum), as.integer(idlocgpl),
	            file = gpl_loc_file, sep = "\t", fill = TRUE, append = TRUE)
	      } else {
	        Firstval <- cbind(as.integer(gplnum), as.integer(idlocgpl))
	        colnames(Firstval) <- c("GPL_FILE_NUM", "LOC_ID")
	        write.table(Firstval, file = gpl_loc_file, sep = "\t",
	                    row.names = FALSE, col.names = TRUE)
	      }
	    }

	    gpl_table <- read_delim(genena, delim = "\t", col_names = TRUE,
	                            comment = "!", skip = idlocgpl[1])
	  } else {
	    gpl_table <- read_delim(genena, delim = "\t", comment = "#")
	  }

	  geneIDNam <- dplyr::select(
	    gpl_table, ID, grep(gene_name_columns, colnames(gpl_table))
	  )

	  ## Labeling the gene IDs without names
	  geneIDNam <- NAFIXING(geneIDNam)

	  ## Remove the whitespace (first two columns only: ID and gene name)
	  id_and_name <- t(geneIDNam)
	  geneIDNam <- t(rbind(str_trim(id_and_name[1, ]), str_trim(id_and_name[2, ])))

	  ## Here is the clean version
	  write.table(geneIDNam, file = clean_gpl_file, sep = "\t",
	              row.names = FALSE, col.names = FALSE)
	}
	291
	292
	293
	## Changing the gene ID to gene name
	# Row 1 of the transposed table holds the probe IDs; after cgeneID() it
	# holds the gene names, which become the column names of ALZDAT.
	ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
	colnames(ALZDAT) <- ALZDAT1[1, ]

	## Adjusting the column names aka the gene names
	colnames(ALZDAT) <- gcnames(ALZDAT)

	# Full RAW data: sample annotations followed by the expression values.
	Fullalzdwr <- cbind(ALZWORDF, as.data.frame(ALZDAT))

	# Raw file is output as "GSE<digits of the series file name>aftexcel.txt".
	# The path is split on "\", "|" and "/" so that only the file name, and not
	# the digits of its directories, ends up in the output name.
	series_path_parts <- strsplit(alz, "[\\|/]")[[1]]
	nfnaex <- paste0(
	  "GSE",
	  gsub("\\D", "", series_path_parts[length(series_path_parts)]),
	  "aftexcel.txt"
	)
	write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
	317
	318
	# Now for the discretization part
	## Get the wordy part again: one row per annotation, one column per sample
	rawword <- t(ALZWORDF)

	## Where is ID_REF located? The annotation names are the ROW NAMES of
	## rawword, so they are what has to be searched (the cell values hold the
	## sample data).
	hereim <- which(rownames(rawword) == "ID_REF")
	if (length(hereim) == 0) {
	  stop("The sample annotations contain no ID_REF row", call. = FALSE)
	}

	## Subject names GSM...
	subjnam <- rawword[hereim, ]

	## Getting the names for the rows
	namedarows <- rownames(rawword)[-hereim] %>%
	  as.data.frame()
	# drop = FALSE keeps a matrix when a single annotation row is left.
	RAWWORD <- rawword[-hereim, , drop = FALSE] %>%
	  as.data.frame() %>%
	  bind_cols(namedarows, .)

	# Number of NA cells in each annotation row (name column included).
	naroww <- data.frame(ROW_NAs = as.numeric(rowSums(is.na(RAWWORD))))
	RAWWORD <- bind_cols(RAWWORD, naroww)

	# Gene names (the column names of ALZDAT) as a one-column table.
	roALZna <- as.data.frame(rownames(t(ALZDAT)))
	colnames(roALZna) <- "ID_REF"

	# Expression values with genes in rows, stripped of row and column names.
	RAWDAT <- as.data.frame(t(ALZDAT))
	colnames(RAWDAT) <- NULL
	rownames(RAWDAT) <- NULL

	# Gene name first, then the values, sorted by gene name.
	RAWDAT2 <- RAWDAT %>%
	  cbind(roALZna, .) %>%
	  dplyr::arrange(., ID_REF)
	359
	## Editing the file for R processing
	# Gene names as a one-column matrix; values as numbers with one row per
	# subject and one column per gene entry.
	RAWDATID <- as.matrix(RAWDAT2[, 1])
	RAWDATNUM <- t(mapply(RAWDAT2[, -1], FUN = as.numeric))

	## Consolidating genes with the same name
	### One output column per distinct gene name, in the order used by table()
	tabRDATID <- table(RAWDATID)
	NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

	# Column positions of every gene name, found in one pass instead of
	# rescanning all names for each gene.
	same_gene_cols <- split(
	  seq_along(RAWDATID),
	  factor(RAWDATID, levels = rownames(tabRDATID))
	)
	for (j in seq_along(same_gene_cols)) {
	  cols <- same_gene_cols[[j]]
	  if (length(cols) == 1) {
	    ## Putting the ones without duplicates in their new homes (copied as
	    ## is, so a missing value stays NA)
	    NuRDATN[, j] <- RAWDATNUM[, cols]
	  } else {
	    ## Averaging duplicates and putting them in their new homes.
	    ## drop = FALSE keeps a matrix when there is only one subject.
	    NuRDATN[, j] <- rowMeans(RAWDATNUM[, cols, drop = FALSE], na.rm = TRUE)
	  }
	}
	385
	## Scaling the data: scale() centres and scales each column (one column per
	## gene); the attributes it attaches are removed again.
	scrawdat <- scale(NuRDATN)
	attr(scrawdat, "scaled:center") <- NULL
	attr(scrawdat, "scaled:scale") <- NULL
	colnames(scrawdat) <- rownames(tabRDATID)

	## Discretize the scaled values, then flip the table so genes are in rows.
	dialzdat <- as.data.frame(t(dndat(scrawdat)))
	colnames(dialzdat) <- rownames(RAWDATNUM)

	## Setting "ID_REF" as a new variable: the gene names move from the row
	## names into a first column of their own.
	geneNAM <- as.data.frame(as.matrix(rownames(dialzdat), ncol = 1))
	colnames(geneNAM) <- "ID_REF"
	rownames(dialzdat) <- NULL
	dialzdat <- bind_cols(geneNAM, dialzdat)
	405
	## NAs in a column
	# One extra row: "COL_NAs" in the first column, then the number of NA
	# cells of every other column. Counted for all columns at once rather than
	# cell by cell.
	nacol <- as.data.frame(
	  t(unname(colSums(is.na(dialzdat)))),
	  stringsAsFactors = FALSE
	)
	nacol[1, 1] <- "COL_NAs"
	colnames(nacol) <- colnames(dialzdat)
	dialzdat <- bind_rows(dialzdat, nacol)

	## NAs in a row
	# Counted after the COL_NAs row was appended, over all columns.
	narowd <- data.frame(ROW_NAs = as.numeric(rowSums(is.na(dialzdat))))
	dialzdat <- bind_cols(dialzdat, narowd)

	# Columns between ID_REF and ROW_NAs are named after the subjects, and the
	# clinical table takes over the same column names.
	colnames(dialzdat)[2:(dim(dialzdat)[2] - 1)] <- subjnam
	colnames(RAWWORD) <- colnames(dialzdat)
	## Converting to character so that the clinical rows can be brought
	## together with the discrete data.
	# Every column except the last one (ROW_NAs) is converted. The original
	# range `2:n-1` is evaluated by R as (2:n) - 1, i.e. columns 1 to n - 1;
	# that range is kept and written out explicitly.
	for (k in seq_len(ncol(dialzdat) - 1)) {
	  dialzdat[[k]] <- as.character(dialzdat[[k]])
	}

	# The end: the full data, clinical rows first
	Dscrtalzdw <- bind_rows(RAWWORD, dialzdat)

	# Produces the discrete file "GSE<digits of the series file name>dscrt.txt".
	# The name is derived from `alz`, the series matrix path chosen at the
	# start of the script.
	series_path_parts <- strsplit(alz, "[\\|/]")[[1]]
	nfnaex <- paste0(
	  "GSE",
	  gsub("\\D", "", series_path_parts[length(series_path_parts)]),
	  "dscrt.txt"
	)
	write.table(Dscrtalzdw, file = nfnaex, sep = "\t", col.names = TRUE, row.names = FALSE)
	445
	446