Efrain Gonzalez / Cleaning and Fixing Data with R

Browse Code »

Commit 22a75a38eb8c5a8df6acce96b9c4487874143168

Authored by Efrain Gonzalez 2017-06-05 09:14:35 -0400

1 parent f378e57f40

Exists in master

Most Recent (UNTESTED)

Showing 1 changed file with 324 additions and 0 deletions Show diff stats

RClean4.R

Diff comments View file @ 22a75a3

File was created	1	#Libraries required to run the code
	2	library(pryr)
	3	library(MASS)
	4	library(dplyr)
	5	library(tidyr)
	6	library(readr)
	7	library(stringr)
	8
	9
	10	#Necessary Functions
	11	#1#Function for handling the changing of row names and column names
	# Rename the columns of a transposed series-matrix header block.
	#
	# mat: two-dimensional character object whose first row holds the GEO field
	#      labels (e.g. "!Sample_title") and whose second row holds the first
	#      sample's value for each field.
	# Returns `mat` with renamed columns; the cell contents are not modified.
	#
	# A column whose label is one of the three known GEO fields gets a fixed
	# name. Only the remaining columns are named from keywords found in their
	# second-row value, numbered per category (Sex1, Sex2, ...). Columns that
	# match nothing keep their current name.
	chngrownm <- function(mat) {
	  stopifnot(length(dim(mat)) == 2)
	  n_cols <- ncol(mat)
	  if (n_cols == 0 || nrow(mat) == 0) {
	    return(mat)
	  }
	  # Make sure there are names to overwrite ("col1", "col2", ...).
	  if (is.null(colnames(mat))) {
	    colnames(mat) <- colnames(mat, do.NULL = FALSE)
	  }

	  fixed_names <- c(
	    "!Sample_source_name_ch1" = "Brain_Region",
	    "!Sample_title" = "Title",
	    "!Sample_geo_accession" = "ID_REF"
	  )
	  # Checked in this order; when several patterns hit the same value the
	  # last one decides the name, and every hit consumes a number.
	  keyword_rules <- c(
	    Sex = "Sex|gender|Gender|sex",
	    PMI = "postmorteminterval|PMI|pmi",
	    Age = "age|Age|AGE",
	    Braak = "braak|b&b",
	    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control"
	  )
	  counters <- c(Sex = 1L, PMI = 1L, Age = 1L, Braak = 1L, Group = 1L)
	  has_values <- nrow(mat) >= 2

	  for (j in seq_len(n_cols)) {
	    label <- as.character(mat[1, j])
	    if (!is.na(label) && label %in% names(fixed_names)) {
	      # A labelled column is never keyword-sniffed, so a title such as
	      # "Control 1" can no longer turn the Title column into "Group1".
	      colnames(mat)[j] <- fixed_names[[label]]
	      next
	    }
	    if (!has_values) {
	      next
	    }
	    value <- as.character(mat[2, j])
	    for (prefix in names(keyword_rules)) {
	      # grepl() returns FALSE for an NA value, so missing cells are skipped.
	      if (grepl(keyword_rules[[prefix]], value)) {
	        colnames(mat)[j] <- paste0(prefix, counters[[prefix]])
	        counters[[prefix]] <- counters[[prefix]] + 1L
	      }
	    }
	  }
	  mat
	}
	57
	58	#2#Function for reorganizing information within the columns
	# Clean the sample-annotation columns named by chngrownm().
	#
	# mat: data frame (or matrix) with named columns. The first column is never
	#      touched; every later column is cleaned according to the prefix found
	#      in its name:
	#        Group -> text after the "label: " part
	#        Age   -> first number after the label, truncated to integer
	#        Sex   -> text after the "label: " part
	#        PMI   -> first number after the label, as numeric
	#        Braak -> roman numeral after the label, as integer
	# Returns `mat` with the cleaned columns.
	cinfo <- function(mat) {
	  col_names <- colnames(mat)
	  n_cols <- length(col_names)
	  # Nothing to clean without at least a second, named column. (2:1 would
	  # otherwise count backwards and rewrite column 1.)
	  if (n_cols < 2) {
	    return(mat)
	  }
	  is_df <- is.data.frame(mat)

	  # First number that follows the "label:" part, or NA when there is none.
	  # Dropping the label first keeps digits and dots inside it (e.g.
	  # "pmi (hrs.): 12.5") out of the value, and taking one number keeps a
	  # decimal such as "80.5" from collapsing into 805.
	  first_number <- function(x) {
	    x <- sub("^.*:\\s*", "", as.character(x))
	    hit <- regexpr("[0-9]+(\\.[0-9]+)?|\\.[0-9]+", x)
	    found <- !is.na(hit) & hit > 0
	    out <- rep(NA_character_, length(x))
	    out[found] <- regmatches(x, hit)
	    as.numeric(out)
	  }

	  for (j in 2:n_cols) {
	    name <- col_names[j]
	    if (is.na(name)) {
	      next
	    }
	    values <- if (is_df) mat[[j]] else mat[, j]
	    changed <- FALSE
	    if (grepl("Group", name)) {
	      values <- gsub(".+:\\s|\\s.+;.+", "", values)
	      changed <- TRUE
	    }
	    if (grepl("Age", name)) {
	      values <- as.integer(first_number(values))
	      changed <- TRUE
	    }
	    if (grepl("Sex", name)) {
	      values <- gsub(".+:\\s", "", values)
	      changed <- TRUE
	    }
	    if (grepl("PMI", name)) {
	      values <- first_number(values)
	      changed <- TRUE
	    }
	    if (grepl("Braak", name)) {
	      values <- as.integer(utils::as.roman(gsub(".+:\\s", "", values)))
	      changed <- TRUE
	    }
	    if (changed) {
	      if (is_df) {
	        mat[[j]] <- values
	      } else {
	        mat[, j] <- values
	      }
	    }
	  }
	  mat
	}
	86
	87	#3#Function for labeling the gene IDs without names
	# Give every unnamed gene its ID as a name.
	#
	# GIDNAM: data frame or matrix with the gene ID in column 1 and the gene
	#         name in column 2.
	# Returns `GIDNAM` where each column-2 entry that is NA, or the text "NA"
	# (optionally followed by whitespace), has been replaced by the column-1
	# entry of the same row.
	NAFIXING <- function(GIDNAM) {
	  stopifnot(length(dim(GIDNAM)) == 2, ncol(GIDNAM) >= 2)
	  is_df <- is.data.frame(GIDNAM)
	  ids <- if (is_df) GIDNAM[[1]] else GIDNAM[, 1]
	  gene_names <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]
	  # Factors cannot take values outside their levels, so work on text.
	  if (is.factor(ids)) {
	    ids <- as.character(ids)
	  }
	  if (is.factor(gene_names)) {
	    gene_names <- as.character(gene_names)
	  }

	  # One vectorised pass instead of a row-by-row loop; also correct for a
	  # table with zero rows.
	  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)
	  if (any(unnamed)) {
	    gene_names[unnamed] <- ids[unnamed]
	    if (is_df) {
	      GIDNAM[[2]] <- gene_names
	    } else {
	      GIDNAM[, 2] <- gene_names
	    }
	  }
	  GIDNAM
	}
	99
	100	##4#Function for changing the gene ID to gene name
	101	##cgeneID <- function(GeneName,DATA){
	102	## colGene <- dim(GeneName)[2]
	103	## j <- 1
	104	## for(j in 1:colGene){
	105	## chngsreq <- grep(GeneName[1,j],DATA[1,])
	106	## if(sum(chngsreq) > 0){
	107	## #DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
	108	## DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
	109	## }
	110	## j = j+1
	111	## }
	112	## DATA
	113	##}
	114	#4#Function for changing the gene ID to gene name
	# Replace gene IDs in the first row of DATA with their gene names.
	#
	# GeneName: matrix with the gene IDs in row 1 and the matching gene names
	#           in row 2.
	# DATA:     matrix whose first row holds gene IDs.
	# Returns `DATA` where every first-row entry equal to an ID in GeneName has
	# been replaced by that ID's name. Entries without a match are left alone.
	#
	# IDs are compared as exact text, not as regular expressions, so an ID such
	# as "a.b" cannot match "axb". Each entry is replaced at most once, and
	# when an ID is listed more than once its first listing is used.
	cgeneID <- function(GeneName, DATA) {
	  stopifnot(nrow(GeneName) >= 2)
	  if (ncol(GeneName) == 0 || ncol(DATA) == 0 || nrow(DATA) == 0) {
	    return(DATA)
	  }
	  gene_ids <- as.character(unlist(GeneName[1, ]))
	  gene_names <- as.character(unlist(GeneName[2, ]))
	  labels <- as.character(unlist(DATA[1, ]))

	  # A single hash lookup instead of one grep() per gene ID; missing labels
	  # never match.
	  hit <- match(labels, gene_ids, incomparables = NA_character_)
	  found <- !is.na(hit)
	  DATA[1, found] <- gene_names[hit[found]]
	  DATA
	}
	133
	134	#5#Function for adjusting the gene names
	# Pick one gene name out of each "///"-separated column name.
	#
	# DiData: matrix or data frame whose column names may list several gene
	#         names separated by "///" (e.g. "DDR1 /// MIR4640").
	# usecol: which of the listed names to keep (default: the first).
	# Returns a character vector with one name per column. A column that lists
	# fewer than `usecol` names falls back to its first name. Whitespace around
	# the chosen name is removed so the padding around "///" does not end up in
	# the result.
	gcnames <- function(DiData, usecol = 1) {
	  stopifnot(length(usecol) == 1, !is.na(usecol), usecol >= 1)
	  if (ncol(DiData) == 0) {
	    return(character(0))
	  }
	  col_names <- colnames(DiData)
	  if (is.null(col_names)) {
	    stop("DiData must have column names", call. = FALSE)
	  }

	  # Split every name once, then choose the wanted piece per column.
	  pieces <- strsplit(col_names, "///")
	  vapply(
	    pieces,
	    function(parts) {
	      keep <- if (length(parts) >= usecol) usecol else 1
	      trimws(parts[keep])
	    },
	    character(1),
	    USE.NAMES = FALSE
	  )
	}
	150
	151
	152
	153	#The Rest of this code will be used every time you want to change a data set
	154
	155	#Getting the series matrix file
	print("Choose the series matrix file that you want to Analyze")
	alz <- file.choose()

	#Getting the GPL file
	print("Choose the GPL file that correlates with the above series matrix file")
	genena <- file.choose()


	#Set working directory based on the directory of the series matrix file.
	#Currently only works for Windows (kept disabled).
	##strsplit(alz,"[\\]") %>%
	## .[[1]] %>%
	## .[-length(.)] %>%
	## paste(.,collapse="/") %>%
	## setwd()

	#Find out if it is a soft GPL file or not
	# basename() applies the platform's own path rules, so the check looks at
	# the file name only and never at the directories above it.
	soft <- grepl("soft", basename(genena))
	176
	177	#Working with the wordy part of the document
	# Read the file without a header row, ignore everything from "!Series"
	# onwards on a line, and keep the "!Sample..." rows except the
	# "!Sample_contact..." ones.
	alzword <- alz %>%
	  read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
	  filter(grepl("!Sample", X1)) %>%
	  filter(!grepl("!Sample_contact", X1))

	##Changing row names and column names:
	# After transposing, row 1 holds the field labels and every later row is
	# one sample.
	ALZWORD <- t(alzword)
	rownames(ALZWORD) <- NULL
	colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
	# Drop the label row once the columns are named. drop = FALSE keeps the
	# result a matrix even when the series has a single sample.
	ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
	# Columns still called "col<N>" were not recognised and are discarded.
	ALZWORD <- ALZWORD %>%
	  as.data.frame(stringsAsFactors = FALSE) %>%
	  dplyr::select(-starts_with("col"))

	##Reorganizing information within the columns
	ALZWORDF <- cinfo(ALZWORD)
	194
	195
	196	#Working with Actual Data part of file
	# Read the expression table: "!" starts a comment, the first line of the
	# file is skipped, and the first remaining line supplies the column names.
	alzdat <- read_delim(
	  alz,
	  delim = "\t",
	  col_names = TRUE,
	  comment = "!",
	  skip = 1
	)
	# Everything except the first column, transposed, with row names removed.
	ALZDAT <- alzdat[, -1] %>%
	  t()
	rownames(ALZDAT) <- NULL
	201
	202	##Is there a clean version of the GPL file available?
	# Every digit of the GPL file name; used as the key for the cache files.
	gplnum <- gsub("\\D", "", basename(genena))
	clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")
	gpl_loc_file <- "GPL_ID_LOC.txt"

	# Number of lines to skip before the annotation table of a soft file.
	# Heuristic: among the first 1000 non-"!" lines, count the cells that do
	# not start with a digit, minus one.
	# NOTE(review): this presumably relies on the probe IDs starting with a
	# digit -- confirm against the GPL files in use.
	find_gpl_header_offset <- function(path) {
	  non_numeric_cells <- path %>%
	    read_delim(delim = "\t", col_names = FALSE, comment = "!", n_max = 1000) %>%
	    t(.) %>%
	    grep("^\\D", .)
	  length(non_numeric_cells) - 1
	}

	# Read a GPL table and keep the ID column plus every column whose name
	# contains "Symbol" or "ORF". Extra arguments go to read_delim().
	read_gpl_symbols <- function(path, ...) {
	  path %>%
	    read_delim(delim = "\t", ...) %>%
	    dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))
	}

	# Test for exactly the file that would be read, so "Clean_GPL5700.txt" is
	# never mistaken for the cache of GPL570.
	clfileex <- file.exists(clean_gpl_file)
	if (clfileex) {
	  #use the clean version
	  geneIDNam <- read_delim(
	    clean_gpl_file,
	    delim = "\t",
	    col_names = c("ID", "Symbol"),
	    comment = "!"
	  )
	} else {
	  ##Lets Create a clean version
	  if (soft) {
	    # Soft files cannot be read with comment = "#" like plain GPL tables;
	    # the table is reached by skipping a per-file number of lines, which is
	    # remembered in GPL_ID_LOC.txt.
	    idLOCGPL <- NULL
	    loc_file_exists <- file.exists(gpl_loc_file)
	    if (loc_file_exists) {
	      #Check to see if this GPL soft file has been used before
	      gpl_loc <- read_delim(gpl_loc_file, delim = "\t", col_names = TRUE)
	      # Exact comparison: GPL570 must not pick up the entry for GPL5700.
	      known <- which(gpl_loc$GPL_FILE_NUM == as.integer(gplnum))
	      if (length(known) > 0) {
	        idLOCGPL <- gpl_loc$LOC_ID[known[1]]
	      }
	    }
	    if (is.null(idLOCGPL)) {
	      #No information on this particular GPL file yet: compute and store it
	      idLOCGPL <- find_gpl_header_offset(genena)
	      new_entry <- cbind(
	        GPL_FILE_NUM = as.integer(gplnum),
	        LOC_ID = as.integer(idLOCGPL)
	      )
	      # A new file gets the header line; an existing one only gets the row.
	      write.table(
	        new_entry,
	        file = gpl_loc_file,
	        sep = "\t",
	        row.names = FALSE,
	        col.names = !loc_file_exists,
	        append = loc_file_exists
	      )
	    }
	    geneIDNam <- read_gpl_symbols(
	      genena,
	      col_names = TRUE,
	      comment = "!",
	      skip = idLOCGPL
	    )
	  } else {
	    geneIDNam <- read_gpl_symbols(genena, comment = "#")
	  }

	  ##Labeling the gene IDs without names
	  geneIDNam <- NAFIXING(geneIDNam)

	  ##remove the whitespace
	  gene_rows <- t(geneIDNam)
	  geneIDNam <- t(rbind(str_trim(gene_rows[1, ]), str_trim(gene_rows[2, ])))

	  ##Here is the clean version
	  write.table(
	    geneIDNam,
	    file = clean_gpl_file,
	    sep = "\t",
	    row.names = FALSE,
	    col.names = FALSE
	  )
	}
	290
	291
	292
	293	##Changing the gene ID to gene name
	# cgeneID() swaps IDs for names in the first row of the transposed table;
	# that row then becomes the column names of ALZDAT.
	ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
	colnames(ALZDAT) <- ALZDAT1[1, ]


	##Adjusting the column names aka the gene names
	# Keeps one name per column where several are listed.
	colnames(ALZDAT) <- gcnames(ALZDAT)


	#Full Data
	# Cleaned sample annotation first, expression values after it.
	Fullalzdw <- cbind(ALZWORDF, as.data.frame(ALZDAT))
	306
	307
	308	#nfna <- strsplit(alz,"[\\\|/]") %>%
	309	# .[[1]] %>%
	310	# .[length(.)] %>%
	311	# gsub("\\D","",.) %>%
	312	# c("GSE",.,"after.txt") %>%
	313	# paste(collapse = "")
	314	#write.matrix(Fullalzdw,file = nfna,sep = "\t")
	315
	316	#Perfect for excel viewing
	# Output name: "GSE" + the digits of the series matrix file name +
	# "aftexcel.txt". basename() drops the directories on every platform, so
	# digits in the folder names no longer leak into the file name when the
	# path uses "/" separators.
	nfnaex <- paste0("GSE", gsub("\\D", "", basename(alz)), "aftexcel.txt")
	# Written transposed (original columns become rows), tab separated.
	write.table(t(Fullalzdw), file = nfnaex, sep = "\t")
	324
	325