Efrain Gonzalez / Cleaning and Fixing Data with R

Browse Code »

Commit 58ba5d0b06bb84f11c3fc28a88967881141849b5

Authored by Efrain Gonzalez 2017-05-31 14:23:17 -0400

1 parent 18c7602e69

Exists in master

New version which includes the storing of the clean version of the GPL file (UNTESTED)

Showing 1 changed file with 302 additions and 0 deletions Show diff stats

RClean3.R

Diff comments View file @ 58ba5d0

File was created	1	#Libraries required to run the code
	2	library(pryr)
	3	library(MASS)
	4	library(dplyr)
	5	library(tidyr)
	6	library(readr)
	7	library(stringr)
	8
	9
	10	#Necessary Functions
	#1#Function for handling the changing of row names and column names
	#
	# Renames the columns of a transposed GEO sample table.
	#
	# mat: matrix-like object whose column names are already set, with the GEO
	#      field tags (e.g. "!Sample_title") in row 1 and sample values in
	#      row 2.
	#
	# Returns `mat` with renamed columns:
	#   * tag "!Sample_source_name_ch1" -> "Brain_Region"
	#   * tag "!Sample_title"           -> "Title"
	#   * tag "!Sample_geo_accession"   -> "ID_REF"
	#   * every column whose tag is not "!Sample_geo_accession" is additionally
	#     checked against the value patterns below (matched on row 2) and named
	#     "<Prefix><n>", where n counts the matches of that prefix so far.
	#     When several patterns match, the last matching prefix wins and every
	#     matching counter is advanced.
	#
	# NOTE(review): because only the geo-accession test carries the `else`, a
	# "Title"/"Brain_Region" column can still be renamed by a value pattern.
	# This mirrors the existing control flow -- confirm whether an
	# if / else-if chain was intended.
	chngrownm <- function(mat) {
	  # Prefix -> regular expression looked up in the first data row (row 2).
	  value_rules <- c(
	    Sex = "Sex|gender|Gender|sex",
	    PMI = "postmorteminterval|PMI|pmi",
	    Age = "age|Age|AGE",
	    Braak = "braak|b&b",
	    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control"
	  )
	  next_index <- setNames(rep(1, length(value_rules)), names(value_rules))

	  # seq_len() keeps the loop empty for a zero-column input.
	  for (j in seq_len(ncol(mat))) {
	    # %in% is NA-safe: a missing tag simply matches nothing.
	    tag <- mat[1, j]
	    if (tag %in% "!Sample_source_name_ch1") {
	      colnames(mat)[j] <- "Brain_Region"
	    }
	    if (tag %in% "!Sample_title") {
	      colnames(mat)[j] <- "Title"
	    }
	    if (tag %in% "!Sample_geo_accession") {
	      colnames(mat)[j] <- "ID_REF"
	    } else {
	      for (prefix in names(value_rules)) {
	        if (grepl(value_rules[[prefix]], mat[2, j])) {
	          colnames(mat)[j] <- paste0(prefix, next_index[[prefix]])
	          next_index[[prefix]] <- next_index[[prefix]] + 1
	        }
	      }
	    }
	  }
	  mat
	}
	57
	#2#Function for reorganizing information within the columns
	#
	# Cleans the values of the metadata columns according to their name.
	#
	# mat: data frame (or matrix) with named columns; column 1 is never
	#      touched.
	#
	# For every column from the second one onwards:
	#   * name contains "Group" -> strip the "<label>: " prefix and any
	#                              " ...;..." remainder
	#   * name contains "Age"   -> keep digits only, convert to integer
	#   * name contains "Sex"   -> strip the "<label>: " prefix
	#   * name contains "PMI"   -> keep digits and dots, convert to numeric
	#   * name contains "Braak" -> strip the "<label>: " prefix, read the rest
	#                              as a roman numeral, convert to integer
	#
	# Returns the modified `mat`.
	cinfo <- function(mat) {
	  # seq_len(...)[-1] is empty for inputs with fewer than two columns,
	  # whereas 2:ncol(mat) would count down to column 1.
	  for (j in seq_len(ncol(mat))[-1]) {
	    column_name <- colnames(mat)[j]
	    if (grepl("Group", column_name)) {
	      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
	    }
	    if (grepl("Age", column_name)) {
	      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
	    }
	    if (grepl("Sex", column_name)) {
	      mat[, j] <- gsub(".+:\\s", "", mat[, j])
	    }
	    if (grepl("PMI", column_name)) {
	      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
	    }
	    if (grepl("Braak", column_name)) {
	      mat[, j] <- as.integer(utils::as.roman(gsub(".+:\\s", "", mat[, j])))
	    }
	  }
	  mat
	}
	86
	#3#Function for labeling the gene IDs without names
	#
	# Fills in missing gene names with the gene ID of the same row.
	#
	# GIDNAM: two-dimensional table whose first column holds the gene IDs and
	#         whose second column holds the gene names.
	#
	# A name counts as missing when it is NA or consists of the text "NA"
	# optionally followed by whitespace. Returns `GIDNAM` with those entries of
	# column 2 replaced by the value of column 1; an empty table is returned
	# unchanged.
	NAFIXING <- function(GIDNAM) {
	  gene_names <- GIDNAM[, 2, drop = TRUE]
	  # grepl() yields FALSE for NA, so the mask never contains NA itself.
	  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)
	  if (any(unnamed)) {
	    GIDNAM[unnamed, 2] <- GIDNAM[unnamed, 1, drop = TRUE]
	  }
	  GIDNAM
	}
	99
	#4#Function for changing the gene ID to gene name
	#
	# Replaces the gene IDs in the first row of `DATA` by their gene names.
	#
	# GeneName: two-row matrix; row 1 holds the gene IDs, row 2 the matching
	#           gene names.
	# DATA:     matrix whose first row holds the gene IDs to translate.
	#
	# Returns `DATA` with every first-row entry that equals one of the IDs
	# replaced by the corresponding name; all other entries are left as they
	# are. When an ID occurs more than once in `GeneName`, its first name is
	# used.
	#
	# IDs are compared as whole strings. Treating each ID as a regular
	# expression prefix (as before) rewrote unrelated entries, e.g. ID "1"
	# turned the entry "10" into "<name>0".
	cgeneID <- function(GeneName, DATA) {
	  ids <- as.character(GeneName[1, ])
	  labels <- as.character(GeneName[2, ])
	  # incomparables = NA keeps a missing ID from pairing with a missing entry.
	  hit <- match(as.character(DATA[1, ]), ids, incomparables = NA)
	  found <- !is.na(hit)
	  DATA[1, found] <- labels[hit[found]]
	  DATA
	}
	112
	#5#Function for adjusting the gene names
	#
	# Picks one gene name out of "///"-separated column names.
	#
	# DiData: table whose column names may list several gene names separated
	#         by "///".
	# usecol: position of the name to keep (default 1). Column names with
	#         fewer than `usecol` parts fall back to their first part.
	#
	# Returns a character vector with one entry per column of `DiData`
	# (character(0) for a table without columns). The parts are returned as
	# split, i.e. surrounding whitespace is not removed.
	gcnames <- function(DiData, usecol = 1) {
	  if (ncol(DiData) == 0) {
	    return(character(0))
	  }
	  name_parts <- strsplit(colnames(DiData), "///")
	  vapply(
	    name_parts,
	    function(parts) {
	      if (length(parts) >= usecol) parts[usecol] else parts[1]
	    },
	    character(1),
	    USE.NAMES = FALSE
	  )
	}
	129
	130
	131
	132	#The Rest of this code will be used every time you want to change a data set
	133
	134	#Getting the series matrix file
	# Ask for the GEO series matrix file that is going to be cleaned.
	print("Choose the series matrix file that you want to Analyze")
	alz <- file.choose()

	#Getting the GPL file
	# Ask for the platform (GPL) annotation file belonging to that series.
	print("Choose the GPL file that correlates with the above series matrix file")
	genena <- file.choose()


	#Set working directory based on the directory of the series matrix file Currently only works for windows
	# (disabled)
	##strsplit(alz,"[\\]") %>%
	## .[[1]] %>%
	## .[-length(.)] %>%
	## paste(.,collapse="/") %>%
	## setwd()

	#Find out if it is a soft GPL file or not
	# The GPL file is treated as "soft" when the last component of its path
	# (split on "\", "|" or "/") contains the text "soft".
	soft <- strsplit(genena, "[\\|/]") %>%
	  .[[1]] %>%
	  .[length(.)] %>%
	  grepl("soft", .)
	155
	156	#Working with the wordy part of the document
	# Read the series matrix, dropping "!Series" lines as comments, and keep
	# only the "!Sample" rows other than the "!Sample_contact" ones.
	alzword <- read_delim(alz, delim = "\t", comment = "!Series", col_names = FALSE)
	alzword <- filter(alzword, grepl("!Sample", X1))
	alzword <- filter(alzword, !grepl("!Sample_contact", X1))

	##Changing row names and column names:
	# One row per sample after transposing; columns get default names first so
	# that chngrownm() can overwrite the ones it recognises.
	ALZWORD <- t(alzword)
	rownames(ALZWORD) <- NULL
	colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
	# Row 1 holds the field tags and is dropped once the columns are renamed.
	ALZWORD <- chngrownm(ALZWORD)
	ALZWORD <- as.data.frame(ALZWORD[-1, ])
	# Columns that kept a default "col..." name were not recognised.
	ALZWORD <- dplyr::select(ALZWORD, -starts_with("col"))

	##Reorganizing information within the columns
	ALZWORDF <- cinfo(ALZWORD)
	173
	174
	175	#Working with Actual Data part of file
	# Read the numeric table of the series matrix: every "!" line is a comment
	# and the first line of the file is skipped.
	alzdat <- read_delim(alz, delim = "\t", col_names = TRUE, comment = "!", skip = 1)
	# Drop the first column and transpose the rest.
	ALZDAT <- t(alzdat[, -1])
	rownames(ALZDAT) <- NULL
	180
	181	##Is there a clean version of the GPL file available?
	# Digits of the GPL file name (last path component), e.g. "570".
	gplnum <- strsplit(genena, "[\\|/]") %>%
	  .[[1]] %>%
	  .[length(.)] %>%
	  gsub("\\D", "", .)

	# The cleaned ID/name table of a platform is cached under this name in the
	# working directory. The check is on the exact file name, so that e.g.
	# "Clean_GPL5700.txt" is not mistaken for the cache of GPL570.
	clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")
	clfileex <- as.integer(file.exists(clean_gpl_file))

	# Number of lines to skip in a soft GPL file before its table header:
	# among the cells of the first 1000 non-"!" lines, count those that start
	# with a non-digit character and subtract one.
	gpl_header_offset <- function(path) {
	  leading_cells <- path %>%
	    read_delim(delim = "\t", col_names = FALSE, comment = "!", n_max = 1000) %>%
	    t(.)
	  length(grep("^\\D", leading_cells)) - 1
	}

	if (clfileex >= 1) {
	  #use the clean version
	  geneIDNam <- clean_gpl_file %>%
	    read_delim(delim = "\t", col_names = c("ID", "Symbol"), comment = "!")
	} else {
	  ##Lets Create a clean version

	  ##Gene ID to Gene Name
	  if (soft) {
	    # "GPL_ID_LOC.txt" remembers, per GPL number (GPL_FILE_NUM), the skip
	    # count (LOC_ID) found for its soft file.
	    loc_file <- "GPL_ID_LOC.txt"
	    gpl_id <- as.integer(gplnum)

	    if (file.exists(loc_file)) {
	      #Check to see if this GPL soft file has been used before
	      # Compared as whole numbers: a substring test would also pair 570
	      # with 5700 and leave no branch to define geneIDNam.
	      loc_table <- read_delim(loc_file, delim = "\t", col_names = TRUE)
	      known <- which(as.integer(loc_table$GPL_FILE_NUM) == gpl_id)
	      if (length(known) > 0) {
	        idLOCGPL <- loc_table$LOC_ID[known[1]]
	      } else {
	        #No information on this particular GPL file
	        idLOCGPL <- gpl_header_offset(genena)
	        cat(gpl_id, as.integer(idLOCGPL),
	            file = loc_file, sep = "\t", fill = TRUE, append = TRUE)
	      }
	    } else {
	      #We must create a file that we can access for later use
	      idLOCGPL <- gpl_header_offset(genena)
	      Firstval <- cbind(gpl_id, as.integer(idLOCGPL))
	      colnames(Firstval) <- c("GPL_FILE_NUM", "LOC_ID")
	      write.table(Firstval, file = loc_file, sep = "\t",
	                  row.names = FALSE, col.names = TRUE)
	    }

	    geneIDNam <- genena %>%
	      read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
	      dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))
	  } else {
	    geneIDNam <- genena %>%
	      read_delim(delim = "\t", comment = "#") %>%
	      dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))
	  }

	  ##Labeling the gene IDs without names
	  geneIDNam <- NAFIXING(geneIDNam)

	  ##remove the whitespace
	  # Only the first two columns (ID and first name column) are kept.
	  geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1, ]), str_trim(t(geneIDNam)[2, ])))

	  ##Here is the clean version
	  write.table(geneIDNam, file = clean_gpl_file, sep = "\t",
	              row.names = FALSE, col.names = FALSE)
	}
	269
	270
	271
	272	##Changing the gene ID to gene name
	# Translate the gene IDs in the first row of the transposed data table and
	# use the result as column names of the expression matrix.
	ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
	colnames(ALZDAT) <- ALZDAT1[1, ]


	##Adjusting the column names aka the gene names
	# Keep a single name where a column lists several separated by "///".
	colnames(ALZDAT) <- gcnames(ALZDAT)


	#Full Data
	# Sample metadata columns first, expression columns after them.
	Fullalzdw <- cbind(ALZWORDF, as.data.frame(ALZDAT))
	285
	286
	287	#nfna <- strsplit(alz,"[\\\|/]") %>%
	288	# .[[1]] %>%
	289	# .[length(.)] %>%
	290	# gsub("\\D","",.) %>%
	291	# c("GSE",.,"after.txt") %>%
	292	# paste(collapse = "")
	293	#write.matrix(Fullalzdw,file = nfna,sep = "\t")
	294
	#Perfect for excel viewing
	# Output name: "GSE" + the digits of the series matrix file name +
	# "aftexcel.txt". The path is split on "\", "|" and "/" (the same split
	# used for the GPL file) so that only the file name contributes digits;
	# splitting on "\" alone let the digits of every directory of a
	# forward-slash path leak into the name.
	nfnaex <- strsplit(alz, "[\\|/]") %>%
	  .[[1]] %>%
	  .[length(.)] %>%
	  gsub("\\D", "", .) %>%
	  c("GSE", ., "aftexcel.txt") %>%
	  paste(collapse = "")
	# NOTE(review): the table is written transposed with row.names = FALSE,
	# which drops the row names of t(Fullalzdw) -- confirm that this is wanted.
	write.table(t(Fullalzdw), file = nfnaex, sep = "\t", row.names = FALSE)