Efrain Gonzalez / Cleaning and Fixing Data with R

Browse Code »

Commit cc59b7f8323cecc33ca1facf20f024c2a1b5a73e

Authored by Efrain Gonzalez 2017-05-30 11:51:02 -0400

1 parent c0625ba184

Exists in master

Second version of code (UNTESTED)

Showing 1 changed file with 284 additions and 0 deletions Show diff stats

RClean2.R

Diff comments View file @ cc59b7f

File was created	1	#Libraries required to run the code
	2	library(pryr)
	3	library(MASS)
	4	library(dplyr)
	5	library(tidyr)
	6	library(readr)
	7	library(stringr)
	8
	9
	10	#Necessary Functions
	11	#1#Function for renaming the columns of the sample-annotation matrix
	# Rename the columns of a transposed GEO sample-annotation matrix.
	#
	# Args:
	#   mat: matrix whose columns already have names. Row 1 is compared with
	#        the "!Sample_*" field names; row 2 (the first sample's values) is
	#        searched for keywords describing sex, PMI, age, Braak stage or
	#        diagnosis group.
	# Returns:
	#   `mat` with only its column names changed. Columns that match nothing
	#   keep their existing name.
	#
	# Each column receives at most one label. Header-based labels (Brain_Region,
	# Title, ID_REF) take precedence; otherwise the first matching keyword
	# pattern in the order Group, Braak, Age, PMI, Sex wins. A numbering counter
	# is only advanced for the label that is actually assigned, so e.g.
	# "braak stage: III" (which also contains "age") no longer uses up "Age1".
	chngrownm <- function(mat) {
	  header_labels <- c(
	    "!Sample_source_name_ch1" = "Brain_Region",
	    "!Sample_title" = "Title",
	    "!Sample_geo_accession" = "ID_REF"
	  )
	  # Listed from highest to lowest priority.
	  value_patterns <- c(
	    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control",
	    Braak = "braak|b&b",
	    Age = "age|Age|AGE",
	    PMI = "postmorteminterval|PMI|pmi",
	    Sex = "Sex|gender|Gender|sex"
	  )
	  next_index <- setNames(rep(1L, length(value_patterns)), names(value_patterns))
	  has_values <- nrow(mat) >= 2

	  for (j in seq_len(ncol(mat))) {
	    field <- as.character(mat[1, j])
	    if (!is.na(field) && field %in% names(header_labels)) {
	      colnames(mat)[j] <- header_labels[[field]]
	      next
	    }
	    if (!has_values) {
	      next
	    }
	    hits <- vapply(value_patterns, grepl, logical(1), x = mat[2, j])
	    if (!any(hits)) {
	      next
	    }
	    label <- names(value_patterns)[which(hits)[1]]
	    colnames(mat)[j] <- paste0(label, next_index[[label]])
	    next_index[[label]] <- next_index[[label]] + 1L
	  }
	  mat
	}
	57
	58	#2#Function for reorganizing information within the columns
	# Clean the values inside the labelled annotation columns.
	#
	# Args:
	#   mat: data frame whose column names contain the labels "Group", "Age",
	#        "Sex", "PMI" or "Braak". The first column is never modified.
	# Returns:
	#   `mat` with, per labelled column:
	#     Group - text up to the last ": " removed (and a trailing
	#             " ...;..." section removed)
	#     Age   - all non-digit characters removed, converted to integer
	#     Sex   - text up to the last ": " removed
	#     PMI   - everything except digits and "." removed, converted to numeric
	#     Braak - text up to the last ": " removed, the remaining Roman numeral
	#             converted to integer
	cinfo <- function(mat) {
	  # seq_len(...)[-1] is empty for inputs with fewer than two columns, so the
	  # loop never reaches column 1 (2:col would count backwards there).
	  for (j in seq_len(ncol(mat))[-1]) {
	    label <- colnames(mat)[j]
	    if (grepl("Group", label)) {
	      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
	    }
	    if (grepl("Age", label)) {
	      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
	    }
	    if (grepl("Sex", label)) {
	      mat[, j] <- gsub(".+:\\s", "", mat[, j])
	    }
	    if (grepl("PMI", label)) {
	      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
	    }
	    if (grepl("Braak", label)) {
	      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
	    }
	  }
	  mat
	}
	86
	87	#3#Function for labeling the gene IDs without names
	# Give every unnamed gene a unique placeholder name ("NA1", "NA2", ...).
	#
	# Args:
	#   GIDNAM: data frame (or matrix) whose second column holds gene names.
	# Returns:
	#   `GIDNAM` with each entry of column 2 that is missing (real NA) or that
	#   is the text "NA" (optionally followed by whitespace) replaced by "NA<k>",
	#   numbered in row order. The input is returned untouched when nothing
	#   needs fixing.
	NAFIXING <- function(GIDNAM) {
	  is_df <- is.data.frame(GIDNAM)
	  gene_names <- as.character(if (is_df) GIDNAM[[2]] else GIDNAM[, 2])

	  # A real NA never matches a regex, so it is tested for explicitly.
	  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)
	  if (!any(unnamed)) {
	    return(GIDNAM)
	  }

	  gene_names[unnamed] <- paste0("NA", seq_len(sum(unnamed)))
	  if (is_df) {
	    GIDNAM[[2]] <- gene_names
	  } else {
	    GIDNAM[, 2] <- gene_names
	  }
	  GIDNAM
	}
	101
	102	#4#Function for changing the gene ID to gene name
	# Replace the probe IDs in the first row of DATA by their gene names.
	#
	# Args:
	#   GeneName: two-row matrix; row 1 holds probe IDs, row 2 the gene name
	#             for the ID in the same column.
	#   DATA:     matrix whose first row holds probe IDs.
	# Returns:
	#   `DATA` with every first-row entry that is exactly equal (ignoring
	#   surrounding whitespace) to an ID in GeneName replaced by that ID's gene
	#   name. Entries without a matching ID are left unchanged.
	#
	# IDs are compared as whole strings. Treating them as regular expressions
	# lets a short ID such as "1_at" match inside "11_at" and corrupt it.
	cgeneID <- function(GeneName, DATA) {
	  ids <- trimws(as.character(unlist(GeneName[1, ])))
	  gene_names <- as.character(unlist(GeneName[2, ]))
	  keys <- trimws(as.character(unlist(DATA[1, ])))

	  hit <- match(keys, ids)
	  found <- !is.na(hit)
	  DATA[1, found] <- gene_names[hit[found]]
	  DATA
	}
	113
	114	#5#Function for adjusting the gene names
	# Pick one gene name per column from "///"-separated column names.
	#
	# Args:
	#   DiData: matrix or data frame whose column names may list several gene
	#           names separated by "///".
	#   usecol: position of the name to keep (default 1). When a column has
	#           fewer than `usecol` names, its first name is used.
	# Returns:
	#   Character vector with one name per column, surrounding whitespace
	#   removed (so "A /// B" gives "A" or "B", not "A " or " B"). Empty for an
	#   input without columns.
	gcnames <- function(DiData, usecol = 1) {
	  pieces <- strsplit(as.character(colnames(DiData)), "///", fixed = TRUE)
	  vapply(
	    pieces,
	    function(p) {
	      chosen <- if (length(p) >= usecol) p[usecol] else p[1]
	      trimws(chosen)
	    },
	    character(1),
	    USE.NAMES = FALSE
	  )
	}
	130
	131
	132
	133	#The rest of this code is run each time you want to clean a new data set
	134
	135	#Getting the series matrix file
	# Ask the user (interactive file dialog) for the series matrix file.
	print("Choose the series matrix file that you want to Analyze")
	alz <- file.choose()

	# Ask the user for the GPL platform file that belongs to that series.
	print("Choose the GPL file that correlates with the above series matrix file")
	genena <- file.choose()


	#Set working directory based on the directory of the series matrix file Currently only works for windows
	##strsplit(alz,"[\\]") %>%
	## .[[1]] %>%
	## .[-length(.)] %>%
	## paste(.,collapse="/") %>%
	## setwd()

	# Find out if it is a soft GPL file or not: TRUE when the file name (without
	# its directory) contains "soft". basename() strips the directory using the
	# platform's path separators, so no hand-written separator regex is needed.
	soft <- grepl("soft", basename(genena))
	156
	157	#Working with the wordy part of the document
	# Sample annotation ("wordy") part of the series matrix: keep the "!Sample"
	# lines, except the "!Sample_contact" ones. "!Series" lines are skipped as
	# comments by the reader.
	alzword <- read_delim(alz, delim = "\t", comment = "!Series", col_names = FALSE) %>%
	  filter(grepl("!Sample", X1)) %>%
	  filter(!grepl("!Sample_contact", X1))

	##Changing row names and column names:
	# After transposing, row 1 holds the field names and each following row one
	# sample. Columns get placeholder names ("col1", "col2", ...) so that
	# chngrownm() can rename the ones it recognises.
	ALZWORD <- t(alzword)
	rownames(ALZWORD) <- NULL
	colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
	# drop = FALSE keeps a matrix even when only a single sample row remains
	# after the field-name row is removed.
	ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
	# Columns still carrying a placeholder name were not recognised; drop them.
	ALZWORD <- ALZWORD %>%
	  as.data.frame() %>%
	  dplyr::select(-starts_with("col"))

	##Reorganizing information within the columns
	ALZWORDF <- cinfo(ALZWORD)
	174
	175
	176	#Working with Actual Data part of file
	# Expression table of the series matrix: lines starting with "!" are
	# skipped as comments, the first line of the file is skipped, and the
	# next remaining line supplies the column names.
	alzdat <- read_delim(alz, delim = "\t", col_names = TRUE, comment = "!", skip = 1)

	# Transpose everything except the first column and discard the row names.
	ALZDAT <- t(alzdat[, -1])
	rownames(ALZDAT) <- NULL
	181
	182
	183	##Gene ID to Gene Name
	184	###geneIDNam <- genena %>%
	185	### read_delim(delim="\t",comment = "#")%>%
	186	### dplyr::select(.,ID,grep("Symbol\|ORF",colnames(.)))
	187	###problems with the above for soft files
	# Read a GPL annotation table and keep the ID column plus every column whose
	# name contains "Symbol" or "ORF". Extra arguments (comment, skip,
	# col_names) are forwarded to read_delim().
	read_gpl_annotation <- function(path, ...) {
	  read_delim(path, delim = "\t", ...) %>%
	    dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))
	}

	# Number of lines to skip before the annotation table of a SOFT file:
	# the count of cells, among the first 1000 non-"!" rows, that start with a
	# non-digit character, minus one.
	# NOTE(review): heuristic carried over unchanged from the untested original;
	# confirm it locates the table header for the GPL files in use.
	count_soft_preamble <- function(path) {
	  preview <- read_delim(path, delim = "\t", col_names = FALSE, comment = "!", n_max = 1000)
	  length(grep("^\\D", t(preview))) - 1
	}

	if (soft == TRUE) {
	  # Digits of the GPL file name, e.g. "96" for a file named after GPL96.
	  gplnum <- gsub("\\D", "", basename(genena))

	  # GPL_ID_LOC.txt (in the working directory) caches the skip count found
	  # for each SOFT file: columns GPL_FILE_NUM and LOC_ID.
	  gpl_index_file <- "GPL_ID_LOC.txt"
	  index_exists <- file.exists(gpl_index_file)

	  idLOCGPL <- NULL
	  if (index_exists) {
	    gpl_index <- read_delim(gpl_index_file, delim = "\t", col_names = TRUE)
	    # Exact comparison: a substring search would let "96" match "9600" and
	    # leave geneIDNam undefined whenever the match count is neither 0 nor 1.
	    known <- which(gpl_index$GPL_FILE_NUM == as.integer(gplnum))
	    if (length(known) > 0) {
	      idLOCGPL <- gpl_index$LOC_ID[known[1]]
	    }
	  }

	  if (is.null(idLOCGPL)) {
	    # No cached entry for this GPL file: compute the skip count and store it.
	    idLOCGPL <- count_soft_preamble(genena)
	    new_entry <- cbind(GPL_FILE_NUM = as.integer(gplnum), LOC_ID = as.integer(idLOCGPL))
	    if (index_exists) {
	      cat(new_entry, file = gpl_index_file, sep = "\t", fill = TRUE, append = TRUE)
	    } else {
	      write.table(new_entry, file = gpl_index_file, sep = "\t", row.names = FALSE, col.names = TRUE)
	    }
	  }

	  geneIDNam <- read_gpl_annotation(genena, col_names = TRUE, comment = "!", skip = idLOCGPL)
	} else {
	  # Plain (non-SOFT) GPL table: header lines start with "#".
	  geneIDNam <- read_gpl_annotation(genena, comment = "#")
	}
	246
	247	##Labeling the gene IDs without names
	# Give unnamed genes unique placeholder names.
	geneIDNam <- NAFIXING(geneIDNam)

	# Strip surrounding whitespace from the first two columns; the result is a
	# two-column character matrix.
	geneIDNam <- cbind(str_trim(t(geneIDNam)[1, ]), str_trim(t(geneIDNam)[2, ]))

	# Translate the first row of the transposed expression table via cgeneID()
	# and use the result as the column names of ALZDAT.
	ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
	colnames(ALZDAT) <- ALZDAT1[1, ]


	# Reduce "///"-separated column names to a single name per column.
	colnames(ALZDAT) <- gcnames(ALZDAT)
	260
	261
	262	#Full Data
	# Full data: cleaned annotation columns followed by the ALZDAT columns.
	Fullalzdw <- cbind(ALZWORDF, as.data.frame(ALZDAT))

	# Output names are built from the digits in the series matrix FILE name only.
	# basename() removes the directory on every platform; splitting on "\\"
	# alone only works for Windows paths and otherwise lets digits from
	# directory names leak into the output name.
	series_digits <- gsub("\\D", "", basename(alz))

	# Tab-separated table, written to the working directory.
	nfna <- paste0("GSE", series_digits, "after.txt")
	write.matrix(Fullalzdw, file = nfna, sep = "\t")

	# Transposed copy of the same table (noted by the author as "Perfect for
	# excel viewing").
	nfnaex <- paste0("GSE", series_digits, "aftexcel.txt")
	write.table(t(Fullalzdw), file = nfnaex, sep = "\t")
	283
	284
	285