Efrain Gonzalez / Cleaning and Fixing Data with R

Browse Code »

Commit a66a63dc504c3814768a1a79640a405d82ea0524

Authored by Efrain Gonzalez 2017-05-26 12:14:11 -0400

1 parent 061033644f

Exists in master

First version of the cleaning process in R

Showing 1 changed file with 198 additions and 0 deletions Show diff stats

Rclean.txt

Diff comments View file @ a66a63d

File was created	1	#Libraries required to run the code
	2	library(MASS)
	3	library(pryr)
	4	library(dplyr)
	5	library(tidyr)
	6	library(readr)
	7	library(stringr)
	8
	9
	10	#Necessary Functions
	#1# Rename the columns of a transposed sample-annotation matrix.
	#
	# Row 1 of `mat` holds the field label of each column and row 2 holds the
	# first value of that field. Columns are renamed as follows:
	#   * label "!Sample_source_name_ch1" -> "Brain_Region"
	#   * label "!Sample_title"           -> "Title"
	#   * label "!Sample_geo_accession"   -> "ID_REF"
	#   * any other label: the row-2 value is tested against keyword patterns
	#     and the column becomes "Sex<n>", "PMI<n>", "Age<n>", "Braak<n>" or
	#     "Group<n>", where <n> is a per-category running counter.
	# Columns that match nothing keep their current name.
	#
	# Args:
	#   mat: a matrix (in this script a character matrix) whose first row holds
	#        the field labels. If it has no column names, default names
	#        ("col1", "col2", ...) are created first.
	# Returns:
	#   `mat` with the same contents and updated column names.
	chngrownm <- function(mat){
	  if (is.null(colnames(mat)) && ncol(mat) > 0) {
	    colnames(mat) <- colnames(mat, do.NULL = FALSE)
	  }

	  # Labels that map to one fixed column name.
	  known_headers <- c(
	    "!Sample_source_name_ch1" = "Brain_Region",
	    "!Sample_title" = "Title",
	    "!Sample_geo_accession" = "ID_REF"
	  )

	  # Running counters used to number the keyword-matched columns.
	  sex_n <- 1
	  pmi_n <- 1
	  age_n <- 1
	  braak_n <- 1
	  group_n <- 1

	  # Keyword matching reads row 2, so it is skipped when there is no row 2.
	  has_values <- nrow(mat) >= 2

	  for (j in seq_len(ncol(mat))) {
	    header <- as.character(mat[1, j])

	    # A known label decides the name on its own. The value is deliberately
	    # NOT keyword-matched afterwards, so a title such as "AD_sample_1" can
	    # no longer turn the Title column into a Group column. `%in%` is used
	    # because it is FALSE (not NA) for a missing label.
	    if (header %in% names(known_headers)) {
	      colnames(mat)[j] <- known_headers[[header]]
	      next
	    }
	    if (!has_values) {
	      next
	    }

	    value <- mat[2, j]

	    # The checks are intentionally sequential, not exclusive: a value such as
	    # "braak stage: III" matches "age" first and "braak" afterwards, and the
	    # last matching category wins. Every match advances its counter.
	    if (grepl("Sex|gender|Gender|sex", value)) {
	      colnames(mat)[j] <- paste0("Sex", sex_n)
	      sex_n <- sex_n + 1
	    }
	    if (grepl("postmorteminterval|PMI|pmi", value)) {
	      colnames(mat)[j] <- paste0("PMI", pmi_n)
	      pmi_n <- pmi_n + 1
	    }
	    if (grepl("age|Age|AGE", value)) {
	      colnames(mat)[j] <- paste0("Age", age_n)
	      age_n <- age_n + 1
	    }
	    if (grepl("braak|b&b", value)) {
	      colnames(mat)[j] <- paste0("Braak", braak_n)
	      braak_n <- braak_n + 1
	    }
	    if (grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control", value)) {
	      colnames(mat)[j] <- paste0("Group", group_n)
	      group_n <- group_n + 1
	    }
	  }
	  mat
	}
	57
	#2# Clean the values of the annotation columns, based on the column names.
	#
	# Every column except the first is inspected; what happens depends on the
	# text found in its name:
	#   * "Group": strips everything up to the first ": " and any trailing
	#              " ...;..." remainder.
	#   * "Age":   removes every non-digit character and converts to integer
	#              (note: a decimal point is removed too, so "85.5" -> 855).
	#   * "Sex":   strips everything up to ": ".
	#   * "PMI":   keeps digits and "." only and converts to numeric.
	#   * "Braak": strips everything up to ": ", reads the rest as a roman (or
	#              arabic) numeral and converts to integer.
	#
	# Args:
	#   mat: a data frame (as used in this script) with named columns.
	# Returns:
	#   `mat` with the matching columns cleaned; other columns are untouched.
	cinfo <- function(mat){
	  col_names <- colnames(mat)
	  if (is.null(col_names)) {
	    # Nothing can be matched without column names.
	    return(mat)
	  }

	  # Column 1 is skipped. seq_len(...)[-1] is empty for 0 or 1 columns,
	  # whereas 2:col would count down to c(2, 1) for a single column.
	  for (j in seq_len(ncol(mat))[-1]) {
	    nm <- col_names[j]

	    if (grepl("Group", nm)) {
	      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
	    }
	    if (grepl("Age", nm)) {
	      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
	    }
	    if (grepl("Sex", nm)) {
	      mat[, j] <- gsub(".+:\\s", "", mat[, j])
	    }
	    if (grepl("PMI", nm)) {
	      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
	    }
	    if (grepl("Braak", nm)) {
	      stage <- gsub(".+:\\s", "", mat[, j])
	      mat[, j] <- as.integer(utils::as.roman(stage))
	    }
	  }
	  mat
	}
	86
	#3# Replace the gene (probe) IDs in the first row of DATA by gene names.
	#
	# Args:
	#   GeneName: a matrix whose row 1 holds the IDs and whose row 2 holds the
	#             name belonging to each ID (one column per ID).
	#   DATA:     a matrix whose row 1 holds the IDs to translate.
	# Returns:
	#   DATA with every first-row entry that equals an ID in GeneName replaced
	#   by the corresponding name (NA when that name is missing). Entries with
	#   no matching ID, and all other rows, are left unchanged.
	cgeneID <- function(GeneName,DATA){
	  ids <- as.character(GeneName[1, ])
	  gene_names <- as.character(GeneName[2, ])
	  current <- as.character(DATA[1, ])

	  # Exact lookup. Using each ID as a regular expression (grep/gsub) also
	  # rewrote longer IDs that merely contain a shorter one, e.g. "1_at"
	  # inside "11_at", and rescanned the whole row once per gene.
	  idx <- match(current, ids)
	  hit <- !is.na(idx) & !is.na(current)
	  if (any(hit)) {
	    DATA[1, hit] <- gene_names[idx[hit]]
	  }
	  DATA
	}
	98
	#4# Pick one gene name per column when a column name lists several.
	#
	# Column names of the form "A /// B /// C" are split on "///".
	#
	# Args:
	#   DiData: a matrix or data frame with column names.
	#   usecol: which of the "///"-separated names to keep (default: the first).
	#           When a column has fewer than `usecol` names, its first name is
	#           used instead.
	# Returns:
	#   A character vector with one name per column of DiData. Surrounding
	#   whitespace left over from the split is removed.
	gcnames <- function(DiData,usecol=1){
	  if (ncol(DiData) == 0) {
	    return(character(0))
	  }
	  col_names <- colnames(DiData)
	  if (is.null(col_names)) {
	    stop("DiData must have column names", call. = FALSE)
	  }

	  # Split every name once; each list element holds the alternatives of one
	  # column.
	  pieces <- strsplit(col_names, "///")
	  chosen <- vapply(
	    pieces,
	    function(p) if (length(p) >= usecol) p[usecol] else p[1],
	    character(1),
	    USE.NAMES = FALSE
	  )

	  # "A /// B" splits into "A " and " B"; drop the padding.
	  trimws(chosen)
	}
	115
	116
	117
	# The rest of this code is run every time a data set has to be cleaned.
	# It asks for a series matrix file and its GPL file, builds one table
	# holding the sample annotation and the expression values, and writes that
	# table to two tab-separated text files in the current working directory.

	# Getting the series matrix file
	print("Choose the series matrix file that you want to Analyze")
	alz <- file.choose()

	# Getting the GPL file
	print("Choose the GPL file that correlates with the above series matrix file")
	genena <- file.choose()


	# Set working directory based on the directory of the series matrix file
	# (disabled; kept for reference)
	##strsplit(alz,"[\\]") %>%
	## .[[1]] %>%
	## .[-length(.)] %>%
	## paste(.,collapse="/") %>%
	## setwd()


	# Working with the wordy part of the document:
	# keep the "!Sample" rows, except the "!Sample_contact" ones.
	alzword <- alz %>%
	  read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
	  filter(grepl("!Sample", X1)) %>%
	  filter(!grepl("!Sample_contact", X1))

	## Changing row names and column names:
	ALZWORD <- t(alzword)
	rownames(ALZWORD) <- NULL
	# Give every column a default "col<n>" name so that the columns chngrownm()
	# does not recognise can be dropped by prefix below.
	colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
	# Drop the label row; drop = FALSE keeps a matrix even with a single sample.
	ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
	ALZWORD <- ALZWORD %>%
	  as.data.frame() %>%
	  dplyr::select(-starts_with("col"))

	## Reorganizing information within the columns
	ALZWORDF <- cinfo(ALZWORD)


	# Working with the actual data part of the file
	alzdat <- alz %>%
	  read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = 1)
	# One row per sample, one column per gene ID; the ID column is left out.
	ALZDAT <- t(alzdat[, -1])
	rownames(ALZDAT) <- NULL


	## Gene ID to gene name: keep the ID column plus the columns whose name
	## contains "Symbol" or "ORF".
	geneIDNam <- genena %>%
	  read_delim(delim = "\t", comment = "#") %>%
	  dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))

	## Changing the ID to a name
	ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
	colnames(ALZDAT) <- ALZDAT1[1, ]


	## Adjusting the column names aka the gene names
	colnames(ALZDAT) <- gcnames(ALZDAT)


	# Full data: annotation columns followed by the expression columns
	Fullalzdw <- ALZDAT %>%
	  as.data.frame() %>%
	  cbind(ALZWORDF, .)

	## since the order in which the packages are added matters I moved this package to the top
	##library(MASS)

	# The output names are "GSE" + the digits of the chosen file's NAME + a
	# suffix. Backslashes are normalised first so that basename() isolates the
	# file name for both "\" and "/" separated paths; splitting on "\" alone
	# let digits from the directory part leak into the name on non-Windows
	# systems.
	gse_digits <- gsub("\\D", "", basename(gsub("\\", "/", alz, fixed = TRUE)))

	nfna <- paste0("GSE", gse_digits, "after.txt")
	MASS::write.matrix(Fullalzdw, file = nfna, sep = "\t")

	# Perfect for excel viewing (transposed table)
	nfnaex <- paste0("GSE", gse_digits, "aftexcel.txt")
	write.table(t(Fullalzdw), file = nfnaex, sep = "\t")
	199