#RClean4.R (8.08 KB)
#Libraries required to run the code
library(pryr)
library(MASS)
library(dplyr)
library(tidyr)
library(readr)
library(stringr)


#Necessary Functions
#1#Function for handling the changing of row names and column names
chngrownm <- function(mat){
	row <- dim(mat)[1]
	col <- dim(mat)[2]
	j <- 1
	x <- 1
	p <- 1	
	a <- 1 
	b <- 1
	g <- 1
	for(j in 1:col){
		if("!Sample_source_name_ch1"==mat[1,j]){
			colnames(mat)[j] <- "Brain_Region"	
		} 
		if("!Sample_title" == mat[1,j]){
			colnames(mat)[j] <- "Title"
		} 
		if("!Sample_geo_accession" == mat[1,j]){
			colnames(mat)[j] <- "ID_REF"
		} else{
			if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){
				colnames(mat)[j] <- paste0("Sex",x)
				x = x + 1
			}
			if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){
				colnames(mat)[j] <- paste0("PMI",p)
				p = p + 1
			}
			if(grepl("age|Age|AGE",mat[2,j])==TRUE){
				colnames(mat)[j] <- paste0("Age",a)
				a = a + 1
			 }
			if(grepl("braak|b&b",mat[2,j])==TRUE){
				colnames(mat)[j] <- paste0("Braak",b)
				b = b + 1
			}
			if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,j])==TRUE){
				colnames(mat)[j] <- paste0("Group",g)
				g = g + 1
			}
			
		}
		j = j + 1
	}
	mat
}			

#2#Function for reorganizing information within the columns
cinfo <- function(mat){
	col <- dim(mat)[2]
	j <-2
	for(j in 2:col){
		if(grepl("Group",colnames(mat)[j]) == TRUE){
			mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
		}
		if(grepl("Age",colnames(mat)[j])==TRUE){
			mat[,j] <- gsub("\\D","",mat[,j])%>%
				as.integer()
		}
		if(grepl("Sex",colnames(mat)[j])==TRUE){
			mat[,j] <- gsub(".+:\\s","",mat[,j])
		}
		if(grepl("PMI",colnames(mat)[j])==TRUE){
			mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
				as.numeric() 
		}
		if(grepl("Braak",colnames(mat)[j])==TRUE){
			mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
				as.roman()%>%
				as.integer()
		}
	j=j+1
	}
	mat
}

#3#Function for labeling the gene IDs without names
NAFIXING <- function(GIDNAM){
	row <- dim(GIDNAM)[1]
	i <- 1
	for(i in 1:row){
		if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){
			GIDNAM[i,2] <- GIDNAM[i,1]
		}
		i <- i + 1
	}
	GIDNAM
}	

#4#Function for changing the gene ID to gene name
cgeneID <- function(GeneName,DATA){
    colGene <- dim(GeneName)[2]
     j <- 1
     for(j in 1:colGene){
	    chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
	    if(is.na(sum(chngsreq))==FALSE){
		    if(sum(chngsreq) > 0){
			    DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
		    }
	    }
	    j = j+1
	}
	DATA
}

#5#Function for adjusting the gene names
gcnames <- function(DiData,usecol=1){
	nuruns <- dim(DiData)[2]
	i = 1
	nwnam <- rep("0",length.out=nuruns)
	for(i in 1:nuruns){
		if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){
			nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][usecol])
		} else{
			nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][1])
		}
		
	}
	nwnam

}



#The Rest of this code will be used every time you want to change a data set

#Getting the series matrix file
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

#Getting the GPL file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()


#Set working directory based on the directory of the series matrix file Currently only works for windows
##strsplit(alz,"[\\]") %>%
##	.[[1]] %>%
##	.[-length(.)] %>%
##	paste(.,collapse="/") %>%
##	setwd()

#Find out if it is a soft GPL file or not
soft <- strsplit(genena,"[\\|/]") %>%
	.[[1]] %>%
	.[length(.)] %>%
	grepl("soft|annot",.)

#Working with the wordy part of the document
alzword <- alz %>%
	read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
	filter(grepl("!Sample",X1))%>%
	filter(!grepl("!Sample_contact",X1))

##Changing row names and column names:
ALZWORD <- t(alzword)
rownames(ALZWORD)=NULL
colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
ALZWORD <- chngrownm(ALZWORD)[-1,]
ALZWORD <- ALZWORD%>%
	as.data.frame()%>%
	dplyr::select(-starts_with("col"))

##Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)


#Working with Actual Data part of file
alzdat <- alz %>% 
	read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
ALZDAT <- t(alzdat[,-1])
rownames(ALZDAT)=NULL

##Is there a clean version of the GPL file available?
gplnum <- strsplit(genena,"[\\|/]") %>%
	.[[1]] %>%
	.[length(.)] %>%
	gsub("\\D","",.)
clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
if(clfileex >= 1){
#use the clean version
geneIDNam <-  paste0("Clean_GPL",gplnum,".txt") %>%
	read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

} 
if(clfileex == 0){
##Lets Create a clean version
 
##Gene ID to Gene Name
###geneIDNam <- genena %>%
###	read_delim(delim="\t",comment = "#")%>%
###	dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
###problems with the above for soft files
	if(soft == TRUE){
		#gplnum <- strsplit(genena,"[\\|/]") %>%
		#	.[[1]] %>%
		#	.[length(.)] %>%
		#	gsub("\\D","",.)
		#Check to see if there is already a file containing information on soft files
		fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
		if(fileex == 1){
			#Check to see if this GPL soft file has been used before
			IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
				.$GPL_FILE_NUM%>%
				grepl(gplnum,.) %>%
				sum()
			if(IDF == 1){
				IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
					.$GPL_FILE_NUM%>%
					grep(gplnum,.)
				idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
					.$LOC_ID %>%
					.[IDLOCAL]
				geneIDNam <- genena %>%
					read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
					dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
			}
			if(IDF == 0){
				#No information on this particular GPL file
				idLOCGPL <- genena %>%
					read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
					t(.) %>%
					.[1,] %>%
					grep("^ID\\s*$",.) %>%
					-1
				cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
					cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
				geneIDNam <- genena %>%
					read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
					dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
			}
		}
		if(fileex == 0){
			#We must create a file that we can access for later use
			idLOCGPL <- genena %>%
				read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
				t(.) %>%
				.[1,] %>%
				grep("^ID\\s*$",.) %>%
				-1
			Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
			colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
			write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
			geneIDNam <- genena %>%
				read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
				dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
		}
	}
	if(soft == FALSE){
		geneIDNam <- genena %>%
		read_delim(delim="\t",comment = "#")%>%
		dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
	}

	##Labeling the gene IDs without names
	geneIDNam <- NAFIXING(geneIDNam)	

	##remove the whitespace
	geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
	
	##Here is the clean version
	write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
}



##Changing the gene ID to gene name
ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
colnames(ALZDAT) = ALZDAT1[1,]


##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)


#Full Data
Fullalzdw <- ALZDAT %>%
	as.data.frame() %>%
	cbind(ALZWORDF,.)


#nfna <- strsplit(alz,"[\\|/]") %>%
#	.[[1]] %>%
#	.[length(.)] %>%
#	gsub("\\D","",.) %>%
#	c("GSE",.,"after.txt") %>%
#	paste(collapse = "")
#write.matrix(Fullalzdw,file = nfna,sep = "\t")

#Perfect for excel viewing
nfnaex <- strsplit(alz,"[\\]") %>%
	.[[1]] %>%
	.[length(.)] %>%
	gsub("\\D","",.) %>%
	c("GSE",.,"aftexcel.txt") %>%
	paste(collapse = "")
write.table(t(Fullalzdw), file = nfnaex, sep = "\t")