From cc59b7f8323cecc33ca1facf20f024c2a1b5a73e Mon Sep 17 00:00:00 2001
From: Efrain Gonzalez <egonz340@fiu.edu>
Date: Tue, 30 May 2017 11:51:02 -0400
Subject: [PATCH] Second version of code (UNTESTED)

---
 RClean2.R | 284 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 284 insertions(+)
 create mode 100644 RClean2.R

diff --git a/RClean2.R b/RClean2.R
new file mode 100644
index 0000000..cb8981d
--- /dev/null
+++ b/RClean2.R
@@ -0,0 +1,284 @@
+#Libraries required to run the code
+library(pryr)
+library(MASS)
+library(dplyr)
+library(tidyr)
+library(readr)
+library(stringr)
+
+
+#Necessary Functions
+#1#Function for handling the changing of row names and column names
+#  mat: character matrix with column names already set; row 1 holds the
+#       "!Sample_*" field labels and row 2 the first sample's values.
+#  Returns mat with recognised columns renamed to Brain_Region, Title, ID_REF or
+#  Sex<n>/PMI<n>/Age<n>/Braak<n>/Group<n>; all other column names are kept.
+chngrownm <- function(mat){
+	x <- 1
+	p <- 1
+	a <- 1
+	b <- 1
+	g <- 1
+	for(j in seq_len(ncol(mat))){
+		#The three fixed labels exclude each other and take priority over the
+		#keyword guesses, so a title such as "AD_1" stays the Title column
+		if(isTRUE("!Sample_source_name_ch1" == mat[1,j])){
+			colnames(mat)[j] <- "Brain_Region"
+		} else if(isTRUE("!Sample_title" == mat[1,j])){
+			colnames(mat)[j] <- "Title"
+		} else if(isTRUE("!Sample_geo_accession" == mat[1,j])){
+			colnames(mat)[j] <- "ID_REF"
+		} else{
+			#Guess from the first sample's value; the last matching check wins
+			if(grepl("Sex|gender|Gender|sex",mat[2,j])){
+				colnames(mat)[j] <- paste0("Sex",x)
+				x <- x + 1
+			}
+			if(grepl("postmorteminterval|PMI|pmi",mat[2,j])){
+				colnames(mat)[j] <- paste0("PMI",p)
+				p <- p + 1
+			}
+			if(grepl("age|Age|AGE",mat[2,j])){
+				colnames(mat)[j] <- paste0("Age",a)
+				a <- a + 1
+			}
+			if(grepl("braak|b&b",mat[2,j])){
+				colnames(mat)[j] <- paste0("Braak",b)
+				b <- b + 1
+			}
+			if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",mat[2,j])){
+				colnames(mat)[j] <- paste0("Group",g)
+				g <- g + 1
+			}
+		}
+	}
+	mat
+}
+
+#2#Function for reorganizing information within the columns
+#  mat: data frame (or matrix) whose column names were set by chngrownm.
+#  Column 1 is never modified. Returns mat with the "label: value" text
+#  stripped and the Age/PMI/Braak columns converted to numbers.
+cinfo <- function(mat){
+	#seq_len(...)[-1] is empty for a single column, whereas 2:1 counts down
+	for(j in seq_len(ncol(mat))[-1]){
+		cname <- colnames(mat)[j]
+		if(grepl("Group",cname)){
+			mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
+		}
+		if(grepl("Age",cname)){
+			#Every non-digit is dropped, so only whole-number ages survive intact
+			mat[,j] <- as.integer(gsub("\\D","",mat[,j]))
+		}
+		if(grepl("Sex",cname)){
+			mat[,j] <- gsub(".+:\\s","",mat[,j])
+		}
+		if(grepl("PMI",cname)){
+			mat[,j] <- as.numeric(gsub("[^0-9\\.]","",mat[,j]))
+		}
+		if(grepl("Braak",cname)){
+			#as.roman parses roman numerals such as "IV" before the integer cast
+			mat[,j] <- as.integer(as.roman(gsub(".+:\\s","",mat[,j])))
+		}
+	}
+	mat
+}
+
+#3#Function for labeling the gene IDs without names
+#  GIDNAM: table (tibble, data frame or matrix) whose 2nd column holds gene
+#  names. Entries that are missing (NA) or consist of the text "NA" become
+#  "NA1", "NA2", ... in row order. Returns GIDNAM.
+NAFIXING <- function(GIDNAM){
+	#Pull column 2 out as a plain character vector (tibble, data frame or matrix)
+	nms <- as.character(if(is.data.frame(GIDNAM)) GIDNAM[[2]] else GIDNAM[,2])
+	unnamed <- is.na(nms) | grepl("^NA\\s*$",nms)
+	#Nothing to label (this also covers a table with no rows)
+	if(!any(unnamed)) return(GIDNAM)
+	nms[unnamed] <- paste0("NA",seq_len(sum(unnamed)))
+	GIDNAM[,2] <- nms
+	GIDNAM
+}
+
+#4#Function for changing the gene ID to gene name
+#  GeneName: 2-row matrix (row 1 = probe IDs, row 2 = gene names).
+#  DATA: matrix whose first row holds probe IDs. Returns DATA with every ID in
+#  row 1 that exactly equals a GeneName ID replaced by the matching gene name.
+cgeneID <- function(GeneName,DATA){
+	#Exact lookup: using the IDs as regex patterns would also rewrite longer IDs
+	#that merely contain a shorter one (e.g. "10_at" inside "110_at")
+	hit <- match(DATA[1,],GeneName[1,],incomparables=NA)
+	DATA[1,!is.na(hit)] <- GeneName[2,hit[!is.na(hit)]]
+	DATA
+}
+
+#5#Function for adjusting the gene names
+#  DiData: matrix or data frame whose column names may hold several gene
+#          symbols separated by "///".
+#  usecol: position of the symbol to keep (default 1 = the first one).
+#  Returns one name per column with surrounding blanks removed; a column with
+#  fewer than usecol symbols falls back to its first symbol.
+gcnames <- function(DiData,usecol=1){
+	#A table without columns has nothing to rename (1:0 would still loop)
+	if(ncol(DiData) == 0) return(character(0))
+	pick <- function(parts){
+		if(length(parts) >= usecol) parts[usecol] else parts[1]
+	}
+	#trimws: "A /// B" splits into "A " and " B"
+	nwnam <- vapply(strsplit(colnames(DiData),"///"),pick,character(1))
+	trimws(nwnam)
+}
+
+
+
+#The Rest of this code will be used every time you want to change a data set
+
+#Ask for the series matrix file
+print("Choose the series matrix file that you want to Analyze")
+alz <- file.choose()
+
+#Ask for the matching GPL file
+print("Choose the GPL file that correlates with the above series matrix file")
+genena <- file.choose()
+
+
+#Set working directory based on the directory of the series matrix file Currently only works for windows
+##strsplit(alz,"[\\]") %>%
+##	.[[1]] %>%
+##	.[-length(.)] %>%
+##	paste(.,collapse="/") %>%
+##	setwd()
+
+#Find out if it is a soft GPL file or not
+#(only the last path component, i.e. the file name, is inspected)
+soft <- grepl(
+	"soft",
+	tail(strsplit(genena,"[\\|/]")[[1]],1))
+
+#Working with the wordy part of the document
+#Keep only the "!Sample" rows of the series matrix, minus the contact details
+alzword <- alz %>%
+	read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
+	filter(grepl("!Sample",X1) & !grepl("!Sample_contact",X1))
+
+##Changing row names and column names:
+ALZWORD <- t(alzword)
+rownames(ALZWORD) <- NULL
+colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
+#drop=FALSE keeps a matrix even when the series holds a single sample
+ALZWORD <- chngrownm(ALZWORD)[-1,,drop=FALSE] %>%
+	as.data.frame()%>%
+	dplyr::select(-starts_with("col"))
+
+##Reorganizing information within the columns
+ALZWORDF <- cinfo(ALZWORD)
+
+
+#Working with Actual Data part of file
+alzdat <- read_delim(alz,delim="\t",col_names=TRUE,comment = "!",skip=1)
+#Transpose everything but the first column, then clear the row names
+ALZDAT <- t(alzdat[,-1])
+rownames(ALZDAT) <- NULL
+
+
+##Gene ID to Gene Name
+###geneIDNam <- genena %>%
+###	read_delim(delim="\t",comment = "#")%>%
+###	dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
+###problems with the above for soft files
+#For soft files the number of leading lines to skip is remembered per GPL
+#number in GPL_ID_LOC.txt (columns GPL_FILE_NUM and LOC_ID), which is kept in
+#the working directory. The result of either branch is geneIDNam: the ID
+#column plus every column whose name contains "Symbol" or "ORF".
+if(soft == TRUE){
+	gplnum <- strsplit(genena,"[\\|/]") %>%
+		.[[1]] %>%
+		.[length(.)] %>%
+		gsub("\\D","",.)
+	gplfile <- "GPL_ID_LOC.txt"
+
+	#Check to see if there is already a file containing information on soft files.
+	#file.exists tests for this exact name; counting regex hits in list.files()
+	#could return 2 or more, in which case geneIDNam was never created
+	fileex <- file.exists(gplfile)
+
+	#Check to see if this GPL soft file has been used before.
+	#The file is read once and the GPL number is compared for equality, so a
+	#number such as 96 can no longer be confused with a stored 5796
+	idLOCGPL <- NA
+	if(fileex){
+		gplcache <- read_delim(gplfile,delim = "\t",col_names = TRUE)
+		IDLOCAL <- which(gplcache$GPL_FILE_NUM == as.integer(gplnum))
+		if(length(IDLOCAL) > 0){
+			idLOCGPL <- gplcache$LOC_ID[IDLOCAL[1]]
+		}
+	}
+
+	if(is.na(idLOCGPL)){
+		#No information on this particular GPL file: derive the value from
+		#the first 1000 rows and remember it for later use
+		idLOCGPL <- genena %>%
+			read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
+			t(.) %>%
+			grep("^\\D",.) %>%
+			length()-1
+		Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
+		if(fileex){
+			#Add one more line to the existing file
+			cat(Firstval,file=gplfile,sep = "\t", fill = TRUE, append = TRUE)
+		} else{
+			#We must create a file that we can access for later use
+			colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
+			write.table(Firstval,file = gplfile, sep = "\t",row.names = FALSE, col.names = TRUE)
+		}
+	}
+
+	#Same read for every soft file once the number of lines to skip is known
+	geneIDNam <- genena %>%
+		read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
+		dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
+}
+if(soft == FALSE){
+	#Plain annotation table: lines starting with "#" are treated as comments
+	geneIDNam <- genena %>%
+	read_delim(delim="\t",comment = "#")%>%
+	dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
+}
+
+##Give every unnamed gene ID a placeholder label
+geneIDNam <- NAFIXING(geneIDNam)
+
+##Strip the whitespace around the IDs (column 1) and names (column 2)
+geneIDNam <- cbind(str_trim(as.matrix(geneIDNam)[,1]),str_trim(as.matrix(geneIDNam)[,2]))
+
+##Swap the gene IDs for gene names and use them as column names
+ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
+colnames(ALZDAT) <- ALZDAT1[1,]
+
+
+##Adjusting the column names aka the gene names
+colnames(ALZDAT) <- gcnames(ALZDAT)
+
+
+#Full Data
+Fullalzdw <- ALZDAT %>%
+	as.data.frame() %>%
+	cbind(ALZWORDF,.)
+
+
+#The output names are built from the digits in the series matrix file name.
+#The path is split on "\", "|" and "/" (the same separators used for the GPL
+#file) so that only the file name contributes; splitting on "\" alone left
+#the whole path in place wherever "/" is the separator, so digits in the
+#directory names ended up in the output names as well
+gsenum <- strsplit(alz,"[\\|/]") %>%
+	.[[1]] %>%
+	.[length(.)] %>%
+	gsub("\\D","",.)
+
+nfna <- paste0("GSE",gsenum,"after.txt")
+write.matrix(Fullalzdw,file = nfna,sep = "\t")
+#Perfect for excel viewing
+nfnaex <- paste0("GSE",gsenum,"aftexcel.txt")
+write.table(t(Fullalzdw), file = nfnaex, sep = "\t")
+
+
-- 
2.29.0