Second version of code (UNTESTED)

Efrain Gonzalez
1 parent c0625ba184
Showing 1 changed file with 284 additions and 0 deletions Show diff stats
RClean2.R
@@ -0,0 +1,284 @@
+#Libraries required to run the code
+library(pryr)
+library(MASS)
+library(dplyr)
+library(tidyr)
+library(readr)
+library(stringr)
+
+
+#Necessary Functions
+#1#Function for handling the changing of row names and column names
# Rename the columns of a transposed GEO sample-metadata matrix.
#
# Args:
#   mat: character matrix whose row 1 holds the "!Sample_*" field tag of each
#        column and whose row 2 holds the first sample's value for that field.
#
# Returns:
#   `mat` with updated column names (cell contents are not modified):
#     "!Sample_source_name_ch1" -> "Brain_Region"
#     "!Sample_title"           -> "Title"
#     "!Sample_geo_accession"   -> "ID_REF"
#   Any other column whose row-2 text matches a keyword rule is named
#   Sex<n>, PMI<n>, Age<n>, Braak<n> or Group<n>. Columns that match nothing
#   keep their current name. If `mat` has no column names, the defaults
#   "col1", "col2", ... are assigned first.
chngrownm <- function(mat) {
  if (is.null(colnames(mat))) {
    colnames(mat) <- colnames(mat, do.NULL = FALSE)
  }

  # Field tags that map to one fixed column name.
  fixed_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )

  # Keyword rules in increasing priority: when several rules match the same
  # cell, the LAST one wins (e.g. "braak stage" matches both Age and Braak
  # and is named Braak).
  keyword_rules <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control"
  )
  counters <- rep(1L, length(keyword_rules))
  names(counters) <- names(keyword_rules)

  for (j in seq_len(ncol(mat))) {
    tag <- as.character(mat[1, j])

    # Tagged columns get their fixed name and are never re-examined, so a
    # title such as "AD_sample_1" is not mistaken for a Group column.
    if (!is.na(tag) && tag %in% names(fixed_names)) {
      colnames(mat)[j] <- fixed_names[[tag]]
      next
    }

    # The keyword rules need the value row; nothing to inspect without it.
    if (nrow(mat) < 2L) {
      next
    }

    hits <- vapply(
      keyword_rules, grepl, logical(1),
      x = mat[2, j], USE.NAMES = FALSE
    )
    if (any(hits)) {
      field <- names(keyword_rules)[max(which(hits))]
      colnames(mat)[j] <- paste0(field, counters[[field]])
      # Only the rule that actually named the column consumes a number.
      counters[[field]] <- counters[[field]] + 1L
    }
  }
  mat
}
+
+#2#Function for reorganizing information within the columns
# Clean the sample-metadata columns according to their column name.
#
# Args:
#   mat: data frame (or matrix) whose column names contain "Group", "Age",
#        "Sex", "PMI" or "Braak" for the columns to clean. The first column
#        is never modified.
#
# Returns:
#   `mat` with, for every column after the first:
#     Group - text up to ": " removed, plus any trailing " ...;..." part
#     Age   - numeric part of the value, truncated to an integer
#     Sex   - text up to ": " removed
#     PMI   - numeric part of the value, as a double
#     Braak - text up to ": " removed, then converted to an integer
#             (arabic digits directly, anything else as a roman numeral)
cinfo <- function(mat) {
  cols <- colnames(mat)
  if (is.null(cols)) {
    return(mat)
  }

  # Drop the "label: " prefix first so digits or dots inside the label do
  # not leak into the number, then keep only digits and the decimal point.
  numeric_text <- function(x) {
    gsub("[^0-9.]", "", sub("^.*:\\s*", "", x))
  }

  # seq_len(...)[-1] is empty for a one-column input, unlike 2:ncol(mat).
  for (j in seq_len(ncol(mat))[-1]) {
    if (grepl("Group", cols[j])) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", cols[j])) {
      # Going through as.numeric() keeps "85.5" from turning into 855.
      mat[, j] <- as.integer(as.numeric(numeric_text(mat[, j])))
    }
    if (grepl("Sex", cols[j])) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", cols[j])) {
      mat[, j] <- as.numeric(numeric_text(mat[, j]))
    }
    if (grepl("Braak", cols[j])) {
      stage <- gsub(".+:\\s", "", mat[, j])
      arabic <- grepl("^[0-9]+$", stage)
      value <- rep(NA_integer_, length(stage))
      # Plain digits are converted directly so that a stage such as "0"
      # does not depend on the range as.roman() supports.
      value[arabic] <- as.integer(stage[arabic])
      if (any(!arabic)) {
        value[!arabic] <- as.integer(utils::as.roman(stage[!arabic]))
      }
      mat[, j] <- value
    }
  }
  mat
}
+
+#3#Function for labeling the gene IDs without names
# Give every probe without a gene name a unique placeholder name.
#
# Args:
#   GIDNAM: data frame, tibble or matrix whose second column holds the gene
#           names.
#
# Returns:
#   `GIDNAM` with each entry of column 2 that is a missing value, or the
#   text "NA" (optionally followed by whitespace), replaced by "NA1",
#   "NA2", ... in row order. Column 2 is returned as character. All other
#   columns are unchanged.
NAFIXING <- function(GIDNAM) {
  is_mat <- is.matrix(GIDNAM)
  symbols <- if (is_mat) GIDNAM[, 2] else GIDNAM[[2]]
  symbols <- as.character(symbols)

  # grepl() is FALSE for a real NA, so missing values are tested explicitly;
  # the pattern covers names that were read in as the literal text "NA".
  unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)
  symbols[unnamed] <- paste0("NA", seq_len(sum(unnamed)))

  if (is_mat) {
    GIDNAM[, 2] <- symbols
  } else {
    GIDNAM[[2]] <- symbols
  }
  GIDNAM
}
+
+#4#Function for changing the gene ID to gene name
# Replace probe IDs in the first row of a data matrix by their gene names.
#
# Args:
#   GeneName: matrix with the probe IDs in row 1 and the matching gene names
#             in row 2 (one column per probe).
#   DATA:     matrix whose first row contains probe IDs.
#
# Returns:
#   `DATA` in which every first-row cell that is exactly equal to a probe ID
#   in `GeneName` has been replaced by that probe's gene name. Cells without
#   an exact match are left unchanged; if an ID occurs more than once in
#   `GeneName`, its first occurrence is used.
cgeneID <- function(GeneName, DATA) {
  probe_ids <- as.character(GeneName[1, ])
  gene_names <- as.character(GeneName[2, ])
  current <- as.character(DATA[1, ])

  # Exact lookup instead of regex search: an ID such as "1552_at" must not
  # rewrite part of "21552_at", and IDs containing regex metacharacters are
  # matched literally. Missing IDs never match each other.
  hit <- match(current, probe_ids, incomparables = NA)
  found <- !is.na(hit)
  DATA[1, found] <- gene_names[hit[found]]
  DATA
}
+
+#5#Function for adjusting the gene names
# Pick one gene name per column from "///"-separated column names.
#
# Args:
#   DiData: matrix or data frame whose column names may list several gene
#           names separated by "///".
#   usecol: position of the name to keep (default 1). When a column name has
#           fewer than `usecol` parts, its first part is used instead.
#
# Returns:
#   Character vector with one name per column of `DiData`, with surrounding
#   whitespace removed. Without column names every entry is NA; for a
#   zero-column input the result is character(0).
gcnames <- function(DiData, usecol = 1) {
  nms <- colnames(DiData)
  if (is.null(nms)) {
    return(rep(NA_character_, ncol(DiData)))
  }

  # Split every name once instead of re-splitting it for each access.
  parts <- strsplit(as.character(nms), "///", fixed = TRUE)
  vapply(
    parts,
    function(p) {
      chosen <- if (length(p) >= usecol) p[usecol] else p[1]
      # Names written as "A /// B" would otherwise keep the padding blanks.
      trimws(chosen)
    },
    character(1),
    USE.NAMES = FALSE
  )
}
+
+
+
+#The Rest of this code will be used every time you want to change a data set
+
# Input selection: both files are picked interactively.
# `alz` is the path of the series matrix file.
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

# `genena` is the path of the GPL (platform annotation) file.
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()


# Disabled: set the working directory to the folder of the series matrix
# file. The split below only recognises backslash separators.
## strsplit(alz,"[\\]") %>%
##   .[[1]] %>%
##   .[-length(.)] %>%
##   paste(.,collapse="/") %>%
##   setwd()

# The GPL file is treated as a SOFT file when the last component of its path
# contains the text "soft".
soft <- grepl("soft", tail(strsplit(genena, "[\\|/]")[[1]], 1))
+
# Sample metadata ("wordy" part of the series matrix file).
# Lines starting with "!Series" are treated as comments; of the remaining
# rows only those whose first field contains "!Sample" are kept, except the
# "!Sample_contact" rows.
alzword <- alz %>%
  read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
  filter(grepl("!Sample", X1)) %>%
  filter(!grepl("!Sample_contact", X1))

## Changing row names and column names:
# After transposing, every column is one metadata field and row 1 holds the
# field tags that chngrownm() reads. Default names "col1", "col2", ... are
# assigned first so that fields chngrownm() does not recognise can be
# dropped by their "col" prefix afterwards.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
# drop = FALSE keeps the matrix shape when only one row is left after
# removing the tag row.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
ALZWORD <- ALZWORD %>%
  as.data.frame() %>%
  dplyr::select(-starts_with("col"))

## Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)
+
+
# Expression values of the series matrix file: the first line is skipped,
# lines starting with "!" are treated as comments and the first remaining
# line supplies the column names.
alzdat <- read_delim(
  alz,
  delim = "\t",
  col_names = TRUE,
  comment = "!",
  skip = 1
)

# Transpose everything except the first column and drop the row names that
# t() takes over from the column names.
ALZDAT <- alzdat[, -1] %>%
  t()
rownames(ALZDAT) <- NULL
+
+
## Gene ID to Gene Name
# Builds `geneIDNam`: the ID column of the GPL file plus every column whose
# name contains "Symbol" or "ORF".
#
# Plain GPL tables are read directly with "#" as the comment prefix. For SOFT
# files the number of lines to skip before the table is estimated once and
# stored in "GPL_ID_LOC.txt" (columns GPL_FILE_NUM and LOC_ID) in the working
# directory, so later runs on the same platform can reuse it.
if (soft) {
  # Digits of the GPL file name (last path component).
  gplnum <- strsplit(genena, "[\\|/]") %>%
    .[[1]] %>%
    .[length(.)] %>%
    gsub("\\D", "", .)

  # Check to see if there is already a file containing information on soft
  # files. file.exists() tests for exactly this file; a regex search over
  # list.files() could also count similarly named files.
  cache_file <- "GPL_ID_LOC.txt"
  cache_exists <- file.exists(cache_file)

  # Check to see if this GPL soft file has been used before. The platform
  # number is compared for equality, so GPL57 is not confused with GPL570.
  cached_row <- integer(0)
  if (cache_exists) {
    gpl_cache <- read_delim(cache_file, delim = "\t", col_names = TRUE)
    cached_row <- which(gpl_cache$GPL_FILE_NUM == as.numeric(gplnum))
  }

  if (length(cached_row) > 0) {
    # Known platform: reuse the stored offset (first entry if duplicated).
    idlocgpl <- gpl_cache$LOC_ID[cached_row[1]]
    header_skip <- idlocgpl
  } else {
    # No information on this particular GPL file: read up to 1000 rows
    # (lines starting with "!" are comments), count the cells that start
    # with a non-digit character and subtract one.
    # NOTE(review): presumably this equals the number of header lines before
    # the probe table when probe IDs start with a digit - confirm.
    idLOCGPL <- (genena %>%
      read_delim(delim = "\t", col_names = FALSE, comment = "!", n_max = 1000) %>%
      t(.) %>%
      grep("^\\D", .) %>%
      length()) - 1
    header_skip <- idLOCGPL

    if (cache_exists) {
      # Append one "number<TAB>offset" line to the existing file.
      cbind(as.integer(gplnum), as.integer(idLOCGPL)) %>%
        cat(file = cache_file, sep = "\t", fill = TRUE, append = TRUE)
    } else {
      # We must create a file that we can access for later use
      Firstval <- cbind(as.integer(gplnum), as.integer(idLOCGPL))
      colnames(Firstval) <- c("GPL_FILE_NUM", "LOC_ID")
      write.table(Firstval, file = cache_file, sep = "\t",
                  row.names = FALSE, col.names = TRUE)
    }
  }

  geneIDNam <- genena %>%
    read_delim(delim = "\t", col_names = TRUE, comment = "!",
               skip = header_skip) %>%
    dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))
} else {
  geneIDNam <- genena %>%
    read_delim(delim = "\t", comment = "#") %>%
    dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))
}
+
## Labeling the gene IDs without names
geneIDNam <- NAFIXING(geneIDNam)

## remove the whitespace
# Rebuild the table as a two-column character matrix (ID, gene name) with
# leading and trailing whitespace stripped from both columns.
gene_rows <- t(geneIDNam)
trimmed_ids <- str_trim(gene_rows[1, ])
trimmed_names <- str_trim(gene_rows[2, ])
geneIDNam <- t(rbind(trimmed_ids, trimmed_names, deparse.level = 0))

## Changing the gene ID to gene name
# cgeneID() rewrites the first row of the transposed data table; that row
# then supplies the column names of the expression matrix.
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
gene_labels <- ALZDAT1[1, ]
colnames(ALZDAT) <- gene_labels


## Adjusting the column names aka the gene names
# Keep a single name for columns that list several names.
adjusted_labels <- gcnames(ALZDAT)
colnames(ALZDAT) <- adjusted_labels
+
+
# Full Data
# Sample metadata columns first, followed by the expression columns.
Fullalzdw <- ALZDAT %>%
  as.data.frame() %>%
  cbind(ALZWORDF, .)


# Digits of the series matrix FILE NAME only. Backslashes are normalised to
# "/" before basename() so that both Windows-style and "/"-separated paths
# are reduced to their last component; splitting on backslash alone would
# keep the whole path on other systems and pull digits from directory names
# into the output file name.
gse_digits <- alz %>%
  gsub("\\\\", "/", .) %>%
  basename() %>%
  gsub("\\D", "", .)

# Tab-separated table, one row per sample.
nfna <- paste0("GSE", gse_digits, "after.txt")
write.matrix(Fullalzdw, file = nfna, sep = "\t")

# Perfect for excel viewing
# Transposed copy of the same table.
nfnaex <- paste0("GSE", gse_digits, "aftexcel.txt")
write.table(t(Fullalzdw), file = nfnaex, sep = "\t")
+
+
...	...	@@ -0,0 +1,284 @@
	1	+#Libraries required to run the code
	2	+library(pryr)
	3	+library(MASS)
	4	+library(dplyr)
	5	+library(tidyr)
	6	+library(readr)
	7	+library(stringr)
	8	+
	9	+
	10	+#Necessary Functions
	11	+#1#Function for handling the changing of row names and column names
	12	+chngrownm <- function(mat){
	13	+ row <- dim(mat)[1]
	14	+ col <- dim(mat)[2]
	15	+ j <- 1
	16	+ x <- 1
	17	+ p <- 1
	18	+ a <- 1
	19	+ b <- 1
	20	+ g <- 1
	21	+ for(j in 1:col){
	22	+ if("!Sample_source_name_ch1"==mat[1,j]){
	23	+ colnames(mat)[j] <- "Brain_Region"
	24	+ }
	25	+ if("!Sample_title" == mat[1,j]){
	26	+ colnames(mat)[j] <- "Title"
	27	+ }
	28	+ if("!Sample_geo_accession" == mat[1,j]){
	29	+ colnames(mat)[j] <- "ID_REF"
	30	+ } else{
	31	+ if(grepl("Sex\|gender\|Gender\|sex",mat[2,j])==TRUE){
	32	+ colnames(mat)[j] <- paste0("Sex",x)
	33	+ x = x + 1
	34	+ }
	35	+ if(grepl("postmorteminterval\|PMI\|pmi",mat[2,j])==TRUE){
	36	+ colnames(mat)[j] <- paste0("PMI",p)
	37	+ p = p + 1
	38	+ }
	39	+ if(grepl("age\|Age\|AGE",mat[2,j])==TRUE){
	40	+ colnames(mat)[j] <- paste0("Age",a)
	41	+ a = a + 1
	42	+ }
	43	+ if(grepl("braak\|b&b",mat[2,j])==TRUE){
	44	+ colnames(mat)[j] <- paste0("Braak",b)
	45	+ b = b + 1
	46	+ }
	47	+ if(grepl("group\|disease\|control\|AD\|normal\|diagnosis\|Alzheimer\|Control",mat[2,j])==TRUE){
	48	+ colnames(mat)[j] <- paste0("Group",g)
	49	+ g = g + 1
	50	+ }
	51	+
	52	+ }
	53	+ j = j + 1
	54	+ }
	55	+ mat
	56	+}
	57	+
	58	+#2#Function for reorganizing information within the columns
	59	+cinfo <- function(mat){
	60	+ col <- dim(mat)[2]
	61	+ j <-2
	62	+ for(j in 2:col){
	63	+ if(grepl("Group",colnames(mat)[j]) == TRUE){
	64	+ mat[,j] <- gsub(".+:\\s\|\\s.+;.+","",mat[,j])
	65	+ }
	66	+ if(grepl("Age",colnames(mat)[j])==TRUE){
	67	+ mat[,j] <- gsub("\\D","",mat[,j])%>%
	68	+ as.integer()
	69	+ }
	70	+ if(grepl("Sex",colnames(mat)[j])==TRUE){
	71	+ mat[,j] <- gsub(".+:\\s","",mat[,j])
	72	+ }
	73	+ if(grepl("PMI",colnames(mat)[j])==TRUE){
	74	+ mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
	75	+ as.numeric()
	76	+ }
	77	+ if(grepl("Braak",colnames(mat)[j])==TRUE){
	78	+ mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
	79	+ as.roman()%>%
	80	+ as.integer()
	81	+ }
	82	+ j=j+1
	83	+ }
	84	+ mat
	85	+}
	86	+
	87	+#3#Function for labeling the gene IDs without names
	88	+NAFIXING <- function(GIDNAM){
	89	+ row <- dim(GIDNAM)[1]
	90	+ i <- 1
	91	+ x <- 1
	92	+ for(i in 1:row){
	93	+ if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE){
	94	+ GIDNAM[i,2] <- gsub(".*",paste0(NA,x),GIDNAM[i,2])
	95	+ x <- x + 1
	96	+ }
	97	+ i <- i + 1
	98	+ }
	99	+ GIDNAM
	100	+}
	101	+
	102	+#4#Function for changing the gene ID to gene name
	103	+cgeneID <- function(GeneName,DATA){
	104	+ colGene <- dim(GeneName)[2]
	105	+ j <- 1
	106	+ for(j in 1:colGene){
	107	+ chngsreq <- grep(GeneName[1,j],DATA[1,])
	108	+ DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
	109	+ j = j+1
	110	+ }
	111	+ DATA
	112	+}
	113	+
	114	+#5#Function for adjusting the gene names
	115	+gcnames <- function(DiData,usecol=1){
	116	+ nuruns <- dim(DiData)[2]
	117	+ i = 1
	118	+ nwnam <- rep("0",length.out=nuruns)
	119	+ for(i in 1:nuruns){
	120	+ if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){
	121	+ nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][usecol]
	122	+ } else{
	123	+ nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][1]
	124	+ }
	125	+
	126	+ }
	127	+ nwnam
	128	+
	129	+}
	130	+
	131	+
	132	+
	133	+#The Rest of this code will be used every time you want to change a data set
	134	+
	135	+#Getting the series matrix file
	136	+print("Choose the series matrix file that you want to Analyze")
	137	+alz <- file.choose()
	138	+
	139	+#Getting the GPL file
	140	+print("Choose the GPL file that correlates with the above series matrix file")
	141	+genena <- file.choose()
	142	+
	143	+
	144	+#Set working directory based on the directory of the series matrix file Currently only works for windows
	145	+##strsplit(alz,"[\\]") %>%
	146	+## .[[1]] %>%
	147	+## .[-length(.)] %>%
	148	+## paste(.,collapse="/") %>%
	149	+## setwd()
	150	+
	151	+#Find out if it is a soft GPL file or not
	152	+soft <- strsplit(genena,"[\\\|/]") %>%
	153	+ .[[1]] %>%
	154	+ .[length(.)] %>%
	155	+ grepl("soft",.)
	156	+
	157	+#Working with the wordy part of the document
	158	+alzword <- alz %>%
	159	+ read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
	160	+ filter(grepl("!Sample",X1))%>%
	161	+ filter(!grepl("!Sample_contact",X1))
	162	+
	163	+##Changing row names and column names:
	164	+ALZWORD <- t(alzword)
	165	+rownames(ALZWORD)=NULL
	166	+colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
	167	+ALZWORD <- chngrownm(ALZWORD)[-1,]
	168	+ALZWORD <- ALZWORD%>%
	169	+ as.data.frame()%>%
	170	+ dplyr::select(-starts_with("col"))
	171	+
	172	+##Reorganizing information within the columns
	173	+ALZWORDF <- cinfo(ALZWORD)
	174	+
	175	+
	176	+#Working with Actual Data part of file
	177	+alzdat <- alz %>%
	178	+ read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
	179	+ALZDAT <- t(alzdat[,-1])
	180	+rownames(ALZDAT)=NULL
	181	+
	182	+
	183	+##Gene ID to Gene Name
	184	+###geneIDNam <- genena %>%
	185	+### read_delim(delim="\t",comment = "#")%>%
	186	+### dplyr::select(.,ID,grep("Symbol\|ORF",colnames(.)))
	187	+###problems with the above for soft files
	188	+if(soft == TRUE){
	189	+ gplnum <- strsplit(genena,"[\\\|/]") %>%
	190	+ .[[1]] %>%
	191	+ .[length(.)] %>%
	192	+ gsub("\\D","",.)
	193	+ #Check to see if there is already a file containing information on soft files
	194	+ fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
	195	+ if(fileex == 1){
	196	+ #Check to see if this GPL soft file has been used before
	197	+ IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
	198	+ .$GPL_FILE_NUM%>%
	199	+ grepl(gplnum,.) %>%
	200	+ sum()
	201	+ if(IDF == 1){
	202	+ IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
	203	+ .$GPL_FILE_NUM%>%
	204	+ grep(gplnum,.)
	205	+ idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
	206	+ .$LOC_ID %>%
	207	+ .[IDLOCAL]
	208	+ geneIDNam <- genena %>%
	209	+ read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
	210	+ dplyr::select(.,ID,grep("Symbol\|ORF",colnames(.)))
	211	+ }
	212	+ if(IDF == 0){
	213	+ #No information on this particular GPL file
	214	+ idLOCGPL <- genena %>%
	215	+ read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
	216	+ t(.) %>%
	217	+ grep("^\\D",.) %>%
	218	+ length()-1
	219	+ cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
	220	+ cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
	221	+ geneIDNam <- genena %>%
	222	+ read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
	223	+ dplyr::select(.,ID,grep("Symbol\|ORF",colnames(.)))
	224	+ }
	225	+ }
	226	+ if(fileex == 0){
	227	+ #We must create a file that we can access for later use
	228	+ idLOCGPL <- genena %>%
	229	+ read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
	230	+ t(.) %>%
	231	+ grep("^\\D",.) %>%
	232	+ length()-1
	233	+ Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
	234	+ colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
	235	+ write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
	236	+ geneIDNam <- genena %>%
	237	+ read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
	238	+ dplyr::select(.,ID,grep("Symbol\|ORF",colnames(.)))
	239	+ }
	240	+}
	241	+if(soft == FALSE){
	242	+ geneIDNam <- genena %>%
	243	+ read_delim(delim="\t",comment = "#")%>%
	244	+ dplyr::select(.,ID,grep("Symbol\|ORF",colnames(.)))
	245	+}
	246	+
	247	+##Labeling the gene IDs without names
	248	+geneIDNam <- NAFIXING(geneIDNam)
	249	+
	250	+##remove the whitespace
	251	+geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
	252	+
	253	+##Changing the gene ID to gene name
	254	+ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
	255	+colnames(ALZDAT) = ALZDAT1[1,]
	256	+
	257	+
	258	+##Adjusting the column names aka the gene names
	259	+colnames(ALZDAT) <- gcnames(ALZDAT)
	260	+
	261	+
	262	+#Full Data
	263	+Fullalzdw <- ALZDAT %>%
	264	+ as.data.frame() %>%
	265	+ cbind(ALZWORDF,.)
	266	+
	267	+
	268	+nfna <- strsplit(alz,"[\\]") %>%
	269	+ .[[1]] %>%
	270	+ .[length(.)] %>%
	271	+ gsub("\\D","",.) %>%
	272	+ c("GSE",.,"after.txt") %>%
	273	+ paste(collapse = "")
	274	+write.matrix(Fullalzdw,file = nfna,sep = "\t")
	275	+#Perfect for excel viewing
	276	+nfnaex <- strsplit(alz,"[\\]") %>%
	277	+ .[[1]] %>%
	278	+ .[length(.)] %>%
	279	+ gsub("\\D","",.) %>%
	280	+ c("GSE",.,"aftexcel.txt") %>%
	281	+ paste(collapse = "")
	282	+write.table(t(Fullalzdw), file = nfnaex, sep = "\t")
	283	+
	284	+