First version of cleaning data with R

Efrain Gonzalez
1 parent 16b4f55de1
Showing 1 changed file with 198 additions and 0 deletions Show diff stats
Rclean.R
@@ -0,0 +1,198 @@
+#Libraries required to run the code
+library(MASS)
+library(pryr)
+library(dplyr)
+library(tidyr)
+library(readr)
+library(stringr)
+
+
+#Necessary Functions
+#1#Function for handling the changing of row names and column names
# Rename the columns of a transposed GEO "!Sample" table.
#
# Arguments:
#   mat  Matrix (or data frame) whose first row holds the GEO field names
#        (e.g. "!Sample_title") and whose second row holds the first sample's
#        values (e.g. "age: 80").
#
# Returns `mat` with updated column names:
#   * "!Sample_source_name_ch1" -> "Brain_Region"
#   * "!Sample_title"           -> "Title"
#   * "!Sample_geo_accession"   -> "ID_REF"
#   * any other column is classified from the text in row 2 as
#     Sex / PMI / Age / Braak / Group and numbered per category
#     ("Age1", "Age2", ...). Columns that match nothing keep their name.
#
# Differences from the first version (bug fixes):
#   * Title and Brain_Region columns are no longer re-classified from their
#     values (a title such as "AD_case_1" used to be renamed "Group1").
#   * When several patterns match the same value (e.g. "braak stage" also
#     contains "age") the last category in the list below still wins, but only
#     that category's counter advances, so numbering has no gaps.
#   * Zero-column input and missing column names are handled.
chngrownm <- function(mat) {
  n_col <- ncol(mat)
  if (is.null(n_col) || n_col == 0L) {
    return(mat)
  }

  # do.NULL = FALSE fabricates "col1", "col2", ... when there are no names yet.
  new_names <- colnames(mat, do.NULL = FALSE)

  header_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )

  # Order matters: when several patterns match, the LAST one wins.
  value_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control"
  )
  counters <- rep(1L, length(value_patterns))
  names(counters) <- names(value_patterns)

  has_values <- nrow(mat) >= 2L

  for (j in seq_len(n_col)) {
    header <- as.character(mat[1, j])
    if (!is.na(header) && header %in% names(header_names)) {
      new_names[j] <- header_names[[header]]
      next
    }
    if (!has_values) {
      next
    }

    # grepl() returns FALSE for an NA value, so missing cells match nothing.
    hits <- vapply(
      value_patterns, grepl, logical(1),
      x = as.character(mat[2, j])
    )
    if (any(hits)) {
      category <- names(hits)[max(which(hits))]
      new_names[j] <- paste0(category, counters[[category]])
      counters[[category]] <- counters[[category]] + 1L
    }
  }

  colnames(mat) <- new_names
  mat
}
+
+#2#Function for reorganizing information within the columns
# Clean the sample-annotation values, column by column, based on column name.
#
# Arguments:
#   mat  Data frame (or matrix) whose column names contain "Group", "Age",
#        "Sex", "PMI" or "Braak" for the columns to clean. The first column is
#        never modified.
#
# Returns `mat` with:
#   * Group  columns: the "key: " prefix and any "; ..." trailer removed
#   * Age    columns: parsed to integer (fractional ages are truncated)
#   * Sex    columns: the "key: " prefix removed
#   * PMI    columns: parsed to numeric
#   * Braak  columns: "key: " prefix removed, Roman numeral -> integer
# Values that cannot be parsed become NA.
#
# Differences from the first version (bug fixes):
#   * Age and PMI strip the "key:" part before extracting the number, so
#     "age: 78.5" is 78 (was 785) and "pmi (hrs.): 12.5" is 12.5 (was NA
#     because the dot in the key was kept).
#   * One-column input is returned unchanged instead of looping backwards
#     over `2:1`.
cinfo <- function(mat) {
  col_names <- colnames(mat)
  if (is.null(col_names)) {
    return(mat)
  }

  # Drop everything up to the first colon, then keep only digits and dots.
  parse_number <- function(x) {
    value <- sub("^[^:]*:\\s*", "", as.character(x))
    as.numeric(gsub("[^0-9.]", "", value))
  }

  for (j in seq_len(ncol(mat))[-1]) {
    name <- col_names[j]
    if (is.na(name)) {
      next
    }
    if (grepl("Group", name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", name)) {
      mat[, j] <- as.integer(parse_number(mat[, j]))
    }
    if (grepl("Sex", name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", name)) {
      mat[, j] <- parse_number(mat[, j])
    }
    if (grepl("Braak", name)) {
      stage <- gsub(".+:\\s", "", mat[, j])
      mat[, j] <- as.integer(utils::as.roman(stage))
    }
  }
  mat
}
+
+#3#Function for changing the gene ID to gene name
# Replace probe IDs in the first row of DATA with gene names.
#
# Arguments:
#   GeneName  Matrix with one column per probe: row 1 holds the probe ID and
#             row 2 the gene name to substitute for it.
#   DATA      Matrix whose first row holds probe IDs.
#
# Returns DATA with every first-row entry that is exactly equal to an ID in
# GeneName[1, ] replaced by the matching GeneName[2, ] value. Entries with no
# matching ID are left unchanged; if an ID occurs more than once in GeneName,
# the first occurrence is used. An ID whose name is NA yields NA.
#
# The first version used each ID as an unanchored regular expression, so
# "1_at" also rewrote part of "11_at", "." matched any character, and an NA
# ID aborted the run. It also scanned all of DATA once per probe. A single
# exact match() fixes both.
cgeneID <- function(GeneName, DATA) {
  if (NROW(GeneName) < 2L) {
    stop("`GeneName` needs an ID row and a name row.", call. = FALSE)
  }
  if (NROW(DATA) < 1L || NCOL(DATA) == 0L) {
    return(DATA)
  }

  ids <- as.character(unlist(GeneName[1, ], use.names = FALSE))
  gene_names <- as.character(unlist(GeneName[2, ], use.names = FALSE))
  current <- as.character(unlist(DATA[1, ], use.names = FALSE))

  # incomparables = NA keeps a missing probe ID from matching a missing key.
  idx <- match(current, ids, incomparables = NA)
  found <- !is.na(idx)
  DATA[1, found] <- gene_names[idx[found]]
  DATA
}
+
+#4#Function for adjusting the gene names
# Pick one gene symbol per column from "A /// B /// C"-style column names.
#
# Arguments:
#   DiData  Matrix or data frame whose column names may list several gene
#           symbols separated by "///".
#   usecol  Which symbol to keep (1 = first). If a name has fewer than
#           `usecol` symbols, its first symbol is used instead.
#
# Returns a character vector with one cleaned name per column of DiData.
# NA or empty column names give NA.
#
# Differences from the first version (bug fixes): surrounding whitespace is
# trimmed (GEO writes "DDR1 /// MIR4640", which used to give "DDR1 " with a
# trailing space), each name is split only once, and zero-column input returns
# character(0) instead of failing on `1:0`.
gcnames <- function(DiData, usecol = 1) {
  stopifnot(is.numeric(usecol), length(usecol) == 1L, usecol >= 1)
  if (NCOL(DiData) == 0L) {
    return(character(0))
  }
  col_names <- colnames(DiData)
  if (is.null(col_names)) {
    stop("`DiData` must have column names.", call. = FALSE)
  }

  pieces <- strsplit(col_names, "///", fixed = TRUE)
  vapply(
    pieces,
    function(p) {
      # p[1] is NA when the name was empty, matching the original fallback.
      chosen <- if (length(p) >= usecol) p[usecol] else p[1]
      trimws(chosen)
    },
    character(1),
    USE.NAMES = FALSE
  )
}
+
+
+
+#The Rest of this code will be used every time you want to change a data set
+
+#Getting the series matrix file
# Ask the user for the GEO series matrix file to clean.
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

# Ask the user for the platform (GPL) annotation file for that series.
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()


#Set working directory based on the directory of the series matrix file
##strsplit(alz,"[\\]") %>%
##	.[[1]] %>%
##	.[-length(.)] %>%
##	paste(.,collapse="/") %>%
##	setwd()


# ---- Sample annotation ("wordy") part of the series matrix file ----
# Keep the "!Sample..." rows, dropping the "!Series..." lines (treated as
# comments by the reader) and the "!Sample_contact..." rows.
alzword <- alz %>%
  read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
  filter(grepl("!Sample", X1)) %>%
  filter(!grepl("!Sample_contact", X1))

## One row per sample after transposing; give every column a placeholder
## name ("col1", "col2", ...) so chngrownm() can rename the ones it knows.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
# Drop the GEO field-name row. drop = FALSE keeps a matrix even when the file
# describes a single sample.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
# Columns still called "col..." were not recognised and are discarded.
ALZWORD <- ALZWORD %>%
  as.data.frame() %>%
  dplyr::select(-starts_with("col"))

## Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)


# ---- Expression data part of the series matrix file ----
alzdat <- alz %>%
  read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = 1)
# Samples in rows, probes in columns; the first (ID) column is dropped here
# and re-attached below as the column names.
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL


## Gene ID to Gene Name: keep the probe ID plus any Symbol/ORF columns
geneIDNam <- genena %>%
  read_delim(delim = "\t", comment = "#") %>%
  dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))

## Changing the ID to a Name
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]


## Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)


# ---- Full data: sample annotation followed by expression values ----
Fullalzdw <- ALZDAT %>%
  as.data.frame() %>%
  cbind(ALZWORDF, .)

##since the order in which the packages are added matters I moved this package to the top
##library(MASS)

# Output names are "GSE<digits of the input file name>...". basename() is used
# instead of splitting on "\\" so that the directory part is dropped on every
# platform, not only for Windows-style paths.
gse_digits <- gsub("\\D", "", basename(alz))

nfna <- paste0("GSE", gse_digits, "after.txt")
MASS::write.matrix(Fullalzdw, file = nfna, sep = "\t")

#Perfect for excel viewing
nfnaex <- paste0("GSE", gse_digits, "aftexcel.txt")
write.table(t(Fullalzdw), file = nfnaex, sep = "\t")
...	...	@@ -0,0 +1,198 @@
	1	+#Libraries required to run the code
	2	+library(MASS)
	3	+library(pryr)
	4	+library(dplyr)
	5	+library(tidyr)
	6	+library(readr)
	7	+library(stringr)
	8	+
	9	+
	10	+#Necessary Functions
	11	+#1#Function for handling the changing of row names and column names
	12	+chngrownm <- function(mat){
	13	+ row <- dim(mat)[1]
	14	+ col <- dim(mat)[2]
	15	+ j <- 1
	16	+ x <- 1
	17	+ p <- 1
	18	+ a <- 1
	19	+ b <- 1
	20	+ g <- 1
	21	+ for(j in 1:col){
	22	+ if("!Sample_source_name_ch1"==mat[1,j]){
	23	+ colnames(mat)[j] <- "Brain_Region"
	24	+ }
	25	+ if("!Sample_title" == mat[1,j]){
	26	+ colnames(mat)[j] <- "Title"
	27	+ }
	28	+ if("!Sample_geo_accession" == mat[1,j]){
	29	+ colnames(mat)[j] <- "ID_REF"
	30	+ } else{
	31	+ if(grepl("Sex\|gender\|Gender\|sex",mat[2,j])==TRUE){
	32	+ colnames(mat)[j] <- paste0("Sex",x)
	33	+ x = x + 1
	34	+ }
	35	+ if(grepl("postmorteminterval\|PMI\|pmi",mat[2,j])==TRUE){
	36	+ colnames(mat)[j] <- paste0("PMI",p)
	37	+ p = p + 1
	38	+ }
	39	+ if(grepl("age\|Age\|AGE",mat[2,j])==TRUE){
	40	+ colnames(mat)[j] <- paste0("Age",a)
	41	+ a = a + 1
	42	+ }
	43	+ if(grepl("braak\|b&b",mat[2,j])==TRUE){
	44	+ colnames(mat)[j] <- paste0("Braak",b)
	45	+ b = b + 1
	46	+ }
	47	+ if(grepl("group\|disease\|control\|AD\|normal\|diagnosis\|Alzheimer\|Control",mat[2,j])==TRUE){
	48	+ colnames(mat)[j] <- paste0("Group",g)
	49	+ g = g + 1
	50	+ }
	51	+
	52	+ }
	53	+ j = j + 1
	54	+ }
	55	+ mat
	56	+}
	57	+
	58	+#2#Function for reorganizing information within the columns
	59	+cinfo <- function(mat){
	60	+ col <- dim(mat)[2]
	61	+ j <-2
	62	+ for(j in 2:col){
	63	+ if(grepl("Group",colnames(mat)[j]) == TRUE){
	64	+ mat[,j] <- gsub(".+:\\s\|\\s.+;.+","",mat[,j])
	65	+ }
	66	+ if(grepl("Age",colnames(mat)[j])==TRUE){
	67	+ mat[,j] <- gsub("\\D","",mat[,j])%>%
	68	+ as.integer()
	69	+ }
	70	+ if(grepl("Sex",colnames(mat)[j])==TRUE){
	71	+ mat[,j] <- gsub(".+:\\s","",mat[,j])
	72	+ }
	73	+ if(grepl("PMI",colnames(mat)[j])==TRUE){
	74	+ mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
	75	+ as.numeric()
	76	+ }
	77	+ if(grepl("Braak",colnames(mat)[j])==TRUE){
	78	+ mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
	79	+ as.roman()%>%
	80	+ as.integer()
	81	+ }
	82	+ j=j+1
	83	+ }
	84	+ mat
	85	+}
	86	+
	87	+#3#Function for changing the gene ID to gene name
	88	+cgeneID <- function(GeneName,DATA){
	89	+ colGene <- dim(GeneName)[2]
	90	+ j <- 1
	91	+ for(j in 1:colGene){
	92	+ chngsreq <- grep(GeneName[1,j],DATA[1,])
	93	+ DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
	94	+ j = j+1
	95	+ }
	96	+ DATA
	97	+}
	98	+
	99	+#4#Function for adjusting the gene names
	100	+gcnames <- function(DiData,usecol=1){
	101	+ nuruns <- dim(DiData)[2]
	102	+ i = 1
	103	+ nwnam <- rep("0",length.out=nuruns)
	104	+ for(i in 1:nuruns){
	105	+ if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){
	106	+ nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][usecol]
	107	+ } else{
	108	+ nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][1]
	109	+ }
	110	+
	111	+ }
	112	+ nwnam
	113	+
	114	+}
	115	+
	116	+
	117	+
	118	+#The Rest of this code will be used every time you want to change a data set
	119	+
	120	+#Getting the series matrix file
	121	+print("Choose the series matrix file that you want to Analyze")
	122	+alz <- file.choose()
	123	+
	124	+#Getting the GPL file
	125	+print("Choose the GPL file that correlates with the above series matrix file")
	126	+genena <- file.choose()
	127	+
	128	+
	129	+#Set working directory based on the directory of the series matrix file
	130	+##strsplit(alz,"[\\]") %>%
	131	+## .[[1]] %>%
	132	+## .[-length(.)] %>%
	133	+## paste(.,collapse="/") %>%
	134	+## setwd()
	135	+
	136	+
	137	+#Working with the wordy part of the document
	138	+alzword <- alz %>%
	139	+ read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
	140	+ filter(grepl("!Sample",X1))%>%
	141	+ filter(!grepl("!Sample_contact",X1))
	142	+
	143	+##Changing row names and column names:
	144	+ALZWORD <- t(alzword)
	145	+rownames(ALZWORD)=NULL
	146	+colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
	147	+ALZWORD <- chngrownm(ALZWORD)[-1,]
	148	+ALZWORD <- ALZWORD%>%
	149	+ as.data.frame()%>%
	150	+ dplyr::select(-starts_with("col"))
	151	+
	152	+##Reorganizing information within the columns
	153	+ALZWORDF <- cinfo(ALZWORD)
	154	+
	155	+
	156	+#Working with Actual Data part of file
	157	+alzdat <- alz %>%
	158	+ read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
	159	+ALZDAT <- t(alzdat[,-1])
	160	+rownames(ALZDAT)=NULL
	161	+
	162	+
	163	+##Gene ID to Gene Name
	164	+geneIDNam <- genena %>%
	165	+ read_delim(delim="\t",comment = "#")%>%
	166	+ dplyr::select(.,ID,grep("Symbol\|ORF",colnames(.)))
	167	+
	168	+##Changing the ID to a Name
	169	+ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
	170	+colnames(ALZDAT) = ALZDAT1[1,]
	171	+
	172	+
	173	+##Adjusting the column names aka the gene names
	174	+colnames(ALZDAT) <- gcnames(ALZDAT)
	175	+
	176	+
	177	+#Full Data
	178	+Fullalzdw <- ALZDAT %>%
	179	+ as.data.frame() %>%
	180	+ cbind(ALZWORDF,.)
	181	+
	182	+##since the order in which the packages are added matters I moved this package to the top
	183	+##library(MASS)
	184	+nfna <- strsplit(alz,"[\\]") %>%
	185	+ .[[1]] %>%
	186	+ .[length(.)] %>%
	187	+ gsub("\\D","",.) %>%
	188	+ c("GSE",.,"after.txt") %>%
	189	+ paste(collapse = "")
	190	+MASS::write.matrix(Fullalzdw,file = nfna,sep = "\t")
	191	+#Perfect for excel viewing
	192	+nfnaex <- strsplit(alz,"[\\]") %>%
	193	+ .[[1]] %>%
	194	+ .[length(.)] %>%
	195	+ gsub("\\D","",.) %>%
	196	+ c("GSE",.,"aftexcel.txt") %>%
	197	+ paste(collapse = "")
	198	+write.table(t(Fullalzdw), file = nfnaex, sep = "\t")