Most recent update fixed a few handling errors

Efrain Gonzalez
1 parent f31e87a636
Showing 1 changed file with 49 additions and 35 deletions Show diff stats
RCleanDscret.R
@@ -24,11 +24,9 @@ chngrownm <- function(mat){
 	for(j in 1:col){
 		if("!Sample_source_name_ch1"==mat[1,j]){
 			colnames(mat)[j] <- "Brain_Region"	
-		} 
-		if("!Sample_title" == mat[1,j]){
+		} else if("!Sample_title" == mat[1,j]){
 			colnames(mat)[j] <- "Title"
-		} 
-		if("!Sample_geo_accession" == mat[1,j]){
+		} else if("!Sample_geo_accession" == mat[1,j]){
 			colnames(mat)[j] <- "ID_REF"
 		} else{
 			if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){
@@ -102,23 +100,41 @@ NAFIXING <- function(GIDNAM){
  
 #4#Function for changing the gene ID to gene name
 cgeneID <- function(GeneName,DATA){
-    colGene <- dim(GeneName)[2]
-     j <- 1
-     for(j in 1:colGene){
-	chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
-	if(is.na(sum(chngsreq))==FALSE){
-		if(sum(chngsreq) > 0){
-			DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
+	nj <- t(GeneName)
+	nq <- t(DATA)
+	colGene <- dim(nj)[2]
+	colDATA <- dim(nq)[2]
+	j <- 1
+	for(j in 1:colDATA){
+		#where is that gene id located within the GPL file
+		chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,])
+		if(is.na(sum(chngreq))==FALSE){
+			if(sum(chngreq) > 0){
+			nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j])
+			}
 		}
+		j <- j + 1
 	}
-		#if(sum(chngsreq) > 0){
-		##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
-		#DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
-		#}
-	j = j+1
-	}
-	DATA
+	nq
 }
+#cgeneID <- function(GeneName,DATA){
+#    colGene <- dim(GeneName)[2]
+#     j <- 1
+#     for(j in 1:colGene){
+#	chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
+#	if(is.na(sum(chngsreq))==FALSE){
+#		if(sum(chngsreq) > 0){
+#			DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
+#		}
+#	}
+#		#if(sum(chngsreq) > 0){
+#		##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
+#		#DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
+#		#}
+#	j = j+1
+#	}
+#	DATA
+#}
  
 #5#Function for adjusting the gene names
 gcnames <- function(DiData,usecol=1){
@@ -151,11 +167,9 @@ dndat <- function(NDATA){
  
 				if(NDATA[i,j] < -1){
 					DDATA[i,j]=0L
-				}
-				if(NDATA[i,j] > 1){
+				} else if(NDATA[i,j] > 1){
 					DDATA[i,j]=2L
-				}
-				if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
+				} else if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
 					DDATA[i,j]=1L
 				}
 			} else{
@@ -222,8 +236,7 @@ if(clfileex >= 1){
 geneIDNam <-  paste0("Clean_GPL",gplnum,".txt") %>%
 	read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
  
-} 
-if(clfileex == 0){
+} else if(clfileex == 0){
 ##Lets Create a clean version
  
 ##Gene ID to Gene Name
@@ -246,8 +259,7 @@ if(clfileex == 0){
 				geneIDNam <- genena %>%
 					read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
 					dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
-			}
-			if(IDF == 0){
+			} else if(IDF == 0){
 				#No information on this particular GPL file
 				idLOCGPL <- genena %>%
 					read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
@@ -260,8 +272,7 @@ if(clfileex == 0){
 					read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
 					dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 			}
-		}
-		if(fileex == 0){
+		} else if(fileex == 0){
 			#We must create a file that we can access for later use
 			idLOCGPL <- genena %>%
 				read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
@@ -275,8 +286,7 @@ if(clfileex == 0){
 				read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
 				dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 		}
-	 }
-	if(soft == FALSE){
+	 } else if(soft == FALSE){
 		geneIDNam <- genena %>%
 		read_delim(delim="\t",comment = "#")%>%
 		dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
@@ -295,7 +305,7 @@ if(clfileex == 0){
  
  
 ##Changing the gene ID to gene name
-ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
+ALZDAT1 <- cgeneID(geneIDNam,alzdat)
 colnames(ALZDAT) = ALZDAT1[1,]
  
  
@@ -338,8 +348,13 @@ RAWWORD <- rawword[-hereim,] %>%
 z <- 1
 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
 for(z in 1:dim(RAWWORD)[1]){
-	naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
-	z <- z + 1
+    if(sum(is.na(RAWWORD[z,])) > 0){
+        naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
+    }
+	if(length(grep("NA",RAWWORD[z,])) > 0){
+        naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
+    }
+    z <- z + 1
 }
  
 colnames(naroww) <- "ROW_NAs"
@@ -378,9 +393,8 @@ for(j in 1:length(tabRDATID)){
 	##Putting the ones without duplicates in their new homes
 	if(tabRDATID[j] == 1){
 		NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
-	}
+	} else if(tabRDATID[j] > 1){
 	##Averaging duplicates and putting them in their new homes
-	if(tabRDATID[j] > 1){
 		NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
 	}
 	j <- j + 1
...	...	@@ -24,11 +24,9 @@ chngrownm <- function(mat){
24	24	for(j in 1:col){
25	25	if("!Sample_source_name_ch1"==mat[1,j]){
26	26	colnames(mat)[j] <- "Brain_Region"
27		- }
28		- if("!Sample_title" == mat[1,j]){
	27	+ } else if("!Sample_title" == mat[1,j]){
29	28	colnames(mat)[j] <- "Title"
30		- }
31		- if("!Sample_geo_accession" == mat[1,j]){
	29	+ } else if("!Sample_geo_accession" == mat[1,j]){
32	30	colnames(mat)[j] <- "ID_REF"
33	31	} else{
34	32	if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){
...	...	@@ -102,23 +100,41 @@ NAFIXING <- function(GIDNAM){
102	100
103	101	#4#Function for changing the gene ID to gene name
104	102	cgeneID <- function(GeneName,DATA){
105		- colGene <- dim(GeneName)[2]
106		- j <- 1
107		- for(j in 1:colGene){
108		- chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
109		- if(is.na(sum(chngsreq))==FALSE){
110		- if(sum(chngsreq) > 0){
111		- DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
	103	+ nj <- t(GeneName)
	104	+ nq <- t(DATA)
	105	+ colGene <- dim(nj)[2]
	106	+ colDATA <- dim(nq)[2]
	107	+ j <- 1
	108	+ for(j in 1:colDATA){
	109	+ #where is that gene id located within the GPL file
	110	+ chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,])
	111	+ if(is.na(sum(chngreq))==FALSE){
	112	+ if(sum(chngreq) > 0){
	113	+ nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j])
	114	+ }
112	115	}
	116	+ j <- j + 1
113	117	}
114		- #if(sum(chngsreq) > 0){
115		- ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
116		- #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
117		- #}
118		- j = j+1
119		- }
120		- DATA
	118	+ nq
121	119	}
	120	+#cgeneID <- function(GeneName,DATA){
	121	+# colGene <- dim(GeneName)[2]
	122	+# j <- 1
	123	+# for(j in 1:colGene){
	124	+# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
	125	+# if(is.na(sum(chngsreq))==FALSE){
	126	+# if(sum(chngsreq) > 0){
	127	+# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
	128	+# }
	129	+# }
	130	+# #if(sum(chngsreq) > 0){
	131	+# ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
	132	+# #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
	133	+# #}
	134	+# j = j+1
	135	+# }
	136	+# DATA
	137	+#}
122	138
123	139	#5#Function for adjusting the gene names
124	140	gcnames <- function(DiData,usecol=1){
...	...	@@ -151,11 +167,9 @@ dndat <- function(NDATA){
151	167
152	168	if(NDATA[i,j] < -1){
153	169	DDATA[i,j]=0L
154		- }
155		- if(NDATA[i,j] > 1){
	170	+ } else if(NDATA[i,j] > 1){
156	171	DDATA[i,j]=2L
157		- }
158		- if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
	172	+ } else if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
159	173	DDATA[i,j]=1L
160	174	}
161	175	} else{
...	...	@@ -222,8 +236,7 @@ if(clfileex >= 1){
222	236	geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
223	237	read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
224	238
225		-}
226		-if(clfileex == 0){
	239	+} else if(clfileex == 0){
227	240	##Lets Create a clean version
228	241
229	242	##Gene ID to Gene Name
...	...	@@ -246,8 +259,7 @@ if(clfileex == 0){
246	259	geneIDNam <- genena %>%
247	260	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
248	261	dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
249		- }
250		- if(IDF == 0){
	262	+ } else if(IDF == 0){
251	263	#No information on this particular GPL file
252	264	idLOCGPL <- genena %>%
253	265	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
...	...	@@ -260,8 +272,7 @@ if(clfileex == 0){
260	272	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
261	273	dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
262	274	}
263		- }
264		- if(fileex == 0){
	275	+ } else if(fileex == 0){
265	276	#We must create a file that we can access for later use
266	277	idLOCGPL <- genena %>%
267	278	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
...	...	@@ -275,8 +286,7 @@ if(clfileex == 0){
275	286	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
276	287	dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
277	288	}
278		- }
279		- if(soft == FALSE){
	289	+ } else if(soft == FALSE){
280	290	geneIDNam <- genena %>%
281	291	read_delim(delim="\t",comment = "#")%>%
282	292	dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
...	...	@@ -295,7 +305,7 @@ if(clfileex == 0){
295	305
296	306
297	307	##Changing the gene ID to gene name
298		-ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
	308	+ALZDAT1 <- cgeneID(geneIDNam,alzdat)
299	309	colnames(ALZDAT) = ALZDAT1[1,]
300	310
301	311
...	...	@@ -338,8 +348,13 @@ RAWWORD <- rawword[-hereim,] %>%
338	348	z <- 1
339	349	naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
340	350	for(z in 1:dim(RAWWORD)[1]){
341		- naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
342		- z <- z + 1
	351	+ if(sum(is.na(RAWWORD[z,])) > 0){
	352	+ naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
	353	+ }
	354	+ if(length(grep("NA",RAWWORD[z,])) > 0){
	355	+ naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
	356	+ }
	357	+ z <- z + 1
343	358	}
344	359
345	360	colnames(naroww) <- "ROW_NAs"
...	...	@@ -378,9 +393,8 @@ for(j in 1:length(tabRDATID)){
378	393	##Putting the ones without duplicates in their new homes
379	394	if(tabRDATID[j] == 1){
380	395	NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
381		- }
	396	+ } else if(tabRDATID[j] > 1){
382	397	##Averaging duplicates and putting them in their new homes
383		- if(tabRDATID[j] > 1){
384	398	NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
385	399	}
386	400	j <- j + 1