Merge branch 'master' of smlg.fiu.edu:efraingonzalez0/cleaning-and-fixing-data-with-r

Efrain Gonzalez
2 parents fe3623221f 8953eaff1b
Showing 1 changed file Show diff stats
RAutoClDs.R
@@ -28,29 +28,29 @@ chngrownm <- function(mat){
 		if("!Sample_source_name_ch1"==mat[1,e]){
 			colnames(mat)[e] <- "Brain_Region"	
 		} 
-		if("!Sample_title" == mat[1,e]){
+		else if("!Sample_title" == mat[1,e]){
 			colnames(mat)[e] <- "Title"
 		} 
-		if("!Sample_geo_accession" == mat[1,e]){
+		else if("!Sample_geo_accession" == mat[1,e]){
 			colnames(mat)[e] <- "ID_REF"
 		} else{
 			if(grepl("Sex|gender|Gender|sex",mat[2,e])==TRUE){
 				colnames(mat)[e] <- paste0("Sex",r)
 				r = r + 1
 			}
-			if(grepl("postmorteminterval|PMI|pmi",mat[2,e])==TRUE){
+			else if(grepl("postmorteminterval|PMI|pmi",mat[2,e])==TRUE){
 				colnames(mat)[e] <- paste0("PMI",a)
 				a = a + 1
 			}
-			if(grepl("age|Age|AGE",mat[2,e])==TRUE){
+			else if(grepl("age|Age|AGE",mat[2,e])==TRUE){
 				colnames(mat)[e] <- paste0("Age",h)
 				h = h + 1
 			 }
-			if(grepl("braak|b&b",mat[2,e])==TRUE){
+			else if(grepl("braak|b&b",mat[2,e])==TRUE){
 				colnames(mat)[e] <- paste0("Braak",g)
 				g = g + 1
 			}
-			if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){
+			else if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){
 				colnames(mat)[e] <- paste0("Group",o)
 				o = o + 1
 			}
@@ -69,18 +69,18 @@ cinfo <- function(mat){
 		if(grepl("Group",colnames(mat)[j]) == TRUE){
 			mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
 		}
-		if(grepl("Age",colnames(mat)[j])==TRUE){
+		else if(grepl("Age",colnames(mat)[j])==TRUE){
 			mat[,j] <- gsub("\\D","",mat[,j])%>%
 				as.integer()
 		}
-		if(grepl("Sex",colnames(mat)[j])==TRUE){
+		else if(grepl("Sex",colnames(mat)[j])==TRUE){
 			mat[,j] <- gsub(".+:\\s","",mat[,j])
 		}
-		if(grepl("PMI",colnames(mat)[j])==TRUE){
+		else if(grepl("PMI",colnames(mat)[j])==TRUE){
 			mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
 				as.numeric() 
 		}
-		if(grepl("Braak",colnames(mat)[j])==TRUE){
+		else if(grepl("Braak",colnames(mat)[j])==TRUE){
 			mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
 				as.roman()%>%
 				as.integer()
@@ -235,7 +235,7 @@ THEFT <- function(){
 				read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
  
 			} 
-			if(clfileex == 0){
+			else if(clfileex == 0){
 			##Lets Create a clean version
  
 			##Gene ID to Gene Name
@@ -259,7 +259,7 @@ THEFT <- function(){
 								read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
 								dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 						}
-						if(IDF == 0){
+						else if(IDF == 0){
 							#No information on this particular GPL file
 							idLOCGPL <- genena %>%
 								read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
@@ -273,7 +273,7 @@ THEFT <- function(){
 								dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 						}
 					}
-					if(fileex == 0){
+					else if(fileex == 0){
 						#We must create a file that we can access for later use
 						idLOCGPL <- genena %>%
 							read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
@@ -288,7 +288,7 @@ THEFT <- function(){
 							dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 					}
 	 			}
-				if(soft == FALSE){
+				else if(soft == FALSE){
 					geneIDNam <- genena %>%
 						read_delim(delim="\t",comment = "#")%>%
 						dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
@@ -391,7 +391,7 @@ THEFT <- function(){
 					NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
 				}
 				##Averaging duplicates and putting them in their new homes
-				if(tabRDATID[j] > 1){
+				else if(tabRDATID[j] > 1){
 					NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
 				}
 				j <- j + 1
@@ -461,7 +461,7 @@ THEFT <- function(){
 	}
  
 	#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
-	if(numDAT == 2){
+	else if(numDAT == 2){
 		#All the files you want to analyze
 		ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
 		if(length(ANDIS) == 0){
@@ -524,7 +524,7 @@ THEFT <- function(){
 					read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
  
 				} 
-				if(clfileex == 0){
+				else if(clfileex == 0){
 				##Lets Create a clean version
  
 				##Gene ID to Gene Name
@@ -548,7 +548,7 @@ THEFT <- function(){
 									read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
 									dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 							}
-							if(IDF == 0){
+							else if(IDF == 0){
 								#No information on this particular GPL file
 								idLOCGPL <- genena %>%
 									read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
@@ -562,7 +562,7 @@ THEFT <- function(){
 									dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 							}
 						}
-						if(fileex == 0){
+						else if(fileex == 0){
 							#We must create a file that we can access for later use
 							idLOCGPL <- genena %>%
 								read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
@@ -577,7 +577,7 @@ THEFT <- function(){
 								dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 						}
 		 			}
-					if(soft == FALSE){
+					else if(soft == FALSE){
 						geneIDNam <- genena %>%
 							read_delim(delim="\t",comment = "#")%>%
 							dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
@@ -680,7 +680,7 @@ THEFT <- function(){
 						NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
 					}
 					##Averaging duplicates and putting them in their new homes
-					if(tabRDATID[j] > 1){
+					else if(tabRDATID[j] > 1){
 						NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
 					}
 					j <- j + 1
...	...	@@ -28,29 +28,29 @@ chngrownm <- function(mat){
28	28	if("!Sample_source_name_ch1"==mat[1,e]){
29	29	colnames(mat)[e] <- "Brain_Region"
30	30	}
31		- if("!Sample_title" == mat[1,e]){
	31	+ else if("!Sample_title" == mat[1,e]){
32	32	colnames(mat)[e] <- "Title"
33	33	}
34		- if("!Sample_geo_accession" == mat[1,e]){
	34	+ else if("!Sample_geo_accession" == mat[1,e]){
35	35	colnames(mat)[e] <- "ID_REF"
36	36	} else{
37	37	if(grepl("Sex\|gender\|Gender\|sex",mat[2,e])==TRUE){
38	38	colnames(mat)[e] <- paste0("Sex",r)
39	39	r = r + 1
40	40	}
41		- if(grepl("postmorteminterval\|PMI\|pmi",mat[2,e])==TRUE){
	41	+ else if(grepl("postmorteminterval\|PMI\|pmi",mat[2,e])==TRUE){
42	42	colnames(mat)[e] <- paste0("PMI",a)
43	43	a = a + 1
44	44	}
45		- if(grepl("age\|Age\|AGE",mat[2,e])==TRUE){
	45	+ else if(grepl("age\|Age\|AGE",mat[2,e])==TRUE){
46	46	colnames(mat)[e] <- paste0("Age",h)
47	47	h = h + 1
48	48	}
49		- if(grepl("braak\|b&b",mat[2,e])==TRUE){
	49	+ else if(grepl("braak\|b&b",mat[2,e])==TRUE){
50	50	colnames(mat)[e] <- paste0("Braak",g)
51	51	g = g + 1
52	52	}
53		- if(grepl("group\|disease\|control\|AD\|normal\|diagnosis\|Alzheimer\|Control\|Normal",mat[2,e])==TRUE){
	53	+ else if(grepl("group\|disease\|control\|AD\|normal\|diagnosis\|Alzheimer\|Control\|Normal",mat[2,e])==TRUE){
54	54	colnames(mat)[e] <- paste0("Group",o)
55	55	o = o + 1
56	56	}
...	...	@@ -69,18 +69,18 @@ cinfo <- function(mat){
69	69	if(grepl("Group",colnames(mat)[j]) == TRUE){
70	70	mat[,j] <- gsub(".+:\\s\|\\s.+;.+","",mat[,j])
71	71	}
72		- if(grepl("Age",colnames(mat)[j])==TRUE){
	72	+ else if(grepl("Age",colnames(mat)[j])==TRUE){
73	73	mat[,j] <- gsub("\\D","",mat[,j])%>%
74	74	as.integer()
75	75	}
76		- if(grepl("Sex",colnames(mat)[j])==TRUE){
	76	+ else if(grepl("Sex",colnames(mat)[j])==TRUE){
77	77	mat[,j] <- gsub(".+:\\s","",mat[,j])
78	78	}
79		- if(grepl("PMI",colnames(mat)[j])==TRUE){
	79	+ else if(grepl("PMI",colnames(mat)[j])==TRUE){
80	80	mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
81	81	as.numeric()
82	82	}
83		- if(grepl("Braak",colnames(mat)[j])==TRUE){
	83	+ else if(grepl("Braak",colnames(mat)[j])==TRUE){
84	84	mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
85	85	as.roman()%>%
86	86	as.integer()
...	...	@@ -235,7 +235,7 @@ THEFT <- function(){
235	235	read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
236	236
237	237	}
238		- if(clfileex == 0){
	238	+ else if(clfileex == 0){
239	239	##Lets Create a clean version
240	240
241	241	##Gene ID to Gene Name
...	...	@@ -259,7 +259,7 @@ THEFT <- function(){
259	259	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
260	260	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
261	261	}
262		- if(IDF == 0){
	262	+ else if(IDF == 0){
263	263	#No information on this particular GPL file
264	264	idLOCGPL <- genena %>%
265	265	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
...	...	@@ -273,7 +273,7 @@ THEFT <- function(){
273	273	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
274	274	}
275	275	}
276		- if(fileex == 0){
	276	+ else if(fileex == 0){
277	277	#We must create a file that we can access for later use
278	278	idLOCGPL <- genena %>%
279	279	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
...	...	@@ -288,7 +288,7 @@ THEFT <- function(){
288	288	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
289	289	}
290	290	}
291		- if(soft == FALSE){
	291	+ else if(soft == FALSE){
292	292	geneIDNam <- genena %>%
293	293	read_delim(delim="\t",comment = "#")%>%
294	294	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
...	...	@@ -391,7 +391,7 @@ THEFT <- function(){
391	391	NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
392	392	}
393	393	##Averaging duplicates and putting them in their new homes
394		- if(tabRDATID[j] > 1){
	394	+ else if(tabRDATID[j] > 1){
395	395	NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
396	396	}
397	397	j <- j + 1
...	...	@@ -461,7 +461,7 @@ THEFT <- function(){
461	461	}
462	462
463	463	#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
464		- if(numDAT == 2){
	464	+ else if(numDAT == 2){
465	465	#All the files you want to analyze
466	466	ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
467	467	if(length(ANDIS) == 0){
...	...	@@ -524,7 +524,7 @@ THEFT <- function(){
524	524	read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
525	525
526	526	}
527		- if(clfileex == 0){
	527	+ else if(clfileex == 0){
528	528	##Lets Create a clean version
529	529
530	530	##Gene ID to Gene Name
...	...	@@ -548,7 +548,7 @@ THEFT <- function(){
548	548	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
549	549	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
550	550	}
551		- if(IDF == 0){
	551	+ else if(IDF == 0){
552	552	#No information on this particular GPL file
553	553	idLOCGPL <- genena %>%
554	554	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
...	...	@@ -562,7 +562,7 @@ THEFT <- function(){
562	562	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
563	563	}
564	564	}
565		- if(fileex == 0){
	565	+ else if(fileex == 0){
566	566	#We must create a file that we can access for later use
567	567	idLOCGPL <- genena %>%
568	568	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
...	...	@@ -577,7 +577,7 @@ THEFT <- function(){
577	577	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
578	578	}
579	579	}
580		- if(soft == FALSE){
	580	+ else if(soft == FALSE){
581	581	geneIDNam <- genena %>%
582	582	read_delim(delim="\t",comment = "#")%>%
583	583	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
...	...	@@ -680,7 +680,7 @@ THEFT <- function(){
680	680	NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
681	681	}
682	682	##Averaging duplicates and putting them in their new homes
683		- if(tabRDATID[j] > 1){
	683	+ else if(tabRDATID[j] > 1){
684	684	NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
685	685	}
686	686	j <- j + 1