Merge branch 'master' of smlg.fiu.edu:efraingonzalez0/cleaning-and-fixing-data-with-r

Efrain Gonzalez
2 parents 01a09e19c4 edf2baf695
Showing 2 changed files Show diff stats
RAutoClDs.R
RCleanDscret.R
@@ -2,8 +2,8 @@
 #               Don't Use This Code Just Yet                           #
 ########################################################################
 #Efrain H. Gonzalez
-#6/16/2017
-
+#6/21/2017
+options(digits = 11)
 #Libraries required to run the code
 library(pryr)
 library(MASS)
@@ -27,30 +27,28 @@ chngrownm <- function(mat){
 	for(e in 1:col){
 		if("!Sample_source_name_ch1"==mat[1,e]){
 			colnames(mat)[e] <- "Brain_Region"	
-		} 
-		else if("!Sample_title" == mat[1,e]){
+		} else if("!Sample_title" == mat[1,e]){
 			colnames(mat)[e] <- "Title"
-		} 
-		else if("!Sample_geo_accession" == mat[1,e]){
+		} else if("!Sample_geo_accession" == mat[1,e]){
 			colnames(mat)[e] <- "ID_REF"
 		} else{
 			if(grepl("Sex|gender|Gender|sex",mat[2,e])==TRUE){
 				colnames(mat)[e] <- paste0("Sex",r)
 				r = r + 1
 			}
-			else if(grepl("postmorteminterval|PMI|pmi",mat[2,e])==TRUE){
+			if(grepl("postmorteminterval|PMI|pmi|interval",mat[2,e])==TRUE){
 				colnames(mat)[e] <- paste0("PMI",a)
 				a = a + 1
 			}
-			else if(grepl("age|Age|AGE",mat[2,e])==TRUE){
+			if(grepl("age|Age|AGE",mat[2,e])==TRUE){
 				colnames(mat)[e] <- paste0("Age",h)
 				h = h + 1
 			 }
-			else if(grepl("braak|b&b",mat[2,e])==TRUE){
+			if(grepl("braak|b&b",mat[2,e])==TRUE){
 				colnames(mat)[e] <- paste0("Braak",g)
 				g = g + 1
 			}
-			else if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){
+			if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){
 				colnames(mat)[e] <- paste0("Group",o)
 				o = o + 1
 			}
@@ -68,19 +66,15 @@ cinfo <- function(mat){
 	for(j in 2:col){
 		if(grepl("Group",colnames(mat)[j]) == TRUE){
 			mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
-		}
-		else if(grepl("Age",colnames(mat)[j])==TRUE){
+		} else if(grepl("Age",colnames(mat)[j])==TRUE){
 			mat[,j] <- gsub("\\D","",mat[,j])%>%
 				as.integer()
-		}
-		else if(grepl("Sex",colnames(mat)[j])==TRUE){
+		} else if(grepl("Sex",colnames(mat)[j])==TRUE){
 			mat[,j] <- gsub(".+:\\s","",mat[,j])
-		}
-		else if(grepl("PMI",colnames(mat)[j])==TRUE){
+		} else if(grepl("PMI",colnames(mat)[j])==TRUE){
 			mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
 				as.numeric() 
-		}
-		else if(grepl("Braak",colnames(mat)[j])==TRUE){
+		} else if(grepl("Braak",colnames(mat)[j])==TRUE){
 			mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
 				as.roman()%>%
 				as.integer()
@@ -105,19 +99,37 @@ NAFIXING <- function(GIDNAM){
  
 #4#Function for changing the gene ID to gene name
 cgeneID <- function(GeneName,DATA){
-    colGene <- dim(GeneName)[2]
-     j <- 1
-     for(j in 1:colGene){
-	chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
-	if(is.na(sum(chngsreq))==FALSE){
-		if(sum(chngsreq) > 0){
-			DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
+	nj <- t(GeneName)
+	nq <- t(DATA)
+	colGene <- dim(nj)[2]
+	colDATA <- dim(nq)[2]
+	j <- 1
+	for(j in 1:colDATA){
+		#where is that gene id located within the GPL file
+		chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,])
+		if(is.na(sum(chngreq))==FALSE){
+			if(sum(chngreq) > 0){
+			nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j])
+			}
 		}
+		j <- j + 1
 	}
-	j = j+1
-	}
-	DATA
+	nq
 }
+#cgeneID <- function(GeneName,DATA){
+#    colGene <- dim(GeneName)[2]
+#     j <- 1
+#     for(j in 1:colGene){
+#	chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
+#	if(is.na(sum(chngsreq))==FALSE){
+#		if(sum(chngsreq) > 0){
+#			DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
+#		}
+#	}
+#	j = j+1
+#	}
+#	DATA
+#}
  
 #5#Function for adjusting the gene names
 gcnames <- function(DiData,usecol=1){
@@ -150,11 +162,9 @@ dndat <- function(NDATA){
  
 				if(NDATA[i,j] < -1){
 					DDATA[i,j]=0L
-				}
-				if(NDATA[i,j] > 1){
+				} else if(NDATA[i,j] > 1){
 					DDATA[i,j]=2L
-				}
-				if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
+				} else if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
 					DDATA[i,j]=1L
 				}
 			} else{
@@ -176,13 +186,13 @@ THEFT <- function(){
 	#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")
 	numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
 	GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())
-	
+	GSEfloc <- list.files()[GSEfileloc]
 	#ALL DATA FILES WILL BE CLEANED
 	if(numDAT == 1){
 		#indexing the data files
 		n <- 1	
-		for(n in 1: length(GSEfileloc)){
-			alz <- list.files()[GSEfileloc[n]]
+		for(n in 1: length(GSEfloc)){
+			alz <- GSEfloc[n]
  
 			#Working with the wordy part of the document
 			alzword <- alz %>%
@@ -234,8 +244,7 @@ THEFT <- function(){
 			geneIDNam <-  paste0("Clean_GPL",gplnum,".txt") %>%
 				read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
  
-			} 
-			else if(clfileex == 0){
+			} else if(clfileex == 0){
 			##Lets Create a clean version
  
 			##Gene ID to Gene Name
@@ -258,8 +267,7 @@ THEFT <- function(){
 							geneIDNam <- genena %>%
 								read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
 								dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
-						}
-						else if(IDF == 0){
+						} else if(IDF == 0){
 							#No information on this particular GPL file
 							idLOCGPL <- genena %>%
 								read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
@@ -272,8 +280,7 @@ THEFT <- function(){
 								read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
 								dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 						}
-					}
-					else if(fileex == 0){
+					} else if(fileex == 0){
 						#We must create a file that we can access for later use
 						idLOCGPL <- genena %>%
 							read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
@@ -287,8 +294,7 @@ THEFT <- function(){
 							read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
 							dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 					}
-	 			}
-				else if(soft == FALSE){
+	 			} else if(soft == FALSE){
 					geneIDNam <- genena %>%
 						read_delim(delim="\t",comment = "#")%>%
 						dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
@@ -307,7 +313,7 @@ THEFT <- function(){
  
  
 			##Changing the gene ID to gene name
-			ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
+			ALZDAT1 <- cgeneID(geneIDNam,alzdat)
 			colnames(ALZDAT) = ALZDAT1[1,]
  
  
@@ -350,9 +356,14 @@ THEFT <- function(){
 			z <- 1
 			naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
 			for(z in 1:dim(RAWWORD)[1]){
-				naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
-				z <- z + 1
-			}
+                if(sum(is.na(RAWWORD[z,])) > 0){
+				    naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
+				}
+				if(length(grep("NA",RAWWORD[z,])) > 0){
+                naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
+                }
+                z <- z + 1
+            }
  
 			colnames(naroww) <- "ROW_NAs"
 			RAWWORD <- bind_cols(RAWWORD,naroww)
@@ -389,9 +400,8 @@ THEFT <- function(){
 				##Putting the ones without duplicates in their new homes
 				if(tabRDATID[j] == 1){
 					NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
-				}
-				##Averaging duplicates and putting them in their new homes
-				else if(tabRDATID[j] > 1){
+				} else if(tabRDATID[j] > 1){
+				    ##Averaging duplicates and putting them in their new homes
 					NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
 				}
 				j <- j + 1
@@ -458,10 +468,9 @@ THEFT <- function(){
 			write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
 			n <- n +1
 		}
-	}
-	
+	} else if(numDAT == 2){
 	#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
-	else if(numDAT == 2){
+
 		#All the files you want to analyze
 		ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
 		if(length(ANDIS) == 0){
@@ -523,8 +532,7 @@ THEFT <- function(){
 				geneIDNam <-  paste0("Clean_GPL",gplnum,".txt") %>%
 					read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
  
-				} 
-				else if(clfileex == 0){
+				} else if(clfileex == 0){
 				##Lets Create a clean version
  
 				##Gene ID to Gene Name
@@ -547,8 +555,7 @@ THEFT <- function(){
 								geneIDNam <- genena %>%
 									read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
 									dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
-							}
-							else if(IDF == 0){
+							} else if(IDF == 0){
 								#No information on this particular GPL file
 								idLOCGPL <- genena %>%
 									read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
@@ -561,8 +568,7 @@ THEFT <- function(){
 									read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
 									dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 							}
-						}
-						else if(fileex == 0){
+						} else if(fileex == 0){
 							#We must create a file that we can access for later use
 							idLOCGPL <- genena %>%
 								read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
@@ -576,8 +582,7 @@ THEFT <- function(){
 								read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
 								dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 						}
-		 			}
-					else if(soft == FALSE){
+		 			} else if(soft == FALSE){
 						geneIDNam <- genena %>%
 							read_delim(delim="\t",comment = "#")%>%
 							dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
@@ -596,7 +601,7 @@ THEFT <- function(){
  
  
 				##Changing the gene ID to gene name
-				ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
+				ALZDAT1 <- cgeneID(geneIDNam,alzdat)
 				colnames(ALZDAT) = ALZDAT1[1,]
  
  
@@ -639,9 +644,14 @@ THEFT <- function(){
 				z <- 1
 				naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
 				for(z in 1:dim(RAWWORD)[1]){
-					naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
-					z <- z + 1
-				}
+                    if(sum(is.na(RAWWORD[z,])) > 0){
+                        naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
+                    }
+					if(length(grep("NA",RAWWORD[z,])) > 0){
+                        naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
+                    }
+                    z <- z + 1
+                }
  
 				colnames(naroww) <- "ROW_NAs"
 				RAWWORD <- bind_cols(RAWWORD,naroww)
@@ -678,9 +688,8 @@ THEFT <- function(){
 					##Putting the ones without duplicates in their new homes
 					if(tabRDATID[j] == 1){
 						NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
-					}
+					} else if(tabRDATID[j] > 1){
 					##Averaging duplicates and putting them in their new homes
-					else if(tabRDATID[j] > 1){
 						NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
 					}
 					j <- j + 1
 ##Posted 6/15/2017
-
+options(digits = 11)
  
 #Libraries required to run the code
 library(pryr)
@@ -24,11 +24,9 @@ chngrownm <- function(mat){
 	for(j in 1:col){
 		if("!Sample_source_name_ch1"==mat[1,j]){
 			colnames(mat)[j] <- "Brain_Region"	
-		} 
-		if("!Sample_title" == mat[1,j]){
+		} else if("!Sample_title" == mat[1,j]){
 			colnames(mat)[j] <- "Title"
-		} 
-		if("!Sample_geo_accession" == mat[1,j]){
+		} else if("!Sample_geo_accession" == mat[1,j]){
 			colnames(mat)[j] <- "ID_REF"
 		} else{
 			if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){
@@ -102,23 +100,41 @@ NAFIXING <- function(GIDNAM){
  
 #4#Function for changing the gene ID to gene name
 cgeneID <- function(GeneName,DATA){
-    colGene <- dim(GeneName)[2]
-     j <- 1
-     for(j in 1:colGene){
-	chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
-	if(is.na(sum(chngsreq))==FALSE){
-		if(sum(chngsreq) > 0){
-			DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
+	nj <- t(GeneName)
+	nq <- t(DATA)
+	colGene <- dim(nj)[2]
+	colDATA <- dim(nq)[2]
+	j <- 1
+	for(j in 1:colDATA){
+		#where is that gene id located within the GPL file
+		chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,])
+		if(is.na(sum(chngreq))==FALSE){
+			if(sum(chngreq) > 0){
+			nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j])
+			}
 		}
+		j <- j + 1
 	}
-		#if(sum(chngsreq) > 0){
-		##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
-		#DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
-		#}
-	j = j+1
-	}
-	DATA
+	nq
 }
+#cgeneID <- function(GeneName,DATA){
+#    colGene <- dim(GeneName)[2]
+#     j <- 1
+#     for(j in 1:colGene){
+#	chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
+#	if(is.na(sum(chngsreq))==FALSE){
+#		if(sum(chngsreq) > 0){
+#			DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
+#		}
+#	}
+#		#if(sum(chngsreq) > 0){
+#		##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
+#		#DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
+#		#}
+#	j = j+1
+#	}
+#	DATA
+#}
  
 #5#Function for adjusting the gene names
 gcnames <- function(DiData,usecol=1){
@@ -151,11 +167,9 @@ dndat <- function(NDATA){
  
 				if(NDATA[i,j] < -1){
 					DDATA[i,j]=0L
-				}
-				if(NDATA[i,j] > 1){
+				} else if(NDATA[i,j] > 1){
 					DDATA[i,j]=2L
-				}
-				if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
+				} else if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
 					DDATA[i,j]=1L
 				}
 			} else{
@@ -222,8 +236,7 @@ if(clfileex >= 1){
 geneIDNam <-  paste0("Clean_GPL",gplnum,".txt") %>%
 	read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
  
-} 
-if(clfileex == 0){
+} else if(clfileex == 0){
 ##Lets Create a clean version
  
 ##Gene ID to Gene Name
@@ -246,8 +259,7 @@ if(clfileex == 0){
 				geneIDNam <- genena %>%
 					read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
 					dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
-			}
-			if(IDF == 0){
+			} else if(IDF == 0){
 				#No information on this particular GPL file
 				idLOCGPL <- genena %>%
 					read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
@@ -260,8 +272,7 @@ if(clfileex == 0){
 					read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
 					dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 			}
-		}
-		if(fileex == 0){
+		} else if(fileex == 0){
 			#We must create a file that we can access for later use
 			idLOCGPL <- genena %>%
 				read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
@@ -275,8 +286,7 @@ if(clfileex == 0){
 				read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
 				dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 		}
-	 }
-	if(soft == FALSE){
+	 } else if(soft == FALSE){
 		geneIDNam <- genena %>%
 		read_delim(delim="\t",comment = "#")%>%
 		dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
@@ -295,7 +305,7 @@ if(clfileex == 0){
  
  
 ##Changing the gene ID to gene name
-ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
+ALZDAT1 <- cgeneID(geneIDNam,alzdat)
 colnames(ALZDAT) = ALZDAT1[1,]
  
  
@@ -338,8 +348,13 @@ RAWWORD <- rawword[-hereim,] %>%
 z <- 1
 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
 for(z in 1:dim(RAWWORD)[1]){
-	naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
-	z <- z + 1
+    if(sum(is.na(RAWWORD[z,])) > 0){
+        naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
+    }
+	if(length(grep("NA",RAWWORD[z,])) > 0){
+        naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
+    }
+    z <- z + 1
 }
  
 colnames(naroww) <- "ROW_NAs"
@@ -378,9 +393,8 @@ for(j in 1:length(tabRDATID)){
 	##Putting the ones without duplicates in their new homes
 	if(tabRDATID[j] == 1){
 		NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
-	}
+	} else if(tabRDATID[j] > 1){
 	##Averaging duplicates and putting them in their new homes
-	if(tabRDATID[j] > 1){
 		NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
 	}
 	j <- j + 1
...	...	@@ -2,8 +2,8 @@
2	2	# Don't Use This Code Just Yet #
3	3	########################################################################
4	4	#Efrain H. Gonzalez
5		-#6/16/2017
6		-
	5	+#6/21/2017
	6	+options(digits = 11)
7	7	#Libraries required to run the code
8	8	library(pryr)
9	9	library(MASS)
...	...	@@ -27,30 +27,28 @@ chngrownm <- function(mat){
27	27	for(e in 1:col){
28	28	if("!Sample_source_name_ch1"==mat[1,e]){
29	29	colnames(mat)[e] <- "Brain_Region"
30		- }
31		- else if("!Sample_title" == mat[1,e]){
	30	+ } else if("!Sample_title" == mat[1,e]){
32	31	colnames(mat)[e] <- "Title"
33		- }
34		- else if("!Sample_geo_accession" == mat[1,e]){
	32	+ } else if("!Sample_geo_accession" == mat[1,e]){
35	33	colnames(mat)[e] <- "ID_REF"
36	34	} else{
37	35	if(grepl("Sex|gender|Gender|sex",mat[2,e])==TRUE){
38	36	colnames(mat)[e] <- paste0("Sex",r)
39	37	r = r + 1
40	38	}
41		- else if(grepl("postmorteminterval|PMI|pmi",mat[2,e])==TRUE){
	39	+ if(grepl("postmorteminterval|PMI|pmi|interval",mat[2,e])==TRUE){
42	40	colnames(mat)[e] <- paste0("PMI",a)
43	41	a = a + 1
44	42	}
45		- else if(grepl("age|Age|AGE",mat[2,e])==TRUE){
	43	+ if(grepl("age|Age|AGE",mat[2,e])==TRUE){
46	44	colnames(mat)[e] <- paste0("Age",h)
47	45	h = h + 1
48	46	}
49		- else if(grepl("braak|b&b",mat[2,e])==TRUE){
	47	+ if(grepl("braak|b&b",mat[2,e])==TRUE){
50	48	colnames(mat)[e] <- paste0("Braak",g)
51	49	g = g + 1
52	50	}
53		- else if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){
	51	+ if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){
54	52	colnames(mat)[e] <- paste0("Group",o)
55	53	o = o + 1
56	54	}
...	...	@@ -68,19 +66,15 @@ cinfo <- function(mat){
68	66	for(j in 2:col){
69	67	if(grepl("Group",colnames(mat)[j]) == TRUE){
70	68	mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
71		- }
72		- else if(grepl("Age",colnames(mat)[j])==TRUE){
	69	+ } else if(grepl("Age",colnames(mat)[j])==TRUE){
73	70	mat[,j] <- gsub("\\D","",mat[,j])%>%
74	71	as.integer()
75		- }
76		- else if(grepl("Sex",colnames(mat)[j])==TRUE){
	72	+ } else if(grepl("Sex",colnames(mat)[j])==TRUE){
77	73	mat[,j] <- gsub(".+:\\s","",mat[,j])
78		- }
79		- else if(grepl("PMI",colnames(mat)[j])==TRUE){
	74	+ } else if(grepl("PMI",colnames(mat)[j])==TRUE){
80	75	mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
81	76	as.numeric()
82		- }
83		- else if(grepl("Braak",colnames(mat)[j])==TRUE){
	77	+ } else if(grepl("Braak",colnames(mat)[j])==TRUE){
84	78	mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
85	79	as.roman()%>%
86	80	as.integer()
...	...	@@ -105,19 +99,37 @@ NAFIXING <- function(GIDNAM){
105	99
106	100	#4#Function for changing the gene ID to gene name
107	101	cgeneID <- function(GeneName,DATA){
108		- colGene <- dim(GeneName)[2]
109		- j <- 1
110		- for(j in 1:colGene){
111		- chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
112		- if(is.na(sum(chngsreq))==FALSE){
113		- if(sum(chngsreq) > 0){
114		- DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
	102	+ nj <- t(GeneName)
	103	+ nq <- t(DATA)
	104	+ colGene <- dim(nj)[2]
	105	+ colDATA <- dim(nq)[2]
	106	+ j <- 1
	107	+ for(j in 1:colDATA){
	108	+ #where is that gene id located within the GPL file
	109	+ chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,])
	110	+ if(is.na(sum(chngreq))==FALSE){
	111	+ if(sum(chngreq) > 0){
	112	+ nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j])
	113	+ }
115	114	}
	115	+ j <- j + 1
116	116	}
117		- j = j+1
118		- }
119		- DATA
	117	+ nq
120	118	}
	119	+#cgeneID <- function(GeneName,DATA){
	120	+# colGene <- dim(GeneName)[2]
	121	+# j <- 1
	122	+# for(j in 1:colGene){
	123	+# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
	124	+# if(is.na(sum(chngsreq))==FALSE){
	125	+# if(sum(chngsreq) > 0){
	126	+# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
	127	+# }
	128	+# }
	129	+# j = j+1
	130	+# }
	131	+# DATA
	132	+#}
121	133
122	134	#5#Function for adjusting the gene names
123	135	gcnames <- function(DiData,usecol=1){
...	...	@@ -150,11 +162,9 @@ dndat <- function(NDATA){
150	162
151	163	if(NDATA[i,j] < -1){
152	164	DDATA[i,j]=0L
153		- }
154		- if(NDATA[i,j] > 1){
	165	+ } else if(NDATA[i,j] > 1){
155	166	DDATA[i,j]=2L
156		- }
157		- if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
	167	+ } else if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
158	168	DDATA[i,j]=1L
159	169	}
160	170	} else{
...	...	@@ -176,13 +186,13 @@ THEFT <- function(){
176	186	#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")
177	187	numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
178	188	GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())
179		-
	189	+ GSEfloc <- list.files()[GSEfileloc]
180	190	#ALL DATA FILES WILL BE CLEANED
181	191	if(numDAT == 1){
182	192	#indexing the data files
183	193	n <- 1
184		- for(n in 1: length(GSEfileloc)){
185		- alz <- list.files()[GSEfileloc[n]]
	194	+ for(n in 1: length(GSEfloc)){
	195	+ alz <- GSEfloc[n]
186	196
187	197	#Working with the wordy part of the document
188	198	alzword <- alz %>%
...	...	@@ -234,8 +244,7 @@ THEFT <- function(){
234	244	geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
235	245	read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
236	246
237		- }
238		- else if(clfileex == 0){
	247	+ } else if(clfileex == 0){
239	248	##Lets Create a clean version
240	249
241	250	##Gene ID to Gene Name
...	...	@@ -258,8 +267,7 @@ THEFT <- function(){
258	267	geneIDNam <- genena %>%
259	268	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
260	269	dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
261		- }
262		- else if(IDF == 0){
	270	+ } else if(IDF == 0){
263	271	#No information on this particular GPL file
264	272	idLOCGPL <- genena %>%
265	273	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
...	...	@@ -272,8 +280,7 @@ THEFT <- function(){
272	280	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
273	281	dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
274	282	}
275		- }
276		- else if(fileex == 0){
	283	+ } else if(fileex == 0){
277	284	#We must create a file that we can access for later use
278	285	idLOCGPL <- genena %>%
279	286	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
...	...	@@ -287,8 +294,7 @@ THEFT <- function(){
287	294	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
288	295	dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
289	296	}
290		- }
291		- else if(soft == FALSE){
	297	+ } else if(soft == FALSE){
292	298	geneIDNam <- genena %>%
293	299	read_delim(delim="\t",comment = "#")%>%
294	300	dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
...	...	@@ -307,7 +313,7 @@ THEFT <- function(){
307	313
308	314
309	315	##Changing the gene ID to gene name
310		- ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
	316	+ ALZDAT1 <- cgeneID(geneIDNam,alzdat)
311	317	colnames(ALZDAT) = ALZDAT1[1,]
312	318
313	319
...	...	@@ -350,9 +356,14 @@ THEFT <- function(){
350	356	z <- 1
351	357	naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
352	358	for(z in 1:dim(RAWWORD)[1]){
353		- naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
354		- z <- z + 1
355		- }
	359	+ if(sum(is.na(RAWWORD[z,])) > 0){
	360	+ naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
	361	+ }
	362	+ if(length(grep("NA",RAWWORD[z,])) > 0){
	363	+ naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
	364	+ }
	365	+ z <- z + 1
	366	+ }
356	367
357	368	colnames(naroww) <- "ROW_NAs"
358	369	RAWWORD <- bind_cols(RAWWORD,naroww)
...	...	@@ -389,9 +400,8 @@ THEFT <- function(){
389	400	##Putting the ones without duplicates in their new homes
390	401	if(tabRDATID[j] == 1){
391	402	NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
392		- }
393		- ##Averaging duplicates and putting them in their new homes
394		- else if(tabRDATID[j] > 1){
	403	+ } else if(tabRDATID[j] > 1){
	404	+ ##Averaging duplicates and putting them in their new homes
395	405	NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
396	406	}
397	407	j <- j + 1
...	...	@@ -458,10 +468,9 @@ THEFT <- function(){
458	468	write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
459	469	n <- n +1
460	470	}
461		- }
462		-
	471	+ } else if(numDAT == 2){
463	472	#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
464		- else if(numDAT == 2){
	473	+
465	474	#All the files you want to analyze
466	475	ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
467	476	if(length(ANDIS) == 0){
...	...	@@ -523,8 +532,7 @@ THEFT <- function(){
523	532	geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
524	533	read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
525	534
526		- }
527		- else if(clfileex == 0){
	535	+ } else if(clfileex == 0){
528	536	##Lets Create a clean version
529	537
530	538	##Gene ID to Gene Name
...	...	@@ -547,8 +555,7 @@ THEFT <- function(){
547	555	geneIDNam <- genena %>%
548	556	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
549	557	dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
550		- }
551		- else if(IDF == 0){
	558	+ } else if(IDF == 0){
552	559	#No information on this particular GPL file
553	560	idLOCGPL <- genena %>%
554	561	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
...	...	@@ -561,8 +568,7 @@ THEFT <- function(){
561	568	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
562	569	dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
563	570	}
564		- }
565		- else if(fileex == 0){
	571	+ } else if(fileex == 0){
566	572	#We must create a file that we can access for later use
567	573	idLOCGPL <- genena %>%
568	574	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
...	...	@@ -576,8 +582,7 @@ THEFT <- function(){
576	582	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
577	583	dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
578	584	}
579		- }
580		- else if(soft == FALSE){
	585	+ } else if(soft == FALSE){
581	586	geneIDNam <- genena %>%
582	587	read_delim(delim="\t",comment = "#")%>%
583	588	dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
...	...	@@ -596,7 +601,7 @@ THEFT <- function(){
596	601
597	602
598	603	##Changing the gene ID to gene name
599		- ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
	604	+ ALZDAT1 <- cgeneID(geneIDNam,alzdat)
600	605	colnames(ALZDAT) = ALZDAT1[1,]
601	606
602	607
...	...	@@ -639,9 +644,14 @@ THEFT <- function(){
639	644	z <- 1
640	645	naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
641	646	for(z in 1:dim(RAWWORD)[1]){
642		- naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
643		- z <- z + 1
644		- }
	647	+ if(sum(is.na(RAWWORD[z,])) > 0){
	648	+ naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
	649	+ }
	650	+ if(length(grep("NA",RAWWORD[z,])) > 0){
	651	+ naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
	652	+ }
	653	+ z <- z + 1
	654	+ }
645	655
646	656	colnames(naroww) <- "ROW_NAs"
647	657	RAWWORD <- bind_cols(RAWWORD,naroww)
...	...	@@ -678,9 +688,8 @@ THEFT <- function(){
678	688	##Putting the ones without duplicates in their new homes
679	689	if(tabRDATID[j] == 1){
680	690	NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
681		- }
	691	+ } else if(tabRDATID[j] > 1){
682	692	##Averaging duplicates and putting them in their new homes
683		- else if(tabRDATID[j] > 1){
684	693	NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
685	694	}
686	695	j <- j + 1
1	1	##Posted 6/15/2017
2		-
	2	+options(digits = 11)
3	3
4	4	#Libraries required to run the code
5	5	library(pryr)
...	...	@@ -24,11 +24,9 @@ chngrownm <- function(mat){
24	24	for(j in 1:col){
25	25	if("!Sample_source_name_ch1"==mat[1,j]){
26	26	colnames(mat)[j] <- "Brain_Region"
27		- }
28		- if("!Sample_title" == mat[1,j]){
	27	+ } else if("!Sample_title" == mat[1,j]){
29	28	colnames(mat)[j] <- "Title"
30		- }
31		- if("!Sample_geo_accession" == mat[1,j]){
	29	+ } else if("!Sample_geo_accession" == mat[1,j]){
32	30	colnames(mat)[j] <- "ID_REF"
33	31	} else{
34	32	if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){
...	...	@@ -102,23 +100,41 @@ NAFIXING <- function(GIDNAM){
102	100
103	101	#4#Function for changing the gene ID to gene name
104	102	cgeneID <- function(GeneName,DATA){
105		- colGene <- dim(GeneName)[2]
106		- j <- 1
107		- for(j in 1:colGene){
108		- chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
109		- if(is.na(sum(chngsreq))==FALSE){
110		- if(sum(chngsreq) > 0){
111		- DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
	103	+ nj <- t(GeneName)
	104	+ nq <- t(DATA)
	105	+ colGene <- dim(nj)[2]
	106	+ colDATA <- dim(nq)[2]
	107	+ j <- 1
	108	+ for(j in 1:colDATA){
	109	+ #where is that gene id located within the GPL file
	110	+ chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,])
	111	+ if(is.na(sum(chngreq))==FALSE){
	112	+ if(sum(chngreq) > 0){
	113	+ nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j])
	114	+ }
112	115	}
	116	+ j <- j + 1
113	117	}
114		- #if(sum(chngsreq) > 0){
115		- ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
116		- #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
117		- #}
118		- j = j+1
119		- }
120		- DATA
	118	+ nq
121	119	}
	120	+#cgeneID <- function(GeneName,DATA){
	121	+# colGene <- dim(GeneName)[2]
	122	+# j <- 1
	123	+# for(j in 1:colGene){
	124	+# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
	125	+# if(is.na(sum(chngsreq))==FALSE){
	126	+# if(sum(chngsreq) > 0){
	127	+# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
	128	+# }
	129	+# }
	130	+# #if(sum(chngsreq) > 0){
	131	+# ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
	132	+# #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
	133	+# #}
	134	+# j = j+1
	135	+# }
	136	+# DATA
	137	+#}
122	138
123	139	#5#Function for adjusting the gene names
124	140	gcnames <- function(DiData,usecol=1){
...	...	@@ -151,11 +167,9 @@ dndat <- function(NDATA){
151	167
152	168	if(NDATA[i,j] < -1){
153	169	DDATA[i,j]=0L
154		- }
155		- if(NDATA[i,j] > 1){
	170	+ } else if(NDATA[i,j] > 1){
156	171	DDATA[i,j]=2L
157		- }
158		- if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
	172	+ } else if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
159	173	DDATA[i,j]=1L
160	174	}
161	175	} else{
...	...	@@ -222,8 +236,7 @@ if(clfileex >= 1){
222	236	geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
223	237	read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
224	238
225		-}
226		-if(clfileex == 0){
	239	+} else if(clfileex == 0){
227	240	##Lets Create a clean version
228	241
229	242	##Gene ID to Gene Name
...	...	@@ -246,8 +259,7 @@ if(clfileex == 0){
246	259	geneIDNam <- genena %>%
247	260	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
248	261	dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
249		- }
250		- if(IDF == 0){
	262	+ } else if(IDF == 0){
251	263	#No information on this particular GPL file
252	264	idLOCGPL <- genena %>%
253	265	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
...	...	@@ -260,8 +272,7 @@ if(clfileex == 0){
260	272	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
261	273	dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
262	274	}
263		- }
264		- if(fileex == 0){
	275	+ } else if(fileex == 0){
265	276	#We must create a file that we can access for later use
266	277	idLOCGPL <- genena %>%
267	278	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
...	...	@@ -275,8 +286,7 @@ if(clfileex == 0){
275	286	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
276	287	dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
277	288	}
278		- }
279		- if(soft == FALSE){
	289	+ } else if(soft == FALSE){
280	290	geneIDNam <- genena %>%
281	291	read_delim(delim="\t",comment = "#")%>%
282	292	dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
...	...	@@ -295,7 +305,7 @@ if(clfileex == 0){
295	305
296	306
297	307	##Changing the gene ID to gene name
298		-ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
	308	+ALZDAT1 <- cgeneID(geneIDNam,alzdat)
299	309	colnames(ALZDAT) = ALZDAT1[1,]
300	310
301	311
...	...	@@ -338,8 +348,13 @@ RAWWORD <- rawword[-hereim,] %>%
338	348	z <- 1
339	349	naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
340	350	for(z in 1:dim(RAWWORD)[1]){
341		- naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
342		- z <- z + 1
	351	+ if(sum(is.na(RAWWORD[z,])) > 0){
	352	+ naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
	353	+ }
	354	+ if(length(grep("NA",RAWWORD[z,])) > 0){
	355	+ naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
	356	+ }
	357	+ z <- z + 1
343	358	}
344	359
345	360	colnames(naroww) <- "ROW_NAs"
...	...	@@ -378,9 +393,8 @@ for(j in 1:length(tabRDATID)){
378	393	##Putting the ones without duplicates in their new homes
379	394	if(tabRDATID[j] == 1){
380	395	NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
381		- }
	396	+ } else if(tabRDATID[j] > 1){
382	397	##Averaging duplicates and putting them in their new homes
383		- if(tabRDATID[j] > 1){
384	398	NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
385	399	}
386	400	j <- j + 1