Commit eccb7a19e29c5a6300ce75a7154eac8089de2a0b

Authored by Efrain Gonzalez
Exists in master

Merge branch 'master' of smlg.fiu.edu:efraingonzalez0/cleaning-and-fixing-data-with-r

... ... @@ -2,8 +2,8 @@
2 2 # Don't Use This Code Just Yet #
3 3 ########################################################################
4 4 #Efrain H. Gonzalez
5   -#6/16/2017
6   -
  5 +#6/21/2017
  6 +options(digits = 11)
7 7 #Libraries required to run the code
8 8 library(pryr)
9 9 library(MASS)
... ... @@ -27,30 +27,28 @@ chngrownm <- function(mat){
27 27 for(e in 1:col){
28 28 if("!Sample_source_name_ch1"==mat[1,e]){
29 29 colnames(mat)[e] <- "Brain_Region"
30   - }
31   - else if("!Sample_title" == mat[1,e]){
  30 + } else if("!Sample_title" == mat[1,e]){
32 31 colnames(mat)[e] <- "Title"
33   - }
34   - else if("!Sample_geo_accession" == mat[1,e]){
  32 + } else if("!Sample_geo_accession" == mat[1,e]){
35 33 colnames(mat)[e] <- "ID_REF"
36 34 } else{
37 35 if(grepl("Sex|gender|Gender|sex",mat[2,e])==TRUE){
38 36 colnames(mat)[e] <- paste0("Sex",r)
39 37 r = r + 1
40 38 }
41   - else if(grepl("postmorteminterval|PMI|pmi",mat[2,e])==TRUE){
  39 + if(grepl("postmorteminterval|PMI|pmi|interval",mat[2,e])==TRUE){
42 40 colnames(mat)[e] <- paste0("PMI",a)
43 41 a = a + 1
44 42 }
45   - else if(grepl("age|Age|AGE",mat[2,e])==TRUE){
  43 + if(grepl("age|Age|AGE",mat[2,e])==TRUE){
46 44 colnames(mat)[e] <- paste0("Age",h)
47 45 h = h + 1
48 46 }
49   - else if(grepl("braak|b&b",mat[2,e])==TRUE){
  47 + if(grepl("braak|b&b",mat[2,e])==TRUE){
50 48 colnames(mat)[e] <- paste0("Braak",g)
51 49 g = g + 1
52 50 }
53   - else if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){
  51 + if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){
54 52 colnames(mat)[e] <- paste0("Group",o)
55 53 o = o + 1
56 54 }
... ... @@ -68,19 +66,15 @@ cinfo <- function(mat){
68 66 for(j in 2:col){
69 67 if(grepl("Group",colnames(mat)[j]) == TRUE){
70 68 mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
71   - }
72   - else if(grepl("Age",colnames(mat)[j])==TRUE){
  69 + } else if(grepl("Age",colnames(mat)[j])==TRUE){
73 70 mat[,j] <- gsub("\\D","",mat[,j])%>%
74 71 as.integer()
75   - }
76   - else if(grepl("Sex",colnames(mat)[j])==TRUE){
  72 + } else if(grepl("Sex",colnames(mat)[j])==TRUE){
77 73 mat[,j] <- gsub(".+:\\s","",mat[,j])
78   - }
79   - else if(grepl("PMI",colnames(mat)[j])==TRUE){
  74 + } else if(grepl("PMI",colnames(mat)[j])==TRUE){
80 75 mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
81 76 as.numeric()
82   - }
83   - else if(grepl("Braak",colnames(mat)[j])==TRUE){
  77 + } else if(grepl("Braak",colnames(mat)[j])==TRUE){
84 78 mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
85 79 as.roman()%>%
86 80 as.integer()
... ... @@ -105,19 +99,37 @@ NAFIXING <- function(GIDNAM){
105 99  
106 100 #4#Function for changing the gene ID to gene name
107 101 cgeneID <- function(GeneName,DATA){
108   - colGene <- dim(GeneName)[2]
109   - j <- 1
110   - for(j in 1:colGene){
111   - chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
112   - if(is.na(sum(chngsreq))==FALSE){
113   - if(sum(chngsreq) > 0){
114   - DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
  102 + nj <- t(GeneName)
  103 + nq <- t(DATA)
  104 + colGene <- dim(nj)[2]
  105 + colDATA <- dim(nq)[2]
  106 + j <- 1
  107 + for(j in 1:colDATA){
  108 + #where is that gene id located within the GPL file
  109 + chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,])
  110 + if(is.na(sum(chngreq))==FALSE){
  111 + if(sum(chngreq) > 0){
  112 + nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j])
  113 + }
115 114 }
  115 + j <- j + 1
116 116 }
117   - j = j+1
118   - }
119   - DATA
  117 + nq
120 118 }
  119 +#cgeneID <- function(GeneName,DATA){
  120 +# colGene <- dim(GeneName)[2]
  121 +# j <- 1
  122 +# for(j in 1:colGene){
  123 +# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
  124 +# if(is.na(sum(chngsreq))==FALSE){
  125 +# if(sum(chngsreq) > 0){
  126 +# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
  127 +# }
  128 +# }
  129 +# j = j+1
  130 +# }
  131 +# DATA
  132 +#}
121 133  
122 134 #5#Function for adjusting the gene names
123 135 gcnames <- function(DiData,usecol=1){
... ... @@ -150,11 +162,9 @@ dndat <- function(NDATA){
150 162  
151 163 if(NDATA[i,j] < -1){
152 164 DDATA[i,j]=0L
153   - }
154   - if(NDATA[i,j] > 1){
  165 + } else if(NDATA[i,j] > 1){
155 166 DDATA[i,j]=2L
156   - }
157   - if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
  167 + } else if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
158 168 DDATA[i,j]=1L
159 169 }
160 170 } else{
... ... @@ -176,13 +186,13 @@ THEFT <- function(){
176 186 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?")
177 187 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
178 188 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())
179   -
  189 + GSEfloc <- list.files()[GSEfileloc]
180 190 #ALL DATA FILES WILL BE CLEANED
181 191 if(numDAT == 1){
182 192 #indexing the data files
183 193 n <- 1
184   - for(n in 1: length(GSEfileloc)){
185   - alz <- list.files()[GSEfileloc[n]]
  194 + for(n in 1: length(GSEfloc)){
  195 + alz <- GSEfloc[n]
186 196  
187 197 #Working with the wordy part of the document
188 198 alzword <- alz %>%
... ... @@ -234,8 +244,7 @@ THEFT <- function(){
234 244 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
235 245 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
236 246  
237   - }
238   - else if(clfileex == 0){
  247 + } else if(clfileex == 0){
239 248 ##Lets Create a clean version
240 249  
241 250 ##Gene ID to Gene Name
... ... @@ -258,8 +267,7 @@ THEFT <- function(){
258 267 geneIDNam <- genena %>%
259 268 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
260 269 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
261   - }
262   - else if(IDF == 0){
  270 + } else if(IDF == 0){
263 271 #No information on this particular GPL file
264 272 idLOCGPL <- genena %>%
265 273 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
... ... @@ -272,8 +280,7 @@ THEFT <- function(){
272 280 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
273 281 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
274 282 }
275   - }
276   - else if(fileex == 0){
  283 + } else if(fileex == 0){
277 284 #We must create a file that we can access for later use
278 285 idLOCGPL <- genena %>%
279 286 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
... ... @@ -287,8 +294,7 @@ THEFT <- function(){
287 294 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
288 295 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
289 296 }
290   - }
291   - else if(soft == FALSE){
  297 + } else if(soft == FALSE){
292 298 geneIDNam <- genena %>%
293 299 read_delim(delim="\t",comment = "#")%>%
294 300 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
... ... @@ -307,7 +313,7 @@ THEFT <- function(){
307 313  
308 314  
309 315 ##Changing the gene ID to gene name
310   - ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
  316 + ALZDAT1 <- cgeneID(geneIDNam,alzdat)
311 317 colnames(ALZDAT) = ALZDAT1[1,]
312 318  
313 319  
... ... @@ -350,9 +356,14 @@ THEFT <- function(){
350 356 z <- 1
351 357 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
352 358 for(z in 1:dim(RAWWORD)[1]){
353   - naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
354   - z <- z + 1
355   - }
  359 + if(sum(is.na(RAWWORD[z,])) > 0){
  360 + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
  361 + }
  362 + if(length(grep("NA",RAWWORD[z,])) > 0){
  363 + naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
  364 + }
  365 + z <- z + 1
  366 + }
356 367  
357 368 colnames(naroww) <- "ROW_NAs"
358 369 RAWWORD <- bind_cols(RAWWORD,naroww)
... ... @@ -389,9 +400,8 @@ THEFT <- function(){
389 400 ##Putting the ones without duplicates in their new homes
390 401 if(tabRDATID[j] == 1){
391 402 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
392   - }
393   - ##Averaging duplicates and putting them in their new homes
394   - else if(tabRDATID[j] > 1){
  403 + } else if(tabRDATID[j] > 1){
  404 + ##Averaging duplicates and putting them in their new homes
395 405 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
396 406 }
397 407 j <- j + 1
... ... @@ -458,10 +468,9 @@ THEFT <- function(){
458 468 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
459 469 n <- n +1
460 470 }
461   - }
462   -
  471 + } else if(numDAT == 2){
463 472 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
464   - else if(numDAT == 2){
  473 +
465 474 #All the files you want to analyze
466 475 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
467 476 if(length(ANDIS) == 0){
... ... @@ -523,8 +532,7 @@ THEFT <- function(){
523 532 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
524 533 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
525 534  
526   - }
527   - else if(clfileex == 0){
  535 + } else if(clfileex == 0){
528 536 ##Lets Create a clean version
529 537  
530 538 ##Gene ID to Gene Name
... ... @@ -547,8 +555,7 @@ THEFT <- function(){
547 555 geneIDNam <- genena %>%
548 556 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
549 557 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
550   - }
551   - else if(IDF == 0){
  558 + } else if(IDF == 0){
552 559 #No information on this particular GPL file
553 560 idLOCGPL <- genena %>%
554 561 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
... ... @@ -561,8 +568,7 @@ THEFT <- function(){
561 568 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
562 569 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
563 570 }
564   - }
565   - else if(fileex == 0){
  571 + } else if(fileex == 0){
566 572 #We must create a file that we can access for later use
567 573 idLOCGPL <- genena %>%
568 574 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
... ... @@ -576,8 +582,7 @@ THEFT <- function(){
576 582 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
577 583 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
578 584 }
579   - }
580   - else if(soft == FALSE){
  585 + } else if(soft == FALSE){
581 586 geneIDNam <- genena %>%
582 587 read_delim(delim="\t",comment = "#")%>%
583 588 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
... ... @@ -596,7 +601,7 @@ THEFT <- function(){
596 601  
597 602  
598 603 ##Changing the gene ID to gene name
599   - ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
  604 + ALZDAT1 <- cgeneID(geneIDNam,alzdat)
600 605 colnames(ALZDAT) = ALZDAT1[1,]
601 606  
602 607  
... ... @@ -639,9 +644,14 @@ THEFT <- function(){
639 644 z <- 1
640 645 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
641 646 for(z in 1:dim(RAWWORD)[1]){
642   - naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
643   - z <- z + 1
644   - }
  647 + if(sum(is.na(RAWWORD[z,])) > 0){
  648 + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
  649 + }
  650 + if(length(grep("NA",RAWWORD[z,])) > 0){
  651 + naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
  652 + }
  653 + z <- z + 1
  654 + }
645 655  
646 656 colnames(naroww) <- "ROW_NAs"
647 657 RAWWORD <- bind_cols(RAWWORD,naroww)
... ... @@ -678,9 +688,8 @@ THEFT <- function(){
678 688 ##Putting the ones without duplicates in their new homes
679 689 if(tabRDATID[j] == 1){
680 690 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
681   - }
  691 + } else if(tabRDATID[j] > 1){
682 692 ##Averaging duplicates and putting them in their new homes
683   - else if(tabRDATID[j] > 1){
684 693 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
685 694 }
686 695 j <- j + 1
1 1 ##Posted 6/15/2017
2   -
  2 +options(digits = 11)
3 3  
4 4 #Libraries required to run the code
5 5 library(pryr)
... ... @@ -24,11 +24,9 @@ chngrownm <- function(mat){
24 24 for(j in 1:col){
25 25 if("!Sample_source_name_ch1"==mat[1,j]){
26 26 colnames(mat)[j] <- "Brain_Region"
27   - }
28   - if("!Sample_title" == mat[1,j]){
  27 + } else if("!Sample_title" == mat[1,j]){
29 28 colnames(mat)[j] <- "Title"
30   - }
31   - if("!Sample_geo_accession" == mat[1,j]){
  29 + } else if("!Sample_geo_accession" == mat[1,j]){
32 30 colnames(mat)[j] <- "ID_REF"
33 31 } else{
34 32 if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){
... ... @@ -102,23 +100,41 @@ NAFIXING <- function(GIDNAM){
102 100  
103 101 #4#Function for changing the gene ID to gene name
104 102 cgeneID <- function(GeneName,DATA){
105   - colGene <- dim(GeneName)[2]
106   - j <- 1
107   - for(j in 1:colGene){
108   - chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
109   - if(is.na(sum(chngsreq))==FALSE){
110   - if(sum(chngsreq) > 0){
111   - DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
  103 + nj <- t(GeneName)
  104 + nq <- t(DATA)
  105 + colGene <- dim(nj)[2]
  106 + colDATA <- dim(nq)[2]
  107 + j <- 1
  108 + for(j in 1:colDATA){
  109 + #where is that gene id located within the GPL file
  110 + chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,])
  111 + if(is.na(sum(chngreq))==FALSE){
  112 + if(sum(chngreq) > 0){
  113 + nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j])
  114 + }
112 115 }
  116 + j <- j + 1
113 117 }
114   - #if(sum(chngsreq) > 0){
115   - ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
116   - #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
117   - #}
118   - j = j+1
119   - }
120   - DATA
  118 + nq
121 119 }
  120 +#cgeneID <- function(GeneName,DATA){
  121 +# colGene <- dim(GeneName)[2]
  122 +# j <- 1
  123 +# for(j in 1:colGene){
  124 +# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
  125 +# if(is.na(sum(chngsreq))==FALSE){
  126 +# if(sum(chngsreq) > 0){
  127 +# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
  128 +# }
  129 +# }
  130 +# #if(sum(chngsreq) > 0){
  131 +# ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
  132 +# #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
  133 +# #}
  134 +# j = j+1
  135 +# }
  136 +# DATA
  137 +#}
122 138  
123 139 #5#Function for adjusting the gene names
124 140 gcnames <- function(DiData,usecol=1){
... ... @@ -151,11 +167,9 @@ dndat <- function(NDATA){
151 167  
152 168 if(NDATA[i,j] < -1){
153 169 DDATA[i,j]=0L
154   - }
155   - if(NDATA[i,j] > 1){
  170 + } else if(NDATA[i,j] > 1){
156 171 DDATA[i,j]=2L
157   - }
158   - if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
  172 + } else if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
159 173 DDATA[i,j]=1L
160 174 }
161 175 } else{
... ... @@ -222,8 +236,7 @@ if(clfileex >= 1){
222 236 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
223 237 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
224 238  
225   -}
226   -if(clfileex == 0){
  239 +} else if(clfileex == 0){
227 240 ##Lets Create a clean version
228 241  
229 242 ##Gene ID to Gene Name
... ... @@ -246,8 +259,7 @@ if(clfileex == 0){
246 259 geneIDNam <- genena %>%
247 260 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
248 261 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
249   - }
250   - if(IDF == 0){
  262 + } else if(IDF == 0){
251 263 #No information on this particular GPL file
252 264 idLOCGPL <- genena %>%
253 265 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
... ... @@ -260,8 +272,7 @@ if(clfileex == 0){
260 272 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
261 273 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
262 274 }
263   - }
264   - if(fileex == 0){
  275 + } else if(fileex == 0){
265 276 #We must create a file that we can access for later use
266 277 idLOCGPL <- genena %>%
267 278 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
... ... @@ -275,8 +286,7 @@ if(clfileex == 0){
275 286 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
276 287 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
277 288 }
278   - }
279   - if(soft == FALSE){
  289 + } else if(soft == FALSE){
280 290 geneIDNam <- genena %>%
281 291 read_delim(delim="\t",comment = "#")%>%
282 292 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
... ... @@ -295,7 +305,7 @@ if(clfileex == 0){
295 305  
296 306  
297 307 ##Changing the gene ID to gene name
298   -ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
  308 +ALZDAT1 <- cgeneID(geneIDNam,alzdat)
299 309 colnames(ALZDAT) = ALZDAT1[1,]
300 310  
301 311  
... ... @@ -338,8 +348,13 @@ RAWWORD <- rawword[-hereim,] %>%
338 348 z <- 1
339 349 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
340 350 for(z in 1:dim(RAWWORD)[1]){
341   - naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
342   - z <- z + 1
  351 + if(sum(is.na(RAWWORD[z,])) > 0){
  352 + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
  353 + }
  354 + if(length(grep("NA",RAWWORD[z,])) > 0){
  355 + naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
  356 + }
  357 + z <- z + 1
343 358 }
344 359  
345 360 colnames(naroww) <- "ROW_NAs"
... ... @@ -378,9 +393,8 @@ for(j in 1:length(tabRDATID)){
378 393 ##Putting the ones without duplicates in their new homes
379 394 if(tabRDATID[j] == 1){
380 395 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
381   - }
  396 + } else if(tabRDATID[j] > 1){
382 397 ##Averaging duplicates and putting them in their new homes
383   - if(tabRDATID[j] > 1){
384 398 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
385 399 }
386 400 j <- j + 1