Commit eccb7a19e29c5a6300ce75a7154eac8089de2a0b
Exists in
master
Merge branch 'master' of smlg.fiu.edu:efraingonzalez0/cleaning-and-fixing-data-with-r
Showing
2 changed files
Show diff stats
RAutoClDs.R
... | ... | @@ -2,8 +2,8 @@ |
2 | 2 | # Don't Use This Code Just Yet # |
3 | 3 | ######################################################################## |
4 | 4 | #Efrain H. Gonzalez |
5 | -#6/16/2017 | |
6 | - | |
5 | +#6/21/2017 | |
6 | +options(digits = 11) | |
7 | 7 | #Libraries required to run the code |
8 | 8 | library(pryr) |
9 | 9 | library(MASS) |
... | ... | @@ -27,30 +27,28 @@ chngrownm <- function(mat){ |
27 | 27 | for(e in 1:col){ |
28 | 28 | if("!Sample_source_name_ch1"==mat[1,e]){ |
29 | 29 | colnames(mat)[e] <- "Brain_Region" |
30 | - } | |
31 | - else if("!Sample_title" == mat[1,e]){ | |
30 | + } else if("!Sample_title" == mat[1,e]){ | |
32 | 31 | colnames(mat)[e] <- "Title" |
33 | - } | |
34 | - else if("!Sample_geo_accession" == mat[1,e]){ | |
32 | + } else if("!Sample_geo_accession" == mat[1,e]){ | |
35 | 33 | colnames(mat)[e] <- "ID_REF" |
36 | 34 | } else{ |
37 | 35 | if(grepl("Sex|gender|Gender|sex",mat[2,e])==TRUE){ |
38 | 36 | colnames(mat)[e] <- paste0("Sex",r) |
39 | 37 | r = r + 1 |
40 | 38 | } |
41 | - else if(grepl("postmorteminterval|PMI|pmi",mat[2,e])==TRUE){ | |
39 | + if(grepl("postmorteminterval|PMI|pmi|interval",mat[2,e])==TRUE){ | |
42 | 40 | colnames(mat)[e] <- paste0("PMI",a) |
43 | 41 | a = a + 1 |
44 | 42 | } |
45 | - else if(grepl("age|Age|AGE",mat[2,e])==TRUE){ | |
43 | + if(grepl("age|Age|AGE",mat[2,e])==TRUE){ | |
46 | 44 | colnames(mat)[e] <- paste0("Age",h) |
47 | 45 | h = h + 1 |
48 | 46 | } |
49 | - else if(grepl("braak|b&b",mat[2,e])==TRUE){ | |
47 | + if(grepl("braak|b&b",mat[2,e])==TRUE){ | |
50 | 48 | colnames(mat)[e] <- paste0("Braak",g) |
51 | 49 | g = g + 1 |
52 | 50 | } |
53 | - else if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){ | |
51 | + if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){ | |
54 | 52 | colnames(mat)[e] <- paste0("Group",o) |
55 | 53 | o = o + 1 |
56 | 54 | } |
... | ... | @@ -68,19 +66,15 @@ cinfo <- function(mat){ |
68 | 66 | for(j in 2:col){ |
69 | 67 | if(grepl("Group",colnames(mat)[j]) == TRUE){ |
70 | 68 | mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j]) |
71 | - } | |
72 | - else if(grepl("Age",colnames(mat)[j])==TRUE){ | |
69 | + } else if(grepl("Age",colnames(mat)[j])==TRUE){ | |
73 | 70 | mat[,j] <- gsub("\\D","",mat[,j])%>% |
74 | 71 | as.integer() |
75 | - } | |
76 | - else if(grepl("Sex",colnames(mat)[j])==TRUE){ | |
72 | + } else if(grepl("Sex",colnames(mat)[j])==TRUE){ | |
77 | 73 | mat[,j] <- gsub(".+:\\s","",mat[,j]) |
78 | - } | |
79 | - else if(grepl("PMI",colnames(mat)[j])==TRUE){ | |
74 | + } else if(grepl("PMI",colnames(mat)[j])==TRUE){ | |
80 | 75 | mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>% |
81 | 76 | as.numeric() |
82 | - } | |
83 | - else if(grepl("Braak",colnames(mat)[j])==TRUE){ | |
77 | + } else if(grepl("Braak",colnames(mat)[j])==TRUE){ | |
84 | 78 | mat[,j]<-gsub(".+:\\s","",mat[,j])%>% |
85 | 79 | as.roman()%>% |
86 | 80 | as.integer() |
... | ... | @@ -105,19 +99,37 @@ NAFIXING <- function(GIDNAM){ |
105 | 99 | |
106 | 100 | #4#Function for changing the gene ID to gene name |
107 | 101 | cgeneID <- function(GeneName,DATA){ |
108 | - colGene <- dim(GeneName)[2] | |
109 | - j <- 1 | |
110 | - for(j in 1:colGene){ | |
111 | - chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) | |
112 | - if(is.na(sum(chngsreq))==FALSE){ | |
113 | - if(sum(chngsreq) > 0){ | |
114 | - DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
102 | + nj <- t(GeneName) | |
103 | + nq <- t(DATA) | |
104 | + colGene <- dim(nj)[2] | |
105 | + colDATA <- dim(nq)[2] | |
106 | + j <- 1 | |
107 | + for(j in 1:colDATA){ | |
108 | + #Where is that gene ID located within the GPL file? | |
109 | + chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,]) | |
110 | + if(is.na(sum(chngreq))==FALSE){ | |
111 | + if(sum(chngreq) > 0){ | |
112 | + nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j]) | |
113 | + } | |
115 | 114 | } |
115 | + j <- j + 1 | |
116 | 116 | } |
117 | - j = j+1 | |
118 | - } | |
119 | - DATA | |
117 | + nq | |
120 | 118 | } |
119 | +#cgeneID <- function(GeneName,DATA){ | |
120 | +# colGene <- dim(GeneName)[2] | |
121 | +# j <- 1 | |
122 | +# for(j in 1:colGene){ | |
123 | +# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) | |
124 | +# if(is.na(sum(chngsreq))==FALSE){ | |
125 | +# if(sum(chngsreq) > 0){ | |
126 | +# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
127 | +# } | |
128 | +# } | |
129 | +# j = j+1 | |
130 | +# } | |
131 | +# DATA | |
132 | +#} | |
121 | 133 | |
122 | 134 | #5#Function for adjusting the gene names |
123 | 135 | gcnames <- function(DiData,usecol=1){ |
... | ... | @@ -150,11 +162,9 @@ dndat <- function(NDATA){ |
150 | 162 | |
151 | 163 | if(NDATA[i,j] < -1){ |
152 | 164 | DDATA[i,j]=0L |
153 | - } | |
154 | - if(NDATA[i,j] > 1){ | |
165 | + } else if(NDATA[i,j] > 1){ | |
155 | 166 | DDATA[i,j]=2L |
156 | - } | |
157 | - if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){ | |
167 | + } else if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){ | |
158 | 168 | DDATA[i,j]=1L |
159 | 169 | } |
160 | 170 | } else{ |
... | ... | @@ -176,13 +186,13 @@ THEFT <- function(){ |
176 | 186 | #gsub("wd",wd,"Do you want to clean all data files in the directory wd?") |
177 | 187 | numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L) |
178 | 188 | GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files()) |
179 | - | |
189 | + GSEfloc <- list.files()[GSEfileloc] | |
180 | 190 | #ALL DATA FILES WILL BE CLEANED |
181 | 191 | if(numDAT == 1){ |
182 | 192 | #indexing the data files |
183 | 193 | n <- 1 |
184 | - for(n in 1: length(GSEfileloc)){ | |
185 | - alz <- list.files()[GSEfileloc[n]] | |
194 | + for(n in 1: length(GSEfloc)){ | |
195 | + alz <- GSEfloc[n] | |
186 | 196 | |
187 | 197 | #Working with the wordy part of the document |
188 | 198 | alzword <- alz %>% |
... | ... | @@ -234,8 +244,7 @@ THEFT <- function(){ |
234 | 244 | geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% |
235 | 245 | read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") |
236 | 246 | |
237 | - } | |
238 | - else if(clfileex == 0){ | |
247 | + } else if(clfileex == 0){ | |
239 | 248 | ##Let's create a clean version |
240 | 249 | |
241 | 250 | ##Gene ID to Gene Name |
... | ... | @@ -258,8 +267,7 @@ THEFT <- function(){ |
258 | 267 | geneIDNam <- genena %>% |
259 | 268 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% |
260 | 269 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
261 | - } | |
262 | - else if(IDF == 0){ | |
270 | + } else if(IDF == 0){ | |
263 | 271 | #No information on this particular GPL file |
264 | 272 | idLOCGPL <- genena %>% |
265 | 273 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% |
... | ... | @@ -272,8 +280,7 @@ THEFT <- function(){ |
272 | 280 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% |
273 | 281 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
274 | 282 | } |
275 | - } | |
276 | - else if(fileex == 0){ | |
283 | + } else if(fileex == 0){ | |
277 | 284 | #We must create a file that we can access for later use |
278 | 285 | idLOCGPL <- genena %>% |
279 | 286 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% |
... | ... | @@ -287,8 +294,7 @@ THEFT <- function(){ |
287 | 294 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% |
288 | 295 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
289 | 296 | } |
290 | - } | |
291 | - else if(soft == FALSE){ | |
297 | + } else if(soft == FALSE){ | |
292 | 298 | geneIDNam <- genena %>% |
293 | 299 | read_delim(delim="\t",comment = "#")%>% |
294 | 300 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
... | ... | @@ -307,7 +313,7 @@ THEFT <- function(){ |
307 | 313 | |
308 | 314 | |
309 | 315 | ##Changing the gene ID to gene name |
310 | - ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) | |
316 | + ALZDAT1 <- cgeneID(geneIDNam,alzdat) | |
311 | 317 | colnames(ALZDAT) = ALZDAT1[1,] |
312 | 318 | |
313 | 319 | |
... | ... | @@ -350,9 +356,14 @@ THEFT <- function(){ |
350 | 356 | z <- 1 |
351 | 357 | naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) |
352 | 358 | for(z in 1:dim(RAWWORD)[1]){ |
353 | - naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
354 | - z <- z + 1 | |
355 | - } | |
359 | + if(sum(is.na(RAWWORD[z,])) > 0){ | |
360 | + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
361 | + } | |
362 | + if(length(grep("NA",RAWWORD[z,])) > 0){ | |
363 | + naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] | |
364 | + } | |
365 | + z <- z + 1 | |
366 | + } | |
356 | 367 | |
357 | 368 | colnames(naroww) <- "ROW_NAs" |
358 | 369 | RAWWORD <- bind_cols(RAWWORD,naroww) |
... | ... | @@ -389,9 +400,8 @@ THEFT <- function(){ |
389 | 400 | ##Putting the ones without duplicates in their new homes |
390 | 401 | if(tabRDATID[j] == 1){ |
391 | 402 | NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] |
392 | - } | |
393 | - ##Averaging duplicates and putting them in their new homes | |
394 | - else if(tabRDATID[j] > 1){ | |
403 | + } else if(tabRDATID[j] > 1){ | |
404 | + ##Averaging duplicates and putting them in their new homes | |
395 | 405 | NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) |
396 | 406 | } |
397 | 407 | j <- j + 1 |
... | ... | @@ -458,10 +468,9 @@ THEFT <- function(){ |
458 | 468 | write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) |
459 | 469 | n <- n +1 |
460 | 470 | } |
461 | - } | |
462 | - | |
471 | + } else if(numDAT == 2){ | |
463 | 472 | #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN |
464 | - else if(numDAT == 2){ | |
473 | + | |
465 | 474 | #All the files you want to analyze |
466 | 475 | ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:") |
467 | 476 | if(length(ANDIS) == 0){ |
... | ... | @@ -523,8 +532,7 @@ THEFT <- function(){ |
523 | 532 | geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% |
524 | 533 | read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") |
525 | 534 | |
526 | - } | |
527 | - else if(clfileex == 0){ | |
535 | + } else if(clfileex == 0){ | |
528 | 536 | ##Let's create a clean version |
529 | 537 | |
530 | 538 | ##Gene ID to Gene Name |
... | ... | @@ -547,8 +555,7 @@ THEFT <- function(){ |
547 | 555 | geneIDNam <- genena %>% |
548 | 556 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% |
549 | 557 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
550 | - } | |
551 | - else if(IDF == 0){ | |
558 | + } else if(IDF == 0){ | |
552 | 559 | #No information on this particular GPL file |
553 | 560 | idLOCGPL <- genena %>% |
554 | 561 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% |
... | ... | @@ -561,8 +568,7 @@ THEFT <- function(){ |
561 | 568 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% |
562 | 569 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
563 | 570 | } |
564 | - } | |
565 | - else if(fileex == 0){ | |
571 | + } else if(fileex == 0){ | |
566 | 572 | #We must create a file that we can access for later use |
567 | 573 | idLOCGPL <- genena %>% |
568 | 574 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% |
... | ... | @@ -576,8 +582,7 @@ THEFT <- function(){ |
576 | 582 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% |
577 | 583 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
578 | 584 | } |
579 | - } | |
580 | - else if(soft == FALSE){ | |
585 | + } else if(soft == FALSE){ | |
581 | 586 | geneIDNam <- genena %>% |
582 | 587 | read_delim(delim="\t",comment = "#")%>% |
583 | 588 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
... | ... | @@ -596,7 +601,7 @@ THEFT <- function(){ |
596 | 601 | |
597 | 602 | |
598 | 603 | ##Changing the gene ID to gene name |
599 | - ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) | |
604 | + ALZDAT1 <- cgeneID(geneIDNam,alzdat) | |
600 | 605 | colnames(ALZDAT) = ALZDAT1[1,] |
601 | 606 | |
602 | 607 | |
... | ... | @@ -639,9 +644,14 @@ THEFT <- function(){ |
639 | 644 | z <- 1 |
640 | 645 | naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) |
641 | 646 | for(z in 1:dim(RAWWORD)[1]){ |
642 | - naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
643 | - z <- z + 1 | |
644 | - } | |
647 | + if(sum(is.na(RAWWORD[z,])) > 0){ | |
648 | + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
649 | + } | |
650 | + if(length(grep("NA",RAWWORD[z,])) > 0){ | |
651 | + naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] | |
652 | + } | |
653 | + z <- z + 1 | |
654 | + } | |
645 | 655 | |
646 | 656 | colnames(naroww) <- "ROW_NAs" |
647 | 657 | RAWWORD <- bind_cols(RAWWORD,naroww) |
... | ... | @@ -678,9 +688,8 @@ THEFT <- function(){ |
678 | 688 | ##Putting the ones without duplicates in their new homes |
679 | 689 | if(tabRDATID[j] == 1){ |
680 | 690 | NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] |
681 | - } | |
691 | + } else if(tabRDATID[j] > 1){ | |
682 | 692 | ##Averaging duplicates and putting them in their new homes |
683 | - else if(tabRDATID[j] > 1){ | |
684 | 693 | NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) |
685 | 694 | } |
686 | 695 | j <- j + 1 |
RCleanDscret.R
1 | 1 | ##Posted 6/15/2017 |
2 | - | |
2 | +options(digits = 11) | |
3 | 3 | |
4 | 4 | #Libraries required to run the code |
5 | 5 | library(pryr) |
... | ... | @@ -24,11 +24,9 @@ chngrownm <- function(mat){ |
24 | 24 | for(j in 1:col){ |
25 | 25 | if("!Sample_source_name_ch1"==mat[1,j]){ |
26 | 26 | colnames(mat)[j] <- "Brain_Region" |
27 | - } | |
28 | - if("!Sample_title" == mat[1,j]){ | |
27 | + } else if("!Sample_title" == mat[1,j]){ | |
29 | 28 | colnames(mat)[j] <- "Title" |
30 | - } | |
31 | - if("!Sample_geo_accession" == mat[1,j]){ | |
29 | + } else if("!Sample_geo_accession" == mat[1,j]){ | |
32 | 30 | colnames(mat)[j] <- "ID_REF" |
33 | 31 | } else{ |
34 | 32 | if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){ |
... | ... | @@ -102,23 +100,41 @@ NAFIXING <- function(GIDNAM){ |
102 | 100 | |
103 | 101 | #4#Function for changing the gene ID to gene name |
104 | 102 | cgeneID <- function(GeneName,DATA){ |
105 | - colGene <- dim(GeneName)[2] | |
106 | - j <- 1 | |
107 | - for(j in 1:colGene){ | |
108 | - chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) | |
109 | - if(is.na(sum(chngsreq))==FALSE){ | |
110 | - if(sum(chngsreq) > 0){ | |
111 | - DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
103 | + nj <- t(GeneName) | |
104 | + nq <- t(DATA) | |
105 | + colGene <- dim(nj)[2] | |
106 | + colDATA <- dim(nq)[2] | |
107 | + j <- 1 | |
108 | + for(j in 1:colDATA){ | |
109 | + #Where is that gene ID located within the GPL file? | |
110 | + chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,]) | |
111 | + if(is.na(sum(chngreq))==FALSE){ | |
112 | + if(sum(chngreq) > 0){ | |
113 | + nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j]) | |
114 | + } | |
112 | 115 | } |
116 | + j <- j + 1 | |
113 | 117 | } |
114 | - #if(sum(chngsreq) > 0){ | |
115 | - ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq]) | |
116 | - #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
117 | - #} | |
118 | - j = j+1 | |
119 | - } | |
120 | - DATA | |
118 | + nq | |
121 | 119 | } |
120 | +#cgeneID <- function(GeneName,DATA){ | |
121 | +# colGene <- dim(GeneName)[2] | |
122 | +# j <- 1 | |
123 | +# for(j in 1:colGene){ | |
124 | +# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) | |
125 | +# if(is.na(sum(chngsreq))==FALSE){ | |
126 | +# if(sum(chngsreq) > 0){ | |
127 | +# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
128 | +# } | |
129 | +# } | |
130 | +# #if(sum(chngsreq) > 0){ | |
131 | +# ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq]) | |
132 | +# #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
133 | +# #} | |
134 | +# j = j+1 | |
135 | +# } | |
136 | +# DATA | |
137 | +#} | |
122 | 138 | |
123 | 139 | #5#Function for adjusting the gene names |
124 | 140 | gcnames <- function(DiData,usecol=1){ |
... | ... | @@ -151,11 +167,9 @@ dndat <- function(NDATA){ |
151 | 167 | |
152 | 168 | if(NDATA[i,j] < -1){ |
153 | 169 | DDATA[i,j]=0L |
154 | - } | |
155 | - if(NDATA[i,j] > 1){ | |
170 | + } else if(NDATA[i,j] > 1){ | |
156 | 171 | DDATA[i,j]=2L |
157 | - } | |
158 | - if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){ | |
172 | + } else if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){ | |
159 | 173 | DDATA[i,j]=1L |
160 | 174 | } |
161 | 175 | } else{ |
... | ... | @@ -222,8 +236,7 @@ if(clfileex >= 1){ |
222 | 236 | geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% |
223 | 237 | read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") |
224 | 238 | |
225 | -} | |
226 | -if(clfileex == 0){ | |
239 | +} else if(clfileex == 0){ | |
227 | 240 | ##Let's create a clean version |
228 | 241 | |
229 | 242 | ##Gene ID to Gene Name |
... | ... | @@ -246,8 +259,7 @@ if(clfileex == 0){ |
246 | 259 | geneIDNam <- genena %>% |
247 | 260 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% |
248 | 261 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
249 | - } | |
250 | - if(IDF == 0){ | |
262 | + } else if(IDF == 0){ | |
251 | 263 | #No information on this particular GPL file |
252 | 264 | idLOCGPL <- genena %>% |
253 | 265 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% |
... | ... | @@ -260,8 +272,7 @@ if(clfileex == 0){ |
260 | 272 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% |
261 | 273 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
262 | 274 | } |
263 | - } | |
264 | - if(fileex == 0){ | |
275 | + } else if(fileex == 0){ | |
265 | 276 | #We must create a file that we can access for later use |
266 | 277 | idLOCGPL <- genena %>% |
267 | 278 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% |
... | ... | @@ -275,8 +286,7 @@ if(clfileex == 0){ |
275 | 286 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% |
276 | 287 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
277 | 288 | } |
278 | - } | |
279 | - if(soft == FALSE){ | |
289 | + } else if(soft == FALSE){ | |
280 | 290 | geneIDNam <- genena %>% |
281 | 291 | read_delim(delim="\t",comment = "#")%>% |
282 | 292 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
... | ... | @@ -295,7 +305,7 @@ if(clfileex == 0){ |
295 | 305 | |
296 | 306 | |
297 | 307 | ##Changing the gene ID to gene name |
298 | -ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) | |
308 | +ALZDAT1 <- cgeneID(geneIDNam,alzdat) | |
299 | 309 | colnames(ALZDAT) = ALZDAT1[1,] |
300 | 310 | |
301 | 311 | |
... | ... | @@ -338,8 +348,13 @@ RAWWORD <- rawword[-hereim,] %>% |
338 | 348 | z <- 1 |
339 | 349 | naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) |
340 | 350 | for(z in 1:dim(RAWWORD)[1]){ |
341 | - naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
342 | - z <- z + 1 | |
351 | + if(sum(is.na(RAWWORD[z,])) > 0){ | |
352 | + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
353 | + } | |
354 | + if(length(grep("NA",RAWWORD[z,])) > 0){ | |
355 | + naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] | |
356 | + } | |
357 | + z <- z + 1 | |
343 | 358 | } |
344 | 359 | |
345 | 360 | colnames(naroww) <- "ROW_NAs" |
... | ... | @@ -378,9 +393,8 @@ for(j in 1:length(tabRDATID)){ |
378 | 393 | ##Putting the ones without duplicates in their new homes |
379 | 394 | if(tabRDATID[j] == 1){ |
380 | 395 | NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] |
381 | - } | |
396 | + } else if(tabRDATID[j] > 1){ | |
382 | 397 | ##Averaging duplicates and putting them in their new homes |
383 | - if(tabRDATID[j] > 1){ | |
384 | 398 | NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) |
385 | 399 | } |
386 | 400 | j <- j + 1 |