Commit eccb7a19e29c5a6300ce75a7154eac8089de2a0b
Exists in
master
Merge branch 'master' of smlg.fiu.edu:efraingonzalez0/cleaning-and-fixing-data-with-r
Showing
2 changed files
 
Show diff stats
RAutoClDs.R
| ... | ... | @@ -2,8 +2,8 @@ | 
| 2 | 2 | # Don't Use This Code Just Yet # | 
| 3 | 3 | ######################################################################## | 
| 4 | 4 | #Efrain H. Gonzalez | 
| 5 | -#6/16/2017 | |
| 6 | - | |
| 5 | +#6/21/2017 | |
| 6 | +options(digits = 11) | |
| 7 | 7 | #Libraries required to run the code | 
| 8 | 8 | library(pryr) | 
| 9 | 9 | library(MASS) | 
| ... | ... | @@ -27,30 +27,28 @@ chngrownm <- function(mat){ | 
| 27 | 27 | for(e in 1:col){ | 
| 28 | 28 | if("!Sample_source_name_ch1"==mat[1,e]){ | 
| 29 | 29 | colnames(mat)[e] <- "Brain_Region" | 
| 30 | - } | |
| 31 | - else if("!Sample_title" == mat[1,e]){ | |
| 30 | + } else if("!Sample_title" == mat[1,e]){ | |
| 32 | 31 | colnames(mat)[e] <- "Title" | 
| 33 | - } | |
| 34 | - else if("!Sample_geo_accession" == mat[1,e]){ | |
| 32 | + } else if("!Sample_geo_accession" == mat[1,e]){ | |
| 35 | 33 | colnames(mat)[e] <- "ID_REF" | 
| 36 | 34 | } else{ | 
| 37 | 35 | if(grepl("Sex|gender|Gender|sex",mat[2,e])==TRUE){ | 
| 38 | 36 | colnames(mat)[e] <- paste0("Sex",r) | 
| 39 | 37 | r = r + 1 | 
| 40 | 38 | } | 
| 41 | - else if(grepl("postmorteminterval|PMI|pmi",mat[2,e])==TRUE){ | |
| 39 | + if(grepl("postmorteminterval|PMI|pmi|interval",mat[2,e])==TRUE){ | |
| 42 | 40 | colnames(mat)[e] <- paste0("PMI",a) | 
| 43 | 41 | a = a + 1 | 
| 44 | 42 | } | 
| 45 | - else if(grepl("age|Age|AGE",mat[2,e])==TRUE){ | |
| 43 | + if(grepl("age|Age|AGE",mat[2,e])==TRUE){ | |
| 46 | 44 | colnames(mat)[e] <- paste0("Age",h) | 
| 47 | 45 | h = h + 1 | 
| 48 | 46 | } | 
| 49 | - else if(grepl("braak|b&b",mat[2,e])==TRUE){ | |
| 47 | + if(grepl("braak|b&b",mat[2,e])==TRUE){ | |
| 50 | 48 | colnames(mat)[e] <- paste0("Braak",g) | 
| 51 | 49 | g = g + 1 | 
| 52 | 50 | } | 
| 53 | - else if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){ | |
| 51 | + if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){ | |
| 54 | 52 | colnames(mat)[e] <- paste0("Group",o) | 
| 55 | 53 | o = o + 1 | 
| 56 | 54 | } | 
| ... | ... | @@ -68,19 +66,15 @@ cinfo <- function(mat){ | 
| 68 | 66 | for(j in 2:col){ | 
| 69 | 67 | if(grepl("Group",colnames(mat)[j]) == TRUE){ | 
| 70 | 68 | mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j]) | 
| 71 | - } | |
| 72 | - else if(grepl("Age",colnames(mat)[j])==TRUE){ | |
| 69 | + } else if(grepl("Age",colnames(mat)[j])==TRUE){ | |
| 73 | 70 | mat[,j] <- gsub("\\D","",mat[,j])%>% | 
| 74 | 71 | as.integer() | 
| 75 | - } | |
| 76 | - else if(grepl("Sex",colnames(mat)[j])==TRUE){ | |
| 72 | + } else if(grepl("Sex",colnames(mat)[j])==TRUE){ | |
| 77 | 73 | mat[,j] <- gsub(".+:\\s","",mat[,j]) | 
| 78 | - } | |
| 79 | - else if(grepl("PMI",colnames(mat)[j])==TRUE){ | |
| 74 | + } else if(grepl("PMI",colnames(mat)[j])==TRUE){ | |
| 80 | 75 | mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>% | 
| 81 | 76 | as.numeric() | 
| 82 | - } | |
| 83 | - else if(grepl("Braak",colnames(mat)[j])==TRUE){ | |
| 77 | + } else if(grepl("Braak",colnames(mat)[j])==TRUE){ | |
| 84 | 78 | mat[,j]<-gsub(".+:\\s","",mat[,j])%>% | 
| 85 | 79 | as.roman()%>% | 
| 86 | 80 | as.integer() | 
| ... | ... | @@ -105,19 +99,37 @@ NAFIXING <- function(GIDNAM){ | 
| 105 | 99 | |
| 106 | 100 | #4#Function for changing the gene ID to gene name | 
| 107 | 101 | cgeneID <- function(GeneName,DATA){ | 
| 108 | - colGene <- dim(GeneName)[2] | |
| 109 | - j <- 1 | |
| 110 | - for(j in 1:colGene){ | |
| 111 | - chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) | |
| 112 | - if(is.na(sum(chngsreq))==FALSE){ | |
| 113 | - if(sum(chngsreq) > 0){ | |
| 114 | - DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
| 102 | + nj <- t(GeneName) | |
| 103 | + nq <- t(DATA) | |
| 104 | + colGene <- dim(nj)[2] | |
| 105 | + colDATA <- dim(nq)[2] | |
| 106 | + j <- 1 | |
| 107 | + for(j in 1:colDATA){ | |
| 108 | + #where is that gene id located within the GPL file | |
| 109 | + chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,]) | |
| 110 | + if(is.na(sum(chngreq))==FALSE){ | |
| 111 | + if(sum(chngreq) > 0){ | |
| 112 | + nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j]) | |
| 113 | + } | |
| 115 | 114 | } | 
| 115 | + j <- j + 1 | |
| 116 | 116 | } | 
| 117 | - j = j+1 | |
| 118 | - } | |
| 119 | - DATA | |
| 117 | + nq | |
| 120 | 118 | } | 
| 119 | +#cgeneID <- function(GeneName,DATA){ | |
| 120 | +# colGene <- dim(GeneName)[2] | |
| 121 | +# j <- 1 | |
| 122 | +# for(j in 1:colGene){ | |
| 123 | +# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) | |
| 124 | +# if(is.na(sum(chngsreq))==FALSE){ | |
| 125 | +# if(sum(chngsreq) > 0){ | |
| 126 | +# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
| 127 | +# } | |
| 128 | +# } | |
| 129 | +# j = j+1 | |
| 130 | +# } | |
| 131 | +# DATA | |
| 132 | +#} | |
| 121 | 133 | |
| 122 | 134 | #5#Function for adjusting the gene names | 
| 123 | 135 | gcnames <- function(DiData,usecol=1){ | 
| ... | ... | @@ -150,11 +162,9 @@ dndat <- function(NDATA){ | 
| 150 | 162 | |
| 151 | 163 | if(NDATA[i,j] < -1){ | 
| 152 | 164 | DDATA[i,j]=0L | 
| 153 | - } | |
| 154 | - if(NDATA[i,j] > 1){ | |
| 165 | + } else if(NDATA[i,j] > 1){ | |
| 155 | 166 | DDATA[i,j]=2L | 
| 156 | - } | |
| 157 | - if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){ | |
| 167 | + } else if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){ | |
| 158 | 168 | DDATA[i,j]=1L | 
| 159 | 169 | } | 
| 160 | 170 | } else{ | 
| ... | ... | @@ -176,13 +186,13 @@ THEFT <- function(){ | 
| 176 | 186 | #gsub("wd",wd,"Do you want to clean all data files in the directory wd?") | 
| 177 | 187 | numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L) | 
| 178 | 188 | GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files()) | 
| 179 | - | |
| 189 | + GSEfloc <- list.files()[GSEfileloc] | |
| 180 | 190 | #ALL DATA FILES WILL BE CLEANED | 
| 181 | 191 | if(numDAT == 1){ | 
| 182 | 192 | #indexing the data files | 
| 183 | 193 | n <- 1 | 
| 184 | - for(n in 1: length(GSEfileloc)){ | |
| 185 | - alz <- list.files()[GSEfileloc[n]] | |
| 194 | + for(n in 1: length(GSEfloc)){ | |
| 195 | + alz <- GSEfloc[n] | |
| 186 | 196 | |
| 187 | 197 | #Working with the wordy part of the document | 
| 188 | 198 | alzword <- alz %>% | 
| ... | ... | @@ -234,8 +244,7 @@ THEFT <- function(){ | 
| 234 | 244 | geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% | 
| 235 | 245 | read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") | 
| 236 | 246 | |
| 237 | - } | |
| 238 | - else if(clfileex == 0){ | |
| 247 | + } else if(clfileex == 0){ | |
| 239 | 248 | ##Let's create a clean version | 
| 240 | 249 | |
| 241 | 250 | ##Gene ID to Gene Name | 
| ... | ... | @@ -258,8 +267,7 @@ THEFT <- function(){ | 
| 258 | 267 | geneIDNam <- genena %>% | 
| 259 | 268 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% | 
| 260 | 269 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | 
| 261 | - } | |
| 262 | - else if(IDF == 0){ | |
| 270 | + } else if(IDF == 0){ | |
| 263 | 271 | #No information on this particular GPL file | 
| 264 | 272 | idLOCGPL <- genena %>% | 
| 265 | 273 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% | 
| ... | ... | @@ -272,8 +280,7 @@ THEFT <- function(){ | 
| 272 | 280 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% | 
| 273 | 281 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | 
| 274 | 282 | } | 
| 275 | - } | |
| 276 | - else if(fileex == 0){ | |
| 283 | + } else if(fileex == 0){ | |
| 277 | 284 | #We must create a file that we can access for later use | 
| 278 | 285 | idLOCGPL <- genena %>% | 
| 279 | 286 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% | 
| ... | ... | @@ -287,8 +294,7 @@ THEFT <- function(){ | 
| 287 | 294 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% | 
| 288 | 295 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | 
| 289 | 296 | } | 
| 290 | - } | |
| 291 | - else if(soft == FALSE){ | |
| 297 | + } else if(soft == FALSE){ | |
| 292 | 298 | geneIDNam <- genena %>% | 
| 293 | 299 | read_delim(delim="\t",comment = "#")%>% | 
| 294 | 300 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | 
| ... | ... | @@ -307,7 +313,7 @@ THEFT <- function(){ | 
| 307 | 313 | |
| 308 | 314 | |
| 309 | 315 | ##Changing the gene ID to gene name | 
| 310 | - ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) | |
| 316 | + ALZDAT1 <- cgeneID(geneIDNam,alzdat) | |
| 311 | 317 | colnames(ALZDAT) = ALZDAT1[1,] | 
| 312 | 318 | |
| 313 | 319 | |
| ... | ... | @@ -350,9 +356,14 @@ THEFT <- function(){ | 
| 350 | 356 | z <- 1 | 
| 351 | 357 | naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) | 
| 352 | 358 | for(z in 1:dim(RAWWORD)[1]){ | 
| 353 | - naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
| 354 | - z <- z + 1 | |
| 355 | - } | |
| 359 | + if(sum(is.na(RAWWORD[z,])) > 0){ | |
| 360 | + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
| 361 | + } | |
| 362 | + if(length(grep("NA",RAWWORD[z,])) > 0){ | |
| 363 | + naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] | |
| 364 | + } | |
| 365 | + z <- z + 1 | |
| 366 | + } | |
| 356 | 367 | |
| 357 | 368 | colnames(naroww) <- "ROW_NAs" | 
| 358 | 369 | RAWWORD <- bind_cols(RAWWORD,naroww) | 
| ... | ... | @@ -389,9 +400,8 @@ THEFT <- function(){ | 
| 389 | 400 | ##Putting the ones without duplicates in their new homes | 
| 390 | 401 | if(tabRDATID[j] == 1){ | 
| 391 | 402 | NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] | 
| 392 | - } | |
| 393 | - ##Averaging duplicates and putting them in their new homes | |
| 394 | - else if(tabRDATID[j] > 1){ | |
| 403 | + } else if(tabRDATID[j] > 1){ | |
| 404 | + ##Averaging duplicates and putting them in their new homes | |
| 395 | 405 | NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) | 
| 396 | 406 | } | 
| 397 | 407 | j <- j + 1 | 
| ... | ... | @@ -458,10 +468,9 @@ THEFT <- function(){ | 
| 458 | 468 | write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) | 
| 459 | 469 | n <- n +1 | 
| 460 | 470 | } | 
| 461 | - } | |
| 462 | - | |
| 471 | + } else if(numDAT == 2){ | |
| 463 | 472 | #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN | 
| 464 | - else if(numDAT == 2){ | |
| 473 | + | |
| 465 | 474 | #All the files you want to analyze | 
| 466 | 475 | ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:") | 
| 467 | 476 | if(length(ANDIS) == 0){ | 
| ... | ... | @@ -523,8 +532,7 @@ THEFT <- function(){ | 
| 523 | 532 | geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% | 
| 524 | 533 | read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") | 
| 525 | 534 | |
| 526 | - } | |
| 527 | - else if(clfileex == 0){ | |
| 535 | + } else if(clfileex == 0){ | |
| 528 | 536 | ##Lets Create a clean version | 
| 529 | 537 | |
| 530 | 538 | ##Gene ID to Gene Name | 
| ... | ... | @@ -547,8 +555,7 @@ THEFT <- function(){ | 
| 547 | 555 | geneIDNam <- genena %>% | 
| 548 | 556 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% | 
| 549 | 557 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | 
| 550 | - } | |
| 551 | - else if(IDF == 0){ | |
| 558 | + } else if(IDF == 0){ | |
| 552 | 559 | #No information on this particular GPL file | 
| 553 | 560 | idLOCGPL <- genena %>% | 
| 554 | 561 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% | 
| ... | ... | @@ -561,8 +568,7 @@ THEFT <- function(){ | 
| 561 | 568 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% | 
| 562 | 569 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | 
| 563 | 570 | } | 
| 564 | - } | |
| 565 | - else if(fileex == 0){ | |
| 571 | + } else if(fileex == 0){ | |
| 566 | 572 | #We must create a file that we can access for later use | 
| 567 | 573 | idLOCGPL <- genena %>% | 
| 568 | 574 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% | 
| ... | ... | @@ -576,8 +582,7 @@ THEFT <- function(){ | 
| 576 | 582 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% | 
| 577 | 583 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | 
| 578 | 584 | } | 
| 579 | - } | |
| 580 | - else if(soft == FALSE){ | |
| 585 | + } else if(soft == FALSE){ | |
| 581 | 586 | geneIDNam <- genena %>% | 
| 582 | 587 | read_delim(delim="\t",comment = "#")%>% | 
| 583 | 588 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | 
| ... | ... | @@ -596,7 +601,7 @@ THEFT <- function(){ | 
| 596 | 601 | |
| 597 | 602 | |
| 598 | 603 | ##Changing the gene ID to gene name | 
| 599 | - ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) | |
| 604 | + ALZDAT1 <- cgeneID(geneIDNam,alzdat) | |
| 600 | 605 | colnames(ALZDAT) = ALZDAT1[1,] | 
| 601 | 606 | |
| 602 | 607 | |
| ... | ... | @@ -639,9 +644,14 @@ THEFT <- function(){ | 
| 639 | 644 | z <- 1 | 
| 640 | 645 | naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) | 
| 641 | 646 | for(z in 1:dim(RAWWORD)[1]){ | 
| 642 | - naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
| 643 | - z <- z + 1 | |
| 644 | - } | |
| 647 | + if(sum(is.na(RAWWORD[z,])) > 0){ | |
| 648 | + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
| 649 | + } | |
| 650 | + if(length(grep("NA",RAWWORD[z,])) > 0){ | |
| 651 | + naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] | |
| 652 | + } | |
| 653 | + z <- z + 1 | |
| 654 | + } | |
| 645 | 655 | |
| 646 | 656 | colnames(naroww) <- "ROW_NAs" | 
| 647 | 657 | RAWWORD <- bind_cols(RAWWORD,naroww) | 
| ... | ... | @@ -678,9 +688,8 @@ THEFT <- function(){ | 
| 678 | 688 | ##Putting the ones without duplicates in their new homes | 
| 679 | 689 | if(tabRDATID[j] == 1){ | 
| 680 | 690 | NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] | 
| 681 | - } | |
| 691 | + } else if(tabRDATID[j] > 1){ | |
| 682 | 692 | ##Averaging duplicates and putting them in their new homes | 
| 683 | - else if(tabRDATID[j] > 1){ | |
| 684 | 693 | NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) | 
| 685 | 694 | } | 
| 686 | 695 | j <- j + 1 | 
RCleanDscret.R
| 1 | 1 | ##Posted 6/15/2017 | 
| 2 | - | |
| 2 | +options(digits = 11) | |
| 3 | 3 | |
| 4 | 4 | #Libraries required to run the code | 
| 5 | 5 | library(pryr) | 
| ... | ... | @@ -24,11 +24,9 @@ chngrownm <- function(mat){ | 
| 24 | 24 | for(j in 1:col){ | 
| 25 | 25 | if("!Sample_source_name_ch1"==mat[1,j]){ | 
| 26 | 26 | colnames(mat)[j] <- "Brain_Region" | 
| 27 | - } | |
| 28 | - if("!Sample_title" == mat[1,j]){ | |
| 27 | + } else if("!Sample_title" == mat[1,j]){ | |
| 29 | 28 | colnames(mat)[j] <- "Title" | 
| 30 | - } | |
| 31 | - if("!Sample_geo_accession" == mat[1,j]){ | |
| 29 | + } else if("!Sample_geo_accession" == mat[1,j]){ | |
| 32 | 30 | colnames(mat)[j] <- "ID_REF" | 
| 33 | 31 | } else{ | 
| 34 | 32 | if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){ | 
| ... | ... | @@ -102,23 +100,41 @@ NAFIXING <- function(GIDNAM){ | 
| 102 | 100 | |
| 103 | 101 | #4#Function for changing the gene ID to gene name | 
| 104 | 102 | cgeneID <- function(GeneName,DATA){ | 
| 105 | - colGene <- dim(GeneName)[2] | |
| 106 | - j <- 1 | |
| 107 | - for(j in 1:colGene){ | |
| 108 | - chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) | |
| 109 | - if(is.na(sum(chngsreq))==FALSE){ | |
| 110 | - if(sum(chngsreq) > 0){ | |
| 111 | - DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
| 103 | + nj <- t(GeneName) | |
| 104 | + nq <- t(DATA) | |
| 105 | + colGene <- dim(nj)[2] | |
| 106 | + colDATA <- dim(nq)[2] | |
| 107 | + j <- 1 | |
| 108 | + for(j in 1:colDATA){ | |
| 109 | + #where is that gene id located within the GPL file | |
| 110 | + chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,]) | |
| 111 | + if(is.na(sum(chngreq))==FALSE){ | |
| 112 | + if(sum(chngreq) > 0){ | |
| 113 | + nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j]) | |
| 114 | + } | |
| 112 | 115 | } | 
| 116 | + j <- j + 1 | |
| 113 | 117 | } | 
| 114 | - #if(sum(chngsreq) > 0){ | |
| 115 | - ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq]) | |
| 116 | - #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
| 117 | - #} | |
| 118 | - j = j+1 | |
| 119 | - } | |
| 120 | - DATA | |
| 118 | + nq | |
| 121 | 119 | } | 
| 120 | +#cgeneID <- function(GeneName,DATA){ | |
| 121 | +# colGene <- dim(GeneName)[2] | |
| 122 | +# j <- 1 | |
| 123 | +# for(j in 1:colGene){ | |
| 124 | +# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) | |
| 125 | +# if(is.na(sum(chngsreq))==FALSE){ | |
| 126 | +# if(sum(chngsreq) > 0){ | |
| 127 | +# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
| 128 | +# } | |
| 129 | +# } | |
| 130 | +# #if(sum(chngsreq) > 0){ | |
| 131 | +# ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq]) | |
| 132 | +# #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
| 133 | +# #} | |
| 134 | +# j = j+1 | |
| 135 | +# } | |
| 136 | +# DATA | |
| 137 | +#} | |
| 122 | 138 | |
| 123 | 139 | #5#Function for adjusting the gene names | 
| 124 | 140 | gcnames <- function(DiData,usecol=1){ | 
| ... | ... | @@ -151,11 +167,9 @@ dndat <- function(NDATA){ | 
| 151 | 167 | |
| 152 | 168 | if(NDATA[i,j] < -1){ | 
| 153 | 169 | DDATA[i,j]=0L | 
| 154 | - } | |
| 155 | - if(NDATA[i,j] > 1){ | |
| 170 | + } else if(NDATA[i,j] > 1){ | |
| 156 | 171 | DDATA[i,j]=2L | 
| 157 | - } | |
| 158 | - if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){ | |
| 172 | + } else if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){ | |
| 159 | 173 | DDATA[i,j]=1L | 
| 160 | 174 | } | 
| 161 | 175 | } else{ | 
| ... | ... | @@ -222,8 +236,7 @@ if(clfileex >= 1){ | 
| 222 | 236 | geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% | 
| 223 | 237 | read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") | 
| 224 | 238 | |
| 225 | -} | |
| 226 | -if(clfileex == 0){ | |
| 239 | +} else if(clfileex == 0){ | |
| 227 | 240 | ##Let's create a clean version | 
| 228 | 241 | |
| 229 | 242 | ##Gene ID to Gene Name | 
| ... | ... | @@ -246,8 +259,7 @@ if(clfileex == 0){ | 
| 246 | 259 | geneIDNam <- genena %>% | 
| 247 | 260 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% | 
| 248 | 261 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | 
| 249 | - } | |
| 250 | - if(IDF == 0){ | |
| 262 | + } else if(IDF == 0){ | |
| 251 | 263 | #No information on this particular GPL file | 
| 252 | 264 | idLOCGPL <- genena %>% | 
| 253 | 265 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% | 
| ... | ... | @@ -260,8 +272,7 @@ if(clfileex == 0){ | 
| 260 | 272 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% | 
| 261 | 273 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | 
| 262 | 274 | } | 
| 263 | - } | |
| 264 | - if(fileex == 0){ | |
| 275 | + } else if(fileex == 0){ | |
| 265 | 276 | #We must create a file that we can access for later use | 
| 266 | 277 | idLOCGPL <- genena %>% | 
| 267 | 278 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% | 
| ... | ... | @@ -275,8 +286,7 @@ if(clfileex == 0){ | 
| 275 | 286 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% | 
| 276 | 287 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | 
| 277 | 288 | } | 
| 278 | - } | |
| 279 | - if(soft == FALSE){ | |
| 289 | + } else if(soft == FALSE){ | |
| 280 | 290 | geneIDNam <- genena %>% | 
| 281 | 291 | read_delim(delim="\t",comment = "#")%>% | 
| 282 | 292 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | 
| ... | ... | @@ -295,7 +305,7 @@ if(clfileex == 0){ | 
| 295 | 305 | |
| 296 | 306 | |
| 297 | 307 | ##Changing the gene ID to gene name | 
| 298 | -ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) | |
| 308 | +ALZDAT1 <- cgeneID(geneIDNam,alzdat) | |
| 299 | 309 | colnames(ALZDAT) = ALZDAT1[1,] | 
| 300 | 310 | |
| 301 | 311 | |
| ... | ... | @@ -338,8 +348,13 @@ RAWWORD <- rawword[-hereim,] %>% | 
| 338 | 348 | z <- 1 | 
| 339 | 349 | naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) | 
| 340 | 350 | for(z in 1:dim(RAWWORD)[1]){ | 
| 341 | - naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
| 342 | - z <- z + 1 | |
| 351 | + if(sum(is.na(RAWWORD[z,])) > 0){ | |
| 352 | + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
| 353 | + } | |
| 354 | + if(length(grep("NA",RAWWORD[z,])) > 0){ | |
| 355 | + naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] | |
| 356 | + } | |
| 357 | + z <- z + 1 | |
| 343 | 358 | } | 
| 344 | 359 | |
| 345 | 360 | colnames(naroww) <- "ROW_NAs" | 
| ... | ... | @@ -378,9 +393,8 @@ for(j in 1:length(tabRDATID)){ | 
| 378 | 393 | ##Putting the ones without duplicates in their new homes | 
| 379 | 394 | if(tabRDATID[j] == 1){ | 
| 380 | 395 | NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] | 
| 381 | - } | |
| 396 | + } else if(tabRDATID[j] > 1){ | |
| 382 | 397 | ##Averaging duplicates and putting them in their new homes | 
| 383 | - if(tabRDATID[j] > 1){ | |
| 384 | 398 | NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) | 
| 385 | 399 | } | 
| 386 | 400 | j <- j + 1 |