Commit e340baf086d0c414acf14dfc085805879c94e966
Exists in: master
Merge branch 'master' of smlg.fiu.edu:efraingonzalez0/cleaning-and-fixing-data-with-r
Showing 1 changed file
Show diff stats
RAutoClDs.R
| ... | ... | @@ -28,29 +28,29 @@ chngrownm <- function(mat){ |
| 28 | 28 | if("!Sample_source_name_ch1"==mat[1,e]){ |
| 29 | 29 | colnames(mat)[e] <- "Brain_Region" |
| 30 | 30 | } |
| 31 | - if("!Sample_title" == mat[1,e]){ | |
| 31 | + else if("!Sample_title" == mat[1,e]){ | |
| 32 | 32 | colnames(mat)[e] <- "Title" |
| 33 | 33 | } |
| 34 | - if("!Sample_geo_accession" == mat[1,e]){ | |
| 34 | + else if("!Sample_geo_accession" == mat[1,e]){ | |
| 35 | 35 | colnames(mat)[e] <- "ID_REF" |
| 36 | 36 | } else{ |
| 37 | 37 | if(grepl("Sex|gender|Gender|sex",mat[2,e])==TRUE){ |
| 38 | 38 | colnames(mat)[e] <- paste0("Sex",r) |
| 39 | 39 | r = r + 1 |
| 40 | 40 | } |
| 41 | - if(grepl("postmorteminterval|PMI|pmi",mat[2,e])==TRUE){ | |
| 41 | + else if(grepl("postmorteminterval|PMI|pmi",mat[2,e])==TRUE){ | |
| 42 | 42 | colnames(mat)[e] <- paste0("PMI",a) |
| 43 | 43 | a = a + 1 |
| 44 | 44 | } |
| 45 | - if(grepl("age|Age|AGE",mat[2,e])==TRUE){ | |
| 45 | + else if(grepl("age|Age|AGE",mat[2,e])==TRUE){ | |
| 46 | 46 | colnames(mat)[e] <- paste0("Age",h) |
| 47 | 47 | h = h + 1 |
| 48 | 48 | } |
| 49 | - if(grepl("braak|b&b",mat[2,e])==TRUE){ | |
| 49 | + else if(grepl("braak|b&b",mat[2,e])==TRUE){ | |
| 50 | 50 | colnames(mat)[e] <- paste0("Braak",g) |
| 51 | 51 | g = g + 1 |
| 52 | 52 | } |
| 53 | - if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){ | |
| 53 | + else if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){ | |
| 54 | 54 | colnames(mat)[e] <- paste0("Group",o) |
| 55 | 55 | o = o + 1 |
| 56 | 56 | } |
| ... | ... | @@ -69,18 +69,18 @@ cinfo <- function(mat){ |
| 69 | 69 | if(grepl("Group",colnames(mat)[j]) == TRUE){ |
| 70 | 70 | mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j]) |
| 71 | 71 | } |
| 72 | - if(grepl("Age",colnames(mat)[j])==TRUE){ | |
| 72 | + else if(grepl("Age",colnames(mat)[j])==TRUE){ | |
| 73 | 73 | mat[,j] <- gsub("\\D","",mat[,j])%>% |
| 74 | 74 | as.integer() |
| 75 | 75 | } |
| 76 | - if(grepl("Sex",colnames(mat)[j])==TRUE){ | |
| 76 | + else if(grepl("Sex",colnames(mat)[j])==TRUE){ | |
| 77 | 77 | mat[,j] <- gsub(".+:\\s","",mat[,j]) |
| 78 | 78 | } |
| 79 | - if(grepl("PMI",colnames(mat)[j])==TRUE){ | |
| 79 | + else if(grepl("PMI",colnames(mat)[j])==TRUE){ | |
| 80 | 80 | mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>% |
| 81 | 81 | as.numeric() |
| 82 | 82 | } |
| 83 | - if(grepl("Braak",colnames(mat)[j])==TRUE){ | |
| 83 | + else if(grepl("Braak",colnames(mat)[j])==TRUE){ | |
| 84 | 84 | mat[,j]<-gsub(".+:\\s","",mat[,j])%>% |
| 85 | 85 | as.roman()%>% |
| 86 | 86 | as.integer() |
| ... | ... | @@ -235,7 +235,7 @@ THEFT <- function(){ |
| 235 | 235 | read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") |
| 236 | 236 | |
| 237 | 237 | } |
| 238 | - if(clfileex == 0){ | |
| 238 | + else if(clfileex == 0){ | |
| 239 | 239 | ##Lets Create a clean version |
| 240 | 240 | |
| 241 | 241 | ##Gene ID to Gene Name |
| ... | ... | @@ -259,7 +259,7 @@ THEFT <- function(){ |
| 259 | 259 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% |
| 260 | 260 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
| 261 | 261 | } |
| 262 | - if(IDF == 0){ | |
| 262 | + else if(IDF == 0){ | |
| 263 | 263 | #No information on this particular GPL file |
| 264 | 264 | idLOCGPL <- genena %>% |
| 265 | 265 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% |
| ... | ... | @@ -273,7 +273,7 @@ THEFT <- function(){ |
| 273 | 273 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
| 274 | 274 | } |
| 275 | 275 | } |
| 276 | - if(fileex == 0){ | |
| 276 | + else if(fileex == 0){ | |
| 277 | 277 | #We must create a file that we can access for later use |
| 278 | 278 | idLOCGPL <- genena %>% |
| 279 | 279 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% |
| ... | ... | @@ -288,7 +288,7 @@ THEFT <- function(){ |
| 288 | 288 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
| 289 | 289 | } |
| 290 | 290 | } |
| 291 | - if(soft == FALSE){ | |
| 291 | + else if(soft == FALSE){ | |
| 292 | 292 | geneIDNam <- genena %>% |
| 293 | 293 | read_delim(delim="\t",comment = "#")%>% |
| 294 | 294 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
| ... | ... | @@ -391,7 +391,7 @@ THEFT <- function(){ |
| 391 | 391 | NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] |
| 392 | 392 | } |
| 393 | 393 | ##Averaging duplicates and putting them in their new homes |
| 394 | - if(tabRDATID[j] > 1){ | |
| 394 | + else if(tabRDATID[j] > 1){ | |
| 395 | 395 | NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) |
| 396 | 396 | } |
| 397 | 397 | j <- j + 1 |
| ... | ... | @@ -461,7 +461,7 @@ THEFT <- function(){ |
| 461 | 461 | } |
| 462 | 462 | |
| 463 | 463 | #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN |
| 464 | - if(numDAT == 2){ | |
| 464 | + else if(numDAT == 2){ | |
| 465 | 465 | #All the files you want to analyze |
| 466 | 466 | ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:") |
| 467 | 467 | if(length(ANDIS) == 0){ |
| ... | ... | @@ -524,7 +524,7 @@ THEFT <- function(){ |
| 524 | 524 | read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") |
| 525 | 525 | |
| 526 | 526 | } |
| 527 | - if(clfileex == 0){ | |
| 527 | + else if(clfileex == 0){ | |
| 528 | 528 | ##Lets Create a clean version |
| 529 | 529 | |
| 530 | 530 | ##Gene ID to Gene Name |
| ... | ... | @@ -548,7 +548,7 @@ THEFT <- function(){ |
| 548 | 548 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% |
| 549 | 549 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
| 550 | 550 | } |
| 551 | - if(IDF == 0){ | |
| 551 | + else if(IDF == 0){ | |
| 552 | 552 | #No information on this particular GPL file |
| 553 | 553 | idLOCGPL <- genena %>% |
| 554 | 554 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% |
| ... | ... | @@ -562,7 +562,7 @@ THEFT <- function(){ |
| 562 | 562 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
| 563 | 563 | } |
| 564 | 564 | } |
| 565 | - if(fileex == 0){ | |
| 565 | + else if(fileex == 0){ | |
| 566 | 566 | #We must create a file that we can access for later use |
| 567 | 567 | idLOCGPL <- genena %>% |
| 568 | 568 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% |
| ... | ... | @@ -577,7 +577,7 @@ THEFT <- function(){ |
| 577 | 577 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
| 578 | 578 | } |
| 579 | 579 | } |
| 580 | - if(soft == FALSE){ | |
| 580 | + else if(soft == FALSE){ | |
| 581 | 581 | geneIDNam <- genena %>% |
| 582 | 582 | read_delim(delim="\t",comment = "#")%>% |
| 583 | 583 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
| ... | ... | @@ -680,7 +680,7 @@ THEFT <- function(){ |
| 680 | 680 | NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] |
| 681 | 681 | } |
| 682 | 682 | ##Averaging duplicates and putting them in their new homes |
| 683 | - if(tabRDATID[j] > 1){ | |
| 683 | + else if(tabRDATID[j] > 1){ | |
| 684 | 684 | NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) |
| 685 | 685 | } |
| 686 | 686 | j <- j + 1 |