diff --git a/RCleanDscret.R b/RCleanDscret.R index 3efab06..6c37b45 100644 --- a/RCleanDscret.R +++ b/RCleanDscret.R @@ -24,11 +24,9 @@ chngrownm <- function(mat){ for(j in 1:col){ if("!Sample_source_name_ch1"==mat[1,j]){ colnames(mat)[j] <- "Brain_Region" - } - if("!Sample_title" == mat[1,j]){ + } else if("!Sample_title" == mat[1,j]){ colnames(mat)[j] <- "Title" - } - if("!Sample_geo_accession" == mat[1,j]){ + } else if("!Sample_geo_accession" == mat[1,j]){ colnames(mat)[j] <- "ID_REF" } else{ if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){ @@ -102,23 +100,41 @@ NAFIXING <- function(GIDNAM){ #4#Function for changing the gene ID to gene name cgeneID <- function(GeneName,DATA){ - colGene <- dim(GeneName)[2] - j <- 1 - for(j in 1:colGene){ - chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) - if(is.na(sum(chngsreq))==FALSE){ - if(sum(chngsreq) > 0){ - DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) + nj <- t(GeneName) + nq <- t(DATA) + colGene <- dim(nj)[2] + colDATA <- dim(nq)[2] + j <- 1 + for(j in 1:colDATA){ + #where is that gene id located within the GPL file + chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,]) + if(is.na(sum(chngreq))==FALSE){ + if(sum(chngreq) > 0){ + nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j]) + } } + j <- j + 1 } - #if(sum(chngsreq) > 0){ - ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq]) - #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) - #} - j = j+1 - } - DATA + nq } +#cgeneID <- function(GeneName,DATA){ +# colGene <- dim(GeneName)[2] +# j <- 1 +# for(j in 1:colGene){ +# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) +# if(is.na(sum(chngsreq))==FALSE){ +# if(sum(chngsreq) > 0){ +# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) +# } +# } +# #if(sum(chngsreq) > 0){ +# ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq]) +# #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) +# #} +# j = j+1 +# } +# DATA +#} #5#Function for adjusting the gene names gcnames <- function(DiData,usecol=1){ @@ -151,11 +167,9 @@ dndat <- function(NDATA){ if(NDATA[i,j] < -1){ DDATA[i,j]=0L - } - if(NDATA[i,j] > 1){ + } else if(NDATA[i,j] > 1){ DDATA[i,j]=2L - } - if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){ + } else if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){ DDATA[i,j]=1L } } else{ @@ -222,8 +236,7 @@ if(clfileex >= 1){ geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") -} -if(clfileex == 0){ +} else if(clfileex == 0){ ##Lets Create a clean version ##Gene ID to Gene Name @@ -246,8 +259,7 @@ if(clfileex == 0){ geneIDNam <- genena %>% read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) - } - if(IDF == 0){ + } else if(IDF == 0){ #No information on this particular GPL file idLOCGPL <- genena %>% read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% @@ -260,8 +272,7 @@ if(clfileex == 0){ read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) } - } - if(fileex == 0){ + } else if(fileex == 0){ #We must create a file that we can access for later use idLOCGPL <- genena %>% read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% @@ -275,8 +286,7 @@ if(clfileex == 0){ read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) } - } - if(soft == FALSE){ + } else if(soft == FALSE){ geneIDNam <- genena %>% read_delim(delim="\t",comment = "#")%>% dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) @@ -295,7 +305,7 @@ if(clfileex == 0){ ##Changing the gene ID to gene name -ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) +ALZDAT1 <- cgeneID(geneIDNam,alzdat) colnames(ALZDAT) = ALZDAT1[1,] @@ -338,8 +348,13 @@ RAWWORD <- rawword[-hereim,] %>% z <- 1 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) for(z in 1:dim(RAWWORD)[1]){ - naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) - z <- z + 1 + if(sum(is.na(RAWWORD[z,])) > 0){ + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) + } + if(length(grep("NA",RAWWORD[z,])) > 0){ + naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] + } + z <- z + 1 } colnames(naroww) <- "ROW_NAs" @@ -378,9 +393,8 @@ for(j in 1:length(tabRDATID)){ ##Putting the ones without duplicates in their new homes if(tabRDATID[j] == 1){ NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] - } + } else if(tabRDATID[j] > 1){ ##Averaging duplicates and putting them in their new homes - if(tabRDATID[j] > 1){ NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) } j <- j + 1