Commit 805474e1e6ec9b11cd677022efe720af7c3dda36
1 parent: f31e87a636
Exists in: master
Most recent update fixed a few handling errors
Showing 1 changed file with 49 additions and 35 deletions
 
Show diff stats
RCleanDscret.R
| ... | ... | @@ -24,11 +24,9 @@ chngrownm <- function(mat){ | 
| 24 | 24 | for(j in 1:col){ | 
| 25 | 25 | if("!Sample_source_name_ch1"==mat[1,j]){ | 
| 26 | 26 | colnames(mat)[j] <- "Brain_Region" | 
| 27 | - } | |
| 28 | - if("!Sample_title" == mat[1,j]){ | |
| 27 | + } else if("!Sample_title" == mat[1,j]){ | |
| 29 | 28 | colnames(mat)[j] <- "Title" | 
| 30 | - } | |
| 31 | - if("!Sample_geo_accession" == mat[1,j]){ | |
| 29 | + } else if("!Sample_geo_accession" == mat[1,j]){ | |
| 32 | 30 | colnames(mat)[j] <- "ID_REF" | 
| 33 | 31 | } else{ | 
| 34 | 32 | if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){ | 
| ... | ... | @@ -102,23 +100,41 @@ NAFIXING <- function(GIDNAM){ | 
| 102 | 100 | |
| 103 | 101 | #4#Function for changing the gene ID to gene name | 
| 104 | 102 | cgeneID <- function(GeneName,DATA){ | 
| 105 | - colGene <- dim(GeneName)[2] | |
| 106 | - j <- 1 | |
| 107 | - for(j in 1:colGene){ | |
| 108 | - chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) | |
| 109 | - if(is.na(sum(chngsreq))==FALSE){ | |
| 110 | - if(sum(chngsreq) > 0){ | |
| 111 | - DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
| 103 | + nj <- t(GeneName) | |
| 104 | + nq <- t(DATA) | |
| 105 | + colGene <- dim(nj)[2] | |
| 106 | + colDATA <- dim(nq)[2] | |
| 107 | + j <- 1 | |
| 108 | + for(j in 1:colDATA){ | |
| 109 | + #where is that gene id located within the GPL file | |
| 110 | + chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,]) | |
| 111 | + if(is.na(sum(chngreq))==FALSE){ | |
| 112 | + if(sum(chngreq) > 0){ | |
| 113 | + nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j]) | |
| 114 | + } | |
| 112 | 115 | } | 
| 116 | + j <- j + 1 | |
| 113 | 117 | } | 
| 114 | - #if(sum(chngsreq) > 0){ | |
| 115 | - ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq]) | |
| 116 | - #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
| 117 | - #} | |
| 118 | - j = j+1 | |
| 119 | - } | |
| 120 | - DATA | |
| 118 | + nq | |
| 121 | 119 | } | 
| 120 | +#cgeneID <- function(GeneName,DATA){ | |
| 121 | +# colGene <- dim(GeneName)[2] | |
| 122 | +# j <- 1 | |
| 123 | +# for(j in 1:colGene){ | |
| 124 | +# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) | |
| 125 | +# if(is.na(sum(chngsreq))==FALSE){ | |
| 126 | +# if(sum(chngsreq) > 0){ | |
| 127 | +# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
| 128 | +# } | |
| 129 | +# } | |
| 130 | +# #if(sum(chngsreq) > 0){ | |
| 131 | +# ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq]) | |
| 132 | +# #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
| 133 | +# #} | |
| 134 | +# j = j+1 | |
| 135 | +# } | |
| 136 | +# DATA | |
| 137 | +#} | |
| 122 | 138 | |
| 123 | 139 | #5#Function for adjusting the gene names | 
| 124 | 140 | gcnames <- function(DiData,usecol=1){ | 
| ... | ... | @@ -151,11 +167,9 @@ dndat <- function(NDATA){ | 
| 151 | 167 | |
| 152 | 168 | if(NDATA[i,j] < -1){ | 
| 153 | 169 | DDATA[i,j]=0L | 
| 154 | - } | |
| 155 | - if(NDATA[i,j] > 1){ | |
| 170 | + } else if(NDATA[i,j] > 1){ | |
| 156 | 171 | DDATA[i,j]=2L | 
| 157 | - } | |
| 158 | - if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){ | |
| 172 | + } else if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){ | |
| 159 | 173 | DDATA[i,j]=1L | 
| 160 | 174 | } | 
| 161 | 175 | } else{ | 
| ... | ... | @@ -222,8 +236,7 @@ if(clfileex >= 1){ | 
| 222 | 236 | geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% | 
| 223 | 237 | read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") | 
| 224 | 238 | |
| 225 | -} | |
| 226 | -if(clfileex == 0){ | |
| 239 | +} else if(clfileex == 0){ | |
| 227 | 240 | ##Lets Create a clean version | 
| 228 | 241 | |
| 229 | 242 | ##Gene ID to Gene Name | 
| ... | ... | @@ -246,8 +259,7 @@ if(clfileex == 0){ | 
| 246 | 259 | geneIDNam <- genena %>% | 
| 247 | 260 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% | 
| 248 | 261 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | 
| 249 | - } | |
| 250 | - if(IDF == 0){ | |
| 262 | + } else if(IDF == 0){ | |
| 251 | 263 | #No information on this particular GPL file | 
| 252 | 264 | idLOCGPL <- genena %>% | 
| 253 | 265 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% | 
| ... | ... | @@ -260,8 +272,7 @@ if(clfileex == 0){ | 
| 260 | 272 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% | 
| 261 | 273 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | 
| 262 | 274 | } | 
| 263 | - } | |
| 264 | - if(fileex == 0){ | |
| 275 | + } else if(fileex == 0){ | |
| 265 | 276 | #We must create a file that we can access for later use | 
| 266 | 277 | idLOCGPL <- genena %>% | 
| 267 | 278 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% | 
| ... | ... | @@ -275,8 +286,7 @@ if(clfileex == 0){ | 
| 275 | 286 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% | 
| 276 | 287 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | 
| 277 | 288 | } | 
| 278 | - } | |
| 279 | - if(soft == FALSE){ | |
| 289 | + } else if(soft == FALSE){ | |
| 280 | 290 | geneIDNam <- genena %>% | 
| 281 | 291 | read_delim(delim="\t",comment = "#")%>% | 
| 282 | 292 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | 
| ... | ... | @@ -295,7 +305,7 @@ if(clfileex == 0){ | 
| 295 | 305 | |
| 296 | 306 | |
| 297 | 307 | ##Changing the gene ID to gene name | 
| 298 | -ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) | |
| 308 | +ALZDAT1 <- cgeneID(geneIDNam,alzdat) | |
| 299 | 309 | colnames(ALZDAT) = ALZDAT1[1,] | 
| 300 | 310 | |
| 301 | 311 | |
| ... | ... | @@ -338,8 +348,13 @@ RAWWORD <- rawword[-hereim,] %>% | 
| 338 | 348 | z <- 1 | 
| 339 | 349 | naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) | 
| 340 | 350 | for(z in 1:dim(RAWWORD)[1]){ | 
| 341 | - naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
| 342 | - z <- z + 1 | |
| 351 | + if(sum(is.na(RAWWORD[z,])) > 0){ | |
| 352 | + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
| 353 | + } | |
| 354 | + if(length(grep("NA",RAWWORD[z,])) > 0){ | |
| 355 | + naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] | |
| 356 | + } | |
| 357 | + z <- z + 1 | |
| 343 | 358 | } | 
| 344 | 359 | |
| 345 | 360 | colnames(naroww) <- "ROW_NAs" | 
| ... | ... | @@ -378,9 +393,8 @@ for(j in 1:length(tabRDATID)){ | 
| 378 | 393 | ##Putting the ones without duplicates in their new homes | 
| 379 | 394 | if(tabRDATID[j] == 1){ | 
| 380 | 395 | NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] | 
| 381 | - } | |
| 396 | + } else if(tabRDATID[j] > 1){ | |
| 382 | 397 | ##Averaging duplicates and putting them in their new homes | 
| 383 | - if(tabRDATID[j] > 1){ | |
| 384 | 398 | NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) | 
| 385 | 399 | } | 
| 386 | 400 | j <- j + 1 |