Commit 805474e1e6ec9b11cd677022efe720af7c3dda36
1 parent: f31e87a636
Exists in: master
Most recent update fixed a few handling errors
Showing 1 changed file with 49 additions and 35 deletions
Show diff stats
RCleanDscret.R
... | ... | @@ -24,11 +24,9 @@ chngrownm <- function(mat){ |
24 | 24 | for(j in 1:col){ |
25 | 25 | if("!Sample_source_name_ch1"==mat[1,j]){ |
26 | 26 | colnames(mat)[j] <- "Brain_Region" |
27 | - } | |
28 | - if("!Sample_title" == mat[1,j]){ | |
27 | + } else if("!Sample_title" == mat[1,j]){ | |
29 | 28 | colnames(mat)[j] <- "Title" |
30 | - } | |
31 | - if("!Sample_geo_accession" == mat[1,j]){ | |
29 | + } else if("!Sample_geo_accession" == mat[1,j]){ | |
32 | 30 | colnames(mat)[j] <- "ID_REF" |
33 | 31 | } else{ |
34 | 32 | if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){ |
... | ... | @@ -102,23 +100,41 @@ NAFIXING <- function(GIDNAM){ |
102 | 100 | |
103 | 101 | #4#Function for changing the gene ID to gene name |
104 | 102 | cgeneID <- function(GeneName,DATA){ |
105 | - colGene <- dim(GeneName)[2] | |
106 | - j <- 1 | |
107 | - for(j in 1:colGene){ | |
108 | - chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) | |
109 | - if(is.na(sum(chngsreq))==FALSE){ | |
110 | - if(sum(chngsreq) > 0){ | |
111 | - DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
103 | + nj <- t(GeneName) | |
104 | + nq <- t(DATA) | |
105 | + colGene <- dim(nj)[2] | |
106 | + colDATA <- dim(nq)[2] | |
107 | + j <- 1 | |
108 | + for(j in 1:colDATA){ | |
109 | + #where is that gene id located within the GPL file | |
110 | + chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,]) | |
111 | + if(is.na(sum(chngreq))==FALSE){ | |
112 | + if(sum(chngreq) > 0){ | |
113 | + nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j]) | |
114 | + } | |
112 | 115 | } |
116 | + j <- j + 1 | |
113 | 117 | } |
114 | - #if(sum(chngsreq) > 0){ | |
115 | - ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq]) | |
116 | - #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
117 | - #} | |
118 | - j = j+1 | |
119 | - } | |
120 | - DATA | |
118 | + nq | |
121 | 119 | } |
120 | +#cgeneID <- function(GeneName,DATA){ | |
121 | +# colGene <- dim(GeneName)[2] | |
122 | +# j <- 1 | |
123 | +# for(j in 1:colGene){ | |
124 | +# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) | |
125 | +# if(is.na(sum(chngsreq))==FALSE){ | |
126 | +# if(sum(chngsreq) > 0){ | |
127 | +# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
128 | +# } | |
129 | +# } | |
130 | +# #if(sum(chngsreq) > 0){ | |
131 | +# ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq]) | |
132 | +# #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
133 | +# #} | |
134 | +# j = j+1 | |
135 | +# } | |
136 | +# DATA | |
137 | +#} | |
122 | 138 | |
123 | 139 | #5#Function for adjusting the gene names |
124 | 140 | gcnames <- function(DiData,usecol=1){ |
... | ... | @@ -151,11 +167,9 @@ dndat <- function(NDATA){ |
151 | 167 | |
152 | 168 | if(NDATA[i,j] < -1){ |
153 | 169 | DDATA[i,j]=0L |
154 | - } | |
155 | - if(NDATA[i,j] > 1){ | |
170 | + } else if(NDATA[i,j] > 1){ | |
156 | 171 | DDATA[i,j]=2L |
157 | - } | |
158 | - if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){ | |
172 | + } else if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){ | |
159 | 173 | DDATA[i,j]=1L |
160 | 174 | } |
161 | 175 | } else{ |
... | ... | @@ -222,8 +236,7 @@ if(clfileex >= 1){ |
222 | 236 | geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% |
223 | 237 | read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") |
224 | 238 | |
225 | -} | |
226 | -if(clfileex == 0){ | |
239 | +} else if(clfileex == 0){ | |
227 | 240 | ##Lets Create a clean version |
228 | 241 | |
229 | 242 | ##Gene ID to Gene Name |
... | ... | @@ -246,8 +259,7 @@ if(clfileex == 0){ |
246 | 259 | geneIDNam <- genena %>% |
247 | 260 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% |
248 | 261 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
249 | - } | |
250 | - if(IDF == 0){ | |
262 | + } else if(IDF == 0){ | |
251 | 263 | #No information on this particular GPL file |
252 | 264 | idLOCGPL <- genena %>% |
253 | 265 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% |
... | ... | @@ -260,8 +272,7 @@ if(clfileex == 0){ |
260 | 272 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% |
261 | 273 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
262 | 274 | } |
263 | - } | |
264 | - if(fileex == 0){ | |
275 | + } else if(fileex == 0){ | |
265 | 276 | #We must create a file that we can access for later use |
266 | 277 | idLOCGPL <- genena %>% |
267 | 278 | read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% |
... | ... | @@ -275,8 +286,7 @@ if(clfileex == 0){ |
275 | 286 | read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% |
276 | 287 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
277 | 288 | } |
278 | - } | |
279 | - if(soft == FALSE){ | |
289 | + } else if(soft == FALSE){ | |
280 | 290 | geneIDNam <- genena %>% |
281 | 291 | read_delim(delim="\t",comment = "#")%>% |
282 | 292 | dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) |
... | ... | @@ -295,7 +305,7 @@ if(clfileex == 0){ |
295 | 305 | |
296 | 306 | |
297 | 307 | ##Changing the gene ID to gene name |
298 | -ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) | |
308 | +ALZDAT1 <- cgeneID(geneIDNam,alzdat) | |
299 | 309 | colnames(ALZDAT) = ALZDAT1[1,] |
300 | 310 | |
301 | 311 | |
... | ... | @@ -338,8 +348,13 @@ RAWWORD <- rawword[-hereim,] %>% |
338 | 348 | z <- 1 |
339 | 349 | naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) |
340 | 350 | for(z in 1:dim(RAWWORD)[1]){ |
341 | - naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
342 | - z <- z + 1 | |
351 | + if(sum(is.na(RAWWORD[z,])) > 0){ | |
352 | + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
353 | + } | |
354 | + if(length(grep("NA",RAWWORD[z,])) > 0){ | |
355 | + naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] | |
356 | + } | |
357 | + z <- z + 1 | |
343 | 358 | } |
344 | 359 | |
345 | 360 | colnames(naroww) <- "ROW_NAs" |
... | ... | @@ -378,9 +393,8 @@ for(j in 1:length(tabRDATID)){ |
378 | 393 | ##Putting the ones without duplicates in their new homes |
379 | 394 | if(tabRDATID[j] == 1){ |
380 | 395 | NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] |
381 | - } | |
396 | + } else if(tabRDATID[j] > 1){ | |
382 | 397 | ##Averaging duplicates and putting them in their new homes |
383 | - if(tabRDATID[j] > 1){ | |
384 | 398 | NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) |
385 | 399 | } |
386 | 400 | j <- j + 1 |