Commit 805474e1e6ec9b11cd677022efe720af7c3dda36

Authored by Efrain Gonzalez
1 parent f31e87a636
Exists in master

Most recent update fixed a few handling errors

Showing 1 changed file with 49 additions and 35 deletions   Show diff stats
... ... @@ -24,11 +24,9 @@ chngrownm <- function(mat){
24 24 for(j in 1:col){
25 25 if("!Sample_source_name_ch1"==mat[1,j]){
26 26 colnames(mat)[j] <- "Brain_Region"
27   - }
28   - if("!Sample_title" == mat[1,j]){
  27 + } else if("!Sample_title" == mat[1,j]){
29 28 colnames(mat)[j] <- "Title"
30   - }
31   - if("!Sample_geo_accession" == mat[1,j]){
  29 + } else if("!Sample_geo_accession" == mat[1,j]){
32 30 colnames(mat)[j] <- "ID_REF"
33 31 } else{
34 32 if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){
... ... @@ -102,23 +100,41 @@ NAFIXING <- function(GIDNAM){
102 100  
103 101 #4#Function for changing the gene ID to gene name
104 102 cgeneID <- function(GeneName,DATA){
105   - colGene <- dim(GeneName)[2]
106   - j <- 1
107   - for(j in 1:colGene){
108   - chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
109   - if(is.na(sum(chngsreq))==FALSE){
110   - if(sum(chngsreq) > 0){
111   - DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
  103 + nj <- t(GeneName)
  104 + nq <- t(DATA)
  105 + colGene <- dim(nj)[2]
  106 + colDATA <- dim(nq)[2]
  107 + j <- 1
  108 + for(j in 1:colDATA){
  109 + #where is that gene id located within the GPL file
  110 + chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,])
  111 + if(is.na(sum(chngreq))==FALSE){
  112 + if(sum(chngreq) > 0){
  113 + nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j])
  114 + }
112 115 }
  116 + j <- j + 1
113 117 }
114   - #if(sum(chngsreq) > 0){
115   - ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
116   - #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
117   - #}
118   - j = j+1
119   - }
120   - DATA
  118 + nq
121 119 }
  120 +#cgeneID <- function(GeneName,DATA){
  121 +# colGene <- dim(GeneName)[2]
  122 +# j <- 1
  123 +# for(j in 1:colGene){
  124 +# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
  125 +# if(is.na(sum(chngsreq))==FALSE){
  126 +# if(sum(chngsreq) > 0){
  127 +# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
  128 +# }
  129 +# }
  130 +# #if(sum(chngsreq) > 0){
  131 +# ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
  132 +# #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
  133 +# #}
  134 +# j = j+1
  135 +# }
  136 +# DATA
  137 +#}
122 138  
123 139 #5#Function for adjusting the gene names
124 140 gcnames <- function(DiData,usecol=1){
... ... @@ -151,11 +167,9 @@ dndat <- function(NDATA){
151 167  
152 168 if(NDATA[i,j] < -1){
153 169 DDATA[i,j]=0L
154   - }
155   - if(NDATA[i,j] > 1){
  170 + } else if(NDATA[i,j] > 1){
156 171 DDATA[i,j]=2L
157   - }
158   - if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
  172 + } else if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
159 173 DDATA[i,j]=1L
160 174 }
161 175 } else{
... ... @@ -222,8 +236,7 @@ if(clfileex >= 1){
222 236 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
223 237 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
224 238  
225   -}
226   -if(clfileex == 0){
  239 +} else if(clfileex == 0){
227 240 ##Lets Create a clean version
228 241  
229 242 ##Gene ID to Gene Name
... ... @@ -246,8 +259,7 @@ if(clfileex == 0){
246 259 geneIDNam <- genena %>%
247 260 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
248 261 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
249   - }
250   - if(IDF == 0){
  262 + } else if(IDF == 0){
251 263 #No information on this particular GPL file
252 264 idLOCGPL <- genena %>%
253 265 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
... ... @@ -260,8 +272,7 @@ if(clfileex == 0){
260 272 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
261 273 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
262 274 }
263   - }
264   - if(fileex == 0){
  275 + } else if(fileex == 0){
265 276 #We must create a file that we can access for later use
266 277 idLOCGPL <- genena %>%
267 278 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
... ... @@ -275,8 +286,7 @@ if(clfileex == 0){
275 286 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
276 287 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
277 288 }
278   - }
279   - if(soft == FALSE){
  289 + } else if(soft == FALSE){
280 290 geneIDNam <- genena %>%
281 291 read_delim(delim="\t",comment = "#")%>%
282 292 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
... ... @@ -295,7 +305,7 @@ if(clfileex == 0){
295 305  
296 306  
297 307 ##Changing the gene ID to gene name
298   -ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
  308 +ALZDAT1 <- cgeneID(geneIDNam,alzdat)
299 309 colnames(ALZDAT) = ALZDAT1[1,]
300 310  
301 311  
... ... @@ -338,8 +348,13 @@ RAWWORD <- rawword[-hereim,] %>%
338 348 z <- 1
339 349 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
340 350 for(z in 1:dim(RAWWORD)[1]){
341   - naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
342   - z <- z + 1
  351 + if(sum(is.na(RAWWORD[z,])) > 0){
  352 + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
  353 + }
  354 + if(length(grep("NA",RAWWORD[z,])) > 0){
  355 + naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
  356 + }
  357 + z <- z + 1
343 358 }
344 359  
345 360 colnames(naroww) <- "ROW_NAs"
... ... @@ -378,9 +393,8 @@ for(j in 1:length(tabRDATID)){
378 393 ##Putting the ones without duplicates in their new homes
379 394 if(tabRDATID[j] == 1){
380 395 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
381   - }
  396 + } else if(tabRDATID[j] > 1){
382 397 ##Averaging duplicates and putting them in their new homes
383   - if(tabRDATID[j] > 1){
384 398 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
385 399 }
386 400 j <- j + 1