Commit adfed316993d072e749a8cd85434fc667c054f22
1 parent
689231363c
Exists in
master
An automated version of the RCleanDscret.R
Working on outputting more insightful errors and warnings. (UNTESTED)
Showing
1 changed file
with
752 additions
and
0 deletions
Show diff stats
RAutoClDs.R
File was created | 1 | #Efrain H. Gonzalez | |
2 | #6/19/2017 | ||
3 | #Libraries required to run the code | ||
4 | library(pryr) | ||
5 | library(MASS) | ||
6 | library(dplyr) | ||
7 | library(tidyr) | ||
8 | library(readr) | ||
9 | library(stringr) | ||
10 | |||
11 | |||
12 | #Necessary Functions | ||
13 | #1#Function for handling the changing of row names and column names | ||
# Rename the columns of a transposed GEO "!Sample" metadata matrix.
#
# mat: character matrix with column names already set. Row 1 holds the GEO
#      field tags (e.g. "!Sample_title"); row 2 holds the first sample's
#      values, which are pattern-matched to recognise characteristic columns.
#
# Returns `mat` with recognised columns renamed:
#   * by tag   -> "Brain_Region", "Title", "ID_REF"
#   * by value -> "Sex<n>", "PMI<n>", "Age<n>", "Braak<n>", "Group<n>",
#                 where <n> counts the columns given that label so far.
# Columns that match nothing keep their existing name.
chngrownm <- function(mat) {
  # Columns identified by their GEO tag. These are final: the value in row 2
  # is NOT inspected, so a title such as "AD case 1" can no longer be
  # relabelled as a Group column.
  tag_labels <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )

  # Value patterns, highest priority first. The order reproduces which label
  # ended up on a column when several patterns matched (the later test used
  # to overwrite the earlier one), e.g. "braak stage" matches both "age" and
  # "braak" and is labelled Braak.
  value_patterns <- c(
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",
    Braak = "braak|b&b",
    Age = "age|Age|AGE",
    PMI = "postmorteminterval|PMI|pmi",
    Sex = "Sex|gender|Gender|sex"
  )
  used <- setNames(integer(length(value_patterns)), names(value_patterns))

  for (j in seq_len(ncol(mat))) {
    tag <- mat[1, j]
    if (!is.na(tag) && tag %in% names(tag_labels)) {
      colnames(mat)[j] <- tag_labels[[tag]]
      next
    }

    hits <- vapply(value_patterns, grepl, logical(1),
                   x = mat[2, j], USE.NAMES = FALSE)
    if (any(hits)) {
      # Only the winning label consumes an index, so numbering stays dense.
      label <- names(value_patterns)[which(hits)[1]]
      used[[label]] <- used[[label]] + 1L
      colnames(mat)[j] <- paste0(label, used[[label]])
    }
  }
  mat
}
59 | |||
60 | #2#Function for reorganizing information within the columns | ||
# Strip the "label: " prefixes from the clinical columns and convert the
# numeric ones, based on the column names assigned by chngrownm().
#
# mat: data frame (or matrix) of clinical values; the first column is never
#      rewritten.
#
# Returns `mat` with, for every column after the first whose name contains:
#   "Group" -> text after "label: " (and anything following " ...;..." dropped)
#   "Age"   -> integer built from the digits in the value
#   "Sex"   -> text after "label: "
#   "PMI"   -> numeric built from the digits and dots in the value
#   "Braak" -> integer from the roman (or arabic) numeral after "label: "
cinfo <- function(mat) {
  # seq_len(...)[-1] is empty for a one-column input; the previous `2:col`
  # counted backwards (2, 1) and so touched the first column in that case.
  for (j in seq_len(ncol(mat))[-1]) {
    column_name <- colnames(mat)[j]

    if (grepl("Group", column_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", column_name)) {
      # NOTE(review): every non-digit is removed, so a decimal age such as
      # "age: 80.5" becomes 805 - confirm ages are always whole numbers.
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", column_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", column_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", column_name)) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
88 | |||
89 | #3#Function for labeling the gene IDs without names | ||
# Give every probe without a gene name its own ID as the name.
#
# GIDNAM: data frame / tibble (or matrix) whose first column holds the probe
#         IDs and whose second column holds the gene names.
#
# Returns `GIDNAM` where each second-column entry that is NA, or is the text
# "NA" (optionally followed by whitespace), has been replaced by the ID from
# the same row. Factor columns are handled as character. Zero-row input is
# returned unchanged.
NAFIXING <- function(GIDNAM) {
  is_df <- is.data.frame(GIDNAM)
  ids <- if (is_df) GIDNAM[[1]] else GIDNAM[, 1]
  gene_names <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]

  # Work on character values so an ID can always be stored as a name.
  if (is.factor(ids)) {
    ids <- as.character(ids)
  }
  if (is.factor(gene_names)) {
    gene_names <- as.character(gene_names)
  }

  # One vectorised mask instead of a per-row loop.
  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)
  gene_names[unnamed] <- ids[unnamed]

  if (is_df) {
    GIDNAM[[2]] <- gene_names
  } else {
    GIDNAM[, 2] <- gene_names
  }
  GIDNAM
}
101 | |||
102 | #4#Function for changing the gene ID to gene name | ||
# Replace probe IDs in the first row of DATA with their gene names.
#
# GeneName: matrix with the probe IDs in row 1 and the matching gene names in
#           row 2 (one column per probe).
# DATA:     matrix whose first row holds the probe IDs to translate.
#
# Returns `DATA` with each first-row entry that equals an ID in GeneName[1, ]
# replaced by the corresponding GeneName[2, ] entry; everything else is
# unchanged. When an ID occurs more than once in GeneName the first
# occurrence is used.
cgeneID <- function(GeneName, DATA) {
  probe_ids <- DATA[1, ]

  # Exact lookup. The IDs are matched literally (not as regular expressions),
  # so an ID such as "A.1" cannot rename "AX1", and a name that happens to
  # equal another probe's ID is not translated a second time.
  position <- match(probe_ids, GeneName[1, ])
  translatable <- !is.na(probe_ids) & !is.na(position)

  DATA[1, translatable] <- GeneName[2, position[translatable]]
  DATA
}
117 | |||
118 | #5#Function for adjusting the gene names | ||
# Pick one gene name out of each "///"-separated column name.
#
# DiData: matrix or data frame whose column names are gene names, possibly
#         several joined by "///" (e.g. "APP /// APOE").
# usecol: which of the "///"-separated names to keep (default: the first).
#         Columns with fewer than `usecol` names fall back to their first.
#
# Returns a character vector, one whitespace-trimmed name per column
# (character(0) when there are no columns).
gcnames <- function(DiData, usecol = 1) {
  if (ncol(DiData) == 0) {
    return(character(0))
  }
  gene_labels <- colnames(DiData)
  if (is.null(gene_labels)) {
    stop("gcnames(): 'DiData' has no column names to adjust", call. = FALSE)
  }

  # Split every name once, then choose the requested piece per column.
  pieces <- strsplit(gene_labels, "///")
  chosen <- vapply(
    pieces,
    function(p) if (length(p) >= usecol) p[usecol] else p[1],
    character(1)
  )
  trimws(chosen)
}
134 | |||
135 | #6# Function for discretizing the data | ||
# Discretise scaled expression values into three levels.
#
# NDATA: numeric matrix (or data frame) of scaled values.
#
# Returns a numeric matrix of the same dimensions, carrying the column names
# of NDATA, where each value is coded as
#   0  when the value is below -1,
#   1  when the value lies in [-1, 1],
#   2  when the value is above 1,
# and missing values are passed through unchanged.
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)

  # Start from the middle level so the boundaries -1 and 1 are both coded 1
  # (a value of exactly 1 used to match no rule and was left at 0).
  DDATA <- matrix(1, nrow = nrow(values), ncol = ncol(values))
  colnames(DDATA) <- colnames(NDATA)

  # which() drops the NA comparisons, so missing cells are untouched here.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2

  missing <- is.na(values)
  DDATA[missing] <- values[missing]
  DDATA
}
165 | |||
166 | |||
167 | #MajorFunction#This is the function that does everything else | ||
# ---- Private helpers for THEFT() -------------------------------------------

# Last path component of `path`, split on "\", "|" or "/" (the separators the
# script has always split on).
.theft_last_component <- function(path) {
  parts <- strsplit(path, "[\\|/]")[[1]]
  parts[length(parts)]
}

# Keep the ID column plus every gene-name column (Symbol / ORF /
# gene_assignment) of a GPL annotation table; stop with a clear message when
# the table has neither.
.theft_pick_gene_columns <- function(gpl_table, genena) {
  id_col <- match("ID", colnames(gpl_table))
  name_cols <- grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$", colnames(gpl_table))
  if (is.na(id_col) || length(name_cols) == 0) {
    stop("The GPL file '", genena, "' has no 'ID' column and/or no ",
         "'Symbol', 'ORF' or 'gene_assignment' column", call. = FALSE)
  }
  gpl_table[, unique(c(id_col, name_cols)), drop = FALSE]
}

# Find the GPL annotation file in the working directory whose name starts
# with the platform id announced by the series matrix file `alz`.
.theft_find_gpl_file <- function(alzword, alz) {
  platform_id <- str_trim(alzword$X2[grep("_platform_id", alzword$X1)])
  if (length(platform_id) == 0 || is.na(platform_id[1])) {
    stop("No '_platform_id' entry was found in '", alz, "'", call. = FALSE)
  }
  platform_id <- platform_id[1]

  # The id must be followed by a non-digit (or the end of the name) so that
  # e.g. GPL96 does not pick up a GPL9600 file.
  dir_files <- list.files()
  genena <- dir_files[grep(paste0("^", platform_id, "(\\D|$)"), dir_files)]
  if (length(genena) == 0) {
    stop("No GPL file starting with '", platform_id, "' was found in '",
         getwd(), "' (required to clean '", alz, "')", call. = FALSE)
  }
  if (length(genena) > 1) {
    warning("Several GPL files match '", platform_id, "' (",
            paste(genena, collapse = ", "), "); using '", genena[1], "'",
            call. = FALSE)
    genena <- genena[1]
  }
  genena
}

# Number of lines to skip in a soft GPL file so that the "ID" header line is
# read as the header: position of the first cell equal to "ID" within the
# first 1000 non-"!" lines, minus one.
.theft_find_id_skip <- function(genena) {
  head_rows <- read_delim(genena, delim = "\t", col_names = FALSE,
                          comment = "!", n_max = 1000)
  id_cells <- grep("^ID\\s*$", t(head_rows))
  if (length(id_cells) == 0) {
    stop("Could not find the 'ID' header line in the first 1000 lines of '",
         genena, "'", call. = FALSE)
  }
  id_cells[1] - 1
}

# Skip count for the soft GPL file `genena`, remembered across runs in
# "GPL_ID_LOC.txt" (columns GPL_FILE_NUM and LOC_ID).
.theft_soft_skip <- function(genena, gplnum) {
  loc_file <- "GPL_ID_LOC.txt"
  loc_known <- file.exists(loc_file)

  if (loc_known) {
    # Read the bookkeeping file once and look the GPL number up exactly
    # (a substring search would confuse e.g. 96 with 9600).
    loc_table <- read_delim(loc_file, delim = "\t", col_names = TRUE)
    seen <- which(loc_table$GPL_FILE_NUM == as.numeric(gplnum))
    if (length(seen) > 0) {
      return(loc_table$LOC_ID[seen[1]])
    }
  }

  skip_lines <- .theft_find_id_skip(genena)
  new_entry <- cbind(as.integer(gplnum), as.integer(skip_lines))
  if (loc_known) {
    cat(new_entry, file = loc_file, sep = "\t", fill = TRUE, append = TRUE)
  } else {
    colnames(new_entry) <- c("GPL_FILE_NUM", "LOC_ID")
    write.table(new_entry, file = loc_file, sep = "\t",
                row.names = FALSE, col.names = TRUE)
  }
  skip_lines
}

# Two-column (probe ID, gene name) table for the GPL file `genena`. Uses the
# cached "Clean_GPL<number>.txt" when it exists; otherwise builds it from the
# GPL file and writes the cache.
.theft_gene_map <- function(genena) {
  gpl_file <- .theft_last_component(genena)
  soft <- grepl("soft", gpl_file)
  gplnum <- gsub("\\D", "", gpl_file)
  clean_path <- paste0("Clean_GPL", gplnum, ".txt")

  if (file.exists(clean_path)) {
    return(read_delim(clean_path, delim = "\t",
                      col_names = c("ID", "Symbol"), comment = "!"))
  }

  if (soft) {
    skip_lines <- .theft_soft_skip(genena, gplnum)
    gpl_table <- read_delim(genena, delim = "\t", col_names = TRUE,
                            comment = "!", skip = skip_lines)
  } else {
    gpl_table <- read_delim(genena, delim = "\t", comment = "#")
  }
  geneIDNam <- .theft_pick_gene_columns(gpl_table, genena)

  ## Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ## remove the whitespace
  geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1, ]), str_trim(t(geneIDNam)[2, ])))

  ## Here is the clean version
  write.table(geneIDNam, file = clean_path, sep = "\t",
              row.names = FALSE, col.names = FALSE)
  geneIDNam
}

# Build the discretised table: clinical rows on top, one row per gene below
# (duplicated gene names averaged, values scaled per gene and coded by
# dndat()), a trailing "COL_NAs" row and a "ROW_NAs" column.
.theft_discretize <- function(ALZDAT, ALZWORDF) {
  ## get the wordy part again
  rawword <- t(ALZWORDF)

  ## where is ID_REF located
  hereim <- grep("ID_REF", rownames(rawword))
  if (length(hereim) == 0) {
    stop("The clinical data has no 'ID_REF' column ",
         "(no '!Sample_geo_accession' row was found)", call. = FALSE)
  }

  ## Subject Names GSM...
  subjnam <- rawword[hereim, ]

  ## Getting the names for the rows
  namedarows <- rownames(rawword)[-hereim] %>%
    as.data.frame()
  RAWWORD <- rawword[-hereim, ] %>%
    as.data.frame() %>%
    bind_cols(namedarows, .)
  naroww <- data.frame(ROW_NAs = as.numeric(unname(rowSums(is.na(RAWWORD)))))
  RAWWORD <- bind_cols(RAWWORD, naroww)

  roALZna <- t(ALZDAT) %>%
    rownames(.) %>%
    as.data.frame(.)
  colnames(roALZna) <- "ID_REF"

  RAWDAT <- t(ALZDAT) %>%
    as.data.frame(.)
  colnames(RAWDAT) <- NULL
  rownames(RAWDAT) <- NULL

  RAWDAT2 <- RAWDAT %>%
    cbind(roALZna, .) %>%
    dplyr::arrange(., ID_REF)

  ## Editing the file for R processing
  RAWDATID <- RAWDAT2[, 1] %>%
    as.matrix(.)

  RAWDATNUM <- RAWDAT2[, -1] %>%
    mapply(., FUN = as.numeric) %>%
    t(.)

  ## Consolidating genes with the same name
  tabRDATID <- table(RAWDATID)
  gene_levels <- rownames(tabRDATID)
  # Group the column positions by gene name once instead of re-scanning all
  # names for every gene.
  members <- split(seq_along(RAWDATID),
                   factor(as.vector(RAWDATID), levels = gene_levels))
  NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
  for (j in seq_along(members)) {
    cols <- members[[j]]
    if (length(cols) == 1) {
      ## Putting the ones without duplicates in their new homes
      NuRDATN[, j] <- RAWDATNUM[, cols]
    } else if (length(cols) > 1) {
      ## Averaging duplicates; drop = FALSE keeps a matrix for one sample
      NuRDATN[, j] <- rowMeans(RAWDATNUM[, cols, drop = FALSE], na.rm = TRUE)
    }
  }

  ## Scaling the Data
  scrawdat <- NuRDATN %>%
    scale()
  attr(scrawdat, "scaled:center") <- NULL
  attr(scrawdat, "scaled:scale") <- NULL
  colnames(scrawdat) <- gene_levels

  ## Discretized the Data
  dialzdat <- scrawdat %>%
    dndat(.) %>%
    t() %>%
    as.data.frame(.)
  colnames(dialzdat) <- rownames(RAWDATNUM)

  ## setting "ID_REF" as a new variable
  geneNAM <- as.data.frame(as.matrix(rownames(dialzdat), ncol = 1))
  colnames(geneNAM) <- "ID_REF"
  rownames(dialzdat) <- NULL
  dialzdat <- bind_cols(geneNAM, dialzdat)

  ## NAs in a column
  nacol <- as.data.frame(t(rep(0, dim(dialzdat)[2])), stringsAsFactors = FALSE)
  nacol[1, 1] <- "COL_NAs"
  for (x in seq_len(dim(dialzdat)[2])[-1]) {
    nacol[1, x] <- as.integer(sum(is.na(dialzdat[[x]])))
  }
  colnames(nacol) <- colnames(dialzdat)
  dialzdat <- bind_rows(dialzdat, nacol)

  ## NAs in a row
  narowd <- data.frame(ROW_NAs = as.numeric(unname(rowSums(is.na(dialzdat)))))
  dialzdat <- bind_cols(dialzdat, narowd)
  colnames(dialzdat)[2:(dim(dialzdat)[2] - 1)] <- subjnam
  colnames(RAWWORD) <- colnames(dialzdat)

  ## converting to character so that the clinical can be brought together
  ## with discrete data. Every column except the trailing ROW_NAs is
  ## converted; this is what `2:dim(dialzdat)[2]-1` evaluated to, because
  ## `:` binds tighter than `-`.
  for (k in seq_len(dim(dialzdat)[2] - 1)) {
    dialzdat[[k]] <- as.character(dialzdat[[k]])
  }

  # The End the full data
  bind_rows(RAWWORD, dialzdat)
}

# Clean one GEO series matrix file `alz` (a file in the working directory):
# writes "GSE<digits>aftexcel.txt" (raw clinical + expression data) and
# "GSE<digits>dscrt.txt" (discretised data), where <digits> are the digits of
# the file name.
.theft_clean_file <- function(alz) {
  # Working with the wordy part of the document
  alzword <- alz %>%
    read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
    dplyr::filter(grepl("!Sample", X1)) %>%
    dplyr::filter(!grepl("!Sample_contact", X1))

  # Getting the GPL file
  genena <- .theft_find_gpl_file(alzword, alz)

  ## Changing row names and column names:
  ALZWORD <- t(alzword)
  rownames(ALZWORD) <- NULL
  colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
  ALZWORD <- chngrownm(ALZWORD)[-1, ]
  ALZWORD <- ALZWORD %>%
    as.data.frame() %>%
    dplyr::select(-starts_with("col"))

  ## Reorganizing information within the columns and final clinical data
  ALZWORDF <- cinfo(ALZWORD)

  # Working with Actual Data part of file
  alzdat <- alz %>%
    read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = 1)
  ALZDAT <- t(alzdat[, -1])
  rownames(ALZDAT) <- NULL

  ## Gene ID to gene name table (cached clean version when available)
  geneIDNam <- .theft_gene_map(genena)

  ## Changing the gene ID to gene name
  ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
  colnames(ALZDAT) <- ALZDAT1[1, ]

  ## Adjusting the column names aka the gene names
  colnames(ALZDAT) <- gcnames(ALZDAT)

  # Full RAW Data
  Fullalzdwr <- ALZDAT %>%
    as.data.frame() %>%
    cbind(ALZWORDF, .)

  # Raw file is output
  gse_digits <- gsub("\\D", "", .theft_last_component(alz))
  write.table(t(Fullalzdwr), file = paste0("GSE", gse_digits, "aftexcel.txt"),
              sep = "\t")

  # Now for the discretization part
  Dscrtalzdw <- .theft_discretize(ALZDAT, ALZWORDF)

  # Produces Discrete file
  write.table(Dscrtalzdw, file = paste0("GSE", gse_digits, "dscrt.txt"),
              sep = "\t", col.names = TRUE, row.names = FALSE)
  invisible(NULL)
}

#MajorFunction#This is the function that does everything else
#
# Interactive entry point. Asks whether every "GSE*.txt.gz" file in the
# working directory should be cleaned ("Yes") or only a selection ("No"),
# then cleans each chosen file with .theft_clean_file(). Leaving the menu
# with 0 prints "Nothing done" and returns. Takes no arguments and returns
# NULL invisibly; its results are the files written to the working directory.
THEFT <- function() {
  wd <- getwd()
  choice <- menu(
    choices = c("Yes", "No"),
    title = paste0("Do you want to clean all data files in the directory ", wd, "?")
  )
  if (choice == 0L) {
    # menu() returns 0 when the user exits without choosing.
    cat("Nothing done\n")
    return(invisible(NULL))
  }

  dir_files <- list.files()
  gse_files <- dir_files[grep("^GSE.+\\.txt\\.gz$", dir_files)]
  if (length(gse_files) == 0) {
    warning("No GSE*.txt.gz data files were found in '", wd,
            "' and so no cleaning will be performed", call. = FALSE)
    return(invisible(NULL))
  }

  if (choice == 1L) {
    #ALL DATA FILES WILL BE CLEANED
    targets <- gse_files
  } else {
    #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
    targets <- select.list(choices = gse_files, multiple = TRUE,
                           title = "Choose the file/files you want to analyze:")
    if (length(targets) == 0) {
      warning("You did not select any files and so no cleaning will be performed")
      return(invisible(NULL))
    }
  }

  for (alz in targets) {
    .theft_clean_file(alz)
  }
  invisible(NULL)
}
#Entry point: the call below runs the interactive cleaning workflow every time this script is sourced
752 | THEFT() |