Commit adfed316993d072e749a8cd85434fc667c054f22

Authored by Efrain Gonzalez
1 parent 689231363c
Exists in master

An automated version of the RCleanDscret.R

Working on outputting more insightful errors and warnings. (UNTESTED)
Showing 1 changed file with 752 additions and 0 deletions   Show diff stats
File was created 1 #Efrain H. Gonzalez
2 #6/19/2017
3 #Libraries required to run the code
4 library(pryr)
5 library(MASS)
6 library(dplyr)
7 library(tidyr)
8 library(readr)
9 library(stringr)
10
11
12 #Necessary Functions
13 #1#Function for handling the changing of row names and column names
14 chngrownm <- function(mat){
15 row <- dim(mat)[1]
16 col <- dim(mat)[2]
17 j <- 1
18 x <- 1
19 p <- 1
20 a <- 1
21 b <- 1
22 g <- 1
23 for(j in 1:col){
24 if("!Sample_source_name_ch1"==mat[1,j]){
25 colnames(mat)[j] <- "Brain_Region"
26 }
27 if("!Sample_title" == mat[1,j]){
28 colnames(mat)[j] <- "Title"
29 }
30 if("!Sample_geo_accession" == mat[1,j]){
31 colnames(mat)[j] <- "ID_REF"
32 } else{
33 if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){
34 colnames(mat)[j] <- paste0("Sex",x)
35 x = x + 1
36 }
37 if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){
38 colnames(mat)[j] <- paste0("PMI",p)
39 p = p + 1
40 }
41 if(grepl("age|Age|AGE",mat[2,j])==TRUE){
42 colnames(mat)[j] <- paste0("Age",a)
43 a = a + 1
44 }
45 if(grepl("braak|b&b",mat[2,j])==TRUE){
46 colnames(mat)[j] <- paste0("Braak",b)
47 b = b + 1
48 }
49 if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,j])==TRUE){
50 colnames(mat)[j] <- paste0("Group",g)
51 g = g + 1
52 }
53
54 }
55 j = j + 1
56 }
57 mat
58 }
59
60 #2#Function for reorganizing information within the columns
61 cinfo <- function(mat){
62 col <- dim(mat)[2]
63 j <-2
64 for(j in 2:col){
65 if(grepl("Group",colnames(mat)[j]) == TRUE){
66 mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
67 }
68 if(grepl("Age",colnames(mat)[j])==TRUE){
69 mat[,j] <- gsub("\\D","",mat[,j])%>%
70 as.integer()
71 }
72 if(grepl("Sex",colnames(mat)[j])==TRUE){
73 mat[,j] <- gsub(".+:\\s","",mat[,j])
74 }
75 if(grepl("PMI",colnames(mat)[j])==TRUE){
76 mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
77 as.numeric()
78 }
79 if(grepl("Braak",colnames(mat)[j])==TRUE){
80 mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
81 as.roman()%>%
82 as.integer()
83 }
84 j=j+1
85 }
86 mat
87 }
88
89 #3#Function for labeling the gene IDs without names
90 NAFIXING <- function(GIDNAM){
91 row <- dim(GIDNAM)[1]
92 i <- 1
93 for(i in 1:row){
94 if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){
95 GIDNAM[i,2] <- GIDNAM[i,1]
96 }
97 i <- i + 1
98 }
99 GIDNAM
100 }
101
102 #4#Function for changing the gene ID to gene name
103 cgeneID <- function(GeneName,DATA){
104 colGene <- dim(GeneName)[2]
105 j <- 1
106 for(j in 1:colGene){
107 chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
108 if(is.na(sum(chngsreq))==FALSE){
109 if(sum(chngsreq) > 0){
110 DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
111 }
112 }
113 j = j+1
114 }
115 DATA
116 }
117
118 #5#Function for adjusting the gene names
119 gcnames <- function(DiData,usecol=1){
120 nuruns <- dim(DiData)[2]
121 i = 1
122 nwnam <- rep("0",length.out=nuruns)
123 for(i in 1:nuruns){
124 if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){
125 nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][usecol])
126 } else{
127 nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][1])
128 }
129
130 }
131 nwnam
132
133 }
134
135 #6# Function for discretizing the data
136 dndat <- function(NDATA){
137 rownd <- dim(NDATA)[1]
138 colnd <- dim(NDATA)[2]
139 DDATA <- matrix(0,nrow=rownd,ncol=colnd)
140 colnames(DDATA) <- colnames(NDATA)
141 i <- 1
142 for(i in 1:rownd){
143 j <- 1
144 for(j in 1:colnd){
145 if(is.na(NDATA[i,j])==FALSE){
146
147 if(NDATA[i,j] < -1){
148 DDATA[i,j]=0L
149 }
150 if(NDATA[i,j] > 1){
151 DDATA[i,j]=2L
152 }
153 if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
154 DDATA[i,j]=1L
155 }
156 } else{
157 DDATA[i,j] = NDATA[i,j]
158 }
159 j = j + 1
160 }
161 i = i + 1
162 }
163 DDATA
164 }
165
166
167 #MajorFunction#This is the function that does everything else
168 THEFT <- function(){
169 #Set working directory based on the directory of the series matrix file Currently only works for windows
170 wd <- getwd()
171 #list.files()
172 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?")
173 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
174 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())
175
176 #ALL DATA FILES WILL BE CLEANED
177 if(numDAT == 1){
178 #indexing the data files
179 n <- 1
180 for(n in 1: length(GSEfileloc)){
181 alz <- list.files()[GSEfileloc[n]]
182
183 #Working with the wordy part of the document
184 alzword <- alz %>%
185 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
186 filter(grepl("!Sample",X1))%>%
187 filter(!grepl("!Sample_contact",X1))
188
189 #Getting the GPL file
190 genena <- grep("_platform_id",alzword$X1) %>%
191 alzword$X2[.] %>%
192 str_trim(.) %>%
193 paste0("^",.) %>%
194 grep(.,list.files()) %>%
195 list.files()[.]
196
197 #Find out if it is a soft GPL file or not
198 soft <- strsplit(genena,"[\\|/]") %>%
199 .[[1]] %>%
200 .[length(.)] %>%
201 grepl("soft",.)
202
203 ##Changing row names and column names:
204 ALZWORD <- t(alzword)
205 rownames(ALZWORD)=NULL
206 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
207 ALZWORD <- chngrownm(ALZWORD)[-1,]
208 ALZWORD <- ALZWORD%>%
209 as.data.frame()%>%
210 dplyr::select(-starts_with("col"))
211
212 ##Reorganizing information within the columns and final clinical data
213 ALZWORDF <- cinfo(ALZWORD)
214
215
216 #Working with Actual Data part of file
217 alzdat <- alz %>%
218 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
219 ALZDAT <- t(alzdat[,-1])
220 rownames(ALZDAT)=NULL
221
222 ##Is there a clean version of the GPL file available?
223 gplnum <- strsplit(genena,"[\\|/]") %>%
224 .[[1]] %>%
225 .[length(.)] %>%
226 gsub("\\D","",.)
227 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
228 if(clfileex >= 1){
229 #use the clean version
230 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
231 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
232
233 }
234 if(clfileex == 0){
235 ##Lets Create a clean version
236
237 ##Gene ID to Gene Name
238 if(soft == TRUE){
239 #Check to see if there is already a file containing information on soft files
240 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
241 if(fileex == 1){
242 #Check to see if this GPL soft file has been used before
243 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
244 .$GPL_FILE_NUM%>%
245 grepl(gplnum,.) %>%
246 sum()
247 if(IDF == 1){
248 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
249 .$GPL_FILE_NUM%>%
250 grep(gplnum,.)
251 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
252 .$LOC_ID %>%
253 .[IDLOCAL]
254 geneIDNam <- genena %>%
255 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
256 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
257 }
258 if(IDF == 0){
259 #No information on this particular GPL file
260 idLOCGPL <- genena %>%
261 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
262 t(.) %>%
263 grep("^ID\\s*$",.) %>%
264 -1
265 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
266 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
267 geneIDNam <- genena %>%
268 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
269 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
270 }
271 }
272 if(fileex == 0){
273 #We must create a file that we can access for later use
274 idLOCGPL <- genena %>%
275 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
276 t(.) %>%
277 grep("^ID\\s*$",.) %>%
278 -1
279 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
280 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
281 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
282 geneIDNam <- genena %>%
283 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
284 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
285 }
286 }
287 if(soft == FALSE){
288 geneIDNam <- genena %>%
289 read_delim(delim="\t",comment = "#")%>%
290 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
291 }
292
293 ##Labeling the gene IDs without names
294 geneIDNam <- NAFIXING(geneIDNam)
295
296 ##remove the whitespace
297 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
298
299 ##Here is the clean version
300 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
301 }
302
303
304
305 ##Changing the gene ID to gene name
306 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
307 colnames(ALZDAT) = ALZDAT1[1,]
308
309
310 ##Adjusting the column names aka the gene names
311 colnames(ALZDAT) <- gcnames(ALZDAT)
312
313
314 #Full RAW Data
315 Fullalzdwr <- ALZDAT %>%
316 as.data.frame() %>%
317 cbind(ALZWORDF,.)
318
319 #Raw file is output
320 nfnaex <- strsplit(alz,"[\\]") %>%
321 .[[1]] %>%
322 .[length(.)] %>%
323 gsub("\\D","",.) %>%
324 c("GSE",.,"aftexcel.txt") %>%
325 paste(collapse = "")
326 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
327
328
329
330 #Now for the discretization part
331 ##get the wordy part again
332 rawword <- t(ALZWORDF)
333
334 ##where is ID_REF located
335 hereim <- grep("ID_REF",rownames(rawword))
336
337 ##Subject Names GSM...
338 subjnam <- rawword[hereim,]
339
340 ##Getting the names for the rows
341 namedarows <- rownames(rawword)[-hereim] %>%
342 as.data.frame()
343 RAWWORD <- rawword[-hereim,] %>%
344 as.data.frame() %>%
345 bind_cols(namedarows,.)
346 z <- 1
347 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
348 for(z in 1:dim(RAWWORD)[1]){
349 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
350 z <- z + 1
351 }
352
353 colnames(naroww) <- "ROW_NAs"
354 RAWWORD <- bind_cols(RAWWORD,naroww)
355
356
357 roALZna <- t(ALZDAT) %>%
358 rownames(.) %>%
359 as.data.frame(.)
360 colnames(roALZna) <- "ID_REF"
361
362 RAWDAT <- t(ALZDAT) %>%
363 as.data.frame(.)
364 colnames(RAWDAT) <- NULL
365 rownames(RAWDAT) <- NULL
366
367 RAWDAT2 <- RAWDAT %>%
368 cbind(roALZna,.) %>%
369 dplyr::arrange(.,ID_REF)
370
371 ##Editing the file for R processing
372 RAWDATID <- RAWDAT2[,1] %>%
373 as.matrix(.)
374
375 RAWDATNUM <- RAWDAT2[,-1] %>%
376 mapply(.,FUN = as.numeric) %>%
377 t(.)
378
379 ##Consolidating genes with the same name
380 ###create empty matrix of size equal to tabRDATID
381 tabRDATID <- table(RAWDATID)
382 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
383 j <- 1
384 for(j in 1:length(tabRDATID)){
385 ##Putting the ones without duplicates in their new homes
386 if(tabRDATID[j] == 1){
387 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
388 }
389 ##Averaging duplicates and putting them in their new homes
390 if(tabRDATID[j] > 1){
391 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
392 }
393 j <- j + 1
394 }
395
396 ##Scaling the Data
397 scrawdat <- NuRDATN%>%
398 scale()
399 attr(scrawdat,"scaled:center") <- NULL
400 attr(scrawdat,"scaled:scale") <- NULL
401 colnames(scrawdat) <- rownames(tabRDATID)
402
403 ##Discretized the Data
404 dialzdat <- scrawdat %>%
405 dndat(.) %>%
406 t()%>%
407 as.data.frame(.)
408 colnames(dialzdat) <- rownames(RAWDATNUM)
409
410 ##setting "ID_REF" as a new variable
411 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
412 colnames(geneNAM) <- "ID_REF"
413 rownames(dialzdat) <- NULL
414 dialzdat <-bind_cols(geneNAM,dialzdat)
415
416 ##NAs in a column
417 x <- 2
418 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
419 nacol[1,1] = "COL_NAs"
420 for(x in 2:dim(dialzdat)[2]){
421 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
422 x <- x + 1
423 }
424 colnames(nacol) <- colnames(dialzdat)
425 dialzdat <- bind_rows(dialzdat,nacol)
426
427 ##NAs in a row
428 y <- 1
429 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
430 for(y in 1:dim(dialzdat)[1]){
431 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
432 y <- y + 1
433 }
434 colnames(narowd) <- "ROW_NAs"
435 dialzdat <- bind_cols(dialzdat,narowd)
436 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
437 colnames(RAWWORD) <- colnames(dialzdat)
438 ##converting to character so that the clinical can be brought together with discrete data
439 k <- 2
440 for(k in 2:dim(dialzdat)[2]-1){
441 dialzdat[,k] <- as.character(dialzdat[,k])
442 k <- k + 1
443 }
444 #The End the full data
445 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
446
447 #Produces Discrete file
448 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
449 .[[1]] %>%
450 .[length(.)] %>%
451 gsub("\\D","",.) %>%
452 c("GSE",.,"dscrt.txt") %>%
453 paste(collapse = "")
454 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
455 n <- n +1
456 }
457 }
458
459 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
460 if(numDAT == 2){
461 #All the files you want to analyze
462 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
463 if(length(ANDIS) == 0){
464 #Spit out a warning
465 warning("You did not select any files and so no cleaning will be performed")
466 } else{
467 #indexing the data files
468 n <- 1
469 for(n in 1: length(ANDIS)){
470 alz <- ANDIS[n]
471
472 #Working with the wordy part of the document
473 alzword <- alz %>%
474 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
475 filter(grepl("!Sample",X1))%>%
476 filter(!grepl("!Sample_contact",X1))
477
478 #Getting the GPL file
479 genena <- grep("_platform_id",alzword$X1) %>%
480 alzword$X2[.] %>%
481 str_trim(.) %>%
482 paste0("^",.) %>%
483 grep(.,list.files()) %>%
484 list.files()[.]
485
486 #Find out if it is a soft GPL file or not
487 soft <- strsplit(genena,"[\\|/]") %>%
488 .[[1]] %>%
489 .[length(.)] %>%
490 grepl("soft",.)
491
492 ##Changing row names and column names:
493 ALZWORD <- t(alzword)
494 rownames(ALZWORD)=NULL
495 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
496 ALZWORD <- chngrownm(ALZWORD)[-1,]
497 ALZWORD <- ALZWORD%>%
498 as.data.frame()%>%
499 dplyr::select(-starts_with("col"))
500
501 ##Reorganizing information within the columns and final clinical data
502 ALZWORDF <- cinfo(ALZWORD)
503
504
505 #Working with Actual Data part of file
506 alzdat <- alz %>%
507 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
508 ALZDAT <- t(alzdat[,-1])
509 rownames(ALZDAT)=NULL
510
511 ##Is there a clean version of the GPL file available?
512 gplnum <- strsplit(genena,"[\\|/]") %>%
513 .[[1]] %>%
514 .[length(.)] %>%
515 gsub("\\D","",.)
516 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
517 if(clfileex >= 1){
518 #use the clean version
519 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
520 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
521
522 }
523 if(clfileex == 0){
524 ##Lets Create a clean version
525
526 ##Gene ID to Gene Name
527 if(soft == TRUE){
528 #Check to see if there is already a file containing information on soft files
529 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
530 if(fileex == 1){
531 #Check to see if this GPL soft file has been used before
532 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
533 .$GPL_FILE_NUM%>%
534 grepl(gplnum,.) %>%
535 sum()
536 if(IDF == 1){
537 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
538 .$GPL_FILE_NUM%>%
539 grep(gplnum,.)
540 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
541 .$LOC_ID %>%
542 .[IDLOCAL]
543 geneIDNam <- genena %>%
544 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
545 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
546 }
547 if(IDF == 0){
548 #No information on this particular GPL file
549 idLOCGPL <- genena %>%
550 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
551 t(.) %>%
552 grep("^ID\\s*$",.) %>%
553 -1
554 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
555 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
556 geneIDNam <- genena %>%
557 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
558 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
559 }
560 }
561 if(fileex == 0){
562 #We must create a file that we can access for later use
563 idLOCGPL <- genena %>%
564 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
565 t(.) %>%
566 grep("^ID\\s*$",.) %>%
567 -1
568 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
569 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
570 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
571 geneIDNam <- genena %>%
572 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
573 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
574 }
575 }
576 if(soft == FALSE){
577 geneIDNam <- genena %>%
578 read_delim(delim="\t",comment = "#")%>%
579 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
580 }
581
582 ##Labeling the gene IDs without names
583 geneIDNam <- NAFIXING(geneIDNam)
584
585 ##remove the whitespace
586 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
587
588 ##Here is the clean version
589 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
590 }
591
592
593
594 ##Changing the gene ID to gene name
595 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
596 colnames(ALZDAT) = ALZDAT1[1,]
597
598
599 ##Adjusting the column names aka the gene names
600 colnames(ALZDAT) <- gcnames(ALZDAT)
601
602
603 #Full RAW Data
604 Fullalzdwr <- ALZDAT %>%
605 as.data.frame() %>%
606 cbind(ALZWORDF,.)
607
608 #Raw file is output
609 nfnaex <- strsplit(alz,"[\\]") %>%
610 .[[1]] %>%
611 .[length(.)] %>%
612 gsub("\\D","",.) %>%
613 c("GSE",.,"aftexcel.txt") %>%
614 paste(collapse = "")
615 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
616
617
618
619 #Now for the discretization part
620 ##get the wordy part again
621 rawword <- t(ALZWORDF)
622
623 ##where is ID_REF located
624 hereim <- grep("ID_REF",rownames(rawword))
625
626 ##Subject Names GSM...
627 subjnam <- rawword[hereim,]
628
629 ##Getting the names for the rows
630 namedarows <- rownames(rawword)[-hereim] %>%
631 as.data.frame()
632 RAWWORD <- rawword[-hereim,] %>%
633 as.data.frame() %>%
634 bind_cols(namedarows,.)
635 z <- 1
636 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
637 for(z in 1:dim(RAWWORD)[1]){
638 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
639 z <- z + 1
640 }
641
642 colnames(naroww) <- "ROW_NAs"
643 RAWWORD <- bind_cols(RAWWORD,naroww)
644
645
646 roALZna <- t(ALZDAT) %>%
647 rownames(.) %>%
648 as.data.frame(.)
649 colnames(roALZna) <- "ID_REF"
650
651 RAWDAT <- t(ALZDAT) %>%
652 as.data.frame(.)
653 colnames(RAWDAT) <- NULL
654 rownames(RAWDAT) <- NULL
655
656 RAWDAT2 <- RAWDAT %>%
657 cbind(roALZna,.) %>%
658 dplyr::arrange(.,ID_REF)
659
660 ##Editing the file for R processing
661 RAWDATID <- RAWDAT2[,1] %>%
662 as.matrix(.)
663
664 RAWDATNUM <- RAWDAT2[,-1] %>%
665 mapply(.,FUN = as.numeric) %>%
666 t(.)
667
668 ##Consolidating genes with the same name
669 ###create empty matrix of size equal to tabRDATID
670 tabRDATID <- table(RAWDATID)
671 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
672 j <- 1
673 for(j in 1:length(tabRDATID)){
674 ##Putting the ones without duplicates in their new homes
675 if(tabRDATID[j] == 1){
676 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
677 }
678 ##Averaging duplicates and putting them in their new homes
679 if(tabRDATID[j] > 1){
680 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
681 }
682 j <- j + 1
683 }
684
685 ##Scaling the Data
686 scrawdat <- NuRDATN%>%
687 scale()
688 attr(scrawdat,"scaled:center") <- NULL
689 attr(scrawdat,"scaled:scale") <- NULL
690 colnames(scrawdat) <- rownames(tabRDATID)
691
692 ##Discretized the Data
693 dialzdat <- scrawdat %>%
694 dndat(.) %>%
695 t()%>%
696 as.data.frame(.)
697 colnames(dialzdat) <- rownames(RAWDATNUM)
698
699 ##setting "ID_REF" as a new variable
700 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
701 colnames(geneNAM) <- "ID_REF"
702 rownames(dialzdat) <- NULL
703 dialzdat <-bind_cols(geneNAM,dialzdat)
704
705 ##NAs in a column
706 x <- 2
707 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
708 nacol[1,1] = "COL_NAs"
709 for(x in 2:dim(dialzdat)[2]){
710 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
711 x <- x + 1
712 }
713 colnames(nacol) <- colnames(dialzdat)
714 dialzdat <- bind_rows(dialzdat,nacol)
715
716 ##NAs in a row
717 y <- 1
718 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
719 for(y in 1:dim(dialzdat)[1]){
720 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
721 y <- y + 1
722 }
723 colnames(narowd) <- "ROW_NAs"
724 dialzdat <- bind_cols(dialzdat,narowd)
725 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
726 colnames(RAWWORD) <- colnames(dialzdat)
727 ##converting to character so that the clinical can be brought together with discrete data
728 k <- 2
729 for(k in 2:dim(dialzdat)[2]-1){
730 dialzdat[,k] <- as.character(dialzdat[,k])
731 k <- k + 1
732 }
733 #The End the full data
734 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
735
736 #Produces Discrete file
737 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
738 .[[1]] %>%
739 .[length(.)] %>%
740 gsub("\\D","",.) %>%
741 c("GSE",.,"dscrt.txt") %>%
742 paste(collapse = "")
743 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
744
745
746 n <- n + 1
747 }
748 }
749 }
750 }
751 #The Rest of this code will be used every time you want to change a data set
752 THEFT()