Commit adfed316993d072e749a8cd85434fc667c054f22
1 parent
689231363c
Exists in
master
An automated version of the RCleanDscret.R
Working on outputting more insightful errors and warnings. (UNTESTED)
Showing
1 changed file
with
752 additions
and
0 deletions
Show diff stats
RAutoClDs.R
... | ... | @@ -0,0 +1,752 @@ |
1 | +#Efrain H. Gonzalez | |
2 | +#6/19/2017 | |
3 | +#Libraries required to run the code | |
4 | +library(pryr) | |
5 | +library(MASS) | |
6 | +library(dplyr) | |
7 | +library(tidyr) | |
8 | +library(readr) | |
9 | +library(stringr) | |
10 | + | |
11 | + | |
12 | +#Necessary Functions | |
13 | +#1#Function for handling the changing of row names and column names | |
14 | +chngrownm <- function(mat){ | |
15 | + row <- dim(mat)[1] | |
16 | + col <- dim(mat)[2] | |
17 | + j <- 1 | |
18 | + x <- 1 | |
19 | + p <- 1 | |
20 | + a <- 1 | |
21 | + b <- 1 | |
22 | + g <- 1 | |
23 | + for(j in 1:col){ | |
24 | + if("!Sample_source_name_ch1"==mat[1,j]){ | |
25 | + colnames(mat)[j] <- "Brain_Region" | |
26 | + } | |
27 | + if("!Sample_title" == mat[1,j]){ | |
28 | + colnames(mat)[j] <- "Title" | |
29 | + } | |
30 | + if("!Sample_geo_accession" == mat[1,j]){ | |
31 | + colnames(mat)[j] <- "ID_REF" | |
32 | + } else{ | |
33 | + if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){ | |
34 | + colnames(mat)[j] <- paste0("Sex",x) | |
35 | + x = x + 1 | |
36 | + } | |
37 | + if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){ | |
38 | + colnames(mat)[j] <- paste0("PMI",p) | |
39 | + p = p + 1 | |
40 | + } | |
41 | + if(grepl("age|Age|AGE",mat[2,j])==TRUE){ | |
42 | + colnames(mat)[j] <- paste0("Age",a) | |
43 | + a = a + 1 | |
44 | + } | |
45 | + if(grepl("braak|b&b",mat[2,j])==TRUE){ | |
46 | + colnames(mat)[j] <- paste0("Braak",b) | |
47 | + b = b + 1 | |
48 | + } | |
49 | + if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,j])==TRUE){ | |
50 | + colnames(mat)[j] <- paste0("Group",g) | |
51 | + g = g + 1 | |
52 | + } | |
53 | + | |
54 | + } | |
55 | + j = j + 1 | |
56 | + } | |
57 | + mat | |
58 | +} | |
59 | + | |
60 | +#2#Function for reorganizing information within the columns | |
61 | +cinfo <- function(mat){ | |
62 | + col <- dim(mat)[2] | |
63 | + j <-2 | |
64 | + for(j in 2:col){ | |
65 | + if(grepl("Group",colnames(mat)[j]) == TRUE){ | |
66 | + mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j]) | |
67 | + } | |
68 | + if(grepl("Age",colnames(mat)[j])==TRUE){ | |
69 | + mat[,j] <- gsub("\\D","",mat[,j])%>% | |
70 | + as.integer() | |
71 | + } | |
72 | + if(grepl("Sex",colnames(mat)[j])==TRUE){ | |
73 | + mat[,j] <- gsub(".+:\\s","",mat[,j]) | |
74 | + } | |
75 | + if(grepl("PMI",colnames(mat)[j])==TRUE){ | |
76 | + mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>% | |
77 | + as.numeric() | |
78 | + } | |
79 | + if(grepl("Braak",colnames(mat)[j])==TRUE){ | |
80 | + mat[,j]<-gsub(".+:\\s","",mat[,j])%>% | |
81 | + as.roman()%>% | |
82 | + as.integer() | |
83 | + } | |
84 | + j=j+1 | |
85 | + } | |
86 | + mat | |
87 | +} | |
88 | + | |
89 | +#3#Function for labeling the gene IDs without names | |
90 | +NAFIXING <- function(GIDNAM){ | |
91 | + row <- dim(GIDNAM)[1] | |
92 | + i <- 1 | |
93 | + for(i in 1:row){ | |
94 | + if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){ | |
95 | + GIDNAM[i,2] <- GIDNAM[i,1] | |
96 | + } | |
97 | + i <- i + 1 | |
98 | + } | |
99 | + GIDNAM | |
100 | +} | |
101 | + | |
102 | +#4#Function for changing the gene ID to gene name | |
103 | +cgeneID <- function(GeneName,DATA){ | |
104 | + colGene <- dim(GeneName)[2] | |
105 | + j <- 1 | |
106 | + for(j in 1:colGene){ | |
107 | + chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) | |
108 | + if(is.na(sum(chngsreq))==FALSE){ | |
109 | + if(sum(chngsreq) > 0){ | |
110 | + DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
111 | + } | |
112 | + } | |
113 | + j = j+1 | |
114 | + } | |
115 | + DATA | |
116 | +} | |
117 | + | |
118 | +#5#Function for adjusting the gene names | |
119 | +gcnames <- function(DiData,usecol=1){ | |
120 | + nuruns <- dim(DiData)[2] | |
121 | + i = 1 | |
122 | + nwnam <- rep("0",length.out=nuruns) | |
123 | + for(i in 1:nuruns){ | |
124 | + if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){ | |
125 | + nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][usecol]) | |
126 | + } else{ | |
127 | + nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][1]) | |
128 | + } | |
129 | + | |
130 | + } | |
131 | + nwnam | |
132 | + | |
133 | +} | |
134 | + | |
135 | +#6# Function for discretizing the data | |
136 | +dndat <- function(NDATA){ | |
137 | + rownd <- dim(NDATA)[1] | |
138 | + colnd <- dim(NDATA)[2] | |
139 | + DDATA <- matrix(0,nrow=rownd,ncol=colnd) | |
140 | + colnames(DDATA) <- colnames(NDATA) | |
141 | + i <- 1 | |
142 | + for(i in 1:rownd){ | |
143 | + j <- 1 | |
144 | + for(j in 1:colnd){ | |
145 | + if(is.na(NDATA[i,j])==FALSE){ | |
146 | + | |
147 | + if(NDATA[i,j] < -1){ | |
148 | + DDATA[i,j]=0L | |
149 | + } | |
150 | + if(NDATA[i,j] > 1){ | |
151 | + DDATA[i,j]=2L | |
152 | + } | |
153 | + if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){ | |
154 | + DDATA[i,j]=1L | |
155 | + } | |
156 | + } else{ | |
157 | + DDATA[i,j] = NDATA[i,j] | |
158 | + } | |
159 | + j = j + 1 | |
160 | + } | |
161 | + i = i + 1 | |
162 | + } | |
163 | + DDATA | |
164 | +} | |
165 | + | |
166 | + | |
167 | +#MajorFunction#This is the function that does everything else | |
168 | +THEFT <- function(){ | |
169 | + #Set working directory based on the directory of the series matrix file Currently only works for windows | |
170 | + wd <- getwd() | |
171 | + #list.files() | |
172 | + #gsub("wd",wd,"Do you want to clean all data files in the directory wd?") | |
173 | + numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L) | |
174 | + GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files()) | |
175 | + | |
176 | + #ALL DATA FILES WILL BE CLEANED | |
177 | + if(numDAT == 1){ | |
178 | + #indexing the data files | |
179 | + n <- 1 | |
180 | + for(n in 1: length(GSEfileloc)){ | |
181 | + alz <- list.files()[GSEfileloc[n]] | |
182 | + | |
183 | + #Working with the wordy part of the document | |
184 | + alzword <- alz %>% | |
185 | + read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% | |
186 | + filter(grepl("!Sample",X1))%>% | |
187 | + filter(!grepl("!Sample_contact",X1)) | |
188 | + | |
189 | + #Getting the GPL file | |
190 | + genena <- grep("_platform_id",alzword$X1) %>% | |
191 | + alzword$X2[.] %>% | |
192 | + str_trim(.) %>% | |
193 | + paste0("^",.) %>% | |
194 | + grep(.,list.files()) %>% | |
195 | + list.files()[.] | |
196 | + | |
197 | + #Find out if it is a soft GPL file or not | |
198 | + soft <- strsplit(genena,"[\\|/]") %>% | |
199 | + .[[1]] %>% | |
200 | + .[length(.)] %>% | |
201 | + grepl("soft",.) | |
202 | + | |
203 | + ##Changing row names and column names: | |
204 | + ALZWORD <- t(alzword) | |
205 | + rownames(ALZWORD)=NULL | |
206 | + colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) | |
207 | + ALZWORD <- chngrownm(ALZWORD)[-1,] | |
208 | + ALZWORD <- ALZWORD%>% | |
209 | + as.data.frame()%>% | |
210 | + dplyr::select(-starts_with("col")) | |
211 | + | |
212 | + ##Reorganizing information within the columns and final clinical data | |
213 | + ALZWORDF <- cinfo(ALZWORD) | |
214 | + | |
215 | + | |
216 | + #Working with Actual Data part of file | |
217 | + alzdat <- alz %>% | |
218 | + read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) | |
219 | + ALZDAT <- t(alzdat[,-1]) | |
220 | + rownames(ALZDAT)=NULL | |
221 | + | |
222 | + ##Is there a clean version of the GPL file available? | |
223 | + gplnum <- strsplit(genena,"[\\|/]") %>% | |
224 | + .[[1]] %>% | |
225 | + .[length(.)] %>% | |
226 | + gsub("\\D","",.) | |
227 | + clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) | |
228 | + if(clfileex >= 1){ | |
229 | + #use the clean version | |
230 | + geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% | |
231 | + read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") | |
232 | + | |
233 | + } | |
234 | + if(clfileex == 0){ | |
235 | + ##Lets Create a clean version | |
236 | + | |
237 | + ##Gene ID to Gene Name | |
238 | + if(soft == TRUE){ | |
239 | + #Check to see if there is already a file containing information on soft files | |
240 | + fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) | |
241 | + if(fileex == 1){ | |
242 | + #Check to see if this GPL soft file has been used before | |
243 | + IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% | |
244 | + .$GPL_FILE_NUM%>% | |
245 | + grepl(gplnum,.) %>% | |
246 | + sum() | |
247 | + if(IDF == 1){ | |
248 | + IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% | |
249 | + .$GPL_FILE_NUM%>% | |
250 | + grep(gplnum,.) | |
251 | + idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% | |
252 | + .$LOC_ID %>% | |
253 | + .[IDLOCAL] | |
254 | + geneIDNam <- genena %>% | |
255 | + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% | |
256 | + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | |
257 | + } | |
258 | + if(IDF == 0){ | |
259 | + #No information on this particular GPL file | |
260 | + idLOCGPL <- genena %>% | |
261 | + read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% | |
262 | + t(.) %>% | |
263 | + grep("^ID\\s*$",.) %>% | |
264 | + -1 | |
265 | + cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% | |
266 | + cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) | |
267 | + geneIDNam <- genena %>% | |
268 | + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% | |
269 | + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | |
270 | + } | |
271 | + } | |
272 | + if(fileex == 0){ | |
273 | + #We must create a file that we can access for later use | |
274 | + idLOCGPL <- genena %>% | |
275 | + read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% | |
276 | + t(.) %>% | |
277 | + grep("^ID\\s*$",.) %>% | |
278 | + -1 | |
279 | + Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) | |
280 | + colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") | |
281 | + write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) | |
282 | + geneIDNam <- genena %>% | |
283 | + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% | |
284 | + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | |
285 | + } | |
286 | + } | |
287 | + if(soft == FALSE){ | |
288 | + geneIDNam <- genena %>% | |
289 | + read_delim(delim="\t",comment = "#")%>% | |
290 | + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | |
291 | + } | |
292 | + | |
293 | + ##Labeling the gene IDs without names | |
294 | + geneIDNam <- NAFIXING(geneIDNam) | |
295 | + | |
296 | + ##remove the whitespace | |
297 | + geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) | |
298 | + | |
299 | + ##Here is the clean version | |
300 | + write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) | |
301 | + } | |
302 | + | |
303 | + | |
304 | + | |
305 | + ##Changing the gene ID to gene name | |
306 | + ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) | |
307 | + colnames(ALZDAT) = ALZDAT1[1,] | |
308 | + | |
309 | + | |
310 | + ##Adjusting the column names aka the gene names | |
311 | + colnames(ALZDAT) <- gcnames(ALZDAT) | |
312 | + | |
313 | + | |
314 | + #Full RAW Data | |
315 | + Fullalzdwr <- ALZDAT %>% | |
316 | + as.data.frame() %>% | |
317 | + cbind(ALZWORDF,.) | |
318 | + | |
319 | + #Raw file is output | |
320 | + nfnaex <- strsplit(alz,"[\\]") %>% | |
321 | + .[[1]] %>% | |
322 | + .[length(.)] %>% | |
323 | + gsub("\\D","",.) %>% | |
324 | + c("GSE",.,"aftexcel.txt") %>% | |
325 | + paste(collapse = "") | |
326 | + write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") | |
327 | + | |
328 | + | |
329 | + | |
330 | + #Now for the discretization part | |
331 | + ##get the wordy part again | |
332 | + rawword <- t(ALZWORDF) | |
333 | + | |
334 | + ##where is ID_REF located | |
335 | + hereim <- grep("ID_REF",rownames(rawword)) | |
336 | + | |
337 | + ##Subject Names GSM... | |
338 | + subjnam <- rawword[hereim,] | |
339 | + | |
340 | + ##Getting the names for the rows | |
341 | + namedarows <- rownames(rawword)[-hereim] %>% | |
342 | + as.data.frame() | |
343 | + RAWWORD <- rawword[-hereim,] %>% | |
344 | + as.data.frame() %>% | |
345 | + bind_cols(namedarows,.) | |
346 | + z <- 1 | |
347 | + naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) | |
348 | + for(z in 1:dim(RAWWORD)[1]){ | |
349 | + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
350 | + z <- z + 1 | |
351 | + } | |
352 | + | |
353 | + colnames(naroww) <- "ROW_NAs" | |
354 | + RAWWORD <- bind_cols(RAWWORD,naroww) | |
355 | + | |
356 | + | |
357 | + roALZna <- t(ALZDAT) %>% | |
358 | + rownames(.) %>% | |
359 | + as.data.frame(.) | |
360 | + colnames(roALZna) <- "ID_REF" | |
361 | + | |
362 | + RAWDAT <- t(ALZDAT) %>% | |
363 | + as.data.frame(.) | |
364 | + colnames(RAWDAT) <- NULL | |
365 | + rownames(RAWDAT) <- NULL | |
366 | + | |
367 | + RAWDAT2 <- RAWDAT %>% | |
368 | + cbind(roALZna,.) %>% | |
369 | + dplyr::arrange(.,ID_REF) | |
370 | + | |
371 | + ##Editing the file for R processing | |
372 | + RAWDATID <- RAWDAT2[,1] %>% | |
373 | + as.matrix(.) | |
374 | + | |
375 | + RAWDATNUM <- RAWDAT2[,-1] %>% | |
376 | + mapply(.,FUN = as.numeric) %>% | |
377 | + t(.) | |
378 | + | |
379 | + ##Consolidating genes with the same name | |
380 | + ###create empty matrix of size equal to tabRDATID | |
381 | + tabRDATID <- table(RAWDATID) | |
382 | + NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) | |
383 | + j <- 1 | |
384 | + for(j in 1:length(tabRDATID)){ | |
385 | + ##Putting the ones without duplicates in their new homes | |
386 | + if(tabRDATID[j] == 1){ | |
387 | + NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] | |
388 | + } | |
389 | + ##Averaging duplicates and putting them in their new homes | |
390 | + if(tabRDATID[j] > 1){ | |
391 | + NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) | |
392 | + } | |
393 | + j <- j + 1 | |
394 | + } | |
395 | + | |
396 | + ##Scaling the Data | |
397 | + scrawdat <- NuRDATN%>% | |
398 | + scale() | |
399 | + attr(scrawdat,"scaled:center") <- NULL | |
400 | + attr(scrawdat,"scaled:scale") <- NULL | |
401 | + colnames(scrawdat) <- rownames(tabRDATID) | |
402 | + | |
403 | + ##Discretized the Data | |
404 | + dialzdat <- scrawdat %>% | |
405 | + dndat(.) %>% | |
406 | + t()%>% | |
407 | + as.data.frame(.) | |
408 | + colnames(dialzdat) <- rownames(RAWDATNUM) | |
409 | + | |
410 | + ##setting "ID_REF" as a new variable | |
411 | + geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1)) | |
412 | + colnames(geneNAM) <- "ID_REF" | |
413 | + rownames(dialzdat) <- NULL | |
414 | + dialzdat <-bind_cols(geneNAM,dialzdat) | |
415 | + | |
416 | + ##NAs in a column | |
417 | + x <- 2 | |
418 | + nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) | |
419 | + nacol[1,1] = "COL_NAs" | |
420 | + for(x in 2:dim(dialzdat)[2]){ | |
421 | + nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) | |
422 | + x <- x + 1 | |
423 | + } | |
424 | + colnames(nacol) <- colnames(dialzdat) | |
425 | + dialzdat <- bind_rows(dialzdat,nacol) | |
426 | + | |
427 | + ##NAs in a row | |
428 | + y <- 1 | |
429 | + narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) | |
430 | + for(y in 1:dim(dialzdat)[1]){ | |
431 | + narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) | |
432 | + y <- y + 1 | |
433 | + } | |
434 | + colnames(narowd) <- "ROW_NAs" | |
435 | + dialzdat <- bind_cols(dialzdat,narowd) | |
436 | + colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam | |
437 | + colnames(RAWWORD) <- colnames(dialzdat) | |
438 | + ##converting to character so that the clinical can be brought together with discrete data | |
439 | + k <- 2 | |
440 | + for(k in 2:dim(dialzdat)[2]-1){ | |
441 | + dialzdat[,k] <- as.character(dialzdat[,k]) | |
442 | + k <- k + 1 | |
443 | + } | |
444 | + #The End the full data | |
445 | + Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) | |
446 | + | |
447 | + #Produces Discrete file | |
448 | + nfnaex2 <- strsplit(alz,"[\\|/]") %>% | |
449 | + .[[1]] %>% | |
450 | + .[length(.)] %>% | |
451 | + gsub("\\D","",.) %>% | |
452 | + c("GSE",.,"dscrt.txt") %>% | |
453 | + paste(collapse = "") | |
454 | + write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) | |
455 | + n <- n +1 | |
456 | + } | |
457 | + } | |
458 | + | |
459 | + #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN | |
460 | + if(numDAT == 2){ | |
461 | + #All the files you want to analyze | |
462 | + ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:") | |
463 | + if(length(ANDIS) == 0){ | |
464 | + #Spit out a warning | |
465 | + warning("You did not select any files and so no cleaning will be performed") | |
466 | + } else{ | |
467 | + #indexing the data files | |
468 | + n <- 1 | |
469 | + for(n in 1: length(ANDIS)){ | |
470 | + alz <- ANDIS[n] | |
471 | + | |
472 | + #Working with the wordy part of the document | |
473 | + alzword <- alz %>% | |
474 | + read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% | |
475 | + filter(grepl("!Sample",X1))%>% | |
476 | + filter(!grepl("!Sample_contact",X1)) | |
477 | + | |
478 | + #Getting the GPL file | |
479 | + genena <- grep("_platform_id",alzword$X1) %>% | |
480 | + alzword$X2[.] %>% | |
481 | + str_trim(.) %>% | |
482 | + paste0("^",.) %>% | |
483 | + grep(.,list.files()) %>% | |
484 | + list.files()[.] | |
485 | + | |
486 | + #Find out if it is a soft GPL file or not | |
487 | + soft <- strsplit(genena,"[\\|/]") %>% | |
488 | + .[[1]] %>% | |
489 | + .[length(.)] %>% | |
490 | + grepl("soft",.) | |
491 | + | |
492 | + ##Changing row names and column names: | |
493 | + ALZWORD <- t(alzword) | |
494 | + rownames(ALZWORD)=NULL | |
495 | + colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) | |
496 | + ALZWORD <- chngrownm(ALZWORD)[-1,] | |
497 | + ALZWORD <- ALZWORD%>% | |
498 | + as.data.frame()%>% | |
499 | + dplyr::select(-starts_with("col")) | |
500 | + | |
501 | + ##Reorganizing information within the columns and final clinical data | |
502 | + ALZWORDF <- cinfo(ALZWORD) | |
503 | + | |
504 | + | |
505 | + #Working with Actual Data part of file | |
506 | + alzdat <- alz %>% | |
507 | + read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) | |
508 | + ALZDAT <- t(alzdat[,-1]) | |
509 | + rownames(ALZDAT)=NULL | |
510 | + | |
511 | + ##Is there a clean version of the GPL file available? | |
512 | + gplnum <- strsplit(genena,"[\\|/]") %>% | |
513 | + .[[1]] %>% | |
514 | + .[length(.)] %>% | |
515 | + gsub("\\D","",.) | |
516 | + clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) | |
517 | + if(clfileex >= 1){ | |
518 | + #use the clean version | |
519 | + geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% | |
520 | + read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") | |
521 | + | |
522 | + } | |
523 | + if(clfileex == 0){ | |
524 | + ##Lets Create a clean version | |
525 | + | |
526 | + ##Gene ID to Gene Name | |
527 | + if(soft == TRUE){ | |
528 | + #Check to see if there is already a file containing information on soft files | |
529 | + fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) | |
530 | + if(fileex == 1){ | |
531 | + #Check to see if this GPL soft file has been used before | |
532 | + IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% | |
533 | + .$GPL_FILE_NUM%>% | |
534 | + grepl(gplnum,.) %>% | |
535 | + sum() | |
536 | + if(IDF == 1){ | |
537 | + IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% | |
538 | + .$GPL_FILE_NUM%>% | |
539 | + grep(gplnum,.) | |
540 | + idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% | |
541 | + .$LOC_ID %>% | |
542 | + .[IDLOCAL] | |
543 | + geneIDNam <- genena %>% | |
544 | + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% | |
545 | + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | |
546 | + } | |
547 | + if(IDF == 0){ | |
548 | + #No information on this particular GPL file | |
549 | + idLOCGPL <- genena %>% | |
550 | + read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% | |
551 | + t(.) %>% | |
552 | + grep("^ID\\s*$",.) %>% | |
553 | + -1 | |
554 | + cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% | |
555 | + cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) | |
556 | + geneIDNam <- genena %>% | |
557 | + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% | |
558 | + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | |
559 | + } | |
560 | + } | |
561 | + if(fileex == 0){ | |
562 | + #We must create a file that we can access for later use | |
563 | + idLOCGPL <- genena %>% | |
564 | + read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% | |
565 | + t(.) %>% | |
566 | + grep("^ID\\s*$",.) %>% | |
567 | + -1 | |
568 | + Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) | |
569 | + colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") | |
570 | + write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) | |
571 | + geneIDNam <- genena %>% | |
572 | + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% | |
573 | + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | |
574 | + } | |
575 | + } | |
576 | + if(soft == FALSE){ | |
577 | + geneIDNam <- genena %>% | |
578 | + read_delim(delim="\t",comment = "#")%>% | |
579 | + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | |
580 | + } | |
581 | + | |
582 | + ##Labeling the gene IDs without names | |
583 | + geneIDNam <- NAFIXING(geneIDNam) | |
584 | + | |
585 | + ##remove the whitespace | |
586 | + geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) | |
587 | + | |
588 | + ##Here is the clean version | |
589 | + write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) | |
590 | + } | |
591 | + | |
592 | + | |
593 | + | |
594 | + ##Changing the gene ID to gene name | |
595 | + ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) | |
596 | + colnames(ALZDAT) = ALZDAT1[1,] | |
597 | + | |
598 | + | |
599 | + ##Adjusting the column names aka the gene names | |
600 | + colnames(ALZDAT) <- gcnames(ALZDAT) | |
601 | + | |
602 | + | |
603 | + #Full RAW Data | |
604 | + Fullalzdwr <- ALZDAT %>% | |
605 | + as.data.frame() %>% | |
606 | + cbind(ALZWORDF,.) | |
607 | + | |
608 | + #Raw file is output | |
609 | + nfnaex <- strsplit(alz,"[\\]") %>% | |
610 | + .[[1]] %>% | |
611 | + .[length(.)] %>% | |
612 | + gsub("\\D","",.) %>% | |
613 | + c("GSE",.,"aftexcel.txt") %>% | |
614 | + paste(collapse = "") | |
615 | + write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") | |
616 | + | |
617 | + | |
618 | + | |
619 | + #Now for the discretization part | |
620 | + ##get the wordy part again | |
621 | + rawword <- t(ALZWORDF) | |
622 | + | |
623 | + ##where is ID_REF located | |
624 | + hereim <- grep("ID_REF",rownames(rawword)) | |
625 | + | |
626 | + ##Subject Names GSM... | |
627 | + subjnam <- rawword[hereim,] | |
628 | + | |
629 | + ##Getting the names for the rows | |
630 | + namedarows <- rownames(rawword)[-hereim] %>% | |
631 | + as.data.frame() | |
632 | + RAWWORD <- rawword[-hereim,] %>% | |
633 | + as.data.frame() %>% | |
634 | + bind_cols(namedarows,.) | |
635 | + z <- 1 | |
636 | + naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) | |
637 | + for(z in 1:dim(RAWWORD)[1]){ | |
638 | + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
639 | + z <- z + 1 | |
640 | + } | |
641 | + | |
642 | + colnames(naroww) <- "ROW_NAs" | |
643 | + RAWWORD <- bind_cols(RAWWORD,naroww) | |
644 | + | |
645 | + | |
646 | + roALZna <- t(ALZDAT) %>% | |
647 | + rownames(.) %>% | |
648 | + as.data.frame(.) | |
649 | + colnames(roALZna) <- "ID_REF" | |
650 | + | |
651 | + RAWDAT <- t(ALZDAT) %>% | |
652 | + as.data.frame(.) | |
653 | + colnames(RAWDAT) <- NULL | |
654 | + rownames(RAWDAT) <- NULL | |
655 | + | |
656 | + RAWDAT2 <- RAWDAT %>% | |
657 | + cbind(roALZna,.) %>% | |
658 | + dplyr::arrange(.,ID_REF) | |
659 | + | |
660 | + ##Editing the file for R processing | |
661 | + RAWDATID <- RAWDAT2[,1] %>% | |
662 | + as.matrix(.) | |
663 | + | |
664 | + RAWDATNUM <- RAWDAT2[,-1] %>% | |
665 | + mapply(.,FUN = as.numeric) %>% | |
666 | + t(.) | |
667 | + | |
668 | + ##Consolidating genes with the same name | |
669 | + ###create empty matrix of size equal to tabRDATID | |
670 | + tabRDATID <- table(RAWDATID) | |
671 | + NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) | |
672 | + j <- 1 | |
673 | + for(j in 1:length(tabRDATID)){ | |
674 | + ##Putting the ones without duplicates in their new homes | |
675 | + if(tabRDATID[j] == 1){ | |
676 | + NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] | |
677 | + } | |
678 | + ##Averaging duplicates and putting them in their new homes | |
679 | + if(tabRDATID[j] > 1){ | |
680 | + NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) | |
681 | + } | |
682 | + j <- j + 1 | |
683 | + } | |
684 | + | |
685 | + ##Scaling the Data | |
686 | + scrawdat <- NuRDATN%>% | |
687 | + scale() | |
688 | + attr(scrawdat,"scaled:center") <- NULL | |
689 | + attr(scrawdat,"scaled:scale") <- NULL | |
690 | + colnames(scrawdat) <- rownames(tabRDATID) | |
691 | + | |
692 | + ##Discretized the Data | |
693 | + dialzdat <- scrawdat %>% | |
694 | + dndat(.) %>% | |
695 | + t()%>% | |
696 | + as.data.frame(.) | |
697 | + colnames(dialzdat) <- rownames(RAWDATNUM) | |
698 | + | |
699 | + ##setting "ID_REF" as a new variable | |
700 | + geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1)) | |
701 | + colnames(geneNAM) <- "ID_REF" | |
702 | + rownames(dialzdat) <- NULL | |
703 | + dialzdat <-bind_cols(geneNAM,dialzdat) | |
704 | + | |
705 | + ##NAs in a column | |
706 | + x <- 2 | |
707 | + nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) | |
708 | + nacol[1,1] = "COL_NAs" | |
709 | + for(x in 2:dim(dialzdat)[2]){ | |
710 | + nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) | |
711 | + x <- x + 1 | |
712 | + } | |
713 | + colnames(nacol) <- colnames(dialzdat) | |
714 | + dialzdat <- bind_rows(dialzdat,nacol) | |
715 | + | |
716 | + ##NAs in a row | |
717 | + y <- 1 | |
718 | + narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) | |
719 | + for(y in 1:dim(dialzdat)[1]){ | |
720 | + narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) | |
721 | + y <- y + 1 | |
722 | + } | |
723 | + colnames(narowd) <- "ROW_NAs" | |
724 | + dialzdat <- bind_cols(dialzdat,narowd) | |
725 | + colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam | |
726 | + colnames(RAWWORD) <- colnames(dialzdat) | |
727 | + ##converting to character so that the clinical can be brought together with discrete data | |
728 | + k <- 2 | |
729 | + for(k in 2:dim(dialzdat)[2]-1){ | |
730 | + dialzdat[,k] <- as.character(dialzdat[,k]) | |
731 | + k <- k + 1 | |
732 | + } | |
733 | + #The End the full data | |
734 | + Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) | |
735 | + | |
736 | + #Produces Discrete file | |
737 | + nfnaex2 <- strsplit(alz,"[\\|/]") %>% | |
738 | + .[[1]] %>% | |
739 | + .[length(.)] %>% | |
740 | + gsub("\\D","",.) %>% | |
741 | + c("GSE",.,"dscrt.txt") %>% | |
742 | + paste(collapse = "") | |
743 | + write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) | |
744 | + | |
745 | + | |
746 | + n <- n + 1 | |
747 | + } | |
748 | + } | |
749 | + } | |
750 | +} | |
751 | +#The Rest of this code will be used every time you want to change a data set | |
752 | +THEFT() | |
0 | 753 | \ No newline at end of file |