Commit 0eb3420561162679cbaa6e877e4bb7621c5d694b
1 parent
2167ed7633
Exists in
master
This code combines the cleaning and discretizing processes. (UNTESTED)
Two files are output one with raw data and the other with discretized data.
Showing
1 changed file
with
445 additions
and
0 deletions
Show diff stats
RCleanDscret.R
... | ... | @@ -0,0 +1,445 @@ |
1 | +#Libraries required to run the code | |
2 | +library(pryr) | |
3 | +library(MASS) | |
4 | +library(dplyr) | |
5 | +library(tidyr) | |
6 | +library(readr) | |
7 | +library(stringr) | |
8 | + | |
9 | + | |
10 | +#Necessary Functions | |
11 | +#1#Function for handling the changing of row names and column names | |
12 | +chngrownm <- function(mat){ | |
13 | + row <- dim(mat)[1] | |
14 | + col <- dim(mat)[2] | |
15 | + j <- 1 | |
16 | + x <- 1 | |
17 | + p <- 1 | |
18 | + a <- 1 | |
19 | + b <- 1 | |
20 | + g <- 1 | |
21 | + for(j in 1:col){ | |
22 | + if("!Sample_source_name_ch1"==mat[1,j]){ | |
23 | + colnames(mat)[j] <- "Brain_Region" | |
24 | + } | |
25 | + if("!Sample_title" == mat[1,j]){ | |
26 | + colnames(mat)[j] <- "Title" | |
27 | + } | |
28 | + if("!Sample_geo_accession" == mat[1,j]){ | |
29 | + colnames(mat)[j] <- "ID_REF" | |
30 | + } else{ | |
31 | + if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){ | |
32 | + colnames(mat)[j] <- paste0("Sex",x) | |
33 | + x = x + 1 | |
34 | + } | |
35 | + if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){ | |
36 | + colnames(mat)[j] <- paste0("PMI",p) | |
37 | + p = p + 1 | |
38 | + } | |
39 | + if(grepl("age|Age|AGE",mat[2,j])==TRUE){ | |
40 | + colnames(mat)[j] <- paste0("Age",a) | |
41 | + a = a + 1 | |
42 | + } | |
43 | + if(grepl("braak|b&b",mat[2,j])==TRUE){ | |
44 | + colnames(mat)[j] <- paste0("Braak",b) | |
45 | + b = b + 1 | |
46 | + } | |
47 | + if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",mat[2,j])==TRUE){ | |
48 | + colnames(mat)[j] <- paste0("Group",g) | |
49 | + g = g + 1 | |
50 | + } | |
51 | + | |
52 | + } | |
53 | + j = j + 1 | |
54 | + } | |
55 | + mat | |
56 | +} | |
57 | + | |
58 | +#2#Function for reorganizing information within the columns | |
59 | +cinfo <- function(mat){ | |
60 | + col <- dim(mat)[2] | |
61 | + j <-2 | |
62 | + for(j in 2:col){ | |
63 | + if(grepl("Group",colnames(mat)[j]) == TRUE){ | |
64 | + mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j]) | |
65 | + } | |
66 | + if(grepl("Age",colnames(mat)[j])==TRUE){ | |
67 | + mat[,j] <- gsub("\\D","",mat[,j])%>% | |
68 | + as.integer() | |
69 | + } | |
70 | + if(grepl("Sex",colnames(mat)[j])==TRUE){ | |
71 | + mat[,j] <- gsub(".+:\\s","",mat[,j]) | |
72 | + } | |
73 | + if(grepl("PMI",colnames(mat)[j])==TRUE){ | |
74 | + mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>% | |
75 | + as.numeric() | |
76 | + } | |
77 | + if(grepl("Braak",colnames(mat)[j])==TRUE){ | |
78 | + mat[,j]<-gsub(".+:\\s","",mat[,j])%>% | |
79 | + as.roman()%>% | |
80 | + as.integer() | |
81 | + } | |
82 | + j=j+1 | |
83 | + } | |
84 | + mat | |
85 | +} | |
86 | + | |
87 | +#3#Function for labeling the gene IDs without names | |
88 | +NAFIXING <- function(GIDNAM){ | |
89 | + row <- dim(GIDNAM)[1] | |
90 | + i <- 1 | |
91 | + for(i in 1:row){ | |
92 | + if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){ | |
93 | + GIDNAM[i,2] <- GIDNAM[i,1] | |
94 | + } | |
95 | + i <- i + 1 | |
96 | + } | |
97 | + GIDNAM | |
98 | +} | |
99 | + | |
100 | +#4#Function for changing the gene ID to gene name | |
101 | +cgeneID <- function(GeneName,DATA){ | |
102 | + colGene <- dim(GeneName)[2] | |
103 | + j <- 1 | |
104 | + for(j in 1:colGene){ | |
105 | + chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) | |
106 | + if(is.na(sum(chngsreq))==FALSE){ | |
107 | + if(sum(chngsreq) > 0){ | |
108 | + DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
109 | + } | |
110 | + } | |
111 | + #if(sum(chngsreq) > 0){ | |
112 | + ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq]) | |
113 | + #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) | |
114 | + #} | |
115 | + j = j+1 | |
116 | + } | |
117 | + DATA | |
118 | +} | |
119 | + | |
120 | +#5#Function for adjusting the gene names | |
121 | +gcnames <- function(DiData,usecol=1){ | |
122 | + nuruns <- dim(DiData)[2] | |
123 | + i = 1 | |
124 | + nwnam <- rep("0",length.out=nuruns) | |
125 | + for(i in 1:nuruns){ | |
126 | + if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){ | |
127 | + nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][usecol]) | |
128 | + } else{ | |
129 | + nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][1]) | |
130 | + } | |
131 | + | |
132 | + } | |
133 | + nwnam | |
134 | + | |
135 | +} | |
136 | + | |
137 | +#6# Function for discretizing the data | |
138 | +dndat <- function(NDATA){ | |
139 | + rownd <- dim(NDATA)[1] | |
140 | + colnd <- dim(NDATA)[2] | |
141 | + DDATA <- matrix(0,nrow=rownd,ncol=colnd) | |
142 | + colnames(DDATA) <- colnames(NDATA) | |
143 | + i <- 1 | |
144 | + for(i in 1:rownd){ | |
145 | + j <- 1 | |
146 | + for(j in 1:colnd){ | |
147 | + if(is.na(NDATA[i,j])==FALSE){ | |
148 | + | |
149 | + if(NDATA[i,j] < -1){ | |
150 | + DDATA[i,j]=0L | |
151 | + } | |
152 | + if(NDATA[i,j] > 1){ | |
153 | + DDATA[i,j]=2L | |
154 | + } | |
155 | + if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){ | |
156 | + DDATA[i,j]=1L | |
157 | + } | |
158 | + } else{ | |
159 | + DDATA[i,j] = NDATA[i,j] | |
160 | + } | |
161 | + j = j + 1 | |
162 | + } | |
163 | + i = i + 1 | |
164 | + } | |
165 | + DDATA | |
166 | +} | |
167 | + | |
168 | + | |
169 | +#The Rest of this code will be used every time you want to change a data set | |
170 | + | |
171 | +#Getting the series matrix file | |
172 | +print("Choose the series matrix file that you want to Analyze") | |
173 | +alz <- file.choose() | |
174 | + | |
175 | +#Getting the GPL file | |
176 | +print("Choose the GPL file that correlates with the above series matrix file") | |
177 | +genena <- file.choose() | |
178 | + | |
179 | + | |
180 | +#Find out if it is a soft GPL file or not | |
181 | +soft <- strsplit(genena,"[\\|/]") %>% | |
182 | + .[[1]] %>% | |
183 | + .[length(.)] %>% | |
184 | + grepl("soft|annot",.) | |
185 | + | |
186 | +#Working with the wordy part of the document | |
187 | +alzword <- alz %>% | |
188 | + read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% | |
189 | + filter(grepl("!Sample",X1))%>% | |
190 | + filter(!grepl("!Sample_contact",X1)) | |
191 | + | |
192 | +##Changing row names and column names: | |
193 | +ALZWORD <- t(alzword) | |
194 | +rownames(ALZWORD)=NULL | |
195 | +colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) | |
196 | +ALZWORD <- chngrownm(ALZWORD)[-1,] | |
197 | +ALZWORD <- ALZWORD%>% | |
198 | + as.data.frame()%>% | |
199 | + dplyr::select(-starts_with("col")) | |
200 | + | |
201 | +##Reorganizing information within the columns | |
202 | +ALZWORDF <- cinfo(ALZWORD) | |
203 | + | |
204 | + | |
205 | +#Working with Actual Data part of file | |
206 | +alzdat <- alz %>% | |
207 | + read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) | |
208 | +ALZDAT <- t(alzdat[,-1]) | |
209 | +rownames(ALZDAT)=NULL | |
210 | + | |
211 | +##Is there a clean version of the GPL file available? | |
212 | +gplnum <- strsplit(genena,"[\\|/]") %>% | |
213 | + .[[1]] %>% | |
214 | + .[length(.)] %>% | |
215 | + gsub("\\D","",.) | |
216 | +clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) | |
217 | +if(clfileex >= 1){ | |
218 | +#use the clean version | |
219 | +geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% | |
220 | + read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") | |
221 | + | |
222 | +} | |
223 | +if(clfileex == 0){ | |
224 | +##Lets Create a clean version | |
225 | + | |
226 | +##Gene ID to Gene Name | |
227 | + if(soft == TRUE){ | |
228 | + #Check to see if there is already a file containing information on soft files | |
229 | + fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) | |
230 | + if(fileex == 1){ | |
231 | + #Check to see if this GPL soft file has been used before | |
232 | + IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% | |
233 | + .$GPL_FILE_NUM%>% | |
234 | + grepl(gplnum,.) %>% | |
235 | + sum() | |
236 | + if(IDF == 1){ | |
237 | + IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% | |
238 | + .$GPL_FILE_NUM%>% | |
239 | + grep(gplnum,.) | |
240 | + idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% | |
241 | + .$LOC_ID %>% | |
242 | + .[IDLOCAL] | |
243 | + geneIDNam <- genena %>% | |
244 | + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% | |
245 | + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | |
246 | + } | |
247 | + if(IDF == 0){ | |
248 | + #No information on this particular GPL file | |
249 | + idLOCGPL <- genena %>% | |
250 | + read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% | |
251 | + t(.) %>% | |
252 | + grep("^ID\\s*$",.) %>% | |
253 | + -1 | |
254 | + cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% | |
255 | + cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) | |
256 | + geneIDNam <- genena %>% | |
257 | + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% | |
258 | + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | |
259 | + } | |
260 | + } | |
261 | + if(fileex == 0){ | |
262 | + #We must create a file that we can access for later use | |
263 | + idLOCGPL <- genena %>% | |
264 | + read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% | |
265 | + t(.) %>% | |
266 | + grep("^ID\\s*$",.) %>% | |
267 | + -1 | |
268 | + Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) | |
269 | + colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") | |
270 | + write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) | |
271 | + geneIDNam <- genena %>% | |
272 | + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% | |
273 | + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | |
274 | + } | |
275 | + } | |
276 | + if(soft == FALSE){ | |
277 | + geneIDNam <- genena %>% | |
278 | + read_delim(delim="\t",comment = "#")%>% | |
279 | + dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) | |
280 | + } | |
281 | + | |
282 | + ##Labeling the gene IDs without names | |
283 | + geneIDNam <- NAFIXING(geneIDNam) | |
284 | + | |
285 | + ##remove the whitespace | |
286 | + geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) | |
287 | + | |
288 | + ##Here is the clean version | |
289 | + write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) | |
290 | +} | |
291 | + | |
292 | + | |
293 | + | |
294 | +##Changing the gene ID to gene name | |
295 | +ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) | |
296 | +colnames(ALZDAT) = ALZDAT1[1,] | |
297 | + | |
298 | + | |
299 | +##Adjusting the column names aka the gene names | |
300 | +colnames(ALZDAT) <- gcnames(ALZDAT) | |
301 | + | |
302 | + | |
303 | +#Full RAW Data | |
304 | +Fullalzdwr <- ALZDAT %>% | |
305 | + as.data.frame() %>% | |
306 | + cbind(ALZWORDF,.) | |
307 | + | |
308 | + | |
309 | +#Raw file is output | |
310 | +nfnaex <- strsplit(alz,"[\\]") %>% | |
311 | + .[[1]] %>% | |
312 | + .[length(.)] %>% | |
313 | + gsub("\\D","",.) %>% | |
314 | + c("GSE",.,"aftexcel.txt") %>% | |
315 | + paste(collapse = "") | |
316 | +write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") | |
317 | + | |
318 | + | |
319 | +#Now for the discretization part | |
320 | +##get the wordy part again | |
321 | +rawword <- t(ALZWORDF) | |
322 | + | |
323 | +##where is ID_REF located | |
324 | +hereim <- grep("ID_REF",rawword[,1]) | |
325 | + | |
326 | +##Subject Names GSM... | |
327 | +subjnam <- rawword[hereim,] | |
328 | + | |
329 | +##Getting the names for the rows | |
330 | +namedarows <- rownames(rawword)[-hereim] %>% | |
331 | + as.data.frame() | |
332 | +RAWWORD <- rawword[-hereim,] %>% | |
333 | + as.data.frame() %>% | |
334 | + bind_cols(namedarows,.) | |
335 | +z <- 1 | |
336 | +naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) | |
337 | +for(z in 1:dim(RAWWORD)[1]){ | |
338 | + naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) | |
339 | + z <- z + 1 | |
340 | +} | |
341 | + | |
342 | +colnames(naroww) <- "ROW_NAs" | |
343 | +RAWWORD <- bind_cols(RAWWORD,naroww) | |
344 | + | |
345 | + | |
346 | +roALZna <- t(ALZDAT) %>% | |
347 | + rownames(.) %>% | |
348 | + as.data.frame(.) | |
349 | +colnames(roALZna) <- "ID_REF" | |
350 | + | |
351 | +RAWDAT <- t(ALZDAT) %>% | |
352 | + as.data.frame(.) | |
353 | +colnames(RAWDAT) <- NULL | |
354 | +rownames(RAWDAT) <- NULL | |
355 | + | |
356 | +RAWDAT2 <- RAWDAT %>% | |
357 | + cbind(roALZna,.) %>% | |
358 | + dplyr::arrange(.,ID_REF) | |
359 | + | |
360 | +##Editing the file for R processing | |
361 | +RAWDATID <- RAWDAT2[,1] %>% | |
362 | + as.matrix(.) | |
363 | + | |
364 | +RAWDATNUM <- RAWDAT2[,-1] %>% | |
365 | + mapply(.,FUN = as.numeric) %>% | |
366 | + t(.) | |
367 | + | |
368 | +##Consolidating genes with the same name | |
369 | +###create empty matrix of size equal to tabRDATID | |
370 | +tabRDATID <- table(RAWDATID) | |
371 | +NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) | |
372 | +j <- 1 | |
373 | +for(j in 1:length(tabRDATID)){ | |
374 | + | |
375 | + ##Putting the ones without duplicates in their new homes | |
376 | + if(tabRDATID[j] == 1){ | |
377 | + NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] | |
378 | + } | |
379 | + ##Averaging duplicates and putting them in their new homes | |
380 | + if(tabRDATID[j] > 1){ | |
381 | + NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) | |
382 | + } | |
383 | + j <- j + 1 | |
384 | +} | |
385 | + | |
386 | +##Scaling the Data | |
387 | +scrawdat <- NuRDATN%>% | |
388 | + scale() | |
389 | +attr(scrawdat,"scaled:center") <- NULL | |
390 | +attr(scrawdat,"scaled:scale") <- NULL | |
391 | +colnames(scrawdat) <- rownames(tabRDATID) | |
392 | + | |
393 | +##Discretized the Data | |
394 | +dialzdat <- scrawdat %>% | |
395 | + dndat(.) %>% | |
396 | + t()%>% | |
397 | + as.data.frame(.) | |
398 | +colnames(dialzdat) <- rownames(RAWDATNUM) | |
399 | + | |
400 | +##setting "ID_REF" as a new variable | |
401 | +geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1)) | |
402 | +colnames(geneNAM) <- "ID_REF" | |
403 | +rownames(dialzdat) <- NULL | |
404 | +dialzdat <-bind_cols(geneNAM,dialzdat) | |
405 | + | |
406 | +##NAs in a column | |
407 | +x <- 2 | |
408 | +nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) | |
409 | +nacol[1,1] = "COL_NAs" | |
410 | +for(x in 2:dim(dialzdat)[2]){ | |
411 | + nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) | |
412 | + x <- x + 1 | |
413 | +} | |
414 | +colnames(nacol) <- colnames(dialzdat) | |
415 | +dialzdat<-bind_rows(dialzdat,nacol) | |
416 | + | |
417 | +##NAs in a row | |
418 | +y <- 1 | |
419 | +narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) | |
420 | +for(y in 1:dim(dialzdat)[1]){ | |
421 | + narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) | |
422 | + y <- y + 1 | |
423 | +} | |
424 | +colnames(narowd) <- "ROW_NAs" | |
425 | +dialzdat <- bind_cols(dialzdat,narowd) | |
426 | +colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam | |
427 | +colnames(RAWWORD) <- colnames(dialzdat) | |
428 | +##converting to character so that the clinical can be brought together with discrete data | |
429 | +k <- 2 | |
430 | +for(k in 2:dim(dialzdat)[2]-1){ | |
431 | + dialzdat[,k] <- as.character(dialzdat[,k]) | |
432 | + k <- k + 1 | |
433 | +} | |
434 | +#The End the full data | |
435 | +Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) | |
436 | + | |
437 | +#Produces Discrete file | |
438 | +nfnaex <- strsplit(rawdat,"[\\|/]") %>% | |
439 | + .[[1]] %>% | |
440 | + .[length(.)] %>% | |
441 | + gsub("\\D","",.) %>% | |
442 | + c("GSE",.,"dscrt.txt") %>% | |
443 | + paste(collapse = "") | |
444 | +write.table(Dscrtalzdw, file = nfnaex, sep = "\t",col.names = TRUE,row.names = FALSE) | |
445 | + |