Commit 0eb3420561162679cbaa6e877e4bb7621c5d694b

Authored by Efrain Gonzalez
1 parent 2167ed7633
Exists in master

This code combines the cleaning and discretizing processes. (UNTESTED)

Two files are output one with raw data and the other with discretized data.
Showing 1 changed file with 445 additions and 0 deletions   Show diff stats
File was created 1 #Libraries required to run the code
2 library(pryr)
3 library(MASS)
4 library(dplyr)
5 library(tidyr)
6 library(readr)
7 library(stringr)
8
9
10 #Necessary Functions
11 #1#Function for handling the changing of row names and column names
12 chngrownm <- function(mat){
13 row <- dim(mat)[1]
14 col <- dim(mat)[2]
15 j <- 1
16 x <- 1
17 p <- 1
18 a <- 1
19 b <- 1
20 g <- 1
21 for(j in 1:col){
22 if("!Sample_source_name_ch1"==mat[1,j]){
23 colnames(mat)[j] <- "Brain_Region"
24 }
25 if("!Sample_title" == mat[1,j]){
26 colnames(mat)[j] <- "Title"
27 }
28 if("!Sample_geo_accession" == mat[1,j]){
29 colnames(mat)[j] <- "ID_REF"
30 } else{
31 if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){
32 colnames(mat)[j] <- paste0("Sex",x)
33 x = x + 1
34 }
35 if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){
36 colnames(mat)[j] <- paste0("PMI",p)
37 p = p + 1
38 }
39 if(grepl("age|Age|AGE",mat[2,j])==TRUE){
40 colnames(mat)[j] <- paste0("Age",a)
41 a = a + 1
42 }
43 if(grepl("braak|b&b",mat[2,j])==TRUE){
44 colnames(mat)[j] <- paste0("Braak",b)
45 b = b + 1
46 }
47 if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",mat[2,j])==TRUE){
48 colnames(mat)[j] <- paste0("Group",g)
49 g = g + 1
50 }
51
52 }
53 j = j + 1
54 }
55 mat
56 }
57
58 #2#Function for reorganizing information within the columns
59 cinfo <- function(mat){
60 col <- dim(mat)[2]
61 j <-2
62 for(j in 2:col){
63 if(grepl("Group",colnames(mat)[j]) == TRUE){
64 mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
65 }
66 if(grepl("Age",colnames(mat)[j])==TRUE){
67 mat[,j] <- gsub("\\D","",mat[,j])%>%
68 as.integer()
69 }
70 if(grepl("Sex",colnames(mat)[j])==TRUE){
71 mat[,j] <- gsub(".+:\\s","",mat[,j])
72 }
73 if(grepl("PMI",colnames(mat)[j])==TRUE){
74 mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
75 as.numeric()
76 }
77 if(grepl("Braak",colnames(mat)[j])==TRUE){
78 mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
79 as.roman()%>%
80 as.integer()
81 }
82 j=j+1
83 }
84 mat
85 }
86
87 #3#Function for labeling the gene IDs without names
88 NAFIXING <- function(GIDNAM){
89 row <- dim(GIDNAM)[1]
90 i <- 1
91 for(i in 1:row){
92 if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){
93 GIDNAM[i,2] <- GIDNAM[i,1]
94 }
95 i <- i + 1
96 }
97 GIDNAM
98 }
99
100 #4#Function for changing the gene ID to gene name
101 cgeneID <- function(GeneName,DATA){
102 colGene <- dim(GeneName)[2]
103 j <- 1
104 for(j in 1:colGene){
105 chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
106 if(is.na(sum(chngsreq))==FALSE){
107 if(sum(chngsreq) > 0){
108 DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
109 }
110 }
111 #if(sum(chngsreq) > 0){
112 ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
113 #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
114 #}
115 j = j+1
116 }
117 DATA
118 }
119
120 #5#Function for adjusting the gene names
121 gcnames <- function(DiData,usecol=1){
122 nuruns <- dim(DiData)[2]
123 i = 1
124 nwnam <- rep("0",length.out=nuruns)
125 for(i in 1:nuruns){
126 if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){
127 nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][usecol])
128 } else{
129 nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][1])
130 }
131
132 }
133 nwnam
134
135 }
136
137 #6# Function for discretizing the data
138 dndat <- function(NDATA){
139 rownd <- dim(NDATA)[1]
140 colnd <- dim(NDATA)[2]
141 DDATA <- matrix(0,nrow=rownd,ncol=colnd)
142 colnames(DDATA) <- colnames(NDATA)
143 i <- 1
144 for(i in 1:rownd){
145 j <- 1
146 for(j in 1:colnd){
147 if(is.na(NDATA[i,j])==FALSE){
148
149 if(NDATA[i,j] < -1){
150 DDATA[i,j]=0L
151 }
152 if(NDATA[i,j] > 1){
153 DDATA[i,j]=2L
154 }
155 if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
156 DDATA[i,j]=1L
157 }
158 } else{
159 DDATA[i,j] = NDATA[i,j]
160 }
161 j = j + 1
162 }
163 i = i + 1
164 }
165 DDATA
166 }
167
168
169 #The Rest of this code will be used every time you want to change a data set
170
171 #Getting the series matrix file
172 print("Choose the series matrix file that you want to Analyze")
173 alz <- file.choose()
174
175 #Getting the GPL file
176 print("Choose the GPL file that correlates with the above series matrix file")
177 genena <- file.choose()
178
179
180 #Find out if it is a soft GPL file or not
181 soft <- strsplit(genena,"[\\|/]") %>%
182 .[[1]] %>%
183 .[length(.)] %>%
184 grepl("soft|annot",.)
185
186 #Working with the wordy part of the document
187 alzword <- alz %>%
188 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
189 filter(grepl("!Sample",X1))%>%
190 filter(!grepl("!Sample_contact",X1))
191
192 ##Changing row names and column names:
193 ALZWORD <- t(alzword)
194 rownames(ALZWORD)=NULL
195 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
196 ALZWORD <- chngrownm(ALZWORD)[-1,]
197 ALZWORD <- ALZWORD%>%
198 as.data.frame()%>%
199 dplyr::select(-starts_with("col"))
200
201 ##Reorganizing information within the columns
202 ALZWORDF <- cinfo(ALZWORD)
203
204
205 #Working with Actual Data part of file
206 alzdat <- alz %>%
207 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
208 ALZDAT <- t(alzdat[,-1])
209 rownames(ALZDAT)=NULL
210
211 ##Is there a clean version of the GPL file available?
212 gplnum <- strsplit(genena,"[\\|/]") %>%
213 .[[1]] %>%
214 .[length(.)] %>%
215 gsub("\\D","",.)
216 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
217 if(clfileex >= 1){
218 #use the clean version
219 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
220 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
221
222 }
223 if(clfileex == 0){
224 ##Lets Create a clean version
225
226 ##Gene ID to Gene Name
227 if(soft == TRUE){
228 #Check to see if there is already a file containing information on soft files
229 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
230 if(fileex == 1){
231 #Check to see if this GPL soft file has been used before
232 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
233 .$GPL_FILE_NUM%>%
234 grepl(gplnum,.) %>%
235 sum()
236 if(IDF == 1){
237 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
238 .$GPL_FILE_NUM%>%
239 grep(gplnum,.)
240 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
241 .$LOC_ID %>%
242 .[IDLOCAL]
243 geneIDNam <- genena %>%
244 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
245 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
246 }
247 if(IDF == 0){
248 #No information on this particular GPL file
249 idLOCGPL <- genena %>%
250 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
251 t(.) %>%
252 grep("^ID\\s*$",.) %>%
253 -1
254 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
255 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
256 geneIDNam <- genena %>%
257 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
258 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
259 }
260 }
261 if(fileex == 0){
262 #We must create a file that we can access for later use
263 idLOCGPL <- genena %>%
264 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
265 t(.) %>%
266 grep("^ID\\s*$",.) %>%
267 -1
268 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
269 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
270 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
271 geneIDNam <- genena %>%
272 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
273 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
274 }
275 }
276 if(soft == FALSE){
277 geneIDNam <- genena %>%
278 read_delim(delim="\t",comment = "#")%>%
279 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
280 }
281
282 ##Labeling the gene IDs without names
283 geneIDNam <- NAFIXING(geneIDNam)
284
285 ##remove the whitespace
286 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
287
288 ##Here is the clean version
289 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
290 }
291
292
293
294 ##Changing the gene ID to gene name
295 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
296 colnames(ALZDAT) = ALZDAT1[1,]
297
298
299 ##Adjusting the column names aka the gene names
300 colnames(ALZDAT) <- gcnames(ALZDAT)
301
302
303 #Full RAW Data
304 Fullalzdwr <- ALZDAT %>%
305 as.data.frame() %>%
306 cbind(ALZWORDF,.)
307
308
309 #Raw file is output
310 nfnaex <- strsplit(alz,"[\\]") %>%
311 .[[1]] %>%
312 .[length(.)] %>%
313 gsub("\\D","",.) %>%
314 c("GSE",.,"aftexcel.txt") %>%
315 paste(collapse = "")
316 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
317
318
319 #Now for the discretization part
320 ##get the wordy part again
321 rawword <- t(ALZWORDF)
322
323 ##where is ID_REF located
324 hereim <- grep("ID_REF",rawword[,1])
325
326 ##Subject Names GSM...
327 subjnam <- rawword[hereim,]
328
329 ##Getting the names for the rows
330 namedarows <- rownames(rawword)[-hereim] %>%
331 as.data.frame()
332 RAWWORD <- rawword[-hereim,] %>%
333 as.data.frame() %>%
334 bind_cols(namedarows,.)
335 z <- 1
336 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
337 for(z in 1:dim(RAWWORD)[1]){
338 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
339 z <- z + 1
340 }
341
342 colnames(naroww) <- "ROW_NAs"
343 RAWWORD <- bind_cols(RAWWORD,naroww)
344
345
346 roALZna <- t(ALZDAT) %>%
347 rownames(.) %>%
348 as.data.frame(.)
349 colnames(roALZna) <- "ID_REF"
350
351 RAWDAT <- t(ALZDAT) %>%
352 as.data.frame(.)
353 colnames(RAWDAT) <- NULL
354 rownames(RAWDAT) <- NULL
355
356 RAWDAT2 <- RAWDAT %>%
357 cbind(roALZna,.) %>%
358 dplyr::arrange(.,ID_REF)
359
360 ##Editing the file for R processing
361 RAWDATID <- RAWDAT2[,1] %>%
362 as.matrix(.)
363
364 RAWDATNUM <- RAWDAT2[,-1] %>%
365 mapply(.,FUN = as.numeric) %>%
366 t(.)
367
368 ##Consolidating genes with the same name
369 ###create empty matrix of size equal to tabRDATID
370 tabRDATID <- table(RAWDATID)
371 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
372 j <- 1
373 for(j in 1:length(tabRDATID)){
374
375 ##Putting the ones without duplicates in their new homes
376 if(tabRDATID[j] == 1){
377 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
378 }
379 ##Averaging duplicates and putting them in their new homes
380 if(tabRDATID[j] > 1){
381 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
382 }
383 j <- j + 1
384 }
385
386 ##Scaling the Data
387 scrawdat <- NuRDATN%>%
388 scale()
389 attr(scrawdat,"scaled:center") <- NULL
390 attr(scrawdat,"scaled:scale") <- NULL
391 colnames(scrawdat) <- rownames(tabRDATID)
392
393 ##Discretized the Data
394 dialzdat <- scrawdat %>%
395 dndat(.) %>%
396 t()%>%
397 as.data.frame(.)
398 colnames(dialzdat) <- rownames(RAWDATNUM)
399
400 ##setting "ID_REF" as a new variable
401 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
402 colnames(geneNAM) <- "ID_REF"
403 rownames(dialzdat) <- NULL
404 dialzdat <-bind_cols(geneNAM,dialzdat)
405
406 ##NAs in a column
407 x <- 2
408 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
409 nacol[1,1] = "COL_NAs"
410 for(x in 2:dim(dialzdat)[2]){
411 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
412 x <- x + 1
413 }
414 colnames(nacol) <- colnames(dialzdat)
415 dialzdat<-bind_rows(dialzdat,nacol)
416
417 ##NAs in a row
418 y <- 1
419 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
420 for(y in 1:dim(dialzdat)[1]){
421 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
422 y <- y + 1
423 }
424 colnames(narowd) <- "ROW_NAs"
425 dialzdat <- bind_cols(dialzdat,narowd)
426 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
427 colnames(RAWWORD) <- colnames(dialzdat)
428 ##converting to character so that the clinical can be brought together with discrete data
429 k <- 2
430 for(k in 2:dim(dialzdat)[2]-1){
431 dialzdat[,k] <- as.character(dialzdat[,k])
432 k <- k + 1
433 }
434 #The End the full data
435 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
436
437 #Produces Discrete file
438 nfnaex <- strsplit(rawdat,"[\\|/]") %>%
439 .[[1]] %>%
440 .[length(.)] %>%
441 gsub("\\D","",.) %>%
442 c("GSE",.,"dscrt.txt") %>%
443 paste(collapse = "")
444 write.table(Dscrtalzdw, file = nfnaex, sep = "\t",col.names = TRUE,row.names = FALSE)
445
446