Commit 22a75a38eb8c5a8df6acce96b9c4487874143168

Authored by Efrain Gonzalez
1 parent f378e57f40
Exists in master

Most Recent (UNTESTED)

Showing 1 changed file with 324 additions and 0 deletions   Show diff stats
File was created 1 #Libraries required to run the code
2 library(pryr)
3 library(MASS)
4 library(dplyr)
5 library(tidyr)
6 library(readr)
7 library(stringr)
8
9
10 #Necessary Functions
11 #1#Function for handling the changing of row names and column names
chngrownm <- function(mat) {
  # Rename the columns of `mat` from the field labels held in its first row
  # and the example values held in its second row.
  #
  # Args:
  #   mat: matrix-like object with at least two rows. Row 1 holds field labels
  #        (e.g. "!Sample_title"); row 2 holds the first value of each field.
  #
  # Returns:
  #   `mat` with updated column names. Columns that match nothing keep their
  #   existing name.
  #
  # Naming rules:
  #   * A column whose row-1 label is one of the three fixed labels below gets
  #     the corresponding fixed name and is NOT passed through the value
  #     heuristics, so a title such as "AD case 1" stays "Title" instead of
  #     being relabelled as a group column.
  #   * Any other column is labelled from the patterns matched by its row-2
  #     value. When several patterns match, the last one in `value_patterns`
  #     wins, and only that label's counter advances, so the numbering of each
  #     label stays contiguous.

  # Guarantee there is a names vector to assign into.
  if (is.null(colnames(mat))) {
    colnames(mat) <- colnames(mat, do.NULL = FALSE)
  }

  header_labels <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )

  # Order matters: later entries take precedence over earlier ones.
  value_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control"
  )
  next_index <- setNames(rep(1L, length(value_patterns)), names(value_patterns))

  for (j in seq_len(ncol(mat))) {
    header <- as.character(mat[1, j])
    if (!is.na(header) && header %in% names(header_labels)) {
      colnames(mat)[j] <- header_labels[[header]]
      next
    }

    matched <- vapply(value_patterns, grepl, logical(1), x = mat[2, j])
    if (any(matched)) {
      label <- names(value_patterns)[max(which(matched))]
      colnames(mat)[j] <- paste0(label, next_index[[label]])
      next_index[[label]] <- next_index[[label]] + 1L
    }
  }
  mat
}
57
58 #2#Function for reorganizing information within the columns
cinfo <- function(mat) {
  # Clean the values of the annotation columns, chosen by column name.
  #
  # Args:
  #   mat: data frame (or matrix) whose column names contain "Group", "Age",
  #        "Sex", "PMI" or "Braak" for the columns to clean. The first column
  #        is never modified.
  #
  # Returns:
  #   `mat` with the matching columns rewritten:
  #     Group -> text after the "label: " prefix (and before any "; ..." tail)
  #     Age   -> integer, taken from the first number after the label
  #     Sex   -> text after the "label: " prefix
  #     PMI   -> numeric, built from the digits and dots in the value
  #     Braak -> integer stage; Arabic or Roman numerals are both accepted

  # Drop everything up to and including the last ": " in each value.
  strip_label <- function(x) gsub(".+:\\s", "", x)

  # First (optionally decimal) number in each string, NA when there is none.
  # Stripping all non-digits instead would turn "85.5" into 855.
  first_number <- function(x) {
    x <- as.character(x)
    hit <- regexpr("[0-9]+(\\.[0-9]+)?", x)
    out <- rep(NA_real_, length(x))
    found <- !is.na(hit) & hit > 0
    out[found] <- as.numeric(regmatches(x, hit))
    out
  }

  # Parse Arabic digits directly and hand only the rest to as.roman().
  # as.roman() documents a supported range starting at 1, so a stage written
  # as "0" would otherwise risk becoming NA.
  braak_stage <- function(x) {
    stage <- strip_label(x)
    out <- suppressWarnings(as.integer(stage))
    roman <- is.na(out) & !is.na(stage)
    if (any(roman)) {
      out[roman] <- as.integer(as.roman(stage[roman]))
    }
    out
  }

  # seq_len(...)[-1] is empty for a one-column input, unlike 2:ncol(mat).
  for (j in seq_len(ncol(mat))[-1]) {
    column_name <- colnames(mat)[j]
    if (grepl("Group", column_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", column_name)) {
      mat[, j] <- as.integer(first_number(strip_label(mat[, j])))
    }
    if (grepl("Sex", column_name)) {
      mat[, j] <- strip_label(mat[, j])
    }
    if (grepl("PMI", column_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", column_name)) {
      mat[, j] <- braak_stage(mat[, j])
    }
  }
  mat
}
86
87 #3#Function for labeling the gene IDs without names
NAFIXING <- function(GIDNAM) {
  # Give every gene ID without a name its own ID as the name.
  #
  # Args:
  #   GIDNAM: two-column table (data frame, tibble or matrix); column 1 holds
  #           the gene IDs, column 2 the gene names.
  #
  # Returns:
  #   `GIDNAM` where each entry of column 2 that is missing, or is the text
  #   "NA" optionally followed by whitespace, has been replaced by the ID in
  #   the same row. Column 2 is returned as character.
  #
  # The work is done on whole columns, so a zero-row table is returned
  # unchanged and large tables are not indexed one cell at a time.
  is_df <- is.data.frame(GIDNAM)
  ids <- as.character(if (is_df) GIDNAM[[1]] else GIDNAM[, 1])
  symbols <- as.character(if (is_df) GIDNAM[[2]] else GIDNAM[, 2])

  unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)
  symbols[unnamed] <- ids[unnamed]

  if (is_df) {
    GIDNAM[[2]] <- symbols
  } else {
    GIDNAM[, 2] <- symbols
  }
  GIDNAM
}
99
100 ##4#Function for changing the gene ID to gene name
101 ##cgeneID <- function(GeneName,DATA){
102 ## colGene <- dim(GeneName)[2]
103 ## j <- 1
104 ## for(j in 1:colGene){
105 ## chngsreq <- grep(GeneName[1,j],DATA[1,])
106 ## if(sum(chngsreq) > 0){
107 ## #DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
108 ## DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
109 ## }
110 ## j = j+1
111 ## }
112 ## DATA
113 ##}
114 #4#Function for changing the gene ID to gene name
cgeneID <- function(GeneName, DATA) {
  # Replace the gene IDs in the first row of `DATA` by their gene names.
  #
  # Args:
  #   GeneName: two-row table; row 1 holds gene IDs, row 2 the matching names.
  #   DATA:     table whose first row holds gene IDs (one per column).
  #
  # Returns:
  #   `DATA` with every first-row entry that equals an ID in `GeneName`
  #   replaced by that ID's name. Entries with no matching ID are unchanged.
  #
  # IDs are compared as literal strings with match(), not as regular
  # expressions: "a.b" no longer matches "aXb", IDs containing regex
  # metacharacters are safe, and a name that happens to equal another ID is
  # not rewritten a second time. When an ID occurs more than once in
  # `GeneName`, its first occurrence is used.
  ids <- as.character(unlist(GeneName[1, ], use.names = FALSE))
  symbols <- as.character(unlist(GeneName[2, ], use.names = FALSE))
  current <- as.character(unlist(DATA[1, ], use.names = FALSE))

  hit <- match(current, ids)
  found <- !is.na(hit) & !is.na(current)
  if (any(found)) {
    DATA[1, found] <- symbols[hit[found]]
  }
  DATA
}
133
134 #5#Function for adjusting the gene names
gcnames <- function(DiData, usecol = 1) {
  # Pick one gene name per column from "///"-separated column names.
  #
  # Args:
  #   DiData: table whose column names may list several gene names separated
  #           by "///".
  #   usecol: position of the name to keep. When a column lists fewer than
  #           `usecol` names, its first name is used instead.
  #
  # Returns:
  #   Character vector with one name per column of `DiData`, with whitespace
  #   around the "///" separator removed. A table with no columns yields
  #   character(0).
  name_parts <- strsplit(as.character(colnames(DiData)), "///", fixed = TRUE)
  chosen <- vapply(
    name_parts,
    function(parts) {
      if (length(parts) >= usecol) parts[usecol] else parts[1]
    },
    character(1)
  )
  trimws(chosen)
}
150
151
152
153 #The Rest of this code will be used every time you want to change a data set
154
155 #Getting the series matrix file
# Prompt for the series matrix file through the interactive file chooser.
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

# Prompt for the GPL (platform annotation) file that goes with it.
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()

# Disabled: set the working directory to the folder of the series matrix
# file. The split below only recognises backslash separators.
##strsplit(alz,"[\\]") %>%
##  .[[1]] %>%
##  .[-length(.)] %>%
##  paste(.,collapse="/") %>%
##  setwd()

# TRUE when the last path component of the GPL file contains "soft".
soft <- grepl("soft", tail(strsplit(genena, "[\\|/]")[[1]], 1))
176
177 #Working with the wordy part of the document
# Read the series matrix as headerless tab-delimited text, treating anything
# from "!Series" onwards as a comment, and keep the rows whose first field
# contains "!Sample" but not "!Sample_contact".
alzword <- read_delim(alz, delim = "\t", comment = "!Series", col_names = FALSE)
alzword <- filter(alzword, grepl("!Sample", X1))
alzword <- filter(alzword, !grepl("!Sample_contact", X1))

##Changing row names and column names:
# After transposing, row 1 holds the field labels and each later row is one
# sample. Placeholder names ("col1", "col2", ...) are assigned first so that
# chngrownm() has names to overwrite.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
# drop = FALSE keeps a matrix even when only one sample row remains after the
# label row is removed; without it the result collapses to a plain vector.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
# Columns still carrying a placeholder name were not recognised; drop them.
ALZWORD <- ALZWORD %>%
  as.data.frame() %>%
  dplyr::select(-starts_with("col"))

##Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)
194
195
196 #Working with Actual Data part of file
# Read the tab-delimited table from the series matrix file: the first line is
# skipped, "!" starts a comment, and the first remaining line supplies the
# column names.
alzdat <- read_delim(alz, delim = "\t", col_names = TRUE, comment = "!", skip = 1)

# Transpose everything except the first column and drop the row names.
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL
201
202 ##Is there a clean version of the GPL file available?
# Digits of the GPL file name; used as the platform number in cache file names.
gplnum <- gsub("\\D", "", tail(strsplit(genena, "[\\|/]")[[1]], 1))
clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")
gpl_loc_file <- "GPL_ID_LOC.txt"

# Test for the exact cache file. A substring search of list.files() would
# also accept e.g. "Clean_GPL9600.txt" when looking for GPL96 and then fail
# to read "Clean_GPL96.txt".
clfileex <- file.exists(clean_gpl_file)

if (clfileex) {
  #use the clean version
  geneIDNam <- read_delim(clean_gpl_file, delim = "\t",
                          col_names = c("ID", "Symbol"), comment = "!")
} else {
  ##Lets Create a clean version

  ##Gene ID to Gene Name
  if (soft) {
    # Number of lines to skip before the annotation table of a soft file.
    # It is remembered per platform number in GPL_ID_LOC.txt.
    idlocgpl <- NULL
    loc_file_exists <- file.exists(gpl_loc_file)

    if (loc_file_exists) {
      #Check to see if this GPL soft file has been used before
      gpl_loc <- read_delim(gpl_loc_file, delim = "\t", col_names = TRUE)
      # Compare platform numbers exactly; a substring search would let "96"
      # match "1396". If a platform is listed more than once, use the first.
      known <- which(as.numeric(gpl_loc$GPL_FILE_NUM) == as.numeric(gplnum))
      if (length(known) > 0) {
        idlocgpl <- gpl_loc$LOC_ID[known[1]]
      }
    }

    if (is.null(idlocgpl)) {
      #No information on this particular GPL file
      # Count the cells among the first 1000 non-comment rows that start with
      # a non-digit character, minus one.
      # NOTE(review): presumably this locates the table header - confirm.
      gpl_head <- read_delim(genena, delim = "\t", col_names = FALSE,
                             comment = "!", n_max = 1000)
      idlocgpl <- length(grep("^\\D", t(gpl_head))) - 1

      new_entry <- cbind(as.integer(gplnum), as.integer(idlocgpl))
      if (loc_file_exists) {
        cat(new_entry, file = gpl_loc_file, sep = "\t", fill = TRUE, append = TRUE)
      } else {
        #We must create a file that we can access for later use
        colnames(new_entry) <- c("GPL_FILE_NUM", "LOC_ID")
        write.table(new_entry, file = gpl_loc_file, sep = "\t",
                    row.names = FALSE, col.names = TRUE)
      }
    }

    geneIDNam <- genena %>%
      read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = idlocgpl) %>%
      dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))
  } else {
    geneIDNam <- genena %>%
      read_delim(delim = "\t", comment = "#") %>%
      dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))
  }

  ##Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ##remove the whitespace
  # Keeps the first two columns only, as a two-column character matrix.
  geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1, ]), str_trim(t(geneIDNam)[2, ])))

  ##Here is the clean version
  write.table(geneIDNam, file = clean_gpl_file, sep = "\t",
              row.names = FALSE, col.names = FALSE)
}
290
291
292
293 ##Changing the gene ID to gene name
# Translate the gene IDs in the first row of the transposed data table into
# gene names, and use that row as the column names of ALZDAT.
ALZDAT1 <- cgeneID(GeneName = t(geneIDNam), DATA = t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]

##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(DiData = ALZDAT)

#Full Data
# Sample annotation columns first, followed by the per-gene columns.
Fullalzdw <- cbind(ALZWORDF, as.data.frame(ALZDAT))
306
307
308 #nfna <- strsplit(alz,"[\\|/]") %>%
309 # .[[1]] %>%
310 # .[length(.)] %>%
311 # gsub("\\D","",.) %>%
312 # c("GSE",.,"after.txt") %>%
313 # paste(collapse = "")
314 #write.matrix(Fullalzdw,file = nfna,sep = "\t")
315
316 #Perfect for excel viewing
# Build "GSE<digits>aftexcel.txt" from the digits of the series matrix file
# name. The path is split on both "\" and "/", as is done for the GPL file,
# so that digits in directory names are not picked up on "/"-separated paths.
nfnaex <- paste0(
  "GSE",
  gsub("\\D", "", tail(strsplit(alz, "[\\|/]")[[1]], 1)),
  "aftexcel.txt"
)
# Written transposed, tab-separated.
write.table(t(Fullalzdw), file = nfnaex, sep = "\t")
324
325