Commit 83db0077e429f9efd63267d3dee589fbaf5c85a8

Authored by Efrain Gonzalez
1 parent e340baf086
Exists in master

Moved to new folder

1 #Libraries required to run the code 1 #Libraries required to run the code
2 library(pryr) 2 library(pryr)
3 library(MASS) 3 library(MASS)
4 library(dplyr) 4 library(dplyr)
5 library(tidyr) 5 library(tidyr)
6 library(readr) 6 library(readr)
7 library(stringr) 7 library(stringr)
8 8
9 9
10 #Necessary Functions 10 #Necessary Functions
11 #1#Function for handling the changing of row names and column names 11 #1#Function for handling the changing of row names and column names
chngrownm <- function(mat){
  # Rename the columns of a transposed "!Sample" table.
  #
  # mat: character table whose first row holds the GEO field names
  #      (e.g. "!Sample_title") and whose second row holds the first sample's
  #      values. Column names must already be set.
  # Returns mat with recognised columns renamed to Brain_Region, Title,
  # ID_REF, or a numbered Sex/PMI/Age/Braak/Group label. Unrecognised columns
  # keep their existing name.
  n_cols <- dim(mat)[2]

  # Running suffixes for each numbered label.
  sex_n <- 1
  pmi_n <- 1
  age_n <- 1
  braak_n <- 1
  group_n <- 1

  # seq_len() makes a zero-column table a no-op instead of an index error.
  for (j in seq_len(n_cols)) {
    field <- mat[1, j]

    # isTRUE() keeps an NA header cell from aborting the if().
    if (isTRUE(field == "!Sample_source_name_ch1")) {
      colnames(mat)[j] <- "Brain_Region"
    }
    if (isTRUE(field == "!Sample_title")) {
      colnames(mat)[j] <- "Title"
    }
    if (isTRUE(field == "!Sample_geo_accession")) {
      colnames(mat)[j] <- "ID_REF"
    } else {
      # NOTE(review): this else is tied to the accession test only, so Title
      # and Brain_Region columns are still matched against the patterns
      # below and can be renamed. Confirm whether that is intended.
      sample_value <- mat[2, j]

      # The tests are independent: the last matching pattern decides the
      # name, and every matching pattern advances its own counter.
      if (grepl("Sex|gender|Gender|sex", sample_value)) {
        colnames(mat)[j] <- paste0("Sex", sex_n)
        sex_n <- sex_n + 1
      }
      if (grepl("postmorteminterval|PMI|pmi", sample_value)) {
        colnames(mat)[j] <- paste0("PMI", pmi_n)
        pmi_n <- pmi_n + 1
      }
      if (grepl("age|Age|AGE", sample_value)) {
        colnames(mat)[j] <- paste0("Age", age_n)
        age_n <- age_n + 1
      }
      if (grepl("braak|b&b", sample_value)) {
        colnames(mat)[j] <- paste0("Braak", braak_n)
        braak_n <- braak_n + 1
      }
      if (grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control", sample_value)) {
        colnames(mat)[j] <- paste0("Group", group_n)
        group_n <- group_n + 1
      }
    }
  }
  mat
}
57 57
58 #2#Function for reorganizing information within the columns 58 #2#Function for reorganizing information within the columns
cinfo <- function(mat){
  # Clean the values of the labelled sample columns.
  #
  # mat: table whose column names contain Group/Age/Sex/PMI/Braak labels.
  #      The first column is never modified.
  # Returns mat with:
  #   Group - text up to and including ": " removed (also a trailing
  #           " ...;..." part)
  #   Age   - first number in the text, as integer (fraction truncated)
  #   Sex   - text up to and including ": " removed
  #   PMI   - digits and dots kept, as numeric
  #   Braak - text after ": " read as a roman numeral, as integer

  # First run of digits (with optional decimal part) in each element, or NA.
  # Stripping every non-digit instead would turn "85.5" into 855.
  first_number <- function(x) {
    x <- as.character(x)
    pattern <- "[0-9]+(\\.[0-9]+)?"
    out <- rep(NA_character_, length(x))
    found <- !is.na(x) & grepl(pattern, x)
    out[found] <- regmatches(x[found], regexpr(pattern, x[found]))
    out
  }

  n_cols <- dim(mat)[2]
  # seq_len(n)[-1] is empty for tables with fewer than two columns, whereas
  # 2:n would count backwards and touch column 1.
  for (j in seq_len(n_cols)[-1]) {
    label <- colnames(mat)[j]
    if (grepl("Group", label)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", label)) {
      mat[, j] <- as.integer(as.numeric(first_number(mat[, j])))
    }
    if (grepl("Sex", label)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", label)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", label)) {
      mat[, j] <- as.integer(utils::as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
86 86
87 #3#Function for labeling the gene IDs without names 87 #3#Function for labeling the gene IDs without names
NAFIXING <- function(GIDNAM){
  # Give every unnamed gene its own ID as a name.
  #
  # GIDNAM: table (data frame, tibble or matrix) with the ID in column 1 and
  #         the gene name in column 2.
  # Returns GIDNAM where each column-2 entry that is missing, or is the text
  # "NA" (optionally followed by whitespace), is replaced by the column-1
  # value of the same row. In a data frame, column 2 comes back as character.

  # An empty table has nothing to fix; 1:0 would otherwise visit row 1.
  if (dim(GIDNAM)[1] == 0) {
    return(GIDNAM)
  }

  is_table <- is.data.frame(GIDNAM)
  ids <- as.character(if (is_table) GIDNAM[[1]] else GIDNAM[, 1])
  labels <- as.character(if (is_table) GIDNAM[[2]] else GIDNAM[, 2])

  # Vectorised replacement of the original row-by-row loop.
  unnamed <- is.na(labels) | grepl("^NA\\s*$", labels)
  labels[unnamed] <- ids[unnamed]

  if (is_table) {
    GIDNAM[[2]] <- labels
  } else {
    GIDNAM[, 2] <- labels
  }
  GIDNAM
}
99 99
100 #4#Function for changing the gene ID to gene name 100 #4#Function for changing the gene ID to gene name
cgeneID <- function(GeneName,DATA){
  # Replace the probe IDs in the first row of DATA by gene names.
  #
  # GeneName: two-row table; row 1 holds IDs, row 2 the matching gene names.
  # DATA:     table whose first row holds the IDs to translate.
  # Returns DATA with each first-row entry that equals an ID in GeneName
  # replaced by that ID's gene name. Other entries and rows are unchanged.
  #
  # IDs are compared for exact equality. Treating each ID as a regular
  # expression let a short ID such as "1" rewrite "10" and "1007_s_at", and
  # needed one pattern scan per ID.
  if (dim(GeneName)[2] == 0 || dim(DATA)[2] == 0) {
    return(DATA)
  }

  # Surrounding whitespace is ignored when looking an ID up.
  probe_ids <- trimws(as.character(DATA[1, ]))
  hit <- match(probe_ids, as.character(GeneName[1, ]))
  found <- !is.na(hit)

  DATA[1, found] <- GeneName[2, hit[found]]
  DATA
}
111 111
112 #5#Function for adjusting the gene names 112 #5#Function for adjusting the gene names
gcnames <- function(DiData,usecol=1){
  # Pick one name out of "///"-separated column names.
  #
  # DiData: table whose column names may list several names joined by "///".
  # usecol: which name to keep (1 = first). Column names with fewer than
  #         usecol parts fall back to their first part.
  # Returns a character vector with one name per column, with surrounding
  # whitespace removed ("A /// B" gives "A", not "A ").
  n_cols <- dim(DiData)[2]
  if (n_cols == 0) {
    return(character(0))
  }

  col_labels <- colnames(DiData)
  if (is.null(col_labels)) {
    stop("gcnames() needs a table with column names", call. = FALSE)
  }

  pieces <- strsplit(col_labels, "///", fixed = TRUE)
  picked <- vapply(
    pieces,
    function(parts) {
      if (length(parts) >= usecol) parts[usecol] else parts[1]
    },
    character(1),
    USE.NAMES = FALSE
  )
  trimws(picked)
}
128 128
129 129
130 130
131 #The Rest of this code will be used every time you want to change a data set 131 #The Rest of this code will be used every time you want to change a data set
132 132
#Getting the series matrix file
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

#Getting the GPL file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()


#Set working directory based on the directory of the series matrix file Currently only works for windows
##strsplit(alz,"[\\]") %>%
##  .[[1]] %>%
##  .[-length(.)] %>%
##  paste(.,collapse="/") %>%
##  setwd()

#Find out if it is a soft GPL file or not
# The file name is the last path component; "\", "|" and "/" all count as
# separators.
gpl_path_parts <- strsplit(genena, "[\\|/]")[[1]]
soft <- grepl("soft", gpl_path_parts[length(gpl_path_parts)])

#Working with the wordy part of the document
# Keep the "!Sample" rows except the "!Sample_contact" ones; "!Series" lines
# are treated as comments by the reader.
alzword <- read_delim(alz, delim = "\t", comment = "!Series", col_names = FALSE)
alzword <- dplyr::filter(alzword, grepl("!Sample", X1), !grepl("!Sample_contact", X1))

##Changing row names and column names:
# After transposing, row 1 holds the field names and each later row is one
# sample.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
# drop = FALSE keeps a table even when only one sample row remains.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
# Columns still carrying a default "col<n>" name were not recognised.
ALZWORD <- dplyr::select(as.data.frame(ALZWORD), -starts_with("col"))

##Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)
172 172
173 173
#Working with Actual Data part of file
# Lines starting with "!" are skipped as comments. After transposing, rows
# are samples and columns are probes.
alzdat <- read_delim(alz, delim = "\t", col_names = TRUE, comment = "!", skip = 1)
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL


##Gene ID to Gene Name
if (soft) {
  # Digits of the GPL file name identify the platform.
  gpl_name_parts <- strsplit(genena, "[\\|/]")[[1]]
  gplnum <- gsub("\\D", "", gpl_name_parts[length(gpl_name_parts)])

  # Number of lines to skip before the annotation table of a soft file:
  # count the cells of the first 1000 non-"!" rows that start with a
  # non-digit, minus one.
  gpl_table_offset <- function(path) {
    head_rows <- read_delim(path, delim = "\t", col_names = FALSE, comment = "!", n_max = 1000)
    length(grep("^\\D", t(head_rows))) - 1
  }

  # GPL_ID_LOC.txt remembers the offset found for each GPL number.
  loc_file <- "GPL_ID_LOC.txt"
  loc_known <- file.exists(loc_file)
  idlocgpl <- NA
  if (loc_known) {
    known <- read_delim(loc_file, delim = "\t", col_names = TRUE)
    # Compare whole numbers: a substring test would let 570 match 5700 and
    # leave geneIDNam undefined when several rows matched.
    hit <- which(known$GPL_FILE_NUM == as.numeric(gplnum))
    if (length(hit) > 0) {
      idlocgpl <- known$LOC_ID[hit[1]]
    }
  }
  if (is.na(idlocgpl)) {
    #No information on this particular GPL file
    idlocgpl <- gpl_table_offset(genena)
    new_entry <- cbind(as.integer(gplnum), as.integer(idlocgpl))
    if (loc_known) {
      cat(new_entry, file = loc_file, sep = "\t", fill = TRUE, append = TRUE)
    } else {
      #We must create a file that we can access for later use
      colnames(new_entry) <- c("GPL_FILE_NUM", "LOC_ID")
      write.table(new_entry, file = loc_file, sep = "\t", row.names = FALSE, col.names = TRUE)
    }
  }
  geneIDNam <- read_delim(genena, delim = "\t", col_names = TRUE, comment = "!", skip = idlocgpl)
} else {
  geneIDNam <- read_delim(genena, delim = "\t", comment = "#")
}
# Keep the ID column and every column whose name contains Symbol or ORF.
geneIDNam <- dplyr::select(geneIDNam, ID, grep("Symbol|ORF", colnames(geneIDNam)))

##Labeling the gene IDs without names
geneIDNam <- NAFIXING(geneIDNam)

##remove the whitespace
# Result is a two-column character matrix: ID, gene name.
geneIDNam <- cbind(trimws(as.character(geneIDNam[[1]])), trimws(as.character(geneIDNam[[2]])))
250 250
##Changing the gene ID to gene name
# Row 1 of the transposed data holds the probe IDs; after translation it
# supplies the column names of the sample-by-probe matrix.
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]


##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)


#Full Data
# Sample annotation columns first, then one column per gene.
Fullalzdw <- cbind(ALZWORDF, as.data.frame(ALZDAT))


#nfna <- strsplit(alz,"[\\]") %>%
#  .[[1]] %>%
#  .[length(.)] %>%
#  gsub("\\D","",.) %>%
#  c("GSE",.,"after.txt") %>%
#  paste(collapse = "")
#write.matrix(Fullalzdw,file = nfna,sep = "\t")
#Perfect for excel viewing
# Name the output after the digits of the series-matrix FILE NAME. Splitting
# on "\" alone only isolated the file name for Windows paths; elsewhere every
# digit of the directory path ended up in the name.
series_path_parts <- strsplit(alz, "[\\|/]")[[1]]
series_digits <- gsub("\\D", "", series_path_parts[length(series_path_parts)])
nfnaex <- paste0("GSE", series_digits, "aftexcel.txt")
# Row names are written: after t() they are the variable and gene names, and
# row.names = FALSE would drop every label from the file.
write.table(t(Fullalzdw), file = nfnaex, sep = "\t")
281 281
282 282
283 283
1 #Libraries required to run the code 1 #Libraries required to run the code
2 library(pryr) 2 library(pryr)
3 library(MASS) 3 library(MASS)
4 library(dplyr) 4 library(dplyr)
5 library(tidyr) 5 library(tidyr)
6 library(readr) 6 library(readr)
7 library(stringr) 7 library(stringr)
8 8
9 9
10 #Necessary Functions 10 #Necessary Functions
11 #1#Function for handling the changing of row names and column names 11 #1#Function for handling the changing of row names and column names
chngrownm <- function(mat){
  # Rename the columns of a transposed "!Sample" table.
  #
  # mat: character table whose first row holds the GEO field names
  #      (e.g. "!Sample_title") and whose second row holds the first sample's
  #      values. Column names must already be set.
  # Returns mat with recognised columns renamed to Brain_Region, Title,
  # ID_REF, or a numbered Sex/PMI/Age/Braak/Group label. Unrecognised columns
  # keep their existing name.
  n_cols <- dim(mat)[2]

  # Running suffixes for each numbered label.
  sex_n <- 1
  pmi_n <- 1
  age_n <- 1
  braak_n <- 1
  group_n <- 1

  # seq_len() makes a zero-column table a no-op instead of an index error.
  for (j in seq_len(n_cols)) {
    field <- mat[1, j]

    # isTRUE() keeps an NA header cell from aborting the if().
    if (isTRUE(field == "!Sample_source_name_ch1")) {
      colnames(mat)[j] <- "Brain_Region"
    }
    if (isTRUE(field == "!Sample_title")) {
      colnames(mat)[j] <- "Title"
    }
    if (isTRUE(field == "!Sample_geo_accession")) {
      colnames(mat)[j] <- "ID_REF"
    } else {
      # NOTE(review): this else is tied to the accession test only, so Title
      # and Brain_Region columns are still matched against the patterns
      # below and can be renamed. Confirm whether that is intended.
      sample_value <- mat[2, j]

      # The tests are independent: the last matching pattern decides the
      # name, and every matching pattern advances its own counter.
      if (grepl("Sex|gender|Gender|sex", sample_value)) {
        colnames(mat)[j] <- paste0("Sex", sex_n)
        sex_n <- sex_n + 1
      }
      if (grepl("postmorteminterval|PMI|pmi", sample_value)) {
        colnames(mat)[j] <- paste0("PMI", pmi_n)
        pmi_n <- pmi_n + 1
      }
      if (grepl("age|Age|AGE", sample_value)) {
        colnames(mat)[j] <- paste0("Age", age_n)
        age_n <- age_n + 1
      }
      if (grepl("braak|b&b", sample_value)) {
        colnames(mat)[j] <- paste0("Braak", braak_n)
        braak_n <- braak_n + 1
      }
      if (grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control", sample_value)) {
        colnames(mat)[j] <- paste0("Group", group_n)
        group_n <- group_n + 1
      }
    }
  }
  mat
}
57 57
58 #2#Function for reorganizing information within the columns 58 #2#Function for reorganizing information within the columns
cinfo <- function(mat){
  # Clean the values of the labelled sample columns.
  #
  # mat: table whose column names contain Group/Age/Sex/PMI/Braak labels.
  #      The first column is never modified.
  # Returns mat with:
  #   Group - text up to and including ": " removed (also a trailing
  #           " ...;..." part)
  #   Age   - first number in the text, as integer (fraction truncated)
  #   Sex   - text up to and including ": " removed
  #   PMI   - digits and dots kept, as numeric
  #   Braak - text after ": " read as a roman numeral, as integer

  # First run of digits (with optional decimal part) in each element, or NA.
  # Stripping every non-digit instead would turn "85.5" into 855.
  first_number <- function(x) {
    x <- as.character(x)
    pattern <- "[0-9]+(\\.[0-9]+)?"
    out <- rep(NA_character_, length(x))
    found <- !is.na(x) & grepl(pattern, x)
    out[found] <- regmatches(x[found], regexpr(pattern, x[found]))
    out
  }

  n_cols <- dim(mat)[2]
  # seq_len(n)[-1] is empty for tables with fewer than two columns, whereas
  # 2:n would count backwards and touch column 1.
  for (j in seq_len(n_cols)[-1]) {
    label <- colnames(mat)[j]
    if (grepl("Group", label)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", label)) {
      mat[, j] <- as.integer(as.numeric(first_number(mat[, j])))
    }
    if (grepl("Sex", label)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", label)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", label)) {
      mat[, j] <- as.integer(utils::as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
86 86
87 #3#Function for labeling the gene IDs without names 87 #3#Function for labeling the gene IDs without names
NAFIXING <- function(GIDNAM){
  # Give every unnamed gene its own ID as a name.
  #
  # GIDNAM: table (data frame, tibble or matrix) with the ID in column 1 and
  #         the gene name in column 2.
  # Returns GIDNAM where each column-2 entry that is missing, or is the text
  # "NA" (optionally followed by whitespace), is replaced by the column-1
  # value of the same row. In a data frame, column 2 comes back as character.

  # An empty table has nothing to fix; 1:0 would otherwise visit row 1.
  if (dim(GIDNAM)[1] == 0) {
    return(GIDNAM)
  }

  is_table <- is.data.frame(GIDNAM)
  ids <- as.character(if (is_table) GIDNAM[[1]] else GIDNAM[, 1])
  labels <- as.character(if (is_table) GIDNAM[[2]] else GIDNAM[, 2])

  # Vectorised replacement of the original row-by-row loop.
  unnamed <- is.na(labels) | grepl("^NA\\s*$", labels)
  labels[unnamed] <- ids[unnamed]

  if (is_table) {
    GIDNAM[[2]] <- labels
  } else {
    GIDNAM[, 2] <- labels
  }
  GIDNAM
}
99 99
100 #4#Function for changing the gene ID to gene name 100 #4#Function for changing the gene ID to gene name
cgeneID <- function(GeneName,DATA){
  # Replace the probe IDs in the first row of DATA by gene names.
  #
  # GeneName: two-row table; row 1 holds IDs, row 2 the matching gene names.
  # DATA:     table whose first row holds the IDs to translate.
  # Returns DATA with each first-row entry that equals an ID in GeneName
  # replaced by that ID's gene name. Other entries and rows are unchanged.
  #
  # IDs are compared for exact equality. Using each ID as an unanchored
  # regular expression selected every entry that merely contained it, so a
  # short ID such as "1" rewrote the start of "10" and "1007_s_at", and it
  # needed one pattern scan per ID.
  if (dim(GeneName)[2] == 0 || dim(DATA)[2] == 0) {
    return(DATA)
  }

  # Surrounding whitespace is ignored when looking an ID up.
  probe_ids <- trimws(as.character(DATA[1, ]))
  hit <- match(probe_ids, as.character(GeneName[1, ]))
  found <- !is.na(hit)

  DATA[1, found] <- GeneName[2, hit[found]]
  DATA
}
112 112
113 #5#Function for adjusting the gene names 113 #5#Function for adjusting the gene names
gcnames <- function(DiData,usecol=1){
  # Pick one name out of "///"-separated column names.
  #
  # DiData: table whose column names may list several names joined by "///".
  # usecol: which name to keep (1 = first). Column names with fewer than
  #         usecol parts fall back to their first part.
  # Returns a character vector with one name per column, with surrounding
  # whitespace removed ("A /// B" gives "A", not "A ").
  n_cols <- dim(DiData)[2]
  if (n_cols == 0) {
    return(character(0))
  }

  col_labels <- colnames(DiData)
  if (is.null(col_labels)) {
    stop("gcnames() needs a table with column names", call. = FALSE)
  }

  pieces <- strsplit(col_labels, "///", fixed = TRUE)
  picked <- vapply(
    pieces,
    function(parts) {
      if (length(parts) >= usecol) parts[usecol] else parts[1]
    },
    character(1),
    USE.NAMES = FALSE
  )
  trimws(picked)
}
129 129
130 130
131 131
132 #The Rest of this code will be used every time you want to change a data set 132 #The Rest of this code will be used every time you want to change a data set
133 133
#Getting the series matrix file
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

#Getting the GPL file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()


#Set working directory based on the directory of the series matrix file Currently only works for windows
##strsplit(alz,"[\\]") %>%
##  .[[1]] %>%
##  .[-length(.)] %>%
##  paste(.,collapse="/") %>%
##  setwd()

#Find out if it is a soft GPL file or not
# The file name is the last path component; "\", "|" and "/" all count as
# separators.
gpl_path_parts <- strsplit(genena, "[\\|/]")[[1]]
soft <- grepl("soft", gpl_path_parts[length(gpl_path_parts)])

#Working with the wordy part of the document
# Keep the "!Sample" rows except the "!Sample_contact" ones; "!Series" lines
# are treated as comments by the reader.
alzword <- read_delim(alz, delim = "\t", comment = "!Series", col_names = FALSE)
alzword <- dplyr::filter(alzword, grepl("!Sample", X1), !grepl("!Sample_contact", X1))

##Changing row names and column names:
# After transposing, row 1 holds the field names and each later row is one
# sample.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
# drop = FALSE keeps a table even when only one sample row remains.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
# Columns still carrying a default "col<n>" name were not recognised.
ALZWORD <- dplyr::select(as.data.frame(ALZWORD), -starts_with("col"))

##Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)
173 173
174 174
#Working with Actual Data part of file
# Lines starting with "!" are skipped as comments. After transposing, rows
# are samples and columns are probes.
alzdat <- read_delim(alz, delim = "\t", col_names = TRUE, comment = "!", skip = 1)
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL

##Is there a clean version of the GPL file available?
# Digits of the GPL file name identify the platform.
gpl_name_parts <- strsplit(genena, "[\\|/]")[[1]]
gplnum <- gsub("\\D", "", gpl_name_parts[length(gpl_name_parts)])
clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")

# Test for the exact file that will be read. A substring test on the
# directory listing also accepted e.g. Clean_GPL5700.txt for GPL570 and then
# failed on the read.
if (file.exists(clean_gpl_file)) {
  #use the clean version
  geneIDNam <- read_delim(clean_gpl_file, delim = "\t", col_names = c("ID", "Symbol"), comment = "!")
} else {
  ##Lets Create a clean version

  ##Gene ID to Gene Name
  if (soft) {
    # Number of lines to skip before the annotation table of a soft file:
    # count the cells of the first 1000 non-"!" rows that start with a
    # non-digit, minus one.
    gpl_table_offset <- function(path) {
      head_rows <- read_delim(path, delim = "\t", col_names = FALSE, comment = "!", n_max = 1000)
      length(grep("^\\D", t(head_rows))) - 1
    }

    # GPL_ID_LOC.txt remembers the offset found for each GPL number.
    loc_file <- "GPL_ID_LOC.txt"
    loc_known <- file.exists(loc_file)
    idlocgpl <- NA
    if (loc_known) {
      known <- read_delim(loc_file, delim = "\t", col_names = TRUE)
      # Compare whole numbers: a substring test would let 570 match 5700 and
      # leave geneIDNam undefined when several rows matched.
      hit <- which(known$GPL_FILE_NUM == as.numeric(gplnum))
      if (length(hit) > 0) {
        idlocgpl <- known$LOC_ID[hit[1]]
      }
    }
    if (is.na(idlocgpl)) {
      #No information on this particular GPL file
      idlocgpl <- gpl_table_offset(genena)
      new_entry <- cbind(as.integer(gplnum), as.integer(idlocgpl))
      if (loc_known) {
        cat(new_entry, file = loc_file, sep = "\t", fill = TRUE, append = TRUE)
      } else {
        #We must create a file that we can access for later use
        colnames(new_entry) <- c("GPL_FILE_NUM", "LOC_ID")
        write.table(new_entry, file = loc_file, sep = "\t", row.names = FALSE, col.names = TRUE)
      }
    }
    geneIDNam <- read_delim(genena, delim = "\t", col_names = TRUE, comment = "!", skip = idlocgpl)
  } else {
    geneIDNam <- read_delim(genena, delim = "\t", comment = "#")
  }
  # Keep the ID column and every column whose name contains Symbol or ORF.
  geneIDNam <- dplyr::select(geneIDNam, ID, grep("Symbol|ORF", colnames(geneIDNam)))

  ##Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ##remove the whitespace
  # Result is a two-column character matrix: ID, gene name.
  geneIDNam <- cbind(trimws(as.character(geneIDNam[[1]])), trimws(as.character(geneIDNam[[2]])))

  ##Here is the clean version
  write.table(geneIDNam, file = clean_gpl_file, sep = "\t", row.names = FALSE, col.names = FALSE)
}
269 269
270 270
271 271
##Changing the gene ID to gene name
# Row 1 of the transposed data holds the probe IDs; after translation it
# supplies the column names of the sample-by-probe matrix.
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]


##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)


#Full Data
# Sample annotation columns first, then one column per gene.
Fullalzdw <- cbind(ALZWORDF, as.data.frame(ALZDAT))


#nfna <- strsplit(alz,"[\\|/]") %>%
#  .[[1]] %>%
#  .[length(.)] %>%
#  gsub("\\D","",.) %>%
#  c("GSE",.,"after.txt") %>%
#  paste(collapse = "")
#write.matrix(Fullalzdw,file = nfna,sep = "\t")

#Perfect for excel viewing
# Name the output after the digits of the series-matrix FILE NAME. Splitting
# on "\" alone only isolated the file name for Windows paths; elsewhere every
# digit of the directory path ended up in the name.
series_path_parts <- strsplit(alz, "[\\|/]")[[1]]
series_digits <- gsub("\\D", "", series_path_parts[length(series_path_parts)])
nfnaex <- paste0("GSE", series_digits, "aftexcel.txt")
# After t() the row names are the variable and gene names.
write.table(t(Fullalzdw), file = nfnaex, sep = "\t")
303 303