Commit 501231d3ebcc148491cc9f407215c4a8d1369b35

Authored by Efrain Gonzalez
1 parent 48208ef1ad
Exists in master

Added |^UCSC_RefGene_Name$ to list of potential names in GPL

Showing 1 changed file with 4 additions and 4 deletions   Show diff stats
1 ##Posted 6/15/2017 1 ##Posted 6/15/2017
2 options(digits = 11) 2 options(digits = 11)
3 3
4 #Libraries required to run the code 4 #Libraries required to run the code
5 library(pryr) 5 library(pryr)
6 library(MASS) 6 library(MASS)
7 library(dplyr) 7 library(dplyr)
8 library(tidyr) 8 library(tidyr)
9 library(readr) 9 library(readr)
10 library(stringr) 10 library(stringr)
11 11
12 12
13 #Necessary Functions 13 #Necessary Functions
14 #1#Function for handling the changing of row names and column names 14 #1#Function for handling the changing of row names and column names
# Rename the columns of a transposed "!Sample_*" character matrix.
#
# mat: matrix (or data frame) whose row 1 holds the GEO field names and whose
#      row 2 holds the first sample's values; it must already have column
#      names, because they are replaced one position at a time.
# Returns `mat` with the same cells and updated column names:
#   * row 1 equal to "!Sample_source_name_ch1" -> "Brain_Region"
#   * row 1 equal to "!Sample_title"           -> "Title"
#   * row 1 equal to "!Sample_geo_accession"   -> "ID_REF"
#   * otherwise the row-2 value is tested against each keyword pattern below
#     and the column is named <Label><n>, n counting matches of that label.
# Columns that match nothing keep their existing name.
chngrownm <- function(mat){
  # Tested in this order. Every pattern that matches bumps its own counter,
  # and the last matching label is the one that ends up as the column name.
  keyword_patterns <- c(
    Sex   = "Sex|gender|Gender|sex",
    PMI   = "postmorteminterval|PMI|pmi",
    Age   = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  next_index <- c(Sex = 1, PMI = 1, Age = 1, Braak = 1, Group = 1)

  # seq_len() yields an empty loop for a zero-column input (1:0 would not).
  for (j in seq_len(dim(mat)[2])) {
    field <- mat[1, j]
    # isTRUE() keeps an NA header cell from aborting the scalar `if`.
    if (isTRUE("!Sample_source_name_ch1" == field)) {
      colnames(mat)[j] <- "Brain_Region"
    } else if (isTRUE("!Sample_title" == field)) {
      colnames(mat)[j] <- "Title"
    } else if (isTRUE("!Sample_geo_accession" == field)) {
      colnames(mat)[j] <- "ID_REF"
    } else {
      for (label in names(keyword_patterns)) {
        if (grepl(keyword_patterns[[label]], mat[2, j])) {
          colnames(mat)[j] <- paste0(label, next_index[[label]])
          next_index[[label]] <- next_index[[label]] + 1
        }
      }
    }
  }
  mat
}
58 58
59 #2#Function for reorganizing information within the columns 59 #2#Function for reorganizing information within the columns
# Clean the values of the clinical columns, chosen by column name.
#
# mat: data frame (or matrix) whose column names were assigned by chngrownm().
# Column 1 is never rewritten. For every later column whose name contains:
#   "Group" - text up to the last ": " is removed, as is any trailing
#             " ...;..." segment
#   "Age"   - all non-digits are removed and the result converted to integer
#   "Sex"   - text up to the last ": " is removed
#   "PMI"   - everything except digits and "." is removed, then as.numeric()
#   "Braak" - text up to the last ": " is removed, the remainder read as a
#             roman numeral and converted to integer
# Returns the modified `mat`.
cinfo <- function(mat){
  # seq_len(n)[-1] is 2..n, and empty when there is only one column.
  # The former 2:col counted down to c(2, 1) in that case and rewrote column 1.
  for (j in seq_len(dim(mat)[2])[-1]) {
    if (grepl("Group", colnames(mat)[j])) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", colnames(mat)[j])) {
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", colnames(mat)[j])) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", colnames(mat)[j])) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", colnames(mat)[j])) {
      mat[, j] <- as.integer(utils::as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
87 87
88 #3#Function for labeling the gene IDs without names 88 #3#Function for labeling the gene IDs without names
# Give every probe without a gene name its own ID as the name.
#
# GIDNAM: two-dimensional table with the gene ID in column 1 and the gene
#         name in column 2.
# A name counts as missing when it is NA or the literal text "NA" (optionally
# followed by whitespace). Returns GIDNAM with those cells of column 2
# replaced by the matching cell of column 1; a zero-row input is returned
# unchanged.
NAFIXING <- function(GIDNAM){
  # Guard the empty table: the old 1:row loop ran for i = 1 and i = 0 here.
  if (dim(GIDNAM)[1] == 0L) {
    return(GIDNAM)
  }
  gene_names <- GIDNAM[, 2, drop = TRUE]
  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)
  if (any(unnamed)) {
    GIDNAM[unnamed, 2] <- GIDNAM[unnamed, 1, drop = TRUE]
  }
  GIDNAM
}
100 100
101 #4#Function for changing the gene ID to gene name 101 #4#Function for changing the gene ID to gene name
# Replace the gene IDs in the data table with gene names.
#
# GeneName: table with gene IDs in column 1 and gene names in column 2.
# DATA:     table whose first column holds the gene IDs to translate.
# Returns t(DATA) (a matrix) in which each entry of row 1 that equals an ID in
# GeneName has been replaced by that ID's name; IDs with no entry in GeneName
# are left as they are. When an ID occurs more than once in GeneName the
# first occurrence is used.
cgeneID <- function(GeneName,DATA){
  nj <- t(GeneName)
  nq <- t(DATA)
  ids <- nq[1, ]
  # Exact lookup. The IDs used to be pasted into "^id$" regular expressions,
  # so regex metacharacters inside an ID ('.', '(', '+', ...) either matched
  # the wrong probe or raised an invalid-regex error.
  hit <- match(ids, nj[1, ])
  found <- !is.na(hit) & !is.na(ids)
  nq[1, found] <- nj[2, hit[found]]
  nq
}
120 #cgeneID <- function(GeneName,DATA){ 120 #cgeneID <- function(GeneName,DATA){
121 # colGene <- dim(GeneName)[2] 121 # colGene <- dim(GeneName)[2]
122 # j <- 1 122 # j <- 1
123 # for(j in 1:colGene){ 123 # for(j in 1:colGene){
124 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) 124 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
125 # if(is.na(sum(chngsreq))==FALSE){ 125 # if(is.na(sum(chngsreq))==FALSE){
126 # if(sum(chngsreq) > 0){ 126 # if(sum(chngsreq) > 0){
127 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) 127 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
128 # } 128 # }
129 # } 129 # }
130 # #if(sum(chngsreq) > 0){ 130 # #if(sum(chngsreq) > 0){
131 # ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq]) 131 # ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
132 # #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) 132 # #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
133 # #} 133 # #}
134 # j = j+1 134 # j = j+1
135 # } 135 # }
136 # DATA 136 # DATA
137 #} 137 #}
138 138
139 #5#Function for adjusting the gene names 139 #5#Function for adjusting the gene names
# Reduce composite column names to a single gene name.
#
# DiData: matrix or data frame whose column names may hold several
#         alternatives separated by "///" or "//".
# usecol: which alternative to keep (default 1). A name with fewer than
#         `usecol` alternatives falls back to its first one.
# Returns a character vector with one whitespace-trimmed name per column of
# DiData; character(0) when DiData has no columns.
gcnames <- function(DiData,usecol=1){
  nuruns <- dim(DiData)[2]
  # The former 1:nuruns loop ran for i = 1 and i = 0 on a zero-column input.
  if (is.null(nuruns) || nuruns == 0L) {
    return(character(0))
  }
  # Split every name once instead of twice per column.
  pieces <- strsplit(colnames(DiData), "///|//")
  nwnam <- vapply(
    pieces,
    function(p) if (length(p) >= usecol) p[usecol] else p[1],
    character(1),
    USE.NAMES = FALSE
  )
  # Base-R equivalent of the previous str_trim() call; "[\\h\\v]" trims all
  # horizontal and vertical white space, not only blank/tab/CR/LF.
  trimws(nwnam, which = "both", whitespace = "[\\h\\v]")
}
155 155
156 #6# Function for discretizing the data 156 #6# Function for discretizing the data
# Discretize a numeric table into three levels.
#
# NDATA: numeric matrix (or all-numeric data frame).
# Returns a numeric matrix with the same dimensions and column names in which
#   values below -1        become 0
#   values from -1 to 1    become 1 (both ends included)
#   values above 1         become 2
#   missing values         stay missing.
# Row names are not carried over.
dndat <- function(NDATA){
  rownd <- dim(NDATA)[1]
  colnd <- dim(NDATA)[2]
  vals <- as.matrix(NDATA)
  # Start from the middle level and overwrite the two tails; whole-matrix
  # comparisons replace the element-by-element double loop and also cope
  # with zero rows or columns.
  DDATA <- matrix(1, nrow = rownd, ncol = colnd)
  colnames(DDATA) <- colnames(NDATA)
  # which() drops the NA results of the comparisons.
  DDATA[which(vals < -1)] <- 0
  DDATA[which(vals > 1)] <- 2
  missing_at <- which(is.na(vals))
  DDATA[missing_at] <- vals[missing_at]
  DDATA
}
184 184
185 185
186 #The Rest of this code will be used every time you want to change a data set 186 #The Rest of this code will be used every time you want to change a data set
187 187
#Getting the series matrix file
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

#Getting the GPL file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()


#Find out if it is a soft GPL file or not
#(the last path component is tested for "soft" or "annot")
soft <- strsplit(genena,"[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  grepl("soft|annot",.)

#Working with the wordy part of the document
#Keeps the "!Sample..." rows except the "!Sample_contact..." ones
alzword <- alz %>%
  read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
  filter(grepl("!Sample",X1))%>%
  filter(!grepl("!Sample_contact",X1))

##Changing row names and column names:
ALZWORD <- t(alzword)
rownames(ALZWORD)=NULL
colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
##Row 1 (the field names) is dropped after renaming. drop = FALSE keeps the
##result a matrix when only one sample row remains; without it the single row
##collapsed to a vector and as.data.frame() turned the fields into rows.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
##Columns still carrying a default "col..." name were not recognised and are
##removed
ALZWORD <- ALZWORD%>%
  as.data.frame(.,stringsAsFactors = FALSE)%>%
  dplyr::select(-starts_with("col"))

##Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)


#Working with Actual Data part of file
alzdat <- alz %>%
  read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
##Transposed data without the first (ID) column
ALZDAT <- t(alzdat[,-1])
rownames(ALZDAT)=NULL
227 227
##Is there a clean version of the GPL file available?
##gplnum is every digit found in the GPL file name, as one string
gplnum <- strsplit(genena,"[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D","",.)
clean_gpl_file <- paste0("Clean_GPL",gplnum,".txt")
##Exact file test: the former grepl() over list.files() also accepted
##"Clean_GPL570.txt" when looking for GPL57, and the read below then failed.
clfileex <- as.integer(file.exists(clean_gpl_file))
if(clfileex >= 1){
  #use the clean version
  geneIDNam <- clean_gpl_file %>%
    read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

} else {
  ##Lets Create a clean version

  ##Column names of the GPL table accepted as the gene-name column
  symbol_cols <- "Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$"

  ##Gene ID to Gene Name
  if(soft == TRUE){
    #Check to see if there is already a file containing information on soft files
    fileex <- as.integer(file.exists("GPL_ID_LOC.txt"))
    IDF <- 0L
    if(fileex == 1){
      #Check to see if this GPL soft file has been used before
      #(read once; GPL numbers are compared as numbers so that 57 no longer
      #matches 570)
      idloc_table <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE)
      IDLOCAL <- which(idloc_table$GPL_FILE_NUM == as.integer(gplnum))
      IDF <- length(IDLOCAL)
    }
    if(IDF >= 1){
      #Known GPL file: reuse the stored header location. If the file was
      #recorded more than once the first record is used.
      idlocgpl <- idloc_table$LOC_ID[IDLOCAL[1]]
      header_skip <- idlocgpl
    } else {
      #No information on this particular GPL file:
      #position of the "ID" cell within the first 1000 rows, minus one
      idLOCGPL <- genena %>%
        read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
        t(.) %>%
        grep("^ID\\s*$",.) %>%
        -1
      if(fileex == 1){
        #Append this GPL file to the existing record
        cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
          cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
      } else {
        #We must create a file that we can access for later use
        Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
        colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
        write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
      }
      header_skip <- idLOCGPL
    }
    geneIDNam <- genena %>%
      read_delim(delim="\t",col_names = TRUE, comment = "!", skip = header_skip) %>%
      dplyr::select(.,ID,grep(symbol_cols,colnames(.)))
  } else {
    geneIDNam <- genena %>%
      read_delim(delim="\t",comment = "#")%>%
      dplyr::select(.,ID,grep(symbol_cols,colnames(.)))
  }

  ##Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ##remove the whitespace
  geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

  ##Here is the clean version
  write.table(geneIDNam,file = clean_gpl_file,sep = "\t",row.names = FALSE, col.names = FALSE)
}
304 304
305 305
306 306
##Changing the gene ID to gene name
ALZDAT1 <- cgeneID(geneIDNam,alzdat)
colnames(ALZDAT) = ALZDAT1[1,]


##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)


#Full RAW Data
#(clinical columns first, then one column per gene)
Fullalzdwr <- ALZDAT %>%
  as.data.frame(.,stringsAsFactors = FALSE) %>%
  cbind(ALZWORDF,.)


#Raw file is output
#The name is "GSE" + the digits of the chosen file's name + "aftexcel.txt".
#The path is split on "/" as well as "\", as is done for the discrete file;
#splitting on "\" alone let the digits of directory names leak into the
#output name on "/"-separated paths.
nfnaex <- strsplit(alz,"[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D","",.) %>%
  c("GSE",.,"aftexcel.txt") %>%
  paste(collapse = "")
write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
330 330
331 331
#Now for the discretization part
##get the wordy part again
rawword <- t(ALZWORDF)

##where is ID_REF located
hereim <- grep("ID_REF",rownames(rawword))

##Subject Names GSM...
subjnam <- rawword[hereim,]

##Getting the names for the rows
namedarows <- rownames(rawword)[-hereim] %>%
  as.data.frame(.,stringsAsFactors = FALSE)
RAWWORD <- rawword[-hereim,] %>%
  as.data.frame(.,stringsAsFactors = FALSE) %>%
  bind_cols(namedarows,.)

##Per-row count of missing cells plus cells matching "NA"
naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
for(z in seq_len(dim(RAWWORD)[1])){
  na_cells <- sum(is.na(RAWWORD[z,]))
  if(na_cells > 0){
    naroww[z,1] <- as.integer(na_cells)
  }
  na_text <- length(grep("NA",RAWWORD[z,]))
  if(na_text > 0){
    naroww[z,1] <- as.integer(na_text) + naroww[z,1]
  }
}

colnames(naroww) <- "ROW_NAs"
RAWWORD <- bind_cols(RAWWORD,naroww)


##Gene names, one per row of the transposed data
roALZna <- t(ALZDAT) %>%
  rownames(.) %>%
  as.data.frame(.,stringsAsFactors = FALSE)
colnames(roALZna) <- "ID_REF"

RAWDAT <- t(ALZDAT) %>%
  as.data.frame(.,stringsAsFactors = FALSE)
colnames(RAWDAT) <- NULL
rownames(RAWDAT) <- NULL

RAWDAT2 <- RAWDAT %>%
  cbind(roALZna,.) %>%
  dplyr::arrange(.,ID_REF)

##Editing the file for R processing
RAWDATID <- RAWDAT2[,1] %>%
  as.matrix(.)

RAWDATNUM <- RAWDAT2[,-1] %>%
  mapply(.,FUN = as.numeric) %>%
  t(.)

##Consolidating genes with the same name
###create empty matrix of size equal to tabRDATID
tabRDATID <- table(RAWDATID)
NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
for(j in seq_along(tabRDATID)){
  same_gene <- which(RAWDATID==rownames(tabRDATID)[j])

  ##Putting the ones without duplicates in their new homes
  if(tabRDATID[j] == 1){
    NuRDATN[,j] <- RAWDATNUM[,same_gene]
  } else if(tabRDATID[j] > 1){
    ##Averaging duplicates and putting them in their new homes
    ##drop = FALSE keeps a matrix when RAWDATNUM has a single row;
    ##rowMeans() rejects the plain vector the slice would otherwise become
    NuRDATN[,j] <- rowMeans(RAWDATNUM[,same_gene,drop = FALSE],na.rm = TRUE)
  }
}
402 402
##Outputting non Z-score Average over genes
newoutput <-NuRDATN
colnames(newoutput) <- rownames(tabRDATID)
##File name: "GSE" + digits of the chosen file's name + "avg.txt".
##The path is split on "/" as well as "\" so that directory names on
##"/"-separated paths do not contribute digits.
nfnewout <- strsplit(alz,"[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D","",.) %>%
  c("GSE",.,"avg.txt") %>%
  paste(collapse = "")
noutput <- newoutput %>%
  t()%>%
  as.data.frame(.,stringsAsFactors = FALSE)
noutput <- cbind(rownames(noutput),noutput)
colnames(noutput) <- c("Gene Symbol",subjnam)
rownames(noutput) <- NULL
write.table(noutput, file = nfnewout, sep = "\t",col.names = TRUE,row.names = FALSE)


##Scaling the Data
##scale() centres and scales each column (gene) of NuRDATN
scrawdat <- NuRDATN%>%
  scale()
attr(scrawdat,"scaled:center") <- NULL
attr(scrawdat,"scaled:scale") <- NULL
colnames(scrawdat) <- rownames(tabRDATID)

#Outputting the Z-score file
##Same naming rule as above, with the suffix "zscore.txt"
nfnzsc <- strsplit(alz,"[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D","",.) %>%
  c("GSE",.,"zscore.txt") %>%
  paste(collapse = "")
zscraw <- scrawdat %>%
  t()%>%
  as.data.frame(.,stringsAsFactors = FALSE)
zscraw <- cbind(rownames(zscraw),zscraw)
colnames(zscraw) <- c("Gene Symbol",subjnam)
rownames(zscraw) <- NULL
write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = FALSE)
442 442
443 443
##Discretized the Data
dialzdat <- scrawdat %>%
  dndat(.) %>%
  t()%>%
  as.data.frame(.,stringsAsFactors = FALSE)
colnames(dialzdat) <- rownames(RAWDATNUM)

##setting "ID_REF" as a new variable
geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)
colnames(geneNAM) <- "ID_REF"
rownames(dialzdat) <- NULL
dialzdat <-bind_cols(geneNAM,dialzdat)

##NAs in a column
##(column 1 holds the label, so counting starts at column 2;
## seq_len(n)[-1] is empty instead of counting down when n is 1)
nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
nacol[1,1] = "COL_NAs"
for(x in seq_len(dim(dialzdat)[2])[-1]){
  nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
}
colnames(nacol) <- colnames(dialzdat)
dialzdat<-bind_rows(dialzdat,nacol)

##NAs in a row
narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
for(y in seq_len(dim(dialzdat)[1])){
  narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
}
colnames(narowd) <- "ROW_NAs"
dialzdat <- bind_cols(dialzdat,narowd)
colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
colnames(RAWWORD) <- colnames(dialzdat)
##converting to character so that the clinical can be brought together with discrete data
##Only the subject columns (2 .. last-1) need converting. The former
##`2:dim(dialzdat)[2]-1` parsed as `(2:n) - 1`, i.e. columns 1 .. n-1.
for(k in seq_len(dim(dialzdat)[2]-1)[-1]){
  dialzdat[[k]] <- as.character(dialzdat[[k]])
}
#The End the full data
Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

#Produces Discrete file
nfnaex2 <- strsplit(alz,"[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D","",.) %>%
  c("GSE",.,"dscrt.txt") %>%
  paste(collapse = "")
write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
496 496
497 497