Commit da758df454781c9b5a48d30b9aaf7c8e4e994cf5

Authored by Efrain Gonzalez
1 parent b8bafad302
Exists in master

Updated outputting new file: average before z-score

Showing 1 changed file with 41 additions and 20 deletions   Show diff stats
1 ##Posted 6/15/2017 1 ##Posted 6/15/2017
2 options(digits = 11) 2 options(digits = 11)
3 3
4 #Libraries required to run the code 4 #Libraries required to run the code
5 library(pryr) 5 library(pryr)
6 library(MASS) 6 library(MASS)
7 library(dplyr) 7 library(dplyr)
8 library(tidyr) 8 library(tidyr)
9 library(readr) 9 library(readr)
10 library(stringr) 10 library(stringr)
11 11
12 12
13 #Necessary Functions 13 #Necessary Functions
#1#Function for handling the changing of row names and column names
# Renames the columns of `mat` from the labels stored in its first two rows.
#
# mat: matrix that already has column names; row 1 holds the field label of
#      each column and row 2 holds an example value of that column.
# Returns `mat` with some column names replaced:
#   * a row-1 label equal to one of the three known "!Sample_..." labels gives
#     "Brain_Region", "Title" or "ID_REF";
#   * otherwise the row-2 value is tested against each keyword pattern in turn
#     and a match names the column "<Prefix><n>" (Sex1, Sex2, PMI1, ...).
#     Every matching pattern advances its own counter and the last matching
#     pattern decides the final name.
# Columns that match nothing keep their existing name.
chngrownm <- function(mat) {
  exact_labels <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )
  # Order matters: patterns are tried top to bottom, later matches win.
  value_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  next_index <- setNames(rep(1L, length(value_patterns)), names(value_patterns))

  # seq_len() keeps a zero-column matrix from being indexed at column 1.
  for (j in seq_len(ncol(mat))) {
    label <- mat[[1, j]]
    # A missing label is treated as "no exact match" instead of aborting if().
    if (!is.na(label) && label %in% names(exact_labels)) {
      colnames(mat)[j] <- exact_labels[[label]]
      next
    }
    for (prefix in names(value_patterns)) {
      if (grepl(value_patterns[[prefix]], mat[2, j])) {
        colnames(mat)[j] <- paste0(prefix, next_index[[prefix]])
        next_index[[prefix]] <- next_index[[prefix]] + 1L
      }
    }
  }
  mat
}
58 58
#2#Function for reorganizing information within the columns
# Cleans the values of every column except the first, based on the column name.
#
# mat: table (the script passes a data frame) whose columns were named by
#      chngrownm().
# Returns `mat` where, for each column from the second one onwards:
#   * "Group" columns lose everything up to ": " and any " ...;..." tail;
#   * "Age" columns keep only their digits and become integer;
#   * "Sex" columns lose everything up to ": ";
#   * "PMI" columns keep only digits and dots and become numeric;
#   * "Braak" columns lose everything up to ": " and the remaining roman
#     numeral is converted to an integer.
cinfo <- function(mat) {
  # seq_len(...)[-1] is empty for a one-column table, whereas 2:1 would count
  # down and rewrite column 1, the column this function is meant to skip.
  for (j in seq_len(ncol(mat))[-1]) {
    column_name <- colnames(mat)[j]
    if (grepl("Group", column_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", column_name)) {
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", column_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", column_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", column_name)) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
87 87
#3#Function for labeling the gene IDs without names
# Fills in missing gene names with the gene ID of the same row.
#
# GIDNAM: two-column table; column 1 holds the gene ID, column 2 the name.
# Returns GIDNAM where every column-2 entry that is NA, or that consists of
# the text "NA" followed only by optional whitespace, is replaced by the
# column-1 entry of that row.
NAFIXING <- function(GIDNAM) {
  ids <- GIDNAM[, 1, drop = TRUE]
  gene_names <- GIDNAM[, 2, drop = TRUE]
  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)
  # One vectorised assignment replaces the per-row loop; the guard also makes
  # an empty table (where 1:0 used to misbehave) a no-op.
  if (any(unnamed)) {
    GIDNAM[unnamed, 2] <- ids[unnamed]
  }
  GIDNAM
}
100 100
#4#Function for changing the gene ID to gene name
# Replaces gene IDs by gene names using a lookup table.
#
# GeneName: table whose first column holds gene IDs and whose second column
#           holds the matching gene names.
# DATA:     table whose first column holds the gene IDs to translate.
# Returns t(DATA) in which every entry of the first row that equals an ID in
#         GeneName has been replaced by the corresponding name; entries
#         without a match (or that are NA) are left as they are.
cgeneID <- function(GeneName, DATA) {
  lookup <- t(GeneName)
  nq <- t(DATA)
  if (ncol(nq) == 0L) {
    return(nq)
  }
  wanted <- nq[1, ]
  # match() compares whole strings, so IDs containing regex metacharacters
  # (".", "+", "(" ...) can no longer hit the wrong row, and the lookup is a
  # single pass instead of one grep() per ID. Like the old code, the first
  # matching row wins when an ID is listed more than once.
  position <- match(wanted, lookup[1, ])
  translate <- !is.na(position) & !is.na(wanted)
  nq[1, translate] <- lookup[2, position[translate]]
  nq
}
120 #cgeneID <- function(GeneName,DATA){ 120 #cgeneID <- function(GeneName,DATA){
121 # colGene <- dim(GeneName)[2] 121 # colGene <- dim(GeneName)[2]
122 # j <- 1 122 # j <- 1
123 # for(j in 1:colGene){ 123 # for(j in 1:colGene){
124 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) 124 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
125 # if(is.na(sum(chngsreq))==FALSE){ 125 # if(is.na(sum(chngsreq))==FALSE){
126 # if(sum(chngsreq) > 0){ 126 # if(sum(chngsreq) > 0){
127 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) 127 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
128 # } 128 # }
129 # } 129 # }
130 # #if(sum(chngsreq) > 0){ 130 # #if(sum(chngsreq) > 0){
131 # ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq]) 131 # ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
132 # #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) 132 # #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
133 # #} 133 # #}
134 # j = j+1 134 # j = j+1
135 # } 135 # }
136 # DATA 136 # DATA
137 #} 137 #}
138 138
#5#Function for adjusting the gene names
# Picks one name out of column names that list several, separated by "///"
# or "//".
#
# DiData: table with column names.
# usecol: which of the separated names to keep (default: the first).
# Returns a character vector with one entry per column of DiData: the
#         `usecol`-th name when the column name has that many parts, the
#         first part otherwise, with surrounding whitespace removed.
gcnames <- function(DiData, usecol = 1) {
  if (ncol(DiData) == 0L) {
    # 1:0 used to index a non-existent column here.
    return(character(0))
  }
  column_names <- colnames(DiData)
  if (is.null(column_names)) {
    stop("DiData must have column names", call. = FALSE)
  }
  # Split every name once (the old loop split each name twice).
  pieces <- strsplit(column_names, "///|//")
  chosen <- vapply(
    pieces,
    function(p) if (length(p) >= usecol) p[usecol] else p[1],
    character(1)
  )
  # Base trimws() strips spaces, tabs and line breaks; it stands in for
  # stringr::str_trim() so the function needs no package.
  trimws(chosen)
}
155 155
#6# Function for discretizing the data
# Maps every value to one of three levels.
#
# NDATA: numeric matrix (or something as.matrix() turns into one).
# Returns a numeric matrix of the same size, carrying NDATA's column names:
#   0 where the value is below -1,
#   2 where the value is above 1,
#   1 where the value lies in [-1, 1],
#   and the original missing value wherever NDATA is NA.
dndat <- function(NDATA) {
  NDATA <- as.matrix(NDATA)
  # Start from the middle level and overwrite the two tails; whole-matrix
  # assignment replaces the cell-by-cell double loop and also copes with
  # matrices that have no rows or no columns.
  DDATA <- matrix(1, nrow = nrow(NDATA), ncol = ncol(NDATA))
  colnames(DDATA) <- colnames(NDATA)
  # which() drops the NA comparisons so they never reach the assignment.
  DDATA[which(NDATA < -1)] <- 0
  DDATA[which(NDATA > 1)] <- 2
  missing_cells <- which(is.na(NDATA))
  DDATA[missing_cells] <- NDATA[missing_cells]
  DDATA
}
184 184
185 185
186 #The Rest of this code will be used every time you want to change a data set 186 #The Rest of this code will be used every time you want to change a data set
187 187
# Input files ----
# The user picks both files interactively.

# Series matrix file to analyse
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

# GPL file that goes with that series matrix file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()

# The GPL file is treated as a "soft" file when its file name (the last
# component of the chosen path) contains "soft" or "annot".
gpl_path_pieces <- strsplit(genena, "[\\|/]")[[1]]
gpl_file_name <- gpl_path_pieces[length(gpl_path_pieces)]
soft <- grepl("soft|annot", gpl_file_name)
202 202
# Sample description ("wordy") part of the series matrix file ----
# Lines starting with "!Series" are skipped as comments; of the rest only the
# "!Sample..." rows are kept, minus the "!Sample_contact..." rows.
series_lines <- read_delim(alz, delim = "\t", comment = "!Series", col_names = FALSE)
sample_lines <- filter(series_lines, grepl("!Sample", X1))
alzword <- filter(sample_lines, !grepl("!Sample_contact", X1))

## Changing row names and column names:
# After transposing, columns without a name get the default "col<n>" names so
# that chngrownm() has names to overwrite.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)

# chngrownm() reads the labels held in the first row; that row is dropped
# afterwards.
renamed <- chngrownm(ALZWORD)
ALZWORD <- renamed[-1, ]

# Columns that still carry a default "col<n>" name were not recognised and
# are removed.
ALZWORD <- as.data.frame(ALZWORD, stringsAsFactors = FALSE)
ALZWORD <- dplyr::select(ALZWORD, -starts_with("col"))

## Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)
220 220
221 221
# Data table part of the series matrix file ----
# Lines starting with "!" are skipped as comments and the first line of the
# file is skipped as well; the next remaining line supplies the column names.
alzdat <- read_delim(alz, delim = "\t", col_names = TRUE, comment = "!", skip = 1)

# Everything but the first column, transposed, with the row names removed.
data_columns <- alzdat[, -1]
ALZDAT <- t(data_columns)
rownames(ALZDAT) <- NULL
227 227
## Gene ID to gene name table (geneIDNam) ----
# A cleaned two-column version of the GPL file is kept in the working
# directory as "Clean_GPL<digits>.txt". It is read when it exists and is
# created from the chosen GPL file otherwise.

# The digits of the GPL file name identify the platform.
gpl_name_pieces <- strsplit(genena, "[\\|/]")[[1]]
gplnum <- gsub("\\D", "", gpl_name_pieces[length(gpl_name_pieces)])
clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")

if (file.exists(clean_gpl_file)) {
  # Use the clean version.
  # The exact file name is tested: the former pattern match over list.files()
  # also accepted e.g. "Clean_GPL5700.txt" as the clean file of GPL570 and
  # then failed to read "Clean_GPL570.txt".
  geneIDNam <- read_delim(clean_gpl_file, delim = "\t",
                          col_names = c("ID", "Symbol"), comment = "!")
} else {
  ## Lets create a clean version

  # Columns of the GPL table that are taken as the gene name column.
  symbol_columns <- "Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$"

  ## Gene ID to Gene Name
  if (soft) {
    # "GPL_ID_LOC.txt" stores, for each GPL number used before, the `skip`
    # value that was passed to read_delim() for that file.
    id_loc_file <- "GPL_ID_LOC.txt"
    id_loc_known <- file.exists(id_loc_file)
    gpl_id <- as.integer(gplnum)
    idLOCGPL <- NULL

    if (id_loc_known) {
      # Read the index once and look the GPL number up by equality; the
      # former substring match let 570 match 5700, and any match count other
      # than 0 or 1 left geneIDNam undefined.
      id_loc <- read_delim(id_loc_file, delim = "\t", col_names = TRUE)
      seen_at <- which(id_loc$GPL_FILE_NUM == gpl_id)
      if (length(seen_at) > 0) {
        idLOCGPL <- id_loc$LOC_ID[seen_at[1]]
      }
    }

    if (is.null(idLOCGPL)) {
      # No information on this particular GPL file yet: locate the "ID" cell
      # within the first 1000 non-comment lines and remember the result.
      idLOCGPL <- genena %>%
        read_delim(delim = "\t", col_names = FALSE, comment = "!", n_max = 1000) %>%
        t(.) %>%
        grep("^ID\\s*$", .) %>%
        -1
      new_entry <- cbind(gpl_id, as.integer(idLOCGPL))
      if (id_loc_known) {
        cat(new_entry, file = id_loc_file, sep = "\t", fill = TRUE, append = TRUE)
      } else {
        # First entry: create the file together with its header line.
        colnames(new_entry) <- c("GPL_FILE_NUM", "LOC_ID")
        write.table(new_entry, file = id_loc_file, sep = "\t",
                    row.names = FALSE, col.names = TRUE)
      }
    }

    geneIDNam <- genena %>%
      read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
      dplyr::select(., ID, grep(symbol_columns, colnames(.)))
  } else {
    geneIDNam <- genena %>%
      read_delim(delim = "\t", comment = "#") %>%
      dplyr::select(., ID, grep(symbol_columns, colnames(.)))
  }

  ## Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ## remove the whitespace
  id_and_name <- t(geneIDNam)
  geneIDNam <- t(rbind(str_trim(id_and_name[1, ]), str_trim(id_and_name[2, ])))

  ## Here is the clean version
  write.table(geneIDNam, file = clean_gpl_file, sep = "\t",
              row.names = FALSE, col.names = FALSE)
}
304 304
305 305
306 306
## Changing the gene ID to gene name
# cgeneID() returns the transposed data table; its first row holds the IDs
# with known ones replaced by gene names, which become the column names.
ALZDAT1 <- cgeneID(geneIDNam, alzdat)
colnames(ALZDAT) <- ALZDAT1[1, ]

## Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)

# Full RAW Data: sample description columns followed by the data columns
Fullalzdwr <- cbind(ALZWORDF, as.data.frame(ALZDAT, stringsAsFactors = FALSE))

# Raw file is output
# The file is named "GSE<digits>aftexcel.txt" after the digits in the series
# matrix file name. The path is split on "\", "|" and "/" (as is done for the
# discrete file); splitting on "\" alone left "/"-separated paths whole, so
# digits from directory names ended up in the output name.
alz_path_pieces <- strsplit(alz, "[\\|/]")[[1]]
alz_digits <- gsub("\\D", "", alz_path_pieces[length(alz_path_pieces)])
nfnaex <- paste(c("GSE", alz_digits, "aftexcel.txt"), collapse = "")
write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
330 330
331 331
# Now for the discretization part
## get the wordy part again
rawword <- t(ALZWORDF)

## where is ID_REF located
hereim <- grep("ID_REF", rownames(rawword))

## Subject Names GSM...
subjnam <- rawword[hereim, ]

## Getting the names for the rows
# RAWWORD: one row per remaining field, with the field name in the first
# column.
namedarows <- rownames(rawword)[-hereim] %>%
  as.data.frame(., stringsAsFactors = FALSE)
RAWWORD <- rawword[-hereim, ] %>%
  as.data.frame(., stringsAsFactors = FALSE) %>%
  bind_cols(namedarows, .)

## NAs in a row
# A cell counts once when it is NA or when its text contains "NA". The former
# loop added is.na() hits and grep("NA", ...) hits together, and because
# grep() turns a real NA into the text "NA", every real NA was counted twice.
word_cells <- as.matrix(RAWWORD)
na_like <- is.na(word_cells)
na_like[grepl("NA", word_cells)] <- TRUE
naroww <- data.frame(ROW_NAs = unname(rowSums(na_like)))
RAWWORD <- bind_cols(RAWWORD, naroww)
362 362
363 363
# Gene names (the column names of ALZDAT) as a one-column table
roALZna <- t(ALZDAT) %>%
  rownames(.) %>%
  as.data.frame(., stringsAsFactors = FALSE)
colnames(roALZna) <- "ID_REF"

# Data with one row per gene, stripped of its names
RAWDAT <- t(ALZDAT) %>%
  as.data.frame(., stringsAsFactors = FALSE)
colnames(RAWDAT) <- NULL
rownames(RAWDAT) <- NULL

# Gene name in front, rows sorted by gene name
RAWDAT2 <- RAWDAT %>%
  cbind(roALZna, .) %>%
  dplyr::arrange(., ID_REF)

## Editing the file for R processing
RAWDATID <- as.matrix(RAWDAT2[, 1])

# Numeric version of the data, transposed back to one column per gene row
RAWDATNUM <- RAWDAT2[, -1] %>%
  mapply(., FUN = as.numeric) %>%
  t(.)

## Consolidating genes with the same name
### create empty matrix with one column per distinct gene name
tabRDATID <- table(RAWDATID)
NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

# Column positions of every gene name, gathered in one pass. factor() and
# table() order the distinct names the same way, so entry j belongs to
# tabRDATID[j]; this replaces a full which() scan per gene.
gene_factor <- factor(as.vector(RAWDATID))
columns_by_gene <- split(seq_along(gene_factor), gene_factor)

for (j in seq_along(tabRDATID)) {
  gene_columns <- columns_by_gene[[j]]
  if (length(gene_columns) == 1L) {
    ## Putting the ones without duplicates in their new homes
    NuRDATN[, j] <- RAWDATNUM[, gene_columns]
  } else if (length(gene_columns) > 1L) {
    ## Averaging duplicates and putting them in their new homes
    # drop = FALSE keeps a matrix for rowMeans() even with a single row.
    NuRDATN[, j] <- rowMeans(RAWDATNUM[, gene_columns, drop = FALSE], na.rm = TRUE)
  }
}
402 402
# Digits of the series matrix file name, shared by both output names below.
# The path is split on "\", "|" and "/" so that "/"-separated paths do not
# contribute digits from their directory names.
gse_path_pieces <- strsplit(alz, "[\\|/]")[[1]]
gse_digits <- gsub("\\D", "", gse_path_pieces[length(gse_path_pieces)])

## Outputting non Z-score Average over genes
# Written to "GSE<digits>avg.txt": one row per gene name, the name in the
# "Gene Symbol" column followed by one column per subject.
newoutput <- NuRDATN
colnames(newoutput) <- rownames(tabRDATID)
nfnewout <- paste(c("GSE", gse_digits, "avg.txt"), collapse = "")
noutput <- as.data.frame(t(newoutput), stringsAsFactors = FALSE)
noutput <- cbind(rownames(noutput), noutput)
colnames(noutput) <- c("Gene Symbol", subjnam)
rownames(noutput) <- NULL
write.table(noutput, file = nfnewout, sep = "\t", col.names = TRUE, row.names = FALSE)


## Scaling the Data
# scale() centres and scales every column; the attributes it attaches are
# removed so that only the matrix remains.
scrawdat <- scale(NuRDATN)
attr(scrawdat, "scaled:center") <- NULL
attr(scrawdat, "scaled:scale") <- NULL
colnames(scrawdat) <- rownames(tabRDATID)

# Outputting the Z-score file
# Written to "GSE<digits>zscore.txt" in the same layout as the average file.
nfnzsc <- paste(c("GSE", gse_digits, "zscore.txt"), collapse = "")
zscraw <- as.data.frame(t(scrawdat), stringsAsFactors = FALSE)
zscraw <- cbind(rownames(zscraw), zscraw)
colnames(zscraw) <- c("Gene Symbol", subjnam)
rownames(zscraw) <- NULL
# The target is nfnzsc: writing to nfnewout here overwrote the average file
# produced above and never created the z-score file.
write.table(zscraw, file = nfnzsc, sep = "\t", col.names = TRUE, row.names = FALSE)
442
443
## Discretized the Data
# One row per gene name, one column per subject.
dialzdat <- as.data.frame(t(dndat(scrawdat)), stringsAsFactors = FALSE)
colnames(dialzdat) <- rownames(RAWDATNUM)

## setting "ID_REF" as a new variable
geneNAM <- as.data.frame(as.matrix(rownames(dialzdat), ncol = 1), stringsAsFactors = FALSE)
colnames(geneNAM) <- "ID_REF"
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(geneNAM, dialzdat)

## NAs in a column
# One extra row labelled "COL_NAs" holding the number of NAs of every column
# (counted for the whole table at once instead of column by column).
nacol <- as.data.frame(t(unname(colSums(is.na(dialzdat)))), stringsAsFactors = FALSE)
nacol[1, 1] <- "COL_NAs"
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

## NAs in a row
# One extra column "ROW_NAs" holding the number of NAs of every row,
# including the COL_NAs row just added.
narowd <- data.frame(ROW_NAs = unname(rowSums(is.na(dialzdat))))
dialzdat <- bind_cols(dialzdat, narowd)

# Subject names go on every column between ID_REF and ROW_NAs; RAWWORD gets
# the same column names so the two tables can be stacked.
last_column <- dim(dialzdat)[2]
colnames(dialzdat)[2:(last_column - 1)] <- subjnam
colnames(RAWWORD) <- colnames(dialzdat)

## converting to character so that the clinical can be brought together with discrete data
# Every column except ROW_NAs is converted. The former range
# `2:dim(dialzdat)[2]-1` parsed as (2:n) - 1, i.e. 1 to n - 1; that range is
# now written explicitly.
for (k in seq_len(last_column - 1L)) {
  dialzdat[, k] <- as.character(dialzdat[, k])
}

# The End the full data
Dscrtalzdw <- bind_rows(RAWWORD, dialzdat)

# Produces Discrete file
# Named "GSE<digits>dscrt.txt" after the digits in the series matrix file name.
dscrt_path_pieces <- strsplit(alz, "[\\|/]")[[1]]
dscrt_digits <- gsub("\\D", "", dscrt_path_pieces[length(dscrt_path_pieces)])
nfnaex2 <- paste(c("GSE", dscrt_digits, "dscrt.txt"), collapse = "")
write.table(Dscrtalzdw, file = nfnaex2, sep = "\t", col.names = TRUE, row.names = FALSE)
475 496
476 497