Commit 805474e1e6ec9b11cd677022efe720af7c3dda36

Authored by Efrain Gonzalez
1 parent f31e87a636
Exists in master

Most recent update fixed a few handling errors

Showing 1 changed file with 49 additions and 35 deletions   Show diff stats
1 ##Posted 6/15/2017 1 ##Posted 6/15/2017
2 2
3 3
4 #Libraries required to run the code 4 #Libraries required to run the code
5 library(pryr) 5 library(pryr)
6 library(MASS) 6 library(MASS)
7 library(dplyr) 7 library(dplyr)
8 library(tidyr) 8 library(tidyr)
9 library(readr) 9 library(readr)
10 library(stringr) 10 library(stringr)
11 11
12 12
13 #Necessary Functions 13 #Necessary Functions
14 #1#Function for handling the changing of row names and column names 14 #1#Function for handling the changing of row names and column names
# Rename the columns of a transposed GEO sample-metadata matrix.
#
# mat: matrix whose first row holds the GEO field tag of each column
#      (e.g. "!Sample_title") and whose second row holds the first sample's
#      value for that field. Column names must already be set.
#
# Returns `mat` with renamed columns:
#   * the three known tags become "Brain_Region", "Title" and "ID_REF";
#   * any other column is named from keywords found in its second-row value
#     ("Sex", "PMI", "Age", "Braak", "Group") followed by a running number.
# Columns that match nothing keep their current name.
chngrownm <- function(mat) {
  # Tags that map straight to a fixed column name.
  fixed_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )

  # Keyword patterns, tested in this order. Every pattern that matches
  # renames the column and advances its own counter, so when several match
  # the LAST one decides the final name (e.g. "braak stage" also contains
  # "age", ends up as Braak, and still consumes an Age number).
  keyword_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  counters <- rep(1, length(keyword_patterns))
  names(counters) <- names(keyword_patterns)

  # seq_len() keeps the loop from running on a matrix with zero columns.
  for (j in seq_len(ncol(mat))) {
    tag <- mat[1, j]

    # An NA tag can never be one of the fixed tags; treat it as "other"
    # instead of letting the comparison abort the whole function.
    if (!is.na(tag) && tag %in% names(fixed_names)) {
      colnames(mat)[j] <- fixed_names[[tag]]
      next
    }

    for (label in names(keyword_patterns)) {
      if (grepl(keyword_patterns[[label]], mat[2, j])) {
        colnames(mat)[j] <- paste0(label, counters[[label]])
        counters[[label]] <- counters[[label]] + 1
      }
    }
  }
  mat
}
60 58
61 #2#Function for reorganizing information within the columns 59 #2#Function for reorganizing information within the columns
# Clean up the values of the metadata columns named by chngrownm().
#
# mat: data frame (or matrix) with column names; the first column is never
#      touched, every later column is rewritten according to its name:
#        Group -> text after "...: ", with a trailing " ...;..." part removed
#        Age   -> digits only, as integer
#        Sex   -> text after "...: "
#        PMI   -> digits and dots only, as numeric
#        Braak -> text after "...: ", read as a roman numeral, as integer
#
# Returns `mat` with those columns rewritten.
cinfo <- function(mat) {
  n_cols <- ncol(mat)

  # With fewer than two columns there is nothing to rewrite. (2:n_cols
  # would otherwise count DOWN and rewrite column 1 by accident.)
  if (is.null(n_cols) || n_cols < 2) {
    return(mat)
  }

  for (j in 2:n_cols) {
    col_name <- colnames(mat)[j]

    if (grepl("Group", col_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", col_name)) {
      # NOTE(review): every non-digit is dropped, so "85.5" becomes 855.
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", col_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", col_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", col_name)) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
89 87
90 #3#Function for labeling the gene IDs without names 88 #3#Function for labeling the gene IDs without names
# Give every probe without a gene name its own ID as name.
#
# GIDNAM: two-column table (data frame, tibble or matrix); column 1 holds
#         the probe/gene ID, column 2 the gene name.
#
# Returns GIDNAM where each name that is missing (NA) or is the literal
# text "NA" (optionally followed by whitespace) has been replaced by the ID
# from the same row. A table with zero rows is returned unchanged.
NAFIXING <- function(GIDNAM) {
  if (nrow(GIDNAM) == 0) {
    return(GIDNAM)
  }

  ids <- GIDNAM[, 1, drop = TRUE]
  gene_names <- GIDNAM[, 2, drop = TRUE]

  # One vectorised mask instead of testing the table row by row.
  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)

  if (any(unnamed)) {
    fill <- ids[unnamed]
    # IDs may have been parsed as numbers; keep a text name column text.
    if (is.character(gene_names)) {
      fill <- as.character(fill)
    }
    GIDNAM[unnamed, 2] <- fill
  }
  GIDNAM
}
102 100
103 #4#Function for changing the gene ID to gene name 101 #4#Function for changing the gene ID to gene name
# Replace the probe IDs of the expression data by their gene names.
#
# GeneName: two-column table, column 1 = probe ID, column 2 = gene name.
# DATA:     expression table whose first column holds the probe IDs.
#
# Returns t(DATA) as a matrix whose first row (the IDs) has been replaced
# by the gene name wherever the ID is found in GeneName; IDs that are not
# found are left as they are. Callers read the new names from row 1.
cgeneID <- function(GeneName, DATA) {
  gene_map <- t(GeneName)
  data_t <- t(DATA)

  # Nothing to translate, or nothing to translate with.
  if (ncol(data_t) == 0 || ncol(gene_map) == 0) {
    return(data_t)
  }

  # Compare IDs as plain trimmed text. (Building a regular expression from
  # each ID made "." and friends act as wildcards and cost one scan of the
  # whole annotation table per probe.)
  data_ids <- trimws(as.character(data_t[1, ]))
  map_ids <- trimws(as.character(gene_map[1, ]))

  # match() returns the first annotation row of each ID, NA when absent.
  hit <- match(data_ids, map_ids)
  found <- !is.na(data_ids) & !is.na(hit)

  data_t[1, found] <- gene_map[2, hit[found]]
  data_t
}
120 #cgeneID <- function(GeneName,DATA){
121 # colGene <- dim(GeneName)[2]
122 # j <- 1
123 # for(j in 1:colGene){
124 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
125 # if(is.na(sum(chngsreq))==FALSE){
126 # if(sum(chngsreq) > 0){
127 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
128 # }
129 # }
130 # #if(sum(chngsreq) > 0){
131 # ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
132 # #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
133 # #}
134 # j = j+1
135 # }
136 # DATA
137 #}
122 138
123 #5#Function for adjusting the gene names 139 #5#Function for adjusting the gene names
# Pick one gene name per column when a column name lists several.
#
# DiData: table whose column names may hold several gene names separated
#         by "///".
# usecol: which of the "///"-separated names to keep (default: the first).
#         When a column name has fewer parts than `usecol`, its first part
#         is used instead.
#
# Returns a character vector with one trimmed name per column of DiData
# (character(0) when DiData has no columns).
gcnames <- function(DiData, usecol = 1) {
  if (isTRUE(ncol(DiData) == 0)) {
    return(character(0))
  }

  # Split every column name once, then choose the requested part.
  name_parts <- strsplit(colnames(DiData), "///")

  vapply(
    name_parts,
    function(parts) {
      pick <- if (length(parts) >= usecol) usecol else 1
      trimws(parts[pick])
    },
    character(1),
    USE.NAMES = FALSE
  )
}
139 155
140 #6# Function for discretizing the data 156 #6# Function for discretizing the data
# Discretize scaled expression values into three levels.
#
# NDATA: numeric matrix (or all-numeric data frame) of scaled values.
#
# Returns a numeric matrix of the same size, with NDATA's column names:
#   0 for values below -1,
#   1 for values from -1 to 1 (both ends included),
#   2 for values above 1,
#   missing values stay missing.
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)

  # Start everything in the middle level and move the two tails out. This
  # also puts a value of exactly 1 in the middle level; the previous chain
  # of comparisons left it at the initial 0, i.e. in the LOW level.
  DDATA <- matrix(1, nrow = nrow(values), ncol = ncol(values))
  colnames(DDATA) <- colnames(NDATA)

  # which() drops the NA comparisons, so missing cells are not touched here.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2

  # Carry missing values over unchanged.
  missing_cells <- is.na(values)
  DDATA[missing_cells] <- values[missing_cells]

  DDATA
}
170 184
171 185
172 #The Rest of this code will be used every time you want to change a data set 186 #The Rest of this code will be used every time you want to change a data set
173 187
174 #Getting the series matrix file 188 #Getting the series matrix file
# Ask the user for the GEO series matrix file to process.
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

# Ask the user for the platform (GPL) annotation file that goes with it.
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()

# A GPL file is treated as a "soft" file when its FILE NAME contains "soft"
# or "annot". basename() strips the directories with the separator rules of
# the current platform, so only the file name itself is inspected.
soft <- grepl("soft|annot", basename(genena))
188 202
189 #Working with the wordy part of the document 203 #Working with the wordy part of the document
# Sample metadata: read the file with "!Series" lines treated as comments,
# keep the "!Sample..." rows and drop the "!Sample_contact..." rows.
alzword <- filter(
  read_delim(alz, delim = "\t", comment = "!Series", col_names = FALSE),
  grepl("!Sample", X1),
  !grepl("!Sample_contact", X1)
)

## Changing row names and column names:
# After transposing, row 1 holds the field tags and each further row one
# sample. Default column names ("col1", "col2", ...) are filled in so that
# chngrownm() can overwrite the ones it recognises.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)

# Drop the tag row. drop = FALSE keeps a matrix even when a single sample
# row is left; without it the result collapses to a plain vector and the
# data-frame conversion below produces one column instead of one row.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]

# Columns still carrying a default "col..." name were not recognised.
ALZWORD <- dplyr::select(as.data.frame(ALZWORD), -starts_with("col"))

## Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)


# Expression table: every line starting with "!" is a comment; one line is
# skipped before the header.
alzdat <- read_delim(alz, delim = "\t", col_names = TRUE, comment = "!", skip = 1)

# Samples as rows, probes as columns; the first (ID) column is left out.
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL
213 227
214 ##Is there a clean version of the GPL file available? 228 ##Is there a clean version of the GPL file available?
## Is there a clean version of the GPL file available?
# Platform number = the digits in the GPL file's own name.
gplnum <- gsub("\\D", "", basename(genena))
clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")

# 1 when the cleaned table of exactly this platform is in the working
# directory, 0 otherwise. (A pattern search over list.files() also counted
# e.g. Clean_GPL9600.txt when looking for GPL96.)
clfileex <- as.integer(file.exists(clean_gpl_file))

# Number of lines to skip in a soft GPL file before its table header.
# Reads up to 1000 rows with "!" lines as comments, transposes them and
# takes the position of the cell that is exactly "ID", minus one.
# NOTE(review): the position equals the row number only when the head of
# the file is parsed as a single column - confirm for new platforms.
.gpl_header_skip <- function(path) {
  head_rows <- read_delim(path, delim = "\t", col_names = FALSE,
                          comment = "!", n_max = 1000)
  grep("^ID\\s*$", t(head_rows)) - 1
}

# Read a soft GPL table after `skip` lines and keep the ID column plus
# every column named like a gene-symbol column.
.read_gpl_symbols <- function(path, skip) {
  gpl_table <- read_delim(path, delim = "\t", col_names = TRUE,
                          comment = "!", skip = skip)
  dplyr::select(gpl_table, ID,
                grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$", colnames(gpl_table)))
}

if (clfileex >= 1) {
  # Use the clean version
  geneIDNam <- read_delim(clean_gpl_file, delim = "\t",
                          col_names = c("ID", "Symbol"), comment = "!")
} else {
  ## Let's create a clean version

  ## Gene ID to Gene Name
  if (soft) {
    # GPL_ID_LOC.txt remembers, per platform number, where the table starts.
    if (file.exists("GPL_ID_LOC.txt")) {
      known_gpl <- read_delim("GPL_ID_LOC.txt", delim = "\t", col_names = TRUE)

      # Exact comparison of the platform number; a substring search would
      # let GPL96 pick up the entry of GPL1096.
      known_row <- which(known_gpl$GPL_FILE_NUM == as.numeric(gplnum))

      if (length(known_row) > 0) {
        # Seen before. If the platform was recorded more than once, the
        # first entry is used.
        gpl_skip <- known_gpl$LOC_ID[known_row[1]]
      } else {
        # No information on this particular GPL file yet: find the header
        # and append it to the index.
        gpl_skip <- .gpl_header_skip(genena)
        cat(cbind(as.integer(gplnum), as.integer(gpl_skip)),
            file = "GPL_ID_LOC.txt", sep = "\t", fill = TRUE, append = TRUE)
      }
    } else {
      # We must create the index file so that it can be used later
      gpl_skip <- .gpl_header_skip(genena)
      Firstval <- cbind(as.integer(gplnum), as.integer(gpl_skip))
      colnames(Firstval) <- c("GPL_FILE_NUM", "LOC_ID")
      write.table(Firstval, file = "GPL_ID_LOC.txt", sep = "\t",
                  row.names = FALSE, col.names = TRUE)
    }
    geneIDNam <- .read_gpl_symbols(genena, gpl_skip)
  } else {
    # Plain (non-soft) GPL table: "#" lines are comments.
    geneIDNam <- read_delim(genena, delim = "\t", comment = "#")
    geneIDNam <- dplyr::select(
      geneIDNam, ID,
      grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$", colnames(geneIDNam))
    )
  }

  ## Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ## Remove the whitespace (result: two-column character matrix ID, name)
  geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1, ]), str_trim(t(geneIDNam)[2, ])))

  ## Here is the clean version
  write.table(geneIDNam, file = clean_gpl_file, sep = "\t",
              row.names = FALSE, col.names = FALSE)
}
294 304
295 305
296 306
297 ##Changing the gene ID to gene name 307 ##Changing the gene ID to gene name
# Translate the probe IDs; row 1 of the result holds one name per probe,
# in the column order of ALZDAT.
ALZDAT1 <- cgeneID(geneIDNam, alzdat)
colnames(ALZDAT) <- ALZDAT1[1, ]


## Adjusting the column names aka the gene names
# Keep a single name where a probe lists several separated by "///".
colnames(ALZDAT) <- gcnames(ALZDAT)


# Full RAW Data: sample metadata columns followed by the expression columns
Fullalzdwr <- cbind(ALZWORDF, as.data.frame(ALZDAT))


# Raw file is output
# Output name: "GSE" + the digits of the series matrix FILE NAME +
# "aftexcel.txt". basename() removes the directories first, so digits in
# folder names cannot end up in the name on "/"-separated paths.
nfnaex <- paste0("GSE", gsub("\\D", "", basename(alz)), "aftexcel.txt")
write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
320 330
321 331
322 #Now for the discretization part 332 #Now for the discretization part
323 ##get the wordy part again 333 ##get the wordy part again
324 rawword <- t(ALZWORDF) 334 rawword <- t(ALZWORDF)
325 335
326 ##where is ID_REF located 336 ##where is ID_REF located
327 hereim <- grep("ID_REF",rownames(rawword)) 337 hereim <- grep("ID_REF",rownames(rawword))
328 338
329 ##Subject Names GSM... 339 ##Subject Names GSM...
330 subjnam <- rawword[hereim,] 340 subjnam <- rawword[hereim,]
331 341
332 ##Getting the names for the rows 342 ##Getting the names for the rows
333 namedarows <- rownames(rawword)[-hereim] %>% 343 namedarows <- rownames(rawword)[-hereim] %>%
334 as.data.frame() 344 as.data.frame()
335 RAWWORD <- rawword[-hereim,] %>% 345 RAWWORD <- rawword[-hereim,] %>%
336 as.data.frame() %>% 346 as.data.frame() %>%
337 bind_cols(namedarows,.) 347 bind_cols(namedarows,.)
338 z <- 1 348 z <- 1
339 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 349 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
340 for(z in 1:dim(RAWWORD)[1]){ 350 for(z in 1:dim(RAWWORD)[1]){
341 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 351 if(sum(is.na(RAWWORD[z,])) > 0){
342 z <- z + 1 352 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
353 }
354 if(length(grep("NA",RAWWORD[z,])) > 0){
355 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
356 }
357 z <- z + 1
343 } 358 }
344 359
345 colnames(naroww) <- "ROW_NAs" 360 colnames(naroww) <- "ROW_NAs"
346 RAWWORD <- bind_cols(RAWWORD,naroww) 361 RAWWORD <- bind_cols(RAWWORD,naroww)
347 362
348 363
349 roALZna <- t(ALZDAT) %>% 364 roALZna <- t(ALZDAT) %>%
350 rownames(.) %>% 365 rownames(.) %>%
351 as.data.frame(.) 366 as.data.frame(.)
352 colnames(roALZna) <- "ID_REF" 367 colnames(roALZna) <- "ID_REF"
353 368
354 RAWDAT <- t(ALZDAT) %>% 369 RAWDAT <- t(ALZDAT) %>%
355 as.data.frame(.) 370 as.data.frame(.)
356 colnames(RAWDAT) <- NULL 371 colnames(RAWDAT) <- NULL
357 rownames(RAWDAT) <- NULL 372 rownames(RAWDAT) <- NULL
358 373
359 RAWDAT2 <- RAWDAT %>% 374 RAWDAT2 <- RAWDAT %>%
360 cbind(roALZna,.) %>% 375 cbind(roALZna,.) %>%
361 dplyr::arrange(.,ID_REF) 376 dplyr::arrange(.,ID_REF)
362 377
363 ##Editing the file for R processing 378 ##Editing the file for R processing
364 RAWDATID <- RAWDAT2[,1] %>% 379 RAWDATID <- RAWDAT2[,1] %>%
365 as.matrix(.) 380 as.matrix(.)
366 381
367 RAWDATNUM <- RAWDAT2[,-1] %>% 382 RAWDATNUM <- RAWDAT2[,-1] %>%
368 mapply(.,FUN = as.numeric) %>% 383 mapply(.,FUN = as.numeric) %>%
369 t(.) 384 t(.)
370 385
371 ##Consolidating genes with the same name 386 ##Consolidating genes with the same name
372 ###create empty matrix of size equal to tabRDATID 387 ###create empty matrix of size equal to tabRDATID
373 tabRDATID <- table(RAWDATID) 388 tabRDATID <- table(RAWDATID)
374 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 389 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
375 j <- 1 390 j <- 1
376 for(j in 1:length(tabRDATID)){ 391 for(j in 1:length(tabRDATID)){
377 392
378 ##Putting the ones without duplicates in their new homes 393 ##Putting the ones without duplicates in their new homes
379 if(tabRDATID[j] == 1){ 394 if(tabRDATID[j] == 1){
380 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 395 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
381 } 396 } else if(tabRDATID[j] > 1){
382 ##Averaging duplicates and putting them in their new homes 397 ##Averaging duplicates and putting them in their new homes
383 if(tabRDATID[j] > 1){
384 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 398 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
385 } 399 }
386 j <- j + 1 400 j <- j + 1
387 } 401 }
388 402
389 ##Scaling the Data 403 ##Scaling the Data
390 scrawdat <- NuRDATN%>% 404 scrawdat <- NuRDATN%>%
391 scale() 405 scale()
392 attr(scrawdat,"scaled:center") <- NULL 406 attr(scrawdat,"scaled:center") <- NULL
393 attr(scrawdat,"scaled:scale") <- NULL 407 attr(scrawdat,"scaled:scale") <- NULL
394 colnames(scrawdat) <- rownames(tabRDATID) 408 colnames(scrawdat) <- rownames(tabRDATID)
395 409
396 ##Discretized the Data 410 ##Discretized the Data
397 dialzdat <- scrawdat %>% 411 dialzdat <- scrawdat %>%
398 dndat(.) %>% 412 dndat(.) %>%
399 t()%>% 413 t()%>%
400 as.data.frame(.) 414 as.data.frame(.)
401 colnames(dialzdat) <- rownames(RAWDATNUM) 415 colnames(dialzdat) <- rownames(RAWDATNUM)
402 416
403 ##setting "ID_REF" as a new variable 417 ##setting "ID_REF" as a new variable
404 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1)) 418 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
405 colnames(geneNAM) <- "ID_REF" 419 colnames(geneNAM) <- "ID_REF"
406 rownames(dialzdat) <- NULL 420 rownames(dialzdat) <- NULL
407 dialzdat <-bind_cols(geneNAM,dialzdat) 421 dialzdat <-bind_cols(geneNAM,dialzdat)
408 422
409 ##NAs in a column 423 ##NAs in a column
410 x <- 2 424 x <- 2
411 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 425 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
412 nacol[1,1] = "COL_NAs" 426 nacol[1,1] = "COL_NAs"
413 for(x in 2:dim(dialzdat)[2]){ 427 for(x in 2:dim(dialzdat)[2]){
414 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 428 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
415 x <- x + 1 429 x <- x + 1
416 } 430 }
417 colnames(nacol) <- colnames(dialzdat) 431 colnames(nacol) <- colnames(dialzdat)
418 dialzdat<-bind_rows(dialzdat,nacol) 432 dialzdat<-bind_rows(dialzdat,nacol)
419 433
420 ##NAs in a row 434 ##NAs in a row
421 y <- 1 435 y <- 1
422 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 436 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
423 for(y in 1:dim(dialzdat)[1]){ 437 for(y in 1:dim(dialzdat)[1]){
424 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 438 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
425 y <- y + 1 439 y <- y + 1
426 } 440 }
427 colnames(narowd) <- "ROW_NAs" 441 colnames(narowd) <- "ROW_NAs"
428 dialzdat <- bind_cols(dialzdat,narowd) 442 dialzdat <- bind_cols(dialzdat,narowd)
429 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 443 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
430 colnames(RAWWORD) <- colnames(dialzdat) 444 colnames(RAWWORD) <- colnames(dialzdat)
431 ##converting to character so that the clinical can be brought together with discrete data 445 ##converting to character so that the clinical can be brought together with discrete data
432 k <- 2 446 k <- 2
433 for(k in 2:dim(dialzdat)[2]-1){ 447 for(k in 2:dim(dialzdat)[2]-1){
434 dialzdat[,k] <- as.character(dialzdat[,k]) 448 dialzdat[,k] <- as.character(dialzdat[,k])