# Commit 51d31a33543c4ec4f1fdeafafdf73f2e4aa4a036
#
# Authored by Efrain Gonzalez
# 1 parent 234f89c9aa
# Exists in master
#
# Update (UNTESTED)
#
# Showing 1 changed file with 1 addition and 1 deletion   Show diff stats
##Posted 6/15/2017


#Libraries required to run the code
#The load order is kept as-is: dplyr is attached after MASS, so dplyr's
#select() masks MASS::select().
library(pryr)
library(MASS)
library(dplyr)
library(tidyr)
library(readr)
library(stringr)


#Necessary Functions
#1#Function for handling the changing of row names and column names
#
# Renames the columns of a transposed series-matrix annotation block.
#
# mat: character matrix with column names already set. Row 1 holds the GEO
#      field labels ("!Sample_title", ...); row 2 holds the first sample's
#      values, which are used to recognise the characteristic columns.
#
# Returns `mat` with renamed columns. Columns matching none of the rules keep
# their existing name.
#
# Notes on the rules (kept exactly as the script has always applied them):
#  * "!Sample_geo_accession" columns become "ID_REF" and are not inspected
#    any further.
#  * Every other column is tested against ALL characteristic patterns in the
#    order Sex, PMI, Age, Braak, Group. Each pattern that matches bumps its
#    own counter, and the last match decides the final name (e.g. a
#    "braak stage" value also contains "age", so it consumes an Age number
#    before ending up as BraakN).
chngrownm <- function(mat) {
  fixed_labels <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title"
  )
  value_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  counters <- setNames(rep(1L, length(value_patterns)), names(value_patterns))
  has_value_row <- nrow(mat) >= 2

  # seq_len() keeps a zero-column matrix from entering the loop.
  for (j in seq_len(ncol(mat))) {
    label <- as.character(mat[1, j])
    # %in% is FALSE for NA, so a missing label cannot crash `if`.
    if (label %in% names(fixed_labels)) {
      colnames(mat)[j] <- fixed_labels[[label]]
    }
    if (label %in% "!Sample_geo_accession") {
      colnames(mat)[j] <- "ID_REF"
    } else {
      first_value <- if (has_value_row) mat[2, j] else NA
      for (tag in names(value_patterns)) {
        if (isTRUE(grepl(value_patterns[[tag]], first_value))) {
          colnames(mat)[j] <- paste0(tag, counters[[tag]])
          counters[[tag]] <- counters[[tag]] + 1L
        }
      }
    }
  }
  mat
}

#2#Function for reorganizing information within the columns
#
# Strips the "label: " prefixes from the clinical columns and converts the
# numeric ones. Which cleaning is applied depends on the column NAME:
#   Group* -> text after "label: " (and anything from " ...;..." on removed)
#   Age*   -> digits only, as integer
#   Sex*   -> text after "label: "
#   PMI*   -> digits and dots only, as numeric
#   Braak* -> text after "label: ", read as a roman numeral, as integer
#
# mat: data frame (or matrix) with named columns; column 1 is never touched.
#
# Returns `mat` with the recognised columns cleaned.
cinfo <- function(mat) {
  # seq_len(...)[-1] is empty for fewer than two columns, unlike 2:col which
  # would count backwards and visit column 1.
  for (j in seq_len(ncol(mat))[-1]) {
    column_name <- colnames(mat)[j]
    has_tag <- function(tag) isTRUE(grepl(tag, column_name))

    if (has_tag("Group")) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (has_tag("Age")) {
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (has_tag("Sex")) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (has_tag("PMI")) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (has_tag("Braak")) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}

#3#Function for labeling the gene IDs without names
#
# Fills in missing gene symbols with the probe ID of the same row.
#
# GIDNAM: data frame/tibble or matrix whose first column holds the IDs and
#         whose second column holds the gene symbols. A symbol counts as
#         missing when it is NA or the literal text "NA" (optionally followed
#         by whitespace).
#
# Returns `GIDNAM` with the missing symbols replaced. A zero-row input is
# returned unchanged (a 1:row loop would have touched row 1 of an empty
# table).
NAFIXING <- function(GIDNAM) {
  if (nrow(GIDNAM) == 0) {
    return(GIDNAM)
  }
  is_df <- is.data.frame(GIDNAM)
  ids <- if (is_df) GIDNAM[[1]] else GIDNAM[, 1]
  symbols <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]

  unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)
  if (any(unnamed)) {
    # Work on character vectors so an ID can always be stored as a symbol,
    # whatever type the two columns were read as.
    symbols <- as.character(symbols)
    symbols[unnamed] <- as.character(ids[unnamed])
    if (is_df) {
      GIDNAM[[2]] <- symbols
    } else {
      GIDNAM[, 2] <- symbols
    }
  }
  GIDNAM
}

#4#Function for changing the gene ID to gene name
#
# Replaces the probe IDs in the first row of DATA with their gene names.
#
# GeneName: matrix with one column per probe; row 1 = probe ID,
#           row 2 = gene name.
# DATA:     matrix whose first row holds probe IDs (other rows are left
#           untouched).
#
# Returns DATA with every first-row entry that equals a known probe ID
# replaced by the matching gene name; unknown IDs are kept as they are.
#
# IDs are compared literally with match(). Building a regular expression
# from each ID made IDs such as "A.1" also rewrite "AB1", and ran one full
# scan of DATA per probe.
cgeneID <- function(GeneName, DATA) {
  if (ncol(GeneName) == 0 || ncol(DATA) == 0) {
    return(DATA)
  }
  current_ids <- as.character(DATA[1, ])
  # When an ID is listed more than once, the first listing wins.
  lookup <- match(current_ids, as.character(GeneName[1, ]))
  # A missing value is never a probe ID, even if GeneName also contains NA.
  lookup[is.na(current_ids)] <- NA
  known <- !is.na(lookup)
  DATA[1, known] <- GeneName[2, lookup[known]]
  DATA
}

#5#Function for adjusting the gene names
#
# Picks one gene name out of each "NAME1 /// NAME2 /// ..." column name.
#
# DiData: matrix or data frame whose column names are gene-name lists
#         separated by "///".
# usecol: which entry of the list to use (default 1). When a list has fewer
#         entries than `usecol`, its first entry is used instead.
#
# Returns a character vector, one trimmed name per column of DiData.
gcnames <- function(DiData, usecol = 1) {
  if (ncol(DiData) == 0) {
    return(character(0))
  }
  name_lists <- strsplit(colnames(DiData), "///")
  vapply(
    name_lists,
    function(parts) {
      chosen <- if (length(parts) >= usecol) parts[usecol] else parts[1]
      trimws(chosen)
    },
    character(1)
  )
}

#6# Function for discretizing the data
#
# Maps scaled expression values onto three levels:
#   value < -1        -> 0 (low)
#   -1 <= value <= 1  -> 1 (normal)
#   value > 1         -> 2 (high)
# Missing values stay missing.
#
# NDATA: numeric matrix (or all-numeric data frame).
#
# Returns a numeric matrix of the same shape and column names.
#
# A value of exactly 1 is "normal". Testing `value < 1` left it in none of
# the three ranges, so it kept the initial fill value 0 and was reported as
# "low".
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)
  # Start from the input so NA/NaN cells carry over unchanged.
  DDATA <- matrix(as.numeric(values), nrow = nrow(values), ncol = ncol(values))
  colnames(DDATA) <- colnames(NDATA)

  # which() drops the NA comparisons, so only real numbers are recoded.
  low <- which(values < -1)
  normal <- which(values >= -1 & values <= 1)
  high <- which(values > 1)
  DDATA[low] <- 0
  DDATA[normal] <- 1
  DDATA[high] <- 2
  DDATA
}


#The Rest of this code will be used every time you want to change a data set

#Getting the series matrix file
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

#Getting the GPL file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()


#Find out if it is a soft GPL file or not
#The path is split on "\", "|" or "/" and only its last piece (the file
#name) is tested for "soft" or "annot".
soft <- strsplit(genena, "[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  grepl("soft|annot", .)

#Working with the wordy part of the document
#Keep only the "!Sample..." annotation rows, without the contact details.
alzword <- alz %>%
  read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
  dplyr::filter(grepl("!Sample", X1)) %>%
  dplyr::filter(!grepl("!Sample_contact", X1))

##Changing row names and column names:
#After transposing, row 1 holds the field labels and every later row is one
#sample.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
#drop = FALSE keeps a matrix even when a single sample row remains.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
#Columns that chngrownm() did not recognise still carry their default
#"col<n>" name and are discarded here.
ALZWORD <- ALZWORD %>%
  as.data.frame() %>%
  dplyr::select(-starts_with("col"))

##Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)


#Working with Actual Data part of file
#Lines starting with "!" are treated as comments, leaving the expression
#table whose first column is the probe ID.
alzdat <- alz %>%
  read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = 1)
#Samples in rows, probes in columns (the ID column is left out).
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL

##Is there a clean version of the GPL file available?
#The digits of the GPL file name identify the platform.
gplnum <- strsplit(genena, "[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D", "", .)
clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")
#Test for the exact file. A substring search over list.files() also matched
#e.g. Clean_GPL960.txt when looking for GPL96, and then tried to read a
#file that does not exist.
clfileex <- as.integer(file.exists(clean_gpl_file))

#Finds how many lines precede the table header of a soft GPL file: the
#position of the first cell that is exactly "ID" in the first 1000 rows,
#minus one.
#NOTE(review): the position is a cell index of the transposed table, so it
#equals a row number only when that preview is read as a single column -
#confirm against a real soft file.
locate_gpl_header <- function(path) {
  id_hits <- path %>%
    read_delim(delim = "\t", col_names = FALSE, comment = "!", n_max = 1000) %>%
    t(.) %>%
    grep("^ID\\s*$", .)
  if (length(id_hits) == 0) {
    stop("No 'ID' header cell found in the first 1000 rows of ", path,
         call. = FALSE)
  }
  id_hits[1] - 1
}

#Reads the ID column plus every Symbol / ORF / gene_assignment column of a
#soft GPL file, skipping `skip` lines first.
read_gpl_symbols <- function(path, skip) {
  path %>%
    read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = skip) %>%
    dplyr::select(., ID, grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$", colnames(.)))
}

if (clfileex >= 1) {
  #use the clean version
  geneIDNam <- clean_gpl_file %>%
    read_delim(delim = "\t", col_names = c("ID", "Symbol"), comment = "!")
}
if (clfileex == 0) {
  ##Lets Create a clean version

  ##Gene ID to Gene Name
  if (soft == TRUE) {
    #GPL_ID_LOC.txt remembers, per GPL number, how many lines to skip
    id_loc_file <- "GPL_ID_LOC.txt"
    if (file.exists(id_loc_file)) {
      #Check to see if this GPL soft file has been used before.
      #The number is compared exactly, so GPL96 can no longer be confused
      #with GPL960, and more than one hit no longer leaves geneIDNam undefined.
      id_loc_table <- read_delim(id_loc_file, delim = "\t", col_names = TRUE)
      IDLOCAL <- which(id_loc_table$GPL_FILE_NUM == as.integer(gplnum))
      if (length(IDLOCAL) > 0) {
        idLOCGPL <- id_loc_table$LOC_ID[IDLOCAL[1]]
      } else {
        #No information on this particular GPL file: find it and remember it
        idLOCGPL <- locate_gpl_header(genena)
        cat(c(as.integer(gplnum), as.integer(idLOCGPL)),
            file = id_loc_file, sep = "\t", fill = TRUE, append = TRUE)
      }
    } else {
      #We must create a file that we can access for later use
      idLOCGPL <- locate_gpl_header(genena)
      Firstval <- cbind(as.integer(gplnum), as.integer(idLOCGPL))
      colnames(Firstval) <- c("GPL_FILE_NUM", "LOC_ID")
      write.table(Firstval, file = id_loc_file, sep = "\t",
                  row.names = FALSE, col.names = TRUE)
    }
    geneIDNam <- read_gpl_symbols(genena, idLOCGPL)
  }
  if (soft == FALSE) {
    geneIDNam <- genena %>%
      read_delim(delim = "\t", comment = "#") %>%
      dplyr::select(., ID, grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$", colnames(.)))
  }

  ##Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ##remove the whitespace
  #Result is a two-column character matrix: ID, gene name.
  gene_id_rows <- t(geneIDNam)
  geneIDNam <- cbind(str_trim(gene_id_rows[1, ]), str_trim(gene_id_rows[2, ]))

  ##Here is the clean version
  write.table(geneIDNam, file = clean_gpl_file, sep = "\t",
              row.names = FALSE, col.names = FALSE)
}



##Changing the gene ID to gene name
#Row 1 of t(alzdat) holds the probe IDs; cgeneID() swaps in the gene names.
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]


##Adjusting the column names aka the gene names
#Keeps the first name of every "A /// B" list.
colnames(ALZDAT) <- gcnames(ALZDAT)


#Full RAW Data
#Clinical columns first, then one column per gene.
Fullalzdwr <- ALZDAT %>%
  as.data.frame() %>%
  cbind(ALZWORDF, .)


#Raw file is output
#The output name is "GSE<digits of the input file name>aftexcel.txt".
#The path is split on "\", "|" and "/" like everywhere else in this script;
#splitting on "\" alone kept the whole path on "/" systems, so digits from
#directory names leaked into the file name.
nfnaex <- strsplit(alz, "[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D", "", .) %>%
  c("GSE", ., "aftexcel.txt") %>%
  paste(collapse = "")
write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")


#Now for the discretization part
##get the wordy part again
#One row per clinical variable, one column per sample.
rawword <- t(ALZWORDF)

##where is ID_REF located
#The variable names are the ROW NAMES of rawword. Searching the first
#column looked at the first sample's values, found nothing, and the empty
#result made rawword[-hereim, ] select no rows at all.
hereim <- grep("ID_REF", rownames(rawword))

##Subject Names GSM...
subjnam <- rawword[hereim, ]

##Getting the names for the rows
namedarows <- rownames(rawword)[-hereim] %>%
  as.data.frame()
#drop = FALSE keeps the table shape when a single clinical row is left.
RAWWORD <- rawword[-hereim, , drop = FALSE] %>%
  as.data.frame() %>%
  bind_cols(namedarows, .)

#Number of missing values in every clinical row
naroww <- data.frame(ROW_NAs = as.numeric(unname(rowSums(is.na(RAWWORD)))))
RAWWORD <- bind_cols(RAWWORD, naroww)


#Gene names (the row names of the transposed data) as an "ID_REF" column
roALZna <- t(ALZDAT) %>%
  rownames(.) %>%
  as.data.frame(.)
colnames(roALZna) <- "ID_REF"

#Expression values with genes in rows and samples in columns
RAWDAT <- t(ALZDAT) %>%
  as.data.frame(.)
colnames(RAWDAT) <- NULL
rownames(RAWDAT) <- NULL

#Sorted by gene name so that duplicated names end up next to each other
RAWDAT2 <- RAWDAT %>%
  cbind(roALZna, .) %>%
  dplyr::arrange(., ID_REF)

##Editing the file for R processing
RAWDATID <- RAWDAT2[, 1] %>%
  as.matrix(.)

#Numeric matrix with samples in rows and genes in columns.
#drop = FALSE keeps a data frame even when there is a single sample column.
RAWDATNUM <- RAWDAT2[, -1, drop = FALSE] %>%
  mapply(., FUN = as.numeric) %>%
  t(.)

##Consolidating genes with the same name
###create empty matrix of size equal to tabRDATID
tabRDATID <- table(RAWDATID)
NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
#Column positions of every gene name, found in one pass instead of scanning
#all names once per gene.
probe_cols <- split(seq_along(RAWDATID), as.vector(RAWDATID))
#split() and table() are both expected to order the names by sorted factor
#level; stop rather than misalign columns if that ever differs.
stopifnot(identical(names(probe_cols), names(tabRDATID)))
for (j in seq_along(tabRDATID)) {
  same_gene <- probe_cols[[j]]

  ##Putting the ones without duplicates in their new homes
  if (length(same_gene) == 1) {
    NuRDATN[, j] <- RAWDATNUM[, same_gene]
  }
  ##Averaging duplicates and putting them in their new homes
  if (length(same_gene) > 1) {
    #drop = FALSE keeps a matrix for rowMeans() even with a single sample.
    NuRDATN[, j] <- rowMeans(RAWDATNUM[, same_gene, drop = FALSE], na.rm = TRUE)
  }
}

##Scaling the Data
#scale() centres and scales every gene (column); its bookkeeping attributes
#are removed so only the plain matrix is left.
scrawdat <- NuRDATN %>%
  scale()
attr(scrawdat, "scaled:center") <- NULL
attr(scrawdat, "scaled:scale") <- NULL
colnames(scrawdat) <- rownames(tabRDATID)

##Discretized the Data
#After t() the genes are in rows and the samples in columns.
dialzdat <- scrawdat %>%
  dndat(.) %>%
  t() %>%
  as.data.frame(.)
colnames(dialzdat) <- rownames(RAWDATNUM)

##setting "ID_REF" as a new variable
geneNAM <- as.data.frame(as.matrix(rownames(dialzdat), ncol = 1))
colnames(geneNAM) <- "ID_REF"
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(geneNAM, dialzdat)

##NAs in a column
#One extra row labelled "COL_NAs" holding the NA count of every sample column
nacol <- as.data.frame(t(rep(0, dim(dialzdat)[2])), stringsAsFactors = FALSE)
nacol[1, 1] <- "COL_NAs"
for (col_idx in seq_len(ncol(dialzdat))[-1]) {
  nacol[1, col_idx] <- as.integer(sum(is.na(dialzdat[[col_idx]])))
}
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

##NAs in a row
#Counted after the COL_NAs row was added, so that row gets a count as well
narowd <- data.frame(ROW_NAs = as.numeric(unname(rowSums(is.na(dialzdat)))))
dialzdat <- bind_cols(dialzdat, narowd)
colnames(dialzdat)[2:(dim(dialzdat)[2] - 1)] <- subjnam
colnames(RAWWORD) <- colnames(dialzdat)
##converting to character so that the clinical can be brought together with discrete data
#Every column except the last (ROW_NAs) is converted. The earlier range
#"2:dim(dialzdat)[2]-1" parses as (2:n)-1, i.e. columns 1 to n-1, so the same
#columns are written out explicitly here.
for (k in seq_len(ncol(dialzdat) - 1)) {
  dialzdat[[k]] <- as.character(dialzdat[[k]])
}
#The End the full data
Dscrtalzdw <- bind_rows(RAWWORD, dialzdat)

#Produces Discrete file
#The output name is "GSE<digits of the series matrix file name>dscrt.txt".
#The name is built from `alz`, the series matrix path chosen at the start;
#`rawdat` is never defined in this script.
nfnaex <- strsplit(alz, "[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D", "", .) %>%
  c("GSE", ., "dscrt.txt") %>%
  paste(collapse = "")
write.table(Dscrtalzdw, file = nfnaex, sep = "\t", col.names = TRUE, row.names = FALSE)

