Commit 234f89c9aa9ae39e4b3f156e36f0ccf51282a363

Authored by Efrain Gonzalez
1 parent 0eb3420561
Exists in master

Outputs raw and discretized data files (UNTESTED)

Showing 1 changed file with 3 additions and 0 deletions   Show diff stats
1 ##Posted 6/15/2017
2
3
1 #Libraries required to run the code 4 #Libraries required to run the code
2 library(pryr) 5 library(pryr)
3 library(MASS) 6 library(MASS)
4 library(dplyr) 7 library(dplyr)
5 library(tidyr) 8 library(tidyr)
6 library(readr) 9 library(readr)
7 library(stringr) 10 library(stringr)
8 11
9 12
10 #Necessary Functions 13 #Necessary Functions
#1# Function for handling the changing of row names and column names
#
# Renames the columns of a metadata table according to what each column holds.
#
# Arguments:
#   mat  Matrix (or data frame) with at least two rows. Row 1 is compared
#        against GEO field names ("!Sample_title", ...); row 2 is searched for
#        keywords that identify clinical variables.
#
# Returns:
#   `mat` with updated column names. Cell contents are not modified.
#
# Naming rules (applied in this order, later rules overwrite earlier ones):
#   row 1 == "!Sample_source_name_ch1" -> "Brain_Region"
#   row 1 == "!Sample_title"           -> "Title"
#   row 1 == "!Sample_geo_accession"   -> "ID_REF"
#   otherwise row 2 is matched against the Sex / PMI / Age / Braak / Group
#   patterns and the column becomes "<Label><running index>".
#
# NOTE(review): the keyword rules are skipped only for the ID_REF column, so a
# "Title" or "Brain_Region" column whose second row contains a keyword (for
# example "AD") is renamed by the keyword rule. A counter is also advanced by
# every match, even when a later rule overwrites the name (e.g. "braak stage"
# matches "age" first). Both behaviours are kept as they were -- confirm
# whether they are intended.
chngrownm <- function(mat) {
  # Fall back to "col1", "col2", ... when the input has no column names.
  new_names <- colnames(mat, do.NULL = FALSE)

  # Keyword patterns, in the order in which they are tested.
  patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control"
  )
  # Next index to hand out for each label.
  counters <- c(Sex = 1L, PMI = 1L, Age = 1L, Braak = 1L, Group = 1L)

  # seq_len() keeps the loop from running on a zero-column input.
  for (j in seq_len(ncol(mat))) {
    header <- mat[1, j]

    # isTRUE() treats a missing header cell as "no match" instead of failing.
    if (isTRUE(header == "!Sample_source_name_ch1")) {
      new_names[j] <- "Brain_Region"
    }
    if (isTRUE(header == "!Sample_title")) {
      new_names[j] <- "Title"
    }
    if (isTRUE(header == "!Sample_geo_accession")) {
      new_names[j] <- "ID_REF"
    } else {
      for (label in names(patterns)) {
        if (grepl(patterns[[label]], mat[2, j])) {
          new_names[j] <- paste0(label, counters[[label]])
          counters[[label]] <- counters[[label]] + 1L
        }
      }
    }
  }

  if (length(new_names) > 0) {
    colnames(mat) <- new_names
  }
  mat
}
57 60
#2# Function for reorganizing information within the columns
#
# Cleans the cell contents of every column from the second one onwards, based
# on the column name.
#
# Arguments:
#   mat  Data frame (or matrix) whose column names contain "Group", "Age",
#        "Sex", "PMI" or "Braak" for the columns that should be cleaned.
#
# Returns:
#   `mat` with the matching columns rewritten:
#     Group  text up to and including ": " removed, and any " ...;..." tail
#     Age    every non-digit removed, then converted to integer
#     Sex    text up to and including ": " removed
#     PMI    everything except digits and "." removed, converted to numeric
#     Braak  text up to and including ": " removed, read as a roman numeral
#            and converted to integer
#   Column 1 is never touched. Inputs with fewer than two columns are returned
#   unchanged.
#
# NOTE(review): the Age rule drops the decimal point too, so "80.5" becomes
# 805 -- confirm that ages are always whole numbers.
cinfo <- function(mat) {
  n_col <- ncol(mat)
  # 2:n_col would count backwards for n_col < 2, so stop here instead.
  if (is.null(n_col) || n_col < 2L) {
    return(mat)
  }

  col_names <- colnames(mat)
  for (j in 2:n_col) {
    # isTRUE() makes a missing or absent column name count as "no match".
    current <- col_names[j]

    if (isTRUE(grepl("Group", current))) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (isTRUE(grepl("Age", current))) {
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (isTRUE(grepl("Sex", current))) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (isTRUE(grepl("PMI", current))) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (isTRUE(grepl("Braak", current))) {
      mat[, j] <- as.integer(utils::as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
86 89
#3# Function for labeling the gene IDs without names
#
# Fills in missing gene names with the corresponding gene ID.
#
# Arguments:
#   GIDNAM  Two-column table (data frame, tibble or matrix): column 1 holds
#           the IDs, column 2 the names.
#
# Returns:
#   `GIDNAM` where every entry of column 2 that is NA, or is the text "NA"
#   optionally followed by whitespace, has been replaced by the value of
#   column 1 in the same row. Factor name columns come back as character so
#   that IDs which are not existing levels can be stored.
NAFIXING <- function(GIDNAM) {
  is_df <- is.data.frame(GIDNAM)
  ids <- if (is_df) GIDNAM[[1]] else GIDNAM[, 1]
  symbols <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]

  if (is.factor(ids)) {
    ids <- as.character(ids)
  }
  if (is.factor(symbols)) {
    symbols <- as.character(symbols)
  }

  # One vectorised pass replaces the row-by-row loop and also copes with a
  # table that has no rows.
  unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)
  symbols[unnamed] <- ids[unnamed]

  if (is_df) {
    GIDNAM[[2]] <- symbols
  } else {
    GIDNAM[, 2] <- symbols
  }
  GIDNAM
}
99 102
#4# Function for changing the gene ID to gene name
#
# Replaces the IDs in the first row of `DATA` with their gene names.
#
# Arguments:
#   GeneName  Two-row matrix: row 1 holds the IDs, row 2 the matching names.
#   DATA      Matrix whose first row holds the IDs to translate.
#
# Returns:
#   `DATA` with every first-row entry that equals an ID in `GeneName[1, ]`
#   replaced by the corresponding entry of `GeneName[2, ]`. Entries without a
#   matching ID, and all other rows, are left as they are.
#
# IDs are compared for exact equality. Building a regular expression from each
# ID let characters such as "." act as wildcards (ID "a.b" also rewrote
# "axb"), turned a missing ID into the pattern "^NA$", and needed one scan of
# `DATA` per ID. When an ID occurs more than once in `GeneName`, its first
# occurrence is used.
# NOTE(review): every entry is now translated at most once; with the old
# sequential replacement a gene name that was itself a later ID would have
# been translated a second time -- confirm nothing relied on that.
cgeneID <- function(GeneName, DATA) {
  if (NCOL(GeneName) == 0L || NCOL(DATA) == 0L) {
    return(DATA)
  }

  ids <- as.character(unlist(GeneName[1, ], use.names = FALSE))
  gene_names <- unlist(GeneName[2, ], use.names = FALSE)
  current <- as.character(unlist(DATA[1, ], use.names = FALSE))

  # incomparables = NA keeps a missing ID from matching anything.
  hit <- match(current, ids, incomparables = NA)
  found <- !is.na(hit)
  if (any(found)) {
    DATA[1, found] <- gene_names[hit[found]]
  }
  DATA
}
119 122
#5# Function for adjusting the gene names
#
# Picks one gene name out of column names that list several names separated
# by "///".
#
# Arguments:
#   DiData  Matrix or data frame whose column names are to be shortened.
#   usecol  Which of the "///"-separated names to keep (default: the first).
#
# Returns:
#   Character vector with one entry per column: the `usecol`-th name of that
#   column, or the first name when the column has fewer than `usecol` names,
#   with leading and trailing whitespace removed.
#
# NOTE(review): whitespace is trimmed with base trimws() (spaces, tabs, line
# breaks) so that this function does not depend on stringr being attached.
gcnames <- function(DiData, usecol = 1) {
  col_names <- colnames(DiData)
  if (is.null(col_names)) {
    if (NCOL(DiData) > 0L) {
      stop("DiData must have column names", call. = FALSE)
    }
    return(character(0))
  }

  # Split all names in one call instead of twice per column.
  pieces <- strsplit(col_names, "///")
  vapply(
    pieces,
    function(parts) {
      pick <- if (length(parts) >= usecol) usecol else 1
      trimws(parts[pick])
    },
    character(1),
    USE.NAMES = FALSE
  )
}
136 139
#6# Function for discretizing the data
#
# Maps every value of a numeric table onto three levels.
#
# Arguments:
#   NDATA  Numeric matrix (or all-numeric data frame).
#
# Returns:
#   Numeric matrix with the dimensions and column names of `NDATA`:
#     0  where the value is below -1
#     1  where the value lies in [-1, 1]
#     2  where the value is above 1
#   Missing values (NA / NaN) are carried over unchanged.
#
# A value of exactly 1 used to match none of the three conditions and was
# left at the initial 0; it now falls into the middle level like -1 does.
dndat <- function(NDATA) {
  vals <- as.matrix(NDATA)

  # Start from the middle level and overwrite the two tails.
  DDATA <- matrix(1, nrow = nrow(vals), ncol = ncol(vals))
  colnames(DDATA) <- colnames(NDATA)

  # which() drops the NA comparisons, so missing cells are handled last.
  DDATA[which(vals < -1)] <- 0
  DDATA[which(vals > 1)] <- 2

  missing_cells <- which(is.na(vals))
  DDATA[missing_cells] <- vals[missing_cells]
  DDATA
}
167 170
168 171
169 #The Rest of this code will be used every time you want to change a data set 172 #The Rest of this code will be used every time you want to change a data set
170 173
# Getting the series matrix file
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

# Getting the GPL file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()

# Find out if it is a soft GPL file or not:
# TRUE when the file name (without its directory) contains "soft" or "annot".
soft <- grepl("soft|annot", basename(genena))
185 188
# Working with the wordy part of the document
# Read the series matrix without a header, dropping lines that start with
# "!Series", and keep the rows whose first field contains "!Sample" but not
# "!Sample_contact".
alzword <- alz %>%
  read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
  filter(grepl("!Sample", X1)) %>%
  filter(!grepl("!Sample_contact", X1))

## Changing row names and column names:
# After transposing, row 1 holds the "!Sample_..." field names that chngrownm()
# inspects; that row is dropped once the columns have been named.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
# drop = FALSE keeps a matrix even when only one sample row remains.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
# Columns that kept their default "col<n>" name were not recognised and are
# removed.
ALZWORD <- ALZWORD %>%
  as.data.frame() %>%
  dplyr::select(-starts_with("col"))

## Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)
203 206
204 207
# Working with Actual Data part of file
# Lines starting with "!" are treated as comments; the first remaining line
# supplies the column names.
alzdat <- alz %>%
  read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = 1)

# Transpose everything except the first column, then drop the row names.
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL
210 213
## Is there a clean version of the GPL file available?
# The GPL number is every digit found in the name of the chosen GPL file.
gplnum <- gsub("\\D", "", basename(genena))
clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")
gpl_symbol_cols <- "Symbol|^ORF\\s*$|^gene_assignment\\s*$"

# Test for the exact file: a substring search over list.files() also accepted
# "Clean_GPL570.txt" when looking for GPL57 and then failed to read the file.
if (file.exists(clean_gpl_file)) {
  # use the clean version
  geneIDNam <- clean_gpl_file %>%
    read_delim(delim = "\t", col_names = c("ID", "Symbol"), comment = "!")
} else {
  ## Lets Create a clean version

  ## Gene ID to Gene Name
  if (soft) {
    # GPL_ID_LOC.txt remembers, per GPL number, how many lines to skip to
    # reach the table header of a soft file.
    loc_file <- "GPL_ID_LOC.txt"
    loc_known <- file.exists(loc_file)

    skip_lines <- numeric(0)
    if (loc_known) {
      # Check to see if this GPL soft file has been used before. The number is
      # compared for equality so that GPL57 does not pick up the GPL570 entry.
      loc_tbl <- read_delim(loc_file, delim = "\t", col_names = TRUE)
      seen <- which(loc_tbl$GPL_FILE_NUM == as.integer(gplnum))
      # If the GPL is listed more than once, the first entry is used.
      skip_lines <- loc_tbl$LOC_ID[seen][1][length(seen) > 0]
    }

    if (length(skip_lines) == 0) {
      # No information on this particular GPL file yet: look for the cell that
      # is exactly "ID" within the first 1000 parsed rows.
      preview <- genena %>%
        read_delim(delim = "\t", col_names = FALSE, comment = "!", n_max = 1000)
      # NOTE(review): this is a position in the transposed preview, as in the
      # original pipeline; it equals the header row number only when the
      # preview is parsed as a single column -- TODO confirm.
      header_at <- grep("^ID\\s*$", t(preview))
      if (length(header_at) == 0) {
        stop("No 'ID' header found in the first 1000 rows of ", genena,
             call. = FALSE)
      }
      skip_lines <- header_at[1] - 1

      # Remember the location for later runs.
      loc_record <- cbind(as.integer(gplnum), as.integer(skip_lines))
      if (loc_known) {
        cat(loc_record, file = loc_file, sep = "\t", fill = TRUE, append = TRUE)
      } else {
        # We must create a file that we can access for later use
        colnames(loc_record) <- c("GPL_FILE_NUM", "LOC_ID")
        write.table(loc_record, file = loc_file, sep = "\t",
                    row.names = FALSE, col.names = TRUE)
      }
    }

    geneIDNam <- genena %>%
      read_delim(delim = "\t", col_names = TRUE, comment = "!",
                 skip = skip_lines) %>%
      dplyr::select(., ID, grep(gpl_symbol_cols, colnames(.)))
  } else {
    geneIDNam <- genena %>%
      read_delim(delim = "\t", comment = "#") %>%
      dplyr::select(., ID, grep(gpl_symbol_cols, colnames(.)))
  }

  ## Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ## remove the whitespace
  geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1, ]), str_trim(t(geneIDNam)[2, ])))

  ## Here is the clean version
  write.table(geneIDNam, file = clean_gpl_file, sep = "\t",
              row.names = FALSE, col.names = FALSE)
}
291 294
292 295
293 296
## Changing the gene ID to gene name
# Only the translated first row of the transposed table is used: it becomes
# the column names of the expression matrix.
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]

## Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)

# Full RAW Data: clinical columns first, expression columns after them.
Fullalzdwr <- ALZDAT %>%
  as.data.frame() %>%
  cbind(ALZWORDF, .)

# Raw file is output
# The file is named "GSE<digits of the series matrix file name>aftexcel.txt".
# basename() removes the directory for both "/" and "\" style paths; splitting
# on "\" alone let digits from directory names leak into the file name.
nfnaex <- paste0("GSE", gsub("\\D", "", basename(alz)), "aftexcel.txt")
write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
317 320
318 321
# Now for the discretization part
## get the wordy part again
# After transposing, each row is one clinical variable and each column one
# sample; the variable names are the row names.
rawword <- t(ALZWORDF)

## where is ID_REF located
# The label lives in the row names. Searching the first column (the first
# sample's values) finds nothing, and indexing with -integer(0) then drops
# every row.
hereim <- grep("ID_REF", rownames(rawword))
if (length(hereim) != 1) {
  stop("Expected exactly one 'ID_REF' row in the clinical data, found ",
       length(hereim), call. = FALSE)
}

## Subject Names GSM...
subjnam <- rawword[hereim, ]

## Getting the names for the rows
namedarows <- rownames(rawword)[-hereim] %>%
  as.data.frame(stringsAsFactors = FALSE)
# drop = FALSE keeps the matrix shape when there is a single sample.
RAWWORD <- rawword[-hereim, , drop = FALSE] %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  bind_cols(namedarows, .)

## Number of missing values in every row
naroww <- data.frame(ROW_NAs = as.numeric(unname(rowSums(is.na(RAWWORD)))))
RAWWORD <- bind_cols(RAWWORD, naroww)
344 347
345 348
## Gene names of the expression matrix, one per row after transposing
roALZna <- t(ALZDAT) %>%
  rownames(.) %>%
  as.data.frame(.)
colnames(roALZna) <- "ID_REF"

RAWDAT <- t(ALZDAT) %>%
  as.data.frame(.)
colnames(RAWDAT) <- NULL
rownames(RAWDAT) <- NULL

## Gene name in column 1, values after it, sorted by gene name
RAWDAT2 <- RAWDAT %>%
  cbind(roALZna, .) %>%
  dplyr::arrange(., ID_REF)

## Editing the file for R processing
RAWDATID <- RAWDAT2[, 1] %>%
  as.matrix(.)

# drop = FALSE keeps a data frame when there is only one value column.
RAWDATNUM <- RAWDAT2[, -1, drop = FALSE] %>%
  mapply(., FUN = as.numeric) %>%
  t(.)

## Consolidating genes with the same name
### create empty matrix of size equal to tabRDATID
tabRDATID <- table(RAWDATID)
NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

# Columns of RAWDATNUM belonging to each gene name, in the order of tabRDATID.
# Grouping once replaces a full scan of RAWDATID for every gene name.
gene_labels <- rownames(tabRDATID)
probe_cols <- split(
  seq_along(RAWDATID),
  factor(as.vector(RAWDATID), levels = gene_labels)
)

for (j in seq_along(tabRDATID)) {
  cols <- probe_cols[[j]]

  ## Putting the ones without duplicates in their new homes
  if (length(cols) == 1) {
    NuRDATN[, j] <- RAWDATNUM[, cols]
  }
  ## Averaging duplicates and putting them in their new homes
  if (length(cols) > 1) {
    NuRDATN[, j] <- rowMeans(RAWDATNUM[, cols, drop = FALSE], na.rm = TRUE)
  }
}
385 388
## Scaling the Data
# scale() centres and scales every column; its bookkeeping attributes are
# removed so that a plain matrix remains.
scrawdat <- scale(NuRDATN)
attr(scrawdat, "scaled:center") <- NULL
attr(scrawdat, "scaled:scale") <- NULL
colnames(scrawdat) <- rownames(tabRDATID)

## Discretized the Data
# After transposing, every row is a gene and every column a row of RAWDATNUM.
dialzdat <- scrawdat %>%
  dndat(.) %>%
  t() %>%
  as.data.frame(.)
colnames(dialzdat) <- rownames(RAWDATNUM)

## setting "ID_REF" as a new variable
# Move the gene names from the row names into a leading "ID_REF" column.
geneNAM <- as.data.frame(as.matrix(rownames(dialzdat)))
colnames(geneNAM) <- "ID_REF"
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(geneNAM, dialzdat)
405 408
## NAs in a column
# One extra row labelled "COL_NAs" holding the number of missing values of
# every column after the first.
nacol <- as.data.frame(t(rep(0, dim(dialzdat)[2])), stringsAsFactors = FALSE)
nacol[1, 1] <- "COL_NAs"
for (x in seq_len(ncol(dialzdat))[-1]) {
  nacol[1, x] <- as.integer(sum(is.na(dialzdat[[x]])))
}
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

## NAs in a row
# One extra column "ROW_NAs" holding the number of missing values per row
# (the COL_NAs row included).
narowd <- data.frame(ROW_NAs = as.numeric(unname(rowSums(is.na(dialzdat)))))
dialzdat <- bind_cols(dialzdat, narowd)

# Every column between ID_REF and ROW_NAs is named after its subject.
colnames(dialzdat)[2:(dim(dialzdat)[2] - 1)] <- subjnam
colnames(RAWWORD) <- colnames(dialzdat)

## converting to character so that the clinical can be brought together with discrete data
# Every column except the last one (ROW_NAs) is converted. The original range
# `2:dim(dialzdat)[2]-1` evaluates to 1:(n - 1) as well, because `:` binds
# tighter than `-`; the range is now written out explicitly.
for (k in seq_len(ncol(dialzdat) - 1L)) {
  dialzdat[[k]] <- as.character(dialzdat[[k]])
}

# The End the full data
Dscrtalzdw <- bind_rows(RAWWORD, dialzdat)
436 439
# Produces Discrete file
# The file is named "GSE<digits of the series matrix file name>dscrt.txt".
# The name is derived from `alz`, the chosen series matrix path; the variable
# `rawdat` used here before is not defined anywhere in this script.
nfnaex <- paste0("GSE", gsub("\\D", "", basename(alz)), "dscrt.txt")
write.table(Dscrtalzdw, file = nfnaex, sep = "\t",
            col.names = TRUE, row.names = FALSE)
445 448
446 449