Commit b2053f56b69da1d68abb902a7ec37884a79c7e8a

Authored by Efrain Gonzalez
1 parent f2cad6d272
Exists in master

Updating handling of strings in as.data.frame function

Showing 1 changed file with 7 additions and 7 deletions   Show diff stats
1 ##Posted 6/15/2017 1 ##Posted 6/15/2017
2 options(digits = 11) 2 options(digits = 11)
3 3
4 #Libraries required to run the code 4 #Libraries required to run the code
5 library(pryr) 5 library(pryr)
6 library(MASS) 6 library(MASS)
7 library(dplyr) 7 library(dplyr)
8 library(tidyr) 8 library(tidyr)
9 library(readr) 9 library(readr)
10 library(stringr) 10 library(stringr)
11 11
12 12
13 #Necessary Functions 13 #Necessary Functions
#1#Function for handling the changing of row names and column names
# Renames the columns of `mat` from the labels it carries in its first rows.
#
# mat: matrix (or data frame) with column names already set. Row 1 holds the
#      "!Sample_..." field label of each column; row 2 holds an example value
#      that is searched for trait keywords.
# Returns `mat` with the same contents and updated column names.
#
# Columns whose row-1 label is one of the three fixed labels get a fixed name.
# Every other column is tested against ALL trait patterns in order; each match
# consumes the next number of that trait, and the last matching trait gives
# the column its name. Columns that match nothing keep their name.
chngrownm <- function(mat) {
  fixed_labels <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )
  trait_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  # One running counter per trait, all starting at 1.
  counters <- rep(1L, length(trait_patterns))
  names(counters) <- names(trait_patterns)

  # Without a second row there is no example value to inspect.
  has_example <- nrow(mat) >= 2L

  # seq_len() keeps a zero-column input from being indexed as 1:0.
  for (j in seq_len(ncol(mat))) {
    label <- as.character(mat[1, j])

    # %in% is FALSE for NA, so a missing label cannot break the `if`.
    if (label %in% names(fixed_labels)) {
      colnames(mat)[j] <- fixed_labels[[label]]
      next
    }
    if (!has_example) {
      next
    }

    for (trait in names(trait_patterns)) {
      # grepl() returns FALSE for an NA example value.
      if (grepl(trait_patterns[[trait]], mat[2, j])) {
        colnames(mat)[j] <- paste0(trait, counters[[trait]])
        counters[[trait]] <- counters[[trait]] + 1L
      }
    }
  }
  mat
}
58 58
#2#Function for reorganizing information within the columns
# Cleans the values of every column except the first, based on the column name.
#
# mat: data frame (or matrix) whose column names may contain "Group", "Age",
#      "Sex", "PMI" or "Braak" (as produced by chngrownm).
# Returns `mat` with the matching columns rewritten:
#   Group - text up to ": " removed, and any " ...;..." tail removed
#   Age   - all non-digits removed, converted to integer
#   Sex   - text up to ": " removed
#   PMI   - everything except digits and "." removed, converted to numeric
#   Braak - text up to ": " removed, read as a roman numeral, as integer
cinfo <- function(mat) {
  # Skip column 1. seq_len()[-1] is empty for a one-column input, whereas
  # 2:col would count down to 2, 1 and rewrite the first column.
  for (j in seq_len(ncol(mat))[-1]) {
    col_name <- colnames(mat)[j]

    if (grepl("Group", col_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", col_name)) {
      # NOTE(review): "\\D" also strips a decimal point ("80.5" -> 805);
      # confirm ages are always whole numbers.
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", col_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", col_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", col_name)) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
87 87
#3#Function for labeling the gene IDs without names
# Fills missing gene names with the gene ID of the same row.
#
# GIDNAM: two-column table (data frame, tibble or matrix); column 1 holds the
#         ID and column 2 the name.
# Returns GIDNAM where every column-2 entry that is NA, or is the text "NA"
# followed only by whitespace, has been replaced by the column-1 value.
NAFIXING <- function(GIDNAM) {
  is_df <- is.data.frame(GIDNAM)
  ids <- if (is_df) GIDNAM[[1]] else GIDNAM[, 1]
  gene_names <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]

  # Work on plain text so factor codes are never copied in place of labels.
  if (is.factor(ids)) {
    ids <- as.character(ids)
  }
  if (is.factor(gene_names)) {
    gene_names <- as.character(gene_names)
  }

  # Whole-column test; yields logical(0) for an empty table instead of
  # indexing row 1 of nothing.
  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)
  if (any(unnamed)) {
    gene_names[unnamed] <- ids[unnamed]
    if (is_df) {
      GIDNAM[[2]] <- gene_names
    } else {
      GIDNAM[, 2] <- gene_names
    }
  }
  GIDNAM
}
100 100
#4#Function for changing the gene ID to gene name
# Replaces the IDs in the first column of DATA by their gene names.
#
# GeneName: two-column table; column 1 = ID, column 2 = gene name.
# DATA:     table whose first column holds the IDs to translate.
# Returns t(DATA) (a matrix) whose first row holds the gene name for every ID
# found in GeneName; IDs without an entry are left unchanged.
cgeneID <- function(GeneName, DATA) {
  lookup <- t(GeneName)
  relabelled <- t(DATA)

  # Exact lookup. Building a regex from the ID lets characters such as "."
  # match a different ID, or raise an error for an unbalanced "(".
  # trimws() ignores padding on either side of an ID.
  data_ids <- trimws(relabelled[1, ])
  known_ids <- trimws(lookup[1, ])
  position <- match(data_ids, known_ids)

  # A missing ID is never translated, even if the lookup has a missing ID.
  found <- !is.na(position) & !is.na(data_ids)
  relabelled[1, found] <- lookup[2, position[found]]
  relabelled
}
120 #cgeneID <- function(GeneName,DATA){ 120 #cgeneID <- function(GeneName,DATA){
121 # colGene <- dim(GeneName)[2] 121 # colGene <- dim(GeneName)[2]
122 # j <- 1 122 # j <- 1
123 # for(j in 1:colGene){ 123 # for(j in 1:colGene){
124 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) 124 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
125 # if(is.na(sum(chngsreq))==FALSE){ 125 # if(is.na(sum(chngsreq))==FALSE){
126 # if(sum(chngsreq) > 0){ 126 # if(sum(chngsreq) > 0){
127 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) 127 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
128 # } 128 # }
129 # } 129 # }
130 # #if(sum(chngsreq) > 0){ 130 # #if(sum(chngsreq) > 0){
131 # ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq]) 131 # ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
132 # #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) 132 # #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
133 # #} 133 # #}
134 # j = j+1 134 # j = j+1
135 # } 135 # }
136 # DATA 136 # DATA
137 #} 137 #}
138 138
#5#Function for adjusting the gene names
# Shortens column names that list several gene names separated by "///" or
# "//" to a single name.
#
# DiData: matrix or data frame with column names.
# usecol: which of the separated names to keep (default 1, the first). When a
#         column name has fewer than `usecol` parts, its first part is used.
# Returns a character vector with one trimmed name per column.
gcnames <- function(DiData, usecol = 1) {
  # Split every name once; an object without columns gives an empty result.
  parts_by_column <- strsplit(as.character(colnames(DiData)), "///|//")

  vapply(
    parts_by_column,
    function(parts) {
      pick <- if (length(parts) >= usecol) usecol else 1
      # Base trimws() replaces stringr::str_trim(); both strip the spaces left
      # around the separators.
      trimws(parts[pick])
    },
    character(1),
    USE.NAMES = FALSE
  )
}
155 155
#6# Function for discretizing the data
# Maps every value to one of three levels.
#
# NDATA: numeric matrix (or all-numeric data frame).
# Returns a numeric matrix of the same dimensions and column names with
#   0 where the value is below -1,
#   1 where the value lies in [-1, 1],
#   2 where the value is above 1,
# and missing values carried over unchanged.
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)

  # Start from the middle level, then overwrite the two tails. which() drops
  # the NA comparisons, so missing cells are only touched by the last step.
  DDATA <- matrix(1, nrow = nrow(values), ncol = ncol(values))
  colnames(DDATA) <- colnames(NDATA)
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2

  missing <- is.na(values)
  DDATA[missing] <- values[missing]
  DDATA
}
184 184
185 185
186 #The Rest of this code will be used every time you want to change a data set 186 #The Rest of this code will be used every time you want to change a data set
187 187
#Getting the series matrix file
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

#Getting the GPL file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()


#Find out if it is a soft GPL file or not
# Only the last path component (the file name) is tested for "soft"/"annot".
soft <- strsplit(genena, "[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  grepl("soft|annot", .)

#Working with the wordy part of the document
# Keep the "!Sample..." rows of the series matrix, except "!Sample_contact...".
alzword <- alz %>%
  read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
  dplyr::filter(grepl("!Sample", X1)) %>%
  dplyr::filter(!grepl("!Sample_contact", X1))

##Changing row names and column names:
# After transposing, each column is one "!Sample..." field and row 1 holds the
# field labels that chngrownm() reads.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
# Give every column a placeholder name ("col1", "col2", ...).
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
# Drop the label row. drop = FALSE keeps a matrix even when a single row
# remains, so the data frame below still has one column per field.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
# Columns still carrying a placeholder name were not recognised; remove them.
ALZWORD <- ALZWORD %>%
  as.data.frame(., stringsAsFactors = FALSE) %>%
  dplyr::select(-starts_with("col"))

##Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)


#Working with Actual Data part of file
alzdat <- alz %>%
  read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = 1)
# Transpose everything except the first column.
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL
227 227
##Is there a clean version of the GPL file available?
# The platform number is every digit found in the GPL file name.
gplnum <- strsplit(genena, "[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D", "", .)
clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")

# Test for the exact file: a substring search over list.files() also matches
# e.g. "Clean_GPL5700.txt" when looking for GPL570 and then tries to read a
# file that does not exist.
if (file.exists(clean_gpl_file)) {
  #use the clean version
  geneIDNam <- clean_gpl_file %>%
    read_delim(delim = "\t", col_names = c("ID", "Symbol"), comment = "!")

} else {
  ##Lets Create a clean version

  # Columns kept next to ID when reading a GPL table.
  symbol_cols <- "Symbol|^ORF\\s*$|^gene_assignment\\s*$"

  ##Gene ID to Gene Name
  if (soft) {
    # GPL_ID_LOC.txt stores, per platform number, the `skip` value used when
    # reading that platform's soft file.
    loc_file <- "GPL_ID_LOC.txt"
    have_registry <- file.exists(loc_file)

    #Check to see if this GPL soft file has been used before
    cached <- integer(0)
    if (have_registry) {
      registry <- read_delim(loc_file, delim = "\t", col_names = TRUE)
      # Exact numeric comparison; a substring search would pair GPL57 with 570.
      cached <- which(registry$GPL_FILE_NUM == as.integer(gplnum))
    }

    if (length(cached) > 0) {
      # Use the first entry if the platform was recorded more than once.
      idLOCGPL <- registry$LOC_ID[cached[1]]
    } else {
      #No information on this particular GPL file
      # Position of the "ID" cell among the first 1000 parsed rows, minus one.
      gpl_preview <- genena %>%
        read_delim(delim = "\t", col_names = FALSE, comment = "!", n_max = 1000)
      idLOCGPL <- grep("^ID\\s*$", t(gpl_preview)) - 1

      new_entry <- cbind(as.integer(gplnum), as.integer(idLOCGPL))
      if (have_registry) {
        cat(new_entry, file = loc_file, sep = "\t", fill = TRUE, append = TRUE)
      } else {
        #We must create a file that we can access for later use
        colnames(new_entry) <- c("GPL_FILE_NUM", "LOC_ID")
        write.table(new_entry, file = loc_file, sep = "\t",
                    row.names = FALSE, col.names = TRUE)
      }
    }

    geneIDNam <- genena %>%
      read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
      dplyr::select(., ID, grep(symbol_cols, colnames(.)))
  } else {
    geneIDNam <- genena %>%
      read_delim(delim = "\t", comment = "#") %>%
      dplyr::select(., ID, grep(symbol_cols, colnames(.)))
  }

  ##Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ##remove the whitespace
  # Result is a two-column character matrix: trimmed ID, trimmed name.
  geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1, ]), str_trim(t(geneIDNam)[2, ])))

  ##Here is the clean version
  write.table(geneIDNam, file = clean_gpl_file, sep = "\t",
              row.names = FALSE, col.names = FALSE)
}
304 304
305 305
306 306
##Changing the gene ID to gene name
ALZDAT1 <- cgeneID(geneIDNam, alzdat)
# Row 1 of the result holds the (translated) first column of alzdat.
colnames(ALZDAT) <- ALZDAT1[1, ]


##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)


#Full RAW Data
# Clinical columns first, expression columns after.
Fullalzdwr <- ALZDAT %>%
  as.data.frame() %>%
  cbind(ALZWORDF, .)


#Raw file is output
# Output name: "GSE" + the digits of the series matrix FILE NAME +
# "aftexcel.txt". The path is split on "\", "|" and "/" (the same split used
# for the discrete output file); splitting on "\" alone lets digits from
# directory names of a "/"-separated path leak into the file name.
nfnaex <- strsplit(alz, "[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D", "", .) %>%
  c("GSE", ., "aftexcel.txt") %>%
  paste(collapse = "")
write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
330 330
331 331
#Now for the discretization part
##get the wordy part again
rawword <- t(ALZWORDF)

##where is ID_REF located
hereim <- grep("ID_REF", rownames(rawword))

##Subject Names GSM...
subjnam <- rawword[hereim, ]

##Getting the names for the rows
namedarows <- rownames(rawword)[-hereim] %>%
  as.data.frame(., stringsAsFactors = FALSE)
RAWWORD <- rawword[-hereim, ] %>%
  as.data.frame(., stringsAsFactors = FALSE) %>%
  bind_cols(namedarows, .)

# Per clinical row: number of missing cells plus number of cells whose text
# contains "NA".
# NOTE(review): grep() coerces the row to character, so a true NA cell may be
# counted by both tests - confirm this is intended.
naroww <- as.data.frame(rep(0, dim(RAWWORD)[1]), stringsAsFactors = FALSE)
for (z in seq_len(dim(RAWWORD)[1])) {
  clinical_row <- RAWWORD[z, ]
  missing_cells <- sum(is.na(clinical_row))
  if (missing_cells > 0) {
    naroww[z, 1] <- as.integer(missing_cells)
  }
  na_text_cells <- length(grep("NA", clinical_row))
  if (na_text_cells > 0) {
    naroww[z, 1] <- as.integer(na_text_cells) + naroww[z, 1]
  }
}

colnames(naroww) <- "ROW_NAs"
RAWWORD <- bind_cols(RAWWORD, naroww)


# Row names of t(ALZDAT) (the column names of ALZDAT) become the ID_REF column.
roALZna <- t(ALZDAT) %>%
  rownames(.) %>%
  as.data.frame(., stringsAsFactors = FALSE)
colnames(roALZna) <- "ID_REF"

RAWDAT <- t(ALZDAT) %>%
  as.data.frame(., stringsAsFactors = FALSE)
colnames(RAWDAT) <- NULL
rownames(RAWDAT) <- NULL

# Sort by ID_REF so rows sharing a name sit together.
RAWDAT2 <- RAWDAT %>%
  cbind(roALZna, .) %>%
  dplyr::arrange(., ID_REF)

##Editing the file for R processing
RAWDATID <- RAWDAT2[, 1] %>%
  as.matrix(.)

# Numeric matrix with one column per row of RAWDAT2.
RAWDATNUM <- RAWDAT2[, -1] %>%
  mapply(., FUN = as.numeric) %>%
  t(.)

##Consolidating genes with the same name
###create empty matrix of size equal to tabRDATID
tabRDATID <- table(RAWDATID)
NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
for (j in seq_along(tabRDATID)) {
  same_name <- which(RAWDATID == rownames(tabRDATID)[j])

  if (tabRDATID[j] == 1) {
    ##Putting the ones without duplicates in their new homes
    NuRDATN[, j] <- RAWDATNUM[, same_name]
  } else if (tabRDATID[j] > 1) {
    ##Averaging duplicates and putting them in their new homes
    # drop = FALSE keeps a matrix for rowMeans() even when RAWDATNUM has a
    # single row.
    NuRDATN[, j] <- rowMeans(RAWDATNUM[, same_name, drop = FALSE], na.rm = TRUE)
  }
}

##Scaling the Data
scrawdat <- NuRDATN %>%
  scale()
attr(scrawdat, "scaled:center") <- NULL
attr(scrawdat, "scaled:scale") <- NULL
colnames(scrawdat) <- rownames(tabRDATID)

##Discretized the Data
dialzdat <- scrawdat %>%
  dndat(.) %>%
  t() %>%
  as.data.frame(., stringsAsFactors = FALSE)
colnames(dialzdat) <- rownames(RAWDATNUM)

##setting "ID_REF" as a new variable
geneNAM <- as.data.frame(as.matrix(rownames(dialzdat), ncol = 1),
                         stringsAsFactors = FALSE)
colnames(geneNAM) <- "ID_REF"
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(geneNAM, dialzdat)

##NAs in a column
# One extra row labelled "COL_NAs" holding the NA count of each data column.
nacol <- as.data.frame(t(rep(0, dim(dialzdat)[2])), stringsAsFactors = FALSE)
nacol[1, 1] <- "COL_NAs"
for (x in seq_len(dim(dialzdat)[2])[-1]) {
  nacol[1, x] <- as.integer(sum(is.na(dialzdat[, x])))
}
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

##NAs in a row
# Counted for all rows at once instead of indexing the data frame row by row.
narowd <- data.frame(ROW_NAs = as.numeric(rowSums(is.na(dialzdat))))
dialzdat <- bind_cols(dialzdat, narowd)
colnames(dialzdat)[2:(dim(dialzdat)[2] - 1)] <- subjnam
colnames(RAWWORD) <- colnames(dialzdat)
##converting to character so that the clinical can be brought together with discrete data
# Columns between ID_REF (first) and ROW_NAs (last). The former bound
# `2:dim(dialzdat)[2]-1` parsed as (2:n) - 1, i.e. 1:(n - 1).
for (k in seq_len(dim(dialzdat)[2] - 2) + 1) {
  dialzdat[, k] <- as.character(dialzdat[, k])
}
#The End the full data
Dscrtalzdw <- bind_rows(RAWWORD, dialzdat)

#Produces Discrete file
# Output name: "GSE" + the digits of the series matrix file name + "dscrt.txt".
nfnaex2 <- strsplit(alz, "[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D", "", .) %>%
  c("GSE", ., "dscrt.txt") %>%
  paste(collapse = "")
write.table(Dscrtalzdw, file = nfnaex2, sep = "\t", col.names = TRUE, row.names = FALSE)
462 462
463 463