Commit eccb7a19e29c5a6300ce75a7154eac8089de2a0b

Authored by Efrain Gonzalez
Exists in master

Merge branch 'master' of smlg.fiu.edu:efraingonzalez0/cleaning-and-fixing-data-with-r

1 ######################################################################## 1 ########################################################################
2 # Don't Use This Code Just Yet # 2 # Don't Use This Code Just Yet #
3 ######################################################################## 3 ########################################################################
4 #Efrain H. Gonzalez 4 #Efrain H. Gonzalez
5 #6/16/2017 5 #6/21/2017
6 6 options(digits = 11)
7 #Libraries required to run the code 7 #Libraries required to run the code
8 library(pryr) 8 library(pryr)
9 library(MASS) 9 library(MASS)
10 library(dplyr) 10 library(dplyr)
11 library(tidyr) 11 library(tidyr)
12 library(readr) 12 library(readr)
13 library(stringr) 13 library(stringr)
14 14
15 15
16 #Necessary Functions 16 #Necessary Functions
#1# Function for handling the changing of row names and column names
# Renames the columns of a transposed series-matrix header block.
#
# mat: matrix (or data frame) whose row 1 holds the GEO field name of each
#      column (e.g. "!Sample_title") and whose row 2 holds the first value
#      found in that column (e.g. "age: 80").
#
# Returns `mat` with new column names; the cell contents are not modified.
#
# Naming rules:
#   * the three fixed GEO fields map to Brain_Region / Title / ID_REF;
#   * any other column is tested against every characteristic pattern below,
#     in order. Each pattern that matches renames the column and advances that
#     pattern's counter, so the LAST matching pattern decides the final name
#     (e.g. "braak stage: IV" matches Age and then Braak, and ends up "Braak1",
#     while the Age counter has still moved on to 2);
#   * columns matching nothing keep their existing name.
chngrownm <- function(mat) {
  fixed_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )
  # Order matters: later entries override earlier ones for the same column.
  patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi|interval",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  counters <- c(Sex = 1, PMI = 1, Age = 1, Braak = 1, Group = 1)

  # seq_len() yields an empty loop for a zero-column input (1:0 would not).
  for (e in seq_len(ncol(mat))) {
    field <- as.character(mat[1, e])
    # %in% never returns NA, so a missing field name falls through to the
    # pattern tests instead of aborting the comparison.
    if (!is.na(field) && field %in% names(fixed_names)) {
      colnames(mat)[e] <- fixed_names[[field]]
      next
    }
    label <- mat[2, e]
    for (kind in names(patterns)) {
      if (grepl(patterns[[kind]], label)) {
        colnames(mat)[e] <- paste0(kind, counters[[kind]])
        counters[[kind]] <- counters[[kind]] + 1
      }
    }
  }
  mat
}
63 61
#2# Function for reorganizing information within the columns
# Cleans the clinical columns produced by chngrownm(), chosen by column name.
#
# mat: data frame (or matrix) of clinical annotations. Column 1 is never
#      touched; processing starts at column 2.
#
# For every later column whose name contains:
#   "Group" - strips the leading "<label>: " and any trailing " ...;..." text;
#   "Age"   - drops every non-digit character and converts to integer;
#   "Sex"   - strips the leading "<label>: ";
#   "PMI"   - keeps digits and "." only and converts to numeric;
#   "Braak" - strips the leading "<label>: ", reads the rest as a roman
#             numeral and converts it to integer.
# The first matching rule (in the order above) wins. Other columns are left
# unchanged.
#
# Returns `mat` with the cleaned columns.
cinfo <- function(mat) {
  col_names <- colnames(mat)
  # seq_len(n)[-1] is 2..n and is empty when n is 0 or 1; 2:n would count
  # backwards to column 1 for a single-column input.
  for (j in seq_len(ncol(mat))[-1]) {
    col_name <- col_names[j]
    values <- mat[, j]
    if (grepl("Group", col_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", values)
    } else if (grepl("Age", col_name)) {
      mat[, j] <- as.integer(gsub("\\D", "", values))
    } else if (grepl("Sex", col_name)) {
      mat[, j] <- gsub(".+:\\s", "", values)
    } else if (grepl("PMI", col_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", values))
    } else if (grepl("Braak", col_name)) {
      stage <- gsub(".+:\\s", "", values)
      mat[, j] <- as.integer(utils::as.roman(stage))
    }
  }
  mat
}
92 86
#3# Function for labeling the gene IDs without names
# Fills in missing gene names with the corresponding gene/probe ID.
#
# GIDNAM: two-column table (data frame, tibble or matrix); column 1 holds the
#         ID and column 2 the gene name.
#
# A name counts as missing when it is NA or consists of the text "NA"
# optionally followed by whitespace. Such entries are replaced by the value in
# column 1 of the same row.
#
# Returns `GIDNAM` with column 2 completed. An empty table is returned as is.
NAFIXING <- function(GIDNAM) {
  is_frame <- is.data.frame(GIDNAM)
  # [[ ]] extracts a plain vector from data frames and tibbles alike; a matrix
  # needs ordinary column indexing.
  ids <- if (is_frame) GIDNAM[[1]] else GIDNAM[, 1]
  gene_names <- if (is_frame) GIDNAM[[2]] else GIDNAM[, 2]

  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)
  if (any(unnamed)) {
    fill <- ids[unnamed]
    if (is.factor(fill)) {
      # Assign the labels rather than a factor object.
      fill <- as.character(fill)
    }
    gene_names[unnamed] <- fill
    if (is_frame) {
      GIDNAM[[2]] <- gene_names
    } else {
      GIDNAM[, 2] <- gene_names
    }
  }
  GIDNAM
}
105 99
#4# Function for changing the gene ID to gene name
# Translates the probe/gene IDs of an expression table into gene names.
#
# GeneName: two-column table; column 1 holds the ID, column 2 the gene name.
# DATA:     expression table whose first column holds the ID of each row.
#
# Returns t(DATA) (a matrix with one column per row of DATA) in which every
# entry of row 1 that is found among the IDs of `GeneName` has been replaced
# by the matching gene name. IDs without a match are left as they are.
#
# IDs are compared as exact strings (ignoring surrounding whitespace) with
# match(), not as regular expressions, so IDs containing characters such as
# ".", "(" or "+" are handled correctly, and the whole lookup is a single
# pass instead of one grep() per ID. When an ID occurs several times in
# `GeneName`, the first occurrence is used.
cgeneID <- function(GeneName, DATA) {
  lookup <- t(GeneName) # row 1: IDs, row 2: gene names
  wide <- t(DATA)       # row 1: IDs of the expression table
  if (nrow(wide) == 0L || ncol(wide) == 0L) {
    return(wide)
  }

  probe_ids <- trimws(wide[1, ])
  known_ids <- trimws(lookup[1, ])
  # incomparables = NA keeps a missing ID from pairing with a missing key.
  hit <- match(probe_ids, known_ids, incomparables = NA)
  found <- !is.na(hit)
  wide[1, found] <- lookup[2, hit[found]]
  wide
}
119 #cgeneID <- function(GeneName,DATA){
120 # colGene <- dim(GeneName)[2]
121 # j <- 1
122 # for(j in 1:colGene){
123 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
124 # if(is.na(sum(chngsreq))==FALSE){
125 # if(sum(chngsreq) > 0){
126 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
127 # }
128 # }
129 # j = j+1
130 # }
131 # DATA
132 #}
121 133
#5# Function for adjusting the gene names
# Picks one gene name per column when a column name lists several names
# separated by "///".
#
# DiData: matrix or data frame whose column names are gene names, possibly of
#         the form "NAME1 /// NAME2 /// ...".
# usecol: position of the name to keep (default 1). When a column lists fewer
#         than `usecol` names, its first name is used instead.
#
# Returns a character vector with one whitespace-trimmed name per column of
# `DiData` (character(0) when there are no columns).
gcnames <- function(DiData, usecol = 1) {
  if (NCOL(DiData) == 0L) {
    return(character(0))
  }
  gene_labels <- colnames(DiData)
  if (is.null(gene_labels)) {
    stop("DiData must have column names", call. = FALSE)
  }

  pick_name <- function(label) {
    parts <- strsplit(label, "///")[[1]]
    chosen <- if (length(parts) >= usecol) parts[usecol] else parts[1]
    # Base trimws() is used for trimming so the helper has no package
    # dependency.
    trimws(chosen)
  }
  vapply(gene_labels, pick_name, character(1), USE.NAMES = FALSE)
}
138 150
#6# Function for discretizing the data
# Maps every value of a numeric matrix onto three levels.
#
# NDATA: numeric matrix (the caller passes scaled expression values).
#
# Returns a numeric matrix of the same dimensions and column names with
#   0  where the value is below -1,
#   1  where the value lies in [-1, 1],
#   2  where the value is above 1,
# and missing values copied through unchanged.
#
# The middle band includes both end points: a value of exactly 1 is level 1.
# (With a strict upper bound it matched no rule and silently kept the
# initial 0, i.e. it was reported as "low".)
dndat <- function(NDATA) {
  # Start everything in the middle band, then move the outliers.
  DDATA <- matrix(1, nrow = nrow(NDATA), ncol = ncol(NDATA))
  colnames(DDATA) <- colnames(NDATA)

  is_missing <- is.na(NDATA)
  DDATA[!is_missing & NDATA < -1] <- 0
  DDATA[!is_missing & NDATA > 1] <- 2
  DDATA[is_missing] <- NDATA[is_missing]
  DDATA
}
169 179
170 180
171 #MajorFunction#This is the function that does everything else 181 #MajorFunction#This is the function that does everything else
172 THEFT <- function(){ 182 THEFT <- function(){
173 #Set working directory based on the directory of the series matrix file Currently only works for windows 183 #Set working directory based on the directory of the series matrix file Currently only works for windows
174 wd <- getwd() 184 wd <- getwd()
175 #list.files() 185 #list.files()
176 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?") 186 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?")
177 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L) 187 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
178 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files()) 188 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())
179 189 GSEfloc <- list.files()[GSEfileloc]
180 #ALL DATA FILES WILL BE CLEANED 190 #ALL DATA FILES WILL BE CLEANED
181 if(numDAT == 1){ 191 if(numDAT == 1){
182 #indexing the data files 192 #indexing the data files
183 n <- 1 193 n <- 1
184 for(n in 1: length(GSEfileloc)){ 194 for(n in 1: length(GSEfloc)){
185 alz <- list.files()[GSEfileloc[n]] 195 alz <- GSEfloc[n]
186 196
187 #Working with the wordy part of the document 197 #Working with the wordy part of the document
188 alzword <- alz %>% 198 alzword <- alz %>%
189 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 199 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
190 filter(grepl("!Sample",X1))%>% 200 filter(grepl("!Sample",X1))%>%
191 filter(!grepl("!Sample_contact",X1)) 201 filter(!grepl("!Sample_contact",X1))
192 202
193 #Getting the GPL file 203 #Getting the GPL file
194 genena <- grep("_platform_id",alzword$X1) %>% 204 genena <- grep("_platform_id",alzword$X1) %>%
195 alzword$X2[.] %>% 205 alzword$X2[.] %>%
196 str_trim(.) %>% 206 str_trim(.) %>%
197 paste0("^",.,"\\D") %>% 207 paste0("^",.,"\\D") %>%
198 grep(.,list.files()) %>% 208 grep(.,list.files()) %>%
199 list.files()[.] 209 list.files()[.]
200 210
201 #Find out if it is a soft GPL file or not 211 #Find out if it is a soft GPL file or not
202 soft <- strsplit(genena,"[\\|/]") %>% 212 soft <- strsplit(genena,"[\\|/]") %>%
203 .[[1]] %>% 213 .[[1]] %>%
204 .[length(.)] %>% 214 .[length(.)] %>%
205 grepl("soft",.) 215 grepl("soft",.)
206 216
207 ##Changing row names and column names: 217 ##Changing row names and column names:
208 ALZWORD <- t(alzword) 218 ALZWORD <- t(alzword)
209 rownames(ALZWORD)=NULL 219 rownames(ALZWORD)=NULL
210 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 220 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
211 ALZWORD <- chngrownm(ALZWORD)[-1,] 221 ALZWORD <- chngrownm(ALZWORD)[-1,]
212 ALZWORD <- ALZWORD%>% 222 ALZWORD <- ALZWORD%>%
213 as.data.frame()%>% 223 as.data.frame()%>%
214 dplyr::select(-starts_with("col")) 224 dplyr::select(-starts_with("col"))
215 225
216 ##Reorganizing information within the columns and final clinical data 226 ##Reorganizing information within the columns and final clinical data
217 ALZWORDF <- cinfo(ALZWORD) 227 ALZWORDF <- cinfo(ALZWORD)
218 228
219 229
220 #Working with Actual Data part of file 230 #Working with Actual Data part of file
221 alzdat <- alz %>% 231 alzdat <- alz %>%
222 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 232 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
223 ALZDAT <- t(alzdat[,-1]) 233 ALZDAT <- t(alzdat[,-1])
224 rownames(ALZDAT)=NULL 234 rownames(ALZDAT)=NULL
225 235
226 ##Is there a clean version of the GPL file available? 236 ##Is there a clean version of the GPL file available?
227 gplnum <- strsplit(genena,"[\\|/]") %>% 237 gplnum <- strsplit(genena,"[\\|/]") %>%
228 .[[1]] %>% 238 .[[1]] %>%
229 .[length(.)] %>% 239 .[length(.)] %>%
230 gsub("\\D","",.) 240 gsub("\\D","",.)
231 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 241 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
232 if(clfileex >= 1){ 242 if(clfileex >= 1){
233 #use the clean version 243 #use the clean version
234 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 244 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
235 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 245 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
236 246
237 } 247 } else if(clfileex == 0){
238 else if(clfileex == 0){
239 ##Lets Create a clean version 248 ##Lets Create a clean version
240 249
241 ##Gene ID to Gene Name 250 ##Gene ID to Gene Name
242 if(soft == TRUE){ 251 if(soft == TRUE){
243 #Check to see if there is already a file containing information on soft files 252 #Check to see if there is already a file containing information on soft files
244 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 253 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
245 if(fileex == 1){ 254 if(fileex == 1){
246 #Check to see if this GPL soft file has been used before 255 #Check to see if this GPL soft file has been used before
247 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 256 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
248 .$GPL_FILE_NUM%>% 257 .$GPL_FILE_NUM%>%
249 grepl(gplnum,.) %>% 258 grepl(gplnum,.) %>%
250 sum() 259 sum()
251 if(IDF == 1){ 260 if(IDF == 1){
252 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 261 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
253 .$GPL_FILE_NUM%>% 262 .$GPL_FILE_NUM%>%
254 grep(gplnum,.) 263 grep(gplnum,.)
255 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 264 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
256 .$LOC_ID %>% 265 .$LOC_ID %>%
257 .[IDLOCAL] 266 .[IDLOCAL]
258 geneIDNam <- genena %>% 267 geneIDNam <- genena %>%
259 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 268 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
260 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 269 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
261 } 270 } else if(IDF == 0){
262 else if(IDF == 0){
263 #No information on this particular GPL file 271 #No information on this particular GPL file
264 idLOCGPL <- genena %>% 272 idLOCGPL <- genena %>%
265 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 273 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
266 t(.) %>% 274 t(.) %>%
267 grep("^ID\\s*$",.) %>% 275 grep("^ID\\s*$",.) %>%
268 -1 276 -1
269 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 277 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
270 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 278 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
271 geneIDNam <- genena %>% 279 geneIDNam <- genena %>%
272 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 280 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
273 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 281 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
274 } 282 }
275 } 283 } else if(fileex == 0){
276 else if(fileex == 0){
277 #We must create a file that we can access for later use 284 #We must create a file that we can access for later use
278 idLOCGPL <- genena %>% 285 idLOCGPL <- genena %>%
279 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 286 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
280 t(.) %>% 287 t(.) %>%
281 grep("^ID\\s*$",.) %>% 288 grep("^ID\\s*$",.) %>%
282 -1 289 -1
283 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 290 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
284 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 291 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
285 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 292 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
286 geneIDNam <- genena %>% 293 geneIDNam <- genena %>%
287 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 294 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
288 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 295 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
289 } 296 }
290 } 297 } else if(soft == FALSE){
291 else if(soft == FALSE){
292 geneIDNam <- genena %>% 298 geneIDNam <- genena %>%
293 read_delim(delim="\t",comment = "#")%>% 299 read_delim(delim="\t",comment = "#")%>%
294 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 300 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
295 } 301 }
296 302
297 ##Labeling the gene IDs without names 303 ##Labeling the gene IDs without names
298 geneIDNam <- NAFIXING(geneIDNam) 304 geneIDNam <- NAFIXING(geneIDNam)
299 305
300 ##remove the whitespace 306 ##remove the whitespace
301 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 307 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
302 308
303 ##Here is the clean version 309 ##Here is the clean version
304 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 310 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
305 } 311 }
306 312
307 313
308 314
309 ##Changing the gene ID to gene name 315 ##Changing the gene ID to gene name
310 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) 316 ALZDAT1 <- cgeneID(geneIDNam,alzdat)
311 colnames(ALZDAT) = ALZDAT1[1,] 317 colnames(ALZDAT) = ALZDAT1[1,]
312 318
313 319
314 ##Adjusting the column names aka the gene names 320 ##Adjusting the column names aka the gene names
315 colnames(ALZDAT) <- gcnames(ALZDAT) 321 colnames(ALZDAT) <- gcnames(ALZDAT)
316 322
317 323
318 #Full RAW Data 324 #Full RAW Data
319 Fullalzdwr <- ALZDAT %>% 325 Fullalzdwr <- ALZDAT %>%
320 as.data.frame() %>% 326 as.data.frame() %>%
321 cbind(ALZWORDF,.) 327 cbind(ALZWORDF,.)
322 328
323 #Raw file is output 329 #Raw file is output
324 nfnaex <- strsplit(alz,"[\\]") %>% 330 nfnaex <- strsplit(alz,"[\\]") %>%
325 .[[1]] %>% 331 .[[1]] %>%
326 .[length(.)] %>% 332 .[length(.)] %>%
327 gsub("\\D","",.) %>% 333 gsub("\\D","",.) %>%
328 c("GSE",.,"aftexcel.txt") %>% 334 c("GSE",.,"aftexcel.txt") %>%
329 paste(collapse = "") 335 paste(collapse = "")
330 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 336 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
331 337
332 338
333 339
334 #Now for the discretization part 340 #Now for the discretization part
335 ##get the wordy part again 341 ##get the wordy part again
336 rawword <- t(ALZWORDF) 342 rawword <- t(ALZWORDF)
337 343
338 ##where is ID_REF located 344 ##where is ID_REF located
339 hereim <- grep("ID_REF",rownames(rawword)) 345 hereim <- grep("ID_REF",rownames(rawword))
340 346
341 ##Subject Names GSM... 347 ##Subject Names GSM...
342 subjnam <- rawword[hereim,] 348 subjnam <- rawword[hereim,]
343 349
344 ##Getting the names for the rows 350 ##Getting the names for the rows
345 namedarows <- rownames(rawword)[-hereim] %>% 351 namedarows <- rownames(rawword)[-hereim] %>%
346 as.data.frame() 352 as.data.frame()
347 RAWWORD <- rawword[-hereim,] %>% 353 RAWWORD <- rawword[-hereim,] %>%
348 as.data.frame() %>% 354 as.data.frame() %>%
349 bind_cols(namedarows,.) 355 bind_cols(namedarows,.)
350 z <- 1 356 z <- 1
351 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 357 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
352 for(z in 1:dim(RAWWORD)[1]){ 358 for(z in 1:dim(RAWWORD)[1]){
353 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 359 if(sum(is.na(RAWWORD[z,])) > 0){
354 z <- z + 1 360 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
355 } 361 }
362 if(length(grep("NA",RAWWORD[z,])) > 0){
363 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
364 }
365 z <- z + 1
366 }
356 367
357 colnames(naroww) <- "ROW_NAs" 368 colnames(naroww) <- "ROW_NAs"
358 RAWWORD <- bind_cols(RAWWORD,naroww) 369 RAWWORD <- bind_cols(RAWWORD,naroww)
359 370
360 371
361 roALZna <- t(ALZDAT) %>% 372 roALZna <- t(ALZDAT) %>%
362 rownames(.) %>% 373 rownames(.) %>%
363 as.data.frame(.) 374 as.data.frame(.)
364 colnames(roALZna) <- "ID_REF" 375 colnames(roALZna) <- "ID_REF"
365 376
366 RAWDAT <- t(ALZDAT) %>% 377 RAWDAT <- t(ALZDAT) %>%
367 as.data.frame(.) 378 as.data.frame(.)
368 colnames(RAWDAT) <- NULL 379 colnames(RAWDAT) <- NULL
369 rownames(RAWDAT) <- NULL 380 rownames(RAWDAT) <- NULL
370 381
371 RAWDAT2 <- RAWDAT %>% 382 RAWDAT2 <- RAWDAT %>%
372 cbind(roALZna,.) %>% 383 cbind(roALZna,.) %>%
373 dplyr::arrange(.,ID_REF) 384 dplyr::arrange(.,ID_REF)
374 385
375 ##Editing the file for R processing 386 ##Editing the file for R processing
376 RAWDATID <- RAWDAT2[,1] %>% 387 RAWDATID <- RAWDAT2[,1] %>%
377 as.matrix(.) 388 as.matrix(.)
378 389
379 RAWDATNUM <- RAWDAT2[,-1] %>% 390 RAWDATNUM <- RAWDAT2[,-1] %>%
380 mapply(.,FUN = as.numeric) %>% 391 mapply(.,FUN = as.numeric) %>%
381 t(.) 392 t(.)
382 393
383 ##Consolidating genes with the same name 394 ##Consolidating genes with the same name
384 ###create empty matrix of size equal to tabRDATID 395 ###create empty matrix of size equal to tabRDATID
385 tabRDATID <- table(RAWDATID) 396 tabRDATID <- table(RAWDATID)
386 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 397 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
387 j <- 1 398 j <- 1
388 for(j in 1:length(tabRDATID)){ 399 for(j in 1:length(tabRDATID)){
389 ##Putting the ones without duplicates in their new homes 400 ##Putting the ones without duplicates in their new homes
390 if(tabRDATID[j] == 1){ 401 if(tabRDATID[j] == 1){
391 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 402 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
392 } 403 } else if(tabRDATID[j] > 1){
393 ##Averaging duplicates and putting them in their new homes 404 ##Averaging duplicates and putting them in their new homes
394 else if(tabRDATID[j] > 1){
395 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 405 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
396 } 406 }
397 j <- j + 1 407 j <- j + 1
398 } 408 }
399 409
400 ##Scaling the Data 410 ##Scaling the Data
401 scrawdat <- NuRDATN%>% 411 scrawdat <- NuRDATN%>%
402 scale() 412 scale()
403 attr(scrawdat,"scaled:center") <- NULL 413 attr(scrawdat,"scaled:center") <- NULL
404 attr(scrawdat,"scaled:scale") <- NULL 414 attr(scrawdat,"scaled:scale") <- NULL
405 colnames(scrawdat) <- rownames(tabRDATID) 415 colnames(scrawdat) <- rownames(tabRDATID)
406 416
407 ##Discretized the Data 417 ##Discretized the Data
408 dialzdat <- scrawdat %>% 418 dialzdat <- scrawdat %>%
409 dndat(.) %>% 419 dndat(.) %>%
410 t()%>% 420 t()%>%
411 as.data.frame(.) 421 as.data.frame(.)
412 colnames(dialzdat) <- rownames(RAWDATNUM) 422 colnames(dialzdat) <- rownames(RAWDATNUM)
413 423
414 ##setting "ID_REF" as a new variable 424 ##setting "ID_REF" as a new variable
415 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1)) 425 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
416 colnames(geneNAM) <- "ID_REF" 426 colnames(geneNAM) <- "ID_REF"
417 rownames(dialzdat) <- NULL 427 rownames(dialzdat) <- NULL
418 dialzdat <-bind_cols(geneNAM,dialzdat) 428 dialzdat <-bind_cols(geneNAM,dialzdat)
419 429
420 ##NAs in a column 430 ##NAs in a column
421 x <- 2 431 x <- 2
422 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 432 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
423 nacol[1,1] = "COL_NAs" 433 nacol[1,1] = "COL_NAs"
424 for(x in 2:dim(dialzdat)[2]){ 434 for(x in 2:dim(dialzdat)[2]){
425 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 435 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
426 x <- x + 1 436 x <- x + 1
427 } 437 }
428 colnames(nacol) <- colnames(dialzdat) 438 colnames(nacol) <- colnames(dialzdat)
429 dialzdat <- bind_rows(dialzdat,nacol) 439 dialzdat <- bind_rows(dialzdat,nacol)
430 440
431 ##NAs in a row 441 ##NAs in a row
432 y <- 1 442 y <- 1
433 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 443 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
434 for(y in 1:dim(dialzdat)[1]){ 444 for(y in 1:dim(dialzdat)[1]){
435 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 445 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
436 y <- y + 1 446 y <- y + 1
437 } 447 }
438 colnames(narowd) <- "ROW_NAs" 448 colnames(narowd) <- "ROW_NAs"
439 dialzdat <- bind_cols(dialzdat,narowd) 449 dialzdat <- bind_cols(dialzdat,narowd)
440 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 450 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
441 colnames(RAWWORD) <- colnames(dialzdat) 451 colnames(RAWWORD) <- colnames(dialzdat)
442 ##converting to character so that the clinical can be brought together with discrete data 452 ##converting to character so that the clinical can be brought together with discrete data
443 k <- 2 453 k <- 2
444 for(k in 2:dim(dialzdat)[2]-1){ 454 for(k in 2:dim(dialzdat)[2]-1){
445 dialzdat[,k] <- as.character(dialzdat[,k]) 455 dialzdat[,k] <- as.character(dialzdat[,k])
446 k <- k + 1 456 k <- k + 1
447 } 457 }
448 #The End the full data 458 #The End the full data
449 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) 459 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
450 460
451 #Produces Discrete file 461 #Produces Discrete file
452 nfnaex2 <- strsplit(alz,"[\\|/]") %>% 462 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
453 .[[1]] %>% 463 .[[1]] %>%
454 .[length(.)] %>% 464 .[length(.)] %>%
455 gsub("\\D","",.) %>% 465 gsub("\\D","",.) %>%
456 c("GSE",.,"dscrt.txt") %>% 466 c("GSE",.,"dscrt.txt") %>%
457 paste(collapse = "") 467 paste(collapse = "")
458 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) 468 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
459 n <- n +1 469 n <- n +1
460 } 470 }
461 } 471 } else if(numDAT == 2){
462
463 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN 472 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
464 else if(numDAT == 2){ 473
465 #All the files you want to analyze 474 #All the files you want to analyze
466 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:") 475 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
467 if(length(ANDIS) == 0){ 476 if(length(ANDIS) == 0){
468 #Spit out a warning 477 #Spit out a warning
469 warning("You did not select any files and so no cleaning will be performed") 478 warning("You did not select any files and so no cleaning will be performed")
470 } else{ 479 } else{
471 #indexing the data files 480 #indexing the data files
472 n <- 1 481 n <- 1
473 for(n in 1: length(ANDIS)){ 482 for(n in 1: length(ANDIS)){
474 alz <- ANDIS[n] 483 alz <- ANDIS[n]
475 484
476 #Working with the wordy part of the document 485 #Working with the wordy part of the document
477 alzword <- alz %>% 486 alzword <- alz %>%
478 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 487 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
479 filter(grepl("!Sample",X1))%>% 488 filter(grepl("!Sample",X1))%>%
480 filter(!grepl("!Sample_contact",X1)) 489 filter(!grepl("!Sample_contact",X1))
481 490
482 #Getting the GPL file 491 #Getting the GPL file
483 genena <- grep("_platform_id",alzword$X1) %>% 492 genena <- grep("_platform_id",alzword$X1) %>%
484 alzword$X2[.] %>% 493 alzword$X2[.] %>%
485 str_trim(.) %>% 494 str_trim(.) %>%
486 paste0("^",.,"\\D") %>% 495 paste0("^",.,"\\D") %>%
487 grep(.,list.files()) %>% 496 grep(.,list.files()) %>%
488 list.files()[.] 497 list.files()[.]
489 498
490 #Find out if it is a soft GPL file or not 499 #Find out if it is a soft GPL file or not
491 soft <- strsplit(genena,"[\\|/]") %>% 500 soft <- strsplit(genena,"[\\|/]") %>%
492 .[[1]] %>% 501 .[[1]] %>%
493 .[length(.)] %>% 502 .[length(.)] %>%
494 grepl("soft",.) 503 grepl("soft",.)
495 504
496 ##Changing row names and column names: 505 ##Changing row names and column names:
497 ALZWORD <- t(alzword) 506 ALZWORD <- t(alzword)
498 rownames(ALZWORD)=NULL 507 rownames(ALZWORD)=NULL
499 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 508 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
500 ALZWORD <- chngrownm(ALZWORD)[-1,] 509 ALZWORD <- chngrownm(ALZWORD)[-1,]
501 ALZWORD <- ALZWORD%>% 510 ALZWORD <- ALZWORD%>%
502 as.data.frame()%>% 511 as.data.frame()%>%
503 dplyr::select(-starts_with("col")) 512 dplyr::select(-starts_with("col"))
504 513
505 ##Reorganizing information within the columns and final clinical data 514 ##Reorganizing information within the columns and final clinical data
506 ALZWORDF <- cinfo(ALZWORD) 515 ALZWORDF <- cinfo(ALZWORD)
507 516
508 517
509 #Working with Actual Data part of file 518 #Working with Actual Data part of file
510 alzdat <- alz %>% 519 alzdat <- alz %>%
511 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 520 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
512 ALZDAT <- t(alzdat[,-1]) 521 ALZDAT <- t(alzdat[,-1])
513 rownames(ALZDAT)=NULL 522 rownames(ALZDAT)=NULL
514 523
515 ##Is there a clean version of the GPL file available? 524 ##Is there a clean version of the GPL file available?
516 gplnum <- strsplit(genena,"[\\|/]") %>% 525 gplnum <- strsplit(genena,"[\\|/]") %>%
517 .[[1]] %>% 526 .[[1]] %>%
518 .[length(.)] %>% 527 .[length(.)] %>%
519 gsub("\\D","",.) 528 gsub("\\D","",.)
520 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 529 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
521 if(clfileex >= 1){ 530 if(clfileex >= 1){
522 #use the clean version 531 #use the clean version
523 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 532 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
524 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 533 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
525 534
526 } 535 } else if(clfileex == 0){
527 else if(clfileex == 0){
528 ##Lets Create a clean version 536 ##Lets Create a clean version
529 537
530 ##Gene ID to Gene Name 538 ##Gene ID to Gene Name
531 if(soft == TRUE){ 539 if(soft == TRUE){
532 #Check to see if there is already a file containing information on soft files 540 #Check to see if there is already a file containing information on soft files
533 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 541 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
534 if(fileex == 1){ 542 if(fileex == 1){
535 #Check to see if this GPL soft file has been used before 543 #Check to see if this GPL soft file has been used before
536 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 544 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
537 .$GPL_FILE_NUM%>% 545 .$GPL_FILE_NUM%>%
538 grepl(gplnum,.) %>% 546 grepl(gplnum,.) %>%
539 sum() 547 sum()
540 if(IDF == 1){ 548 if(IDF == 1){
541 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 549 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
542 .$GPL_FILE_NUM%>% 550 .$GPL_FILE_NUM%>%
543 grep(gplnum,.) 551 grep(gplnum,.)
544 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 552 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
545 .$LOC_ID %>% 553 .$LOC_ID %>%
546 .[IDLOCAL] 554 .[IDLOCAL]
547 geneIDNam <- genena %>% 555 geneIDNam <- genena %>%
548 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 556 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
549 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 557 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
550 } 558 } else if(IDF == 0){
551 else if(IDF == 0){
552 #No information on this particular GPL file 559 #No information on this particular GPL file
553 idLOCGPL <- genena %>% 560 idLOCGPL <- genena %>%
554 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 561 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
555 t(.) %>% 562 t(.) %>%
556 grep("^ID\\s*$",.) %>% 563 grep("^ID\\s*$",.) %>%
557 -1 564 -1
558 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 565 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
559 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 566 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
560 geneIDNam <- genena %>% 567 geneIDNam <- genena %>%
561 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 568 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
562 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 569 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
563 } 570 }
564 } 571 } else if(fileex == 0){
565 else if(fileex == 0){
566 #We must create a file that we can access for later use 572 #We must create a file that we can access for later use
567 idLOCGPL <- genena %>% 573 idLOCGPL <- genena %>%
568 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 574 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
569 t(.) %>% 575 t(.) %>%
570 grep("^ID\\s*$",.) %>% 576 grep("^ID\\s*$",.) %>%
571 -1 577 -1
572 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 578 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
573 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 579 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
574 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 580 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
575 geneIDNam <- genena %>% 581 geneIDNam <- genena %>%
576 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 582 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
577 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 583 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
578 } 584 }
579 } 585 } else if(soft == FALSE){
580 else if(soft == FALSE){
581 geneIDNam <- genena %>% 586 geneIDNam <- genena %>%
582 read_delim(delim="\t",comment = "#")%>% 587 read_delim(delim="\t",comment = "#")%>%
583 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 588 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
584 } 589 }
585 590
586 ##Labeling the gene IDs without names 591 ##Labeling the gene IDs without names
587 geneIDNam <- NAFIXING(geneIDNam) 592 geneIDNam <- NAFIXING(geneIDNam)
588 593
589 ##remove the whitespace 594 ##remove the whitespace
590 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 595 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
591 596
592 ##Here is the clean version 597 ##Here is the clean version
593 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 598 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
594 } 599 }
595 600
596 601
597 602
598 ##Changing the gene ID to gene name 603 ##Changing the gene ID to gene name
599 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) 604 ALZDAT1 <- cgeneID(geneIDNam,alzdat)
600 colnames(ALZDAT) = ALZDAT1[1,] 605 colnames(ALZDAT) = ALZDAT1[1,]
601 606
602 607
603 ##Adjusting the column names aka the gene names 608 ##Adjusting the column names aka the gene names
604 colnames(ALZDAT) <- gcnames(ALZDAT) 609 colnames(ALZDAT) <- gcnames(ALZDAT)
605 610
606 611
607 #Full RAW Data 612 #Full RAW Data
608 Fullalzdwr <- ALZDAT %>% 613 Fullalzdwr <- ALZDAT %>%
609 as.data.frame() %>% 614 as.data.frame() %>%
610 cbind(ALZWORDF,.) 615 cbind(ALZWORDF,.)
611 616
612 #Raw file is output 617 #Raw file is output
613 nfnaex <- strsplit(alz,"[\\]") %>% 618 nfnaex <- strsplit(alz,"[\\]") %>%
614 .[[1]] %>% 619 .[[1]] %>%
615 .[length(.)] %>% 620 .[length(.)] %>%
616 gsub("\\D","",.) %>% 621 gsub("\\D","",.) %>%
617 c("GSE",.,"aftexcel.txt") %>% 622 c("GSE",.,"aftexcel.txt") %>%
618 paste(collapse = "") 623 paste(collapse = "")
619 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 624 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
620 625
621 626
622 627
623 #Now for the discretization part 628 #Now for the discretization part
624 ##get the wordy part again 629 ##get the wordy part again
625 rawword <- t(ALZWORDF) 630 rawword <- t(ALZWORDF)
626 631
627 ##where is ID_REF located 632 ##where is ID_REF located
628 hereim <- grep("ID_REF",rownames(rawword)) 633 hereim <- grep("ID_REF",rownames(rawword))
629 634
630 ##Subject Names GSM... 635 ##Subject Names GSM...
631 subjnam <- rawword[hereim,] 636 subjnam <- rawword[hereim,]
632 637
633 ##Getting the names for the rows 638 ##Getting the names for the rows
634 namedarows <- rownames(rawword)[-hereim] %>% 639 namedarows <- rownames(rawword)[-hereim] %>%
635 as.data.frame() 640 as.data.frame()
636 RAWWORD <- rawword[-hereim,] %>% 641 RAWWORD <- rawword[-hereim,] %>%
637 as.data.frame() %>% 642 as.data.frame() %>%
638 bind_cols(namedarows,.) 643 bind_cols(namedarows,.)
639 z <- 1 644 z <- 1
640 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 645 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
641 for(z in 1:dim(RAWWORD)[1]){ 646 for(z in 1:dim(RAWWORD)[1]){
642 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 647 if(sum(is.na(RAWWORD[z,])) > 0){
643 z <- z + 1 648 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
644 } 649 }
650 if(length(grep("NA",RAWWORD[z,])) > 0){
651 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
652 }
653 z <- z + 1
654 }
645 655
646 colnames(naroww) <- "ROW_NAs" 656 colnames(naroww) <- "ROW_NAs"
647 RAWWORD <- bind_cols(RAWWORD,naroww) 657 RAWWORD <- bind_cols(RAWWORD,naroww)
648 658
649 659
650 roALZna <- t(ALZDAT) %>% 660 roALZna <- t(ALZDAT) %>%
651 rownames(.) %>% 661 rownames(.) %>%
652 as.data.frame(.) 662 as.data.frame(.)
653 colnames(roALZna) <- "ID_REF" 663 colnames(roALZna) <- "ID_REF"
654 664
655 RAWDAT <- t(ALZDAT) %>% 665 RAWDAT <- t(ALZDAT) %>%
656 as.data.frame(.) 666 as.data.frame(.)
657 colnames(RAWDAT) <- NULL 667 colnames(RAWDAT) <- NULL
658 rownames(RAWDAT) <- NULL 668 rownames(RAWDAT) <- NULL
659 669
660 RAWDAT2 <- RAWDAT %>% 670 RAWDAT2 <- RAWDAT %>%
661 cbind(roALZna,.) %>% 671 cbind(roALZna,.) %>%
662 dplyr::arrange(.,ID_REF) 672 dplyr::arrange(.,ID_REF)
663 673
664 ##Editing the file for R processing 674 ##Editing the file for R processing
665 RAWDATID <- RAWDAT2[,1] %>% 675 RAWDATID <- RAWDAT2[,1] %>%
666 as.matrix(.) 676 as.matrix(.)
667 677
668 RAWDATNUM <- RAWDAT2[,-1] %>% 678 RAWDATNUM <- RAWDAT2[,-1] %>%
669 mapply(.,FUN = as.numeric) %>% 679 mapply(.,FUN = as.numeric) %>%
670 t(.) 680 t(.)
671 681
672 ##Consolidating genes with the same name 682 ##Consolidating genes with the same name
673 ###create empty matrix of size equal to tabRDATID 683 ###create empty matrix of size equal to tabRDATID
674 tabRDATID <- table(RAWDATID) 684 tabRDATID <- table(RAWDATID)
675 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 685 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
676 j <- 1 686 j <- 1
677 for(j in 1:length(tabRDATID)){ 687 for(j in 1:length(tabRDATID)){
678 ##Putting the ones without duplicates in their new homes 688 ##Putting the ones without duplicates in their new homes
679 if(tabRDATID[j] == 1){ 689 if(tabRDATID[j] == 1){
680 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 690 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
681 } 691 } else if(tabRDATID[j] > 1){
682 ##Averaging duplicates and putting them in their new homes 692 ##Averaging duplicates and putting them in their new homes
683 else if(tabRDATID[j] > 1){
684 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 693 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
685 } 694 }
686 j <- j + 1 695 j <- j + 1
687 } 696 }
688 697
689 ##Scaling the Data 698 ##Scaling the Data
690 scrawdat <- NuRDATN%>% 699 scrawdat <- NuRDATN%>%
691 scale() 700 scale()
692 attr(scrawdat,"scaled:center") <- NULL 701 attr(scrawdat,"scaled:center") <- NULL
693 attr(scrawdat,"scaled:scale") <- NULL 702 attr(scrawdat,"scaled:scale") <- NULL
694 colnames(scrawdat) <- rownames(tabRDATID) 703 colnames(scrawdat) <- rownames(tabRDATID)
695 704
696 ##Discretized the Data 705 ##Discretized the Data
697 dialzdat <- scrawdat %>% 706 dialzdat <- scrawdat %>%
698 dndat(.) %>% 707 dndat(.) %>%
699 t()%>% 708 t()%>%
700 as.data.frame(.) 709 as.data.frame(.)
701 colnames(dialzdat) <- rownames(RAWDATNUM) 710 colnames(dialzdat) <- rownames(RAWDATNUM)
702 711
703 ##setting "ID_REF" as a new variable 712 ##setting "ID_REF" as a new variable
704 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1)) 713 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
705 colnames(geneNAM) <- "ID_REF" 714 colnames(geneNAM) <- "ID_REF"
706 rownames(dialzdat) <- NULL 715 rownames(dialzdat) <- NULL
707 dialzdat <-bind_cols(geneNAM,dialzdat) 716 dialzdat <-bind_cols(geneNAM,dialzdat)
708 717
709 ##NAs in a column 718 ##NAs in a column
710 x <- 2 719 x <- 2
711 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 720 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
712 nacol[1,1] = "COL_NAs" 721 nacol[1,1] = "COL_NAs"
713 for(x in 2:dim(dialzdat)[2]){ 722 for(x in 2:dim(dialzdat)[2]){
714 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 723 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
715 x <- x + 1 724 x <- x + 1
716 } 725 }
717 colnames(nacol) <- colnames(dialzdat) 726 colnames(nacol) <- colnames(dialzdat)
718 dialzdat <- bind_rows(dialzdat,nacol) 727 dialzdat <- bind_rows(dialzdat,nacol)
719 728
720 ##NAs in a row 729 ##NAs in a row
721 y <- 1 730 y <- 1
722 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 731 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
723 for(y in 1:dim(dialzdat)[1]){ 732 for(y in 1:dim(dialzdat)[1]){
724 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 733 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
725 y <- y + 1 734 y <- y + 1
726 } 735 }
727 colnames(narowd) <- "ROW_NAs" 736 colnames(narowd) <- "ROW_NAs"
728 dialzdat <- bind_cols(dialzdat,narowd) 737 dialzdat <- bind_cols(dialzdat,narowd)
729 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 738 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
730 colnames(RAWWORD) <- colnames(dialzdat) 739 colnames(RAWWORD) <- colnames(dialzdat)
731 ##converting to character so that the clinical can be brought together with discrete data 740 ##converting to character so that the clinical can be brought together with discrete data
732 k <- 2 741 k <- 2
733 for(k in 2:dim(dialzdat)[2]-1){ 742 for(k in 2:dim(dialzdat)[2]-1){
734 dialzdat[,k] <- as.character(dialzdat[,k]) 743 dialzdat[,k] <- as.character(dialzdat[,k])
735 k <- k + 1 744 k <- k + 1
1 ##Posted 6/15/2017 1 ##Posted 6/15/2017
2 2 options(digits = 11)
3 3
4 #Libraries required to run the code 4 #Libraries required to run the code
5 library(pryr) 5 library(pryr)
6 library(MASS) 6 library(MASS)
7 library(dplyr) 7 library(dplyr)
8 library(tidyr) 8 library(tidyr)
9 library(readr) 9 library(readr)
10 library(stringr) 10 library(stringr)
11 11
12 12
13 #Necessary Functions 13 #Necessary Functions
14 #1#Function for handling the changing of row names and column names 14 #1#Function for handling the changing of row names and column names
# Rename the columns of a transposed GEO sample-metadata table.
#
# mat: character matrix (or data frame) in which row 1 holds the GEO field
#      key of each column (e.g. "!Sample_title") and row 2 holds the first
#      sample's value for that field (e.g. "age: 80").
#
# Returns `mat` with only its column names changed:
#   * the three known keys become "Brain_Region", "Title" and "ID_REF";
#   * any other column whose row-2 text matches one of the keyword patterns
#     becomes "Sex<n>", "PMI<n>", "Age<n>", "Braak<n>" or "Group<n>", where
#     <n> counts the columns that received that label so far;
#   * every remaining column keeps its current name.
chngrownm <- function(mat) {
  n_cols <- ncol(mat)
  if (n_cols == 0L) {
    return(mat)
  }

  fixed_keys <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )

  # Listed from lowest to highest priority: when several patterns match the
  # same text (e.g. "braak stage" also contains "age"), the LAST one wins.
  patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  counters <- stats::setNames(rep(1L, length(patterns)), names(patterns))

  has_values <- nrow(mat) >= 2L
  new_names <- colnames(mat, do.NULL = FALSE, prefix = "col")

  for (j in seq_len(n_cols)) {
    key <- as.character(mat[1, j])

    # A missing key can never equal a known key, so it falls through to the
    # keyword patterns instead of making `if` fail on NA.
    if (!is.na(key) && key %in% names(fixed_keys)) {
      new_names[j] <- fixed_keys[[key]]
      next
    }
    if (!has_values) {
      next
    }

    hits <- vapply(patterns, grepl, logical(1), x = as.character(mat[2, j]))
    if (any(hits)) {
      # Only the label that is actually used consumes a number, so the
      # numbering of each label stays gap-free.
      label <- names(patterns)[max(which(hits))]
      new_names[j] <- paste0(label, counters[[label]])
      counters[[label]] <- counters[[label]] + 1L
    }
  }

  colnames(mat) <- new_names
  mat
}
60 58
61 #2#Function for reorganizing information within the columns 59 #2#Function for reorganizing information within the columns
# Clean the clinical values inside the columns labelled by chngrownm().
#
# mat: data frame (or matrix) of sample metadata whose column names contain
#      "Group", "Age", "Sex", "PMI" or "Braak".
#
# Column 1 is never touched. For every other column, depending on its name:
#   Group -> text up to and including the last ": " is removed, as is any
#            trailing " ...;..." part
#   Age   -> every non-digit is removed, result converted to integer
#   Sex   -> text up to and including the last ": " is removed
#   PMI   -> everything except digits and "." is removed, result converted
#            to numeric
#   Braak -> text up to and including the last ": " is removed, the rest is
#            read as a roman numeral and converted to integer
#
# Returns the modified `mat`.
cinfo <- function(mat) {
  # seq_len(n)[-1] is empty for a one-column input, whereas `2:n` would count
  # down to c(2, 1) and rewrite the first column as well.
  for (j in seq_len(ncol(mat))[-1]) {
    col_name <- colnames(mat)[j]

    if (grepl("Group", col_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", col_name)) {
      # NOTE(review): stripping every non-digit turns "80.5" into 805;
      # confirm that ages are always whole numbers in the input files.
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", col_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", col_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", col_name)) {
      mat[, j] <- as.integer(utils::as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
89 87
90 #3#Function for labeling the gene IDs without names 88 #3#Function for labeling the gene IDs without names
# Give every probe without a gene name its own ID as the name.
#
# GIDNAM: two-column table (data frame, tibble or matrix); column 1 holds the
#         gene/probe ID and column 2 the gene name.
#
# An entry of column 2 counts as unnamed when it is a real NA or the text
# "NA" optionally followed by whitespace. Each unnamed entry is replaced by
# the column-1 value of the same row. Returns the updated table; a table
# without rows is returned unchanged.
NAFIXING <- function(GIDNAM) {
  if (nrow(GIDNAM) == 0L) {
    return(GIDNAM)
  }

  is_df <- is.data.frame(GIDNAM)
  ids <- if (is_df) GIDNAM[[1]] else GIDNAM[, 1]
  gene_names <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]

  # One vectorised pass over the column instead of indexing the table once
  # per row.
  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)
  gene_names[unnamed] <- ids[unnamed]

  if (is_df) {
    GIDNAM[[2]] <- gene_names
  } else {
    GIDNAM[, 2] <- gene_names
  }
  GIDNAM
}
102 100
103 #4#Function for changing the gene ID to gene name 101 #4#Function for changing the gene ID to gene name
# Replace the probe IDs of an expression table by their gene names.
#
# GeneName: two-column table (matrix or data frame); column 1 = probe ID,
#           column 2 = gene name.
# DATA:     expression table whose FIRST COLUMN holds the probe IDs; the
#           remaining columns are left as they are.
#
# Returns t(DATA). Its first row holds, for every probe, the gene name of the
# first GeneName row with the same ID, or the original ID when the probe does
# not occur in GeneName.
cgeneID <- function(GeneName, DATA) {
  gene_tbl <- as.matrix(GeneName)
  nq <- t(DATA)
  if (nrow(nq) == 0L || ncol(nq) == 0L || nrow(gene_tbl) == 0L) {
    return(nq)
  }

  # IDs are compared as exact strings. Building a regular expression from
  # each ID would let "." match any character and would fail outright on IDs
  # containing "(" or "+". Surrounding whitespace is ignored because
  # converting a table with numeric columns to a character matrix pads the
  # numbers to a common width.
  probe_ids <- trimws(nq[1, ])
  hit <- match(probe_ids, trimws(gene_tbl[, 1]))
  found <- !is.na(hit) & !is.na(probe_ids)

  nq[1, found] <- gene_tbl[hit[found], 2]
  nq
}
120 #cgeneID <- function(GeneName,DATA){
121 # colGene <- dim(GeneName)[2]
122 # j <- 1
123 # for(j in 1:colGene){
124 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
125 # if(is.na(sum(chngsreq))==FALSE){
126 # if(sum(chngsreq) > 0){
127 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
128 # }
129 # }
130 # #if(sum(chngsreq) > 0){
131 # ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
132 # #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
133 # #}
134 # j = j+1
135 # }
136 # DATA
137 #}
122 138
123 #5#Function for adjusting the gene names 139 #5#Function for adjusting the gene names
# Reduce multi-gene column names to a single gene name.
#
# DiData: matrix or data frame whose column names may list several genes
#         separated by "///".
# usecol: position of the gene to keep within each "///"-separated list
#         (default 1). A name with fewer than `usecol` entries falls back to
#         its first entry.
#
# Returns a character vector with one whitespace-trimmed name per column
# (NA for an empty column name); character(0) when DiData has no columns.
gcnames <- function(DiData, usecol = 1) {
  if (ncol(DiData) == 0L) {
    return(character(0))
  }

  # Split every name once, then pick the requested entry from each list.
  pieces <- strsplit(colnames(DiData), "///")
  picked <- vapply(
    pieces,
    function(parts) {
      if (length(parts) >= usecol) parts[usecol] else parts[1]
    },
    character(1)
  )

  # "[\\h\\v]" also strips non-ASCII horizontal/vertical whitespace.
  trimws(picked, whitespace = "[\\h\\v]")
}
139 155
140 #6# Function for discretizing the data 156 #6# Function for discretizing the data
# Discretize a numeric matrix into three levels.
#
# NDATA: numeric matrix (or all-numeric data frame).
#
# Returns a numeric matrix of the same dimensions that keeps the column names
# of NDATA (row names are not carried over), with
#   0  where the value is below -1
#   1  where the value lies in [-1, 1]
#   2  where the value is above 1
# and missing values passed through unchanged.
dndat <- function(NDATA) {
  vals <- as.matrix(NDATA)

  # Start from the middle level so that both boundary values (-1 and 1) are
  # classified as 1; only the two outer ranges are overwritten below.
  DDATA <- matrix(1, nrow = nrow(vals), ncol = ncol(vals))
  colnames(DDATA) <- colnames(NDATA)

  # which() drops the NA comparisons produced by missing values.
  DDATA[which(vals < -1)] <- 0
  DDATA[which(vals > 1)] <- 2

  missing_val <- is.na(vals)
  DDATA[missing_val] <- vals[missing_val]
  DDATA
}
170 184
171 185
172 #The Rest of this code will be used every time you want to change a data set 186 #The Rest of this code will be used every time you want to change a data set
173 187
174 #Getting the series matrix file 188 #Getting the series matrix file
175 print("Choose the series matrix file that you want to Analyze") 189 print("Choose the series matrix file that you want to Analyze")
176 alz <- file.choose() 190 alz <- file.choose()
177 191
178 #Getting the GPL file 192 #Getting the GPL file
179 print("Choose the GPL file that correlates with the above series matrix file") 193 print("Choose the GPL file that correlates with the above series matrix file")
180 genena <- file.choose() 194 genena <- file.choose()
181 195
182 196
183 #Find out if it is a soft GPL file or not 197 #Find out if it is a soft GPL file or not
184 soft <- strsplit(genena,"[\\|/]") %>% 198 soft <- strsplit(genena,"[\\|/]") %>%
185 .[[1]] %>% 199 .[[1]] %>%
186 .[length(.)] %>% 200 .[length(.)] %>%
187 grepl("soft|annot",.) 201 grepl("soft|annot",.)
188 202
189 #Working with the wordy part of the document 203 #Working with the wordy part of the document
190 alzword <- alz %>% 204 alzword <- alz %>%
191 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 205 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
192 filter(grepl("!Sample",X1))%>% 206 filter(grepl("!Sample",X1))%>%
193 filter(!grepl("!Sample_contact",X1)) 207 filter(!grepl("!Sample_contact",X1))
194 208
195 ##Changing row names and column names: 209 ##Changing row names and column names:
196 ALZWORD <- t(alzword) 210 ALZWORD <- t(alzword)
197 rownames(ALZWORD)=NULL 211 rownames(ALZWORD)=NULL
198 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 212 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
199 ALZWORD <- chngrownm(ALZWORD)[-1,] 213 ALZWORD <- chngrownm(ALZWORD)[-1,]
200 ALZWORD <- ALZWORD%>% 214 ALZWORD <- ALZWORD%>%
201 as.data.frame()%>% 215 as.data.frame()%>%
202 dplyr::select(-starts_with("col")) 216 dplyr::select(-starts_with("col"))
203 217
204 ##Reorganizing information within the columns 218 ##Reorganizing information within the columns
205 ALZWORDF <- cinfo(ALZWORD) 219 ALZWORDF <- cinfo(ALZWORD)
206 220
207 221
208 #Working with Actual Data part of file 222 #Working with Actual Data part of file
209 alzdat <- alz %>% 223 alzdat <- alz %>%
210 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 224 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
211 ALZDAT <- t(alzdat[,-1]) 225 ALZDAT <- t(alzdat[,-1])
212 rownames(ALZDAT)=NULL 226 rownames(ALZDAT)=NULL
213 227
214 ##Is there a clean version of the GPL file available? 228 ##Is there a clean version of the GPL file available?
215 gplnum <- strsplit(genena,"[\\|/]") %>% 229 gplnum <- strsplit(genena,"[\\|/]") %>%
216 .[[1]] %>% 230 .[[1]] %>%
217 .[length(.)] %>% 231 .[length(.)] %>%
218 gsub("\\D","",.) 232 gsub("\\D","",.)
219 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 233 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
220 if(clfileex >= 1){ 234 if(clfileex >= 1){
221 #use the clean version 235 #use the clean version
222 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 236 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
223 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 237 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
224 238
225 } 239 } else if(clfileex == 0){
226 if(clfileex == 0){
227 ##Lets Create a clean version 240 ##Lets Create a clean version
228 241
229 ##Gene ID to Gene Name 242 ##Gene ID to Gene Name
230 if(soft == TRUE){ 243 if(soft == TRUE){
231 #Check to see if there is already a file containing information on soft files 244 #Check to see if there is already a file containing information on soft files
232 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 245 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
233 if(fileex == 1){ 246 if(fileex == 1){
234 #Check to see if this GPL soft file has been used before 247 #Check to see if this GPL soft file has been used before
235 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 248 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
236 .$GPL_FILE_NUM%>% 249 .$GPL_FILE_NUM%>%
237 grepl(gplnum,.) %>% 250 grepl(gplnum,.) %>%
238 sum() 251 sum()
239 if(IDF == 1){ 252 if(IDF == 1){
240 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 253 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
241 .$GPL_FILE_NUM%>% 254 .$GPL_FILE_NUM%>%
242 grep(gplnum,.) 255 grep(gplnum,.)
243 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 256 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
244 .$LOC_ID %>% 257 .$LOC_ID %>%
245 .[IDLOCAL] 258 .[IDLOCAL]
246 geneIDNam <- genena %>% 259 geneIDNam <- genena %>%
247 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 260 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
248 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 261 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
249 } 262 } else if(IDF == 0){
250 if(IDF == 0){
251 #No information on this particular GPL file 263 #No information on this particular GPL file
252 idLOCGPL <- genena %>% 264 idLOCGPL <- genena %>%
253 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 265 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
254 t(.) %>% 266 t(.) %>%
255 grep("^ID\\s*$",.) %>% 267 grep("^ID\\s*$",.) %>%
256 -1 268 -1
257 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 269 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
258 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 270 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
259 geneIDNam <- genena %>% 271 geneIDNam <- genena %>%
260 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 272 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
261 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 273 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
262 } 274 }
263 } 275 } else if(fileex == 0){
264 if(fileex == 0){
265 #We must create a file that we can access for later use 276 #We must create a file that we can access for later use
266 idLOCGPL <- genena %>% 277 idLOCGPL <- genena %>%
267 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 278 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
268 t(.) %>% 279 t(.) %>%
269 grep("^ID\\s*$",.) %>% 280 grep("^ID\\s*$",.) %>%
270 -1 281 -1
271 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 282 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
272 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 283 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
273 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 284 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
274 geneIDNam <- genena %>% 285 geneIDNam <- genena %>%
275 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 286 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
276 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 287 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
277 } 288 }
278 } 289 } else if(soft == FALSE){
279 if(soft == FALSE){
280 geneIDNam <- genena %>% 290 geneIDNam <- genena %>%
281 read_delim(delim="\t",comment = "#")%>% 291 read_delim(delim="\t",comment = "#")%>%
282 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 292 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
283 } 293 }
284 294
285 ##Labeling the gene IDs without names 295 ##Labeling the gene IDs without names
286 geneIDNam <- NAFIXING(geneIDNam) 296 geneIDNam <- NAFIXING(geneIDNam)
287 297
288 ##remove the whitespace 298 ##remove the whitespace
289 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 299 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
290 300
291 ##Here is the clean version 301 ##Here is the clean version
292 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 302 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
293 } 303 }
294 304
295 305
296 306
297 ##Changing the gene ID to gene name 307 ##Changing the gene ID to gene name
298 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) 308 ALZDAT1 <- cgeneID(geneIDNam,alzdat)
299 colnames(ALZDAT) = ALZDAT1[1,] 309 colnames(ALZDAT) = ALZDAT1[1,]
300 310
301 311
302 ##Adjusting the column names aka the gene names 312 ##Adjusting the column names aka the gene names
303 colnames(ALZDAT) <- gcnames(ALZDAT) 313 colnames(ALZDAT) <- gcnames(ALZDAT)
304 314
305 315
306 #Full RAW Data 316 #Full RAW Data
307 Fullalzdwr <- ALZDAT %>% 317 Fullalzdwr <- ALZDAT %>%
308 as.data.frame() %>% 318 as.data.frame() %>%
309 cbind(ALZWORDF,.) 319 cbind(ALZWORDF,.)
310 320
311 321
312 #Raw file is output 322 #Raw file is output
313 nfnaex <- strsplit(alz,"[\\]") %>% 323 nfnaex <- strsplit(alz,"[\\]") %>%
314 .[[1]] %>% 324 .[[1]] %>%
315 .[length(.)] %>% 325 .[length(.)] %>%
316 gsub("\\D","",.) %>% 326 gsub("\\D","",.) %>%
317 c("GSE",.,"aftexcel.txt") %>% 327 c("GSE",.,"aftexcel.txt") %>%
318 paste(collapse = "") 328 paste(collapse = "")
319 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 329 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
320 330
321 331
# Now for the discretization part
## get the wordy part again (transposed, so each clinical field is a row)
rawword <- t(ALZWORDF)

## where is ID_REF located
hereim <- grep("ID_REF", rownames(rawword))

## Subject Names GSM...
subjnam <- rawword[hereim, ]

## Getting the names for the rows (every field except ID_REF)
namedarows <- rownames(rawword)[-hereim] %>%
  as.data.frame()
## drop = FALSE keeps the matrix shape even if only one field remains
RAWWORD <- rawword[-hereim, , drop = FALSE] %>%
  as.data.frame() %>%
  bind_cols(namedarows, .)

## Count the missing entries in each row: real NA values plus cells whose
## text matches "NA".
## NOTE(review): grep("NA", ...) is a substring match, so text such as "RNA"
## is counted too -- confirm this is intended.
naroww <- as.data.frame(rep(0, dim(RAWWORD)[1]), stringsAsFactors = FALSE)
for (z in seq_len(dim(RAWWORD)[1])) {
  naroww[z, 1] <- sum(is.na(RAWWORD[z, ])) + length(grep("NA", RAWWORD[z, ]))
}

colnames(naroww) <- "ROW_NAs"
RAWWORD <- bind_cols(RAWWORD, naroww)
347 362
348 363
## Row names of the transposed data, kept as a one-column data frame
## named ID_REF
roALZna <- as.data.frame(rownames(t(ALZDAT)))
colnames(roALZna) <- "ID_REF"

## The transposed data itself, with its row and column names removed
RAWDAT <- as.data.frame(t(ALZDAT))
colnames(RAWDAT) <- NULL
rownames(RAWDAT) <- NULL

## Put ID_REF in front of the values and sort the rows by ID_REF
RAWDAT2 <- RAWDAT %>%
  cbind(roALZna, .) %>%
  dplyr::arrange(ID_REF)

## Editing the file for R processing
### first column (ID_REF) as a one-column matrix
RAWDATID <- as.matrix(RAWDAT2[, 1])

### remaining columns converted with as.numeric, then transposed
RAWDATNUM <- t(mapply(FUN = as.numeric, RAWDAT2[, -1]))
370 385
## Consolidating genes with the same name
### one output column per distinct ID in RAWDATID (table() lists them sorted)
tabRDATID <- table(RAWDATID)
NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
for (j in seq_along(tabRDATID)) {
  ### columns of RAWDATNUM that carry the j-th ID
  genecols <- which(RAWDATID == rownames(tabRDATID)[j])

  if (tabRDATID[j] == 1) {
    ## Putting the ones without duplicates in their new homes
    NuRDATN[, j] <- RAWDATNUM[, genecols]
  } else if (tabRDATID[j] > 1) {
    ## Averaging duplicates and putting them in their new homes.
    ## drop = FALSE keeps a matrix even when RAWDATNUM has a single row;
    ## without it rowMeans() would receive a plain vector and fail.
    NuRDATN[, j] <- rowMeans(RAWDATNUM[, genecols, drop = FALSE], na.rm = TRUE)
  }
}
388 402
## Scaling the Data
### scale() is applied column by column; the centering/scaling attributes it
### attaches are removed so that only the plain matrix is kept
scrawdat <- scale(NuRDATN)
attr(scrawdat, "scaled:center") <- NULL
attr(scrawdat, "scaled:scale") <- NULL
### the columns are labelled with the IDs listed in tabRDATID
colnames(scrawdat) <- rownames(tabRDATID)
395 409
## Discretized the Data
### dndat() (defined earlier in this file) is applied to the scaled matrix;
### the result is transposed and stored as a data frame
dialzdat <- as.data.frame(t(dndat(scrawdat)))
colnames(dialzdat) <- rownames(RAWDATNUM)

## setting "ID_REF" as a new variable
### the row names move into a leading ID_REF column and are then removed
geneNAM <- as.data.frame(as.matrix(rownames(dialzdat)))
colnames(geneNAM) <- "ID_REF"
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(geneNAM, dialzdat)
408 422
## NAs in a column
### nacol is a one-row data frame: the label "COL_NAs" in the first column,
### then the number of NA values found in each remaining column of dialzdat
nacol <- as.data.frame(t(rep(0, dim(dialzdat)[2])), stringsAsFactors = FALSE)
nacol[1, 1] <- "COL_NAs"
### seq_len(n)[-1] is 2..n and is empty when there is only one column
### (2:n would count backwards in that case)
for (x in seq_len(dim(dialzdat)[2])[-1]) {
  nacol[1, x] <- as.integer(sum(is.na(dialzdat[, x])))
}
colnames(nacol) <- colnames(dialzdat)
### the counts are appended as an extra bottom row
dialzdat <- bind_rows(dialzdat, nacol)
419 433
## NAs in a row
### number of NA values in every row of dialzdat (including the COL_NAs row
### appended above), computed in one vectorised pass instead of a row loop
narowd <- data.frame(ROW_NAs = unname(rowSums(is.na(dialzdat))))
dialzdat <- bind_cols(dialzdat, narowd)
### every column between the first and the last (ROW_NAs) is named after
### the subjects
colnames(dialzdat)[2:(dim(dialzdat)[2] - 1)] <- subjnam
### the clinical table takes the same column names
colnames(RAWWORD) <- colnames(dialzdat)
431 ##converting to character so that the clinical can be brought together with discrete data 445 ##converting to character so that the clinical can be brought together with discrete data
432 k <- 2 446 k <- 2
433 for(k in 2:dim(dialzdat)[2]-1){ 447 for(k in 2:dim(dialzdat)[2]-1){
434 dialzdat[,k] <- as.character(dialzdat[,k]) 448 dialzdat[,k] <- as.character(dialzdat[,k])