Commit 48208ef1ad8e4050e3d595fb12b0e20a87d174e4

Authored by Efrain Gonzalez
1 parent 7360830df3
Exists in master

Update: added `|^UCSC_RefGene_Name$` to the regex of candidate gene-name column names matched in the GPL file

Showing 1 changed file with 8 additions and 8 deletions   Show diff stats
1 1
2 #Efrain H. Gonzalez 2 #Efrain H. Gonzalez
3 #6/22/2017 3 #6/22/2017
4 options(digits = 11) 4 options(digits = 11)
5 #Libraries required to run the code 5 #Libraries required to run the code
6 library(pryr) 6 library(pryr)
7 library(MASS) 7 library(MASS)
8 library(dplyr) 8 library(dplyr)
9 library(tidyr) 9 library(tidyr)
10 library(readr) 10 library(readr)
11 library(stringr) 11 library(stringr)
12 12
13 13
14 #Necessary Functions 14 #Necessary Functions
15 #1#Function for handling the changing of row names and column names 15 #1#Function for handling the changing of row names and column names
16 chngrownm <- function(mat){ 16 chngrownm <- function(mat){
17 row <- dim(mat)[1] 17 row <- dim(mat)[1]
18 col <- dim(mat)[2] 18 col <- dim(mat)[2]
19 e <- 1 19 e <- 1
20 r <- 1 20 r <- 1
21 a <- 1 21 a <- 1
22 h <- 1 22 h <- 1
23 g <- 1 23 g <- 1
24 o <- 1 24 o <- 1
25 for(e in 1:col){ 25 for(e in 1:col){
26 if("!Sample_source_name_ch1"==mat[1,e]){ 26 if("!Sample_source_name_ch1"==mat[1,e]){
27 colnames(mat)[e] <- "Brain_Region" 27 colnames(mat)[e] <- "Brain_Region"
28 } else if("!Sample_title" == mat[1,e]){ 28 } else if("!Sample_title" == mat[1,e]){
29 colnames(mat)[e] <- "Title" 29 colnames(mat)[e] <- "Title"
30 } else if("!Sample_geo_accession" == mat[1,e]){ 30 } else if("!Sample_geo_accession" == mat[1,e]){
31 colnames(mat)[e] <- "ID_REF" 31 colnames(mat)[e] <- "ID_REF"
32 } else{ 32 } else{
33 if(grepl("Sex|gender|Gender|sex",mat[2,e])==TRUE){ 33 if(grepl("Sex|gender|Gender|sex",mat[2,e])==TRUE){
34 colnames(mat)[e] <- paste0("Sex",r) 34 colnames(mat)[e] <- paste0("Sex",r)
35 r = r + 1 35 r = r + 1
36 } 36 }
37 if(grepl("postmorteminterval|PMI|pmi|interval",mat[2,e])==TRUE){ 37 if(grepl("postmorteminterval|PMI|pmi|interval",mat[2,e])==TRUE){
38 colnames(mat)[e] <- paste0("PMI",a) 38 colnames(mat)[e] <- paste0("PMI",a)
39 a = a + 1 39 a = a + 1
40 } 40 }
41 if(grepl("age|Age|AGE",mat[2,e])==TRUE){ 41 if(grepl("age|Age|AGE",mat[2,e])==TRUE){
42 colnames(mat)[e] <- paste0("Age",h) 42 colnames(mat)[e] <- paste0("Age",h)
43 h = h + 1 43 h = h + 1
44 } 44 }
45 if(grepl("braak|b&b",mat[2,e])==TRUE){ 45 if(grepl("braak|b&b",mat[2,e])==TRUE){
46 colnames(mat)[e] <- paste0("Braak",g) 46 colnames(mat)[e] <- paste0("Braak",g)
47 g = g + 1 47 g = g + 1
48 } 48 }
49 if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){ 49 if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){
50 colnames(mat)[e] <- paste0("Group",o) 50 colnames(mat)[e] <- paste0("Group",o)
51 o = o + 1 51 o = o + 1
52 } 52 }
53 53
54 } 54 }
55 e = e + 1 55 e = e + 1
56 } 56 }
57 mat 57 mat
58 } 58 }
59 59
60 #2#Function for reorganizing information within the columns 60 #2#Function for reorganizing information within the columns
61 cinfo <- function(mat){ 61 cinfo <- function(mat){
62 col <- dim(mat)[2] 62 col <- dim(mat)[2]
63 j <-2 63 j <-2
64 for(j in 2:col){ 64 for(j in 2:col){
65 if(grepl("Group",colnames(mat)[j]) == TRUE){ 65 if(grepl("Group",colnames(mat)[j]) == TRUE){
66 mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j]) 66 mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
67 } else if(grepl("Age",colnames(mat)[j])==TRUE){ 67 } else if(grepl("Age",colnames(mat)[j])==TRUE){
68 mat[,j] <- gsub("\\D","",mat[,j])%>% 68 mat[,j] <- gsub("\\D","",mat[,j])%>%
69 as.integer() 69 as.integer()
70 } else if(grepl("Sex",colnames(mat)[j])==TRUE){ 70 } else if(grepl("Sex",colnames(mat)[j])==TRUE){
71 mat[,j] <- gsub(".+:\\s","",mat[,j]) 71 mat[,j] <- gsub(".+:\\s","",mat[,j])
72 } else if(grepl("PMI",colnames(mat)[j])==TRUE){ 72 } else if(grepl("PMI",colnames(mat)[j])==TRUE){
73 mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>% 73 mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
74 as.numeric() 74 as.numeric()
75 } else if(grepl("Braak",colnames(mat)[j])==TRUE){ 75 } else if(grepl("Braak",colnames(mat)[j])==TRUE){
76 mat[,j]<-gsub(".+:\\s","",mat[,j])%>% 76 mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
77 as.roman()%>% 77 as.roman()%>%
78 as.integer() 78 as.integer()
79 } 79 }
80 j=j+1 80 j=j+1
81 } 81 }
82 mat 82 mat
83 } 83 }
84 84
85 #3#Function for labeling the gene IDs without names 85 #3#Function for labeling the gene IDs without names
86 NAFIXING <- function(GIDNAM){ 86 NAFIXING <- function(GIDNAM){
87 row <- dim(GIDNAM)[1] 87 row <- dim(GIDNAM)[1]
88 i <- 1 88 i <- 1
89 for(i in 1:row){ 89 for(i in 1:row){
90 if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){ 90 if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){
91 GIDNAM[i,2] <- GIDNAM[i,1] 91 GIDNAM[i,2] <- GIDNAM[i,1]
92 } 92 }
93 i <- i + 1 93 i <- i + 1
94 } 94 }
95 GIDNAM 95 GIDNAM
96 } 96 }
97 97
98 #4#Function for changing the gene ID to gene name 98 #4#Function for changing the gene ID to gene name
99 cgeneID <- function(GeneName,DATA){ 99 cgeneID <- function(GeneName,DATA){
100 nj <- t(GeneName) 100 nj <- t(GeneName)
101 nq <- t(DATA) 101 nq <- t(DATA)
102 colGene <- dim(nj)[2] 102 colGene <- dim(nj)[2]
103 colDATA <- dim(nq)[2] 103 colDATA <- dim(nq)[2]
104 j <- 1 104 j <- 1
105 for(j in 1:colDATA){ 105 for(j in 1:colDATA){
106 #where is that gene id located within the GPL file 106 #where is that gene id located within the GPL file
107 chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,]) 107 chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,])
108 if(is.na(sum(chngreq))==FALSE){ 108 if(is.na(sum(chngreq))==FALSE){
109 if(sum(chngreq) > 0){ 109 if(sum(chngreq) > 0){
110 nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j]) 110 nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j])
111 } 111 }
112 } 112 }
113 j <- j + 1 113 j <- j + 1
114 } 114 }
115 nq 115 nq
116 } 116 }
117 #cgeneID <- function(GeneName,DATA){ 117 #cgeneID <- function(GeneName,DATA){
118 # colGene <- dim(GeneName)[2] 118 # colGene <- dim(GeneName)[2]
119 # j <- 1 119 # j <- 1
120 # for(j in 1:colGene){ 120 # for(j in 1:colGene){
121 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) 121 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
122 # if(is.na(sum(chngsreq))==FALSE){ 122 # if(is.na(sum(chngsreq))==FALSE){
123 # if(sum(chngsreq) > 0){ 123 # if(sum(chngsreq) > 0){
124 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) 124 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
125 # } 125 # }
126 # } 126 # }
127 # j = j+1 127 # j = j+1
128 # } 128 # }
129 # DATA 129 # DATA
130 #} 130 #}
131 131
132 #5#Function for adjusting the gene names 132 #5#Function for adjusting the gene names
133 gcnames <- function(DiData,usecol=1){ 133 gcnames <- function(DiData,usecol=1){
134 nuruns <- dim(DiData)[2] 134 nuruns <- dim(DiData)[2]
135 i = 1 135 i = 1
136 nwnam <- rep("0",length.out=nuruns) 136 nwnam <- rep("0",length.out=nuruns)
137 for(i in 1:nuruns){ 137 for(i in 1:nuruns){
138 if(length(strsplit(colnames(DiData)[i],"///|//")[[1]]) >= usecol){ 138 if(length(strsplit(colnames(DiData)[i],"///|//")[[1]]) >= usecol){
139 nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///|//")[[1]][usecol]) 139 nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///|//")[[1]][usecol])
140 } else{ 140 } else{
141 nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///|//")[[1]][1]) 141 nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///|//")[[1]][1])
142 } 142 }
143 143
144 } 144 }
145 nwnam 145 nwnam
146 146
147 } 147 }
148 148
149 #6# Function for discretizing the data 149 #6# Function for discretizing the data
150 dndat <- function(NDATA){ 150 dndat <- function(NDATA){
151 rownd <- dim(NDATA)[1] 151 rownd <- dim(NDATA)[1]
152 colnd <- dim(NDATA)[2] 152 colnd <- dim(NDATA)[2]
153 DDATA <- matrix(0,nrow=rownd,ncol=colnd) 153 DDATA <- matrix(0,nrow=rownd,ncol=colnd)
154 colnames(DDATA) <- colnames(NDATA) 154 colnames(DDATA) <- colnames(NDATA)
155 i <- 1 155 i <- 1
156 for(i in 1:rownd){ 156 for(i in 1:rownd){
157 j <- 1 157 j <- 1
158 for(j in 1:colnd){ 158 for(j in 1:colnd){
159 if(is.na(NDATA[i,j])==FALSE){ 159 if(is.na(NDATA[i,j])==FALSE){
160 160
161 if(NDATA[i,j] < -1){ 161 if(NDATA[i,j] < -1){
162 DDATA[i,j]=0L 162 DDATA[i,j]=0L
163 } else if(NDATA[i,j] > 1){ 163 } else if(NDATA[i,j] > 1){
164 DDATA[i,j]=2L 164 DDATA[i,j]=2L
165 } else if(-1 <= NDATA[i,j] && NDATA[i,j] <= 1){ 165 } else if(-1 <= NDATA[i,j] && NDATA[i,j] <= 1){
166 DDATA[i,j]=1L 166 DDATA[i,j]=1L
167 } 167 }
168 } else{ 168 } else{
169 DDATA[i,j] = NDATA[i,j] 169 DDATA[i,j] = NDATA[i,j]
170 } 170 }
171 j = j + 1 171 j = j + 1
172 } 172 }
173 i = i + 1 173 i = i + 1
174 } 174 }
175 DDATA 175 DDATA
176 } 176 }
177 177
178 178
179 #MajorFunction#This is the function that does everything else 179 #MajorFunction#This is the function that does everything else
180 THEFT <- function(){ 180 THEFT <- function(){
181 #Set working directory based on the directory of the series matrix file Currently only works for windows 181 #Set working directory based on the directory of the series matrix file Currently only works for windows
182 wd <- getwd() 182 wd <- getwd()
183 #list.files() 183 #list.files()
184 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?") 184 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?")
185 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L) 185 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
186 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files()) 186 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())
187 GSEfloc <- list.files()[GSEfileloc] 187 GSEfloc <- list.files()[GSEfileloc]
188 #ALL DATA FILES WILL BE CLEANED 188 #ALL DATA FILES WILL BE CLEANED
189 if(numDAT == 1){ 189 if(numDAT == 1){
190 #indexing the data files 190 #indexing the data files
191 n <- 1 191 n <- 1
192 for(n in 1: length(GSEfloc)){ 192 for(n in 1: length(GSEfloc)){
193 alz <- GSEfloc[n] 193 alz <- GSEfloc[n]
194 194
195 #Working with the wordy part of the document 195 #Working with the wordy part of the document
196 alzword <- alz %>% 196 alzword <- alz %>%
197 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 197 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
198 filter(grepl("!Sample",X1))%>% 198 filter(grepl("!Sample",X1))%>%
199 filter(!grepl("!Sample_contact",X1)) 199 filter(!grepl("!Sample_contact",X1))
200 200
201 #Getting the GPL file 201 #Getting the GPL file
202 genena <- grep("_platform_id",alzword$X1) %>% 202 genena <- grep("_platform_id",alzword$X1) %>%
203 alzword$X2[.] %>% 203 alzword$X2[.] %>%
204 str_trim(.) %>% 204 str_trim(.) %>%
205 paste0("^",.,"\\D") %>% 205 paste0("^",.,"\\D") %>%
206 grep(.,list.files()) %>% 206 grep(.,list.files()) %>%
207 list.files()[.] 207 list.files()[.]
208 208
209 #Find out if it is a soft GPL file or not 209 #Find out if it is a soft GPL file or not
210 soft <- strsplit(genena,"[\\|/]") %>% 210 soft <- strsplit(genena,"[\\|/]") %>%
211 .[[1]] %>% 211 .[[1]] %>%
212 .[length(.)] %>% 212 .[length(.)] %>%
213 grepl("soft",.) 213 grepl("soft",.)
214 214
215 ##Changing row names and column names: 215 ##Changing row names and column names:
216 ALZWORD <- t(alzword) 216 ALZWORD <- t(alzword)
217 rownames(ALZWORD)=NULL 217 rownames(ALZWORD)=NULL
218 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 218 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
219 ALZWORD <- chngrownm(ALZWORD)[-1,] 219 ALZWORD <- chngrownm(ALZWORD)[-1,]
220 ALZWORD <- ALZWORD%>% 220 ALZWORD <- ALZWORD%>%
221 as.data.frame(.,stringsAsFactors = FALSE)%>% 221 as.data.frame(.,stringsAsFactors = FALSE)%>%
222 dplyr::select(-starts_with("col")) 222 dplyr::select(-starts_with("col"))
223 223
224 ##Reorganizing information within the columns and final clinical data 224 ##Reorganizing information within the columns and final clinical data
225 ALZWORDF <- cinfo(ALZWORD) 225 ALZWORDF <- cinfo(ALZWORD)
226 226
227 227
228 #Working with Actual Data part of file 228 #Working with Actual Data part of file
229 alzdat <- alz %>% 229 alzdat <- alz %>%
230 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 230 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
231 ALZDAT <- t(alzdat[,-1]) 231 ALZDAT <- t(alzdat[,-1])
232 rownames(ALZDAT)=NULL 232 rownames(ALZDAT)=NULL
233 233
234 ##Is there a clean version of the GPL file available? 234 ##Is there a clean version of the GPL file available?
235 gplnum <- strsplit(genena,"[\\|/]") %>% 235 gplnum <- strsplit(genena,"[\\|/]") %>%
236 .[[1]] %>% 236 .[[1]] %>%
237 .[length(.)] %>% 237 .[length(.)] %>%
238 gsub("\\D","",.) 238 gsub("\\D","",.)
239 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 239 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
240 if(clfileex >= 1){ 240 if(clfileex >= 1){
241 #use the clean version 241 #use the clean version
242 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 242 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
243 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 243 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
244 244
245 } else if(clfileex == 0){ 245 } else if(clfileex == 0){
246 ##Lets Create a clean version 246 ##Lets Create a clean version
247 247
248 ##Gene ID to Gene Name 248 ##Gene ID to Gene Name
249 if(soft == TRUE){ 249 if(soft == TRUE){
250 #Check to see if there is already a file containing information on soft files 250 #Check to see if there is already a file containing information on soft files
251 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 251 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
252 if(fileex == 1){ 252 if(fileex == 1){
253 #Check to see if this GPL soft file has been used before 253 #Check to see if this GPL soft file has been used before
254 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 254 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
255 .$GPL_FILE_NUM%>% 255 .$GPL_FILE_NUM%>%
256 grepl(gplnum,.) %>% 256 grepl(gplnum,.) %>%
257 sum() 257 sum()
258 if(IDF == 1){ 258 if(IDF == 1){
259 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 259 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
260 .$GPL_FILE_NUM%>% 260 .$GPL_FILE_NUM%>%
261 grep(gplnum,.) 261 grep(gplnum,.)
262 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 262 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
263 .$LOC_ID %>% 263 .$LOC_ID %>%
264 .[IDLOCAL] 264 .[IDLOCAL]
265 geneIDNam <- genena %>% 265 geneIDNam <- genena %>%
266 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 266 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
267 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$",colnames(.))) 267 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.)))
268 } else if(IDF == 0){ 268 } else if(IDF == 0){
269 #No information on this particular GPL file 269 #No information on this particular GPL file
270 idLOCGPL <- genena %>% 270 idLOCGPL <- genena %>%
271 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 271 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
272 t(.) %>% 272 t(.) %>%
273 grep("^ID\\s*$",.) %>% 273 grep("^ID\\s*$",.) %>%
274 -1 274 -1
275 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 275 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
276 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 276 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
277 geneIDNam <- genena %>% 277 geneIDNam <- genena %>%
278 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 278 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
279 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$",colnames(.))) 279 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.)))
280 } 280 }
281 } else if(fileex == 0){ 281 } else if(fileex == 0){
282 #We must create a file that we can access for later use 282 #We must create a file that we can access for later use
283 idLOCGPL <- genena %>% 283 idLOCGPL <- genena %>%
284 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 284 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
285 t(.) %>% 285 t(.) %>%
286 grep("^ID\\s*$",.) %>% 286 grep("^ID\\s*$",.) %>%
287 -1 287 -1
288 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 288 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
289 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 289 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
290 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 290 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
291 geneIDNam <- genena %>% 291 geneIDNam <- genena %>%
292 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 292 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
293 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$",colnames(.))) 293 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.)))
294 } 294 }
295 } else if(soft == FALSE){ 295 } else if(soft == FALSE){
296 geneIDNam <- genena %>% 296 geneIDNam <- genena %>%
297 read_delim(delim="\t",comment = "#")%>% 297 read_delim(delim="\t",comment = "#")%>%
298 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$",colnames(.))) 298 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.)))
299 } 299 }
300 300
301 ##Labeling the gene IDs without names 301 ##Labeling the gene IDs without names
302 geneIDNam <- NAFIXING(geneIDNam) 302 geneIDNam <- NAFIXING(geneIDNam)
303 303
304 ##remove the whitespace 304 ##remove the whitespace
305 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 305 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
306 306
307 ##Here is the clean version 307 ##Here is the clean version
308 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 308 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
309 } 309 }
310 310
311 311
312 312
313 ##Changing the gene ID to gene name 313 ##Changing the gene ID to gene name
314 ALZDAT1 <- cgeneID(geneIDNam,alzdat) 314 ALZDAT1 <- cgeneID(geneIDNam,alzdat)
315 colnames(ALZDAT) = ALZDAT1[1,] 315 colnames(ALZDAT) = ALZDAT1[1,]
316 316
317 317
318 ##Adjusting the column names aka the gene names 318 ##Adjusting the column names aka the gene names
319 colnames(ALZDAT) <- gcnames(ALZDAT) 319 colnames(ALZDAT) <- gcnames(ALZDAT)
320 320
321 321
322 #Full RAW Data 322 #Full RAW Data
323 Fullalzdwr <- ALZDAT %>% 323 Fullalzdwr <- ALZDAT %>%
324 as.data.frame(.,stringsAsFactors = FALSE) %>% 324 as.data.frame(.,stringsAsFactors = FALSE) %>%
325 cbind(ALZWORDF,.) 325 cbind(ALZWORDF,.)
326 326
327 #Raw file is output 327 #Raw file is output
328 nfnaex <- strsplit(alz,"[\\]") %>% 328 nfnaex <- strsplit(alz,"[\\]") %>%
329 .[[1]] %>% 329 .[[1]] %>%
330 .[length(.)] %>% 330 .[length(.)] %>%
331 gsub("\\D","",.) %>% 331 gsub("\\D","",.) %>%
332 c("GSE",.,"aftexcel.txt") %>% 332 c("GSE",.,"aftexcel.txt") %>%
333 paste(collapse = "") 333 paste(collapse = "")
334 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 334 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
335 335
336 336
337 337
338 #Now for the discretization part 338 #Now for the discretization part
339 ##get the wordy part again 339 ##get the wordy part again
340 rawword <- t(ALZWORDF) 340 rawword <- t(ALZWORDF)
341 341
342 ##where is ID_REF located 342 ##where is ID_REF located
343 hereim <- grep("ID_REF",rownames(rawword)) 343 hereim <- grep("ID_REF",rownames(rawword))
344 344
345 ##Subject Names GSM... 345 ##Subject Names GSM...
346 subjnam <- rawword[hereim,] 346 subjnam <- rawword[hereim,]
347 347
348 ##Getting the names for the rows 348 ##Getting the names for the rows
349 namedarows <- rownames(rawword)[-hereim] %>% 349 namedarows <- rownames(rawword)[-hereim] %>%
350 as.data.frame(.,stringsAsFactors = FALSE) 350 as.data.frame(.,stringsAsFactors = FALSE)
351 RAWWORD <- rawword[-hereim,] %>% 351 RAWWORD <- rawword[-hereim,] %>%
352 as.data.frame(.,stringsAsFactors = FALSE) %>% 352 as.data.frame(.,stringsAsFactors = FALSE) %>%
353 bind_cols(namedarows,.) 353 bind_cols(namedarows,.)
354 z <- 1 354 z <- 1
355 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 355 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
356 for(z in 1:dim(RAWWORD)[1]){ 356 for(z in 1:dim(RAWWORD)[1]){
357 if(sum(is.na(RAWWORD[z,])) > 0){ 357 if(sum(is.na(RAWWORD[z,])) > 0){
358 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 358 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
359 } 359 }
360 if(length(grep("NA",RAWWORD[z,])) > 0){ 360 if(length(grep("NA",RAWWORD[z,])) > 0){
361 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] 361 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
362 } 362 }
363 z <- z + 1 363 z <- z + 1
364 } 364 }
365 365
366 colnames(naroww) <- "ROW_NAs" 366 colnames(naroww) <- "ROW_NAs"
367 RAWWORD <- bind_cols(RAWWORD,naroww) 367 RAWWORD <- bind_cols(RAWWORD,naroww)
368 368
369 369
370 roALZna <- t(ALZDAT) %>% 370 roALZna <- t(ALZDAT) %>%
371 rownames(.) %>% 371 rownames(.) %>%
372 as.data.frame(.,stringsAsFactors = FALSE) 372 as.data.frame(.,stringsAsFactors = FALSE)
373 colnames(roALZna) <- "ID_REF" 373 colnames(roALZna) <- "ID_REF"
374 374
375 RAWDAT <- t(ALZDAT) %>% 375 RAWDAT <- t(ALZDAT) %>%
376 as.data.frame(.,stringsAsFactors = FALSE) 376 as.data.frame(.,stringsAsFactors = FALSE)
377 colnames(RAWDAT) <- NULL 377 colnames(RAWDAT) <- NULL
378 rownames(RAWDAT) <- NULL 378 rownames(RAWDAT) <- NULL
379 379
380 RAWDAT2 <- RAWDAT %>% 380 RAWDAT2 <- RAWDAT %>%
381 cbind(roALZna,.) %>% 381 cbind(roALZna,.) %>%
382 dplyr::arrange(.,ID_REF) 382 dplyr::arrange(.,ID_REF)
383 383
384 ##Editing the file for R processing 384 ##Editing the file for R processing
385 RAWDATID <- RAWDAT2[,1] %>% 385 RAWDATID <- RAWDAT2[,1] %>%
386 as.matrix(.) 386 as.matrix(.)
387 387
388 RAWDATNUM <- RAWDAT2[,-1] %>% 388 RAWDATNUM <- RAWDAT2[,-1] %>%
389 mapply(.,FUN = as.numeric) %>% 389 mapply(.,FUN = as.numeric) %>%
390 t(.) 390 t(.)
391 391
392 ##Consolidating genes with the same name 392 ##Consolidating genes with the same name
393 ###create empty matrix of size equal to tabRDATID 393 ###create empty matrix of size equal to tabRDATID
394 tabRDATID <- table(RAWDATID) 394 tabRDATID <- table(RAWDATID)
395 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 395 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
396 j <- 1 396 j <- 1
397 for(j in 1:length(tabRDATID)){ 397 for(j in 1:length(tabRDATID)){
398 ##Putting the ones without duplicates in their new homes 398 ##Putting the ones without duplicates in their new homes
399 if(tabRDATID[j] == 1){ 399 if(tabRDATID[j] == 1){
400 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 400 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
401 } else if(tabRDATID[j] > 1){ 401 } else if(tabRDATID[j] > 1){
402 ##Averaging duplicates and putting them in their new homes 402 ##Averaging duplicates and putting them in their new homes
403 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 403 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
404 } 404 }
405 j <- j + 1 405 j <- j + 1
406 } 406 }
407 407
408 408
409 ##Outputting non Z-score Average over genes 409 ##Outputting non Z-score Average over genes
410 newoutput <-NuRDATN 410 newoutput <-NuRDATN
411 colnames(newoutput) <- rownames(tabRDATID) 411 colnames(newoutput) <- rownames(tabRDATID)
412 nfnewout <- strsplit(alz,"[\\]") %>% 412 nfnewout <- strsplit(alz,"[\\]") %>%
413 .[[1]] %>% 413 .[[1]] %>%
414 .[length(.)] %>% 414 .[length(.)] %>%
415 gsub("\\D","",.) %>% 415 gsub("\\D","",.) %>%
416 c("GSE",.,"avg.txt") %>% 416 c("GSE",.,"avg.txt") %>%
417 paste(collapse = "") 417 paste(collapse = "")
418 noutput <- newoutput %>% 418 noutput <- newoutput %>%
419 t()%>% 419 t()%>%
420 as.data.frame(.,stringsAsFactors = FALSE) 420 as.data.frame(.,stringsAsFactors = FALSE)
421 noutput <- cbind(rownames(noutput),noutput) 421 noutput <- cbind(rownames(noutput),noutput)
422 colnames(noutput) <- c("Gene Symbol",subjnam) 422 colnames(noutput) <- c("Gene Symbol",subjnam)
423 rownames(noutput) <- NULL 423 rownames(noutput) <- NULL
424 write.table(noutput, file = nfnewout, sep = "\t",col.names = TRUE,row.names = FALSE) 424 write.table(noutput, file = nfnewout, sep = "\t",col.names = TRUE,row.names = FALSE)
425 425
426 426
427 ##Scaling the Data 427 ##Scaling the Data
428 scrawdat <- NuRDATN%>% 428 scrawdat <- NuRDATN%>%
429 scale() 429 scale()
430 attr(scrawdat,"scaled:center") <- NULL 430 attr(scrawdat,"scaled:center") <- NULL
431 attr(scrawdat,"scaled:scale") <- NULL 431 attr(scrawdat,"scaled:scale") <- NULL
432 colnames(scrawdat) <- rownames(tabRDATID) 432 colnames(scrawdat) <- rownames(tabRDATID)
433 433
434 #Outputting the Z-score file 434 #Outputting the Z-score file
435 nfnzsc <- strsplit(alz,"[\\]") %>% 435 nfnzsc <- strsplit(alz,"[\\]") %>%
436 .[[1]] %>% 436 .[[1]] %>%
437 .[length(.)] %>% 437 .[length(.)] %>%
438 gsub("\\D","",.) %>% 438 gsub("\\D","",.) %>%
439 c("GSE",.,"zscore.txt") %>% 439 c("GSE",.,"zscore.txt") %>%
440 paste(collapse = "") 440 paste(collapse = "")
441 zscraw <- scrawdat %>% 441 zscraw <- scrawdat %>%
442 t()%>% 442 t()%>%
443 as.data.frame(.,stringsAsFactors = FALSE) 443 as.data.frame(.,stringsAsFactors = FALSE)
444 zscraw <- cbind(rownames(zscraw),zscraw) 444 zscraw <- cbind(rownames(zscraw),zscraw)
445 colnames(zscraw) <- c("Gene Symbol",subjnam) 445 colnames(zscraw) <- c("Gene Symbol",subjnam)
446 rownames(zscraw) <- NULL 446 rownames(zscraw) <- NULL
447 write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = FALSE) 447 write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = FALSE)
448 448
449 449
450 ##Discretized the Data 450 ##Discretized the Data
451 dialzdat <- scrawdat %>% 451 dialzdat <- scrawdat %>%
452 dndat(.) %>% 452 dndat(.) %>%
453 t()%>% 453 t()%>%
454 as.data.frame(.,stringsAsFactors = FALSE) 454 as.data.frame(.,stringsAsFactors = FALSE)
455 colnames(dialzdat) <- rownames(RAWDATNUM) 455 colnames(dialzdat) <- rownames(RAWDATNUM)
456 456
457 ##setting "ID_REF" as a new variable 457 ##setting "ID_REF" as a new variable
458 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE) 458 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)
459 colnames(geneNAM) <- "ID_REF" 459 colnames(geneNAM) <- "ID_REF"
460 rownames(dialzdat) <- NULL 460 rownames(dialzdat) <- NULL
461 dialzdat <-bind_cols(geneNAM,dialzdat) 461 dialzdat <-bind_cols(geneNAM,dialzdat)
462 462
463 ##NAs in a column 463 ##NAs in a column
464 x <- 2 464 x <- 2
465 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 465 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
466 nacol[1,1] = "COL_NAs" 466 nacol[1,1] = "COL_NAs"
467 for(x in 2:dim(dialzdat)[2]){ 467 for(x in 2:dim(dialzdat)[2]){
468 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 468 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
469 x <- x + 1 469 x <- x + 1
470 } 470 }
471 colnames(nacol) <- colnames(dialzdat) 471 colnames(nacol) <- colnames(dialzdat)
472 dialzdat <- bind_rows(dialzdat,nacol) 472 dialzdat <- bind_rows(dialzdat,nacol)
473 473
474 ##NAs in a row 474 ##NAs in a row
475 y <- 1 475 y <- 1
476 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 476 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
477 for(y in 1:dim(dialzdat)[1]){ 477 for(y in 1:dim(dialzdat)[1]){
478 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 478 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
479 y <- y + 1 479 y <- y + 1
480 } 480 }
481 colnames(narowd) <- "ROW_NAs" 481 colnames(narowd) <- "ROW_NAs"
482 dialzdat <- bind_cols(dialzdat,narowd) 482 dialzdat <- bind_cols(dialzdat,narowd)
483 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 483 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
484 colnames(RAWWORD) <- colnames(dialzdat) 484 colnames(RAWWORD) <- colnames(dialzdat)
485 ##converting to character so that the clinical can be brought together with discrete data 485 ##converting to character so that the clinical can be brought together with discrete data
486 k <- 2 486 k <- 2
487 for(k in 2:dim(dialzdat)[2]-1){ 487 for(k in 2:dim(dialzdat)[2]-1){
488 dialzdat[,k] <- as.character(dialzdat[,k]) 488 dialzdat[,k] <- as.character(dialzdat[,k])
489 k <- k + 1 489 k <- k + 1
490 } 490 }
491 #The End the full data 491 #The End the full data
492 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) 492 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
493 493
494 #Produces Discrete file 494 #Produces Discrete file
495 nfnaex2 <- strsplit(alz,"[\\|/]") %>% 495 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
496 .[[1]] %>% 496 .[[1]] %>%
497 .[length(.)] %>% 497 .[length(.)] %>%
498 gsub("\\D","",.) %>% 498 gsub("\\D","",.) %>%
499 c("GSE",.,"dscrt.txt") %>% 499 c("GSE",.,"dscrt.txt") %>%
500 paste(collapse = "") 500 paste(collapse = "")
501 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) 501 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
502 n <- n +1 502 n <- n +1
503 } 503 }
504 } else if(numDAT == 2){ 504 } else if(numDAT == 2){
505 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN 505 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
506 506
507 #All the files you want to analyze 507 #All the files you want to analyze
508 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:") 508 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
509 if(length(ANDIS) == 0){ 509 if(length(ANDIS) == 0){
510 #Spit out a warning 510 #Spit out a warning
511 warning("You did not select any files and so no cleaning will be performed") 511 warning("You did not select any files and so no cleaning will be performed")
512 } else{ 512 } else{
513 #indexing the data files 513 #indexing the data files
514 n <- 1 514 n <- 1
515 for(n in 1: length(ANDIS)){ 515 for(n in 1: length(ANDIS)){
516 alz <- ANDIS[n] 516 alz <- ANDIS[n]
517 517
518 #Working with the wordy part of the document 518 #Working with the wordy part of the document
519 alzword <- alz %>% 519 alzword <- alz %>%
520 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 520 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
521 filter(grepl("!Sample",X1))%>% 521 filter(grepl("!Sample",X1))%>%
522 filter(!grepl("!Sample_contact",X1)) 522 filter(!grepl("!Sample_contact",X1))
523 523
524 #Getting the GPL file 524 #Getting the GPL file
525 genena <- grep("_platform_id",alzword$X1) %>% 525 genena <- grep("_platform_id",alzword$X1) %>%
526 alzword$X2[.] %>% 526 alzword$X2[.] %>%
527 str_trim(.) %>% 527 str_trim(.) %>%
528 paste0("^",.,"\\D") %>% 528 paste0("^",.,"\\D") %>%
529 grep(.,list.files()) %>% 529 grep(.,list.files()) %>%
530 list.files()[.] 530 list.files()[.]
531 531
532 #Find out if it is a soft GPL file or not 532 #Find out if it is a soft GPL file or not
533 soft <- strsplit(genena,"[\\|/]") %>% 533 soft <- strsplit(genena,"[\\|/]") %>%
534 .[[1]] %>% 534 .[[1]] %>%
535 .[length(.)] %>% 535 .[length(.)] %>%
536 grepl("soft",.) 536 grepl("soft",.)
537 537
538 ##Changing row names and column names: 538 ##Changing row names and column names:
539 ALZWORD <- t(alzword) 539 ALZWORD <- t(alzword)
540 rownames(ALZWORD)=NULL 540 rownames(ALZWORD)=NULL
541 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 541 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
542 ALZWORD <- chngrownm(ALZWORD)[-1,] 542 ALZWORD <- chngrownm(ALZWORD)[-1,]
543 ALZWORD <- ALZWORD%>% 543 ALZWORD <- ALZWORD%>%
544 as.data.frame(.,stringsAsFactors = FALSE)%>% 544 as.data.frame(.,stringsAsFactors = FALSE)%>%
545 dplyr::select(-starts_with("col")) 545 dplyr::select(-starts_with("col"))
546 546
547 ##Reorganizing information within the columns and final clinical data 547 ##Reorganizing information within the columns and final clinical data
548 ALZWORDF <- cinfo(ALZWORD) 548 ALZWORDF <- cinfo(ALZWORD)
549 549
550 550
551 #Working with Actual Data part of file 551 #Working with Actual Data part of file
552 alzdat <- alz %>% 552 alzdat <- alz %>%
553 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 553 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
554 ALZDAT <- t(alzdat[,-1]) 554 ALZDAT <- t(alzdat[,-1])
555 rownames(ALZDAT)=NULL 555 rownames(ALZDAT)=NULL
556 556
557 ##Is there a clean version of the GPL file available? 557 ##Is there a clean version of the GPL file available?
558 gplnum <- strsplit(genena,"[\\|/]") %>% 558 gplnum <- strsplit(genena,"[\\|/]") %>%
559 .[[1]] %>% 559 .[[1]] %>%
560 .[length(.)] %>% 560 .[length(.)] %>%
561 gsub("\\D","",.) 561 gsub("\\D","",.)
562 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 562 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
563 if(clfileex >= 1){ 563 if(clfileex >= 1){
564 #use the clean version 564 #use the clean version
565 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 565 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
566 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 566 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
567 567
568 } else if(clfileex == 0){ 568 } else if(clfileex == 0){
569 ##Lets Create a clean version 569 ##Lets Create a clean version
570 570
571 ##Gene ID to Gene Name 571 ##Gene ID to Gene Name
572 if(soft == TRUE){ 572 if(soft == TRUE){
573 #Check to see if there is already a file containing information on soft files 573 #Check to see if there is already a file containing information on soft files
574 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 574 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
575 if(fileex == 1){ 575 if(fileex == 1){
576 #Check to see if this GPL soft file has been used before 576 #Check to see if this GPL soft file has been used before
577 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 577 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
578 .$GPL_FILE_NUM%>% 578 .$GPL_FILE_NUM%>%
579 grepl(gplnum,.) %>% 579 grepl(gplnum,.) %>%
580 sum() 580 sum()
581 if(IDF == 1){ 581 if(IDF == 1){
582 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 582 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
583 .$GPL_FILE_NUM%>% 583 .$GPL_FILE_NUM%>%
584 grep(gplnum,.) 584 grep(gplnum,.)
585 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 585 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
586 .$LOC_ID %>% 586 .$LOC_ID %>%
587 .[IDLOCAL] 587 .[IDLOCAL]
588 geneIDNam <- genena %>% 588 geneIDNam <- genena %>%
589 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 589 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
590 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$",colnames(.))) 590 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.)))
591 } else if(IDF == 0){ 591 } else if(IDF == 0){
592 #No information on this particular GPL file 592 #No information on this particular GPL file
593 idLOCGPL <- genena %>% 593 idLOCGPL <- genena %>%
594 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 594 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
595 t(.) %>% 595 t(.) %>%
596 grep("^ID\\s*$",.) %>% 596 grep("^ID\\s*$",.) %>%
597 -1 597 -1
598 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 598 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
599 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 599 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
600 geneIDNam <- genena %>% 600 geneIDNam <- genena %>%
601 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 601 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
602 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$",colnames(.))) 602 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.)))
603 } 603 }
604 } else if(fileex == 0){ 604 } else if(fileex == 0){
605 #We must create a file that we can access for later use 605 #We must create a file that we can access for later use
606 idLOCGPL <- genena %>% 606 idLOCGPL <- genena %>%
607 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 607 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
608 t(.) %>% 608 t(.) %>%
609 grep("^ID\\s*$",.) %>% 609 grep("^ID\\s*$",.) %>%
610 -1 610 -1
611 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 611 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
612 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 612 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
613 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 613 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
614 geneIDNam <- genena %>% 614 geneIDNam <- genena %>%
615 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 615 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
616 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$",colnames(.))) 616 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.)))
617 } 617 }
618 } else if(soft == FALSE){ 618 } else if(soft == FALSE){
619 geneIDNam <- genena %>% 619 geneIDNam <- genena %>%
620 read_delim(delim="\t",comment = "#")%>% 620 read_delim(delim="\t",comment = "#")%>%
621 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$",colnames(.))) 621 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.)))
622 } 622 }
623 623
624 ##Labeling the gene IDs without names 624 ##Labeling the gene IDs without names
625 geneIDNam <- NAFIXING(geneIDNam) 625 geneIDNam <- NAFIXING(geneIDNam)
626 626
627 ##remove the whitespace 627 ##remove the whitespace
628 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 628 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
629 629
630 ##Here is the clean version 630 ##Here is the clean version
631 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 631 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
632 } 632 }
633 633
634 634
635 635
636 ##Changing the gene ID to gene name 636 ##Changing the gene ID to gene name
637 ALZDAT1 <- cgeneID(geneIDNam,alzdat) 637 ALZDAT1 <- cgeneID(geneIDNam,alzdat)
638 colnames(ALZDAT) = ALZDAT1[1,] 638 colnames(ALZDAT) = ALZDAT1[1,]
639 639
640 640
641 ##Adjusting the column names aka the gene names 641 ##Adjusting the column names aka the gene names
642 colnames(ALZDAT) <- gcnames(ALZDAT) 642 colnames(ALZDAT) <- gcnames(ALZDAT)
643 643
644 644
645 #Full RAW Data 645 #Full RAW Data
646 Fullalzdwr <- ALZDAT %>% 646 Fullalzdwr <- ALZDAT %>%
647 as.data.frame(.,stringsAsFactors = FALSE) %>% 647 as.data.frame(.,stringsAsFactors = FALSE) %>%
648 cbind(ALZWORDF,.) 648 cbind(ALZWORDF,.)
649 649
650 #Raw file is output 650 #Raw file is output
651 nfnaex <- strsplit(alz,"[\\]") %>% 651 nfnaex <- strsplit(alz,"[\\]") %>%
652 .[[1]] %>% 652 .[[1]] %>%
653 .[length(.)] %>% 653 .[length(.)] %>%
654 gsub("\\D","",.) %>% 654 gsub("\\D","",.) %>%
655 c("GSE",.,"aftexcel.txt") %>% 655 c("GSE",.,"aftexcel.txt") %>%
656 paste(collapse = "") 656 paste(collapse = "")
657 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 657 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
658 658
659 659
660 660
661 #Now for the discretization part 661 #Now for the discretization part
662 ##get the wordy part again 662 ##get the wordy part again
663 rawword <- t(ALZWORDF) 663 rawword <- t(ALZWORDF)
664 664
665 ##where is ID_REF located 665 ##where is ID_REF located
666 hereim <- grep("ID_REF",rownames(rawword)) 666 hereim <- grep("ID_REF",rownames(rawword))
667 667
668 ##Subject Names GSM... 668 ##Subject Names GSM...
669 subjnam <- rawword[hereim,] 669 subjnam <- rawword[hereim,]
670 670
671 ##Getting the names for the rows 671 ##Getting the names for the rows
672 namedarows <- rownames(rawword)[-hereim] %>% 672 namedarows <- rownames(rawword)[-hereim] %>%
673 as.data.frame(.,stringsAsFactors = FALSE) 673 as.data.frame(.,stringsAsFactors = FALSE)
674 RAWWORD <- rawword[-hereim,] %>% 674 RAWWORD <- rawword[-hereim,] %>%
675 as.data.frame(.,stringsAsFactors = FALSE) %>% 675 as.data.frame(.,stringsAsFactors = FALSE) %>%
676 bind_cols(namedarows,.) 676 bind_cols(namedarows,.)
677 z <- 1 677 z <- 1
678 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 678 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
679 for(z in 1:dim(RAWWORD)[1]){ 679 for(z in 1:dim(RAWWORD)[1]){
680 if(sum(is.na(RAWWORD[z,])) > 0){ 680 if(sum(is.na(RAWWORD[z,])) > 0){
681 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 681 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
682 } 682 }
683 if(length(grep("NA",RAWWORD[z,])) > 0){ 683 if(length(grep("NA",RAWWORD[z,])) > 0){
684 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] 684 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
685 } 685 }
686 z <- z + 1 686 z <- z + 1
687 } 687 }
688 688
689 colnames(naroww) <- "ROW_NAs" 689 colnames(naroww) <- "ROW_NAs"
690 RAWWORD <- bind_cols(RAWWORD,naroww) 690 RAWWORD <- bind_cols(RAWWORD,naroww)
691 691
692 692
693 roALZna <- t(ALZDAT) %>% 693 roALZna <- t(ALZDAT) %>%
694 rownames(.) %>% 694 rownames(.) %>%
695 as.data.frame(.,stringsAsFactors = FALSE) 695 as.data.frame(.,stringsAsFactors = FALSE)
696 colnames(roALZna) <- "ID_REF" 696 colnames(roALZna) <- "ID_REF"
697 697
698 RAWDAT <- t(ALZDAT) %>% 698 RAWDAT <- t(ALZDAT) %>%
699 as.data.frame(.,stringsAsFactors = FALSE) 699 as.data.frame(.,stringsAsFactors = FALSE)
700 colnames(RAWDAT) <- NULL 700 colnames(RAWDAT) <- NULL
701 rownames(RAWDAT) <- NULL 701 rownames(RAWDAT) <- NULL
702 702
703 RAWDAT2 <- RAWDAT %>% 703 RAWDAT2 <- RAWDAT %>%
704 cbind(roALZna,.) %>% 704 cbind(roALZna,.) %>%
705 dplyr::arrange(.,ID_REF) 705 dplyr::arrange(.,ID_REF)
706 706
707 ##Editing the file for R processing 707 ##Editing the file for R processing
708 RAWDATID <- RAWDAT2[,1] %>% 708 RAWDATID <- RAWDAT2[,1] %>%
709 as.matrix(.) 709 as.matrix(.)
710 710
711 RAWDATNUM <- RAWDAT2[,-1] %>% 711 RAWDATNUM <- RAWDAT2[,-1] %>%
712 mapply(.,FUN = as.numeric) %>% 712 mapply(.,FUN = as.numeric) %>%
713 t(.) 713 t(.)
714 714
715 ##Consolidating genes with the same name 715 ##Consolidating genes with the same name
716 ###create empty matrix of size equal to tabRDATID 716 ###create empty matrix of size equal to tabRDATID
717 tabRDATID <- table(RAWDATID) 717 tabRDATID <- table(RAWDATID)
718 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 718 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
719 j <- 1 719 j <- 1
720 for(j in 1:length(tabRDATID)){ 720 for(j in 1:length(tabRDATID)){
721 ##Putting the ones without duplicates in their new homes 721 ##Putting the ones without duplicates in their new homes
722 if(tabRDATID[j] == 1){ 722 if(tabRDATID[j] == 1){
723 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 723 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
724 } else if(tabRDATID[j] > 1){ 724 } else if(tabRDATID[j] > 1){
725 ##Averaging duplicates and putting them in their new homes 725 ##Averaging duplicates and putting them in their new homes
726 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 726 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
727 } 727 }
728 j <- j + 1 728 j <- j + 1
729 } 729 }
730 730
731 ##Outputting non Z-score Average over genes 731 ##Outputting non Z-score Average over genes
732 newoutput <-NuRDATN 732 newoutput <-NuRDATN
733 colnames(newoutput) <- rownames(tabRDATID) 733 colnames(newoutput) <- rownames(tabRDATID)
734 nfnewout <- strsplit(alz,"[\\]") %>% 734 nfnewout <- strsplit(alz,"[\\]") %>%
735 .[[1]] %>% 735 .[[1]] %>%
736 .[length(.)] %>% 736 .[length(.)] %>%
737 gsub("\\D","",.) %>% 737 gsub("\\D","",.) %>%
738 c("GSE",.,"avg.txt") %>% 738 c("GSE",.,"avg.txt") %>%
739 paste(collapse = "") 739 paste(collapse = "")
740 noutput <- newoutput %>% 740 noutput <- newoutput %>%
741 t()%>% 741 t()%>%
742 as.data.frame(.,stringsAsFactors = FALSE) 742 as.data.frame(.,stringsAsFactors = FALSE)
743 noutput <- cbind(rownames(noutput),noutput) 743 noutput <- cbind(rownames(noutput),noutput)
744 colnames(noutput) <- c("Gene Symbol",subjnam) 744 colnames(noutput) <- c("Gene Symbol",subjnam)
745 rownames(noutput) <- NULL 745 rownames(noutput) <- NULL
746 write.table(noutput, file = nfnewout, sep = "\t",col.names = TRUE,row.names = FALSE) 746 write.table(noutput, file = nfnewout, sep = "\t",col.names = TRUE,row.names = FALSE)
747 747
748 748
749 ##Scaling the Data 749 ##Scaling the Data
750 scrawdat <- NuRDATN%>% 750 scrawdat <- NuRDATN%>%
751 scale() 751 scale()
752 attr(scrawdat,"scaled:center") <- NULL 752 attr(scrawdat,"scaled:center") <- NULL
753 attr(scrawdat,"scaled:scale") <- NULL 753 attr(scrawdat,"scaled:scale") <- NULL
754 colnames(scrawdat) <- rownames(tabRDATID) 754 colnames(scrawdat) <- rownames(tabRDATID)
755 755
756 #Outputting the Z-score file 756 #Outputting the Z-score file
757 nfnzsc <- strsplit(alz,"[\\]") %>% 757 nfnzsc <- strsplit(alz,"[\\]") %>%
758 .[[1]] %>% 758 .[[1]] %>%
759 .[length(.)] %>% 759 .[length(.)] %>%
760 gsub("\\D","",.) %>% 760 gsub("\\D","",.) %>%
761 c("GSE",.,"zscore.txt") %>% 761 c("GSE",.,"zscore.txt") %>%
762 paste(collapse = "") 762 paste(collapse = "")
763 zscraw <- scrawdat %>% 763 zscraw <- scrawdat %>%
764 t()%>% 764 t()%>%
765 as.data.frame(.,stringsAsFactors = FALSE) 765 as.data.frame(.,stringsAsFactors = FALSE)
766 zscraw <- cbind(rownames(zscraw),zscraw) 766 zscraw <- cbind(rownames(zscraw),zscraw)
767 colnames(zscraw) <- c("Gene Symbol",subjnam) 767 colnames(zscraw) <- c("Gene Symbol",subjnam)
768 rownames(zscraw) <- NULL 768 rownames(zscraw) <- NULL
769 write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = FALSE) 769 write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = FALSE)
770 770
771 ##Discretized the Data 771 ##Discretized the Data
772 dialzdat <- scrawdat %>% 772 dialzdat <- scrawdat %>%
773 dndat(.) %>% 773 dndat(.) %>%
774 t()%>% 774 t()%>%
775 as.data.frame(.,stringsAsFactors = FALSE) 775 as.data.frame(.,stringsAsFactors = FALSE)
776 colnames(dialzdat) <- rownames(RAWDATNUM) 776 colnames(dialzdat) <- rownames(RAWDATNUM)
777 777
778 ##setting "ID_REF" as a new variable 778 ##setting "ID_REF" as a new variable
779 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE) 779 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)
780 colnames(geneNAM) <- "ID_REF" 780 colnames(geneNAM) <- "ID_REF"
781 rownames(dialzdat) <- NULL 781 rownames(dialzdat) <- NULL
782 dialzdat <-bind_cols(geneNAM,dialzdat) 782 dialzdat <-bind_cols(geneNAM,dialzdat)
783 783
784 ##NAs in a column 784 ##NAs in a column
785 x <- 2 785 x <- 2
786 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 786 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
787 nacol[1,1] = "COL_NAs" 787 nacol[1,1] = "COL_NAs"
788 for(x in 2:dim(dialzdat)[2]){ 788 for(x in 2:dim(dialzdat)[2]){
789 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 789 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
790 x <- x + 1 790 x <- x + 1
791 } 791 }
792 colnames(nacol) <- colnames(dialzdat) 792 colnames(nacol) <- colnames(dialzdat)
793 dialzdat <- bind_rows(dialzdat,nacol) 793 dialzdat <- bind_rows(dialzdat,nacol)
794 794
795 ##NAs in a row 795 ##NAs in a row
796 y <- 1 796 y <- 1
797 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 797 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
798 for(y in 1:dim(dialzdat)[1]){ 798 for(y in 1:dim(dialzdat)[1]){
799 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 799 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
800 y <- y + 1 800 y <- y + 1
801 } 801 }
802 colnames(narowd) <- "ROW_NAs" 802 colnames(narowd) <- "ROW_NAs"
803 dialzdat <- bind_cols(dialzdat,narowd) 803 dialzdat <- bind_cols(dialzdat,narowd)
804 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 804 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
805 colnames(RAWWORD) <- colnames(dialzdat) 805 colnames(RAWWORD) <- colnames(dialzdat)
806 ##converting to character so that the clinical can be brought together with discrete data 806 ##converting to character so that the clinical can be brought together with discrete data
807 k <- 2 807 k <- 2
808 for(k in 2:dim(dialzdat)[2]-1){ 808 for(k in 2:dim(dialzdat)[2]-1){
809 dialzdat[,k] <- as.character(dialzdat[,k]) 809 dialzdat[,k] <- as.character(dialzdat[,k])
810 k <- k + 1 810 k <- k + 1
811 } 811 }
812 #The End the full data 812 #The End the full data
813 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) 813 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
814 814
815 #Produces Discrete file 815 #Produces Discrete file
816 nfnaex2 <- strsplit(alz,"[\\|/]") %>% 816 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
817 .[[1]] %>% 817 .[[1]] %>%
818 .[length(.)] %>% 818 .[length(.)] %>%
819 gsub("\\D","",.) %>% 819 gsub("\\D","",.) %>%
820 c("GSE",.,"dscrt.txt") %>% 820 c("GSE",.,"dscrt.txt") %>%
821 paste(collapse = "") 821 paste(collapse = "")
822 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) 822 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
823 823
824 824
825 n <- n + 1 825 n <- n + 1
826 } 826 }
827 } 827 }
828 } 828 }
829 } 829 }
830 #Script entry point: the THEFT() call below is what gets run every time you want to change a data set 830 #Script entry point: the THEFT() call below is what gets run every time you want to change a data set
831 THEFT() 831 THEFT()