Commit c2f90a541fbda7cd5b56082c0cfc65cdc84acd66

Authored by Efrain Gonzalez
1 parent 2677909e39
Exists in master

Updated Now outputting z-score file as well

Showing 1 changed file with 26 additions and 0 deletions   Show diff stats
1 1
2 #Efrain H. Gonzalez 2 #Efrain H. Gonzalez
3 #6/22/2017 3 #6/22/2017
4 options(digits = 11) 4 options(digits = 11)
5 #Libraries required to run the code 5 #Libraries required to run the code
6 library(pryr) 6 library(pryr)
7 library(MASS) 7 library(MASS)
8 library(dplyr) 8 library(dplyr)
9 library(tidyr) 9 library(tidyr)
10 library(readr) 10 library(readr)
11 library(stringr) 11 library(stringr)
12 12
13 13
14 #Necessary Functions 14 #Necessary Functions
#1# Function for handling the changing of row names and column names
#
# Renames the columns of `mat` based on its first two rows:
#   * row 1 is compared (exact match) against three GEO sample labels and the
#     column is renamed to "Brain_Region", "Title" or "ID_REF";
#   * otherwise row 2 is searched for keywords and the column is renamed to
#     "Sex<n>", "PMI<n>", "Age<n>", "Braak<n>" or "Group<n>", where <n> is a
#     running counter per category.
# When several keyword patterns match the same cell, every matching counter is
# advanced and the last matching category (in the order listed below) names
# the column.
#
# Args:
#   mat: a matrix (or data frame) with labels in row 1 and values in row 2.
# Returns:
#   `mat` with updated column names; cell contents are not modified.
chngrownm <- function(mat) {
  n_col <- dim(mat)[2]

  # `colnames(mat)[e] <- value` fails on an object without column names, so
  # fall back to R's default "col<n>" names when none are present.
  if (is.null(colnames(mat)) && n_col > 0) {
    colnames(mat) <- colnames(mat, do.NULL = FALSE)
  }

  header_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )

  # Order matters: a later match overrides the name given by an earlier one.
  patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi|interval",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  counters <- rep(1L, length(patterns))
  names(counters) <- names(patterns)

  # The keyword search reads row 2, so it is skipped when that row is absent.
  has_values <- dim(mat)[1] >= 2

  for (e in seq_len(n_col)) {
    label <- as.character(mat[1, e])
    # An NA label can never equal a known header; treat it as "no match"
    # instead of letting `if (NA)` abort the whole run.
    if (!is.na(label) && label %in% names(header_names)) {
      colnames(mat)[e] <- header_names[[label]]
    } else if (has_values) {
      for (kind in names(patterns)) {
        if (grepl(patterns[[kind]], mat[2, e])) {
          colnames(mat)[e] <- paste0(kind, counters[[kind]])
          counters[[kind]] <- counters[[kind]] + 1L
        }
      }
    }
  }
  mat
}
59 59
#2# Function for reorganizing information within the columns
#
# Cleans the contents of every column except the first, choosing the rule from
# the column name (first matching rule wins):
#   * "Group": strips everything up to ": " and any " ...;..." tail;
#   * "Age"  : removes all non-digit characters and converts to integer;
#   * "Sex"  : strips everything up to ": ";
#   * "PMI"  : keeps digits and "." only and converts to numeric;
#   * "Braak": strips everything up to ": ", reads the rest as a Roman numeral
#              and converts it to integer.
# Columns whose names match none of these are left untouched.
#
# Args:
#   mat: a data frame (or matrix) with named columns.
# Returns:
#   `mat` with the matching columns cleaned.
cinfo <- function(mat) {
  n_col <- dim(mat)[2]
  # seq_len(n)[-1] is empty for a one-column table, whereas 2:1 would count
  # down and rewrite the first column that this function is meant to skip.
  for (j in seq_len(n_col)[-1]) {
    col_name <- colnames(mat)[j]
    if (grepl("Group", col_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    } else if (grepl("Age", col_name)) {
      # NOTE(review): "\\D" also drops a decimal point, so a value such as
      # "80.5" would become 805 - confirm ages are always whole numbers.
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    } else if (grepl("Sex", col_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    } else if (grepl("PMI", col_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    } else if (grepl("Braak", col_name)) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
84 84
#3# Function for labeling the gene IDs without names
#
# Fills in missing entries of the second column with the value from the first
# column of the same row. An entry counts as missing when it is NA or when it
# is the text "NA" optionally followed by whitespace.
#
# Args:
#   GIDNAM: a two-column table (data frame or matrix); column 1 holds the IDs
#           and column 2 the names.
# Returns:
#   `GIDNAM` with the missing names replaced; the number of rows is unchanged
#   (a zero-row table is returned as is).
NAFIXING <- function(GIDNAM) {
  is_df <- is.data.frame(GIDNAM)
  ids <- if (is_df) GIDNAM[[1]] else GIDNAM[, 1]
  symbols <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]

  # Whole-column test instead of a per-row loop: avoids `1:0` on empty input
  # and the cost of cell-by-cell indexing on large tables.
  unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)

  if (any(unnamed)) {
    if (is_df) {
      GIDNAM[[2]][unnamed] <- ids[unnamed]
    } else {
      GIDNAM[unnamed, 2] <- ids[unnamed]
    }
  }
  GIDNAM
}
97 97
#4# Function for changing the gene ID to gene name
#
# Transposes both inputs and replaces each entry in the first row of the
# transposed data with the corresponding name from the gene table.
#
# Args:
#   GeneName: two-column table; column 1 holds the IDs, column 2 the names.
#   DATA:     table whose first column holds the IDs to translate.
# Returns:
#   t(DATA) with its first row translated. IDs that do not occur in GeneName
#   are left unchanged; when an ID occurs more than once in GeneName the first
#   occurrence is used.
cgeneID <- function(GeneName, DATA) {
  nj <- t(GeneName)
  nq <- t(DATA)

  # Exact lookup instead of building a regular expression from every ID:
  # characters such as "." in an ID must not act as wildcards, and the names
  # must not be interpreted as gsub() replacement templates.
  # `incomparables = NA` keeps a missing ID from pairing with a missing key.
  hit <- match(nq[1, ], nj[1, ], incomparables = NA)
  found <- !is.na(hit)
  nq[1, found] <- nj[2, hit[found]]
  nq
}
117 #cgeneID <- function(GeneName,DATA){ 117 #cgeneID <- function(GeneName,DATA){
118 # colGene <- dim(GeneName)[2] 118 # colGene <- dim(GeneName)[2]
119 # j <- 1 119 # j <- 1
120 # for(j in 1:colGene){ 120 # for(j in 1:colGene){
121 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) 121 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
122 # if(is.na(sum(chngsreq))==FALSE){ 122 # if(is.na(sum(chngsreq))==FALSE){
123 # if(sum(chngsreq) > 0){ 123 # if(sum(chngsreq) > 0){
124 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) 124 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
125 # } 125 # }
126 # } 126 # }
127 # j = j+1 127 # j = j+1
128 # } 128 # }
129 # DATA 129 # DATA
130 #} 130 #}
131 131
#5# Function for adjusting the gene names
#
# Splits every column name of `DiData` on "///" or "//" and keeps one of the
# resulting pieces, trimmed of surrounding whitespace.
#
# Args:
#   DiData: an object with column names (matrix or data frame).
#   usecol: index of the piece to keep; when a name has fewer pieces than
#           `usecol`, its first piece is used instead.
# Returns:
#   A character vector with one adjusted name per column (empty when `DiData`
#   has no columns).
gcnames <- function(DiData, usecol = 1) {
  nuruns <- dim(DiData)[2]
  if (nuruns == 0) {
    return(character(0))
  }
  raw_names <- colnames(DiData)
  if (is.null(raw_names)) {
    stop("DiData must have column names", call. = FALSE)
  }

  # Split each name once, then pick the requested piece per name.
  pieces <- strsplit(raw_names, "///|//")
  vapply(
    pieces,
    function(parts) {
      chosen <- if (length(parts) >= usecol) parts[usecol] else parts[1]
      trimws(chosen)
    },
    character(1),
    USE.NAMES = FALSE
  )
}
148 148
#6# Function for discretizing the data
#
# Maps every value of `NDATA` to one of three levels:
#   value < -1        -> 0
#   -1 <= value <= 1  -> 1
#   value > 1         -> 2
# Missing values (NA / NaN) are copied through unchanged.
#
# Args:
#   NDATA: a numeric matrix (or all-numeric data frame).
# Returns:
#   A numeric matrix with the same dimensions and column names as `NDATA`
#   (row names are not carried over).
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)
  rownd <- dim(values)[1]
  colnd <- dim(values)[2]

  # Start from the middle level; every non-missing value that is neither
  # below -1 nor above 1 lies in [-1, 1] by definition.
  DDATA <- matrix(1, nrow = rownd, ncol = colnd)
  # which() drops NA comparisons, so missing cells are not touched here.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2

  missing <- is.na(values)
  DDATA[missing] <- values[missing]

  colnames(DDATA) <- colnames(NDATA)
  DDATA
}
177 177
178 178
179 #MajorFunction#This is the function that does everything else 179 #MajorFunction#This is the function that does everything else
180 THEFT <- function(){ 180 THEFT <- function(){
181 #Set working directory based on the directory of the series matrix file Currently only works for windows 181 #Set working directory based on the directory of the series matrix file Currently only works for windows
182 wd <- getwd() 182 wd <- getwd()
183 #list.files() 183 #list.files()
184 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?") 184 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?")
185 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L) 185 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
186 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files()) 186 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())
187 GSEfloc <- list.files()[GSEfileloc] 187 GSEfloc <- list.files()[GSEfileloc]
188 #ALL DATA FILES WILL BE CLEANED 188 #ALL DATA FILES WILL BE CLEANED
189 if(numDAT == 1){ 189 if(numDAT == 1){
190 #indexing the data files 190 #indexing the data files
191 n <- 1 191 n <- 1
192 for(n in 1: length(GSEfloc)){ 192 for(n in 1: length(GSEfloc)){
193 alz <- GSEfloc[n] 193 alz <- GSEfloc[n]
194 194
195 #Working with the wordy part of the document 195 #Working with the wordy part of the document
196 alzword <- alz %>% 196 alzword <- alz %>%
197 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 197 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
198 filter(grepl("!Sample",X1))%>% 198 filter(grepl("!Sample",X1))%>%
199 filter(!grepl("!Sample_contact",X1)) 199 filter(!grepl("!Sample_contact",X1))
200 200
201 #Getting the GPL file 201 #Getting the GPL file
202 genena <- grep("_platform_id",alzword$X1) %>% 202 genena <- grep("_platform_id",alzword$X1) %>%
203 alzword$X2[.] %>% 203 alzword$X2[.] %>%
204 str_trim(.) %>% 204 str_trim(.) %>%
205 paste0("^",.,"\\D") %>% 205 paste0("^",.,"\\D") %>%
206 grep(.,list.files()) %>% 206 grep(.,list.files()) %>%
207 list.files()[.] 207 list.files()[.]
208 208
209 #Find out if it is a soft GPL file or not 209 #Find out if it is a soft GPL file or not
210 soft <- strsplit(genena,"[\\|/]") %>% 210 soft <- strsplit(genena,"[\\|/]") %>%
211 .[[1]] %>% 211 .[[1]] %>%
212 .[length(.)] %>% 212 .[length(.)] %>%
213 grepl("soft",.) 213 grepl("soft",.)
214 214
215 ##Changing row names and column names: 215 ##Changing row names and column names:
216 ALZWORD <- t(alzword) 216 ALZWORD <- t(alzword)
217 rownames(ALZWORD)=NULL 217 rownames(ALZWORD)=NULL
218 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 218 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
219 ALZWORD <- chngrownm(ALZWORD)[-1,] 219 ALZWORD <- chngrownm(ALZWORD)[-1,]
220 ALZWORD <- ALZWORD%>% 220 ALZWORD <- ALZWORD%>%
221 as.data.frame(.,stringsAsFactors = FALSE)%>% 221 as.data.frame(.,stringsAsFactors = FALSE)%>%
222 dplyr::select(-starts_with("col")) 222 dplyr::select(-starts_with("col"))
223 223
224 ##Reorganizing information within the columns and final clinical data 224 ##Reorganizing information within the columns and final clinical data
225 ALZWORDF <- cinfo(ALZWORD) 225 ALZWORDF <- cinfo(ALZWORD)
226 226
227 227
228 #Working with Actual Data part of file 228 #Working with Actual Data part of file
229 alzdat <- alz %>% 229 alzdat <- alz %>%
230 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 230 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
231 ALZDAT <- t(alzdat[,-1]) 231 ALZDAT <- t(alzdat[,-1])
232 rownames(ALZDAT)=NULL 232 rownames(ALZDAT)=NULL
233 233
234 ##Is there a clean version of the GPL file available? 234 ##Is there a clean version of the GPL file available?
235 gplnum <- strsplit(genena,"[\\|/]") %>% 235 gplnum <- strsplit(genena,"[\\|/]") %>%
236 .[[1]] %>% 236 .[[1]] %>%
237 .[length(.)] %>% 237 .[length(.)] %>%
238 gsub("\\D","",.) 238 gsub("\\D","",.)
239 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 239 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
240 if(clfileex >= 1){ 240 if(clfileex >= 1){
241 #use the clean version 241 #use the clean version
242 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 242 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
243 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 243 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
244 244
245 } else if(clfileex == 0){ 245 } else if(clfileex == 0){
246 ##Lets Create a clean version 246 ##Lets Create a clean version
247 247
248 ##Gene ID to Gene Name 248 ##Gene ID to Gene Name
249 if(soft == TRUE){ 249 if(soft == TRUE){
250 #Check to see if there is already a file containing information on soft files 250 #Check to see if there is already a file containing information on soft files
251 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 251 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
252 if(fileex == 1){ 252 if(fileex == 1){
253 #Check to see if this GPL soft file has been used before 253 #Check to see if this GPL soft file has been used before
254 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 254 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
255 .$GPL_FILE_NUM%>% 255 .$GPL_FILE_NUM%>%
256 grepl(gplnum,.) %>% 256 grepl(gplnum,.) %>%
257 sum() 257 sum()
258 if(IDF == 1){ 258 if(IDF == 1){
259 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 259 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
260 .$GPL_FILE_NUM%>% 260 .$GPL_FILE_NUM%>%
261 grep(gplnum,.) 261 grep(gplnum,.)
262 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 262 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
263 .$LOC_ID %>% 263 .$LOC_ID %>%
264 .[IDLOCAL] 264 .[IDLOCAL]
265 geneIDNam <- genena %>% 265 geneIDNam <- genena %>%
266 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 266 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
267 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.))) 267 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))
268 } else if(IDF == 0){ 268 } else if(IDF == 0){
269 #No information on this particular GPL file 269 #No information on this particular GPL file
270 idLOCGPL <- genena %>% 270 idLOCGPL <- genena %>%
271 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 271 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
272 t(.) %>% 272 t(.) %>%
273 grep("^ID\\s*$",.) %>% 273 grep("^ID\\s*$",.) %>%
274 -1 274 -1
275 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 275 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
276 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 276 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
277 geneIDNam <- genena %>% 277 geneIDNam <- genena %>%
278 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 278 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
279 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.))) 279 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))
280 } 280 }
281 } else if(fileex == 0){ 281 } else if(fileex == 0){
282 #We must create a file that we can access for later use 282 #We must create a file that we can access for later use
283 idLOCGPL <- genena %>% 283 idLOCGPL <- genena %>%
284 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 284 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
285 t(.) %>% 285 t(.) %>%
286 grep("^ID\\s*$",.) %>% 286 grep("^ID\\s*$",.) %>%
287 -1 287 -1
288 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 288 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
289 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 289 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
290 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 290 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
291 geneIDNam <- genena %>% 291 geneIDNam <- genena %>%
292 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 292 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
293 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.))) 293 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))
294 } 294 }
295 } else if(soft == FALSE){ 295 } else if(soft == FALSE){
296 geneIDNam <- genena %>% 296 geneIDNam <- genena %>%
297 read_delim(delim="\t",comment = "#")%>% 297 read_delim(delim="\t",comment = "#")%>%
298 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.))) 298 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))
299 } 299 }
300 300
301 ##Labeling the gene IDs without names 301 ##Labeling the gene IDs without names
302 geneIDNam <- NAFIXING(geneIDNam) 302 geneIDNam <- NAFIXING(geneIDNam)
303 303
304 ##remove the whitespace 304 ##remove the whitespace
305 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 305 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
306 306
307 ##Here is the clean version 307 ##Here is the clean version
308 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 308 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
309 } 309 }
310 310
311 311
312 312
313 ##Changing the gene ID to gene name 313 ##Changing the gene ID to gene name
314 ALZDAT1 <- cgeneID(geneIDNam,alzdat) 314 ALZDAT1 <- cgeneID(geneIDNam,alzdat)
315 colnames(ALZDAT) = ALZDAT1[1,] 315 colnames(ALZDAT) = ALZDAT1[1,]
316 316
317 317
318 ##Adjusting the column names aka the gene names 318 ##Adjusting the column names aka the gene names
319 colnames(ALZDAT) <- gcnames(ALZDAT) 319 colnames(ALZDAT) <- gcnames(ALZDAT)
320 320
321 321
322 #Full RAW Data 322 #Full RAW Data
323 Fullalzdwr <- ALZDAT %>% 323 Fullalzdwr <- ALZDAT %>%
324 as.data.frame(.,stringsAsFactors = FALSE) %>% 324 as.data.frame(.,stringsAsFactors = FALSE) %>%
325 cbind(ALZWORDF,.) 325 cbind(ALZWORDF,.)
326 326
327 #Raw file is output 327 #Raw file is output
328 nfnaex <- strsplit(alz,"[\\]") %>% 328 nfnaex <- strsplit(alz,"[\\]") %>%
329 .[[1]] %>% 329 .[[1]] %>%
330 .[length(.)] %>% 330 .[length(.)] %>%
331 gsub("\\D","",.) %>% 331 gsub("\\D","",.) %>%
332 c("GSE",.,"aftexcel.txt") %>% 332 c("GSE",.,"aftexcel.txt") %>%
333 paste(collapse = "") 333 paste(collapse = "")
334 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 334 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
335 335
336 336
337 337
338 #Now for the discretization part 338 #Now for the discretization part
339 ##get the wordy part again 339 ##get the wordy part again
340 rawword <- t(ALZWORDF) 340 rawword <- t(ALZWORDF)
341 341
342 ##where is ID_REF located 342 ##where is ID_REF located
343 hereim <- grep("ID_REF",rownames(rawword)) 343 hereim <- grep("ID_REF",rownames(rawword))
344 344
345 ##Subject Names GSM... 345 ##Subject Names GSM...
346 subjnam <- rawword[hereim,] 346 subjnam <- rawword[hereim,]
347 347
348 ##Getting the names for the rows 348 ##Getting the names for the rows
349 namedarows <- rownames(rawword)[-hereim] %>% 349 namedarows <- rownames(rawword)[-hereim] %>%
350 as.data.frame(.,stringsAsFactors = FALSE) 350 as.data.frame(.,stringsAsFactors = FALSE)
351 RAWWORD <- rawword[-hereim,] %>% 351 RAWWORD <- rawword[-hereim,] %>%
352 as.data.frame(.,stringsAsFactors = FALSE) %>% 352 as.data.frame(.,stringsAsFactors = FALSE) %>%
353 bind_cols(namedarows,.) 353 bind_cols(namedarows,.)
354 z <- 1 354 z <- 1
355 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 355 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
356 for(z in 1:dim(RAWWORD)[1]){ 356 for(z in 1:dim(RAWWORD)[1]){
357 if(sum(is.na(RAWWORD[z,])) > 0){ 357 if(sum(is.na(RAWWORD[z,])) > 0){
358 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 358 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
359 } 359 }
360 if(length(grep("NA",RAWWORD[z,])) > 0){ 360 if(length(grep("NA",RAWWORD[z,])) > 0){
361 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] 361 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
362 } 362 }
363 z <- z + 1 363 z <- z + 1
364 } 364 }
365 365
366 colnames(naroww) <- "ROW_NAs" 366 colnames(naroww) <- "ROW_NAs"
367 RAWWORD <- bind_cols(RAWWORD,naroww) 367 RAWWORD <- bind_cols(RAWWORD,naroww)
368 368
369 369
370 roALZna <- t(ALZDAT) %>% 370 roALZna <- t(ALZDAT) %>%
371 rownames(.) %>% 371 rownames(.) %>%
372 as.data.frame(.,stringsAsFactors = FALSE) 372 as.data.frame(.,stringsAsFactors = FALSE)
373 colnames(roALZna) <- "ID_REF" 373 colnames(roALZna) <- "ID_REF"
374 374
375 RAWDAT <- t(ALZDAT) %>% 375 RAWDAT <- t(ALZDAT) %>%
376 as.data.frame(.,stringsAsFactors = FALSE) 376 as.data.frame(.,stringsAsFactors = FALSE)
377 colnames(RAWDAT) <- NULL 377 colnames(RAWDAT) <- NULL
378 rownames(RAWDAT) <- NULL 378 rownames(RAWDAT) <- NULL
379 379
380 RAWDAT2 <- RAWDAT %>% 380 RAWDAT2 <- RAWDAT %>%
381 cbind(roALZna,.) %>% 381 cbind(roALZna,.) %>%
382 dplyr::arrange(.,ID_REF) 382 dplyr::arrange(.,ID_REF)
383 383
384 ##Editing the file for R processing 384 ##Editing the file for R processing
385 RAWDATID <- RAWDAT2[,1] %>% 385 RAWDATID <- RAWDAT2[,1] %>%
386 as.matrix(.) 386 as.matrix(.)
387 387
388 RAWDATNUM <- RAWDAT2[,-1] %>% 388 RAWDATNUM <- RAWDAT2[,-1] %>%
389 mapply(.,FUN = as.numeric) %>% 389 mapply(.,FUN = as.numeric) %>%
390 t(.) 390 t(.)
391 391
392 ##Consolidating genes with the same name 392 ##Consolidating genes with the same name
393 ###create empty matrix of size equal to tabRDATID 393 ###create empty matrix of size equal to tabRDATID
394 tabRDATID <- table(RAWDATID) 394 tabRDATID <- table(RAWDATID)
395 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 395 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
396 j <- 1 396 j <- 1
397 for(j in 1:length(tabRDATID)){ 397 for(j in 1:length(tabRDATID)){
398 ##Putting the ones without duplicates in their new homes 398 ##Putting the ones without duplicates in their new homes
399 if(tabRDATID[j] == 1){ 399 if(tabRDATID[j] == 1){
400 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 400 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
401 } else if(tabRDATID[j] > 1){ 401 } else if(tabRDATID[j] > 1){
402 ##Averaging duplicates and putting them in their new homes 402 ##Averaging duplicates and putting them in their new homes
403 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 403 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
404 } 404 }
405 j <- j + 1 405 j <- j + 1
406 } 406 }
407 407
408 ##Scaling the Data 408 ##Scaling the Data
409 scrawdat <- NuRDATN%>% 409 scrawdat <- NuRDATN%>%
410 scale() 410 scale()
411 attr(scrawdat,"scaled:center") <- NULL 411 attr(scrawdat,"scaled:center") <- NULL
412 attr(scrawdat,"scaled:scale") <- NULL 412 attr(scrawdat,"scaled:scale") <- NULL
413 colnames(scrawdat) <- rownames(tabRDATID) 413 colnames(scrawdat) <- rownames(tabRDATID)
414 414
415 #Outputting the Z-score file
416 nfnzsc <- strsplit(alz,"[\\]") %>%
417 .[[1]] %>%
418 .[length(.)] %>%
419 gsub("\\D","",.) %>%
420 c("GSE",.,"zscore.txt") %>%
421 paste(collapse = "")
422 zscraw <- scrawdat %>%
423 t()%>%
424 as.data.frame(.,stringsAsFactors = FALSE)
425 colnames(zscraw) <- subjnam
426 write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = TRUE)
427
415 ##Discretized the Data 428 ##Discretized the Data
416 dialzdat <- scrawdat %>% 429 dialzdat <- scrawdat %>%
417 dndat(.) %>% 430 dndat(.) %>%
418 t()%>% 431 t()%>%
419 as.data.frame(.,stringsAsFactors = FALSE) 432 as.data.frame(.,stringsAsFactors = FALSE)
420 colnames(dialzdat) <- rownames(RAWDATNUM) 433 colnames(dialzdat) <- rownames(RAWDATNUM)
421 434
422 ##setting "ID_REF" as a new variable 435 ##setting "ID_REF" as a new variable
423 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE) 436 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)
424 colnames(geneNAM) <- "ID_REF" 437 colnames(geneNAM) <- "ID_REF"
425 rownames(dialzdat) <- NULL 438 rownames(dialzdat) <- NULL
426 dialzdat <-bind_cols(geneNAM,dialzdat) 439 dialzdat <-bind_cols(geneNAM,dialzdat)
427 440
428 ##NAs in a column 441 ##NAs in a column
429 x <- 2 442 x <- 2
430 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 443 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
431 nacol[1,1] = "COL_NAs" 444 nacol[1,1] = "COL_NAs"
432 for(x in 2:dim(dialzdat)[2]){ 445 for(x in 2:dim(dialzdat)[2]){
433 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 446 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
434 x <- x + 1 447 x <- x + 1
435 } 448 }
436 colnames(nacol) <- colnames(dialzdat) 449 colnames(nacol) <- colnames(dialzdat)
437 dialzdat <- bind_rows(dialzdat,nacol) 450 dialzdat <- bind_rows(dialzdat,nacol)
438 451
439 ##NAs in a row 452 ##NAs in a row
440 y <- 1 453 y <- 1
441 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 454 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
442 for(y in 1:dim(dialzdat)[1]){ 455 for(y in 1:dim(dialzdat)[1]){
443 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 456 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
444 y <- y + 1 457 y <- y + 1
445 } 458 }
446 colnames(narowd) <- "ROW_NAs" 459 colnames(narowd) <- "ROW_NAs"
447 dialzdat <- bind_cols(dialzdat,narowd) 460 dialzdat <- bind_cols(dialzdat,narowd)
448 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 461 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
449 colnames(RAWWORD) <- colnames(dialzdat) 462 colnames(RAWWORD) <- colnames(dialzdat)
450 ##converting to character so that the clinical can be brought together with discrete data 463 ##converting to character so that the clinical can be brought together with discrete data
451 k <- 2 464 k <- 2
452 for(k in 2:dim(dialzdat)[2]-1){ 465 for(k in 2:dim(dialzdat)[2]-1){
453 dialzdat[,k] <- as.character(dialzdat[,k]) 466 dialzdat[,k] <- as.character(dialzdat[,k])
454 k <- k + 1 467 k <- k + 1
455 } 468 }
456 #The End the full data 469 #The End the full data
457 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) 470 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
458 471
459 #Produces Discrete file 472 #Produces Discrete file
460 nfnaex2 <- strsplit(alz,"[\\|/]") %>% 473 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
461 .[[1]] %>% 474 .[[1]] %>%
462 .[length(.)] %>% 475 .[length(.)] %>%
463 gsub("\\D","",.) %>% 476 gsub("\\D","",.) %>%
464 c("GSE",.,"dscrt.txt") %>% 477 c("GSE",.,"dscrt.txt") %>%
465 paste(collapse = "") 478 paste(collapse = "")
466 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) 479 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
467 n <- n +1 480 n <- n +1
468 } 481 }
469 } else if(numDAT == 2){ 482 } else if(numDAT == 2){
470 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN 483 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
471 484
472 #All the files you want to analyze 485 #All the files you want to analyze
473 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:") 486 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
474 if(length(ANDIS) == 0){ 487 if(length(ANDIS) == 0){
475 #Spit out a warning 488 #Spit out a warning
476 warning("You did not select any files and so no cleaning will be performed") 489 warning("You did not select any files and so no cleaning will be performed")
477 } else{ 490 } else{
478 #indexing the data files 491 #indexing the data files
479 n <- 1 492 n <- 1
480 for(n in 1: length(ANDIS)){ 493 for(n in 1: length(ANDIS)){
481 alz <- ANDIS[n] 494 alz <- ANDIS[n]
482 495
483 #Working with the wordy part of the document 496 #Working with the wordy part of the document
484 alzword <- alz %>% 497 alzword <- alz %>%
485 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 498 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
486 filter(grepl("!Sample",X1))%>% 499 filter(grepl("!Sample",X1))%>%
487 filter(!grepl("!Sample_contact",X1)) 500 filter(!grepl("!Sample_contact",X1))
488 501
489 #Getting the GPL file 502 #Getting the GPL file
490 genena <- grep("_platform_id",alzword$X1) %>% 503 genena <- grep("_platform_id",alzword$X1) %>%
491 alzword$X2[.] %>% 504 alzword$X2[.] %>%
492 str_trim(.) %>% 505 str_trim(.) %>%
493 paste0("^",.,"\\D") %>% 506 paste0("^",.,"\\D") %>%
494 grep(.,list.files()) %>% 507 grep(.,list.files()) %>%
495 list.files()[.] 508 list.files()[.]
496 509
497 #Find out if it is a soft GPL file or not 510 #Find out if it is a soft GPL file or not
498 soft <- strsplit(genena,"[\\|/]") %>% 511 soft <- strsplit(genena,"[\\|/]") %>%
499 .[[1]] %>% 512 .[[1]] %>%
500 .[length(.)] %>% 513 .[length(.)] %>%
501 grepl("soft",.) 514 grepl("soft",.)
502 515
503 ##Changing row names and column names: 516 ##Changing row names and column names:
504 ALZWORD <- t(alzword) 517 ALZWORD <- t(alzword)
505 rownames(ALZWORD)=NULL 518 rownames(ALZWORD)=NULL
506 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 519 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
507 ALZWORD <- chngrownm(ALZWORD)[-1,] 520 ALZWORD <- chngrownm(ALZWORD)[-1,]
508 ALZWORD <- ALZWORD%>% 521 ALZWORD <- ALZWORD%>%
509 as.data.frame(.,stringsAsFactors = FALSE)%>% 522 as.data.frame(.,stringsAsFactors = FALSE)%>%
510 dplyr::select(-starts_with("col")) 523 dplyr::select(-starts_with("col"))
511 524
512 ##Reorganizing information within the columns and final clinical data 525 ##Reorganizing information within the columns and final clinical data
513 ALZWORDF <- cinfo(ALZWORD) 526 ALZWORDF <- cinfo(ALZWORD)
514 527
515 528
516 #Working with Actual Data part of file 529 #Working with Actual Data part of file
517 alzdat <- alz %>% 530 alzdat <- alz %>%
518 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 531 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
519 ALZDAT <- t(alzdat[,-1]) 532 ALZDAT <- t(alzdat[,-1])
520 rownames(ALZDAT)=NULL 533 rownames(ALZDAT)=NULL
521 534
522 ##Is there a clean version of the GPL file available? 535 ##Is there a clean version of the GPL file available?
523 gplnum <- strsplit(genena,"[\\|/]") %>% 536 gplnum <- strsplit(genena,"[\\|/]") %>%
524 .[[1]] %>% 537 .[[1]] %>%
525 .[length(.)] %>% 538 .[length(.)] %>%
526 gsub("\\D","",.) 539 gsub("\\D","",.)
527 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 540 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
528 if(clfileex >= 1){ 541 if(clfileex >= 1){
529 #use the clean version 542 #use the clean version
530 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 543 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
531 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 544 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
532 545
533 } else if(clfileex == 0){ 546 } else if(clfileex == 0){
534 ##Lets Create a clean version 547 ##Lets Create a clean version
535 548
536 ##Gene ID to Gene Name 549 ##Gene ID to Gene Name
537 if(soft == TRUE){ 550 if(soft == TRUE){
538 #Check to see if there is already a file containing information on soft files 551 #Check to see if there is already a file containing information on soft files
539 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 552 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
540 if(fileex == 1){ 553 if(fileex == 1){
541 #Check to see if this GPL soft file has been used before 554 #Check to see if this GPL soft file has been used before
542 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 555 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
543 .$GPL_FILE_NUM%>% 556 .$GPL_FILE_NUM%>%
544 grepl(gplnum,.) %>% 557 grepl(gplnum,.) %>%
545 sum() 558 sum()
546 if(IDF == 1){ 559 if(IDF == 1){
547 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 560 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
548 .$GPL_FILE_NUM%>% 561 .$GPL_FILE_NUM%>%
549 grep(gplnum,.) 562 grep(gplnum,.)
550 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 563 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
551 .$LOC_ID %>% 564 .$LOC_ID %>%
552 .[IDLOCAL] 565 .[IDLOCAL]
553 geneIDNam <- genena %>% 566 geneIDNam <- genena %>%
554 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 567 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
555 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.))) 568 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))
556 } else if(IDF == 0){ 569 } else if(IDF == 0){
557 #No information on this particular GPL file 570 #No information on this particular GPL file
558 idLOCGPL <- genena %>% 571 idLOCGPL <- genena %>%
559 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 572 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
560 t(.) %>% 573 t(.) %>%
561 grep("^ID\\s*$",.) %>% 574 grep("^ID\\s*$",.) %>%
562 -1 575 -1
563 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 576 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
564 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 577 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
565 geneIDNam <- genena %>% 578 geneIDNam <- genena %>%
566 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 579 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
567 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.))) 580 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))
568 } 581 }
569 } else if(fileex == 0){ 582 } else if(fileex == 0){
570 #We must create a file that we can access for later use 583 #We must create a file that we can access for later use
571 idLOCGPL <- genena %>% 584 idLOCGPL <- genena %>%
572 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 585 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
573 t(.) %>% 586 t(.) %>%
574 grep("^ID\\s*$",.) %>% 587 grep("^ID\\s*$",.) %>%
575 -1 588 -1
576 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 589 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
577 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 590 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
578 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 591 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
579 geneIDNam <- genena %>% 592 geneIDNam <- genena %>%
580 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 593 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
581 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.))) 594 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))
582 } 595 }
583 } else if(soft == FALSE){ 596 } else if(soft == FALSE){
584 geneIDNam <- genena %>% 597 geneIDNam <- genena %>%
585 read_delim(delim="\t",comment = "#")%>% 598 read_delim(delim="\t",comment = "#")%>%
586 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.))) 599 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))
587 } 600 }
588 601
589 ##Labeling the gene IDs without names 602 ##Labeling the gene IDs without names
590 geneIDNam <- NAFIXING(geneIDNam) 603 geneIDNam <- NAFIXING(geneIDNam)
591 604
592 ##remove the whitespace 605 ##remove the whitespace
593 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 606 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
594 607
595 ##Here is the clean version 608 ##Here is the clean version
596 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 609 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
597 } 610 }
598 611
599 612
600 613
601 ##Changing the gene ID to gene name 614 ##Changing the gene ID to gene name
602 ALZDAT1 <- cgeneID(geneIDNam,alzdat) 615 ALZDAT1 <- cgeneID(geneIDNam,alzdat)
603 colnames(ALZDAT) = ALZDAT1[1,] 616 colnames(ALZDAT) = ALZDAT1[1,]
604 617
605 618
606 ##Adjusting the column names aka the gene names 619 ##Adjusting the column names aka the gene names
607 colnames(ALZDAT) <- gcnames(ALZDAT) 620 colnames(ALZDAT) <- gcnames(ALZDAT)
608 621
609 622
610 #Full RAW Data 623 #Full RAW Data
611 Fullalzdwr <- ALZDAT %>% 624 Fullalzdwr <- ALZDAT %>%
612 as.data.frame(.,stringsAsFactors = FALSE) %>% 625 as.data.frame(.,stringsAsFactors = FALSE) %>%
613 cbind(ALZWORDF,.) 626 cbind(ALZWORDF,.)
614 627
615 #Raw file is output 628 #Raw file is output
616 nfnaex <- strsplit(alz,"[\\]") %>% 629 nfnaex <- strsplit(alz,"[\\]") %>%
617 .[[1]] %>% 630 .[[1]] %>%
618 .[length(.)] %>% 631 .[length(.)] %>%
619 gsub("\\D","",.) %>% 632 gsub("\\D","",.) %>%
620 c("GSE",.,"aftexcel.txt") %>% 633 c("GSE",.,"aftexcel.txt") %>%
621 paste(collapse = "") 634 paste(collapse = "")
622 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 635 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
623 636
624 637
625 638
626 #Now for the discretization part 639 #Now for the discretization part
627 ##get the wordy part again 640 ##get the wordy part again
628 rawword <- t(ALZWORDF) 641 rawword <- t(ALZWORDF)
629 642
630 ##where is ID_REF located 643 ##where is ID_REF located
631 hereim <- grep("ID_REF",rownames(rawword)) 644 hereim <- grep("ID_REF",rownames(rawword))
632 645
633 ##Subject Names GSM... 646 ##Subject Names GSM...
634 subjnam <- rawword[hereim,] 647 subjnam <- rawword[hereim,]
635 648
636 ##Getting the names for the rows 649 ##Getting the names for the rows
637 namedarows <- rownames(rawword)[-hereim] %>% 650 namedarows <- rownames(rawword)[-hereim] %>%
638 as.data.frame(.,stringsAsFactors = FALSE) 651 as.data.frame(.,stringsAsFactors = FALSE)
639 RAWWORD <- rawword[-hereim,] %>% 652 RAWWORD <- rawword[-hereim,] %>%
640 as.data.frame(.,stringsAsFactors = FALSE) %>% 653 as.data.frame(.,stringsAsFactors = FALSE) %>%
641 bind_cols(namedarows,.) 654 bind_cols(namedarows,.)
642 z <- 1 655 z <- 1
643 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 656 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
644 for(z in 1:dim(RAWWORD)[1]){ 657 for(z in 1:dim(RAWWORD)[1]){
645 if(sum(is.na(RAWWORD[z,])) > 0){ 658 if(sum(is.na(RAWWORD[z,])) > 0){
646 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 659 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
647 } 660 }
648 if(length(grep("NA",RAWWORD[z,])) > 0){ 661 if(length(grep("NA",RAWWORD[z,])) > 0){
649 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] 662 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
650 } 663 }
651 z <- z + 1 664 z <- z + 1
652 } 665 }
653 666
654 colnames(naroww) <- "ROW_NAs" 667 colnames(naroww) <- "ROW_NAs"
655 RAWWORD <- bind_cols(RAWWORD,naroww) 668 RAWWORD <- bind_cols(RAWWORD,naroww)
656 669
657 670
658 roALZna <- t(ALZDAT) %>% 671 roALZna <- t(ALZDAT) %>%
659 rownames(.) %>% 672 rownames(.) %>%
660 as.data.frame(.,stringsAsFactors = FALSE) 673 as.data.frame(.,stringsAsFactors = FALSE)
661 colnames(roALZna) <- "ID_REF" 674 colnames(roALZna) <- "ID_REF"
662 675
663 RAWDAT <- t(ALZDAT) %>% 676 RAWDAT <- t(ALZDAT) %>%
664 as.data.frame(.,stringsAsFactors = FALSE) 677 as.data.frame(.,stringsAsFactors = FALSE)
665 colnames(RAWDAT) <- NULL 678 colnames(RAWDAT) <- NULL
666 rownames(RAWDAT) <- NULL 679 rownames(RAWDAT) <- NULL
667 680
668 RAWDAT2 <- RAWDAT %>% 681 RAWDAT2 <- RAWDAT %>%
669 cbind(roALZna,.) %>% 682 cbind(roALZna,.) %>%
670 dplyr::arrange(.,ID_REF) 683 dplyr::arrange(.,ID_REF)
671 684
672 ##Editing the file for R processing 685 ##Editing the file for R processing
673 RAWDATID <- RAWDAT2[,1] %>% 686 RAWDATID <- RAWDAT2[,1] %>%
674 as.matrix(.) 687 as.matrix(.)
675 688
676 RAWDATNUM <- RAWDAT2[,-1] %>% 689 RAWDATNUM <- RAWDAT2[,-1] %>%
677 mapply(.,FUN = as.numeric) %>% 690 mapply(.,FUN = as.numeric) %>%
678 t(.) 691 t(.)
679 692
680 ##Consolidating genes with the same name 693 ##Consolidating genes with the same name
681 ###create empty matrix of size equal to tabRDATID 694 ###create empty matrix of size equal to tabRDATID
682 tabRDATID <- table(RAWDATID) 695 tabRDATID <- table(RAWDATID)
683 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 696 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
684 j <- 1 697 j <- 1
685 for(j in 1:length(tabRDATID)){ 698 for(j in 1:length(tabRDATID)){
686 ##Putting the ones without duplicates in their new homes 699 ##Putting the ones without duplicates in their new homes
687 if(tabRDATID[j] == 1){ 700 if(tabRDATID[j] == 1){
688 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 701 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
689 } else if(tabRDATID[j] > 1){ 702 } else if(tabRDATID[j] > 1){
690 ##Averaging duplicates and putting them in their new homes 703 ##Averaging duplicates and putting them in their new homes
691 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 704 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
692 } 705 }
693 j <- j + 1 706 j <- j + 1
694 } 707 }
695 708
696 ##Scaling the Data 709 ##Scaling the Data
697 scrawdat <- NuRDATN%>% 710 scrawdat <- NuRDATN%>%
698 scale() 711 scale()
699 attr(scrawdat,"scaled:center") <- NULL 712 attr(scrawdat,"scaled:center") <- NULL
700 attr(scrawdat,"scaled:scale") <- NULL 713 attr(scrawdat,"scaled:scale") <- NULL
701 colnames(scrawdat) <- rownames(tabRDATID) 714 colnames(scrawdat) <- rownames(tabRDATID)
702 715
716 #Outputting the Z-score file
717 nfnzsc <- strsplit(alz,"[\\]") %>%
718 .[[1]] %>%
719 .[length(.)] %>%
720 gsub("\\D","",.) %>%
721 c("GSE",.,"zscore.txt") %>%
722 paste(collapse = "")
723 zscraw <- scrawdat %>%
724 t()%>%
725 as.data.frame(.,stringsAsFactors = FALSE)
726 colnames(zscraw) <- subjnam
727 write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = TRUE)
728
703 ##Discretized the Data 729 ##Discretized the Data
704 dialzdat <- scrawdat %>% 730 dialzdat <- scrawdat %>%
705 dndat(.) %>% 731 dndat(.) %>%
706 t()%>% 732 t()%>%
707 as.data.frame(.,stringsAsFactors = FALSE) 733 as.data.frame(.,stringsAsFactors = FALSE)
708 colnames(dialzdat) <- rownames(RAWDATNUM) 734 colnames(dialzdat) <- rownames(RAWDATNUM)
709 735
710 ##setting "ID_REF" as a new variable 736 ##setting "ID_REF" as a new variable
711 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE) 737 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)
712 colnames(geneNAM) <- "ID_REF" 738 colnames(geneNAM) <- "ID_REF"
713 rownames(dialzdat) <- NULL 739 rownames(dialzdat) <- NULL
714 dialzdat <-bind_cols(geneNAM,dialzdat) 740 dialzdat <-bind_cols(geneNAM,dialzdat)
715 741
716 ##NAs in a column 742 ##NAs in a column
717 x <- 2 743 x <- 2
718 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 744 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
719 nacol[1,1] = "COL_NAs" 745 nacol[1,1] = "COL_NAs"
720 for(x in 2:dim(dialzdat)[2]){ 746 for(x in 2:dim(dialzdat)[2]){
721 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 747 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
722 x <- x + 1 748 x <- x + 1
723 } 749 }
724 colnames(nacol) <- colnames(dialzdat) 750 colnames(nacol) <- colnames(dialzdat)
725 dialzdat <- bind_rows(dialzdat,nacol) 751 dialzdat <- bind_rows(dialzdat,nacol)
726 752
727 ##NAs in a row 753 ##NAs in a row
728 y <- 1 754 y <- 1
729 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 755 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
730 for(y in 1:dim(dialzdat)[1]){ 756 for(y in 1:dim(dialzdat)[1]){
731 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 757 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
732 y <- y + 1 758 y <- y + 1
733 } 759 }
734 colnames(narowd) <- "ROW_NAs" 760 colnames(narowd) <- "ROW_NAs"
735 dialzdat <- bind_cols(dialzdat,narowd) 761 dialzdat <- bind_cols(dialzdat,narowd)
736 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 762 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
737 colnames(RAWWORD) <- colnames(dialzdat) 763 colnames(RAWWORD) <- colnames(dialzdat)
738 ##converting to character so that the clinical can be brought together with discrete data 764 ##converting to character so that the clinical can be brought together with discrete data
739 k <- 2 765 k <- 2
740 for(k in 2:dim(dialzdat)[2]-1){ 766 for(k in 2:dim(dialzdat)[2]-1){
741 dialzdat[,k] <- as.character(dialzdat[,k]) 767 dialzdat[,k] <- as.character(dialzdat[,k])
742 k <- k + 1 768 k <- k + 1
743 } 769 }
744 #The End the full data 770 #The End the full data
745 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) 771 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
746 772
747 #Produces Discrete file 773 #Produces Discrete file
748 nfnaex2 <- strsplit(alz,"[\\|/]") %>% 774 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
749 .[[1]] %>% 775 .[[1]] %>%
750 .[length(.)] %>% 776 .[length(.)] %>%
751 gsub("\\D","",.) %>% 777 gsub("\\D","",.) %>%
752 c("GSE",.,"dscrt.txt") %>% 778 c("GSE",.,"dscrt.txt") %>%
753 paste(collapse = "") 779 paste(collapse = "")
754 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) 780 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
755 781
756 782
757 n <- n + 1 783 n <- n + 1
758 } 784 }
759 } 785 }
760 } 786 }
761 } 787 }
762 #The Rest of this code will be used every time you want to change a data set 788 #The Rest of this code will be used every time you want to change a data set
763 THEFT() 789 THEFT()