Commit 1ac191ee2b1395744077403f9ffb6b06e51dcf38

Authored by Efrain Gonzalez
1 parent 24ce5ef2bd
Exists in master

Update

Showing 1 changed file with 7 additions and 8 deletions   Show diff stats
1
2 #Efrain H. Gonzalez 1 #Efrain H. Gonzalez
3 #6/22/2017 2 #6/25/2018
4 options(digits = 11) 3 options(digits = 11)
5 #Libraries required to run the code 4 #Libraries required to run the code
6 library(pryr) 5 library(pryr)
7 library(MASS) 6 library(MASS)
8 library(dplyr) 7 library(dplyr)
9 library(tidyr) 8 library(tidyr)
10 library(readr) 9 library(readr)
11 library(stringr) 10 library(stringr)
12 11
13 12
14 #Necessary Functions 13 #Necessary Functions
15 #1#Function for handling the changing of row names and column names 14 #1#Function for handling the changing of row names and column names
# Rename the columns of a transposed GEO sample-annotation matrix.
#
# mat: matrix whose first row holds the GEO field tag of each column
#      (e.g. "!Sample_title") and whose second row holds the first sample's
#      value for that field (e.g. "age: 85"). It must already have column
#      names, because they are replaced position by position.
#
# Returns `mat` with renamed columns:
#   * the three known tags become "Brain_Region", "Title" and "ID_REF";
#   * any other column is named after the kind of value found in row 2
#     ("Sex", "PMI", "Age", "Braak", "Group") followed by a running number;
#   * columns matching nothing keep their current name.
#
# The value rules are applied independently and in order, not as an
# else-if chain: when a value matches several patterns the LAST matching rule
# decides the name, but every matching rule still advances its own counter
# (so "braak stage: IV" is named "Braak1" and also consumes "Age1").
# This is kept as-is because the generated names end up in the output files.
chngrownm <- function(mat) {
  tag_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )
  value_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi|interval",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  counters <- rep(1L, length(value_patterns))
  names(counters) <- names(value_patterns)

  # seq_len() yields an empty sequence for a zero-column matrix, whereas
  # 1:0 would visit the non-existent columns 1 and 0.
  for (e in seq_len(ncol(mat))) {
    tag <- as.character(mat[1, e])

    # %in% is never NA, so a missing tag simply falls through to the
    # value-based rules instead of aborting the `if`.
    if (tag %in% names(tag_names)) {
      colnames(mat)[e] <- tag_names[[tag]]
      next
    }

    value <- mat[2, e]
    for (label in names(value_patterns)) {
      if (grepl(value_patterns[[label]], value)) {
        colnames(mat)[e] <- paste0(label, counters[[label]])
        counters[[label]] <- counters[[label]] + 1L
      }
    }
  }
  mat
}
59 58
60 #2#Function for reorganizing information within the columns 59 #2#Function for reorganizing information within the columns
# Clean the clinical columns produced by chngrownm(), column by column.
#
# mat: data frame (or matrix) of sample annotations whose column names
#      contain "Group", "Age", "Sex", "PMI" or "Braak". The first column is
#      never modified.
#
# Returns `mat` where, for every column after the first (first matching rule
# wins):
#   Group -> text after the "key: " prefix, with a trailing " ...;..." removed
#   Age   -> all non-digit characters removed, converted to integer
#   Sex   -> text after the "key: " prefix
#   PMI   -> everything except digits and "." removed, converted to numeric
#   Braak -> text after the "key: " prefix, read as a roman (or arabic)
#            numeral and converted to integer
# Other columns are left untouched.
cinfo <- function(mat) {
  strip_key <- function(x) gsub(".+:\\s", "", x)

  # Dropping the first element of seq_len() leaves an empty sequence for
  # inputs with fewer than two columns; 2:col would count down to column 1.
  for (j in seq_len(ncol(mat))[-1]) {
    column_name <- colnames(mat)[j]

    if (grepl("Group", column_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    } else if (grepl("Age", column_name)) {
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    } else if (grepl("Sex", column_name)) {
      mat[, j] <- strip_key(mat[, j])
    } else if (grepl("PMI", column_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    } else if (grepl("Braak", column_name)) {
      stage <- strip_key(mat[, j])
      # Roman numerals have no zero, so as.roman() maps "0" to NA. Stage 0 is
      # a legitimate Braak stage, so it is handled separately.
      is_zero <- !is.na(stage) & grepl("^0+$", stage)
      parsed <- rep(NA_integer_, length(stage))
      parsed[is_zero] <- 0L
      parsed[!is_zero] <- as.integer(utils::as.roman(stage[!is_zero]))
      mat[, j] <- parsed
    }
  }
  mat
}
84 83
85 #3#Function for labeling the gene IDs without names 84 #3#Function for labeling the gene IDs without names
# Give every unnamed probe its own ID as a name.
#
# GIDNAM: two-column table (data frame, tibble or matrix) with the probe ID in
#         column 1 and the gene symbol in column 2.
#
# Returns `GIDNAM` where every symbol that is missing (NA) or is the literal
# text "NA" (optionally followed by whitespace) has been replaced by the ID
# from the same row. An empty table is returned unchanged.
NAFIXING <- function(GIDNAM) {
  is_df <- is.data.frame(GIDNAM)
  ids <- if (is_df) GIDNAM[[1]] else GIDNAM[, 1]
  symbols <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]

  unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)

  # Touch the table only when there is something to fix, so untouched
  # (including zero-row) inputs come back exactly as they were passed in.
  if (any(unnamed)) {
    symbols[unnamed] <- ids[unnamed]
    if (is_df) {
      GIDNAM[[2]] <- symbols
    } else {
      GIDNAM[, 2] <- symbols
    }
  }
  GIDNAM
}
97 96
98 #4#Function for changing the gene ID to gene name 97 #4#Function for changing the gene ID to gene name
# Replace probe IDs in the expression table with gene names.
#
# GeneName: two-column table, probe ID in column 1 and gene name in column 2.
# DATA:     expression table whose first column holds the probe ID of each
#           row.
#
# Returns t(DATA) (a matrix with one column per row of DATA) whose first row
# holds the gene name for every probe ID found in `GeneName`; IDs that are not
# found are left as they are. If an ID occurs several times in `GeneName` the
# first occurrence is used.
cgeneID <- function(GeneName, DATA) {
  nj <- t(GeneName)
  nq <- t(DATA)

  data_ids <- as.character(nq[1, ])
  known_ids <- as.character(nj[1, ])

  # Exact lookup. Building a regular expression from the ID would let
  # characters such as "." or "(" change which IDs match (or raise an error),
  # and would scan the whole ID list once per data row.
  position <- match(data_ids, known_ids)

  # A missing data ID must stay missing rather than pair up with a missing
  # entry in the ID list.
  found <- !is.na(position) & !is.na(data_ids)
  if (any(found)) {
    nq[1, found] <- as.character(nj[2, position[found]])
  }
  nq
}
117 #cgeneID <- function(GeneName,DATA){ 116 #cgeneID <- function(GeneName,DATA){
118 # colGene <- dim(GeneName)[2] 117 # colGene <- dim(GeneName)[2]
119 # j <- 1 118 # j <- 1
120 # for(j in 1:colGene){ 119 # for(j in 1:colGene){
121 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) 120 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
122 # if(is.na(sum(chngsreq))==FALSE){ 121 # if(is.na(sum(chngsreq))==FALSE){
123 # if(sum(chngsreq) > 0){ 122 # if(sum(chngsreq) > 0){
124 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) 123 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
125 # } 124 # }
126 # } 125 # }
127 # j = j+1 126 # j = j+1
128 # } 127 # }
129 # DATA 128 # DATA
130 #} 129 #}
131 130
132 #5#Function for adjusting the gene names 131 #5#Function for adjusting the gene names
# Reduce compound gene labels in the column names to a single entry.
#
# DiData: table whose column names may hold several entries separated by
#         "///" or "//" (e.g. "APP /// APLP1").
# usecol: position of the entry to keep (default 1). Names with fewer than
#         `usecol` entries fall back to their first entry.
#
# Returns a character vector with one trimmed name per column of `DiData`
# (an empty vector when `DiData` has no columns).
gcnames <- function(DiData, usecol = 1) {
  if (dim(DiData)[2] == 0) {
    return(character(0))
  }

  # Split every name once, then pick the requested piece from each.
  pieces <- strsplit(colnames(DiData), "///|//")
  chosen <- vapply(
    pieces,
    function(entry) {
      if (length(entry) >= usecol) entry[usecol] else entry[1]
    },
    character(1)
  )
  str_trim(chosen)
}
148 147
149 #6# Function for discretizing the data 148 #6# Function for discretizing the data
# Discretise a numeric table into three levels.
#
# NDATA: numeric matrix (or all-numeric data frame).
#
# Returns a numeric matrix of the same dimensions and column names where
#   values below -1        become 0,
#   values from -1 to 1    become 1,
#   values above 1         become 2,
# and missing values (NA / NaN) are passed through unchanged. Row names are
# not carried over.
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)

  # Start from the middle level and overwrite the two tails; which() drops
  # missing comparisons, so NA cells are only touched by the final step.
  DDATA <- matrix(1, nrow = dim(NDATA)[1], ncol = dim(NDATA)[2])
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2

  missing_cells <- is.na(values)
  DDATA[missing_cells] <- values[missing_cells]

  colnames(DDATA) <- colnames(NDATA)
  DDATA
}
177 176
178 177
179 #MajorFunction#This is the function that does everything else 178 #MajorFunction#This is the function that does everything else
180 THEFT <- function(){ 179 THEFT <- function(){
181 #Set working directory based on the directory of the series matrix file Currently only works for windows 180 #Set working directory based on the directory of the series matrix file Currently only works for windows
182 wd <- getwd() 181 wd <- getwd()
183 #list.files() 182 #list.files()
184 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?") 183 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?")
185 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L) 184 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
186 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files()) 185 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())
187 GSEfloc <- list.files()[GSEfileloc] 186 GSEfloc <- list.files()[GSEfileloc]
188 #ALL DATA FILES WILL BE CLEANED 187 #ALL DATA FILES WILL BE CLEANED
189 if(numDAT == 1){ 188 if(numDAT == 1){
190 #indexing the data files 189 #indexing the data files
191 n <- 1 190 n <- 1
192 for(n in 1: length(GSEfloc)){ 191 for(n in 1: length(GSEfloc)){
193 alz <- GSEfloc[n] 192 alz <- GSEfloc[n]
194 193
195 #Working with the wordy part of the document 194 #Working with the wordy part of the document
196 alzword <- alz %>% 195 alzword <- alz %>%
197 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 196 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
198 filter(grepl("!Sample",X1))%>% 197 filter(grepl("!Sample",X1))%>%
199 filter(!grepl("!Sample_contact",X1)) 198 filter(!grepl("!Sample_contact",X1))
200 199
201 #Getting the GPL file 200 #Getting the GPL file
202 genena <- grep("_platform_id",alzword$X1) %>% 201 genena <- grep("_platform_id",alzword$X1) %>%
203 alzword$X2[.] %>% 202 alzword$X2[.] %>%
204 str_trim(.) %>% 203 str_trim(.) %>%
205 paste0("^",.,"\\D") %>% 204 paste0("^",.,"\\D") %>%
206 grep(.,list.files()) %>% 205 grep(.,list.files()) %>%
207 list.files()[.] 206 list.files()[.]
208 207
209 #Find out if it is a soft GPL file or not 208 #Find out if it is a soft GPL file or not
210 soft <- strsplit(genena,"[\\|/]") %>% 209 soft <- strsplit(genena,"[\\|/]") %>%
211 .[[1]] %>% 210 .[[1]] %>%
212 .[length(.)] %>% 211 .[length(.)] %>%
213 grepl("soft",.) 212 grepl("soft",.)
214 213
215 ##Changing row names and column names: 214 ##Changing row names and column names:
216 ALZWORD <- t(alzword) 215 ALZWORD <- t(alzword)
217 rownames(ALZWORD)=NULL 216 rownames(ALZWORD)=NULL
218 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 217 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
219 ALZWORD <- chngrownm(ALZWORD)[-1,] 218 ALZWORD <- chngrownm(ALZWORD)[-1,]
220 ALZWORD <- ALZWORD%>% 219 ALZWORD <- ALZWORD%>%
221 as.data.frame(.,stringsAsFactors = FALSE)%>% 220 as.data.frame(.,stringsAsFactors = FALSE)%>%
222 dplyr::select(-starts_with("col")) 221 dplyr::select(-starts_with("col"))
223 222
224 ##Reorganizing information within the columns and final clinical data 223 ##Reorganizing information within the columns and final clinical data
225 ALZWORDF <- cinfo(ALZWORD) 224 ALZWORDF <- cinfo(ALZWORD)
226 225
227 226
228 #Working with Actual Data part of file 227 #Working with Actual Data part of file
229 alzdat <- alz %>% 228 alzdat <- alz %>%
230 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 229 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
231 ALZDAT <- t(alzdat[,-1]) 230 ALZDAT <- t(alzdat[,-1])
232 rownames(ALZDAT)=NULL 231 rownames(ALZDAT)=NULL
233 232
234 ##Is there a clean version of the GPL file available? 233 ##Is there a clean version of the GPL file available?
235 gplnum <- strsplit(genena,"[\\|/]") %>% 234 gplnum <- strsplit(genena,"[\\|/]") %>%
236 .[[1]] %>% 235 .[[1]] %>%
237 .[length(.)] %>% 236 .[length(.)] %>%
238 gsub("\\D","",.) 237 gsub("\\D","",.)
239 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 238 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
240 if(clfileex >= 1){ 239 if(clfileex >= 1){
241 #use the clean version 240 #use the clean version
242 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 241 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
243 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 242 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
244 243
245 } else if(clfileex == 0){ 244 } else if(clfileex == 0){
246 ##Lets Create a clean version 245 ##Lets Create a clean version
247 246
248 ##Gene ID to Gene Name 247 ##Gene ID to Gene Name
249 if(soft == TRUE){ 248 if(soft == TRUE){
250 #Check to see if there is already a file containing information on soft files 249 #Check to see if there is already a file containing information on soft files
251 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 250 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
252 if(fileex == 1){ 251 if(fileex == 1){
253 #Check to see if this GPL soft file has been used before 252 #Check to see if this GPL soft file has been used before
254 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 253 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
255 .$GPL_FILE_NUM%>% 254 .$GPL_FILE_NUM%>%
256 grepl(gplnum,.) %>% 255 grepl(gplnum,.) %>%
257 sum() 256 sum()
258 if(IDF == 1){ 257 if(IDF == 1){
259 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 258 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
260 .$GPL_FILE_NUM%>% 259 .$GPL_FILE_NUM%>%
261 grep(gplnum,.) 260 grep(gplnum,.)
262 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 261 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
263 .$LOC_ID %>% 262 .$LOC_ID %>%
264 .[IDLOCAL] 263 .[IDLOCAL]
265 geneIDNam <- genena %>% 264 geneIDNam <- genena %>%
266 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 265 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
267 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.))) 266 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.)))
268 } else if(IDF == 0){ 267 } else if(IDF == 0){
269 #No information on this particular GPL file 268 #No information on this particular GPL file
270 idLOCGPL <- genena %>% 269 idLOCGPL <- genena %>%
271 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 270 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
272 t(.) %>% 271 t(.) %>%
273 grep("^ID\\s*$",.) %>% 272 grep("^ID\\s*$",.) %>%
274 -1 273 -1
275 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 274 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
276 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 275 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
277 geneIDNam <- genena %>% 276 geneIDNam <- genena %>%
278 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 277 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
279 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.))) 278 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.)))
280 } 279 }
281 } else if(fileex == 0){ 280 } else if(fileex == 0){
282 #We must create a file that we can access for later use 281 #We must create a file that we can access for later use
283 idLOCGPL <- genena %>% 282 idLOCGPL <- genena %>%
284 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 283 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
285 t(.) %>% 284 t(.) %>%
286 grep("^ID\\s*$",.) %>% 285 grep("^ID\\s*$",.) %>%
287 -1 286 -1
288 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 287 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
289 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 288 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
290 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 289 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
291 geneIDNam <- genena %>% 290 geneIDNam <- genena %>%
292 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 291 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
293 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.))) 292 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.)))
294 } 293 }
295 } else if(soft == FALSE){ 294 } else if(soft == FALSE){
296 geneIDNam <- genena %>% 295 geneIDNam <- genena %>%
297 read_delim(delim="\t",comment = "#")%>% 296 read_delim(delim="\t",comment = "#")%>%
298 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.))) 297 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.)))
299 } 298 }
300 299
301 ##Labeling the gene IDs without names 300 ##Labeling the gene IDs without names
302 geneIDNam <- NAFIXING(geneIDNam) 301 geneIDNam <- NAFIXING(geneIDNam)
303 302
304 ##remove the whitespace 303 ##remove the whitespace
305 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 304 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
306 305
307 ##Here is the clean version 306 ##Here is the clean version
308 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 307 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
309 } 308 }
310 309
311 310
312 311
313 ##Changing the gene ID to gene name 312 ##Changing the gene ID to gene name
314 ALZDAT1 <- cgeneID(geneIDNam,alzdat) 313 ALZDAT1 <- cgeneID(geneIDNam,alzdat)
315 colnames(ALZDAT) = ALZDAT1[1,] 314 colnames(ALZDAT) = ALZDAT1[1,]
316 315
317 316
318 ##Adjusting the column names aka the gene names 317 ##Adjusting the column names aka the gene names
319 colnames(ALZDAT) <- gcnames(ALZDAT) 318 colnames(ALZDAT) <- gcnames(ALZDAT)
320 319
321 320
322 #Full RAW Data 321 #Full RAW Data
323 Fullalzdwr <- ALZDAT %>% 322 Fullalzdwr <- ALZDAT %>%
324 as.data.frame(.,stringsAsFactors = FALSE) %>% 323 as.data.frame(.,stringsAsFactors = FALSE) %>%
325 cbind(ALZWORDF,.) 324 cbind(ALZWORDF,.)
326 325
327 #Raw file is output 326 #Raw file is output
328 nfnaex <- strsplit(alz,"[\\]") %>% 327 nfnaex <- strsplit(alz,"[\\|/]") %>%
329 .[[1]] %>% 328 .[[1]] %>%
330 .[length(.)] %>% 329 .[length(.)] %>%
331 gsub("\\D","",.) %>% 330 gsub("\\D","",.) %>%
332 c("GSE",.,"aftexcel.txt") %>% 331 c("GSE",.,"aftexcel.txt") %>%
333 paste(collapse = "") 332 paste(collapse = "")
334 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 333 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
335 334
336 335
337 336
338 #Now for the discretization part 337 #Now for the discretization part
339 ##get the wordy part again 338 ##get the wordy part again
340 rawword <- t(ALZWORDF) 339 rawword <- t(ALZWORDF)
341 340
342 ##where is ID_REF located 341 ##where is ID_REF located
343 hereim <- grep("ID_REF",rownames(rawword)) 342 hereim <- grep("ID_REF",rownames(rawword))
344 343
345 ##Subject Names GSM... 344 ##Subject Names GSM...
346 subjnam <- rawword[hereim,] 345 subjnam <- rawword[hereim,]
347 346
348 ##Getting the names for the rows 347 ##Getting the names for the rows
349 namedarows <- rownames(rawword)[-hereim] %>% 348 namedarows <- rownames(rawword)[-hereim] %>%
350 as.data.frame(.,stringsAsFactors = FALSE) 349 as.data.frame(.,stringsAsFactors = FALSE)
351 RAWWORD <- rawword[-hereim,] %>% 350 RAWWORD <- rawword[-hereim,] %>%
352 as.data.frame(.,stringsAsFactors = FALSE) %>% 351 as.data.frame(.,stringsAsFactors = FALSE) %>%
353 bind_cols(namedarows,.) 352 bind_cols(namedarows,.)
354 z <- 1 353 z <- 1
355 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 354 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
356 for(z in 1:dim(RAWWORD)[1]){ 355 for(z in 1:dim(RAWWORD)[1]){
357 if(sum(is.na(RAWWORD[z,])) > 0){ 356 if(sum(is.na(RAWWORD[z,])) > 0){
358 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 357 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
359 } 358 }
360 if(length(grep("NA",RAWWORD[z,])) > 0){ 359 if(length(grep("NA",RAWWORD[z,])) > 0){
361 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] 360 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
362 } 361 }
363 z <- z + 1 362 z <- z + 1
364 } 363 }
365 364
366 colnames(naroww) <- "ROW_NAs" 365 colnames(naroww) <- "ROW_NAs"
367 RAWWORD <- bind_cols(RAWWORD,naroww) 366 RAWWORD <- bind_cols(RAWWORD,naroww)
368 367
369 368
370 roALZna <- t(ALZDAT) %>% 369 roALZna <- t(ALZDAT) %>%
371 rownames(.) %>% 370 rownames(.) %>%
372 as.data.frame(.,stringsAsFactors = FALSE) 371 as.data.frame(.,stringsAsFactors = FALSE)
373 colnames(roALZna) <- "ID_REF" 372 colnames(roALZna) <- "ID_REF"
374 373
375 RAWDAT <- t(ALZDAT) %>% 374 RAWDAT <- t(ALZDAT) %>%
376 as.data.frame(.,stringsAsFactors = FALSE) 375 as.data.frame(.,stringsAsFactors = FALSE)
377 colnames(RAWDAT) <- NULL 376 colnames(RAWDAT) <- NULL
378 rownames(RAWDAT) <- NULL 377 rownames(RAWDAT) <- NULL
379 378
380 RAWDAT2 <- RAWDAT %>% 379 RAWDAT2 <- RAWDAT %>%
381 cbind(roALZna,.) %>% 380 cbind(roALZna,.) %>%
382 dplyr::arrange(.,ID_REF) 381 dplyr::arrange(.,ID_REF)
383 382
384 ##Editing the file for R processing 383 ##Editing the file for R processing
385 RAWDATID <- RAWDAT2[,1] %>% 384 RAWDATID <- RAWDAT2[,1] %>%
386 as.matrix(.) 385 as.matrix(.)
387 386
388 RAWDATNUM <- RAWDAT2[,-1] %>% 387 RAWDATNUM <- RAWDAT2[,-1] %>%
389 mapply(.,FUN = as.numeric) %>% 388 mapply(.,FUN = as.numeric) %>%
390 t(.) 389 t(.)
391 390
392 ##Consolidating genes with the same name 391 ##Consolidating genes with the same name
393 ###create empty matrix of size equal to tabRDATID 392 ###create empty matrix of size equal to tabRDATID
394 tabRDATID <- table(RAWDATID) 393 tabRDATID <- table(RAWDATID)
395 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 394 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
396 j <- 1 395 j <- 1
397 for(j in 1:length(tabRDATID)){ 396 for(j in 1:length(tabRDATID)){
398 ##Putting the ones without duplicates in their new homes 397 ##Putting the ones without duplicates in their new homes
399 if(tabRDATID[j] == 1){ 398 if(tabRDATID[j] == 1){
400 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 399 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
401 } else if(tabRDATID[j] > 1){ 400 } else if(tabRDATID[j] > 1){
402 ##Averaging duplicates and putting them in their new homes 401 ##Averaging duplicates and putting them in their new homes
403 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 402 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
404 } 403 }
405 j <- j + 1 404 j <- j + 1
406 } 405 }
407 406
408 407
409 ##Outputting non Z-score Average over genes 408 ##Outputting non Z-score Average over genes
410 newoutput <-NuRDATN 409 newoutput <-NuRDATN
411 colnames(newoutput) <- rownames(tabRDATID) 410 colnames(newoutput) <- rownames(tabRDATID)
412 nfnewout <- strsplit(alz,"[\\]") %>% 411 nfnewout <- strsplit(alz,"[\\|/]") %>%
413 .[[1]] %>% 412 .[[1]] %>%
414 .[length(.)] %>% 413 .[length(.)] %>%
415 gsub("\\D","",.) %>% 414 gsub("\\D","",.) %>%
416 c("GSE",.,"avg.txt") %>% 415 c("GSE",.,"avg.txt") %>%
417 paste(collapse = "") 416 paste(collapse = "")
418 noutput <- newoutput %>% 417 noutput <- newoutput %>%
419 t()%>% 418 t()%>%
420 as.data.frame(.,stringsAsFactors = FALSE) 419 as.data.frame(.,stringsAsFactors = FALSE)
421 noutput <- cbind(rownames(noutput),noutput) 420 noutput <- cbind(rownames(noutput),noutput)
422 colnames(noutput) <- c("Gene Symbol",subjnam) 421 colnames(noutput) <- c("Gene Symbol",subjnam)
423 rownames(noutput) <- NULL 422 rownames(noutput) <- NULL
424 write.table(noutput, file = nfnewout, sep = "\t",col.names = TRUE,row.names = FALSE) 423 write.table(noutput, file = nfnewout, sep = "\t",col.names = TRUE,row.names = FALSE)
425 424
426 425
427 ##Scaling the Data 426 ##Scaling the Data
428 scrawdat <- NuRDATN%>% 427 scrawdat <- NuRDATN%>%
429 scale() 428 scale()
430 attr(scrawdat,"scaled:center") <- NULL 429 attr(scrawdat,"scaled:center") <- NULL
431 attr(scrawdat,"scaled:scale") <- NULL 430 attr(scrawdat,"scaled:scale") <- NULL
432 colnames(scrawdat) <- rownames(tabRDATID) 431 colnames(scrawdat) <- rownames(tabRDATID)
433 432
434 #Outputting the Z-score file 433 #Outputting the Z-score file
435 nfnzsc <- strsplit(alz,"[\\]") %>% 434 nfnzsc <- strsplit(alz,"[\\|/]") %>%
436 .[[1]] %>% 435 .[[1]] %>%
437 .[length(.)] %>% 436 .[length(.)] %>%
438 gsub("\\D","",.) %>% 437 gsub("\\D","",.) %>%
439 c("GSE",.,"zscore.txt") %>% 438 c("GSE",.,"zscore.txt") %>%
440 paste(collapse = "") 439 paste(collapse = "")
441 zscraw <- scrawdat %>% 440 zscraw <- scrawdat %>%
442 t()%>% 441 t()%>%
443 as.data.frame(.,stringsAsFactors = FALSE) 442 as.data.frame(.,stringsAsFactors = FALSE)
444 zscraw <- cbind(rownames(zscraw),zscraw) 443 zscraw <- cbind(rownames(zscraw),zscraw)
445 colnames(zscraw) <- c("Gene Symbol",subjnam) 444 colnames(zscraw) <- c("Gene Symbol",subjnam)
446 rownames(zscraw) <- NULL 445 rownames(zscraw) <- NULL
447 write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = FALSE) 446 write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = FALSE)
448 447
449 448
450 ##Discretized the Data 449 ##Discretized the Data
451 dialzdat <- scrawdat %>% 450 dialzdat <- scrawdat %>%
452 dndat(.) %>% 451 dndat(.) %>%
453 t()%>% 452 t()%>%
454 as.data.frame(.,stringsAsFactors = FALSE) 453 as.data.frame(.,stringsAsFactors = FALSE)
455 colnames(dialzdat) <- rownames(RAWDATNUM) 454 colnames(dialzdat) <- rownames(RAWDATNUM)
456 455
457 ##setting "ID_REF" as a new variable 456 ##setting "ID_REF" as a new variable
458 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE) 457 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)
459 colnames(geneNAM) <- "ID_REF" 458 colnames(geneNAM) <- "ID_REF"
460 rownames(dialzdat) <- NULL 459 rownames(dialzdat) <- NULL
461 dialzdat <-bind_cols(geneNAM,dialzdat) 460 dialzdat <-bind_cols(geneNAM,dialzdat)
462 461
463 ##NAs in a column 462 ##NAs in a column
464 x <- 2 463 x <- 2
465 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 464 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
466 nacol[1,1] = "COL_NAs" 465 nacol[1,1] = "COL_NAs"
467 for(x in 2:dim(dialzdat)[2]){ 466 for(x in 2:dim(dialzdat)[2]){
468 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 467 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
469 x <- x + 1 468 x <- x + 1
470 } 469 }
471 colnames(nacol) <- colnames(dialzdat) 470 colnames(nacol) <- colnames(dialzdat)
472 dialzdat <- bind_rows(dialzdat,nacol) 471 dialzdat <- bind_rows(dialzdat,nacol)
473 472
474 ##NAs in a row 473 ##NAs in a row
475 y <- 1 474 y <- 1
476 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 475 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
477 for(y in 1:dim(dialzdat)[1]){ 476 for(y in 1:dim(dialzdat)[1]){
478 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 477 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
479 y <- y + 1 478 y <- y + 1
480 } 479 }
481 colnames(narowd) <- "ROW_NAs" 480 colnames(narowd) <- "ROW_NAs"
482 dialzdat <- bind_cols(dialzdat,narowd) 481 dialzdat <- bind_cols(dialzdat,narowd)
483 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 482 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
484 colnames(RAWWORD) <- colnames(dialzdat) 483 colnames(RAWWORD) <- colnames(dialzdat)
485 ##converting to character so that the clinical can be brought together with discrete data 484 ##converting to character so that the clinical can be brought together with discrete data
486 k <- 2 485 k <- 2
487 for(k in 2:dim(dialzdat)[2]-1){ 486 for(k in 2:dim(dialzdat)[2]-1){
488 dialzdat[,k] <- as.character(dialzdat[,k]) 487 dialzdat[,k] <- as.character(dialzdat[,k])
489 k <- k + 1 488 k <- k + 1
490 } 489 }
491 #The End the full data 490 #The End the full data
492 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) 491 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
493 492
494 #Produces Discrete file 493 #Produces Discrete file
495 nfnaex2 <- strsplit(alz,"[\\|/]") %>% 494 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
496 .[[1]] %>% 495 .[[1]] %>%
497 .[length(.)] %>% 496 .[length(.)] %>%
498 gsub("\\D","",.) %>% 497 gsub("\\D","",.) %>%
499 c("GSE",.,"dscrt.txt") %>% 498 c("GSE",.,"dscrt.txt") %>%
500 paste(collapse = "") 499 paste(collapse = "")
501 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) 500 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
502 n <- n +1 501 n <- n +1
503 } 502 }
504 } else if(numDAT == 2){ 503 } else if(numDAT == 2){
505 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN 504 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
506 505
507 #All the files you want to analyze 506 #All the files you want to analyze
508 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:") 507 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
509 if(length(ANDIS) == 0){ 508 if(length(ANDIS) == 0){
510 #Spit out a warning 509 #Spit out a warning
511 warning("You did not select any files and so no cleaning will be performed") 510 warning("You did not select any files and so no cleaning will be performed")
512 } else{ 511 } else{
513 #indexing the data files 512 #indexing the data files
514 n <- 1 513 n <- 1
515 for(n in 1: length(ANDIS)){ 514 for(n in 1: length(ANDIS)){
516 alz <- ANDIS[n] 515 alz <- ANDIS[n]
517 516
518 #Working with the wordy part of the document 517 #Working with the wordy part of the document
519 alzword <- alz %>% 518 alzword <- alz %>%
520 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 519 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
521 filter(grepl("!Sample",X1))%>% 520 filter(grepl("!Sample",X1))%>%
522 filter(!grepl("!Sample_contact",X1)) 521 filter(!grepl("!Sample_contact",X1))
523 522
524 #Getting the GPL file 523 #Getting the GPL file
525 genena <- grep("_platform_id",alzword$X1) %>% 524 genena <- grep("_platform_id",alzword$X1) %>%
526 alzword$X2[.] %>% 525 alzword$X2[.] %>%
527 str_trim(.) %>% 526 str_trim(.) %>%
528 paste0("^",.,"\\D") %>% 527 paste0("^",.,"\\D") %>%
529 grep(.,list.files()) %>% 528 grep(.,list.files()) %>%
530 list.files()[.] 529 list.files()[.]
531 530
532 #Find out if it is a soft GPL file or not 531 #Find out if it is a soft GPL file or not
533 soft <- strsplit(genena,"[\\|/]") %>% 532 soft <- strsplit(genena,"[\\|/]") %>%
534 .[[1]] %>% 533 .[[1]] %>%
535 .[length(.)] %>% 534 .[length(.)] %>%
536 grepl("soft",.) 535 grepl("soft",.)
537 536
538 ##Changing row names and column names: 537 ##Changing row names and column names:
539 ALZWORD <- t(alzword) 538 ALZWORD <- t(alzword)
540 rownames(ALZWORD)=NULL 539 rownames(ALZWORD)=NULL
541 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 540 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
542 ALZWORD <- chngrownm(ALZWORD)[-1,] 541 ALZWORD <- chngrownm(ALZWORD)[-1,]
543 ALZWORD <- ALZWORD%>% 542 ALZWORD <- ALZWORD%>%
544 as.data.frame(.,stringsAsFactors = FALSE)%>% 543 as.data.frame(.,stringsAsFactors = FALSE)%>%
545 dplyr::select(-starts_with("col")) 544 dplyr::select(-starts_with("col"))
546 545
547 ##Reorganizing information within the columns and final clinical data 546 ##Reorganizing information within the columns and final clinical data
548 ALZWORDF <- cinfo(ALZWORD) 547 ALZWORDF <- cinfo(ALZWORD)
549 548
550 549
551 #Working with Actual Data part of file 550 #Working with Actual Data part of file
552 alzdat <- alz %>% 551 alzdat <- alz %>%
553 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 552 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
554 ALZDAT <- t(alzdat[,-1]) 553 ALZDAT <- t(alzdat[,-1])
555 rownames(ALZDAT)=NULL 554 rownames(ALZDAT)=NULL
556 555
557 ##Is there a clean version of the GPL file available? 556 ##Is there a clean version of the GPL file available?
558 gplnum <- strsplit(genena,"[\\|/]") %>% 557 gplnum <- strsplit(genena,"[\\|/]") %>%
559 .[[1]] %>% 558 .[[1]] %>%
560 .[length(.)] %>% 559 .[length(.)] %>%
561 gsub("\\D","",.) 560 gsub("\\D","",.)
562 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 561 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
563 if(clfileex >= 1){ 562 if(clfileex >= 1){
564 #use the clean version 563 #use the clean version
565 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 564 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
566 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 565 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
567 566
568 } else if(clfileex == 0){ 567 } else if(clfileex == 0){
569 ##Lets Create a clean version 568 ##Lets Create a clean version
570 569
571 ##Gene ID to Gene Name 570 ##Gene ID to Gene Name
572 if(soft == TRUE){ 571 if(soft == TRUE){
573 #Check to see if there is already a file containing information on soft files 572 #Check to see if there is already a file containing information on soft files
574 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 573 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
575 if(fileex == 1){ 574 if(fileex == 1){
576 #Check to see if this GPL soft file has been used before 575 #Check to see if this GPL soft file has been used before
577 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 576 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
578 .$GPL_FILE_NUM%>% 577 .$GPL_FILE_NUM%>%
579 grepl(gplnum,.) %>% 578 grepl(gplnum,.) %>%
580 sum() 579 sum()
581 if(IDF == 1){ 580 if(IDF == 1){
582 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 581 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
583 .$GPL_FILE_NUM%>% 582 .$GPL_FILE_NUM%>%
584 grep(gplnum,.) 583 grep(gplnum,.)
585 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 584 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
586 .$LOC_ID %>% 585 .$LOC_ID %>%
587 .[IDLOCAL] 586 .[IDLOCAL]
588 geneIDNam <- genena %>% 587 geneIDNam <- genena %>%
589 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 588 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
590 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.))) 589 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.)))
591 } else if(IDF == 0){ 590 } else if(IDF == 0){
592 #No information on this particular GPL file 591 #No information on this particular GPL file
593 idLOCGPL <- genena %>% 592 idLOCGPL <- genena %>%
594 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 593 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
595 t(.) %>% 594 t(.) %>%
596 grep("^ID\\s*$",.) %>% 595 grep("^ID\\s*$",.) %>%
597 -1 596 -1
598 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 597 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
599 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 598 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
600 geneIDNam <- genena %>% 599 geneIDNam <- genena %>%
601 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 600 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
602 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.))) 601 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.)))
603 } 602 }
604 } else if(fileex == 0){ 603 } else if(fileex == 0){
605 #We must create a file that we can access for later use 604 #We must create a file that we can access for later use
606 idLOCGPL <- genena %>% 605 idLOCGPL <- genena %>%
607 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 606 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
608 t(.) %>% 607 t(.) %>%
609 grep("^ID\\s*$",.) %>% 608 grep("^ID\\s*$",.) %>%
610 -1 609 -1
611 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 610 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
612 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 611 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
613 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 612 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
614 geneIDNam <- genena %>% 613 geneIDNam <- genena %>%
615 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 614 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
616 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.))) 615 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.)))
617 } 616 }
618 } else if(soft == FALSE){ 617 } else if(soft == FALSE){
619 geneIDNam <- genena %>% 618 geneIDNam <- genena %>%
620 read_delim(delim="\t",comment = "#")%>% 619 read_delim(delim="\t",comment = "#")%>%
621 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.))) 620 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$",colnames(.)))
622 } 621 }
623 622
624 ##Labeling the gene IDs without names 623 ##Labeling the gene IDs without names
625 geneIDNam <- NAFIXING(geneIDNam) 624 geneIDNam <- NAFIXING(geneIDNam)
626 625
627 ##remove the whitespace 626 ##remove the whitespace
628 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 627 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
629 628
630 ##Here is the clean version 629 ##Here is the clean version
631 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 630 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
632 } 631 }
633 632
634 633
635 634
636 ##Changing the gene ID to gene name 635 ##Changing the gene ID to gene name
637 ALZDAT1 <- cgeneID(geneIDNam,alzdat) 636 ALZDAT1 <- cgeneID(geneIDNam,alzdat)
638 colnames(ALZDAT) = ALZDAT1[1,] 637 colnames(ALZDAT) = ALZDAT1[1,]
639 638
640 639
641 ##Adjusting the column names aka the gene names 640 ##Adjusting the column names aka the gene names
642 colnames(ALZDAT) <- gcnames(ALZDAT) 641 colnames(ALZDAT) <- gcnames(ALZDAT)
643 642
644 643
645 #Full RAW Data 644 #Full RAW Data
646 Fullalzdwr <- ALZDAT %>% 645 Fullalzdwr <- ALZDAT %>%
647 as.data.frame(.,stringsAsFactors = FALSE) %>% 646 as.data.frame(.,stringsAsFactors = FALSE) %>%
648 cbind(ALZWORDF,.) 647 cbind(ALZWORDF,.)
649 648
650 #Raw file is output 649 #Raw file is output
651 nfnaex <- strsplit(alz,"[\\]") %>% 650 nfnaex <- strsplit(alz,"[\\|/]") %>%
652 .[[1]] %>% 651 .[[1]] %>%
653 .[length(.)] %>% 652 .[length(.)] %>%
654 gsub("\\D","",.) %>% 653 gsub("\\D","",.) %>%
655 c("GSE",.,"aftexcel.txt") %>% 654 c("GSE",.,"aftexcel.txt") %>%
656 paste(collapse = "") 655 paste(collapse = "")
657 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 656 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
658 657
659 658
660 659
661 #Now for the discretization part 660 #Now for the discretization part
662 ##get the wordy part again 661 ##get the wordy part again
663 rawword <- t(ALZWORDF) 662 rawword <- t(ALZWORDF)
664 663
665 ##where is ID_REF located 664 ##where is ID_REF located
666 hereim <- grep("ID_REF",rownames(rawword)) 665 hereim <- grep("ID_REF",rownames(rawword))
667 666
668 ##Subject Names GSM... 667 ##Subject Names GSM...
669 subjnam <- rawword[hereim,] 668 subjnam <- rawword[hereim,]
670 669
671 ##Getting the names for the rows 670 ##Getting the names for the rows
672 namedarows <- rownames(rawword)[-hereim] %>% 671 namedarows <- rownames(rawword)[-hereim] %>%
673 as.data.frame(.,stringsAsFactors = FALSE) 672 as.data.frame(.,stringsAsFactors = FALSE)
674 RAWWORD <- rawword[-hereim,] %>% 673 RAWWORD <- rawword[-hereim,] %>%
675 as.data.frame(.,stringsAsFactors = FALSE) %>% 674 as.data.frame(.,stringsAsFactors = FALSE) %>%
676 bind_cols(namedarows,.) 675 bind_cols(namedarows,.)
677 z <- 1 676 z <- 1
678 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 677 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
679 for(z in 1:dim(RAWWORD)[1]){ 678 for(z in 1:dim(RAWWORD)[1]){
680 if(sum(is.na(RAWWORD[z,])) > 0){ 679 if(sum(is.na(RAWWORD[z,])) > 0){
681 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 680 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
682 } 681 }
683 if(length(grep("NA",RAWWORD[z,])) > 0){ 682 if(length(grep("NA",RAWWORD[z,])) > 0){
684 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] 683 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
685 } 684 }
686 z <- z + 1 685 z <- z + 1
687 } 686 }
688 687
689 colnames(naroww) <- "ROW_NAs" 688 colnames(naroww) <- "ROW_NAs"
690 RAWWORD <- bind_cols(RAWWORD,naroww) 689 RAWWORD <- bind_cols(RAWWORD,naroww)
691 690
692 691
693 roALZna <- t(ALZDAT) %>% 692 roALZna <- t(ALZDAT) %>%
694 rownames(.) %>% 693 rownames(.) %>%
695 as.data.frame(.,stringsAsFactors = FALSE) 694 as.data.frame(.,stringsAsFactors = FALSE)
696 colnames(roALZna) <- "ID_REF" 695 colnames(roALZna) <- "ID_REF"
697 696
698 RAWDAT <- t(ALZDAT) %>% 697 RAWDAT <- t(ALZDAT) %>%
699 as.data.frame(.,stringsAsFactors = FALSE) 698 as.data.frame(.,stringsAsFactors = FALSE)
700 colnames(RAWDAT) <- NULL 699 colnames(RAWDAT) <- NULL
701 rownames(RAWDAT) <- NULL 700 rownames(RAWDAT) <- NULL
702 701
703 RAWDAT2 <- RAWDAT %>% 702 RAWDAT2 <- RAWDAT %>%
704 cbind(roALZna,.) %>% 703 cbind(roALZna,.) %>%
705 dplyr::arrange(.,ID_REF) 704 dplyr::arrange(.,ID_REF)
706 705
707 ##Editing the file for R processing 706 ##Editing the file for R processing
708 RAWDATID <- RAWDAT2[,1] %>% 707 RAWDATID <- RAWDAT2[,1] %>%
709 as.matrix(.) 708 as.matrix(.)
710 709
711 RAWDATNUM <- RAWDAT2[,-1] %>% 710 RAWDATNUM <- RAWDAT2[,-1] %>%
712 mapply(.,FUN = as.numeric) %>% 711 mapply(.,FUN = as.numeric) %>%
713 t(.) 712 t(.)
714 713
715 ##Consolidating genes with the same name 714 ##Consolidating genes with the same name
716 ###create empty matrix of size equal to tabRDATID 715 ###create empty matrix of size equal to tabRDATID
717 tabRDATID <- table(RAWDATID) 716 tabRDATID <- table(RAWDATID)
718 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 717 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
719 j <- 1 718 j <- 1
720 for(j in 1:length(tabRDATID)){ 719 for(j in 1:length(tabRDATID)){
721 ##Putting the ones without duplicates in their new homes 720 ##Putting the ones without duplicates in their new homes
722 if(tabRDATID[j] == 1){ 721 if(tabRDATID[j] == 1){
723 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 722 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
724 } else if(tabRDATID[j] > 1){ 723 } else if(tabRDATID[j] > 1){
725 ##Averaging duplicates and putting them in their new homes 724 ##Averaging duplicates and putting them in their new homes
726 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 725 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
727 } 726 }
728 j <- j + 1 727 j <- j + 1
729 } 728 }
730 729
731 ##Outputting non Z-score Average over genes 730 ##Outputting non Z-score Average over genes
732 newoutput <-NuRDATN 731 newoutput <-NuRDATN
733 colnames(newoutput) <- rownames(tabRDATID) 732 colnames(newoutput) <- rownames(tabRDATID)
734 nfnewout <- strsplit(alz,"[\\]") %>% 733 nfnewout <- strsplit(alz,"[\\|/]") %>%
735 .[[1]] %>% 734 .[[1]] %>%
736 .[length(.)] %>% 735 .[length(.)] %>%
737 gsub("\\D","",.) %>% 736 gsub("\\D","",.) %>%
738 c("GSE",.,"avg.txt") %>% 737 c("GSE",.,"avg.txt") %>%
739 paste(collapse = "") 738 paste(collapse = "")
740 noutput <- newoutput %>% 739 noutput <- newoutput %>%
741 t()%>% 740 t()%>%
742 as.data.frame(.,stringsAsFactors = FALSE) 741 as.data.frame(.,stringsAsFactors = FALSE)
743 noutput <- cbind(rownames(noutput),noutput) 742 noutput <- cbind(rownames(noutput),noutput)
744 colnames(noutput) <- c("Gene Symbol",subjnam) 743 colnames(noutput) <- c("Gene Symbol",subjnam)
745 rownames(noutput) <- NULL 744 rownames(noutput) <- NULL
746 write.table(noutput, file = nfnewout, sep = "\t",col.names = TRUE,row.names = FALSE) 745 write.table(noutput, file = nfnewout, sep = "\t",col.names = TRUE,row.names = FALSE)
747 746
748 747
749 ##Scaling the Data 748 ##Scaling the Data
750 scrawdat <- NuRDATN%>% 749 scrawdat <- NuRDATN%>%
751 scale() 750 scale()
752 attr(scrawdat,"scaled:center") <- NULL 751 attr(scrawdat,"scaled:center") <- NULL
753 attr(scrawdat,"scaled:scale") <- NULL 752 attr(scrawdat,"scaled:scale") <- NULL
754 colnames(scrawdat) <- rownames(tabRDATID) 753 colnames(scrawdat) <- rownames(tabRDATID)
755 754
756 #Outputting the Z-score file 755 #Outputting the Z-score file
757 nfnzsc <- strsplit(alz,"[\\]") %>% 756 nfnzsc <- strsplit(alz,"[\\|/]") %>%
758 .[[1]] %>% 757 .[[1]] %>%
759 .[length(.)] %>% 758 .[length(.)] %>%
760 gsub("\\D","",.) %>% 759 gsub("\\D","",.) %>%
761 c("GSE",.,"zscore.txt") %>% 760 c("GSE",.,"zscore.txt") %>%
762 paste(collapse = "") 761 paste(collapse = "")
763 zscraw <- scrawdat %>% 762 zscraw <- scrawdat %>%
764 t()%>% 763 t()%>%
765 as.data.frame(.,stringsAsFactors = FALSE) 764 as.data.frame(.,stringsAsFactors = FALSE)
766 zscraw <- cbind(rownames(zscraw),zscraw) 765 zscraw <- cbind(rownames(zscraw),zscraw)
767 colnames(zscraw) <- c("Gene Symbol",subjnam) 766 colnames(zscraw) <- c("Gene Symbol",subjnam)
768 rownames(zscraw) <- NULL 767 rownames(zscraw) <- NULL
769 write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = FALSE) 768 write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = FALSE)
770 769
771 ##Discretized the Data 770 ##Discretized the Data
772 dialzdat <- scrawdat %>% 771 dialzdat <- scrawdat %>%
773 dndat(.) %>% 772 dndat(.) %>%
774 t()%>% 773 t()%>%
775 as.data.frame(.,stringsAsFactors = FALSE) 774 as.data.frame(.,stringsAsFactors = FALSE)
776 colnames(dialzdat) <- rownames(RAWDATNUM) 775 colnames(dialzdat) <- rownames(RAWDATNUM)
777 776
778 ##setting "ID_REF" as a new variable 777 ##setting "ID_REF" as a new variable
779 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE) 778 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)
780 colnames(geneNAM) <- "ID_REF" 779 colnames(geneNAM) <- "ID_REF"
781 rownames(dialzdat) <- NULL 780 rownames(dialzdat) <- NULL
782 dialzdat <-bind_cols(geneNAM,dialzdat) 781 dialzdat <-bind_cols(geneNAM,dialzdat)
783 782
784 ##NAs in a column 783 ##NAs in a column
785 x <- 2 784 x <- 2
786 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 785 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
787 nacol[1,1] = "COL_NAs" 786 nacol[1,1] = "COL_NAs"
788 for(x in 2:dim(dialzdat)[2]){ 787 for(x in 2:dim(dialzdat)[2]){
789 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 788 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
790 x <- x + 1 789 x <- x + 1
791 } 790 }
792 colnames(nacol) <- colnames(dialzdat) 791 colnames(nacol) <- colnames(dialzdat)
793 dialzdat <- bind_rows(dialzdat,nacol) 792 dialzdat <- bind_rows(dialzdat,nacol)
794 793
795 ##NAs in a row 794 ##NAs in a row
796 y <- 1 795 y <- 1
797 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 796 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
798 for(y in 1:dim(dialzdat)[1]){ 797 for(y in 1:dim(dialzdat)[1]){
799 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 798 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
800 y <- y + 1 799 y <- y + 1
801 } 800 }
802 colnames(narowd) <- "ROW_NAs" 801 colnames(narowd) <- "ROW_NAs"
803 dialzdat <- bind_cols(dialzdat,narowd) 802 dialzdat <- bind_cols(dialzdat,narowd)
804 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 803 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
805 colnames(RAWWORD) <- colnames(dialzdat) 804 colnames(RAWWORD) <- colnames(dialzdat)
806 ##converting to character so that the clinical can be brought together with discrete data 805 ##converting to character so that the clinical can be brought together with discrete data
807 k <- 2 806 k <- 2
808 for(k in 2:dim(dialzdat)[2]-1){ 807 for(k in 2:dim(dialzdat)[2]-1){
809 dialzdat[,k] <- as.character(dialzdat[,k]) 808 dialzdat[,k] <- as.character(dialzdat[,k])
810 k <- k + 1 809 k <- k + 1
811 } 810 }
812 #The End the full data 811 #The End the full data
813 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) 812 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
814 813
815 #Produces Discrete file 814 #Produces Discrete file
816 nfnaex2 <- strsplit(alz,"[\\|/]") %>% 815 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
817 .[[1]] %>% 816 .[[1]] %>%
818 .[length(.)] %>% 817 .[length(.)] %>%
819 gsub("\\D","",.) %>% 818 gsub("\\D","",.) %>%
820 c("GSE",.,"dscrt.txt") %>% 819 c("GSE",.,"dscrt.txt") %>%
821 paste(collapse = "") 820 paste(collapse = "")
822 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) 821 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
823 822
824 823
825 n <- n + 1 824 n <- n + 1
826 } 825 }
827 } 826 }
828 } 827 }
829 } 828 }
830 #Script entry point: the call below runs every time you want to clean/change a data set; it invokes THEFT() with no arguments 829 #Script entry point: the call below runs every time you want to clean/change a data set; it invokes THEFT() with no arguments
831 THEFT() 830 THEFT()