Commit 6a66705877a70dd83cb8cff53f5b52f447b4dc76

Authored by Efrain Gonzalez
1 parent 769f081f9b
Exists in master

Don't use this code yet

updated else if statements
Showing 1 changed file with 20 additions and 38 deletions   Show diff stats
1 ######################################################################## 1 ########################################################################
2 # Don't Use This Code Just Yet # 2 # Don't Use This Code Just Yet #
3 ######################################################################## 3 ########################################################################
4 #Efrain H. Gonzalez 4 #Efrain H. Gonzalez
5 #6/16/2017 5 #6/16/2017
6 6
7 #Libraries required to run the code 7 #Libraries required to run the code
8 library(pryr) 8 library(pryr)
9 library(MASS) 9 library(MASS)
10 library(dplyr) 10 library(dplyr)
11 library(tidyr) 11 library(tidyr)
12 library(readr) 12 library(readr)
13 library(stringr) 13 library(stringr)
14 14
15 15
16 #Necessary Functions 16 #Necessary Functions
17 #1#Function for handling the changing of row names and column names 17 #1#Function for handling the changing of row names and column names
# Rename the columns of a transposed sample-annotation matrix.
#
# mat: character matrix (or data frame) that already has column names.
#      Row 1 holds the "!Sample_..." field names and row 2 holds the first
#      sample's values, which are used to recognise the characteristic
#      columns.
#
# Returns `mat` with the same contents and new column names:
#   * the three known header fields become Brain_Region / Title / ID_REF;
#   * any other column whose row-2 value matches a keyword set becomes
#     Sex<n>, PMI<n>, Age<n>, Braak<n> or Group<n>;
#   * columns matching nothing keep their existing name.
chngrownm <- function(mat){
  header_labels <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )
  # Ordered from lowest to highest precedence: when a value matches several
  # keyword sets the LAST matching label is the one that is kept (e.g.
  # "braak stage" contains "age" but must end up as Braak, not Age).
  value_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi|interval",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  # One running number per label; only the label that is actually assigned
  # consumes a number, so the numbering has no gaps.
  counters <- rep(1L, length(value_patterns))
  names(counters) <- names(value_patterns)
  has_values <- dim(mat)[1] >= 2

  for (e in seq_len(dim(mat)[2])) {
    header <- as.character(mat[1, e])
    if (!is.na(header) && header %in% names(header_labels)) {
      colnames(mat)[e] <- header_labels[[header]]
      next
    }
    if (!has_values) {
      next
    }
    # grepl() returns FALSE for NA values, so missing entries match nothing.
    hits <- vapply(value_patterns, grepl, logical(1), x = mat[2, e])
    if (any(hits)) {
      label <- names(value_patterns)[max(which(hits))]
      colnames(mat)[e] <- paste0(label, counters[[label]])
      counters[[label]] <- counters[[label]] + 1L
    }
  }
  mat
}
63 61
64 #2#Function for reorganizing information within the columns 62 #2#Function for reorganizing information within the columns
# Clean up the clinical columns produced by chngrownm().
#
# mat: data frame (or matrix) whose column names contain the labels
#      Group / Age / Sex / PMI / Braak. Column 1 is never modified.
#
# For every column from the second onwards, based on its name:
#   Group -> strip the "<field>: " prefix and any " ...;..." trailer
#   Age   -> keep digits only, convert to integer
#   Sex   -> strip the "<field>: " prefix
#   PMI   -> keep digits and dots only, convert to numeric
#   Braak -> strip the prefix, read as a roman numeral, convert to integer
# Returns the modified `mat`.
cinfo <- function(mat){
  labels <- colnames(mat)
  if (is.null(labels)) {
    # Nothing can be recognised without column names.
    return(mat)
  }
  # seq_len(n)[-1] is empty when there is only one column; `2:n` would count
  # down to 1 and rewrite the column that is meant to be skipped.
  for (j in seq_len(dim(mat)[2])[-1]) {
    label <- labels[j]
    if (grepl("Group", label)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    } else if (grepl("Age", label)) {
      # NOTE(review): dropping every non-digit turns "85.5" into 855 --
      # confirm ages are always whole numbers in the input files.
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    } else if (grepl("Sex", label)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    } else if (grepl("PMI", label)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    } else if (grepl("Braak", label)) {
      mat[, j] <- as.integer(utils::as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
92 86
93 #3#Function for labeling the gene IDs without names 87 #3#Function for labeling the gene IDs without names
# Give every gene ID a usable name.
#
# GIDNAM: two-column table (data frame / tibble or matrix); column 1 holds
#         the gene/probe ID and column 2 the gene name.
#
# Wherever column 2 is missing (NA) or is the literal text "NA" (optionally
# followed by whitespace), it is replaced by the ID from column 1 of the
# same row. Returns the updated table; an empty table is returned as is.
NAFIXING <- function(GIDNAM){
  if (dim(GIDNAM)[1] == 0) {
    return(GIDNAM)
  }
  is_df <- is.data.frame(GIDNAM)
  ids <- if (is_df) GIDNAM[[1]] else GIDNAM[, 1]
  symbols <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]
  # Factors cannot take values outside their levels, so work on plain text.
  if (is.factor(ids)) {
    ids <- as.character(ids)
  }
  if (is.factor(symbols)) {
    symbols <- as.character(symbols)
  }

  unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)
  if (any(unnamed)) {
    symbols[unnamed] <- ids[unnamed]
    if (is_df) {
      GIDNAM[[2]] <- symbols
    } else {
      GIDNAM[, 2] <- symbols
    }
  }
  GIDNAM
}
105 99
106 #4#Function for changing the gene ID to gene name 100 #4#Function for changing the gene ID to gene name
# Replace gene/probe IDs by gene names.
#
# GeneName: table whose transpose has the IDs in row 1 and the matching
#           gene names in row 2.
# DATA:     table whose transpose has the IDs to translate in row 1.
#
# Both arguments are transposed on entry. Every entry in row 1 of t(DATA)
# that equals an ID in row 1 of t(GeneName) is replaced by the corresponding
# name from row 2 (the first match wins when an ID is listed more than
# once). IDs without a match, and missing IDs, are left as they are.
# Returns the transposed, relabelled DATA.
#
# NOTE(review): the call site visible in this file already passes
# t(geneIDNam) and t(alzdat), which this function transposes again --
# confirm which orientation is intended before relying on the result.
cgeneID <- function(GeneName,DATA){
  lookup <- t(GeneName)
  relabelled <- t(DATA)
  if (nrow(relabelled) == 0 || ncol(relabelled) == 0) {
    return(relabelled)
  }

  keys <- relabelled[1, ]
  # Exact matching: IDs may contain regex metacharacters (".", "+", "(" ...),
  # so they must not be pasted into a pattern.
  pos <- match(keys, lookup[1, ])
  found <- which(!is.na(keys) & !is.na(pos))
  if (length(found) > 0) {
    relabelled[1, found] <- lookup[2, pos[found]]
  }
  relabelled
}
125 #cgeneID <- function(GeneName,DATA){ 119 #cgeneID <- function(GeneName,DATA){
126 # colGene <- dim(GeneName)[2] 120 # colGene <- dim(GeneName)[2]
127 # j <- 1 121 # j <- 1
128 # for(j in 1:colGene){ 122 # for(j in 1:colGene){
129 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) 123 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
130 # if(is.na(sum(chngsreq))==FALSE){ 124 # if(is.na(sum(chngsreq))==FALSE){
131 # if(sum(chngsreq) > 0){ 125 # if(sum(chngsreq) > 0){
132 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) 126 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
133 # } 127 # }
134 # } 128 # }
135 # j = j+1 129 # j = j+1
136 # } 130 # }
137 # DATA 131 # DATA
138 #} 132 #}
139 133
140 #5#Function for adjusting the gene names 134 #5#Function for adjusting the gene names
# Pick one gene name per column from "///"-separated column names.
#
# DiData: matrix or data frame whose column names may list several gene
#         names separated by "///".
# usecol: which of the listed names to keep (default: the first). Columns
#         with fewer than `usecol` names fall back to their first name.
#
# Returns a character vector with one whitespace-trimmed name per column
# (empty when DiData has no columns).
gcnames <- function(DiData,usecol=1){
  if (dim(DiData)[2] == 0) {
    return(character(0))
  }
  # Split every column name once instead of re-splitting inside a loop.
  pieces <- strsplit(colnames(DiData), "///")
  vapply(
    pieces,
    function(parts) {
      pick <- if (length(parts) >= usecol) usecol else 1
      str_trim(parts[pick])
    },
    character(1),
    USE.NAMES = FALSE
  )
}
156 150
157 #6# Function for discretizing the data 151 #6# Function for discretizing the data
# Discretise scaled expression values into three levels.
#
# NDATA: numeric matrix (or all-numeric data frame).
#
# Returns a numeric matrix of the same dimensions, carrying over the column
# names of NDATA (row names are not carried over), where
#   value < -1        -> 0
#   -1 <= value <= 1  -> 1
#   value > 1         -> 2
#   missing value     -> left missing
dndat <- function(NDATA){
  values <- as.matrix(NDATA)
  # Start everything in the middle bin so that a value of exactly 1 lands
  # there too, instead of matching no branch and silently staying 0.
  DDATA <- matrix(1, nrow = dim(NDATA)[1], ncol = dim(NDATA)[2])
  colnames(DDATA) <- colnames(NDATA)

  # which() drops the NA comparisons, so missing cells are untouched here.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2
  missing_cells <- which(is.na(values))
  DDATA[missing_cells] <- values[missing_cells]
  DDATA
}
187 179
188 180
189 #MajorFunction#This is the function that does everything else 181 #MajorFunction#This is the function that does everything else
190 THEFT <- function(){ 182 THEFT <- function(){
191 #Set working directory based on the directory of the series matrix file Currently only works for windows 183 #Set working directory based on the directory of the series matrix file Currently only works for windows
192 wd <- getwd() 184 wd <- getwd()
193 #list.files() 185 #list.files()
194 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?") 186 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?")
195 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L) 187 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
196 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files()) 188 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())
197 GSEfloc <- list.files()[GSEfileloc] 189 GSEfloc <- list.files()[GSEfileloc]
198 #ALL DATA FILES WILL BE CLEANED 190 #ALL DATA FILES WILL BE CLEANED
199 if(numDAT == 1){ 191 if(numDAT == 1){
200 #indexing the data files 192 #indexing the data files
201 n <- 1 193 n <- 1
202 for(n in 1: length(GSEfloc)){ 194 for(n in 1: length(GSEfloc)){
203 alz <- GSEfloc[n] 195 alz <- GSEfloc[n]
204 196
205 #Working with the wordy part of the document 197 #Working with the wordy part of the document
206 alzword <- alz %>% 198 alzword <- alz %>%
207 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 199 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
208 filter(grepl("!Sample",X1))%>% 200 filter(grepl("!Sample",X1))%>%
209 filter(!grepl("!Sample_contact",X1)) 201 filter(!grepl("!Sample_contact",X1))
210 202
211 #Getting the GPL file 203 #Getting the GPL file
212 genena <- grep("_platform_id",alzword$X1) %>% 204 genena <- grep("_platform_id",alzword$X1) %>%
213 alzword$X2[.] %>% 205 alzword$X2[.] %>%
214 str_trim(.) %>% 206 str_trim(.) %>%
215 paste0("^",.,"\\D") %>% 207 paste0("^",.,"\\D") %>%
216 grep(.,list.files()) %>% 208 grep(.,list.files()) %>%
217 list.files()[.] 209 list.files()[.]
218 210
219 #Find out if it is a soft GPL file or not 211 #Find out if it is a soft GPL file or not
220 soft <- strsplit(genena,"[\\|/]") %>% 212 soft <- strsplit(genena,"[\\|/]") %>%
221 .[[1]] %>% 213 .[[1]] %>%
222 .[length(.)] %>% 214 .[length(.)] %>%
223 grepl("soft",.) 215 grepl("soft",.)
224 216
225 ##Changing row names and column names: 217 ##Changing row names and column names:
226 ALZWORD <- t(alzword) 218 ALZWORD <- t(alzword)
227 rownames(ALZWORD)=NULL 219 rownames(ALZWORD)=NULL
228 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 220 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
229 ALZWORD <- chngrownm(ALZWORD)[-1,] 221 ALZWORD <- chngrownm(ALZWORD)[-1,]
230 ALZWORD <- ALZWORD%>% 222 ALZWORD <- ALZWORD%>%
231 as.data.frame()%>% 223 as.data.frame()%>%
232 dplyr::select(-starts_with("col")) 224 dplyr::select(-starts_with("col"))
233 225
234 ##Reorganizing information within the columns and final clinical data 226 ##Reorganizing information within the columns and final clinical data
235 ALZWORDF <- cinfo(ALZWORD) 227 ALZWORDF <- cinfo(ALZWORD)
236 228
237 229
238 #Working with Actual Data part of file 230 #Working with Actual Data part of file
239 alzdat <- alz %>% 231 alzdat <- alz %>%
240 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 232 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
241 ALZDAT <- t(alzdat[,-1]) 233 ALZDAT <- t(alzdat[,-1])
242 rownames(ALZDAT)=NULL 234 rownames(ALZDAT)=NULL
243 235
244 ##Is there a clean version of the GPL file available? 236 ##Is there a clean version of the GPL file available?
245 gplnum <- strsplit(genena,"[\\|/]") %>% 237 gplnum <- strsplit(genena,"[\\|/]") %>%
246 .[[1]] %>% 238 .[[1]] %>%
247 .[length(.)] %>% 239 .[length(.)] %>%
248 gsub("\\D","",.) 240 gsub("\\D","",.)
249 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 241 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
250 if(clfileex >= 1){ 242 if(clfileex >= 1){
251 #use the clean version 243 #use the clean version
252 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 244 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
253 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 245 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
254 246
255 } 247 }
256 else if(clfileex == 0){ 248 else if(clfileex == 0){
257 ##Lets Create a clean version 249 ##Lets Create a clean version
258 250
259 ##Gene ID to Gene Name 251 ##Gene ID to Gene Name
260 if(soft == TRUE){ 252 if(soft == TRUE){
261 #Check to see if there is already a file containing information on soft files 253 #Check to see if there is already a file containing information on soft files
262 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 254 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
263 if(fileex == 1){ 255 if(fileex == 1){
264 #Check to see if this GPL soft file has been used before 256 #Check to see if this GPL soft file has been used before
265 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 257 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
266 .$GPL_FILE_NUM%>% 258 .$GPL_FILE_NUM%>%
267 grepl(gplnum,.) %>% 259 grepl(gplnum,.) %>%
268 sum() 260 sum()
269 if(IDF == 1){ 261 if(IDF == 1){
270 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 262 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
271 .$GPL_FILE_NUM%>% 263 .$GPL_FILE_NUM%>%
272 grep(gplnum,.) 264 grep(gplnum,.)
273 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 265 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
274 .$LOC_ID %>% 266 .$LOC_ID %>%
275 .[IDLOCAL] 267 .[IDLOCAL]
276 geneIDNam <- genena %>% 268 geneIDNam <- genena %>%
277 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 269 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
278 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 270 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
279 } 271 } else if(IDF == 0){
280 else if(IDF == 0){
281 #No information on this particular GPL file 272 #No information on this particular GPL file
282 idLOCGPL <- genena %>% 273 idLOCGPL <- genena %>%
283 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 274 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
284 t(.) %>% 275 t(.) %>%
285 grep("^ID\\s*$",.) %>% 276 grep("^ID\\s*$",.) %>%
286 -1 277 -1
287 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 278 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
288 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 279 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
289 geneIDNam <- genena %>% 280 geneIDNam <- genena %>%
290 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 281 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
291 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 282 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
292 } 283 }
293 } 284 } else if(fileex == 0){
294 else if(fileex == 0){
295 #We must create a file that we can access for later use 285 #We must create a file that we can access for later use
296 idLOCGPL <- genena %>% 286 idLOCGPL <- genena %>%
297 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 287 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
298 t(.) %>% 288 t(.) %>%
299 grep("^ID\\s*$",.) %>% 289 grep("^ID\\s*$",.) %>%
300 -1 290 -1
301 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 291 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
302 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 292 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
303 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 293 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
304 geneIDNam <- genena %>% 294 geneIDNam <- genena %>%
305 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 295 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
306 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 296 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
307 } 297 }
308 } 298 } else if(soft == FALSE){
309 else if(soft == FALSE){
310 geneIDNam <- genena %>% 299 geneIDNam <- genena %>%
311 read_delim(delim="\t",comment = "#")%>% 300 read_delim(delim="\t",comment = "#")%>%
312 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 301 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
313 } 302 }
314 303
315 ##Labeling the gene IDs without names 304 ##Labeling the gene IDs without names
316 geneIDNam <- NAFIXING(geneIDNam) 305 geneIDNam <- NAFIXING(geneIDNam)
317 306
318 ##remove the whitespace 307 ##remove the whitespace
319 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 308 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
320 309
321 ##Here is the clean version 310 ##Here is the clean version
322 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 311 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
323 } 312 }
324 313
325 314
326 315
327 ##Changing the gene ID to gene name 316 ##Changing the gene ID to gene name
328 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) 317 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
329 colnames(ALZDAT) = ALZDAT1[1,] 318 colnames(ALZDAT) = ALZDAT1[1,]
330 319
331 320
332 ##Adjusting the column names aka the gene names 321 ##Adjusting the column names aka the gene names
333 colnames(ALZDAT) <- gcnames(ALZDAT) 322 colnames(ALZDAT) <- gcnames(ALZDAT)
334 323
335 324
336 #Full RAW Data 325 #Full RAW Data
337 Fullalzdwr <- ALZDAT %>% 326 Fullalzdwr <- ALZDAT %>%
338 as.data.frame() %>% 327 as.data.frame() %>%
339 cbind(ALZWORDF,.) 328 cbind(ALZWORDF,.)
340 329
341 #Raw file is output 330 #Raw file is output
342 nfnaex <- strsplit(alz,"[\\]") %>% 331 nfnaex <- strsplit(alz,"[\\]") %>%
343 .[[1]] %>% 332 .[[1]] %>%
344 .[length(.)] %>% 333 .[length(.)] %>%
345 gsub("\\D","",.) %>% 334 gsub("\\D","",.) %>%
346 c("GSE",.,"aftexcel.txt") %>% 335 c("GSE",.,"aftexcel.txt") %>%
347 paste(collapse = "") 336 paste(collapse = "")
348 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 337 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
349 338
350 339
351 340
352 #Now for the discretization part 341 #Now for the discretization part
353 ##get the wordy part again 342 ##get the wordy part again
354 rawword <- t(ALZWORDF) 343 rawword <- t(ALZWORDF)
355 344
356 ##where is ID_REF located 345 ##where is ID_REF located
357 hereim <- grep("ID_REF",rownames(rawword)) 346 hereim <- grep("ID_REF",rownames(rawword))
358 347
359 ##Subject Names GSM... 348 ##Subject Names GSM...
360 subjnam <- rawword[hereim,] 349 subjnam <- rawword[hereim,]
361 350
362 ##Getting the names for the rows 351 ##Getting the names for the rows
363 namedarows <- rownames(rawword)[-hereim] %>% 352 namedarows <- rownames(rawword)[-hereim] %>%
364 as.data.frame() 353 as.data.frame()
365 RAWWORD <- rawword[-hereim,] %>% 354 RAWWORD <- rawword[-hereim,] %>%
366 as.data.frame() %>% 355 as.data.frame() %>%
367 bind_cols(namedarows,.) 356 bind_cols(namedarows,.)
368 z <- 1 357 z <- 1
369 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 358 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
370 for(z in 1:dim(RAWWORD)[1]){ 359 for(z in 1:dim(RAWWORD)[1]){
371 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 360 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
372 z <- z + 1 361 z <- z + 1
373 } 362 }
374 363
375 colnames(naroww) <- "ROW_NAs" 364 colnames(naroww) <- "ROW_NAs"
376 RAWWORD <- bind_cols(RAWWORD,naroww) 365 RAWWORD <- bind_cols(RAWWORD,naroww)
377 366
378 367
379 roALZna <- t(ALZDAT) %>% 368 roALZna <- t(ALZDAT) %>%
380 rownames(.) %>% 369 rownames(.) %>%
381 as.data.frame(.) 370 as.data.frame(.)
382 colnames(roALZna) <- "ID_REF" 371 colnames(roALZna) <- "ID_REF"
383 372
384 RAWDAT <- t(ALZDAT) %>% 373 RAWDAT <- t(ALZDAT) %>%
385 as.data.frame(.) 374 as.data.frame(.)
386 colnames(RAWDAT) <- NULL 375 colnames(RAWDAT) <- NULL
387 rownames(RAWDAT) <- NULL 376 rownames(RAWDAT) <- NULL
388 377
389 RAWDAT2 <- RAWDAT %>% 378 RAWDAT2 <- RAWDAT %>%
390 cbind(roALZna,.) %>% 379 cbind(roALZna,.) %>%
391 dplyr::arrange(.,ID_REF) 380 dplyr::arrange(.,ID_REF)
392 381
393 ##Editing the file for R processing 382 ##Editing the file for R processing
394 RAWDATID <- RAWDAT2[,1] %>% 383 RAWDATID <- RAWDAT2[,1] %>%
395 as.matrix(.) 384 as.matrix(.)
396 385
397 RAWDATNUM <- RAWDAT2[,-1] %>% 386 RAWDATNUM <- RAWDAT2[,-1] %>%
398 mapply(.,FUN = as.numeric) %>% 387 mapply(.,FUN = as.numeric) %>%
399 t(.) 388 t(.)
400 389
401 ##Consolidating genes with the same name 390 ##Consolidating genes with the same name
402 ###create empty matrix of size equal to tabRDATID 391 ###create empty matrix of size equal to tabRDATID
403 tabRDATID <- table(RAWDATID) 392 tabRDATID <- table(RAWDATID)
404 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 393 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
405 j <- 1 394 j <- 1
406 for(j in 1:length(tabRDATID)){ 395 for(j in 1:length(tabRDATID)){
407 ##Putting the ones without duplicates in their new homes 396 ##Putting the ones without duplicates in their new homes
408 if(tabRDATID[j] == 1){ 397 if(tabRDATID[j] == 1){
409 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 398 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
410 } 399 } else if(tabRDATID[j] > 1){
411 ##Averaging duplicates and putting them in their new homes 400 ##Averaging duplicates and putting them in their new homes
412 else if(tabRDATID[j] > 1){
413 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 401 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
414 } 402 }
415 j <- j + 1 403 j <- j + 1
416 } 404 }
417 405
418 ##Scaling the Data 406 ##Scaling the Data
419 scrawdat <- NuRDATN%>% 407 scrawdat <- NuRDATN%>%
420 scale() 408 scale()
421 attr(scrawdat,"scaled:center") <- NULL 409 attr(scrawdat,"scaled:center") <- NULL
422 attr(scrawdat,"scaled:scale") <- NULL 410 attr(scrawdat,"scaled:scale") <- NULL
423 colnames(scrawdat) <- rownames(tabRDATID) 411 colnames(scrawdat) <- rownames(tabRDATID)
424 412
425 ##Discretized the Data 413 ##Discretized the Data
426 dialzdat <- scrawdat %>% 414 dialzdat <- scrawdat %>%
427 dndat(.) %>% 415 dndat(.) %>%
428 t()%>% 416 t()%>%
429 as.data.frame(.) 417 as.data.frame(.)
430 colnames(dialzdat) <- rownames(RAWDATNUM) 418 colnames(dialzdat) <- rownames(RAWDATNUM)
431 419
432 ##setting "ID_REF" as a new variable 420 ##setting "ID_REF" as a new variable
433 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1)) 421 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
434 colnames(geneNAM) <- "ID_REF" 422 colnames(geneNAM) <- "ID_REF"
435 rownames(dialzdat) <- NULL 423 rownames(dialzdat) <- NULL
436 dialzdat <-bind_cols(geneNAM,dialzdat) 424 dialzdat <-bind_cols(geneNAM,dialzdat)
437 425
438 ##NAs in a column 426 ##NAs in a column
439 x <- 2 427 x <- 2
440 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 428 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
441 nacol[1,1] = "COL_NAs" 429 nacol[1,1] = "COL_NAs"
442 for(x in 2:dim(dialzdat)[2]){ 430 for(x in 2:dim(dialzdat)[2]){
443 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 431 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
444 x <- x + 1 432 x <- x + 1
445 } 433 }
446 colnames(nacol) <- colnames(dialzdat) 434 colnames(nacol) <- colnames(dialzdat)
447 dialzdat <- bind_rows(dialzdat,nacol) 435 dialzdat <- bind_rows(dialzdat,nacol)
448 436
449 ##NAs in a row 437 ##NAs in a row
450 y <- 1 438 y <- 1
451 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 439 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
452 for(y in 1:dim(dialzdat)[1]){ 440 for(y in 1:dim(dialzdat)[1]){
453 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 441 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
454 y <- y + 1 442 y <- y + 1
455 } 443 }
456 colnames(narowd) <- "ROW_NAs" 444 colnames(narowd) <- "ROW_NAs"
457 dialzdat <- bind_cols(dialzdat,narowd) 445 dialzdat <- bind_cols(dialzdat,narowd)
458 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 446 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
459 colnames(RAWWORD) <- colnames(dialzdat) 447 colnames(RAWWORD) <- colnames(dialzdat)
460 ##converting to character so that the clinical can be brought together with discrete data 448 ##converting to character so that the clinical can be brought together with discrete data
461 k <- 2 449 k <- 2
462 for(k in 2:dim(dialzdat)[2]-1){ 450 for(k in 2:dim(dialzdat)[2]-1){
463 dialzdat[,k] <- as.character(dialzdat[,k]) 451 dialzdat[,k] <- as.character(dialzdat[,k])
464 k <- k + 1 452 k <- k + 1
465 } 453 }
466 #The End the full data 454 #The End the full data
467 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) 455 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
468 456
469 #Produces Discrete file 457 #Produces Discrete file
470 nfnaex2 <- strsplit(alz,"[\\|/]") %>% 458 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
471 .[[1]] %>% 459 .[[1]] %>%
472 .[length(.)] %>% 460 .[length(.)] %>%
473 gsub("\\D","",.) %>% 461 gsub("\\D","",.) %>%
474 c("GSE",.,"dscrt.txt") %>% 462 c("GSE",.,"dscrt.txt") %>%
475 paste(collapse = "") 463 paste(collapse = "")
476 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) 464 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
477 n <- n +1 465 n <- n +1
478 } 466 }
479 } 467 } else if(numDAT == 2){
480
481 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN 468 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
482 else if(numDAT == 2){ 469
483 #All the files you want to analyze 470 #All the files you want to analyze
484 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:") 471 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
485 if(length(ANDIS) == 0){ 472 if(length(ANDIS) == 0){
486 #Spit out a warning 473 #Spit out a warning
487 warning("You did not select any files and so no cleaning will be performed") 474 warning("You did not select any files and so no cleaning will be performed")
488 } else{ 475 } else{
489 #indexing the data files 476 #indexing the data files
490 n <- 1 477 n <- 1
491 for(n in 1: length(ANDIS)){ 478 for(n in 1: length(ANDIS)){
492 alz <- ANDIS[n] 479 alz <- ANDIS[n]
493 480
494 #Working with the wordy part of the document 481 #Working with the wordy part of the document
495 alzword <- alz %>% 482 alzword <- alz %>%
496 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 483 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
497 filter(grepl("!Sample",X1))%>% 484 filter(grepl("!Sample",X1))%>%
498 filter(!grepl("!Sample_contact",X1)) 485 filter(!grepl("!Sample_contact",X1))
499 486
500 #Getting the GPL file 487 #Getting the GPL file
501 genena <- grep("_platform_id",alzword$X1) %>% 488 genena <- grep("_platform_id",alzword$X1) %>%
502 alzword$X2[.] %>% 489 alzword$X2[.] %>%
503 str_trim(.) %>% 490 str_trim(.) %>%
504 paste0("^",.,"\\D") %>% 491 paste0("^",.,"\\D") %>%
505 grep(.,list.files()) %>% 492 grep(.,list.files()) %>%
506 list.files()[.] 493 list.files()[.]
507 494
508 #Find out if it is a soft GPL file or not 495 #Find out if it is a soft GPL file or not
509 soft <- strsplit(genena,"[\\|/]") %>% 496 soft <- strsplit(genena,"[\\|/]") %>%
510 .[[1]] %>% 497 .[[1]] %>%
511 .[length(.)] %>% 498 .[length(.)] %>%
512 grepl("soft",.) 499 grepl("soft",.)
513 500
514 ##Changing row names and column names: 501 ##Changing row names and column names:
515 ALZWORD <- t(alzword) 502 ALZWORD <- t(alzword)
516 rownames(ALZWORD)=NULL 503 rownames(ALZWORD)=NULL
517 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 504 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
518 ALZWORD <- chngrownm(ALZWORD)[-1,] 505 ALZWORD <- chngrownm(ALZWORD)[-1,]
519 ALZWORD <- ALZWORD%>% 506 ALZWORD <- ALZWORD%>%
520 as.data.frame()%>% 507 as.data.frame()%>%
521 dplyr::select(-starts_with("col")) 508 dplyr::select(-starts_with("col"))
522 509
523 ##Reorganizing information within the columns and final clinical data 510 ##Reorganizing information within the columns and final clinical data
524 ALZWORDF <- cinfo(ALZWORD) 511 ALZWORDF <- cinfo(ALZWORD)
525 512
526 513
527 #Working with Actual Data part of file 514 #Working with Actual Data part of file
528 alzdat <- alz %>% 515 alzdat <- alz %>%
529 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 516 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
530 ALZDAT <- t(alzdat[,-1]) 517 ALZDAT <- t(alzdat[,-1])
531 rownames(ALZDAT)=NULL 518 rownames(ALZDAT)=NULL
532 519
533 ##Is there a clean version of the GPL file available? 520 ##Is there a clean version of the GPL file available?
534 gplnum <- strsplit(genena,"[\\|/]") %>% 521 gplnum <- strsplit(genena,"[\\|/]") %>%
535 .[[1]] %>% 522 .[[1]] %>%
536 .[length(.)] %>% 523 .[length(.)] %>%
537 gsub("\\D","",.) 524 gsub("\\D","",.)
538 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 525 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
539 if(clfileex >= 1){ 526 if(clfileex >= 1){
540 #use the clean version 527 #use the clean version
541 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 528 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
542 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 529 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
543 530
544 } 531 } else if(clfileex == 0){
545 else if(clfileex == 0){
546 ##Lets Create a clean version 532 ##Lets Create a clean version
547 533
548 ##Gene ID to Gene Name 534 ##Gene ID to Gene Name
549 if(soft == TRUE){ 535 if(soft == TRUE){
550 #Check to see if there is already a file containing information on soft files 536 #Check to see if there is already a file containing information on soft files
551 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 537 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
552 if(fileex == 1){ 538 if(fileex == 1){
553 #Check to see if this GPL soft file has been used before 539 #Check to see if this GPL soft file has been used before
554 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 540 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
555 .$GPL_FILE_NUM%>% 541 .$GPL_FILE_NUM%>%
556 grepl(gplnum,.) %>% 542 grepl(gplnum,.) %>%
557 sum() 543 sum()
558 if(IDF == 1){ 544 if(IDF == 1){
559 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 545 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
560 .$GPL_FILE_NUM%>% 546 .$GPL_FILE_NUM%>%
561 grep(gplnum,.) 547 grep(gplnum,.)
562 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 548 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
563 .$LOC_ID %>% 549 .$LOC_ID %>%
564 .[IDLOCAL] 550 .[IDLOCAL]
565 geneIDNam <- genena %>% 551 geneIDNam <- genena %>%
566 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 552 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
567 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 553 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
568 } 554 } else if(IDF == 0){
569 else if(IDF == 0){
570 #No information on this particular GPL file 555 #No information on this particular GPL file
571 idLOCGPL <- genena %>% 556 idLOCGPL <- genena %>%
572 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 557 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
573 t(.) %>% 558 t(.) %>%
574 grep("^ID\\s*$",.) %>% 559 grep("^ID\\s*$",.) %>%
575 -1 560 -1
576 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 561 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
577 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 562 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
578 geneIDNam <- genena %>% 563 geneIDNam <- genena %>%
579 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 564 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
580 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 565 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
581 } 566 }
582 } 567 } else if(fileex == 0){
583 else if(fileex == 0){
584 #We must create a file that we can access for later use 568 #We must create a file that we can access for later use
585 idLOCGPL <- genena %>% 569 idLOCGPL <- genena %>%
586 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 570 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
587 t(.) %>% 571 t(.) %>%
588 grep("^ID\\s*$",.) %>% 572 grep("^ID\\s*$",.) %>%
589 -1 573 -1
590 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 574 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
591 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 575 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
592 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 576 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
593 geneIDNam <- genena %>% 577 geneIDNam <- genena %>%
594 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 578 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
595 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 579 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
596 } 580 }
597 } 581 } else if(soft == FALSE){
598 else if(soft == FALSE){
599 geneIDNam <- genena %>% 582 geneIDNam <- genena %>%
600 read_delim(delim="\t",comment = "#")%>% 583 read_delim(delim="\t",comment = "#")%>%
601 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 584 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
602 } 585 }
603 586
604 ##Labeling the gene IDs without names 587 ##Labeling the gene IDs without names
605 geneIDNam <- NAFIXING(geneIDNam) 588 geneIDNam <- NAFIXING(geneIDNam)
606 589
607 ##remove the whitespace 590 ##remove the whitespace
608 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 591 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
609 592
610 ##Here is the clean version 593 ##Here is the clean version
611 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 594 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
612 } 595 }
613 596
614 597
615 598
616 ##Changing the gene ID to gene name 599 ##Changing the gene ID to gene name
617 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) 600 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
618 colnames(ALZDAT) = ALZDAT1[1,] 601 colnames(ALZDAT) = ALZDAT1[1,]
619 602
620 603
621 ##Adjusting the column names aka the gene names 604 ##Adjusting the column names aka the gene names
622 colnames(ALZDAT) <- gcnames(ALZDAT) 605 colnames(ALZDAT) <- gcnames(ALZDAT)
623 606
624 607
625 #Full RAW Data 608 #Full RAW Data
626 Fullalzdwr <- ALZDAT %>% 609 Fullalzdwr <- ALZDAT %>%
627 as.data.frame() %>% 610 as.data.frame() %>%
628 cbind(ALZWORDF,.) 611 cbind(ALZWORDF,.)
629 612
630 #Raw file is output 613 #Raw file is output
631 nfnaex <- strsplit(alz,"[\\]") %>% 614 nfnaex <- strsplit(alz,"[\\]") %>%
632 .[[1]] %>% 615 .[[1]] %>%
633 .[length(.)] %>% 616 .[length(.)] %>%
634 gsub("\\D","",.) %>% 617 gsub("\\D","",.) %>%
635 c("GSE",.,"aftexcel.txt") %>% 618 c("GSE",.,"aftexcel.txt") %>%
636 paste(collapse = "") 619 paste(collapse = "")
637 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 620 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
638 621
639 622
640 623
641 #Now for the discretization part 624 #Now for the discretization part
642 ##get the wordy part again 625 ##get the wordy part again
643 rawword <- t(ALZWORDF) 626 rawword <- t(ALZWORDF)
644 627
645 ##where is ID_REF located 628 ##where is ID_REF located
646 hereim <- grep("ID_REF",rownames(rawword)) 629 hereim <- grep("ID_REF",rownames(rawword))
647 630
648 ##Subject Names GSM... 631 ##Subject Names GSM...
649 subjnam <- rawword[hereim,] 632 subjnam <- rawword[hereim,]
650 633
651 ##Getting the names for the rows 634 ##Getting the names for the rows
652 namedarows <- rownames(rawword)[-hereim] %>% 635 namedarows <- rownames(rawword)[-hereim] %>%
653 as.data.frame() 636 as.data.frame()
654 RAWWORD <- rawword[-hereim,] %>% 637 RAWWORD <- rawword[-hereim,] %>%
655 as.data.frame() %>% 638 as.data.frame() %>%
656 bind_cols(namedarows,.) 639 bind_cols(namedarows,.)
657 z <- 1 640 z <- 1
658 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 641 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
659 for(z in 1:dim(RAWWORD)[1]){ 642 for(z in 1:dim(RAWWORD)[1]){
660 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 643 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
661 z <- z + 1 644 z <- z + 1
662 } 645 }
663 646
664 colnames(naroww) <- "ROW_NAs" 647 colnames(naroww) <- "ROW_NAs"
665 RAWWORD <- bind_cols(RAWWORD,naroww) 648 RAWWORD <- bind_cols(RAWWORD,naroww)
666 649
667 650
668 roALZna <- t(ALZDAT) %>% 651 roALZna <- t(ALZDAT) %>%
669 rownames(.) %>% 652 rownames(.) %>%
670 as.data.frame(.) 653 as.data.frame(.)
671 colnames(roALZna) <- "ID_REF" 654 colnames(roALZna) <- "ID_REF"
672 655
673 RAWDAT <- t(ALZDAT) %>% 656 RAWDAT <- t(ALZDAT) %>%
674 as.data.frame(.) 657 as.data.frame(.)
675 colnames(RAWDAT) <- NULL 658 colnames(RAWDAT) <- NULL
676 rownames(RAWDAT) <- NULL 659 rownames(RAWDAT) <- NULL
677 660
678 RAWDAT2 <- RAWDAT %>% 661 RAWDAT2 <- RAWDAT %>%
679 cbind(roALZna,.) %>% 662 cbind(roALZna,.) %>%
680 dplyr::arrange(.,ID_REF) 663 dplyr::arrange(.,ID_REF)
681 664
682 ##Editing the file for R processing 665 ##Editing the file for R processing
683 RAWDATID <- RAWDAT2[,1] %>% 666 RAWDATID <- RAWDAT2[,1] %>%
684 as.matrix(.) 667 as.matrix(.)
685 668
686 RAWDATNUM <- RAWDAT2[,-1] %>% 669 RAWDATNUM <- RAWDAT2[,-1] %>%
687 mapply(.,FUN = as.numeric) %>% 670 mapply(.,FUN = as.numeric) %>%
688 t(.) 671 t(.)
689 672
690 ##Consolidating genes with the same name 673 ##Consolidating genes with the same name
691 ###create empty matrix of size equal to tabRDATID 674 ###create empty matrix of size equal to tabRDATID
692 tabRDATID <- table(RAWDATID) 675 tabRDATID <- table(RAWDATID)
693 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 676 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
694 j <- 1 677 j <- 1
695 for(j in 1:length(tabRDATID)){ 678 for(j in 1:length(tabRDATID)){
696 ##Putting the ones without duplicates in their new homes 679 ##Putting the ones without duplicates in their new homes
697 if(tabRDATID[j] == 1){ 680 if(tabRDATID[j] == 1){
698 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 681 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
699 } 682 } else if(tabRDATID[j] > 1){
700 ##Averaging duplicates and putting them in their new homes 683 ##Averaging duplicates and putting them in their new homes
701 else if(tabRDATID[j] > 1){
702 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 684 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
703 } 685 }
704 j <- j + 1 686 j <- j + 1
705 } 687 }
706 688
707 ##Scaling the Data 689 ##Scaling the Data
708 scrawdat <- NuRDATN%>% 690 scrawdat <- NuRDATN%>%
709 scale() 691 scale()
710 attr(scrawdat,"scaled:center") <- NULL 692 attr(scrawdat,"scaled:center") <- NULL
711 attr(scrawdat,"scaled:scale") <- NULL 693 attr(scrawdat,"scaled:scale") <- NULL
712 colnames(scrawdat) <- rownames(tabRDATID) 694 colnames(scrawdat) <- rownames(tabRDATID)
713 695
714 ##Discretized the Data 696 ##Discretized the Data
715 dialzdat <- scrawdat %>% 697 dialzdat <- scrawdat %>%
716 dndat(.) %>% 698 dndat(.) %>%
717 t()%>% 699 t()%>%
718 as.data.frame(.) 700 as.data.frame(.)
719 colnames(dialzdat) <- rownames(RAWDATNUM) 701 colnames(dialzdat) <- rownames(RAWDATNUM)
720 702
721 ##setting "ID_REF" as a new variable 703 ##setting "ID_REF" as a new variable
722 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1)) 704 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
723 colnames(geneNAM) <- "ID_REF" 705 colnames(geneNAM) <- "ID_REF"
724 rownames(dialzdat) <- NULL 706 rownames(dialzdat) <- NULL
725 dialzdat <-bind_cols(geneNAM,dialzdat) 707 dialzdat <-bind_cols(geneNAM,dialzdat)
726 708
727 ##NAs in a column 709 ##NAs in a column
728 x <- 2 710 x <- 2
729 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 711 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
730 nacol[1,1] = "COL_NAs" 712 nacol[1,1] = "COL_NAs"
731 for(x in 2:dim(dialzdat)[2]){ 713 for(x in 2:dim(dialzdat)[2]){
732 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 714 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
733 x <- x + 1 715 x <- x + 1
734 } 716 }
735 colnames(nacol) <- colnames(dialzdat) 717 colnames(nacol) <- colnames(dialzdat)
736 dialzdat <- bind_rows(dialzdat,nacol) 718 dialzdat <- bind_rows(dialzdat,nacol)
737 719
738 ##NAs in a row 720 ##NAs in a row
739 y <- 1 721 y <- 1
740 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 722 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
741 for(y in 1:dim(dialzdat)[1]){ 723 for(y in 1:dim(dialzdat)[1]){
742 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 724 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
743 y <- y + 1 725 y <- y + 1
744 } 726 }
745 colnames(narowd) <- "ROW_NAs" 727 colnames(narowd) <- "ROW_NAs"
746 dialzdat <- bind_cols(dialzdat,narowd) 728 dialzdat <- bind_cols(dialzdat,narowd)
747 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 729 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
748 colnames(RAWWORD) <- colnames(dialzdat) 730 colnames(RAWWORD) <- colnames(dialzdat)
749 ##converting to character so that the clinical can be brought together with discrete data 731 ##converting to character so that the clinical can be brought together with discrete data
750 k <- 2 732 k <- 2
751 for(k in 2:dim(dialzdat)[2]-1){ 733 for(k in 2:dim(dialzdat)[2]-1){
752 dialzdat[,k] <- as.character(dialzdat[,k]) 734 dialzdat[,k] <- as.character(dialzdat[,k])
753 k <- k + 1 735 k <- k + 1
754 } 736 }
755 #The End the full data 737 #The End the full data
756 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) 738 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
757 739
758 #Produces Discrete file 740 #Produces Discrete file
759 nfnaex2 <- strsplit(alz,"[\\|/]") %>% 741 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
760 .[[1]] %>% 742 .[[1]] %>%
761 .[length(.)] %>% 743 .[length(.)] %>%
762 gsub("\\D","",.) %>% 744 gsub("\\D","",.) %>%
763 c("GSE",.,"dscrt.txt") %>% 745 c("GSE",.,"dscrt.txt") %>%
764 paste(collapse = "") 746 paste(collapse = "")
765 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) 747 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
766 748
767 749
768 n <- n + 1 750 n <- n + 1
769 } 751 }
770 } 752 }
771 } 753 }
772 } 754 }
773 #The Rest of this code will be used every time you want to change a data set 755 #The Rest of this code will be used every time you want to change a data set
774 THEFT() 756 THEFT()