Commit b32697f8969fdeb47009f230e2445acebfdb4915

Authored by Efrain Gonzalez
1 parent eccb7a19e2
Exists in master

Automated version of RCleanDscret.R

Updated gcnames function
Showing 1 changed file with 5 additions and 7 deletions   Show diff stats
1 ######################################################################## 1
2 # Don't Use This Code Just Yet #
3 ########################################################################
4 #Efrain H. Gonzalez 2 #Efrain H. Gonzalez
5 #6/21/2017 3 #6/22/2017
6 options(digits = 11) 4 options(digits = 11)
7 #Libraries required to run the code 5 #Libraries required to run the code
8 library(pryr) 6 library(pryr)
9 library(MASS) 7 library(MASS)
10 library(dplyr) 8 library(dplyr)
11 library(tidyr) 9 library(tidyr)
12 library(readr) 10 library(readr)
13 library(stringr) 11 library(stringr)
14 12
15 13
16 #Necessary Functions 14 #Necessary Functions
17 #1#Function for handling the changing of row names and column names 15 #1#Function for handling the changing of row names and column names
# Rename the columns of a transposed GEO series-matrix sample table.
#
# mat: matrix (or data frame) that already has column names; row 1 holds the
#      "!Sample_*" tag of each column and row 2 holds the first sample's text.
#
# Columns whose row-1 tag is one of the three known tags get a fixed name
# ("Brain_Region", "Title", "ID_REF"). Any other column is named from the
# text found in row 2: "Sex", "PMI", "Age", "Braak" or "Group" followed by a
# running number. Columns that match nothing keep their current name.
#
# Returns mat with updated column names.
chngrownm <- function(mat){
  n_cols <- dim(mat)[2]
  # Nothing to inspect without a tag row.
  if (dim(mat)[1] < 1 || n_cols < 1) {
    return(mat)
  }
  has_detail_row <- dim(mat)[1] >= 2

  # Row-1 tag -> fixed column name.
  tag_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )

  # Row-2 text patterns, listed in the order they are tested. When several
  # patterns match the same cell the LAST one wins (e.g. "braak stage"
  # contains "age" but must be labelled Braak).
  detail_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi|interval",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  counters <- rep(1L, length(detail_patterns))
  names(counters) <- names(detail_patterns)

  for (e in seq_len(n_cols)) {
    tag <- as.character(mat[1, e])
    if (!is.na(tag) && tag %in% names(tag_names)) {
      colnames(mat)[e] <- tag_names[[tag]]
    } else if (has_detail_row) {
      detail <- as.character(mat[2, e])
      hits <- vapply(detail_patterns, grepl, logical(1), x = detail)
      if (any(hits)) {
        # Only the label that is actually applied consumes a number, so the
        # numbering of each label stays consecutive.
        label <- names(detail_patterns)[max(which(hits))]
        colnames(mat)[e] <- paste0(label, counters[[label]])
        counters[[label]] <- counters[[label]] + 1L
      }
    }
  }
  mat
}
61 59
62 #2#Function for reorganizing information within the columns 60 #2#Function for reorganizing information within the columns
# Clean the clinical values inside the columns named by chngrownm().
#
# mat: data frame (or matrix) with column names. The first column is never
#      modified; every later column is handled according to its name:
#        *Group* -> text after the "tag: " prefix
#        *Age*   -> first number after the tag, as integer
#        *Sex*   -> text after the "tag: " prefix
#        *PMI*   -> digits and dots only, as numeric
#        *Braak* -> roman numeral after the tag, as integer
#      Other columns are left as they are.
#
# Returns mat with the cleaned columns.
cinfo <- function(mat){
  n_cols <- dim(mat)[2]
  # seq_len()[-1] is empty for a one-column table (2:n_cols would count down).
  for (j in seq_len(n_cols)[-1]) {
    label <- colnames(mat)[j]
    values <- if (is.data.frame(mat)) mat[[j]] else mat[, j]
    values <- as.character(values)

    if (isTRUE(grepl("Group", label))) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", values)
    } else if (isTRUE(grepl("Age", label))) {
      # Drop the "tag:" prefix, then keep only the first number so that a
      # decimal age such as "85.5" becomes 85 instead of 855.
      after_tag <- sub("^[^:]*:\\s*", "", values)
      first_num <- sub("^\\D*(\\d+(\\.\\d+)?).*$", "\\1", after_tag)
      first_num[!grepl("\\d", after_tag)] <- NA
      mat[, j] <- as.integer(first_num)
    } else if (isTRUE(grepl("Sex", label))) {
      mat[, j] <- gsub(".+:\\s", "", values)
    } else if (isTRUE(grepl("PMI", label))) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", values))
    } else if (isTRUE(grepl("Braak", label))) {
      mat[, j] <- as.integer(utils::as.roman(gsub(".+:\\s", "", values)))
    }
  }
  mat
}
86 84
87 #3#Function for labeling the gene IDs without names 85 #3#Function for labeling the gene IDs without names
# Give every gene ID without a name its own ID as the name.
#
# GIDNAM: two-column table (data frame, tibble or matrix); column 1 holds the
#         gene/probe ID and column 2 the gene name.
#
# A name counts as missing when it is NA or the literal text "NA" (optionally
# followed by whitespace). Missing names are replaced by the ID of that row.
#
# Returns GIDNAM with column 2 filled in; a table without rows is returned
# unchanged.
NAFIXING <- function(GIDNAM){
  if (dim(GIDNAM)[1] == 0) {
    return(GIDNAM)
  }
  is_df <- is.data.frame(GIDNAM)
  ids <- if (is_df) GIDNAM[[1]] else GIDNAM[, 1]
  gene_names <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]

  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)
  if (!any(unnamed)) {
    return(GIDNAM)
  }

  # Factors cannot take values outside their levels, so work on the labels.
  if (is.factor(gene_names)) {
    gene_names <- as.character(gene_names)
  }
  if (is.factor(ids)) {
    ids <- as.character(ids)
  }
  gene_names[unnamed] <- ids[unnamed]

  if (is_df) {
    GIDNAM[[2]] <- gene_names
  } else {
    GIDNAM[, 2] <- gene_names
  }
  GIDNAM
}
99 97
100 #4#Function for changing the gene ID to gene name 98 #4#Function for changing the gene ID to gene name
# Replace the gene IDs of the expression table by their gene names.
#
# GeneName: two-column table; column 1 = gene/probe ID, column 2 = gene name.
# DATA:     expression table whose first column holds the gene/probe IDs.
#
# Returns t(DATA) in which every entry of the first row (the IDs) that is
# found in GeneName has been replaced by the matching gene name. IDs that are
# not found, or whose name is missing, are kept as they are. When an ID occurs
# several times in GeneName the first occurrence is used.
cgeneID <- function(GeneName,DATA){
  nj <- t(GeneName)
  nq <- t(DATA)
  # Nothing to rename in an empty table / with an empty lookup.
  if (dim(nq)[1] == 0 || dim(nq)[2] == 0 || dim(nj)[2] == 0) {
    return(nq)
  }
  if (dim(nj)[1] < 2) {
    stop("GeneName must have an ID column and a name column", call. = FALSE)
  }

  # Exact lookup: IDs are data, not regular expressions, so characters such
  # as "." or "(" must not be interpreted. NA never matches anything.
  where <- match(nq[1, ], nj[1, ], incomparables = NA)
  new_names <- nj[2, where]
  usable <- !is.na(where) & !is.na(new_names)
  nq[1, usable] <- new_names[usable]
  nq
}
119 #cgeneID <- function(GeneName,DATA){ 117 #cgeneID <- function(GeneName,DATA){
120 # colGene <- dim(GeneName)[2] 118 # colGene <- dim(GeneName)[2]
121 # j <- 1 119 # j <- 1
122 # for(j in 1:colGene){ 120 # for(j in 1:colGene){
123 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) 121 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
124 # if(is.na(sum(chngsreq))==FALSE){ 122 # if(is.na(sum(chngsreq))==FALSE){
125 # if(sum(chngsreq) > 0){ 123 # if(sum(chngsreq) > 0){
126 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) 124 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
127 # } 125 # }
128 # } 126 # }
129 # j = j+1 127 # j = j+1
130 # } 128 # }
131 # DATA 129 # DATA
132 #} 130 #}
133 131
134 #5#Function for adjusting the gene names 132 #5#Function for adjusting the gene names
# Shorten multi-part gene names used as column names.
#
# DiData: matrix or data frame whose column names are gene names; several
#         names/annotations may be joined by "///" or "//".
# usecol: which part of a joined name to keep (default 1 = first part). When
#         a name has fewer parts than usecol its first part is used.
#
# Returns a character vector with one trimmed name per column of DiData.
gcnames <- function(DiData,usecol=1){
  if (length(usecol) != 1 || is.na(usecol) || usecol < 1) {
    stop("usecol must be a single number >= 1", call. = FALSE)
  }
  nuruns <- dim(DiData)[2]
  if (nuruns == 0) {
    return(character(0))
  }
  old_names <- colnames(DiData)
  if (is.null(old_names)) {
    stop("DiData has no column names", call. = FALSE)
  }

  # Split every name once; "///" is listed first so it is preferred over "//".
  parts <- strsplit(as.character(old_names), "///|//")
  vapply(parts, function(p) {
    chosen <- if (length(p) >= usecol) p[usecol] else p[1]
    # Remove leading and trailing whitespace.
    gsub("^\\s+|\\s+$", "", chosen)
  }, character(1), USE.NAMES = FALSE)
}
150 148
151 #6# Function for discretizing the data 149 #6# Function for discretizing the data
# Discretize (scaled) expression values into three levels.
#
# NDATA: numeric matrix (or all-numeric data frame).
#
# Each value is mapped to
#   0  when it is below -1,
#   2  when it is above 1,
#   1  otherwise (from -1 up to and including 1),
# and missing values stay missing.
#
# Returns a numeric matrix of the same size as NDATA that carries the column
# names of NDATA (row names are not copied).
dndat <- function(NDATA){
  rownd <- dim(NDATA)[1]
  colnd <- dim(NDATA)[2]
  values <- as.matrix(NDATA)

  # Start from the middle level so that both boundaries (-1 and 1) fall into
  # it; only the two outer ranges have to be overwritten.
  DDATA <- matrix(1, nrow = rownd, ncol = colnd)
  colnames(DDATA) <- colnames(NDATA)

  # which() drops the NA comparisons produced by missing values.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2

  missing <- is.na(values)
  DDATA[missing] <- values[missing]
  DDATA
}
179 177
180 178
181 #MajorFunction#This is the function that does everything else 179 #MajorFunction#This is the function that does everything else
182 THEFT <- function(){ 180 THEFT <- function(){
183 #Set working directory based on the directory of the series matrix file Currently only works for windows 181 #Set working directory based on the directory of the series matrix file Currently only works for windows
184 wd <- getwd() 182 wd <- getwd()
185 #list.files() 183 #list.files()
186 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?") 184 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?")
187 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L) 185 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
188 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files()) 186 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())
189 GSEfloc <- list.files()[GSEfileloc] 187 GSEfloc <- list.files()[GSEfileloc]
190 #ALL DATA FILES WILL BE CLEANED 188 #ALL DATA FILES WILL BE CLEANED
191 if(numDAT == 1){ 189 if(numDAT == 1){
192 #indexing the data files 190 #indexing the data files
193 n <- 1 191 n <- 1
194 for(n in 1: length(GSEfloc)){ 192 for(n in 1: length(GSEfloc)){
195 alz <- GSEfloc[n] 193 alz <- GSEfloc[n]
196 194
197 #Working with the wordy part of the document 195 #Working with the wordy part of the document
198 alzword <- alz %>% 196 alzword <- alz %>%
199 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 197 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
200 filter(grepl("!Sample",X1))%>% 198 filter(grepl("!Sample",X1))%>%
201 filter(!grepl("!Sample_contact",X1)) 199 filter(!grepl("!Sample_contact",X1))
202 200
203 #Getting the GPL file 201 #Getting the GPL file
204 genena <- grep("_platform_id",alzword$X1) %>% 202 genena <- grep("_platform_id",alzword$X1) %>%
205 alzword$X2[.] %>% 203 alzword$X2[.] %>%
206 str_trim(.) %>% 204 str_trim(.) %>%
207 paste0("^",.,"\\D") %>% 205 paste0("^",.,"\\D") %>%
208 grep(.,list.files()) %>% 206 grep(.,list.files()) %>%
209 list.files()[.] 207 list.files()[.]
210 208
211 #Find out if it is a soft GPL file or not 209 #Find out if it is a soft GPL file or not
212 soft <- strsplit(genena,"[\\|/]") %>% 210 soft <- strsplit(genena,"[\\|/]") %>%
213 .[[1]] %>% 211 .[[1]] %>%
214 .[length(.)] %>% 212 .[length(.)] %>%
215 grepl("soft",.) 213 grepl("soft",.)
216 214
217 ##Changing row names and column names: 215 ##Changing row names and column names:
218 ALZWORD <- t(alzword) 216 ALZWORD <- t(alzword)
219 rownames(ALZWORD)=NULL 217 rownames(ALZWORD)=NULL
220 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 218 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
221 ALZWORD <- chngrownm(ALZWORD)[-1,] 219 ALZWORD <- chngrownm(ALZWORD)[-1,]
222 ALZWORD <- ALZWORD%>% 220 ALZWORD <- ALZWORD%>%
223 as.data.frame()%>% 221 as.data.frame()%>%
224 dplyr::select(-starts_with("col")) 222 dplyr::select(-starts_with("col"))
225 223
226 ##Reorganizing information within the columns and final clinical data 224 ##Reorganizing information within the columns and final clinical data
227 ALZWORDF <- cinfo(ALZWORD) 225 ALZWORDF <- cinfo(ALZWORD)
228 226
229 227
230 #Working with Actual Data part of file 228 #Working with Actual Data part of file
231 alzdat <- alz %>% 229 alzdat <- alz %>%
232 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 230 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
233 ALZDAT <- t(alzdat[,-1]) 231 ALZDAT <- t(alzdat[,-1])
234 rownames(ALZDAT)=NULL 232 rownames(ALZDAT)=NULL
235 233
236 ##Is there a clean version of the GPL file available? 234 ##Is there a clean version of the GPL file available?
237 gplnum <- strsplit(genena,"[\\|/]") %>% 235 gplnum <- strsplit(genena,"[\\|/]") %>%
238 .[[1]] %>% 236 .[[1]] %>%
239 .[length(.)] %>% 237 .[length(.)] %>%
240 gsub("\\D","",.) 238 gsub("\\D","",.)
241 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 239 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
242 if(clfileex >= 1){ 240 if(clfileex >= 1){
243 #use the clean version 241 #use the clean version
244 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 242 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
245 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 243 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
246 244
247 } else if(clfileex == 0){ 245 } else if(clfileex == 0){
248 ##Lets Create a clean version 246 ##Lets Create a clean version
249 247
250 ##Gene ID to Gene Name 248 ##Gene ID to Gene Name
251 if(soft == TRUE){ 249 if(soft == TRUE){
252 #Check to see if there is already a file containing information on soft files 250 #Check to see if there is already a file containing information on soft files
253 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 251 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
254 if(fileex == 1){ 252 if(fileex == 1){
255 #Check to see if this GPL soft file has been used before 253 #Check to see if this GPL soft file has been used before
256 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 254 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
257 .$GPL_FILE_NUM%>% 255 .$GPL_FILE_NUM%>%
258 grepl(gplnum,.) %>% 256 grepl(gplnum,.) %>%
259 sum() 257 sum()
260 if(IDF == 1){ 258 if(IDF == 1){
261 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 259 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
262 .$GPL_FILE_NUM%>% 260 .$GPL_FILE_NUM%>%
263 grep(gplnum,.) 261 grep(gplnum,.)
264 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 262 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
265 .$LOC_ID %>% 263 .$LOC_ID %>%
266 .[IDLOCAL] 264 .[IDLOCAL]
267 geneIDNam <- genena %>% 265 geneIDNam <- genena %>%
268 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 266 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
269 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 267 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
270 } else if(IDF == 0){ 268 } else if(IDF == 0){
271 #No information on this particular GPL file 269 #No information on this particular GPL file
272 idLOCGPL <- genena %>% 270 idLOCGPL <- genena %>%
273 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 271 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
274 t(.) %>% 272 t(.) %>%
275 grep("^ID\\s*$",.) %>% 273 grep("^ID\\s*$",.) %>%
276 -1 274 -1
277 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 275 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
278 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 276 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
279 geneIDNam <- genena %>% 277 geneIDNam <- genena %>%
280 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 278 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
281 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 279 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
282 } 280 }
283 } else if(fileex == 0){ 281 } else if(fileex == 0){
284 #We must create a file that we can access for later use 282 #We must create a file that we can access for later use
285 idLOCGPL <- genena %>% 283 idLOCGPL <- genena %>%
286 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 284 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
287 t(.) %>% 285 t(.) %>%
288 grep("^ID\\s*$",.) %>% 286 grep("^ID\\s*$",.) %>%
289 -1 287 -1
290 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 288 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
291 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 289 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
292 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 290 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
293 geneIDNam <- genena %>% 291 geneIDNam <- genena %>%
294 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 292 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
295 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 293 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
296 } 294 }
297 } else if(soft == FALSE){ 295 } else if(soft == FALSE){
298 geneIDNam <- genena %>% 296 geneIDNam <- genena %>%
299 read_delim(delim="\t",comment = "#")%>% 297 read_delim(delim="\t",comment = "#")%>%
300 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 298 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
301 } 299 }
302 300
303 ##Labeling the gene IDs without names 301 ##Labeling the gene IDs without names
304 geneIDNam <- NAFIXING(geneIDNam) 302 geneIDNam <- NAFIXING(geneIDNam)
305 303
306 ##remove the whitespace 304 ##remove the whitespace
307 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 305 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
308 306
309 ##Here is the clean version 307 ##Here is the clean version
310 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 308 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
311 } 309 }
312 310
313 311
314 312
315 ##Changing the gene ID to gene name 313 ##Changing the gene ID to gene name
316 ALZDAT1 <- cgeneID(geneIDNam,alzdat) 314 ALZDAT1 <- cgeneID(geneIDNam,alzdat)
317 colnames(ALZDAT) = ALZDAT1[1,] 315 colnames(ALZDAT) = ALZDAT1[1,]
318 316
319 317
320 ##Adjusting the column names aka the gene names 318 ##Adjusting the column names aka the gene names
321 colnames(ALZDAT) <- gcnames(ALZDAT) 319 colnames(ALZDAT) <- gcnames(ALZDAT)
322 320
323 321
324 #Full RAW Data 322 #Full RAW Data
325 Fullalzdwr <- ALZDAT %>% 323 Fullalzdwr <- ALZDAT %>%
326 as.data.frame() %>% 324 as.data.frame() %>%
327 cbind(ALZWORDF,.) 325 cbind(ALZWORDF,.)
328 326
329 #Raw file is output 327 #Raw file is output
330 nfnaex <- strsplit(alz,"[\\]") %>% 328 nfnaex <- strsplit(alz,"[\\]") %>%
331 .[[1]] %>% 329 .[[1]] %>%
332 .[length(.)] %>% 330 .[length(.)] %>%
333 gsub("\\D","",.) %>% 331 gsub("\\D","",.) %>%
334 c("GSE",.,"aftexcel.txt") %>% 332 c("GSE",.,"aftexcel.txt") %>%
335 paste(collapse = "") 333 paste(collapse = "")
336 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 334 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
337 335
338 336
339 337
340 #Now for the discretization part 338 #Now for the discretization part
341 ##get the wordy part again 339 ##get the wordy part again
342 rawword <- t(ALZWORDF) 340 rawword <- t(ALZWORDF)
343 341
344 ##where is ID_REF located 342 ##where is ID_REF located
345 hereim <- grep("ID_REF",rownames(rawword)) 343 hereim <- grep("ID_REF",rownames(rawword))
346 344
347 ##Subject Names GSM... 345 ##Subject Names GSM...
348 subjnam <- rawword[hereim,] 346 subjnam <- rawword[hereim,]
349 347
350 ##Getting the names for the rows 348 ##Getting the names for the rows
351 namedarows <- rownames(rawword)[-hereim] %>% 349 namedarows <- rownames(rawword)[-hereim] %>%
352 as.data.frame() 350 as.data.frame()
353 RAWWORD <- rawword[-hereim,] %>% 351 RAWWORD <- rawword[-hereim,] %>%
354 as.data.frame() %>% 352 as.data.frame() %>%
355 bind_cols(namedarows,.) 353 bind_cols(namedarows,.)
356 z <- 1 354 z <- 1
357 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 355 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
358 for(z in 1:dim(RAWWORD)[1]){ 356 for(z in 1:dim(RAWWORD)[1]){
359 if(sum(is.na(RAWWORD[z,])) > 0){ 357 if(sum(is.na(RAWWORD[z,])) > 0){
360 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 358 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
361 } 359 }
362 if(length(grep("NA",RAWWORD[z,])) > 0){ 360 if(length(grep("NA",RAWWORD[z,])) > 0){
363 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] 361 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
364 } 362 }
365 z <- z + 1 363 z <- z + 1
366 } 364 }
367 365
368 colnames(naroww) <- "ROW_NAs" 366 colnames(naroww) <- "ROW_NAs"
369 RAWWORD <- bind_cols(RAWWORD,naroww) 367 RAWWORD <- bind_cols(RAWWORD,naroww)
370 368
371 369
372 roALZna <- t(ALZDAT) %>% 370 roALZna <- t(ALZDAT) %>%
373 rownames(.) %>% 371 rownames(.) %>%
374 as.data.frame(.) 372 as.data.frame(.)
375 colnames(roALZna) <- "ID_REF" 373 colnames(roALZna) <- "ID_REF"
376 374
377 RAWDAT <- t(ALZDAT) %>% 375 RAWDAT <- t(ALZDAT) %>%
378 as.data.frame(.) 376 as.data.frame(.)
379 colnames(RAWDAT) <- NULL 377 colnames(RAWDAT) <- NULL
380 rownames(RAWDAT) <- NULL 378 rownames(RAWDAT) <- NULL
381 379
382 RAWDAT2 <- RAWDAT %>% 380 RAWDAT2 <- RAWDAT %>%
383 cbind(roALZna,.) %>% 381 cbind(roALZna,.) %>%
384 dplyr::arrange(.,ID_REF) 382 dplyr::arrange(.,ID_REF)
385 383
386 ##Editing the file for R processing 384 ##Editing the file for R processing
387 RAWDATID <- RAWDAT2[,1] %>% 385 RAWDATID <- RAWDAT2[,1] %>%
388 as.matrix(.) 386 as.matrix(.)
389 387
390 RAWDATNUM <- RAWDAT2[,-1] %>% 388 RAWDATNUM <- RAWDAT2[,-1] %>%
391 mapply(.,FUN = as.numeric) %>% 389 mapply(.,FUN = as.numeric) %>%
392 t(.) 390 t(.)
393 391
394 ##Consolidating genes with the same name 392 ##Consolidating genes with the same name
395 ###create empty matrix of size equal to tabRDATID 393 ###create empty matrix of size equal to tabRDATID
396 tabRDATID <- table(RAWDATID) 394 tabRDATID <- table(RAWDATID)
397 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 395 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
398 j <- 1 396 j <- 1
399 for(j in 1:length(tabRDATID)){ 397 for(j in 1:length(tabRDATID)){
400 ##Putting the ones without duplicates in their new homes 398 ##Putting the ones without duplicates in their new homes
401 if(tabRDATID[j] == 1){ 399 if(tabRDATID[j] == 1){
402 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 400 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
403 } else if(tabRDATID[j] > 1){ 401 } else if(tabRDATID[j] > 1){
404 ##Averaging duplicates and putting them in their new homes 402 ##Averaging duplicates and putting them in their new homes
405 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 403 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
406 } 404 }
407 j <- j + 1 405 j <- j + 1
408 } 406 }
409 407
410 ##Scaling the Data 408 ##Scaling the Data
411 scrawdat <- NuRDATN%>% 409 scrawdat <- NuRDATN%>%
412 scale() 410 scale()
413 attr(scrawdat,"scaled:center") <- NULL 411 attr(scrawdat,"scaled:center") <- NULL
414 attr(scrawdat,"scaled:scale") <- NULL 412 attr(scrawdat,"scaled:scale") <- NULL
415 colnames(scrawdat) <- rownames(tabRDATID) 413 colnames(scrawdat) <- rownames(tabRDATID)
416 414
417 ##Discretized the Data 415 ##Discretized the Data
418 dialzdat <- scrawdat %>% 416 dialzdat <- scrawdat %>%
419 dndat(.) %>% 417 dndat(.) %>%
420 t()%>% 418 t()%>%
421 as.data.frame(.) 419 as.data.frame(.)
422 colnames(dialzdat) <- rownames(RAWDATNUM) 420 colnames(dialzdat) <- rownames(RAWDATNUM)
423 421
424 ##setting "ID_REF" as a new variable 422 ##setting "ID_REF" as a new variable
425 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1)) 423 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
426 colnames(geneNAM) <- "ID_REF" 424 colnames(geneNAM) <- "ID_REF"
427 rownames(dialzdat) <- NULL 425 rownames(dialzdat) <- NULL
428 dialzdat <-bind_cols(geneNAM,dialzdat) 426 dialzdat <-bind_cols(geneNAM,dialzdat)
429 427
430 ##NAs in a column 428 ##NAs in a column
431 x <- 2 429 x <- 2
432 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 430 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
433 nacol[1,1] = "COL_NAs" 431 nacol[1,1] = "COL_NAs"
434 for(x in 2:dim(dialzdat)[2]){ 432 for(x in 2:dim(dialzdat)[2]){
435 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 433 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
436 x <- x + 1 434 x <- x + 1
437 } 435 }
438 colnames(nacol) <- colnames(dialzdat) 436 colnames(nacol) <- colnames(dialzdat)
439 dialzdat <- bind_rows(dialzdat,nacol) 437 dialzdat <- bind_rows(dialzdat,nacol)
440 438
441 ##NAs in a row 439 ##NAs in a row
442 y <- 1 440 y <- 1
443 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 441 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
444 for(y in 1:dim(dialzdat)[1]){ 442 for(y in 1:dim(dialzdat)[1]){
445 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 443 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
446 y <- y + 1 444 y <- y + 1
447 } 445 }
448 colnames(narowd) <- "ROW_NAs" 446 colnames(narowd) <- "ROW_NAs"
449 dialzdat <- bind_cols(dialzdat,narowd) 447 dialzdat <- bind_cols(dialzdat,narowd)
450 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 448 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
451 colnames(RAWWORD) <- colnames(dialzdat) 449 colnames(RAWWORD) <- colnames(dialzdat)
452 ##converting to character so that the clinical can be brought together with discrete data 450 ##converting to character so that the clinical can be brought together with discrete data
453 k <- 2 451 k <- 2
454 for(k in 2:dim(dialzdat)[2]-1){ 452 for(k in 2:dim(dialzdat)[2]-1){
455 dialzdat[,k] <- as.character(dialzdat[,k]) 453 dialzdat[,k] <- as.character(dialzdat[,k])
456 k <- k + 1 454 k <- k + 1
457 } 455 }
458 #The End the full data 456 #The End the full data
459 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) 457 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
460 458
461 #Produces Discrete file 459 #Produces Discrete file
462 nfnaex2 <- strsplit(alz,"[\\|/]") %>% 460 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
463 .[[1]] %>% 461 .[[1]] %>%
464 .[length(.)] %>% 462 .[length(.)] %>%
465 gsub("\\D","",.) %>% 463 gsub("\\D","",.) %>%
466 c("GSE",.,"dscrt.txt") %>% 464 c("GSE",.,"dscrt.txt") %>%
467 paste(collapse = "") 465 paste(collapse = "")
468 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) 466 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
469 n <- n +1 467 n <- n +1
470 } 468 }
471 } else if(numDAT == 2){ 469 } else if(numDAT == 2){
472 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN 470 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
473 471
474 #All the files you want to analyze 472 #All the files you want to analyze
475 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:") 473 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
476 if(length(ANDIS) == 0){ 474 if(length(ANDIS) == 0){
477 #Spit out a warning 475 #Spit out a warning
478 warning("You did not select any files and so no cleaning will be performed") 476 warning("You did not select any files and so no cleaning will be performed")
479 } else{ 477 } else{
480 #indexing the data files 478 #indexing the data files
481 n <- 1 479 n <- 1
482 for(n in 1: length(ANDIS)){ 480 for(n in 1: length(ANDIS)){
483 alz <- ANDIS[n] 481 alz <- ANDIS[n]
484 482
485 #Working with the wordy part of the document 483 #Working with the wordy part of the document
486 alzword <- alz %>% 484 alzword <- alz %>%
487 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 485 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
488 filter(grepl("!Sample",X1))%>% 486 filter(grepl("!Sample",X1))%>%
489 filter(!grepl("!Sample_contact",X1)) 487 filter(!grepl("!Sample_contact",X1))
490 488
491 #Getting the GPL file 489 #Getting the GPL file
492 genena <- grep("_platform_id",alzword$X1) %>% 490 genena <- grep("_platform_id",alzword$X1) %>%
493 alzword$X2[.] %>% 491 alzword$X2[.] %>%
494 str_trim(.) %>% 492 str_trim(.) %>%
495 paste0("^",.,"\\D") %>% 493 paste0("^",.,"\\D") %>%
496 grep(.,list.files()) %>% 494 grep(.,list.files()) %>%
497 list.files()[.] 495 list.files()[.]
498 496
499 #Find out if it is a soft GPL file or not 497 #Find out if it is a soft GPL file or not
500 soft <- strsplit(genena,"[\\|/]") %>% 498 soft <- strsplit(genena,"[\\|/]") %>%
501 .[[1]] %>% 499 .[[1]] %>%
502 .[length(.)] %>% 500 .[length(.)] %>%
503 grepl("soft",.) 501 grepl("soft",.)
504 502
505 ##Changing row names and column names: 503 ##Changing row names and column names:
506 ALZWORD <- t(alzword) 504 ALZWORD <- t(alzword)
507 rownames(ALZWORD)=NULL 505 rownames(ALZWORD)=NULL
508 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 506 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
509 ALZWORD <- chngrownm(ALZWORD)[-1,] 507 ALZWORD <- chngrownm(ALZWORD)[-1,]
510 ALZWORD <- ALZWORD%>% 508 ALZWORD <- ALZWORD%>%
511 as.data.frame()%>% 509 as.data.frame()%>%
512 dplyr::select(-starts_with("col")) 510 dplyr::select(-starts_with("col"))
513 511
514 ##Reorganizing information within the columns and final clinical data 512 ##Reorganizing information within the columns and final clinical data
515 ALZWORDF <- cinfo(ALZWORD) 513 ALZWORDF <- cinfo(ALZWORD)
516 514
517 515
518 #Working with Actual Data part of file 516 #Working with Actual Data part of file
519 alzdat <- alz %>% 517 alzdat <- alz %>%
520 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 518 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
521 ALZDAT <- t(alzdat[,-1]) 519 ALZDAT <- t(alzdat[,-1])
522 rownames(ALZDAT)=NULL 520 rownames(ALZDAT)=NULL
523 521
524 ##Is there a clean version of the GPL file available? 522 ##Is there a clean version of the GPL file available?
525 gplnum <- strsplit(genena,"[\\|/]") %>% 523 gplnum <- strsplit(genena,"[\\|/]") %>%
526 .[[1]] %>% 524 .[[1]] %>%
527 .[length(.)] %>% 525 .[length(.)] %>%
528 gsub("\\D","",.) 526 gsub("\\D","",.)
529 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 527 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
530 if(clfileex >= 1){ 528 if(clfileex >= 1){
531 #use the clean version 529 #use the clean version
532 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 530 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
533 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 531 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
534 532
535 } else if(clfileex == 0){ 533 } else if(clfileex == 0){
536 ##Lets Create a clean version 534 ##Lets Create a clean version
537 535
538 ##Gene ID to Gene Name 536 ##Gene ID to Gene Name
539 if(soft == TRUE){ 537 if(soft == TRUE){
540 #Check to see if there is already a file containing information on soft files 538 #Check to see if there is already a file containing information on soft files
541 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 539 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
542 if(fileex == 1){ 540 if(fileex == 1){
543 #Check to see if this GPL soft file has been used before 541 #Check to see if this GPL soft file has been used before
544 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 542 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
545 .$GPL_FILE_NUM%>% 543 .$GPL_FILE_NUM%>%
546 grepl(gplnum,.) %>% 544 grepl(gplnum,.) %>%
547 sum() 545 sum()
548 if(IDF == 1){ 546 if(IDF == 1){
549 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 547 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
550 .$GPL_FILE_NUM%>% 548 .$GPL_FILE_NUM%>%
551 grep(gplnum,.) 549 grep(gplnum,.)
552 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 550 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
553 .$LOC_ID %>% 551 .$LOC_ID %>%
554 .[IDLOCAL] 552 .[IDLOCAL]
555 geneIDNam <- genena %>% 553 geneIDNam <- genena %>%
556 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 554 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
557 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 555 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
558 } else if(IDF == 0){ 556 } else if(IDF == 0){
559 #No information on this particular GPL file 557 #No information on this particular GPL file
560 idLOCGPL <- genena %>% 558 idLOCGPL <- genena %>%
561 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 559 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
562 t(.) %>% 560 t(.) %>%
563 grep("^ID\\s*$",.) %>% 561 grep("^ID\\s*$",.) %>%
564 -1 562 -1
565 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 563 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
566 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 564 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
567 geneIDNam <- genena %>% 565 geneIDNam <- genena %>%
568 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 566 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
569 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 567 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
570 } 568 }
571 } else if(fileex == 0){ 569 } else if(fileex == 0){
572 #We must create a file that we can access for later use 570 #We must create a file that we can access for later use
573 idLOCGPL <- genena %>% 571 idLOCGPL <- genena %>%
574 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 572 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
575 t(.) %>% 573 t(.) %>%
576 grep("^ID\\s*$",.) %>% 574 grep("^ID\\s*$",.) %>%
577 -1 575 -1
578 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 576 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
579 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 577 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
580 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 578 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
581 geneIDNam <- genena %>% 579 geneIDNam <- genena %>%
582 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 580 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
583 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 581 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
584 } 582 }
585 } else if(soft == FALSE){ 583 } else if(soft == FALSE){
586 geneIDNam <- genena %>% 584 geneIDNam <- genena %>%
587 read_delim(delim="\t",comment = "#")%>% 585 read_delim(delim="\t",comment = "#")%>%
588 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 586 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
589 } 587 }
590 588
591 ##Labeling the gene IDs without names 589 ##Labeling the gene IDs without names
592 geneIDNam <- NAFIXING(geneIDNam) 590 geneIDNam <- NAFIXING(geneIDNam)
593 591
594 ##remove the whitespace 592 ##remove the whitespace
595 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 593 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
596 594
597 ##Here is the clean version 595 ##Here is the clean version
598 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 596 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
599 } 597 }
600 598
601 599
602 600
603 ##Changing the gene ID to gene name 601 ##Changing the gene ID to gene name
604 ALZDAT1 <- cgeneID(geneIDNam,alzdat) 602 ALZDAT1 <- cgeneID(geneIDNam,alzdat)
605 colnames(ALZDAT) = ALZDAT1[1,] 603 colnames(ALZDAT) = ALZDAT1[1,]
606 604
607 605
608 ##Adjusting the column names aka the gene names 606 ##Adjusting the column names aka the gene names
609 colnames(ALZDAT) <- gcnames(ALZDAT) 607 colnames(ALZDAT) <- gcnames(ALZDAT)
610 608
611 609
612 #Full RAW Data 610 #Full RAW Data
613 Fullalzdwr <- ALZDAT %>% 611 Fullalzdwr <- ALZDAT %>%
614 as.data.frame() %>% 612 as.data.frame() %>%
615 cbind(ALZWORDF,.) 613 cbind(ALZWORDF,.)
616 614
617 #Raw file is output 615 #Raw file is output
618 nfnaex <- strsplit(alz,"[\\]") %>% 616 nfnaex <- strsplit(alz,"[\\]") %>%
619 .[[1]] %>% 617 .[[1]] %>%
620 .[length(.)] %>% 618 .[length(.)] %>%
621 gsub("\\D","",.) %>% 619 gsub("\\D","",.) %>%
622 c("GSE",.,"aftexcel.txt") %>% 620 c("GSE",.,"aftexcel.txt") %>%
623 paste(collapse = "") 621 paste(collapse = "")
624 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 622 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
625 623
626 624
627 625
628 #Now for the discretization part 626 #Now for the discretization part
629 ##get the wordy part again 627 ##get the wordy part again
630 rawword <- t(ALZWORDF) 628 rawword <- t(ALZWORDF)
631 629
632 ##where is ID_REF located 630 ##where is ID_REF located
633 hereim <- grep("ID_REF",rownames(rawword)) 631 hereim <- grep("ID_REF",rownames(rawword))
634 632
635 ##Subject Names GSM... 633 ##Subject Names GSM...
636 subjnam <- rawword[hereim,] 634 subjnam <- rawword[hereim,]
637 635
638 ##Getting the names for the rows 636 ##Getting the names for the rows
639 namedarows <- rownames(rawword)[-hereim] %>% 637 namedarows <- rownames(rawword)[-hereim] %>%
640 as.data.frame() 638 as.data.frame()
641 RAWWORD <- rawword[-hereim,] %>% 639 RAWWORD <- rawword[-hereim,] %>%
642 as.data.frame() %>% 640 as.data.frame() %>%
643 bind_cols(namedarows,.) 641 bind_cols(namedarows,.)
644 z <- 1 642 z <- 1
645 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 643 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
646 for(z in 1:dim(RAWWORD)[1]){ 644 for(z in 1:dim(RAWWORD)[1]){
647 if(sum(is.na(RAWWORD[z,])) > 0){ 645 if(sum(is.na(RAWWORD[z,])) > 0){
648 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 646 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
649 } 647 }
650 if(length(grep("NA",RAWWORD[z,])) > 0){ 648 if(length(grep("NA",RAWWORD[z,])) > 0){
651 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1] 649 naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
652 } 650 }
653 z <- z + 1 651 z <- z + 1
654 } 652 }
655 653
656 colnames(naroww) <- "ROW_NAs" 654 colnames(naroww) <- "ROW_NAs"
657 RAWWORD <- bind_cols(RAWWORD,naroww) 655 RAWWORD <- bind_cols(RAWWORD,naroww)
658 656
659 657
660 roALZna <- t(ALZDAT) %>% 658 roALZna <- t(ALZDAT) %>%
661 rownames(.) %>% 659 rownames(.) %>%
662 as.data.frame(.) 660 as.data.frame(.)
663 colnames(roALZna) <- "ID_REF" 661 colnames(roALZna) <- "ID_REF"
664 662
665 RAWDAT <- t(ALZDAT) %>% 663 RAWDAT <- t(ALZDAT) %>%
666 as.data.frame(.) 664 as.data.frame(.)
667 colnames(RAWDAT) <- NULL 665 colnames(RAWDAT) <- NULL
668 rownames(RAWDAT) <- NULL 666 rownames(RAWDAT) <- NULL
669 667
670 RAWDAT2 <- RAWDAT %>% 668 RAWDAT2 <- RAWDAT %>%
671 cbind(roALZna,.) %>% 669 cbind(roALZna,.) %>%
672 dplyr::arrange(.,ID_REF) 670 dplyr::arrange(.,ID_REF)
673 671
674 ##Editing the file for R processing 672 ##Editing the file for R processing
675 RAWDATID <- RAWDAT2[,1] %>% 673 RAWDATID <- RAWDAT2[,1] %>%
676 as.matrix(.) 674 as.matrix(.)
677 675
678 RAWDATNUM <- RAWDAT2[,-1] %>% 676 RAWDATNUM <- RAWDAT2[,-1] %>%
679 mapply(.,FUN = as.numeric) %>% 677 mapply(.,FUN = as.numeric) %>%
680 t(.) 678 t(.)
681 679
682 ##Consolidating genes with the same name 680 ##Consolidating genes with the same name
683 ###create empty matrix of size equal to tabRDATID 681 ###create empty matrix of size equal to tabRDATID
684 tabRDATID <- table(RAWDATID) 682 tabRDATID <- table(RAWDATID)
685 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 683 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
686 j <- 1 684 j <- 1
687 for(j in 1:length(tabRDATID)){ 685 for(j in 1:length(tabRDATID)){
688 ##Putting the ones without duplicates in their new homes 686 ##Putting the ones without duplicates in their new homes
689 if(tabRDATID[j] == 1){ 687 if(tabRDATID[j] == 1){
690 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 688 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
691 } else if(tabRDATID[j] > 1){ 689 } else if(tabRDATID[j] > 1){
692 ##Averaging duplicates and putting them in their new homes 690 ##Averaging duplicates and putting them in their new homes
693 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 691 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
694 } 692 }
695 j <- j + 1 693 j <- j + 1
696 } 694 }
697 695
698 ##Scaling the Data 696 ##Scaling the Data
699 scrawdat <- NuRDATN%>% 697 scrawdat <- NuRDATN%>%
700 scale() 698 scale()
701 attr(scrawdat,"scaled:center") <- NULL 699 attr(scrawdat,"scaled:center") <- NULL
702 attr(scrawdat,"scaled:scale") <- NULL 700 attr(scrawdat,"scaled:scale") <- NULL
703 colnames(scrawdat) <- rownames(tabRDATID) 701 colnames(scrawdat) <- rownames(tabRDATID)
704 702
705 ##Discretized the Data 703 ##Discretized the Data
706 dialzdat <- scrawdat %>% 704 dialzdat <- scrawdat %>%
707 dndat(.) %>% 705 dndat(.) %>%
708 t()%>% 706 t()%>%
709 as.data.frame(.) 707 as.data.frame(.)
710 colnames(dialzdat) <- rownames(RAWDATNUM) 708 colnames(dialzdat) <- rownames(RAWDATNUM)
711 709
712 ##setting "ID_REF" as a new variable 710 ##setting "ID_REF" as a new variable
713 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1)) 711 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
714 colnames(geneNAM) <- "ID_REF" 712 colnames(geneNAM) <- "ID_REF"
715 rownames(dialzdat) <- NULL 713 rownames(dialzdat) <- NULL
716 dialzdat <-bind_cols(geneNAM,dialzdat) 714 dialzdat <-bind_cols(geneNAM,dialzdat)
717 715
718 ##NAs in a column 716 ##NAs in a column
719 x <- 2 717 x <- 2
720 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 718 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
721 nacol[1,1] = "COL_NAs" 719 nacol[1,1] = "COL_NAs"
722 for(x in 2:dim(dialzdat)[2]){ 720 for(x in 2:dim(dialzdat)[2]){
723 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 721 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
724 x <- x + 1 722 x <- x + 1
725 } 723 }
726 colnames(nacol) <- colnames(dialzdat) 724 colnames(nacol) <- colnames(dialzdat)
727 dialzdat <- bind_rows(dialzdat,nacol) 725 dialzdat <- bind_rows(dialzdat,nacol)
728 726
729 ##NAs in a row 727 ##NAs in a row
730 y <- 1 728 y <- 1
731 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 729 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
732 for(y in 1:dim(dialzdat)[1]){ 730 for(y in 1:dim(dialzdat)[1]){
733 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 731 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
734 y <- y + 1 732 y <- y + 1
735 } 733 }
736 colnames(narowd) <- "ROW_NAs" 734 colnames(narowd) <- "ROW_NAs"
737 dialzdat <- bind_cols(dialzdat,narowd) 735 dialzdat <- bind_cols(dialzdat,narowd)
738 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 736 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
739 colnames(RAWWORD) <- colnames(dialzdat) 737 colnames(RAWWORD) <- colnames(dialzdat)
740 ##converting to character so that the clinical can be brought together with discrete data 738 ##converting to character so that the clinical can be brought together with discrete data
741 k <- 2 739 k <- 2
742 for(k in 2:dim(dialzdat)[2]-1){ 740 for(k in 2:dim(dialzdat)[2]-1){
743 dialzdat[,k] <- as.character(dialzdat[,k]) 741 dialzdat[,k] <- as.character(dialzdat[,k])
744 k <- k + 1 742 k <- k + 1
745 } 743 }
746 #The End the full data 744 #The End the full data
747 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) 745 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
748 746
749 #Produces Discrete file 747 #Produces Discrete file
750 nfnaex2 <- strsplit(alz,"[\\|/]") %>% 748 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
751 .[[1]] %>% 749 .[[1]] %>%
752 .[length(.)] %>% 750 .[length(.)] %>%
753 gsub("\\D","",.) %>% 751 gsub("\\D","",.) %>%
754 c("GSE",.,"dscrt.txt") %>% 752 c("GSE",.,"dscrt.txt") %>%
755 paste(collapse = "") 753 paste(collapse = "")
756 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) 754 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
757 755
758 756
759 n <- n + 1 757 n <- n + 1
760 } 758 }
761 } 759 }
762 } 760 }
763 } 761 }
764 #The Rest of this code will be used every time you want to change a data set 762 #The Rest of this code will be used every time you want to change a data set
# Script entry point: THEFT() is invoked here with no arguments.
# NOTE(review): presumably THEFT is the function whose body closes just above
# and which drives the clinical-data cleanup, raw-file output and
# discretized-file output steps -- confirm against its definition.
765 THEFT() 763 THEFT()