Commit e53c3c590875d03bcbff0937982b292bfe67c94c

Authored by Efrain Gonzalez
1 parent 837f1e07ad
Exists in master

An automated version of RCleanDscret.R

Working on outputting more insightful errors and warnings. (UNTESTED)
Showing 1 changed file with 2 additions and 2 deletions   Show diff stats
1 #Efrain H. Gonzalez 1 #Efrain H. Gonzalez
2 #6/16/2017 2 #6/16/2017
3 #Libraries required to run the code 3 #Libraries required to run the code
4 library(pryr) 4 library(pryr)
5 library(MASS) 5 library(MASS)
6 library(dplyr) 6 library(dplyr)
7 library(tidyr) 7 library(tidyr)
8 library(readr) 8 library(readr)
9 library(stringr) 9 library(stringr)
10 10
11 11
12 #Necessary Functions 12 #Necessary Functions
#1#Function for handling the changing of row names and column names
# Renames the columns of a transposed "!Sample" table.
#
# mat: matrix (or data frame) whose first row holds the field names
#      (e.g. "!Sample_title") and whose second row holds the first sample's
#      value for each field.
# Returns `mat` with updated column names; the cell contents are untouched.
#
# Naming rules:
#   * A column whose first-row entry is one of the three known field names is
#     called "Brain_Region", "Title" or "ID_REF" and is not examined further.
#   * Any other column is named after the keyword family found in its
#     second-row value: Sex, PMI, Age, Braak or Group, followed by a running
#     index per family (Sex1, Sex2, ...).
#   * When several families match, the one listed last wins (Group > Braak >
#     Age > PMI > Sex), and only the winning family's index advances.
#   * Columns matching nothing keep their current name.
chngrownm <- function(mat) {
  n_col <- ncol(mat)
  n_row <- nrow(mat)

  # Nothing to inspect without a header row or without columns.
  if (is.null(n_col) || n_col == 0L || n_row == 0L) {
    return(mat)
  }

  # Element-wise renaming needs existing names; create default "colN" names
  # when the input has none.
  if (is.null(colnames(mat))) {
    colnames(mat) <- colnames(mat, do.NULL = FALSE)
  }

  header_labels <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )

  # Order matters: later entries take precedence over earlier ones.
  keyword_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  next_index <- rep(1L, length(keyword_patterns))
  names(next_index) <- names(keyword_patterns)

  for (j in seq_len(n_col)) {
    header <- as.character(mat[1, j])

    # Known field names are final; a title such as "AD case 1" must not be
    # turned into a Group column by the keyword scan below.
    if (!is.na(header) && header %in% names(header_labels)) {
      colnames(mat)[j] <- header_labels[[header]]
      next
    }

    # The keyword scan reads the second row, so it needs one to exist.
    if (n_row < 2L) {
      next
    }

    first_value <- as.character(mat[2, j])
    matched <- vapply(
      keyword_patterns,
      function(pattern) grepl(pattern, first_value),
      logical(1)
    )

    if (any(matched)) {
      label <- names(keyword_patterns)[max(which(matched))]
      colnames(mat)[j] <- paste0(label, next_index[[label]])
      next_index[[label]] <- next_index[[label]] + 1L
    }
  }

  mat
}
59 59
#2#Function for reorganizing information within the columns
# Cleans the values of the clinical columns according to the column name.
#
# mat: data frame (or matrix) whose column names contain "Group", "Age",
#      "Sex", "PMI" or "Braak".
# Returns `mat` with the matching columns rewritten:
#   Group -> text up to and including ": " removed, as well as any
#            " ...;..." tail
#   Age   -> every non-digit removed, then converted to integer
#   Sex   -> text up to and including ": " removed
#   PMI   -> everything except digits and "." removed, then numeric
#   Braak -> text up to and including ": " removed, read as a roman numeral
#            and converted to integer
# The first column is never modified. Input with fewer than two columns, or
# without column names, is returned unchanged.
cinfo <- function(mat) {
  n_col <- ncol(mat)
  col_names <- colnames(mat)

  # 2:n_col would count downwards for n_col < 2 and touch column 1.
  if (is.null(n_col) || n_col < 2L || is.null(col_names)) {
    return(mat)
  }

  for (j in 2:n_col) {
    col_name <- col_names[j]

    if (grepl("Group", col_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", col_name)) {
      # NOTE(review): "\\D" also strips decimal points, so "85.5" becomes 855.
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", col_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", col_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", col_name)) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }

  mat
}
88 88
#3#Function for labeling the gene IDs without names
# Fills in missing gene names with the gene ID from the same row.
#
# GIDNAM: two-dimensional table (data frame or matrix) with the ID in column 1
#         and the name in column 2; further columns are left alone.
# Returns GIDNAM where every column-2 entry that is NA, or is the text "NA"
# optionally followed by whitespace, has been replaced by the column-1 entry
# of that row. Tables with no rows or fewer than two columns are returned
# unchanged.
NAFIXING <- function(GIDNAM) {
  if (is.null(dim(GIDNAM)) || nrow(GIDNAM) == 0L || ncol(GIDNAM) < 2L) {
    return(GIDNAM)
  }

  # `[[` yields a plain vector for data frames; matrices need `[, k]`.
  is_table <- is.data.frame(GIDNAM)
  ids <- if (is_table) GIDNAM[[1]] else GIDNAM[, 1]
  symbols <- if (is_table) GIDNAM[[2]] else GIDNAM[, 2]

  # Factors cannot take values outside their levels, so work on text.
  if (is.factor(ids)) {
    ids <- as.character(ids)
  }
  if (is.factor(symbols)) {
    symbols <- as.character(symbols)
  }

  unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)
  if (any(unnamed)) {
    symbols[unnamed] <- ids[unnamed]
    if (is_table) {
      GIDNAM[[2]] <- symbols
    } else {
      GIDNAM[, 2] <- symbols
    }
  }

  GIDNAM
}
101 101
#4#Function for changing the gene ID to gene name
# Replaces the gene IDs in the first row of DATA by their gene names.
#
# GeneName: table with the gene IDs in row 1 and the matching names in row 2.
# DATA:     table whose first row holds gene IDs.
# Returns DATA with every first-row entry that equals an ID from GeneName
# replaced by the corresponding name; all other cells are unchanged.
#
# IDs are compared as literal text. They are not interpreted as regular
# expressions, so characters such as "." or "(" in an ID match only
# themselves, and a missing ID never matches the text "NA". If an ID occurs
# more than once in GeneName, its first name is used.
cgeneID <- function(GeneName, DATA) {
  if (NROW(DATA) == 0L || NCOL(DATA) == 0L ||
      NROW(GeneName) < 2L || NCOL(GeneName) == 0L) {
    return(DATA)
  }

  ids <- as.character(unlist(GeneName[1, ], use.names = FALSE))
  symbols <- as.character(unlist(GeneName[2, ], use.names = FALSE))
  current <- as.character(unlist(DATA[1, ], use.names = FALSE))

  # One hashed lookup instead of one pattern search per gene.
  position <- match(current, ids)
  found <- !is.na(position) & !is.na(current)

  # Skip the assignment when nothing matched so DATA keeps its storage type.
  if (any(found)) {
    DATA[1, found] <- symbols[position[found]]
  }

  DATA
}
117 117
#5#Function for adjusting the gene names
# Picks one gene name out of each "///"-separated column name.
#
# DiData: table with column names such as "DDR1 /// MIR4640".
# usecol: position (counting from 1) of the name to keep. When a column name
#         has fewer parts than `usecol`, its first part is used instead.
# Returns a character vector with one name per column of DiData, with
# surrounding whitespace removed. A table without columns gives character(0).
gcnames <- function(DiData, usecol = 1) {
  stopifnot(is.numeric(usecol), length(usecol) == 1L, usecol >= 1)

  n_col <- ncol(DiData)
  if (is.null(n_col) || n_col == 0L) {
    return(character(0))
  }

  col_names <- colnames(DiData)
  if (is.null(col_names)) {
    stop("DiData must have column names", call. = FALSE)
  }

  # Split every name once, then choose the requested part per column.
  parts_by_column <- strsplit(col_names, "///")
  vapply(
    parts_by_column,
    function(parts) {
      chosen <- if (length(parts) >= usecol) parts[usecol] else parts[1]
      trimws(chosen)
    },
    character(1),
    USE.NAMES = FALSE
  )
}
134 134
#6# Function for discretizing the data
# Turns numeric values into three levels.
#
# NDATA: numeric matrix (or data frame of numeric columns).
# Returns a numeric matrix of the same dimensions, carrying the column names
# of NDATA, where
#   value < -1        -> 0
#   -1 <= value <= 1  -> 1
#   value > 1         -> 2
# and missing values stay missing.
#
# The upper bound of the middle level is inclusive: a value of exactly 1 is
# level 1. Without that, a value of exactly 1 would match none of the three
# ranges and be reported as level 0.
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)

  # Start everything in the middle level, then move the two tails.
  DDATA <- matrix(1, nrow = nrow(values), ncol = ncol(values))
  colnames(DDATA) <- colnames(NDATA)

  # which() drops the NA comparisons produced by missing values.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2

  # Copy missing entries over as they are.
  missing <- is.na(values)
  DDATA[missing] <- values[missing]

  DDATA
}
165 165
166 166
167 #MajorFunction#This is the function that does everything else 167 #MajorFunction#This is the function that does everything else
168 THEFT <- function(){ 168 THEFT <- function(){
169 #Set working directory based on the directory of the series matrix file Currently only works for windows 169 #Set working directory based on the directory of the series matrix file Currently only works for windows
170 wd <- getwd() 170 wd <- getwd()
171 #list.files() 171 #list.files()
172 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?") 172 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?")
173 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L) 173 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
174 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files()) 174 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())
175 175
176 #ALL DATA FILES WILL BE CLEANED 176 #ALL DATA FILES WILL BE CLEANED
177 if(numDAT == 1){ 177 if(numDAT == 1){
178 #indexing the data files 178 #indexing the data files
179 n <- 1 179 n <- 1
180 for(n in 1: length(GSEfileloc)){ 180 for(n in 1: length(GSEfileloc)){
181 alz <- list.files()[GSEfileloc[n]] 181 alz <- list.files()[GSEfileloc[n]]
182 182
183 #Working with the wordy part of the document 183 #Working with the wordy part of the document
184 alzword <- alz %>% 184 alzword <- alz %>%
185 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 185 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
186 filter(grepl("!Sample",X1))%>% 186 filter(grepl("!Sample",X1))%>%
187 filter(!grepl("!Sample_contact",X1)) 187 filter(!grepl("!Sample_contact",X1))
188 188
189 #Getting the GPL file 189 #Getting the GPL file
190 genena <- grep("_platform_id",alzword$X1) %>% 190 genena <- grep("_platform_id",alzword$X1) %>%
191 alzword$X2[.] %>% 191 alzword$X2[.] %>%
192 str_trim(.) %>% 192 str_trim(.) %>%
193 paste0("^",.) %>% 193 paste0("^",.,"\\D") %>%
194 grep(.,list.files()) %>% 194 grep(.,list.files()) %>%
195 list.files()[.] 195 list.files()[.]
196 196
197 #Find out if it is a soft GPL file or not 197 #Find out if it is a soft GPL file or not
198 soft <- strsplit(genena,"[\\|/]") %>% 198 soft <- strsplit(genena,"[\\|/]") %>%
199 .[[1]] %>% 199 .[[1]] %>%
200 .[length(.)] %>% 200 .[length(.)] %>%
201 grepl("soft",.) 201 grepl("soft",.)
202 202
203 ##Changing row names and column names: 203 ##Changing row names and column names:
204 ALZWORD <- t(alzword) 204 ALZWORD <- t(alzword)
205 rownames(ALZWORD)=NULL 205 rownames(ALZWORD)=NULL
206 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 206 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
207 ALZWORD <- chngrownm(ALZWORD)[-1,] 207 ALZWORD <- chngrownm(ALZWORD)[-1,]
208 ALZWORD <- ALZWORD%>% 208 ALZWORD <- ALZWORD%>%
209 as.data.frame()%>% 209 as.data.frame()%>%
210 dplyr::select(-starts_with("col")) 210 dplyr::select(-starts_with("col"))
211 211
212 ##Reorganizing information within the columns and final clinical data 212 ##Reorganizing information within the columns and final clinical data
213 ALZWORDF <- cinfo(ALZWORD) 213 ALZWORDF <- cinfo(ALZWORD)
214 214
215 215
216 #Working with Actual Data part of file 216 #Working with Actual Data part of file
217 alzdat <- alz %>% 217 alzdat <- alz %>%
218 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 218 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
219 ALZDAT <- t(alzdat[,-1]) 219 ALZDAT <- t(alzdat[,-1])
220 rownames(ALZDAT)=NULL 220 rownames(ALZDAT)=NULL
221 221
222 ##Is there a clean version of the GPL file available? 222 ##Is there a clean version of the GPL file available?
223 gplnum <- strsplit(genena,"[\\|/]") %>% 223 gplnum <- strsplit(genena,"[\\|/]") %>%
224 .[[1]] %>% 224 .[[1]] %>%
225 .[length(.)] %>% 225 .[length(.)] %>%
226 gsub("\\D","",.) 226 gsub("\\D","",.)
227 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 227 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
228 if(clfileex >= 1){ 228 if(clfileex >= 1){
229 #use the clean version 229 #use the clean version
230 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 230 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
231 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 231 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
232 232
233 } 233 }
234 if(clfileex == 0){ 234 if(clfileex == 0){
235 ##Lets Create a clean version 235 ##Lets Create a clean version
236 236
237 ##Gene ID to Gene Name 237 ##Gene ID to Gene Name
238 if(soft == TRUE){ 238 if(soft == TRUE){
239 #Check to see if there is already a file containing information on soft files 239 #Check to see if there is already a file containing information on soft files
240 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 240 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
241 if(fileex == 1){ 241 if(fileex == 1){
242 #Check to see if this GPL soft file has been used before 242 #Check to see if this GPL soft file has been used before
243 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 243 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
244 .$GPL_FILE_NUM%>% 244 .$GPL_FILE_NUM%>%
245 grepl(gplnum,.) %>% 245 grepl(gplnum,.) %>%
246 sum() 246 sum()
247 if(IDF == 1){ 247 if(IDF == 1){
248 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 248 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
249 .$GPL_FILE_NUM%>% 249 .$GPL_FILE_NUM%>%
250 grep(gplnum,.) 250 grep(gplnum,.)
251 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 251 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
252 .$LOC_ID %>% 252 .$LOC_ID %>%
253 .[IDLOCAL] 253 .[IDLOCAL]
254 geneIDNam <- genena %>% 254 geneIDNam <- genena %>%
255 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 255 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
256 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 256 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
257 } 257 }
258 if(IDF == 0){ 258 if(IDF == 0){
259 #No information on this particular GPL file 259 #No information on this particular GPL file
260 idLOCGPL <- genena %>% 260 idLOCGPL <- genena %>%
261 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 261 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
262 t(.) %>% 262 t(.) %>%
263 grep("^ID\\s*$",.) %>% 263 grep("^ID\\s*$",.) %>%
264 -1 264 -1
265 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 265 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
266 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 266 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
267 geneIDNam <- genena %>% 267 geneIDNam <- genena %>%
268 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 268 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
269 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 269 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
270 } 270 }
271 } 271 }
272 if(fileex == 0){ 272 if(fileex == 0){
273 #We must create a file that we can access for later use 273 #We must create a file that we can access for later use
274 idLOCGPL <- genena %>% 274 idLOCGPL <- genena %>%
275 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 275 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
276 t(.) %>% 276 t(.) %>%
277 grep("^ID\\s*$",.) %>% 277 grep("^ID\\s*$",.) %>%
278 -1 278 -1
279 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 279 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
280 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 280 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
281 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 281 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
282 geneIDNam <- genena %>% 282 geneIDNam <- genena %>%
283 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 283 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
284 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 284 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
285 } 285 }
286 } 286 }
287 if(soft == FALSE){ 287 if(soft == FALSE){
288 geneIDNam <- genena %>% 288 geneIDNam <- genena %>%
289 read_delim(delim="\t",comment = "#")%>% 289 read_delim(delim="\t",comment = "#")%>%
290 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 290 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
291 } 291 }
292 292
293 ##Labeling the gene IDs without names 293 ##Labeling the gene IDs without names
294 geneIDNam <- NAFIXING(geneIDNam) 294 geneIDNam <- NAFIXING(geneIDNam)
295 295
296 ##remove the whitespace 296 ##remove the whitespace
297 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 297 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
298 298
299 ##Here is the clean version 299 ##Here is the clean version
300 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 300 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
301 } 301 }
302 302
303 303
304 304
305 ##Changing the gene ID to gene name 305 ##Changing the gene ID to gene name
306 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) 306 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
307 colnames(ALZDAT) = ALZDAT1[1,] 307 colnames(ALZDAT) = ALZDAT1[1,]
308 308
309 309
310 ##Adjusting the column names aka the gene names 310 ##Adjusting the column names aka the gene names
311 colnames(ALZDAT) <- gcnames(ALZDAT) 311 colnames(ALZDAT) <- gcnames(ALZDAT)
312 312
313 313
314 #Full RAW Data 314 #Full RAW Data
315 Fullalzdwr <- ALZDAT %>% 315 Fullalzdwr <- ALZDAT %>%
316 as.data.frame() %>% 316 as.data.frame() %>%
317 cbind(ALZWORDF,.) 317 cbind(ALZWORDF,.)
318 318
319 #Raw file is output 319 #Raw file is output
320 nfnaex <- strsplit(alz,"[\\]") %>% 320 nfnaex <- strsplit(alz,"[\\]") %>%
321 .[[1]] %>% 321 .[[1]] %>%
322 .[length(.)] %>% 322 .[length(.)] %>%
323 gsub("\\D","",.) %>% 323 gsub("\\D","",.) %>%
324 c("GSE",.,"aftexcel.txt") %>% 324 c("GSE",.,"aftexcel.txt") %>%
325 paste(collapse = "") 325 paste(collapse = "")
326 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 326 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
327 327
328 328
329 329
330 #Now for the discretization part 330 #Now for the discretization part
331 ##get the wordy part again 331 ##get the wordy part again
332 rawword <- t(ALZWORDF) 332 rawword <- t(ALZWORDF)
333 333
334 ##where is ID_REF located 334 ##where is ID_REF located
335 hereim <- grep("ID_REF",rownames(rawword)) 335 hereim <- grep("ID_REF",rownames(rawword))
336 336
337 ##Subject Names GSM... 337 ##Subject Names GSM...
338 subjnam <- rawword[hereim,] 338 subjnam <- rawword[hereim,]
339 339
340 ##Getting the names for the rows 340 ##Getting the names for the rows
341 namedarows <- rownames(rawword)[-hereim] %>% 341 namedarows <- rownames(rawword)[-hereim] %>%
342 as.data.frame() 342 as.data.frame()
343 RAWWORD <- rawword[-hereim,] %>% 343 RAWWORD <- rawword[-hereim,] %>%
344 as.data.frame() %>% 344 as.data.frame() %>%
345 bind_cols(namedarows,.) 345 bind_cols(namedarows,.)
346 z <- 1 346 z <- 1
347 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 347 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
348 for(z in 1:dim(RAWWORD)[1]){ 348 for(z in 1:dim(RAWWORD)[1]){
349 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 349 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
350 z <- z + 1 350 z <- z + 1
351 } 351 }
352 352
353 colnames(naroww) <- "ROW_NAs" 353 colnames(naroww) <- "ROW_NAs"
354 RAWWORD <- bind_cols(RAWWORD,naroww) 354 RAWWORD <- bind_cols(RAWWORD,naroww)
355 355
356 356
357 roALZna <- t(ALZDAT) %>% 357 roALZna <- t(ALZDAT) %>%
358 rownames(.) %>% 358 rownames(.) %>%
359 as.data.frame(.) 359 as.data.frame(.)
360 colnames(roALZna) <- "ID_REF" 360 colnames(roALZna) <- "ID_REF"
361 361
362 RAWDAT <- t(ALZDAT) %>% 362 RAWDAT <- t(ALZDAT) %>%
363 as.data.frame(.) 363 as.data.frame(.)
364 colnames(RAWDAT) <- NULL 364 colnames(RAWDAT) <- NULL
365 rownames(RAWDAT) <- NULL 365 rownames(RAWDAT) <- NULL
366 366
367 RAWDAT2 <- RAWDAT %>% 367 RAWDAT2 <- RAWDAT %>%
368 cbind(roALZna,.) %>% 368 cbind(roALZna,.) %>%
369 dplyr::arrange(.,ID_REF) 369 dplyr::arrange(.,ID_REF)
370 370
371 ##Editing the file for R processing 371 ##Editing the file for R processing
372 RAWDATID <- RAWDAT2[,1] %>% 372 RAWDATID <- RAWDAT2[,1] %>%
373 as.matrix(.) 373 as.matrix(.)
374 374
375 RAWDATNUM <- RAWDAT2[,-1] %>% 375 RAWDATNUM <- RAWDAT2[,-1] %>%
376 mapply(.,FUN = as.numeric) %>% 376 mapply(.,FUN = as.numeric) %>%
377 t(.) 377 t(.)
378 378
379 ##Consolidating genes with the same name 379 ##Consolidating genes with the same name
380 ###create empty matrix of size equal to tabRDATID 380 ###create empty matrix of size equal to tabRDATID
381 tabRDATID <- table(RAWDATID) 381 tabRDATID <- table(RAWDATID)
382 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 382 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
383 j <- 1 383 j <- 1
384 for(j in 1:length(tabRDATID)){ 384 for(j in 1:length(tabRDATID)){
385 ##Putting the ones without duplicates in their new homes 385 ##Putting the ones without duplicates in their new homes
386 if(tabRDATID[j] == 1){ 386 if(tabRDATID[j] == 1){
387 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 387 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
388 } 388 }
389 ##Averaging duplicates and putting them in their new homes 389 ##Averaging duplicates and putting them in their new homes
390 if(tabRDATID[j] > 1){ 390 if(tabRDATID[j] > 1){
391 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 391 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
392 } 392 }
393 j <- j + 1 393 j <- j + 1
394 } 394 }
395 395
396 ##Scaling the Data 396 ##Scaling the Data
397 scrawdat <- NuRDATN%>% 397 scrawdat <- NuRDATN%>%
398 scale() 398 scale()
399 attr(scrawdat,"scaled:center") <- NULL 399 attr(scrawdat,"scaled:center") <- NULL
400 attr(scrawdat,"scaled:scale") <- NULL 400 attr(scrawdat,"scaled:scale") <- NULL
401 colnames(scrawdat) <- rownames(tabRDATID) 401 colnames(scrawdat) <- rownames(tabRDATID)
402 402
403 ##Discretized the Data 403 ##Discretized the Data
404 dialzdat <- scrawdat %>% 404 dialzdat <- scrawdat %>%
405 dndat(.) %>% 405 dndat(.) %>%
406 t()%>% 406 t()%>%
407 as.data.frame(.) 407 as.data.frame(.)
408 colnames(dialzdat) <- rownames(RAWDATNUM) 408 colnames(dialzdat) <- rownames(RAWDATNUM)
409 409
410 ##setting "ID_REF" as a new variable 410 ##setting "ID_REF" as a new variable
411 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1)) 411 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
412 colnames(geneNAM) <- "ID_REF" 412 colnames(geneNAM) <- "ID_REF"
413 rownames(dialzdat) <- NULL 413 rownames(dialzdat) <- NULL
414 dialzdat <-bind_cols(geneNAM,dialzdat) 414 dialzdat <-bind_cols(geneNAM,dialzdat)
415 415
416 ##NAs in a column 416 ##NAs in a column
417 x <- 2 417 x <- 2
418 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 418 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
419 nacol[1,1] = "COL_NAs" 419 nacol[1,1] = "COL_NAs"
420 for(x in 2:dim(dialzdat)[2]){ 420 for(x in 2:dim(dialzdat)[2]){
421 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 421 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
422 x <- x + 1 422 x <- x + 1
423 } 423 }
424 colnames(nacol) <- colnames(dialzdat) 424 colnames(nacol) <- colnames(dialzdat)
425 dialzdat <- bind_rows(dialzdat,nacol) 425 dialzdat <- bind_rows(dialzdat,nacol)
426 426
427 ##NAs in a row 427 ##NAs in a row
428 y <- 1 428 y <- 1
429 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 429 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
430 for(y in 1:dim(dialzdat)[1]){ 430 for(y in 1:dim(dialzdat)[1]){
431 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 431 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
432 y <- y + 1 432 y <- y + 1
433 } 433 }
434 colnames(narowd) <- "ROW_NAs" 434 colnames(narowd) <- "ROW_NAs"
435 dialzdat <- bind_cols(dialzdat,narowd) 435 dialzdat <- bind_cols(dialzdat,narowd)
436 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 436 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
437 colnames(RAWWORD) <- colnames(dialzdat) 437 colnames(RAWWORD) <- colnames(dialzdat)
438 ##converting to character so that the clinical can be brought together with discrete data 438 ##converting to character so that the clinical can be brought together with discrete data
439 k <- 2 439 k <- 2
440 for(k in 2:dim(dialzdat)[2]-1){ 440 for(k in 2:dim(dialzdat)[2]-1){
441 dialzdat[,k] <- as.character(dialzdat[,k]) 441 dialzdat[,k] <- as.character(dialzdat[,k])
442 k <- k + 1 442 k <- k + 1
443 } 443 }
444 #The End the full data 444 #The End the full data
445 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) 445 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
446 446
447 #Produces Discrete file 447 #Produces Discrete file
448 nfnaex2 <- strsplit(alz,"[\\|/]") %>% 448 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
449 .[[1]] %>% 449 .[[1]] %>%
450 .[length(.)] %>% 450 .[length(.)] %>%
451 gsub("\\D","",.) %>% 451 gsub("\\D","",.) %>%
452 c("GSE",.,"dscrt.txt") %>% 452 c("GSE",.,"dscrt.txt") %>%
453 paste(collapse = "") 453 paste(collapse = "")
454 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) 454 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
455 n <- n +1 455 n <- n +1
456 } 456 }
457 } 457 }
458 458
459 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN 459 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
460 if(numDAT == 2){ 460 if(numDAT == 2){
461 #All the files you want to analyze 461 #All the files you want to analyze
462 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:") 462 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
463 if(length(ANDIS) == 0){ 463 if(length(ANDIS) == 0){
464 #Spit out a warning 464 #Spit out a warning
465 warning("You did not select any files and so no cleaning will be performed") 465 warning("You did not select any files and so no cleaning will be performed")
466 } else{ 466 } else{
467 #indexing the data files 467 #indexing the data files
468 n <- 1 468 n <- 1
469 for(n in 1: length(ANDIS)){ 469 for(n in 1: length(ANDIS)){
470 alz <- ANDIS[n] 470 alz <- ANDIS[n]
471 471
472 #Working with the wordy part of the document 472 #Working with the wordy part of the document
473 alzword <- alz %>% 473 alzword <- alz %>%
474 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 474 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
475 filter(grepl("!Sample",X1))%>% 475 filter(grepl("!Sample",X1))%>%
476 filter(!grepl("!Sample_contact",X1)) 476 filter(!grepl("!Sample_contact",X1))
477 477
478 #Getting the GPL file 478 #Getting the GPL file
479 genena <- grep("_platform_id",alzword$X1) %>% 479 genena <- grep("_platform_id",alzword$X1) %>%
480 alzword$X2[.] %>% 480 alzword$X2[.] %>%
481 str_trim(.) %>% 481 str_trim(.) %>%
482 paste0("^",.) %>% 482 paste0("^",.,"\\D") %>%
483 grep(.,list.files()) %>% 483 grep(.,list.files()) %>%
484 list.files()[.] 484 list.files()[.]
485 485
486 #Find out if it is a soft GPL file or not 486 #Find out if it is a soft GPL file or not
487 soft <- strsplit(genena,"[\\|/]") %>% 487 soft <- strsplit(genena,"[\\|/]") %>%
488 .[[1]] %>% 488 .[[1]] %>%
489 .[length(.)] %>% 489 .[length(.)] %>%
490 grepl("soft",.) 490 grepl("soft",.)
491 491
492 ##Changing row names and column names: 492 ##Changing row names and column names:
493 ALZWORD <- t(alzword) 493 ALZWORD <- t(alzword)
494 rownames(ALZWORD)=NULL 494 rownames(ALZWORD)=NULL
495 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 495 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
496 ALZWORD <- chngrownm(ALZWORD)[-1,] 496 ALZWORD <- chngrownm(ALZWORD)[-1,]
497 ALZWORD <- ALZWORD%>% 497 ALZWORD <- ALZWORD%>%
498 as.data.frame()%>% 498 as.data.frame()%>%
499 dplyr::select(-starts_with("col")) 499 dplyr::select(-starts_with("col"))
500 500
501 ##Reorganizing information within the columns and final clinical data 501 ##Reorganizing information within the columns and final clinical data
502 ALZWORDF <- cinfo(ALZWORD) 502 ALZWORDF <- cinfo(ALZWORD)
503 503
504 504
505 #Working with Actual Data part of file 505 #Working with Actual Data part of file
506 alzdat <- alz %>% 506 alzdat <- alz %>%
507 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 507 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
508 ALZDAT <- t(alzdat[,-1]) 508 ALZDAT <- t(alzdat[,-1])
509 rownames(ALZDAT)=NULL 509 rownames(ALZDAT)=NULL
510 510
511 ##Is there a clean version of the GPL file available? 511 ##Is there a clean version of the GPL file available?
512 gplnum <- strsplit(genena,"[\\|/]") %>% 512 gplnum <- strsplit(genena,"[\\|/]") %>%
513 .[[1]] %>% 513 .[[1]] %>%
514 .[length(.)] %>% 514 .[length(.)] %>%
515 gsub("\\D","",.) 515 gsub("\\D","",.)
516 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 516 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
517 if(clfileex >= 1){ 517 if(clfileex >= 1){
518 #use the clean version 518 #use the clean version
519 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 519 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
520 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 520 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
521 521
522 } 522 }
523 if(clfileex == 0){ 523 if(clfileex == 0){
524 ##Lets Create a clean version 524 ##Lets Create a clean version
525 525
526 ##Gene ID to Gene Name 526 ##Gene ID to Gene Name
527 if(soft == TRUE){ 527 if(soft == TRUE){
528 #Check to see if there is already a file containing information on soft files 528 #Check to see if there is already a file containing information on soft files
529 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 529 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
530 if(fileex == 1){ 530 if(fileex == 1){
531 #Check to see if this GPL soft file has been used before 531 #Check to see if this GPL soft file has been used before
532 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 532 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
533 .$GPL_FILE_NUM%>% 533 .$GPL_FILE_NUM%>%
534 grepl(gplnum,.) %>% 534 grepl(gplnum,.) %>%
535 sum() 535 sum()
536 if(IDF == 1){ 536 if(IDF == 1){
537 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 537 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
538 .$GPL_FILE_NUM%>% 538 .$GPL_FILE_NUM%>%
539 grep(gplnum,.) 539 grep(gplnum,.)
540 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 540 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
541 .$LOC_ID %>% 541 .$LOC_ID %>%
542 .[IDLOCAL] 542 .[IDLOCAL]
543 geneIDNam <- genena %>% 543 geneIDNam <- genena %>%
544 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 544 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
545 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 545 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
546 } 546 }
547 if(IDF == 0){ 547 if(IDF == 0){
548 #No information on this particular GPL file 548 #No information on this particular GPL file
549 idLOCGPL <- genena %>% 549 idLOCGPL <- genena %>%
550 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 550 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
551 t(.) %>% 551 t(.) %>%
552 grep("^ID\\s*$",.) %>% 552 grep("^ID\\s*$",.) %>%
553 -1 553 -1
554 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 554 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
555 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 555 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
556 geneIDNam <- genena %>% 556 geneIDNam <- genena %>%
557 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 557 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
558 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 558 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
559 } 559 }
560 } 560 }
561 if(fileex == 0){ 561 if(fileex == 0){
562 #We must create a file that we can access for later use 562 #We must create a file that we can access for later use
563 idLOCGPL <- genena %>% 563 idLOCGPL <- genena %>%
564 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 564 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
565 t(.) %>% 565 t(.) %>%
566 grep("^ID\\s*$",.) %>% 566 grep("^ID\\s*$",.) %>%
567 -1 567 -1
568 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 568 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
569 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 569 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
570 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 570 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
571 geneIDNam <- genena %>% 571 geneIDNam <- genena %>%
572 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 572 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
573 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 573 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
574 } 574 }
575 } 575 }
576 if(soft == FALSE){ 576 if(soft == FALSE){
577 geneIDNam <- genena %>% 577 geneIDNam <- genena %>%
578 read_delim(delim="\t",comment = "#")%>% 578 read_delim(delim="\t",comment = "#")%>%
579 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 579 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
580 } 580 }
581 581
582 ##Labeling the gene IDs without names 582 ##Labeling the gene IDs without names
583 geneIDNam <- NAFIXING(geneIDNam) 583 geneIDNam <- NAFIXING(geneIDNam)
584 584
585 ##remove the whitespace 585 ##remove the whitespace
586 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 586 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
587 587
588 ##Here is the clean version 588 ##Here is the clean version
589 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 589 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
590 } 590 }
591 591
592 592
593 593
594 ##Changing the gene ID to gene name 594 ##Changing the gene ID to gene name
595 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) 595 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
596 colnames(ALZDAT) = ALZDAT1[1,] 596 colnames(ALZDAT) = ALZDAT1[1,]
597 597
598 598
599 ##Adjusting the column names aka the gene names 599 ##Adjusting the column names aka the gene names
600 colnames(ALZDAT) <- gcnames(ALZDAT) 600 colnames(ALZDAT) <- gcnames(ALZDAT)
601 601
602 602
603 #Full RAW Data 603 #Full RAW Data
604 Fullalzdwr <- ALZDAT %>% 604 Fullalzdwr <- ALZDAT %>%
605 as.data.frame() %>% 605 as.data.frame() %>%
606 cbind(ALZWORDF,.) 606 cbind(ALZWORDF,.)
607 607
608 #Raw file is output 608 #Raw file is output
609 nfnaex <- strsplit(alz,"[\\]") %>% 609 nfnaex <- strsplit(alz,"[\\]") %>%
610 .[[1]] %>% 610 .[[1]] %>%
611 .[length(.)] %>% 611 .[length(.)] %>%
612 gsub("\\D","",.) %>% 612 gsub("\\D","",.) %>%
613 c("GSE",.,"aftexcel.txt") %>% 613 c("GSE",.,"aftexcel.txt") %>%
614 paste(collapse = "") 614 paste(collapse = "")
615 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 615 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
616 616
617 617
618 618
619 #Now for the discretization part 619 #Now for the discretization part
620 ##get the wordy part again 620 ##get the wordy part again
621 rawword <- t(ALZWORDF) 621 rawword <- t(ALZWORDF)
622 622
623 ##where is ID_REF located 623 ##where is ID_REF located
624 hereim <- grep("ID_REF",rownames(rawword)) 624 hereim <- grep("ID_REF",rownames(rawword))
625 625
626 ##Subject Names GSM... 626 ##Subject Names GSM...
627 subjnam <- rawword[hereim,] 627 subjnam <- rawword[hereim,]
628 628
629 ##Getting the names for the rows 629 ##Getting the names for the rows
630 namedarows <- rownames(rawword)[-hereim] %>% 630 namedarows <- rownames(rawword)[-hereim] %>%
631 as.data.frame() 631 as.data.frame()
632 RAWWORD <- rawword[-hereim,] %>% 632 RAWWORD <- rawword[-hereim,] %>%
633 as.data.frame() %>% 633 as.data.frame() %>%
634 bind_cols(namedarows,.) 634 bind_cols(namedarows,.)
635 z <- 1 635 z <- 1
636 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 636 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
637 for(z in 1:dim(RAWWORD)[1]){ 637 for(z in 1:dim(RAWWORD)[1]){
638 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 638 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
639 z <- z + 1 639 z <- z + 1
640 } 640 }
641 641
642 colnames(naroww) <- "ROW_NAs" 642 colnames(naroww) <- "ROW_NAs"
643 RAWWORD <- bind_cols(RAWWORD,naroww) 643 RAWWORD <- bind_cols(RAWWORD,naroww)
644 644
645 645
646 roALZna <- t(ALZDAT) %>% 646 roALZna <- t(ALZDAT) %>%
647 rownames(.) %>% 647 rownames(.) %>%
648 as.data.frame(.) 648 as.data.frame(.)
649 colnames(roALZna) <- "ID_REF" 649 colnames(roALZna) <- "ID_REF"
650 650
651 RAWDAT <- t(ALZDAT) %>% 651 RAWDAT <- t(ALZDAT) %>%
652 as.data.frame(.) 652 as.data.frame(.)
653 colnames(RAWDAT) <- NULL 653 colnames(RAWDAT) <- NULL
654 rownames(RAWDAT) <- NULL 654 rownames(RAWDAT) <- NULL
655 655
656 RAWDAT2 <- RAWDAT %>% 656 RAWDAT2 <- RAWDAT %>%
657 cbind(roALZna,.) %>% 657 cbind(roALZna,.) %>%
658 dplyr::arrange(.,ID_REF) 658 dplyr::arrange(.,ID_REF)
659 659
660 ##Editing the file for R processing 660 ##Editing the file for R processing
661 RAWDATID <- RAWDAT2[,1] %>% 661 RAWDATID <- RAWDAT2[,1] %>%
662 as.matrix(.) 662 as.matrix(.)
663 663
664 RAWDATNUM <- RAWDAT2[,-1] %>% 664 RAWDATNUM <- RAWDAT2[,-1] %>%
665 mapply(.,FUN = as.numeric) %>% 665 mapply(.,FUN = as.numeric) %>%
666 t(.) 666 t(.)
667 667
668 ##Consolidating genes with the same name 668 ##Consolidating genes with the same name
669 ###create empty matrix of size equal to tabRDATID 669 ###create empty matrix of size equal to tabRDATID
670 tabRDATID <- table(RAWDATID) 670 tabRDATID <- table(RAWDATID)
671 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 671 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
672 j <- 1 672 j <- 1
673 for(j in 1:length(tabRDATID)){ 673 for(j in 1:length(tabRDATID)){
674 ##Putting the ones without duplicates in their new homes 674 ##Putting the ones without duplicates in their new homes
675 if(tabRDATID[j] == 1){ 675 if(tabRDATID[j] == 1){
676 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 676 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
677 } 677 }
678 ##Averaging duplicates and putting them in their new homes 678 ##Averaging duplicates and putting them in their new homes
679 if(tabRDATID[j] > 1){ 679 if(tabRDATID[j] > 1){
680 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 680 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
681 } 681 }
682 j <- j + 1 682 j <- j + 1
683 } 683 }
684 684
685 ##Scaling the Data 685 ##Scaling the Data
686 scrawdat <- NuRDATN%>% 686 scrawdat <- NuRDATN%>%
687 scale() 687 scale()
688 attr(scrawdat,"scaled:center") <- NULL 688 attr(scrawdat,"scaled:center") <- NULL
689 attr(scrawdat,"scaled:scale") <- NULL 689 attr(scrawdat,"scaled:scale") <- NULL
690 colnames(scrawdat) <- rownames(tabRDATID) 690 colnames(scrawdat) <- rownames(tabRDATID)
691 691
692 ##Discretized the Data 692 ##Discretized the Data
693 dialzdat <- scrawdat %>% 693 dialzdat <- scrawdat %>%
694 dndat(.) %>% 694 dndat(.) %>%
695 t()%>% 695 t()%>%
696 as.data.frame(.) 696 as.data.frame(.)
697 colnames(dialzdat) <- rownames(RAWDATNUM) 697 colnames(dialzdat) <- rownames(RAWDATNUM)
698 698
699 ##setting "ID_REF" as a new variable 699 ##setting "ID_REF" as a new variable
700 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1)) 700 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
701 colnames(geneNAM) <- "ID_REF" 701 colnames(geneNAM) <- "ID_REF"
702 rownames(dialzdat) <- NULL 702 rownames(dialzdat) <- NULL
703 dialzdat <-bind_cols(geneNAM,dialzdat) 703 dialzdat <-bind_cols(geneNAM,dialzdat)
704 704
705 ##NAs in a column 705 ##NAs in a column
706 x <- 2 706 x <- 2
707 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 707 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
708 nacol[1,1] = "COL_NAs" 708 nacol[1,1] = "COL_NAs"
709 for(x in 2:dim(dialzdat)[2]){ 709 for(x in 2:dim(dialzdat)[2]){
710 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 710 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
711 x <- x + 1 711 x <- x + 1
712 } 712 }
713 colnames(nacol) <- colnames(dialzdat) 713 colnames(nacol) <- colnames(dialzdat)
714 dialzdat <- bind_rows(dialzdat,nacol) 714 dialzdat <- bind_rows(dialzdat,nacol)
715 715
716 ##NAs in a row 716 ##NAs in a row
717 y <- 1 717 y <- 1
718 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 718 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
719 for(y in 1:dim(dialzdat)[1]){ 719 for(y in 1:dim(dialzdat)[1]){
720 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 720 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
721 y <- y + 1 721 y <- y + 1
722 } 722 }
723 colnames(narowd) <- "ROW_NAs" 723 colnames(narowd) <- "ROW_NAs"
724 dialzdat <- bind_cols(dialzdat,narowd) 724 dialzdat <- bind_cols(dialzdat,narowd)
725 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 725 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
726 colnames(RAWWORD) <- colnames(dialzdat) 726 colnames(RAWWORD) <- colnames(dialzdat)
727 ##converting to character so that the clinical can be brought together with discrete data 727 ##converting to character so that the clinical can be brought together with discrete data
728 k <- 2 728 k <- 2
729 for(k in 2:dim(dialzdat)[2]-1){ 729 for(k in 2:dim(dialzdat)[2]-1){
730 dialzdat[,k] <- as.character(dialzdat[,k]) 730 dialzdat[,k] <- as.character(dialzdat[,k])
731 k <- k + 1 731 k <- k + 1
732 } 732 }
733 #The End the full data 733 #The End the full data
734 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) 734 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
735 735
736 #Produces Discrete file 736 #Produces Discrete file
737 nfnaex2 <- strsplit(alz,"[\\|/]") %>% 737 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
738 .[[1]] %>% 738 .[[1]] %>%
739 .[length(.)] %>% 739 .[length(.)] %>%
740 gsub("\\D","",.) %>% 740 gsub("\\D","",.) %>%
741 c("GSE",.,"dscrt.txt") %>% 741 c("GSE",.,"dscrt.txt") %>%
742 paste(collapse = "") 742 paste(collapse = "")
743 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) 743 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
744 744
745 745
746 n <- n + 1 746 n <- n + 1
747 } 747 }
748 } 748 }
749 } 749 }
750 } 750 }
751 #The Rest of this code will be used every time you want to change a data set 751 #The Rest of this code will be used every time you want to change a data set
# Top-level entry point: calls THEFT() with no arguments when the script is run.
# NOTE(review): THEFT is defined earlier in the file, outside this excerpt; the
# block that closes just above is presumably its body, which writes the
# "GSE<digits>aftexcel.txt" and "GSE<digits>dscrt.txt" tables - confirm.
752 THEFT() 752 THEFT()