Commit db3a0d69f1e7afde29061d125c6b122bda04a033

Authored by Efrain Gonzalez
1 parent 6a66705877
Exists in master

Don't use this code yet

Updated else if statements
Showing 1 changed file with 1 additions and 2 deletions   Show diff stats
1 ######################################################################## 1 ########################################################################
2 # Don't Use This Code Just Yet # 2 # Don't Use This Code Just Yet #
3 ######################################################################## 3 ########################################################################
4 #Efrain H. Gonzalez 4 #Efrain H. Gonzalez
5 #6/16/2017 5 #6/16/2017
6 6
7 #Libraries required to run the code 7 #Libraries required to run the code
8 library(pryr) 8 library(pryr)
9 library(MASS) 9 library(MASS)
10 library(dplyr) 10 library(dplyr)
11 library(tidyr) 11 library(tidyr)
12 library(readr) 12 library(readr)
13 library(stringr) 13 library(stringr)
14 14
15 15
16 #Necessary Functions 16 #Necessary Functions
17 #1#Function for handling the changing of row names and column names 17 #1#Function for handling the changing of row names and column names
# Rename the columns of a transposed sample-header block.
#
# Args:
#   mat: matrix (or data frame) whose column names are already set. Row 1
#        holds the field key of each column (e.g. "!Sample_title") and row 2
#        holds the first sample's value for that field.
#
# Returns:
#   `mat` with renamed columns:
#     * the three known keys become "Brain_Region", "Title" and "ID_REF";
#     * any other column whose row-2 value matches a clinical pattern becomes
#       "Sex<n>", "PMI<n>", "Age<n>", "Braak<n>" or "Group<n>", where <n>
#       counts the columns that ended up with that label (1, 2, ...);
#     * columns matching nothing keep their current name.
chngrownm <- function(mat){
  # Nothing to inspect without the key row.
  if (nrow(mat) < 1L) {
    return(mat)
  }

  fixed_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )

  # Order matters: when several patterns match one value (e.g. "braak stage"
  # also contains "age"), the LAST matching label wins.
  value_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi|interval",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  counters <- c(Sex = 0L, PMI = 0L, Age = 0L, Braak = 0L, Group = 0L)

  has_values <- nrow(mat) >= 2L

  for (e in seq_len(ncol(mat))) {
    key <- as.character(mat[1, e])

    if (!is.na(key) && key %in% names(fixed_names)) {
      colnames(mat)[e] <- fixed_names[[key]]
    } else if (has_values) {
      value <- as.character(mat[2, e])
      is_hit <- vapply(value_patterns, grepl, logical(1), x = value)
      matched <- names(value_patterns)[is_hit]

      if (length(matched) > 0L) {
        # Only the label that is actually assigned advances its counter, so
        # numbering stays contiguous (Age1, Age2, ...) even when an earlier
        # pattern also matched this value.
        label <- matched[length(matched)]
        counters[[label]] <- counters[[label]] + 1L
        colnames(mat)[e] <- paste0(label, counters[[label]])
      }
    }
  }

  mat
}
61 61
62 #2#Function for reorganizing information within the columns 62 #2#Function for reorganizing information within the columns
# Clean the clinical columns in place, based on each column's name.
#
# Args:
#   mat: data frame (or matrix) whose columns were named by `chngrownm`.
#        The first column is never modified.
#
# Returns:
#   `mat` where, for every column from the second onwards:
#     * "Group*" columns lose their "<label>: " prefix and any trailing
#       " ...;..." remainder;
#     * "Age*" columns become integers taken from the first number in the text;
#     * "Sex*" columns lose their "<label>: " prefix;
#     * "PMI*" columns become numerics taken from the first number in the text;
#     * "Braak*" columns lose their prefix and are converted from roman
#       numerals to integers.
#   Other columns are returned unchanged.
cinfo <- function(mat){
  if (is.null(colnames(mat))) {
    return(mat)
  }

  # Pull the first numeric token ("85", "85.5", ".5") out of each string.
  # Stripping every non-digit instead would turn "85.5" into 855, and keeping
  # every dot would turn "pmi (hrs.): 3" into ".3".
  first_number <- function(x) {
    x <- as.character(x)
    pos <- regexpr("[0-9]+(\\.[0-9]+)?|\\.[0-9]+", x)
    found <- !is.na(pos) & pos > 0L
    out <- rep(NA_character_, length(x))
    out[found] <- regmatches(x, pos)
    out
  }

  # Drop everything up to and including the last ": " label separator.
  strip_label <- function(x) {
    gsub(".+:\\s", "", x)
  }

  # seq_len(...)[-1] is empty for a one-column input, unlike 2:1.
  for (j in seq_len(ncol(mat))[-1]) {
    col_name <- colnames(mat)[j]

    if (grepl("Group", col_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    } else if (grepl("Age", col_name)) {
      # Kept as an integer column; fractional ages are truncated.
      mat[, j] <- as.integer(as.numeric(first_number(mat[, j])))
    } else if (grepl("Sex", col_name)) {
      mat[, j] <- strip_label(mat[, j])
    } else if (grepl("PMI", col_name)) {
      mat[, j] <- as.numeric(first_number(mat[, j]))
    } else if (grepl("Braak", col_name)) {
      mat[, j] <- as.integer(utils::as.roman(strip_label(mat[, j])))
    }
  }

  mat
}
86 86
87 #3#Function for labeling the gene IDs without names 87 #3#Function for labeling the gene IDs without names
# Fill in missing gene names with the corresponding gene ID.
#
# Args:
#   GIDNAM: data frame (or matrix) whose first column holds the gene IDs and
#           whose second column holds the gene names.
#
# Returns:
#   `GIDNAM` where every entry of column 2 that is NA, or is the text "NA"
#   (optionally followed by whitespace), has been replaced by the value of
#   column 1 on the same row. Factor columns are handled as character.
NAFIXING <- function(GIDNAM){
  # A 1:row loop would iterate over c(1, 0) here and touch a non-existent row.
  if (nrow(GIDNAM) == 0L) {
    return(GIDNAM)
  }

  is_df <- is.data.frame(GIDNAM)
  ids <- if (is_df) GIDNAM[[1]] else GIDNAM[, 1]
  symbols <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]

  # Assigning an unseen value into a factor would produce NA, so work on text.
  if (is.factor(ids)) {
    ids <- as.character(ids)
  }
  if (is.factor(symbols)) {
    symbols <- as.character(symbols)
  }

  unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)
  symbols[unnamed] <- ids[unnamed]

  if (is_df) {
    GIDNAM[[2]] <- symbols
  } else {
    GIDNAM[, 2] <- symbols
  }
  GIDNAM
}
99 99
100 #4#Function for changing the gene ID to gene name 100 #4#Function for changing the gene ID to gene name
# Replace gene IDs with gene names.
#
# Both arguments are transposed internally, so after transposition:
#   * row 1 of t(GeneName) holds the gene IDs and row 2 the gene names;
#   * row 1 of t(DATA) holds the gene IDs to translate.
#
# Args:
#   GeneName: ID-to-name table (IDs in column 1, names in column 2).
#   DATA:     table whose first column holds the gene IDs.
#
# Returns:
#   t(DATA) with every ID in row 1 that is found in the lookup replaced by its
#   gene name. IDs that are not found are left as they are. When an ID occurs
#   more than once in the lookup, the first occurrence is used.
cgeneID <- function(GeneName,DATA){
  lookup <- t(GeneName)
  out <- t(DATA)

  if (ncol(out) == 0L || ncol(lookup) == 0L) {
    return(out)
  }

  ids <- out[1, ]

  # Exact matching: pasting the ID into a regular expression mishandles IDs
  # containing metacharacters such as "+", "." or "(", and costs one regex
  # scan of the whole lookup per ID.
  where <- match(ids, lookup[1, ])
  found <- !is.na(ids) & !is.na(where)

  out[1, found] <- lookup[2, where[found]]
  out
}
119 #cgeneID <- function(GeneName,DATA){ 119 #cgeneID <- function(GeneName,DATA){
120 # colGene <- dim(GeneName)[2] 120 # colGene <- dim(GeneName)[2]
121 # j <- 1 121 # j <- 1
122 # for(j in 1:colGene){ 122 # for(j in 1:colGene){
123 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,]) 123 # chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
124 # if(is.na(sum(chngsreq))==FALSE){ 124 # if(is.na(sum(chngsreq))==FALSE){
125 # if(sum(chngsreq) > 0){ 125 # if(sum(chngsreq) > 0){
126 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq]) 126 # DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
127 # } 127 # }
128 # } 128 # }
129 # j = j+1 129 # j = j+1
130 # } 130 # }
131 # DATA 131 # DATA
132 #} 132 #}
133 133
134 #5#Function for adjusting the gene names 134 #5#Function for adjusting the gene names
# Pick one gene name per column from "///"-separated column names.
#
# Args:
#   DiData: matrix or data frame whose column names may list several gene
#           names separated by "///".
#   usecol: which of the separated names to keep (default: the first). When a
#           column has fewer than `usecol` names, its first name is used.
#
# Returns:
#   Character vector with one whitespace-trimmed name per column of `DiData`
#   (character(0) when there are no columns).
gcnames <- function(DiData,usecol=1){
  # seq-safe: 1:0 would visit a column that does not exist.
  if (ncol(DiData) == 0L) {
    return(character(0))
  }

  col_names <- colnames(DiData)
  if (is.null(col_names)) {
    stop("DiData must have column names", call. = FALSE)
  }

  # Split every name once instead of twice per column.
  pieces <- strsplit(col_names, "///")

  vapply(
    pieces,
    function(p) {
      chosen <- if (length(p) >= usecol) p[usecol] else p[1]
      trimws(chosen)
    },
    character(1),
    USE.NAMES = FALSE
  )
}
150 150
151 #6# Function for discretizing the data 151 #6# Function for discretizing the data
# Discretize numeric data into three levels.
#
# Args:
#   NDATA: numeric matrix (or all-numeric data frame).
#
# Returns:
#   Numeric matrix with the same dimensions and column names as `NDATA`:
#     0  where the value is below -1,
#     1  where the value lies in [-1, 1],
#     2  where the value is above 1,
#   and missing values (NA/NaN) carried over unchanged.
dndat <- function(NDATA){
  values <- as.matrix(NDATA)

  # Start everything in the middle class, then move the tails out. This also
  # puts a value of exactly 1 in the middle class; a chain ending in
  # `-1 <= x && x < 1` would leave it at the initial 0, i.e. the low class.
  DDATA <- matrix(1, nrow = nrow(values), ncol = ncol(values))
  colnames(DDATA) <- colnames(NDATA)

  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2

  is_missing <- is.na(values)
  DDATA[is_missing] <- values[is_missing]

  DDATA
}
179 179
180 180
181 #MajorFunction#This is the function that does everything else 181 #MajorFunction#This is the function that does everything else
182 THEFT <- function(){ 182 THEFT <- function(){
183 #Set working directory based on the directory of the series matrix file Currently only works for windows 183 #Set working directory based on the directory of the series matrix file Currently only works for windows
184 wd <- getwd() 184 wd <- getwd()
185 #list.files() 185 #list.files()
186 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?") 186 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?")
187 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L) 187 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
188 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files()) 188 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())
189 GSEfloc <- list.files()[GSEfileloc] 189 GSEfloc <- list.files()[GSEfileloc]
190 #ALL DATA FILES WILL BE CLEANED 190 #ALL DATA FILES WILL BE CLEANED
191 if(numDAT == 1){ 191 if(numDAT == 1){
192 #indexing the data files 192 #indexing the data files
193 n <- 1 193 n <- 1
194 for(n in 1: length(GSEfloc)){ 194 for(n in 1: length(GSEfloc)){
195 alz <- GSEfloc[n] 195 alz <- GSEfloc[n]
196 196
197 #Working with the wordy part of the document 197 #Working with the wordy part of the document
198 alzword <- alz %>% 198 alzword <- alz %>%
199 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 199 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
200 filter(grepl("!Sample",X1))%>% 200 filter(grepl("!Sample",X1))%>%
201 filter(!grepl("!Sample_contact",X1)) 201 filter(!grepl("!Sample_contact",X1))
202 202
203 #Getting the GPL file 203 #Getting the GPL file
204 genena <- grep("_platform_id",alzword$X1) %>% 204 genena <- grep("_platform_id",alzword$X1) %>%
205 alzword$X2[.] %>% 205 alzword$X2[.] %>%
206 str_trim(.) %>% 206 str_trim(.) %>%
207 paste0("^",.,"\\D") %>% 207 paste0("^",.,"\\D") %>%
208 grep(.,list.files()) %>% 208 grep(.,list.files()) %>%
209 list.files()[.] 209 list.files()[.]
210 210
211 #Find out if it is a soft GPL file or not 211 #Find out if it is a soft GPL file or not
212 soft <- strsplit(genena,"[\\|/]") %>% 212 soft <- strsplit(genena,"[\\|/]") %>%
213 .[[1]] %>% 213 .[[1]] %>%
214 .[length(.)] %>% 214 .[length(.)] %>%
215 grepl("soft",.) 215 grepl("soft",.)
216 216
217 ##Changing row names and column names: 217 ##Changing row names and column names:
218 ALZWORD <- t(alzword) 218 ALZWORD <- t(alzword)
219 rownames(ALZWORD)=NULL 219 rownames(ALZWORD)=NULL
220 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 220 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
221 ALZWORD <- chngrownm(ALZWORD)[-1,] 221 ALZWORD <- chngrownm(ALZWORD)[-1,]
222 ALZWORD <- ALZWORD%>% 222 ALZWORD <- ALZWORD%>%
223 as.data.frame()%>% 223 as.data.frame()%>%
224 dplyr::select(-starts_with("col")) 224 dplyr::select(-starts_with("col"))
225 225
226 ##Reorganizing information within the columns and final clinical data 226 ##Reorganizing information within the columns and final clinical data
227 ALZWORDF <- cinfo(ALZWORD) 227 ALZWORDF <- cinfo(ALZWORD)
228 228
229 229
230 #Working with Actual Data part of file 230 #Working with Actual Data part of file
231 alzdat <- alz %>% 231 alzdat <- alz %>%
232 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 232 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
233 ALZDAT <- t(alzdat[,-1]) 233 ALZDAT <- t(alzdat[,-1])
234 rownames(ALZDAT)=NULL 234 rownames(ALZDAT)=NULL
235 235
236 ##Is there a clean version of the GPL file available? 236 ##Is there a clean version of the GPL file available?
237 gplnum <- strsplit(genena,"[\\|/]") %>% 237 gplnum <- strsplit(genena,"[\\|/]") %>%
238 .[[1]] %>% 238 .[[1]] %>%
239 .[length(.)] %>% 239 .[length(.)] %>%
240 gsub("\\D","",.) 240 gsub("\\D","",.)
241 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 241 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
242 if(clfileex >= 1){ 242 if(clfileex >= 1){
243 #use the clean version 243 #use the clean version
244 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 244 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
245 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 245 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
246 246
247 } 247 } else if(clfileex == 0){
248 else if(clfileex == 0){
249 ##Lets Create a clean version 248 ##Lets Create a clean version
250 249
251 ##Gene ID to Gene Name 250 ##Gene ID to Gene Name
252 if(soft == TRUE){ 251 if(soft == TRUE){
253 #Check to see if there is already a file containing information on soft files 252 #Check to see if there is already a file containing information on soft files
254 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 253 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
255 if(fileex == 1){ 254 if(fileex == 1){
256 #Check to see if this GPL soft file has been used before 255 #Check to see if this GPL soft file has been used before
257 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 256 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
258 .$GPL_FILE_NUM%>% 257 .$GPL_FILE_NUM%>%
259 grepl(gplnum,.) %>% 258 grepl(gplnum,.) %>%
260 sum() 259 sum()
261 if(IDF == 1){ 260 if(IDF == 1){
262 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 261 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
263 .$GPL_FILE_NUM%>% 262 .$GPL_FILE_NUM%>%
264 grep(gplnum,.) 263 grep(gplnum,.)
265 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 264 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
266 .$LOC_ID %>% 265 .$LOC_ID %>%
267 .[IDLOCAL] 266 .[IDLOCAL]
268 geneIDNam <- genena %>% 267 geneIDNam <- genena %>%
269 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 268 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
270 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 269 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
271 } else if(IDF == 0){ 270 } else if(IDF == 0){
272 #No information on this particular GPL file 271 #No information on this particular GPL file
273 idLOCGPL <- genena %>% 272 idLOCGPL <- genena %>%
274 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 273 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
275 t(.) %>% 274 t(.) %>%
276 grep("^ID\\s*$",.) %>% 275 grep("^ID\\s*$",.) %>%
277 -1 276 -1
278 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 277 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
279 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 278 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
280 geneIDNam <- genena %>% 279 geneIDNam <- genena %>%
281 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 280 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
282 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 281 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
283 } 282 }
284 } else if(fileex == 0){ 283 } else if(fileex == 0){
285 #We must create a file that we can access for later use 284 #We must create a file that we can access for later use
286 idLOCGPL <- genena %>% 285 idLOCGPL <- genena %>%
287 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 286 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
288 t(.) %>% 287 t(.) %>%
289 grep("^ID\\s*$",.) %>% 288 grep("^ID\\s*$",.) %>%
290 -1 289 -1
291 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 290 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
292 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 291 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
293 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 292 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
294 geneIDNam <- genena %>% 293 geneIDNam <- genena %>%
295 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 294 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
296 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 295 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
297 } 296 }
298 } else if(soft == FALSE){ 297 } else if(soft == FALSE){
299 geneIDNam <- genena %>% 298 geneIDNam <- genena %>%
300 read_delim(delim="\t",comment = "#")%>% 299 read_delim(delim="\t",comment = "#")%>%
301 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 300 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
302 } 301 }
303 302
304 ##Labeling the gene IDs without names 303 ##Labeling the gene IDs without names
305 geneIDNam <- NAFIXING(geneIDNam) 304 geneIDNam <- NAFIXING(geneIDNam)
306 305
307 ##remove the whitespace 306 ##remove the whitespace
308 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 307 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
309 308
310 ##Here is the clean version 309 ##Here is the clean version
311 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 310 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
312 } 311 }
313 312
314 313
315 314
316 ##Changing the gene ID to gene name 315 ##Changing the gene ID to gene name
317 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) 316 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
318 colnames(ALZDAT) = ALZDAT1[1,] 317 colnames(ALZDAT) = ALZDAT1[1,]
319 318
320 319
321 ##Adjusting the column names aka the gene names 320 ##Adjusting the column names aka the gene names
322 colnames(ALZDAT) <- gcnames(ALZDAT) 321 colnames(ALZDAT) <- gcnames(ALZDAT)
323 322
324 323
325 #Full RAW Data 324 #Full RAW Data
326 Fullalzdwr <- ALZDAT %>% 325 Fullalzdwr <- ALZDAT %>%
327 as.data.frame() %>% 326 as.data.frame() %>%
328 cbind(ALZWORDF,.) 327 cbind(ALZWORDF,.)
329 328
330 #Raw file is output 329 #Raw file is output
331 nfnaex <- strsplit(alz,"[\\]") %>% 330 nfnaex <- strsplit(alz,"[\\]") %>%
332 .[[1]] %>% 331 .[[1]] %>%
333 .[length(.)] %>% 332 .[length(.)] %>%
334 gsub("\\D","",.) %>% 333 gsub("\\D","",.) %>%
335 c("GSE",.,"aftexcel.txt") %>% 334 c("GSE",.,"aftexcel.txt") %>%
336 paste(collapse = "") 335 paste(collapse = "")
337 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 336 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
338 337
339 338
340 339
341 #Now for the discretization part 340 #Now for the discretization part
342 ##get the wordy part again 341 ##get the wordy part again
343 rawword <- t(ALZWORDF) 342 rawword <- t(ALZWORDF)
344 343
345 ##where is ID_REF located 344 ##where is ID_REF located
346 hereim <- grep("ID_REF",rownames(rawword)) 345 hereim <- grep("ID_REF",rownames(rawword))
347 346
348 ##Subject Names GSM... 347 ##Subject Names GSM...
349 subjnam <- rawword[hereim,] 348 subjnam <- rawword[hereim,]
350 349
351 ##Getting the names for the rows 350 ##Getting the names for the rows
352 namedarows <- rownames(rawword)[-hereim] %>% 351 namedarows <- rownames(rawword)[-hereim] %>%
353 as.data.frame() 352 as.data.frame()
354 RAWWORD <- rawword[-hereim,] %>% 353 RAWWORD <- rawword[-hereim,] %>%
355 as.data.frame() %>% 354 as.data.frame() %>%
356 bind_cols(namedarows,.) 355 bind_cols(namedarows,.)
357 z <- 1 356 z <- 1
358 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 357 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
359 for(z in 1:dim(RAWWORD)[1]){ 358 for(z in 1:dim(RAWWORD)[1]){
360 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 359 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
361 z <- z + 1 360 z <- z + 1
362 } 361 }
363 362
364 colnames(naroww) <- "ROW_NAs" 363 colnames(naroww) <- "ROW_NAs"
365 RAWWORD <- bind_cols(RAWWORD,naroww) 364 RAWWORD <- bind_cols(RAWWORD,naroww)
366 365
367 366
368 roALZna <- t(ALZDAT) %>% 367 roALZna <- t(ALZDAT) %>%
369 rownames(.) %>% 368 rownames(.) %>%
370 as.data.frame(.) 369 as.data.frame(.)
371 colnames(roALZna) <- "ID_REF" 370 colnames(roALZna) <- "ID_REF"
372 371
373 RAWDAT <- t(ALZDAT) %>% 372 RAWDAT <- t(ALZDAT) %>%
374 as.data.frame(.) 373 as.data.frame(.)
375 colnames(RAWDAT) <- NULL 374 colnames(RAWDAT) <- NULL
376 rownames(RAWDAT) <- NULL 375 rownames(RAWDAT) <- NULL
377 376
378 RAWDAT2 <- RAWDAT %>% 377 RAWDAT2 <- RAWDAT %>%
379 cbind(roALZna,.) %>% 378 cbind(roALZna,.) %>%
380 dplyr::arrange(.,ID_REF) 379 dplyr::arrange(.,ID_REF)
381 380
382 ##Editing the file for R processing 381 ##Editing the file for R processing
383 RAWDATID <- RAWDAT2[,1] %>% 382 RAWDATID <- RAWDAT2[,1] %>%
384 as.matrix(.) 383 as.matrix(.)
385 384
386 RAWDATNUM <- RAWDAT2[,-1] %>% 385 RAWDATNUM <- RAWDAT2[,-1] %>%
387 mapply(.,FUN = as.numeric) %>% 386 mapply(.,FUN = as.numeric) %>%
388 t(.) 387 t(.)
389 388
390 ##Consolidating genes with the same name 389 ##Consolidating genes with the same name
391 ###create empty matrix of size equal to tabRDATID 390 ###create empty matrix of size equal to tabRDATID
392 tabRDATID <- table(RAWDATID) 391 tabRDATID <- table(RAWDATID)
393 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 392 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
394 j <- 1 393 j <- 1
395 for(j in 1:length(tabRDATID)){ 394 for(j in 1:length(tabRDATID)){
396 ##Putting the ones without duplicates in their new homes 395 ##Putting the ones without duplicates in their new homes
397 if(tabRDATID[j] == 1){ 396 if(tabRDATID[j] == 1){
398 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 397 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
399 } else if(tabRDATID[j] > 1){ 398 } else if(tabRDATID[j] > 1){
400 ##Averaging duplicates and putting them in their new homes 399 ##Averaging duplicates and putting them in their new homes
401 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 400 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
402 } 401 }
403 j <- j + 1 402 j <- j + 1
404 } 403 }
405 404
406 ##Scaling the Data 405 ##Scaling the Data
407 scrawdat <- NuRDATN%>% 406 scrawdat <- NuRDATN%>%
408 scale() 407 scale()
409 attr(scrawdat,"scaled:center") <- NULL 408 attr(scrawdat,"scaled:center") <- NULL
410 attr(scrawdat,"scaled:scale") <- NULL 409 attr(scrawdat,"scaled:scale") <- NULL
411 colnames(scrawdat) <- rownames(tabRDATID) 410 colnames(scrawdat) <- rownames(tabRDATID)
412 411
413 ##Discretized the Data 412 ##Discretized the Data
414 dialzdat <- scrawdat %>% 413 dialzdat <- scrawdat %>%
415 dndat(.) %>% 414 dndat(.) %>%
416 t()%>% 415 t()%>%
417 as.data.frame(.) 416 as.data.frame(.)
418 colnames(dialzdat) <- rownames(RAWDATNUM) 417 colnames(dialzdat) <- rownames(RAWDATNUM)
419 418
420 ##setting "ID_REF" as a new variable 419 ##setting "ID_REF" as a new variable
421 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1)) 420 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
422 colnames(geneNAM) <- "ID_REF" 421 colnames(geneNAM) <- "ID_REF"
423 rownames(dialzdat) <- NULL 422 rownames(dialzdat) <- NULL
424 dialzdat <-bind_cols(geneNAM,dialzdat) 423 dialzdat <-bind_cols(geneNAM,dialzdat)
425 424
426 ##NAs in a column 425 ##NAs in a column
427 x <- 2 426 x <- 2
428 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 427 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
429 nacol[1,1] = "COL_NAs" 428 nacol[1,1] = "COL_NAs"
430 for(x in 2:dim(dialzdat)[2]){ 429 for(x in 2:dim(dialzdat)[2]){
431 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 430 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
432 x <- x + 1 431 x <- x + 1
433 } 432 }
434 colnames(nacol) <- colnames(dialzdat) 433 colnames(nacol) <- colnames(dialzdat)
435 dialzdat <- bind_rows(dialzdat,nacol) 434 dialzdat <- bind_rows(dialzdat,nacol)
436 435
437 ##NAs in a row 436 ##NAs in a row
438 y <- 1 437 y <- 1
439 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 438 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
440 for(y in 1:dim(dialzdat)[1]){ 439 for(y in 1:dim(dialzdat)[1]){
441 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 440 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
442 y <- y + 1 441 y <- y + 1
443 } 442 }
444 colnames(narowd) <- "ROW_NAs" 443 colnames(narowd) <- "ROW_NAs"
445 dialzdat <- bind_cols(dialzdat,narowd) 444 dialzdat <- bind_cols(dialzdat,narowd)
446 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 445 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
447 colnames(RAWWORD) <- colnames(dialzdat) 446 colnames(RAWWORD) <- colnames(dialzdat)
448 ##converting to character so that the clinical can be brought together with discrete data 447 ##converting to character so that the clinical can be brought together with discrete data
449 k <- 2 448 k <- 2
450 for(k in 2:dim(dialzdat)[2]-1){ 449 for(k in 2:dim(dialzdat)[2]-1){
451 dialzdat[,k] <- as.character(dialzdat[,k]) 450 dialzdat[,k] <- as.character(dialzdat[,k])
452 k <- k + 1 451 k <- k + 1
453 } 452 }
454 #The End the full data 453 #The End the full data
455 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) 454 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
456 455
457 #Produces Discrete file 456 #Produces Discrete file
458 nfnaex2 <- strsplit(alz,"[\\|/]") %>% 457 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
459 .[[1]] %>% 458 .[[1]] %>%
460 .[length(.)] %>% 459 .[length(.)] %>%
461 gsub("\\D","",.) %>% 460 gsub("\\D","",.) %>%
462 c("GSE",.,"dscrt.txt") %>% 461 c("GSE",.,"dscrt.txt") %>%
463 paste(collapse = "") 462 paste(collapse = "")
464 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) 463 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
465 n <- n +1 464 n <- n +1
466 } 465 }
467 } else if(numDAT == 2){ 466 } else if(numDAT == 2){
468 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN 467 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
469 468
470 #All the files you want to analyze 469 #All the files you want to analyze
471 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:") 470 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
472 if(length(ANDIS) == 0){ 471 if(length(ANDIS) == 0){
473 #Spit out a warning 472 #Spit out a warning
474 warning("You did not select any files and so no cleaning will be performed") 473 warning("You did not select any files and so no cleaning will be performed")
475 } else{ 474 } else{
476 #indexing the data files 475 #indexing the data files
477 n <- 1 476 n <- 1
478 for(n in 1: length(ANDIS)){ 477 for(n in 1: length(ANDIS)){
479 alz <- ANDIS[n] 478 alz <- ANDIS[n]
480 479
481 #Working with the wordy part of the document 480 #Working with the wordy part of the document
482 alzword <- alz %>% 481 alzword <- alz %>%
483 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 482 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
484 filter(grepl("!Sample",X1))%>% 483 filter(grepl("!Sample",X1))%>%
485 filter(!grepl("!Sample_contact",X1)) 484 filter(!grepl("!Sample_contact",X1))
486 485
487 #Getting the GPL file 486 #Getting the GPL file
488 genena <- grep("_platform_id",alzword$X1) %>% 487 genena <- grep("_platform_id",alzword$X1) %>%
489 alzword$X2[.] %>% 488 alzword$X2[.] %>%
490 str_trim(.) %>% 489 str_trim(.) %>%
491 paste0("^",.,"\\D") %>% 490 paste0("^",.,"\\D") %>%
492 grep(.,list.files()) %>% 491 grep(.,list.files()) %>%
493 list.files()[.] 492 list.files()[.]
494 493
495 #Find out if it is a soft GPL file or not 494 #Find out if it is a soft GPL file or not
496 soft <- strsplit(genena,"[\\|/]") %>% 495 soft <- strsplit(genena,"[\\|/]") %>%
497 .[[1]] %>% 496 .[[1]] %>%
498 .[length(.)] %>% 497 .[length(.)] %>%
499 grepl("soft",.) 498 grepl("soft",.)
500 499
501 ##Changing row names and column names: 500 ##Changing row names and column names:
502 ALZWORD <- t(alzword) 501 ALZWORD <- t(alzword)
503 rownames(ALZWORD)=NULL 502 rownames(ALZWORD)=NULL
504 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 503 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
505 ALZWORD <- chngrownm(ALZWORD)[-1,] 504 ALZWORD <- chngrownm(ALZWORD)[-1,]
506 ALZWORD <- ALZWORD%>% 505 ALZWORD <- ALZWORD%>%
507 as.data.frame()%>% 506 as.data.frame()%>%
508 dplyr::select(-starts_with("col")) 507 dplyr::select(-starts_with("col"))
509 508
510 ##Reorganizing information within the columns and final clinical data 509 ##Reorganizing information within the columns and final clinical data
511 ALZWORDF <- cinfo(ALZWORD) 510 ALZWORDF <- cinfo(ALZWORD)
512 511
513 512
514 #Working with Actual Data part of file 513 #Working with Actual Data part of file
515 alzdat <- alz %>% 514 alzdat <- alz %>%
516 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 515 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
517 ALZDAT <- t(alzdat[,-1]) 516 ALZDAT <- t(alzdat[,-1])
518 rownames(ALZDAT)=NULL 517 rownames(ALZDAT)=NULL
519 518
520 ##Is there a clean version of the GPL file available? 519 ##Is there a clean version of the GPL file available?
521 gplnum <- strsplit(genena,"[\\|/]") %>% 520 gplnum <- strsplit(genena,"[\\|/]") %>%
522 .[[1]] %>% 521 .[[1]] %>%
523 .[length(.)] %>% 522 .[length(.)] %>%
524 gsub("\\D","",.) 523 gsub("\\D","",.)
525 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 524 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
526 if(clfileex >= 1){ 525 if(clfileex >= 1){
527 #use the clean version 526 #use the clean version
528 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 527 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
529 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 528 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
530 529
531 } else if(clfileex == 0){ 530 } else if(clfileex == 0){
532 ##Lets Create a clean version 531 ##Lets Create a clean version
533 532
534 ##Gene ID to Gene Name 533 ##Gene ID to Gene Name
535 if(soft == TRUE){ 534 if(soft == TRUE){
536 #Check to see if there is already a file containing information on soft files 535 #Check to see if there is already a file containing information on soft files
537 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 536 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
538 if(fileex == 1){ 537 if(fileex == 1){
539 #Check to see if this GPL soft file has been used before 538 #Check to see if this GPL soft file has been used before
540 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 539 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
541 .$GPL_FILE_NUM%>% 540 .$GPL_FILE_NUM%>%
542 grepl(gplnum,.) %>% 541 grepl(gplnum,.) %>%
543 sum() 542 sum()
544 if(IDF == 1){ 543 if(IDF == 1){
545 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 544 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
546 .$GPL_FILE_NUM%>% 545 .$GPL_FILE_NUM%>%
547 grep(gplnum,.) 546 grep(gplnum,.)
548 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 547 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
549 .$LOC_ID %>% 548 .$LOC_ID %>%
550 .[IDLOCAL] 549 .[IDLOCAL]
551 geneIDNam <- genena %>% 550 geneIDNam <- genena %>%
552 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 551 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
553 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 552 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
554 } else if(IDF == 0){ 553 } else if(IDF == 0){
555 #No information on this particular GPL file 554 #No information on this particular GPL file
556 idLOCGPL <- genena %>% 555 idLOCGPL <- genena %>%
557 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 556 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
558 t(.) %>% 557 t(.) %>%
559 grep("^ID\\s*$",.) %>% 558 grep("^ID\\s*$",.) %>%
560 -1 559 -1
561 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 560 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
562 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 561 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
563 geneIDNam <- genena %>% 562 geneIDNam <- genena %>%
564 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 563 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
565 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 564 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
566 } 565 }
567 } else if(fileex == 0){ 566 } else if(fileex == 0){
568 #We must create a file that we can access for later use 567 #We must create a file that we can access for later use
569 idLOCGPL <- genena %>% 568 idLOCGPL <- genena %>%
570 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 569 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
571 t(.) %>% 570 t(.) %>%
572 grep("^ID\\s*$",.) %>% 571 grep("^ID\\s*$",.) %>%
573 -1 572 -1
574 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 573 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
575 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 574 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
576 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 575 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
577 geneIDNam <- genena %>% 576 geneIDNam <- genena %>%
578 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 577 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
579 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 578 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
580 } 579 }
581 } else if(soft == FALSE){ 580 } else if(soft == FALSE){
582 geneIDNam <- genena %>% 581 geneIDNam <- genena %>%
583 read_delim(delim="\t",comment = "#")%>% 582 read_delim(delim="\t",comment = "#")%>%
584 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 583 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
585 } 584 }
586 585
587 ##Labeling the gene IDs without names 586 ##Labeling the gene IDs without names
588 geneIDNam <- NAFIXING(geneIDNam) 587 geneIDNam <- NAFIXING(geneIDNam)
589 588
590 ##remove the whitespace 589 ##remove the whitespace
591 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 590 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
592 591
593 ##Here is the clean version 592 ##Here is the clean version
594 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 593 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
595 } 594 }
596 595
597 596
598 597
599 ##Changing the gene ID to gene name 598 ##Changing the gene ID to gene name
600 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) 599 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
601 colnames(ALZDAT) = ALZDAT1[1,] 600 colnames(ALZDAT) = ALZDAT1[1,]
602 601
603 602
604 ##Adjusting the column names aka the gene names 603 ##Adjusting the column names aka the gene names
605 colnames(ALZDAT) <- gcnames(ALZDAT) 604 colnames(ALZDAT) <- gcnames(ALZDAT)
606 605
607 606
608 #Full RAW Data 607 #Full RAW Data
609 Fullalzdwr <- ALZDAT %>% 608 Fullalzdwr <- ALZDAT %>%
610 as.data.frame() %>% 609 as.data.frame() %>%
611 cbind(ALZWORDF,.) 610 cbind(ALZWORDF,.)
612 611
613 #Raw file is output 612 #Raw file is output
614 nfnaex <- strsplit(alz,"[\\]") %>% 613 nfnaex <- strsplit(alz,"[\\]") %>%
615 .[[1]] %>% 614 .[[1]] %>%
616 .[length(.)] %>% 615 .[length(.)] %>%
617 gsub("\\D","",.) %>% 616 gsub("\\D","",.) %>%
618 c("GSE",.,"aftexcel.txt") %>% 617 c("GSE",.,"aftexcel.txt") %>%
619 paste(collapse = "") 618 paste(collapse = "")
620 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 619 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
621 620
622 621
623 622
624 #Now for the discretization part 623 #Now for the discretization part
625 ##get the wordy part again 624 ##get the wordy part again
626 rawword <- t(ALZWORDF) 625 rawword <- t(ALZWORDF)
627 626
628 ##where is ID_REF located 627 ##where is ID_REF located
629 hereim <- grep("ID_REF",rownames(rawword)) 628 hereim <- grep("ID_REF",rownames(rawword))
630 629
631 ##Subject Names GSM... 630 ##Subject Names GSM...
632 subjnam <- rawword[hereim,] 631 subjnam <- rawword[hereim,]
633 632
634 ##Getting the names for the rows 633 ##Getting the names for the rows
635 namedarows <- rownames(rawword)[-hereim] %>% 634 namedarows <- rownames(rawword)[-hereim] %>%
636 as.data.frame() 635 as.data.frame()
637 RAWWORD <- rawword[-hereim,] %>% 636 RAWWORD <- rawword[-hereim,] %>%
638 as.data.frame() %>% 637 as.data.frame() %>%
639 bind_cols(namedarows,.) 638 bind_cols(namedarows,.)
640 z <- 1 639 z <- 1
641 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 640 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
642 for(z in 1:dim(RAWWORD)[1]){ 641 for(z in 1:dim(RAWWORD)[1]){
643 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 642 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
644 z <- z + 1 643 z <- z + 1
645 } 644 }
646 645
647 colnames(naroww) <- "ROW_NAs" 646 colnames(naroww) <- "ROW_NAs"
648 RAWWORD <- bind_cols(RAWWORD,naroww) 647 RAWWORD <- bind_cols(RAWWORD,naroww)
649 648
650 649
651 roALZna <- t(ALZDAT) %>% 650 roALZna <- t(ALZDAT) %>%
652 rownames(.) %>% 651 rownames(.) %>%
653 as.data.frame(.) 652 as.data.frame(.)
654 colnames(roALZna) <- "ID_REF" 653 colnames(roALZna) <- "ID_REF"
655 654
656 RAWDAT <- t(ALZDAT) %>% 655 RAWDAT <- t(ALZDAT) %>%
657 as.data.frame(.) 656 as.data.frame(.)
658 colnames(RAWDAT) <- NULL 657 colnames(RAWDAT) <- NULL
659 rownames(RAWDAT) <- NULL 658 rownames(RAWDAT) <- NULL
660 659
661 RAWDAT2 <- RAWDAT %>% 660 RAWDAT2 <- RAWDAT %>%
662 cbind(roALZna,.) %>% 661 cbind(roALZna,.) %>%
663 dplyr::arrange(.,ID_REF) 662 dplyr::arrange(.,ID_REF)
664 663
665 ##Editing the file for R processing 664 ##Editing the file for R processing
666 RAWDATID <- RAWDAT2[,1] %>% 665 RAWDATID <- RAWDAT2[,1] %>%
667 as.matrix(.) 666 as.matrix(.)
668 667
669 RAWDATNUM <- RAWDAT2[,-1] %>% 668 RAWDATNUM <- RAWDAT2[,-1] %>%
670 mapply(.,FUN = as.numeric) %>% 669 mapply(.,FUN = as.numeric) %>%
671 t(.) 670 t(.)
672 671
673 ##Consolidating genes with the same name 672 ##Consolidating genes with the same name
674 ###create empty matrix of size equal to tabRDATID 673 ###create empty matrix of size equal to tabRDATID
675 tabRDATID <- table(RAWDATID) 674 tabRDATID <- table(RAWDATID)
676 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 675 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
677 j <- 1 676 j <- 1
678 for(j in 1:length(tabRDATID)){ 677 for(j in 1:length(tabRDATID)){
679 ##Putting the ones without duplicates in their new homes 678 ##Putting the ones without duplicates in their new homes
680 if(tabRDATID[j] == 1){ 679 if(tabRDATID[j] == 1){
681 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 680 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
682 } else if(tabRDATID[j] > 1){ 681 } else if(tabRDATID[j] > 1){
683 ##Averaging duplicates and putting them in their new homes 682 ##Averaging duplicates and putting them in their new homes
684 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 683 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
685 } 684 }
686 j <- j + 1 685 j <- j + 1
687 } 686 }
688 687
689 ##Scaling the Data 688 ##Scaling the Data
690 scrawdat <- NuRDATN%>% 689 scrawdat <- NuRDATN%>%
691 scale() 690 scale()
692 attr(scrawdat,"scaled:center") <- NULL 691 attr(scrawdat,"scaled:center") <- NULL
693 attr(scrawdat,"scaled:scale") <- NULL 692 attr(scrawdat,"scaled:scale") <- NULL
694 colnames(scrawdat) <- rownames(tabRDATID) 693 colnames(scrawdat) <- rownames(tabRDATID)
695 694
696 ##Discretized the Data 695 ##Discretized the Data
697 dialzdat <- scrawdat %>% 696 dialzdat <- scrawdat %>%
698 dndat(.) %>% 697 dndat(.) %>%
699 t()%>% 698 t()%>%
700 as.data.frame(.) 699 as.data.frame(.)
701 colnames(dialzdat) <- rownames(RAWDATNUM) 700 colnames(dialzdat) <- rownames(RAWDATNUM)
702 701
703 ##setting "ID_REF" as a new variable 702 ##setting "ID_REF" as a new variable
704 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1)) 703 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
705 colnames(geneNAM) <- "ID_REF" 704 colnames(geneNAM) <- "ID_REF"
706 rownames(dialzdat) <- NULL 705 rownames(dialzdat) <- NULL
707 dialzdat <-bind_cols(geneNAM,dialzdat) 706 dialzdat <-bind_cols(geneNAM,dialzdat)
708 707
709 ##NAs in a column 708 ##NAs in a column
710 x <- 2 709 x <- 2
711 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 710 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
712 nacol[1,1] = "COL_NAs" 711 nacol[1,1] = "COL_NAs"
713 for(x in 2:dim(dialzdat)[2]){ 712 for(x in 2:dim(dialzdat)[2]){
714 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 713 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
715 x <- x + 1 714 x <- x + 1
716 } 715 }
717 colnames(nacol) <- colnames(dialzdat) 716 colnames(nacol) <- colnames(dialzdat)
718 dialzdat <- bind_rows(dialzdat,nacol) 717 dialzdat <- bind_rows(dialzdat,nacol)
719 718
720 ##NAs in a row 719 ##NAs in a row
721 y <- 1 720 y <- 1
722 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 721 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
723 for(y in 1:dim(dialzdat)[1]){ 722 for(y in 1:dim(dialzdat)[1]){
724 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 723 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
725 y <- y + 1 724 y <- y + 1
726 } 725 }
727 colnames(narowd) <- "ROW_NAs" 726 colnames(narowd) <- "ROW_NAs"
728 dialzdat <- bind_cols(dialzdat,narowd) 727 dialzdat <- bind_cols(dialzdat,narowd)
729 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 728 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
730 colnames(RAWWORD) <- colnames(dialzdat) 729 colnames(RAWWORD) <- colnames(dialzdat)
731 ##converting to character so that the clinical can be brought together with discrete data 730 ##converting to character so that the clinical can be brought together with discrete data
732 k <- 2 731 k <- 2
733 for(k in 2:dim(dialzdat)[2]-1){ 732 for(k in 2:dim(dialzdat)[2]-1){
734 dialzdat[,k] <- as.character(dialzdat[,k]) 733 dialzdat[,k] <- as.character(dialzdat[,k])
735 k <- k + 1 734 k <- k + 1
736 } 735 }
737 #The End the full data 736 #The End the full data
738 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) 737 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
739 738
740 #Produces Discrete file 739 #Produces Discrete file
741 nfnaex2 <- strsplit(alz,"[\\|/]") %>% 740 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
742 .[[1]] %>% 741 .[[1]] %>%
743 .[length(.)] %>% 742 .[length(.)] %>%
744 gsub("\\D","",.) %>% 743 gsub("\\D","",.) %>%
745 c("GSE",.,"dscrt.txt") %>% 744 c("GSE",.,"dscrt.txt") %>%
746 paste(collapse = "") 745 paste(collapse = "")
747 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) 746 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
748 747
749 748
750 n <- n + 1 749 n <- n + 1
751 } 750 }
752 } 751 }
753 } 752 }
754 } 753 }
755 #The Rest of this code will be used every time you want to change a data set 754 #The Rest of this code will be used every time you want to change a data set
756 THEFT() 755 THEFT()