Commit ae780ea6506e69a475d10fb59354601548e140cf

Authored by Efrain Gonzalez
1 parent 83db0077e4
Exists in master

Don't use this code

fixed issue with using GSEfileloc
Showing 1 changed file with 3 additions and 3 deletions   Show diff stats
1 ######################################################################## 1 ########################################################################
2 # Don't Use This Code Just Yet # 2 # Don't Use This Code Just Yet #
3 ######################################################################## 3 ########################################################################
4 #Efrain H. Gonzalez 4 #Efrain H. Gonzalez
5 #6/16/2017 5 #6/16/2017
6 6
7 #Libraries required to run the code 7 #Libraries required to run the code
8 library(pryr) 8 library(pryr)
9 library(MASS) 9 library(MASS)
10 library(dplyr) 10 library(dplyr)
11 library(tidyr) 11 library(tidyr)
12 library(readr) 12 library(readr)
13 library(stringr) 13 library(stringr)
14 14
15 15
16 #Necessary Functions 16 #Necessary Functions
17 #1#Function for handling the changing of row names and column names 17 #1#Function for handling the changing of row names and column names
# Rename the columns of a transposed GEO "!Sample" table.
#
# mat: character matrix (with column names already set) whose first row holds
#      the GEO field name of each column and whose second row holds the first
#      sample's value for that field.
#
# Columns whose field name is one of the three fixed GEO fields get a fixed
# label. Every other column is classified by searching the first sample's
# value for keywords; the first matching category wins and the column is
# named "<Category><k>", where k counts the columns of that category seen so
# far. Columns that match nothing keep their existing name.
#
# Returns mat with updated column names; the cell contents are untouched.
chngrownm <- function(mat) {
  # GEO field name -> fixed column label.
  fixed_labels <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )
  # Keyword patterns, in priority order (earlier entries win).
  value_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  # Next suffix to hand out for each category.
  next_index <- rep(1, length(value_patterns))
  names(next_index) <- names(value_patterns)

  # seq_len() yields an empty loop for a zero-column matrix, where 1:ncol
  # would try to read column 1 and then column 0.
  for (e in seq_len(ncol(mat))) {
    field <- as.character(mat[1, e])
    # %in% is FALSE for NA, so a missing field name cannot crash the test.
    if (field %in% names(fixed_labels)) {
      colnames(mat)[e] <- fixed_labels[[field]]
      next
    }
    hits <- vapply(value_patterns, grepl, logical(1), x = mat[2, e])
    matched <- names(value_patterns)[hits]
    if (length(matched) > 0) {
      label <- matched[1]
      colnames(mat)[e] <- paste0(label, next_index[[label]])
      next_index[[label]] <- next_index[[label]] + 1
    }
  }
  mat
}
63 63
64 #2#Function for reorganizing information within the columns 64 #2#Function for reorganizing information within the columns
# Clean the clinical columns of the sample table in place.
#
# mat: data frame whose column names were assigned by chngrownm(). The first
#      column is never modified. Every later column is cleaned according to
#      the category that appears in its name:
#        Group -> text up to and including the last ": " is removed, as is
#                 any trailing " ...;..." part
#        Age   -> every non-digit character is removed, result is integer
#                 (note: a decimal such as "85.5" therefore becomes 855)
#        Sex   -> text up to and including the last ": " is removed
#        PMI   -> everything except digits and "." is removed, result is
#                 numeric
#        Braak -> text up to and including the last ": " is removed, the rest
#                 is read as a Roman numeral and stored as integer
#      Columns matching none of these are left as they are.
#
# Returns the cleaned table.
cinfo <- function(mat) {
  # seq_len(n)[-1] is empty when there is only one column; 2:n would count
  # down (2, 1) and end up rewriting the first column.
  for (j in seq_len(ncol(mat))[-1]) {
    column_name <- colnames(mat)[j]
    if (grepl("Group", column_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    } else if (grepl("Age", column_name)) {
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    } else if (grepl("Sex", column_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    } else if (grepl("PMI", column_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    } else if (grepl("Braak", column_name)) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
92 92
93 #3#Function for labeling the gene IDs without names 93 #3#Function for labeling the gene IDs without names
# Give every probe without a gene name its own ID as the name.
#
# GIDNAM: two-dimensional table (data frame, tibble or matrix) whose first
#         column holds the IDs and whose second column holds the names.
#
# A name counts as missing when it is NA or is the literal text "NA",
# optionally followed by whitespace. Missing names are overwritten with the
# ID from the same row; all other cells are left alone.
#
# Returns the table with the second column filled in.
NAFIXING <- function(GIDNAM) {
  # [[ ]] extracts a plain vector from data frames and tibbles alike; a
  # matrix needs [, k] instead.
  is_frame <- is.data.frame(GIDNAM)
  if (is_frame) {
    ids <- GIDNAM[[1]]
    symbols <- GIDNAM[[2]]
  } else {
    ids <- GIDNAM[, 1]
    symbols <- GIDNAM[, 2]
  }

  # Vectorized over all rows, so an empty table is simply returned unchanged
  # (a 1:nrow loop would visit rows 1 and 0 that do not exist).
  unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)
  symbols[unnamed] <- ids[unnamed]

  if (is_frame) {
    GIDNAM[[2]] <- symbols
  } else {
    GIDNAM[, 2] <- symbols
  }
  GIDNAM
}
105 105
106 #4#Function for changing the gene ID to gene name 106 #4#Function for changing the gene ID to gene name
# Replace probe IDs in the first row of DATA with their gene names.
#
# GeneName: two-row matrix; row 1 holds the probe IDs and row 2 the gene
#           name for the ID in the same column.
# DATA:     matrix whose first row holds probe IDs.
#
# Each entry of DATA[1, ] that is exactly equal to an ID in GeneName[1, ] is
# replaced by the corresponding name (the first one, if an ID is listed more
# than once). Entries with no matching ID, and all other rows, are unchanged.
#
# IDs are compared as literal strings. Building a regular expression from
# each ID would let characters such as "." or "+" match unrelated IDs, and
# would rescan the whole header once per gene.
#
# Returns DATA with the first row translated.
cgeneID <- function(GeneName, DATA) {
  probe_ids <- as.character(GeneName[1, ])
  gene_names <- GeneName[2, ]
  header <- as.character(DATA[1, ])

  lookup <- match(header, probe_ids)
  # match() pairs NA with NA; a missing header entry must never be renamed.
  found <- which(!is.na(lookup) & !is.na(header))

  # Skip the assignment when nothing matched so that DATA keeps its type.
  if (length(found) > 0) {
    DATA[1, found] <- gene_names[lookup[found]]
  }
  DATA
}
121 121
122 #5#Function for adjusting the gene names 122 #5#Function for adjusting the gene names
# Pick one gene name out of each "///"-separated column name.
#
# DiData: matrix or data frame whose column names may list several gene
#         names separated by "///".
# usecol: position of the name to keep (default 1). When a column name has
#         fewer than usecol parts, its first part is used instead.
#
# Returns a character vector with one whitespace-trimmed name per column of
# DiData (character(0) when DiData has no columns).
gcnames <- function(DiData, usecol = 1) {
  full_names <- colnames(DiData)
  # seq_len() gives an empty result for zero columns, where 1:ncol would
  # index column names that do not exist.
  vapply(
    seq_len(ncol(DiData)),
    function(i) {
      # Split once and reuse the parts for both the length check and the pick.
      parts <- strsplit(full_names[i], "///")[[1]]
      pick <- if (length(parts) >= usecol) usecol else 1
      str_trim(parts[pick])
    },
    character(1)
  )
}
138 138
139 #6# Function for discretizing the data 139 #6# Function for discretizing the data
# Discretize a numeric table into three levels.
#
# NDATA: numeric matrix (or data frame of numeric columns).
#
# Each cell is mapped to
#   0  when the value is below -1,
#   1  when the value lies in [-1, 1],
#   2  when the value is above 1,
# and missing values are copied through unchanged.
#
# The upper bound of the middle bin is inclusive. With a strict "< 1" test a
# value of exactly 1 matches no bin and is left at 0, i.e. in the low bin,
# even though -1 is placed in the middle bin.
#
# Returns a numeric matrix of the same dimensions as NDATA that carries
# NDATA's column names and no row names.
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)

  # Start with every cell in the middle bin, then move the outliers.
  DDATA <- matrix(1, nrow = nrow(values), ncol = ncol(values))
  colnames(DDATA) <- colnames(NDATA)

  # which() drops the NA comparisons, so missing cells are not touched here.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2

  missing_cells <- which(is.na(values))
  DDATA[missing_cells] <- values[missing_cells]
  DDATA
}
169 169
170 170
171 #MajorFunction#This is the function that does everything else 171 #MajorFunction#This is the function that does everything else
172 THEFT <- function(){ 172 THEFT <- function(){
173 #Set working directory based on the directory of the series matrix file Currently only works for windows 173 #Set working directory based on the directory of the series matrix file Currently only works for windows
174 wd <- getwd() 174 wd <- getwd()
175 #list.files() 175 #list.files()
176 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?") 176 #gsub("wd",wd,"Do you want to clean all data files in the directory wd?")
177 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L) 177 numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
178 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files()) 178 GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())
179 179 GSEfloc <- list.files()[GSEfileloc]
180 #ALL DATA FILES WILL BE CLEANED 180 #ALL DATA FILES WILL BE CLEANED
181 if(numDAT == 1){ 181 if(numDAT == 1){
182 #indexing the data files 182 #indexing the data files
183 n <- 1 183 n <- 1
184 for(n in 1: length(GSEfileloc)){ 184 for(n in 1: length(GSEfloc)){
185 alz <- list.files()[GSEfileloc[n]] 185 alz <- GSEfloc[n]
186 186
187 #Working with the wordy part of the document 187 #Working with the wordy part of the document
188 alzword <- alz %>% 188 alzword <- alz %>%
189 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 189 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
190 filter(grepl("!Sample",X1))%>% 190 filter(grepl("!Sample",X1))%>%
191 filter(!grepl("!Sample_contact",X1)) 191 filter(!grepl("!Sample_contact",X1))
192 192
193 #Getting the GPL file 193 #Getting the GPL file
194 genena <- grep("_platform_id",alzword$X1) %>% 194 genena <- grep("_platform_id",alzword$X1) %>%
195 alzword$X2[.] %>% 195 alzword$X2[.] %>%
196 str_trim(.) %>% 196 str_trim(.) %>%
197 paste0("^",.,"\\D") %>% 197 paste0("^",.,"\\D") %>%
198 grep(.,list.files()) %>% 198 grep(.,list.files()) %>%
199 list.files()[.] 199 list.files()[.]
200 200
201 #Find out if it is a soft GPL file or not 201 #Find out if it is a soft GPL file or not
202 soft <- strsplit(genena,"[\\|/]") %>% 202 soft <- strsplit(genena,"[\\|/]") %>%
203 .[[1]] %>% 203 .[[1]] %>%
204 .[length(.)] %>% 204 .[length(.)] %>%
205 grepl("soft",.) 205 grepl("soft",.)
206 206
207 ##Changing row names and column names: 207 ##Changing row names and column names:
208 ALZWORD <- t(alzword) 208 ALZWORD <- t(alzword)
209 rownames(ALZWORD)=NULL 209 rownames(ALZWORD)=NULL
210 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 210 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
211 ALZWORD <- chngrownm(ALZWORD)[-1,] 211 ALZWORD <- chngrownm(ALZWORD)[-1,]
212 ALZWORD <- ALZWORD%>% 212 ALZWORD <- ALZWORD%>%
213 as.data.frame()%>% 213 as.data.frame()%>%
214 dplyr::select(-starts_with("col")) 214 dplyr::select(-starts_with("col"))
215 215
216 ##Reorganizing information within the columns and final clinical data 216 ##Reorganizing information within the columns and final clinical data
217 ALZWORDF <- cinfo(ALZWORD) 217 ALZWORDF <- cinfo(ALZWORD)
218 218
219 219
220 #Working with Actual Data part of file 220 #Working with Actual Data part of file
221 alzdat <- alz %>% 221 alzdat <- alz %>%
222 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 222 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
223 ALZDAT <- t(alzdat[,-1]) 223 ALZDAT <- t(alzdat[,-1])
224 rownames(ALZDAT)=NULL 224 rownames(ALZDAT)=NULL
225 225
226 ##Is there a clean version of the GPL file available? 226 ##Is there a clean version of the GPL file available?
227 gplnum <- strsplit(genena,"[\\|/]") %>% 227 gplnum <- strsplit(genena,"[\\|/]") %>%
228 .[[1]] %>% 228 .[[1]] %>%
229 .[length(.)] %>% 229 .[length(.)] %>%
230 gsub("\\D","",.) 230 gsub("\\D","",.)
231 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 231 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
232 if(clfileex >= 1){ 232 if(clfileex >= 1){
233 #use the clean version 233 #use the clean version
234 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 234 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
235 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 235 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
236 236
237 } 237 }
238 else if(clfileex == 0){ 238 else if(clfileex == 0){
239 ##Lets Create a clean version 239 ##Lets Create a clean version
240 240
241 ##Gene ID to Gene Name 241 ##Gene ID to Gene Name
242 if(soft == TRUE){ 242 if(soft == TRUE){
243 #Check to see if there is already a file containing information on soft files 243 #Check to see if there is already a file containing information on soft files
244 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 244 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
245 if(fileex == 1){ 245 if(fileex == 1){
246 #Check to see if this GPL soft file has been used before 246 #Check to see if this GPL soft file has been used before
247 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 247 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
248 .$GPL_FILE_NUM%>% 248 .$GPL_FILE_NUM%>%
249 grepl(gplnum,.) %>% 249 grepl(gplnum,.) %>%
250 sum() 250 sum()
251 if(IDF == 1){ 251 if(IDF == 1){
252 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 252 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
253 .$GPL_FILE_NUM%>% 253 .$GPL_FILE_NUM%>%
254 grep(gplnum,.) 254 grep(gplnum,.)
255 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 255 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
256 .$LOC_ID %>% 256 .$LOC_ID %>%
257 .[IDLOCAL] 257 .[IDLOCAL]
258 geneIDNam <- genena %>% 258 geneIDNam <- genena %>%
259 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 259 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
260 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 260 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
261 } 261 }
262 else if(IDF == 0){ 262 else if(IDF == 0){
263 #No information on this particular GPL file 263 #No information on this particular GPL file
264 idLOCGPL <- genena %>% 264 idLOCGPL <- genena %>%
265 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 265 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
266 t(.) %>% 266 t(.) %>%
267 grep("^ID\\s*$",.) %>% 267 grep("^ID\\s*$",.) %>%
268 -1 268 -1
269 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 269 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
270 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 270 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
271 geneIDNam <- genena %>% 271 geneIDNam <- genena %>%
272 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 272 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
273 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 273 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
274 } 274 }
275 } 275 }
276 else if(fileex == 0){ 276 else if(fileex == 0){
277 #We must create a file that we can access for later use 277 #We must create a file that we can access for later use
278 idLOCGPL <- genena %>% 278 idLOCGPL <- genena %>%
279 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 279 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
280 t(.) %>% 280 t(.) %>%
281 grep("^ID\\s*$",.) %>% 281 grep("^ID\\s*$",.) %>%
282 -1 282 -1
283 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 283 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
284 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 284 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
285 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 285 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
286 geneIDNam <- genena %>% 286 geneIDNam <- genena %>%
287 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 287 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
288 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 288 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
289 } 289 }
290 } 290 }
291 else if(soft == FALSE){ 291 else if(soft == FALSE){
292 geneIDNam <- genena %>% 292 geneIDNam <- genena %>%
293 read_delim(delim="\t",comment = "#")%>% 293 read_delim(delim="\t",comment = "#")%>%
294 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 294 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
295 } 295 }
296 296
297 ##Labeling the gene IDs without names 297 ##Labeling the gene IDs without names
298 geneIDNam <- NAFIXING(geneIDNam) 298 geneIDNam <- NAFIXING(geneIDNam)
299 299
300 ##remove the whitespace 300 ##remove the whitespace
301 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 301 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
302 302
303 ##Here is the clean version 303 ##Here is the clean version
304 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 304 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
305 } 305 }
306 306
307 307
308 308
309 ##Changing the gene ID to gene name 309 ##Changing the gene ID to gene name
310 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) 310 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
311 colnames(ALZDAT) = ALZDAT1[1,] 311 colnames(ALZDAT) = ALZDAT1[1,]
312 312
313 313
314 ##Adjusting the column names aka the gene names 314 ##Adjusting the column names aka the gene names
315 colnames(ALZDAT) <- gcnames(ALZDAT) 315 colnames(ALZDAT) <- gcnames(ALZDAT)
316 316
317 317
318 #Full RAW Data 318 #Full RAW Data
319 Fullalzdwr <- ALZDAT %>% 319 Fullalzdwr <- ALZDAT %>%
320 as.data.frame() %>% 320 as.data.frame() %>%
321 cbind(ALZWORDF,.) 321 cbind(ALZWORDF,.)
322 322
323 #Raw file is output 323 #Raw file is output
324 nfnaex <- strsplit(alz,"[\\]") %>% 324 nfnaex <- strsplit(alz,"[\\]") %>%
325 .[[1]] %>% 325 .[[1]] %>%
326 .[length(.)] %>% 326 .[length(.)] %>%
327 gsub("\\D","",.) %>% 327 gsub("\\D","",.) %>%
328 c("GSE",.,"aftexcel.txt") %>% 328 c("GSE",.,"aftexcel.txt") %>%
329 paste(collapse = "") 329 paste(collapse = "")
330 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 330 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
331 331
332 332
333 333
334 #Now for the discretization part 334 #Now for the discretization part
335 ##get the wordy part again 335 ##get the wordy part again
336 rawword <- t(ALZWORDF) 336 rawword <- t(ALZWORDF)
337 337
338 ##where is ID_REF located 338 ##where is ID_REF located
339 hereim <- grep("ID_REF",rownames(rawword)) 339 hereim <- grep("ID_REF",rownames(rawword))
340 340
341 ##Subject Names GSM... 341 ##Subject Names GSM...
342 subjnam <- rawword[hereim,] 342 subjnam <- rawword[hereim,]
343 343
344 ##Getting the names for the rows 344 ##Getting the names for the rows
345 namedarows <- rownames(rawword)[-hereim] %>% 345 namedarows <- rownames(rawword)[-hereim] %>%
346 as.data.frame() 346 as.data.frame()
347 RAWWORD <- rawword[-hereim,] %>% 347 RAWWORD <- rawword[-hereim,] %>%
348 as.data.frame() %>% 348 as.data.frame() %>%
349 bind_cols(namedarows,.) 349 bind_cols(namedarows,.)
350 z <- 1 350 z <- 1
351 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 351 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
352 for(z in 1:dim(RAWWORD)[1]){ 352 for(z in 1:dim(RAWWORD)[1]){
353 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 353 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
354 z <- z + 1 354 z <- z + 1
355 } 355 }
356 356
357 colnames(naroww) <- "ROW_NAs" 357 colnames(naroww) <- "ROW_NAs"
358 RAWWORD <- bind_cols(RAWWORD,naroww) 358 RAWWORD <- bind_cols(RAWWORD,naroww)
359 359
360 360
361 roALZna <- t(ALZDAT) %>% 361 roALZna <- t(ALZDAT) %>%
362 rownames(.) %>% 362 rownames(.) %>%
363 as.data.frame(.) 363 as.data.frame(.)
364 colnames(roALZna) <- "ID_REF" 364 colnames(roALZna) <- "ID_REF"
365 365
366 RAWDAT <- t(ALZDAT) %>% 366 RAWDAT <- t(ALZDAT) %>%
367 as.data.frame(.) 367 as.data.frame(.)
368 colnames(RAWDAT) <- NULL 368 colnames(RAWDAT) <- NULL
369 rownames(RAWDAT) <- NULL 369 rownames(RAWDAT) <- NULL
370 370
371 RAWDAT2 <- RAWDAT %>% 371 RAWDAT2 <- RAWDAT %>%
372 cbind(roALZna,.) %>% 372 cbind(roALZna,.) %>%
373 dplyr::arrange(.,ID_REF) 373 dplyr::arrange(.,ID_REF)
374 374
375 ##Editing the file for R processing 375 ##Editing the file for R processing
376 RAWDATID <- RAWDAT2[,1] %>% 376 RAWDATID <- RAWDAT2[,1] %>%
377 as.matrix(.) 377 as.matrix(.)
378 378
379 RAWDATNUM <- RAWDAT2[,-1] %>% 379 RAWDATNUM <- RAWDAT2[,-1] %>%
380 mapply(.,FUN = as.numeric) %>% 380 mapply(.,FUN = as.numeric) %>%
381 t(.) 381 t(.)
382 382
383 ##Consolidating genes with the same name 383 ##Consolidating genes with the same name
384 ###create empty matrix of size equal to tabRDATID 384 ###create empty matrix of size equal to tabRDATID
385 tabRDATID <- table(RAWDATID) 385 tabRDATID <- table(RAWDATID)
386 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 386 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
387 j <- 1 387 j <- 1
388 for(j in 1:length(tabRDATID)){ 388 for(j in 1:length(tabRDATID)){
389 ##Putting the ones without duplicates in their new homes 389 ##Putting the ones without duplicates in their new homes
390 if(tabRDATID[j] == 1){ 390 if(tabRDATID[j] == 1){
391 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 391 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
392 } 392 }
393 ##Averaging duplicates and putting them in their new homes 393 ##Averaging duplicates and putting them in their new homes
394 else if(tabRDATID[j] > 1){ 394 else if(tabRDATID[j] > 1){
395 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 395 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
396 } 396 }
397 j <- j + 1 397 j <- j + 1
398 } 398 }
399 399
400 ##Scaling the Data 400 ##Scaling the Data
401 scrawdat <- NuRDATN%>% 401 scrawdat <- NuRDATN%>%
402 scale() 402 scale()
403 attr(scrawdat,"scaled:center") <- NULL 403 attr(scrawdat,"scaled:center") <- NULL
404 attr(scrawdat,"scaled:scale") <- NULL 404 attr(scrawdat,"scaled:scale") <- NULL
405 colnames(scrawdat) <- rownames(tabRDATID) 405 colnames(scrawdat) <- rownames(tabRDATID)
406 406
407 ##Discretized the Data 407 ##Discretized the Data
408 dialzdat <- scrawdat %>% 408 dialzdat <- scrawdat %>%
409 dndat(.) %>% 409 dndat(.) %>%
410 t()%>% 410 t()%>%
411 as.data.frame(.) 411 as.data.frame(.)
412 colnames(dialzdat) <- rownames(RAWDATNUM) 412 colnames(dialzdat) <- rownames(RAWDATNUM)
413 413
414 ##setting "ID_REF" as a new variable 414 ##setting "ID_REF" as a new variable
415 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1)) 415 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
416 colnames(geneNAM) <- "ID_REF" 416 colnames(geneNAM) <- "ID_REF"
417 rownames(dialzdat) <- NULL 417 rownames(dialzdat) <- NULL
418 dialzdat <-bind_cols(geneNAM,dialzdat) 418 dialzdat <-bind_cols(geneNAM,dialzdat)
419 419
420 ##NAs in a column 420 ##NAs in a column
421 x <- 2 421 x <- 2
422 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 422 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
423 nacol[1,1] = "COL_NAs" 423 nacol[1,1] = "COL_NAs"
424 for(x in 2:dim(dialzdat)[2]){ 424 for(x in 2:dim(dialzdat)[2]){
425 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 425 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
426 x <- x + 1 426 x <- x + 1
427 } 427 }
428 colnames(nacol) <- colnames(dialzdat) 428 colnames(nacol) <- colnames(dialzdat)
429 dialzdat <- bind_rows(dialzdat,nacol) 429 dialzdat <- bind_rows(dialzdat,nacol)
430 430
431 ##NAs in a row 431 ##NAs in a row
432 y <- 1 432 y <- 1
433 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 433 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
434 for(y in 1:dim(dialzdat)[1]){ 434 for(y in 1:dim(dialzdat)[1]){
435 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 435 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
436 y <- y + 1 436 y <- y + 1
437 } 437 }
438 colnames(narowd) <- "ROW_NAs" 438 colnames(narowd) <- "ROW_NAs"
439 dialzdat <- bind_cols(dialzdat,narowd) 439 dialzdat <- bind_cols(dialzdat,narowd)
440 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 440 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
441 colnames(RAWWORD) <- colnames(dialzdat) 441 colnames(RAWWORD) <- colnames(dialzdat)
442 ##converting to character so that the clinical can be brought together with discrete data 442 ##converting to character so that the clinical can be brought together with discrete data
443 k <- 2 443 k <- 2
444 for(k in 2:dim(dialzdat)[2]-1){ 444 for(k in 2:dim(dialzdat)[2]-1){
445 dialzdat[,k] <- as.character(dialzdat[,k]) 445 dialzdat[,k] <- as.character(dialzdat[,k])
446 k <- k + 1 446 k <- k + 1
447 } 447 }
448 #The End the full data 448 #The End the full data
449 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) 449 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
450 450
451 #Produces Discrete file 451 #Produces Discrete file
452 nfnaex2 <- strsplit(alz,"[\\|/]") %>% 452 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
453 .[[1]] %>% 453 .[[1]] %>%
454 .[length(.)] %>% 454 .[length(.)] %>%
455 gsub("\\D","",.) %>% 455 gsub("\\D","",.) %>%
456 c("GSE",.,"dscrt.txt") %>% 456 c("GSE",.,"dscrt.txt") %>%
457 paste(collapse = "") 457 paste(collapse = "")
458 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) 458 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
459 n <- n +1 459 n <- n +1
460 } 460 }
461 } 461 }
462 462
463 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN 463 #CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
464 else if(numDAT == 2){ 464 else if(numDAT == 2){
465 #All the files you want to analyze 465 #All the files you want to analyze
466 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:") 466 ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
467 if(length(ANDIS) == 0){ 467 if(length(ANDIS) == 0){
468 #Spit out a warning 468 #Spit out a warning
469 warning("You did not select any files and so no cleaning will be performed") 469 warning("You did not select any files and so no cleaning will be performed")
470 } else{ 470 } else{
471 #indexing the data files 471 #indexing the data files
472 n <- 1 472 n <- 1
473 for(n in 1: length(ANDIS)){ 473 for(n in 1: length(ANDIS)){
474 alz <- ANDIS[n] 474 alz <- ANDIS[n]
475 475
476 #Working with the wordy part of the document 476 #Working with the wordy part of the document
477 alzword <- alz %>% 477 alzword <- alz %>%
478 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% 478 read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
479 filter(grepl("!Sample",X1))%>% 479 filter(grepl("!Sample",X1))%>%
480 filter(!grepl("!Sample_contact",X1)) 480 filter(!grepl("!Sample_contact",X1))
481 481
482 #Getting the GPL file 482 #Getting the GPL file
483 genena <- grep("_platform_id",alzword$X1) %>% 483 genena <- grep("_platform_id",alzword$X1) %>%
484 alzword$X2[.] %>% 484 alzword$X2[.] %>%
485 str_trim(.) %>% 485 str_trim(.) %>%
486 paste0("^",.,"\\D") %>% 486 paste0("^",.,"\\D") %>%
487 grep(.,list.files()) %>% 487 grep(.,list.files()) %>%
488 list.files()[.] 488 list.files()[.]
489 489
490 #Find out if it is a soft GPL file or not 490 #Find out if it is a soft GPL file or not
491 soft <- strsplit(genena,"[\\|/]") %>% 491 soft <- strsplit(genena,"[\\|/]") %>%
492 .[[1]] %>% 492 .[[1]] %>%
493 .[length(.)] %>% 493 .[length(.)] %>%
494 grepl("soft",.) 494 grepl("soft",.)
495 495
496 ##Changing row names and column names: 496 ##Changing row names and column names:
497 ALZWORD <- t(alzword) 497 ALZWORD <- t(alzword)
498 rownames(ALZWORD)=NULL 498 rownames(ALZWORD)=NULL
499 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) 499 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
500 ALZWORD <- chngrownm(ALZWORD)[-1,] 500 ALZWORD <- chngrownm(ALZWORD)[-1,]
501 ALZWORD <- ALZWORD%>% 501 ALZWORD <- ALZWORD%>%
502 as.data.frame()%>% 502 as.data.frame()%>%
503 dplyr::select(-starts_with("col")) 503 dplyr::select(-starts_with("col"))
504 504
505 ##Reorganizing information within the columns and final clinical data 505 ##Reorganizing information within the columns and final clinical data
506 ALZWORDF <- cinfo(ALZWORD) 506 ALZWORDF <- cinfo(ALZWORD)
507 507
508 508
509 #Working with Actual Data part of file 509 #Working with Actual Data part of file
510 alzdat <- alz %>% 510 alzdat <- alz %>%
511 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) 511 read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
512 ALZDAT <- t(alzdat[,-1]) 512 ALZDAT <- t(alzdat[,-1])
513 rownames(ALZDAT)=NULL 513 rownames(ALZDAT)=NULL
514 514
515 ##Is there a clean version of the GPL file available? 515 ##Is there a clean version of the GPL file available?
516 gplnum <- strsplit(genena,"[\\|/]") %>% 516 gplnum <- strsplit(genena,"[\\|/]") %>%
517 .[[1]] %>% 517 .[[1]] %>%
518 .[length(.)] %>% 518 .[length(.)] %>%
519 gsub("\\D","",.) 519 gsub("\\D","",.)
520 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files())) 520 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
521 if(clfileex >= 1){ 521 if(clfileex >= 1){
522 #use the clean version 522 #use the clean version
523 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>% 523 geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
524 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!") 524 read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
525 525
526 } 526 }
527 else if(clfileex == 0){ 527 else if(clfileex == 0){
528 ##Lets Create a clean version 528 ##Lets Create a clean version
529 529
530 ##Gene ID to Gene Name 530 ##Gene ID to Gene Name
531 if(soft == TRUE){ 531 if(soft == TRUE){
532 #Check to see if there is already a file containing information on soft files 532 #Check to see if there is already a file containing information on soft files
533 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files())) 533 fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
534 if(fileex == 1){ 534 if(fileex == 1){
535 #Check to see if this GPL soft file has been used before 535 #Check to see if this GPL soft file has been used before
536 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 536 IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
537 .$GPL_FILE_NUM%>% 537 .$GPL_FILE_NUM%>%
538 grepl(gplnum,.) %>% 538 grepl(gplnum,.) %>%
539 sum() 539 sum()
540 if(IDF == 1){ 540 if(IDF == 1){
541 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 541 IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
542 .$GPL_FILE_NUM%>% 542 .$GPL_FILE_NUM%>%
543 grep(gplnum,.) 543 grep(gplnum,.)
544 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>% 544 idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
545 .$LOC_ID %>% 545 .$LOC_ID %>%
546 .[IDLOCAL] 546 .[IDLOCAL]
547 geneIDNam <- genena %>% 547 geneIDNam <- genena %>%
548 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>% 548 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
549 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 549 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
550 } 550 }
551 else if(IDF == 0){ 551 else if(IDF == 0){
552 #No information on this particular GPL file 552 #No information on this particular GPL file
553 idLOCGPL <- genena %>% 553 idLOCGPL <- genena %>%
554 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 554 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
555 t(.) %>% 555 t(.) %>%
556 grep("^ID\\s*$",.) %>% 556 grep("^ID\\s*$",.) %>%
557 -1 557 -1
558 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>% 558 cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
559 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE) 559 cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
560 geneIDNam <- genena %>% 560 geneIDNam <- genena %>%
561 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 561 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
562 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 562 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
563 } 563 }
564 } 564 }
565 else if(fileex == 0){ 565 else if(fileex == 0){
566 #We must create a file that we can access for later use 566 #We must create a file that we can access for later use
567 idLOCGPL <- genena %>% 567 idLOCGPL <- genena %>%
568 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>% 568 read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
569 t(.) %>% 569 t(.) %>%
570 grep("^ID\\s*$",.) %>% 570 grep("^ID\\s*$",.) %>%
571 -1 571 -1
572 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL)) 572 Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
573 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID") 573 colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
574 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE) 574 write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
575 geneIDNam <- genena %>% 575 geneIDNam <- genena %>%
576 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>% 576 read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
577 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 577 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
578 } 578 }
579 } 579 }
580 else if(soft == FALSE){ 580 else if(soft == FALSE){
581 geneIDNam <- genena %>% 581 geneIDNam <- genena %>%
582 read_delim(delim="\t",comment = "#")%>% 582 read_delim(delim="\t",comment = "#")%>%
583 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.))) 583 dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
584 } 584 }
585 585
586 ##Labeling the gene IDs without names 586 ##Labeling the gene IDs without names
587 geneIDNam <- NAFIXING(geneIDNam) 587 geneIDNam <- NAFIXING(geneIDNam)
588 588
589 ##remove the whitespace 589 ##remove the whitespace
590 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,]))) 590 geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
591 591
592 ##Here is the clean version 592 ##Here is the clean version
593 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE) 593 write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
594 } 594 }
595 595
596 596
597 597
598 ##Changing the gene ID to gene name 598 ##Changing the gene ID to gene name
599 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) 599 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
600 colnames(ALZDAT) = ALZDAT1[1,] 600 colnames(ALZDAT) = ALZDAT1[1,]
601 601
602 602
603 ##Adjusting the column names aka the gene names 603 ##Adjusting the column names aka the gene names
604 colnames(ALZDAT) <- gcnames(ALZDAT) 604 colnames(ALZDAT) <- gcnames(ALZDAT)
605 605
606 606
607 #Full RAW Data 607 #Full RAW Data
608 Fullalzdwr <- ALZDAT %>% 608 Fullalzdwr <- ALZDAT %>%
609 as.data.frame() %>% 609 as.data.frame() %>%
610 cbind(ALZWORDF,.) 610 cbind(ALZWORDF,.)
611 611
612 #Raw file is output 612 #Raw file is output
613 nfnaex <- strsplit(alz,"[\\]") %>% 613 nfnaex <- strsplit(alz,"[\\]") %>%
614 .[[1]] %>% 614 .[[1]] %>%
615 .[length(.)] %>% 615 .[length(.)] %>%
616 gsub("\\D","",.) %>% 616 gsub("\\D","",.) %>%
617 c("GSE",.,"aftexcel.txt") %>% 617 c("GSE",.,"aftexcel.txt") %>%
618 paste(collapse = "") 618 paste(collapse = "")
619 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t") 619 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
620 620
621 621
622 622
623 #Now for the discretization part 623 #Now for the discretization part
624 ##get the wordy part again 624 ##get the wordy part again
625 rawword <- t(ALZWORDF) 625 rawword <- t(ALZWORDF)
626 626
627 ##where is ID_REF located 627 ##where is ID_REF located
628 hereim <- grep("ID_REF",rownames(rawword)) 628 hereim <- grep("ID_REF",rownames(rawword))
629 629
630 ##Subject Names GSM... 630 ##Subject Names GSM...
631 subjnam <- rawword[hereim,] 631 subjnam <- rawword[hereim,]
632 632
633 ##Getting the names for the rows 633 ##Getting the names for the rows
634 namedarows <- rownames(rawword)[-hereim] %>% 634 namedarows <- rownames(rawword)[-hereim] %>%
635 as.data.frame() 635 as.data.frame()
636 RAWWORD <- rawword[-hereim,] %>% 636 RAWWORD <- rawword[-hereim,] %>%
637 as.data.frame() %>% 637 as.data.frame() %>%
638 bind_cols(namedarows,.) 638 bind_cols(namedarows,.)
639 z <- 1 639 z <- 1
640 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE) 640 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
641 for(z in 1:dim(RAWWORD)[1]){ 641 for(z in 1:dim(RAWWORD)[1]){
642 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,]))) 642 naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
643 z <- z + 1 643 z <- z + 1
644 } 644 }
645 645
646 colnames(naroww) <- "ROW_NAs" 646 colnames(naroww) <- "ROW_NAs"
647 RAWWORD <- bind_cols(RAWWORD,naroww) 647 RAWWORD <- bind_cols(RAWWORD,naroww)
648 648
649 649
650 roALZna <- t(ALZDAT) %>% 650 roALZna <- t(ALZDAT) %>%
651 rownames(.) %>% 651 rownames(.) %>%
652 as.data.frame(.) 652 as.data.frame(.)
653 colnames(roALZna) <- "ID_REF" 653 colnames(roALZna) <- "ID_REF"
654 654
655 RAWDAT <- t(ALZDAT) %>% 655 RAWDAT <- t(ALZDAT) %>%
656 as.data.frame(.) 656 as.data.frame(.)
657 colnames(RAWDAT) <- NULL 657 colnames(RAWDAT) <- NULL
658 rownames(RAWDAT) <- NULL 658 rownames(RAWDAT) <- NULL
659 659
660 RAWDAT2 <- RAWDAT %>% 660 RAWDAT2 <- RAWDAT %>%
661 cbind(roALZna,.) %>% 661 cbind(roALZna,.) %>%
662 dplyr::arrange(.,ID_REF) 662 dplyr::arrange(.,ID_REF)
663 663
664 ##Editing the file for R processing 664 ##Editing the file for R processing
665 RAWDATID <- RAWDAT2[,1] %>% 665 RAWDATID <- RAWDAT2[,1] %>%
666 as.matrix(.) 666 as.matrix(.)
667 667
668 RAWDATNUM <- RAWDAT2[,-1] %>% 668 RAWDATNUM <- RAWDAT2[,-1] %>%
669 mapply(.,FUN = as.numeric) %>% 669 mapply(.,FUN = as.numeric) %>%
670 t(.) 670 t(.)
671 671
672 ##Consolidating genes with the same name 672 ##Consolidating genes with the same name
673 ###create empty matrix of size equal to tabRDATID 673 ###create empty matrix of size equal to tabRDATID
674 tabRDATID <- table(RAWDATID) 674 tabRDATID <- table(RAWDATID)
675 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID)) 675 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
676 j <- 1 676 j <- 1
677 for(j in 1:length(tabRDATID)){ 677 for(j in 1:length(tabRDATID)){
678 ##Putting the ones without duplicates in their new homes 678 ##Putting the ones without duplicates in their new homes
679 if(tabRDATID[j] == 1){ 679 if(tabRDATID[j] == 1){
680 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])] 680 NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
681 } 681 }
682 ##Averaging duplicates and putting them in their new homes 682 ##Averaging duplicates and putting them in their new homes
683 else if(tabRDATID[j] > 1){ 683 else if(tabRDATID[j] > 1){
684 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE) 684 NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
685 } 685 }
686 j <- j + 1 686 j <- j + 1
687 } 687 }
688 688
689 ##Scaling the Data 689 ##Scaling the Data
690 scrawdat <- NuRDATN%>% 690 scrawdat <- NuRDATN%>%
691 scale() 691 scale()
692 attr(scrawdat,"scaled:center") <- NULL 692 attr(scrawdat,"scaled:center") <- NULL
693 attr(scrawdat,"scaled:scale") <- NULL 693 attr(scrawdat,"scaled:scale") <- NULL
694 colnames(scrawdat) <- rownames(tabRDATID) 694 colnames(scrawdat) <- rownames(tabRDATID)
695 695
696 ##Discretized the Data 696 ##Discretized the Data
697 dialzdat <- scrawdat %>% 697 dialzdat <- scrawdat %>%
698 dndat(.) %>% 698 dndat(.) %>%
699 t()%>% 699 t()%>%
700 as.data.frame(.) 700 as.data.frame(.)
701 colnames(dialzdat) <- rownames(RAWDATNUM) 701 colnames(dialzdat) <- rownames(RAWDATNUM)
702 702
703 ##setting "ID_REF" as a new variable 703 ##setting "ID_REF" as a new variable
704 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1)) 704 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
705 colnames(geneNAM) <- "ID_REF" 705 colnames(geneNAM) <- "ID_REF"
706 rownames(dialzdat) <- NULL 706 rownames(dialzdat) <- NULL
707 dialzdat <-bind_cols(geneNAM,dialzdat) 707 dialzdat <-bind_cols(geneNAM,dialzdat)
708 708
709 ##NAs in a column 709 ##NAs in a column
710 x <- 2 710 x <- 2
711 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE) 711 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
712 nacol[1,1] = "COL_NAs" 712 nacol[1,1] = "COL_NAs"
713 for(x in 2:dim(dialzdat)[2]){ 713 for(x in 2:dim(dialzdat)[2]){
714 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x]))) 714 nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
715 x <- x + 1 715 x <- x + 1
716 } 716 }
717 colnames(nacol) <- colnames(dialzdat) 717 colnames(nacol) <- colnames(dialzdat)
718 dialzdat <- bind_rows(dialzdat,nacol) 718 dialzdat <- bind_rows(dialzdat,nacol)
719 719
720 ##NAs in a row 720 ##NAs in a row
721 y <- 1 721 y <- 1
722 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE) 722 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
723 for(y in 1:dim(dialzdat)[1]){ 723 for(y in 1:dim(dialzdat)[1]){
724 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,]))) 724 narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
725 y <- y + 1 725 y <- y + 1
726 } 726 }
727 colnames(narowd) <- "ROW_NAs" 727 colnames(narowd) <- "ROW_NAs"
728 dialzdat <- bind_cols(dialzdat,narowd) 728 dialzdat <- bind_cols(dialzdat,narowd)
729 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam 729 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
730 colnames(RAWWORD) <- colnames(dialzdat) 730 colnames(RAWWORD) <- colnames(dialzdat)
731 ##converting to character so that the clinical can be brought together with discrete data 731 ##converting to character so that the clinical can be brought together with discrete data
732 k <- 2 732 k <- 2
733 for(k in 2:dim(dialzdat)[2]-1){ 733 for(k in 2:dim(dialzdat)[2]-1){
734 dialzdat[,k] <- as.character(dialzdat[,k]) 734 dialzdat[,k] <- as.character(dialzdat[,k])
735 k <- k + 1 735 k <- k + 1
736 } 736 }
737 #The End the full data 737 #The End the full data
738 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat) 738 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
739 739
740 #Produces Discrete file 740 #Produces Discrete file
741 nfnaex2 <- strsplit(alz,"[\\|/]") %>% 741 nfnaex2 <- strsplit(alz,"[\\|/]") %>%
742 .[[1]] %>% 742 .[[1]] %>%
743 .[length(.)] %>% 743 .[length(.)] %>%
744 gsub("\\D","",.) %>% 744 gsub("\\D","",.) %>%
745 c("GSE",.,"dscrt.txt") %>% 745 c("GSE",.,"dscrt.txt") %>%
746 paste(collapse = "") 746 paste(collapse = "")
747 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE) 747 write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
748 748
749 749
750 n <- n + 1 750 n <- n + 1
751 } 751 }
752 } 752 }
753 } 753 }
754 } 754 }
# Entry point: run the THEFT() pipeline (defined earlier in this file).
# Re-run this call every time you want to process a different data set.
THEFT()