Commit f378e57f408548304d0fd9656e502c92151c1dd9

Authored by Efrain Gonzalez
1 parent 58ba5d0b06
Exists in master

Updated RClean3

Showing 1 changed file with 1 additions and 1 deletions   Show diff stats
1 #Libraries required to run the code 1 #Libraries required to run the code
2 library(pryr) 2 library(pryr)
3 library(MASS) 3 library(MASS)
4 library(dplyr) 4 library(dplyr)
5 library(tidyr) 5 library(tidyr)
6 library(readr) 6 library(readr)
7 library(stringr) 7 library(stringr)
8 8
9 9
10 #Necessary Functions 10 #Necessary Functions
11 #1#Function for handling the changing of row names and column names 11 #1#Function for handling the changing of row names and column names
# Rename the columns of a transposed GEO sample-annotation matrix.
#
# mat: character matrix (or data frame) with one column per annotation field.
#      Row 1 holds the GEO field tag of the column (e.g. "!Sample_title"),
#      row 2 the first sample's value for that field.
# Returns `mat` with identical contents and updated column names:
#   * the three fixed tags become Brain_Region / Title / ID_REF;
#   * any other column whose row-2 value mentions a known keyword becomes
#     Sex<n>, PMI<n>, Age<n>, Braak<n> or Group<n> (n counts from 1 per label);
#   * all remaining columns keep their current name.
chngrownm <- function(mat) {
  # Column names are assigned by position below, so they must exist.
  if (is.null(colnames(mat))) {
    colnames(mat) <- colnames(mat, do.NULL = FALSE)
  }

  fixed_labels <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )
  # Ordered from lowest to highest priority: when several patterns match the
  # same value the LAST one wins (e.g. "braak stage" also contains "age").
  keyword_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control"
  )
  next_index <- setNames(
    rep(1L, length(keyword_patterns)),
    names(keyword_patterns)
  )
  has_values <- nrow(mat) >= 2L

  for (j in seq_len(ncol(mat))) {
    tag <- as.character(mat[1, j])

    # Fixed tags are final: the keyword rules must not overwrite them, even
    # when the sample title or tissue description happens to contain "AD",
    # "age", "control", ...
    if (tag %in% names(fixed_labels)) {
      colnames(mat)[j] <- fixed_labels[[tag]]
      next
    }
    if (!has_values) {
      next
    }

    matched <- vapply(
      keyword_patterns,
      function(pattern) grepl(pattern, mat[2, j]),
      logical(1)
    )
    if (any(matched)) {
      label <- names(keyword_patterns)[max(which(matched))]
      colnames(mat)[j] <- paste0(label, next_index[[label]])
      # Only the label actually applied consumes a number, so numbering has
      # no gaps caused by overridden matches.
      next_index[[label]] <- next_index[[label]] + 1L
    }
  }
  mat
}
57 57
58 #2#Function for reorganizing information within the columns 58 #2#Function for reorganizing information within the columns
# Clean the values of the annotation columns named by chngrownm().
#
# mat: data frame (or matrix) whose column names start with Group, Age, Sex,
#      PMI or Braak for the columns to clean; values look like "label: value".
#      The first column is never touched.
# Returns `mat` with
#   Group* -> text after the label (anything after a ";"-separated tail removed)
#   Age*   -> integer (first number of the value, truncated)
#   Sex*   -> text after the label
#   PMI*   -> numeric (first number of the value)
#   Braak* -> integer, read from a roman or arabic numeral
cinfo <- function(mat) {
  # First number found in the value part of each "label: value" string.
  # The label is removed first so digits or periods inside it (e.g. "(hrs.)")
  # cannot leak into the result; strings without a number give NA.
  first_number <- function(x) {
    value <- sub(".+:\\s", "", as.character(x))
    hit <- regexpr("[0-9]*\\.?[0-9]+", value)
    token <- rep(NA_character_, length(value))
    token[which(hit > 0)] <- regmatches(value, hit)
    as.numeric(token)
  }

  # seq_len(...)[-1] is empty for a one-column input, unlike 2:ncol(mat).
  for (j in seq_len(ncol(mat))[-1]) {
    label <- colnames(mat)[j]

    if (grepl("Group", label)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", label)) {
      # Kept as integer for compatibility; "85.5" is truncated to 85 instead
      # of being read as 855.
      mat[, j] <- as.integer(first_number(mat[, j]))
    }
    if (grepl("Sex", label)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", label)) {
      mat[, j] <- first_number(mat[, j])
    }
    if (grepl("Braak", label)) {
      stage <- gsub(".+:\\s", "", mat[, j])
      mat[, j] <- as.integer(utils::as.roman(stage))
    }
  }
  mat
}
86 86
87 #3#Function for labeling the gene IDs without names 87 #3#Function for labeling the gene IDs without names
# Give every probe without a gene name its own ID as name.
#
# GIDNAM: two-column table (data frame, tibble or matrix); column 1 holds the
#         probe ID, column 2 the gene name.
# Returns GIDNAM where each column-2 entry that is NA, or consists of the text
# "NA" (optionally followed by whitespace), is replaced by the column-1 value
# of the same row. Input without such entries is returned unchanged.
NAFIXING <- function(GIDNAM) {
  ids <- GIDNAM[, 1, drop = TRUE]
  symbols <- GIDNAM[, 2, drop = TRUE]

  # Vectorised test; also correct for zero rows, where a 1:nrow loop would
  # run with indices 1 and 0.
  unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)
  if (!any(unnamed)) {
    return(GIDNAM)
  }

  if (is.data.frame(GIDNAM)) {
    # Replace the whole column as character so IDs of another type (numeric
    # IDs, factor names) cannot trip the per-cell assignment rules.
    symbols <- as.character(symbols)
    symbols[unnamed] <- as.character(ids[unnamed])
    GIDNAM[[2]] <- symbols
  } else {
    GIDNAM[unnamed, 2] <- ids[unnamed]
  }
  GIDNAM
}
99 99
100 #4#Function for changing the gene ID to gene name 100 #4#Function for changing the gene ID to gene name
# Replace probe IDs in the first row of DATA by their gene names.
#
# GeneName: two-row table; row 1 holds probe IDs, row 2 the matching names.
# DATA:     table whose first row holds probe IDs (other rows are untouched).
# Returns DATA with every first-row entry that exactly equals an ID of
# GeneName (ignoring surrounding whitespace) replaced by that ID's name.
# Entries without a matching ID, and IDs whose name is NA, are left as they
# are. When an ID occurs more than once in GeneName its first name is used.
cgeneID <- function(GeneName, DATA) {
  if (ncol(GeneName) == 0L || ncol(DATA) == 0L) {
    return(DATA)
  }

  gene_ids <- trimws(as.character(unlist(GeneName[1, ], use.names = FALSE)))
  gene_names <- as.character(unlist(GeneName[2, ], use.names = FALSE))
  current <- trimws(as.character(unlist(DATA[1, ], use.names = FALSE)))

  # Exact lookup instead of treating each ID as a regular expression: ID "100"
  # must not rewrite "1000", and characters such as "." or "+" in an ID must
  # not act as wildcards. Empty and NA IDs never match anything.
  hit <- match(current, gene_ids, incomparables = c(NA_character_, ""))
  found <- !is.na(hit) & !is.na(gene_names[hit])

  if (any(found)) {
    DATA[1, found] <- gene_names[hit[found]]
  }
  DATA
}
112 112
113 #5#Function for adjusting the gene names 113 #5#Function for adjusting the gene names
# Pick one gene name per column from names of the form "A///B///C".
#
# DiData: table whose column names may list several gene names separated
#         by "///".
# usecol: position of the name to keep (default 1, the first one).
# Returns a character vector with one entry per column of DiData: the
# `usecol`-th "///"-separated piece of the column name, or the first piece when
# the name has fewer than `usecol` pieces. Pieces are returned exactly as they
# appear (surrounding spaces included). Columns without a name give NA.
gcnames <- function(DiData, usecol = 1) {
  labels <- colnames(DiData)
  if (is.null(labels)) {
    labels <- rep(NA_character_, ncol(DiData))
  }

  # One split for all names; an empty input simply yields character(0).
  pieces <- strsplit(as.character(labels), "///")
  vapply(
    pieces,
    function(parts) {
      if (length(parts) >= usecol) parts[usecol] else parts[1]
    },
    character(1),
    USE.NAMES = FALSE
  )
}
129 129
130 130
131 131
132 #The Rest of this code will be used every time you want to change a data set 132 #The Rest of this code will be used every time you want to change a data set
133 133
134 #Getting the series matrix file 134 #Getting the series matrix file
# Ask for the two input files: the GEO series matrix first, then the platform
# (GPL) annotation file that belongs to it.
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()

# Disabled: switch the working directory to the folder of the series matrix
# file (splits on backslashes only, i.e. Windows paths).
## strsplit(alz,"[\\]") %>%
##   .[[1]] %>%
##   .[-length(.)] %>%
##   paste(.,collapse="/") %>%
##   setwd()

# `soft` is TRUE when the file name of the GPL file (last path component)
# contains "soft".
soft <- local({
  path_parts <- strsplit(genena, "[\\|/]")[[1]]
  grepl("soft", path_parts[length(path_parts)])
})
155 155
156 #Working with the wordy part of the document 156 #Working with the wordy part of the document
# Sample annotation part of the series matrix: keep the "!Sample..." lines
# (contact details excluded); "!Series..." lines are skipped as comments.
alzword <- alz %>%
  read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
  filter(grepl("!Sample", X1)) %>%
  filter(!grepl("!Sample_contact", X1))

## Changing row names and column names:
# After transposing, row 1 holds the field tags and each further row is one
# sample; columns get placeholder names col1, col2, ... before renaming.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)

# Drop the tag row. drop = FALSE keeps the result a matrix when the file
# holds a single sample; without it the one remaining row collapses to a
# plain vector and the data-frame conversion below produces one column.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]

# Columns still carrying a placeholder name were not recognised and are
# removed. stringsAsFactors is spelled out so the column types do not depend
# on the R version in use.
ALZWORD <- ALZWORD %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  dplyr::select(-starts_with("col"))

## Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)
173 173
174 174
175 #Working with Actual Data part of file 175 #Working with Actual Data part of file
# Expression values of the series matrix: the first line is skipped, lines
# starting with "!" are ignored, and the first remaining line is the header.
alzdat <- read_delim(
  alz,
  delim = "\t",
  col_names = TRUE,
  comment = "!",
  skip = 1
)

# Transpose everything except the first column and discard the row names of
# the transposed table.
ALZDAT <- t(alzdat[, -1])
dimnames(ALZDAT) <- list(NULL, colnames(ALZDAT))
180 180
181 ##Is there a clean version of the GPL file available? 181 ##Is there a clean version of the GPL file available?
# Build `geneIDNam`, the probe-ID -> gene-name table of the chosen platform.
# A cached "Clean_GPL<number>.txt" in the working directory is reused when
# present; otherwise the table is extracted from the GPL file, cleaned and
# cached for the next run.

# Platform number = all digits of the GPL file name (last path component).
gpl_file_name <- strsplit(genena, "[\\|/]")[[1]]
gpl_file_name <- gpl_file_name[length(gpl_file_name)]
gplnum <- gsub("\\D", "", gpl_file_name)
clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")

# Test for exactly the file that is read below. A substring search over
# list.files() would also accept e.g. Clean_GPL960.txt while looking for
# GPL96 and then fail to open Clean_GPL96.txt.
clfileex <- as.integer(file.exists(clean_gpl_file))

if (clfileex >= 1) {
  # use the clean version
  geneIDNam <- clean_gpl_file %>%
    read_delim(delim = "\t", col_names = c("ID", "Symbol"), comment = "!")
}
if (clfileex == 0) {
  ## Lets Create a clean version

  ## Gene ID to Gene Name
  if (soft == TRUE) {
    # GPL_ID_LOC.txt remembers, per platform number (GPL_FILE_NUM), how many
    # lines (LOC_ID) precede the table header in its soft file.
    fileex <- as.integer(file.exists("GPL_ID_LOC.txt"))
    idlocgpl <- NA
    IDF <- 0L
    if (fileex == 1) {
      gpl_index <- read_delim("GPL_ID_LOC.txt", delim = "\t", col_names = TRUE)
      # Exact comparison of the platform number: a substring match would
      # confuse GPL96 with GPL960 or GPL1396 and could match several rows.
      IDLOCAL <- which(gpl_index$GPL_FILE_NUM == as.integer(gplnum))
      IDF <- length(IDLOCAL)
      if (IDF >= 1) {
        idlocgpl <- gpl_index$LOC_ID[IDLOCAL[1]]
      }
    }

    if (is.na(idlocgpl)) {
      # No information on this particular GPL file yet: derive the offset
      # from the first 1000 non-"!" lines as the number of cells that start
      # with a non-digit character, minus one.
      idLOCGPL <- (genena %>%
        read_delim(delim = "\t", col_names = FALSE, comment = "!", n_max = 1000) %>%
        t(.) %>%
        grep("^\\D", .) %>%
        length()) - 1

      # Record it for later runs; the header line is written only when the
      # index file is created.
      Firstval <- cbind(as.integer(gplnum), as.integer(idLOCGPL))
      colnames(Firstval) <- c("GPL_FILE_NUM", "LOC_ID")
      write.table(
        Firstval,
        file = "GPL_ID_LOC.txt",
        sep = "\t",
        row.names = FALSE,
        col.names = (fileex == 0),
        append = (fileex == 1)
      )
      idlocgpl <- idLOCGPL
    }

    geneIDNam <- genena %>%
      read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = idlocgpl) %>%
      dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))
  }
  if (soft == FALSE) {
    geneIDNam <- genena %>%
      read_delim(delim = "\t", comment = "#") %>%
      dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))
  }

  ## Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ## remove the whitespace
  # Result is a two-column character matrix: trimmed IDs, trimmed names.
  geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1, ]), str_trim(t(geneIDNam)[2, ])))

  ## Here is the clean version
  write.table(
    geneIDNam,
    file = clean_gpl_file,
    sep = "\t",
    row.names = FALSE,
    col.names = FALSE
  )
}
269 269
270 270
271 271
272 ##Changing the gene ID to gene name 272 ##Changing the gene ID to gene name
## Changing the gene ID to gene name
# The first row of the transposed expression table holds the probe IDs;
# after the replacement it becomes the column names of ALZDAT.
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]

## Adjusting the column names aka the gene names
# Keep one name per column where several are listed with "///".
colnames(ALZDAT) <- gcnames(ALZDAT)

# Full Data
# Sample annotation columns first, expression columns after them.
Fullalzdw <- cbind(ALZWORDF, as.data.frame(ALZDAT))
285 285
286 286
287 #nfna <- strsplit(alz,"[\\|/]") %>% 287 #nfna <- strsplit(alz,"[\\|/]") %>%
288 # .[[1]] %>% 288 # .[[1]] %>%
289 # .[length(.)] %>% 289 # .[length(.)] %>%
290 # gsub("\\D","",.) %>% 290 # gsub("\\D","",.) %>%
291 # c("GSE",.,"after.txt") %>% 291 # c("GSE",.,"after.txt") %>%
292 # paste(collapse = "") 292 # paste(collapse = "")
293 #write.matrix(Fullalzdw,file = nfna,sep = "\t") 293 #write.matrix(Fullalzdw,file = nfna,sep = "\t")
294 294
295 #Perfect for excel viewing 295 #Perfect for excel viewing
# Output file name: "GSE" + all digits of the series matrix file name +
# "aftexcel.txt", written to the working directory.
# The path is split on both "\" and "/" (as is done for the GPL file) so that
# only the file name contributes digits; splitting on "\" alone leaves the
# whole path in place on systems that use "/" and pulls the digits of every
# folder name into the output name.
series_file_name <- strsplit(alz, "[\\|/]")[[1]]
series_file_name <- series_file_name[length(series_file_name)]
nfnaex <- paste0("GSE", gsub("\\D", "", series_file_name), "aftexcel.txt")

# Transposed so that each row is one annotation field or gene; row names are
# written because they carry those field and gene names.
write.table(t(Fullalzdw), file = nfnaex, sep = "\t")