Commit 4255e424d8aa63da2b8754e0e531cfaf10191e44

Authored by Efrain Gonzalez
1 parent cc59b7f832
Exists in master

Found problem with the function cgeneID and fixed it.

Showing 1 changed file with 1 addition and 1 deletion   Show diff stats
1 #Libraries required to run the code 1 #Libraries required to run the code
2 library(pryr) 2 library(pryr)
3 library(MASS) 3 library(MASS)
4 library(dplyr) 4 library(dplyr)
5 library(tidyr) 5 library(tidyr)
6 library(readr) 6 library(readr)
7 library(stringr) 7 library(stringr)
8 8
9 9
10 #Necessary Functions 10 #Necessary Functions
#1# Function for handling the changing of row names and column names
#
# Renames the columns of a transposed sample-metadata matrix.
#
# Args:
#   mat: character matrix with existing column names and at least two rows.
#        Row 1 holds the series-matrix field name of each column
#        (e.g. "!Sample_title"); row 2 holds the first sample's value.
#
# Returns:
#   `mat` with updated column names; the cell contents are not modified.
chngrownm <- function(mat) {
  # One running counter per category so repeated fields become Sex1, Sex2, ...
  x <- 1
  p <- 1
  a <- 1
  b <- 1
  g <- 1
  # seq_len() yields an empty sequence for a zero-column matrix, whereas
  # 1:col would count 1, 0 and index out of bounds.
  for (j in seq_len(ncol(mat))) {
    if ("!Sample_source_name_ch1" == mat[1, j]) {
      colnames(mat)[j] <- "Brain_Region"
    }
    if ("!Sample_title" == mat[1, j]) {
      colnames(mat)[j] <- "Title"
    }
    if ("!Sample_geo_accession" == mat[1, j]) {
      colnames(mat)[j] <- "ID_REF"
    } else {
      # The `else` belongs to the accession test only, so every other column
      # (including Brain_Region and Title) is also classified by the content
      # of its first data row. The tests are independent: a later match
      # overwrites the name set by an earlier one, and each matching test
      # still advances its own counter.
      if (grepl("Sex|gender|Gender|sex", mat[2, j])) {
        colnames(mat)[j] <- paste0("Sex", x)
        x <- x + 1
      }
      if (grepl("postmorteminterval|PMI|pmi", mat[2, j])) {
        colnames(mat)[j] <- paste0("PMI", p)
        p <- p + 1
      }
      if (grepl("age|Age|AGE", mat[2, j])) {
        colnames(mat)[j] <- paste0("Age", a)
        a <- a + 1
      }
      if (grepl("braak|b&b", mat[2, j])) {
        colnames(mat)[j] <- paste0("Braak", b)
        b <- b + 1
      }
      if (grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",
                mat[2, j])) {
        colnames(mat)[j] <- paste0("Group", g)
        g <- g + 1
      }
    }
  }
  mat
}
57 57
#2# Function for reorganizing information within the columns
#
# Cleans the values of the metadata columns according to the column name.
# Column 1 is never touched.
#
# Args:
#   mat: data frame whose columns were named by chngrownm().
#
# Returns:
#   `mat` with cleaned columns:
#     Group*  - text up to and including ": " removed, plus any
#               " ...;..." tail
#     Age*    - every non-digit removed, then converted to integer
#     Sex*    - text up to and including ": " removed
#     PMI*    - everything except digits and "." removed, then numeric
#     Braak*  - label removed, Roman numeral converted to integer
cinfo <- function(mat) {
  # seq_len(...)[-1] is empty for a single-column frame; 2:col would count
  # 2, 1 and end up rewriting the first column.
  for (j in seq_len(ncol(mat))[-1]) {
    col_name <- colnames(mat)[j]
    if (grepl("Group", col_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", col_name)) {
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", col_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", col_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", col_name)) {
      # as.roman() parses values such as "IV"; as.integer() then gives 4.
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
86 86
#3# Function for labeling the gene IDs without names
#
# Gives every entry of the second column that has no gene name a unique
# placeholder "NA1", "NA2", ... in row order.
#
# Args:
#   GIDNAM: data frame / tibble (or matrix) whose second column holds the
#           gene names. An entry counts as unnamed when it is a real NA or
#           the text "NA", optionally followed by whitespace.
#
# Returns:
#   `GIDNAM` with the unnamed entries of column 2 replaced. When entries are
#   replaced in a data frame, column 2 is returned as character.
NAFIXING <- function(GIDNAM) {
  is_df <- is.data.frame(GIDNAM)
  gene_names <- as.character(if (is_df) GIDNAM[[2]] else GIDNAM[, 2])
  # Test real NA explicitly instead of relying on a one-cell tibble being
  # coerced to the text "NA"; names such as "NAT1" do not match the pattern.
  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)
  if (!any(unnamed)) {
    return(GIDNAM)
  }
  # Vectorised: one assignment instead of one sub-assignment per row.
  gene_names[unnamed] <- paste0("NA", seq_len(sum(unnamed)))
  if (is_df) {
    GIDNAM[[2]] <- gene_names
  } else {
    GIDNAM[, 2] <- gene_names
  }
  GIDNAM
}
101 101
#4# Function for changing the gene ID to gene name
#
# Replaces the probe IDs in the first row of DATA with their gene names.
#
# Args:
#   GeneName: matrix with two rows; row 1 holds the IDs, row 2 the gene name
#             for the ID in the same column.
#   DATA:     matrix whose first row holds the IDs to translate.
#
# Returns:
#   `DATA` with every first-row entry that equals an ID in `GeneName`
#   replaced by that ID's gene name. Entries without a matching ID are left
#   unchanged. If an ID occurs more than once in `GeneName`, the first
#   occurrence is used.
cgeneID <- function(GeneName, DATA) {
  data_ids <- trimws(as.character(unlist(DATA[1, ], use.names = FALSE)))
  known_ids <- as.character(GeneName[1, ])
  # Exact lookup. Using the ID as a regular expression (even anchored with
  # "^") lets ID "10" rewrite "100" and "105_at", treats characters such as
  # "." as wildcards, and can re-match a name written by an earlier step.
  hit <- match(data_ids, known_ids)
  found <- !is.na(hit)
  if (any(found)) {
    DATA[1, found] <- GeneName[2, hit[found]]
  }
  DATA
}
113 113
#5# Function for adjusting the gene names
#
# Shortens column names that list several gene names separated by "///".
#
# Args:
#   DiData: matrix or data frame with column names.
#   usecol: position of the name to keep from each "///"-separated list
#           (default 1). A column name with fewer than `usecol` entries
#           falls back to its first entry.
#
# Returns:
#   Character vector with one name per column of `DiData`.
gcnames <- function(DiData, usecol = 1) {
  # A zero-column input has nothing to rename; 1:0 would loop over 1 and 0.
  if (ncol(DiData) == 0) {
    return(character(0))
  }
  # Split every name once instead of up to three times per column.
  name_parts <- strsplit(colnames(DiData), "///")
  vapply(
    name_parts,
    function(parts) {
      if (length(parts) >= usecol) parts[usecol] else parts[1]
    },
    character(1),
    USE.NAMES = FALSE
  )
}
130 130
131 131
132 132
# Everything below is run once for each data set that is to be converted.

# Ask for the series matrix file
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

# Ask for the GPL file that belongs to that series matrix file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()


# Disabled: set the working directory to the folder of the series matrix file
# (this variant only handles Windows-style paths)
##strsplit(alz,"[\\]") %>%
##  .[[1]] %>%
##  .[-length(.)] %>%
##  paste(.,collapse="/") %>%
##  setwd()

# The GPL file counts as a soft file when its last path component contains
# the text "soft"
soft <- grepl("soft", tail(strsplit(genena, "[\\|/]")[[1]], 1))
156 156
# Sample metadata: the "!Sample..." lines of the series matrix file, without
# the "!Sample_contact..." lines ("!Series" lines are read as comments)
alzword <- read_delim(alz, delim = "\t", comment = "!Series", col_names = FALSE)
alzword <- filter(
  alzword,
  grepl("!Sample", X1),
  !grepl("!Sample_contact", X1)
)

## Transpose so that each metadata field becomes a column, give the columns
## default names, then rename them; the first row (the field names) is dropped
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
ALZWORD <- chngrownm(ALZWORD)[-1, ]
## Columns that kept a default "col..." name are discarded
ALZWORD <- dplyr::select(as.data.frame(ALZWORD), -starts_with("col"))

## Clean up the values inside the renamed columns
ALZWORDF <- cinfo(ALZWORD)
174 174
175 175
# Expression values: skip the first line, read lines starting with "!" as
# comments, and use the first remaining line as the header
alzdat <- read_delim(alz, delim = "\t", col_names = TRUE, comment = "!", skip = 1)
# Drop the first column and transpose the remaining columns
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL
181 181
182 182
## Gene ID to Gene Name
## Reads the GPL file into `geneIDNam`: the ID column plus every column whose
## name contains "Symbol" or "ORF".

# Number of leading lines to skip in a soft GPL file. Heuristic kept from the
# earlier code: read at most 1000 rows (lines starting with "!" are treated
# as comments), count the cells that begin with a non-digit character, and
# subtract one.
# NOTE(review): presumably this lands on the header of the annotation table;
# confirm against a real soft file.
count_gpl_skip <- function(path) {
  n_non_numeric <- path %>%
    read_delim(delim = "\t", col_names = FALSE, comment = "!", n_max = 1000) %>%
    t(.) %>%
    grep("^\\D", .) %>%
    length()
  n_non_numeric - 1
}

if (soft) {
  # GPL number = every digit in the file name
  gplnum <- gsub("\\D", "", tail(strsplit(genena, "[\\|/]")[[1]], 1))
  # Cache of skip counts for soft files that were processed before
  cache_file <- "GPL_ID_LOC.txt"

  if (file.exists(cache_file)) {
    cache <- read_delim(cache_file, delim = "\t", col_names = TRUE)
    # Exact numeric comparison: a substring search for "570" would also hit
    # 5700, and two hits used to leave geneIDNam unassigned.
    known <- which(cache$GPL_FILE_NUM == as.numeric(gplnum))
    if (length(known) > 0) {
      # This GPL file has been used before: reuse the stored skip count
      idLOCGPL <- cache$LOC_ID[known[1]]
    } else {
      # No information on this particular GPL file yet: compute and append
      idLOCGPL <- count_gpl_skip(genena)
      write.table(
        cbind(as.integer(gplnum), as.integer(idLOCGPL)),
        file = cache_file, sep = "\t", append = TRUE,
        row.names = FALSE, col.names = FALSE
      )
    }
  } else {
    # First soft file ever: create the cache, including its header line
    idLOCGPL <- count_gpl_skip(genena)
    first_entry <- cbind(as.integer(gplnum), as.integer(idLOCGPL))
    colnames(first_entry) <- c("GPL_FILE_NUM", "LOC_ID")
    write.table(first_entry, file = cache_file, sep = "\t",
                row.names = FALSE, col.names = TRUE)
  }

  geneIDNam <- genena %>%
    read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
    dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))
} else {
  # Plain annotation table: lines starting with "#" are comments
  geneIDNam <- genena %>%
    read_delim(delim = "\t", comment = "#") %>%
    dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))
}
246 246
## Give every gene ID that has no name a numbered placeholder
geneIDNam <- NAFIXING(geneIDNam)

## Remove the whitespace around the IDs (row 1) and names (row 2); the result
## is a two-column matrix: ID, name
geneIDNam <- t(geneIDNam)
geneIDNam <- cbind(str_trim(geneIDNam[1, ]), str_trim(geneIDNam[2, ]))

## Translate the IDs in the first row of the transposed data into gene names
## and use them as the column names of the expression matrix
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]


## Keep a single gene name per column where several are listed
colnames(ALZDAT) <- gcnames(ALZDAT)
260 260
261 261
# Full Data: the cleaned sample metadata followed by the expression columns
Fullalzdw <- cbind(ALZWORDF, as.data.frame(ALZDAT))


# The output names are built from the digits in the series matrix file NAME.
# basename() removes the directory on every platform; splitting on "\\" alone
# only worked for Windows paths and otherwise let digits from the directory
# names leak into the output file name.
gse_digits <- gsub("\\D", "", basename(alz))

nfna <- paste0("GSE", gse_digits, "after.txt")
write.matrix(Fullalzdw, file = nfna, sep = "\t")

# Perfect for excel viewing (transposed)
nfnaex <- paste0("GSE", gse_digits, "aftexcel.txt")
write.table(t(Fullalzdw), file = nfnaex, sep = "\t")
283 283
284 284
285 285