Commit 18c7602e69fd98ea57dcd9e51e86d07062aeebf1

Authored by Efrain Gonzalez
1 parent 743d3ade8d
Exists in master

error in NAFIXING fixed

Showing 1 changed file with 3 additions and 5 deletions   Show diff stats
1 #Libraries required to run the code 1 #Libraries required to run the code
2 library(pryr) 2 library(pryr)
3 library(MASS) 3 library(MASS)
4 library(dplyr) 4 library(dplyr)
5 library(tidyr) 5 library(tidyr)
6 library(readr) 6 library(readr)
7 library(stringr) 7 library(stringr)
8 8
9 9
10 #Necessary Functions 10 #Necessary Functions
11 #1#Function for handling the changing of row names and column names 11 #1#Function for handling the changing of row names and column names
#' Rename the columns of a transposed sample-annotation matrix.
#'
#' Row 1 of `mat` is read as the field tag of each column and row 2 as the
#' first value stored under that tag.
#'
#' @param mat A matrix with column names already set and at least two rows.
#' @return `mat` with relabelled column names. Columns whose tag is one of the
#'   three fixed tags get "Brain_Region", "Title" or "ID_REF". Any other
#'   column is named after the keyword found in its row-2 value ("Sex",
#'   "PMI", "Age", "Braak", "Group") followed by a running number per keyword.
#'   Columns matching nothing keep their current name.
chngrownm <- function(mat) {
  # Tags that map to exactly one fixed column name.
  fixed_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )
  # Keyword patterns, tried in this order against the row-2 value. When several
  # match, the last one decides the name (each match still advances its own
  # counter), exactly as the sequence of independent tests did before.
  keyword_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control"
  )
  next_index <- c(Sex = 1, PMI = 1, Age = 1, Braak = 1, Group = 1)

  for (j in seq_len(dim(mat)[2])) {
    tag <- mat[1, j]
    if (tag %in% names(fixed_names)) {
      # A fixed tag is final. Previously only the ID_REF test was paired with
      # the `else`, so Title and Brain_Region columns could still be renamed
      # by the keyword tests below (e.g. a title containing "AD").
      colnames(mat)[j] <- fixed_names[[tag]]
      next
    }
    for (label in names(keyword_patterns)) {
      if (grepl(keyword_patterns[[label]], mat[2, j])) {
        colnames(mat)[j] <- paste0(label, next_index[[label]])
        next_index[[label]] <- next_index[[label]] + 1
      }
    }
  }
  mat
}
57 57
58 #2#Function for reorganizing information within the columns 58 #2#Function for reorganizing information within the columns
#' Clean the values of the annotation columns according to their column name.
#'
#' Every column except the first is checked, in this order, for "Group",
#' "Age", "Sex", "PMI" and "Braak" in its name:
#' - Group: text up to the last ": " is removed, as is any
#'   " <text>;<text>" remainder.
#' - Age: every non-digit is removed and the result converted to integer.
#' - Sex: text up to the last ": " is removed.
#' - PMI: everything except digits and "." is removed, then converted to
#'   numeric.
#' - Braak: text up to the last ": " is removed and the remainder is read as a
#'   Roman numeral and converted to integer.
#'
#' @param mat A data frame (or matrix) with column names.
#' @return `mat` with the matching columns rewritten.
cinfo <- function(mat) {
  # seq_len(n)[-1] is empty when there are fewer than two columns; `2:col`
  # counted backwards there and rewrote (or failed on) the first column.
  for (j in seq_len(dim(mat)[2])[-1]) {
    column_name <- colnames(mat)[j]
    if (grepl("Group", column_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", column_name)) {
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", column_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", column_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", column_name)) {
      mat[, j] <- as.integer(utils::as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
86 86
87 #3#Function for labeling the gene IDs without names 87 #3#Function for labeling the gene IDs without names
#' Give every unnamed gene the value of its ID.
#'
#' A row counts as unnamed when its second column is missing or consists of
#' the text "NA" optionally followed by whitespace. For those rows the second
#' column is replaced with the first column's value.
#'
#' @param GIDNAM A two-dimensional table (data frame, tibble or matrix) whose
#'   first column holds the ID and whose second column holds the name.
#' @return `GIDNAM` with the unnamed entries of column 2 filled from column 1.
NAFIXING <- function(GIDNAM) {
  ids <- GIDNAM[, 1, drop = TRUE]
  gene_names <- GIDNAM[, 2, drop = TRUE]
  # Work on plain text so an ID that is not an existing factor level (or not
  # of the name column's type) can still be stored.
  if (is.factor(gene_names)) {
    gene_names <- as.character(gene_names)
  }
  # One vectorized mask replaces the row-by-row loop; it is also empty-safe,
  # whereas `1:row` visited a non-existent row 1 when the table had no rows.
  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)
  if (any(unnamed)) {
    gene_names[unnamed] <- as.character(ids[unnamed])
    GIDNAM[, 2] <- gene_names
  }
  GIDNAM
}
101 99
102 #4#Function for changing the gene ID to gene name 100 #4#Function for changing the gene ID to gene name
#' Replace the IDs in the first row of `DATA` with their gene names.
#'
#' @param GeneName A two-row matrix: row 1 holds the IDs, row 2 the name to
#'   use for the ID in the same column.
#' @param DATA A matrix whose first row holds the IDs to translate.
#' @return `DATA` with every first-row entry that equals an ID in `GeneName`
#'   (ignoring surrounding whitespace) replaced by the corresponding name.
#'   Entries without a matching ID are left as they are.
cgeneID <- function(GeneName, DATA) {
  known_ids <- as.character(unlist(GeneName[1, ], use.names = FALSE))
  new_names <- unlist(GeneName[2, ], use.names = FALSE)
  labels <- trimws(as.character(DATA[1, ]))

  # Exact lookup. The earlier per-ID grep/gsub treated each ID as a prefix
  # regular expression, so ID "1" also rewrote "10" and "100", and regex
  # metacharacters inside an ID were interpreted instead of matched literally.
  position <- match(labels, known_ids, incomparables = NA)
  found <- !is.na(position)
  DATA[1, found] <- new_names[position[found]]
  DATA
}
113 111
114 #5#Function for adjusting the gene names 112 #5#Function for adjusting the gene names
#' Pick one name out of each "///"-separated column name.
#'
#' @param DiData A table with column names.
#' @param usecol Which piece of the split name to keep. When a name has fewer
#'   pieces than `usecol`, its first piece is used instead.
#' @return A character vector with one entry per column of `DiData`.
gcnames <- function(DiData, usecol = 1) {
  # Nothing to do for a table without columns (`1:nuruns` used to fail here).
  if (dim(DiData)[2] == 0) {
    return(character(0))
  }
  # Split every name once instead of twice per column.
  pieces_by_column <- strsplit(colnames(DiData), "///")
  vapply(
    pieces_by_column,
    function(pieces) {
      if (length(pieces) >= usecol) pieces[usecol] else pieces[1]
    },
    character(1),
    USE.NAMES = FALSE
  )
}
130 128
131 129
132 130
133 #The Rest of this code will be used every time you want to change a data set 131 #The Rest of this code will be used every time you want to change a data set
134 132
135 #Getting the series matrix file 133 #Getting the series matrix file
# Ask for the two input files: first the series matrix, then the GPL file
# that belongs to it.
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()


# Disabled: set the working directory to the folder of the series matrix
# file. The split is on backslashes only, so it only works with Windows paths.
##strsplit(alz,"[\\]") %>%
##  .[[1]] %>%
##  .[-length(.)] %>%
##  paste(.,collapse="/") %>%
##  setwd()

# The GPL file is treated as a SOFT file when the last component of its path
# (split on "\", "|" or "/") contains "soft".
soft <- grepl("soft", utils::tail(strsplit(genena, "[\\|/]")[[1]], 1))
156 154
157 #Working with the wordy part of the document 155 #Working with the wordy part of the document
# Sample annotation: keep the "!Sample" lines of the series matrix file,
# except the "!Sample_contact" ones. Lines starting with "!Series" are skipped
# by the reader as comments.
alzword <- read_delim(alz, delim = "\t", comment = "!Series", col_names = FALSE) %>%
  filter(grepl("!Sample", X1), !grepl("!Sample_contact", X1))

## Changing row names and column names:
## after transposing, row 1 holds the field tags and each further row one
## sample; placeholder column names ("col1", "col2", ...) are generated so
## that chngrownm() has names to overwrite.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
# drop = FALSE keeps the result a matrix when only one sample row is left
# after removing the tag row; without it the row collapsed to a plain vector.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
# Columns that still carry a placeholder name were not recognised; drop them.
ALZWORD <- as.data.frame(ALZWORD) %>%
  dplyr::select(-starts_with("col"))

## Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)


# Working with the actual data part of the file: lines starting with "!" are
# skipped as comments and the first line of the file is skipped as well.
alzdat <- read_delim(alz, delim = "\t", col_names = TRUE, comment = "!", skip = 1)
# Drop the first column and transpose the remaining table.
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL
181 179
182 180
183 ##Gene ID to Gene Name 181 ##Gene ID to Gene Name
184 ###geneIDNam <- genena %>% 182 ###geneIDNam <- genena %>%
185 ### read_delim(delim="\t",comment = "#")%>% 183 ### read_delim(delim="\t",comment = "#")%>%
186 ### dplyr::select(.,ID,grep("Symbol|ORF",colnames(.))) 184 ### dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
187 ###problems with the above for soft files 185 ###problems with the above for soft files
# Build geneIDNam: the ID column of the GPL file plus every column whose name
# contains "Symbol" or "ORF".
if (soft) {
  # Platform number = all digits in the GPL file name.
  gplnum <- gsub("\\D", "", utils::tail(strsplit(genena, "[\\|/]")[[1]], 1))

  # GPL_ID_LOC.txt (in the working directory) remembers, per platform number,
  # how many lines have to be skipped before the table of the SOFT file.
  gpl_cache <- "GPL_ID_LOC.txt"
  # file.exists() is an exact test; the earlier substring count over
  # list.files() could be 2 or more, in which case no branch ran at all.
  fileex <- as.integer(file.exists(gpl_cache))

  idlocgpl <- NULL
  if (fileex == 1) {
    # Read the cache once and look the platform up by exact number; the
    # earlier substring search let "96" match "6096" and skipped every branch
    # when more than one row matched.
    gpl_known <- read_delim(gpl_cache, delim = "\t", col_names = TRUE)
    known_row <- which(as.integer(gpl_known$GPL_FILE_NUM) == as.integer(gplnum))
    if (length(known_row) > 0) {
      idlocgpl <- gpl_known$LOC_ID[known_row[1]]
    }
  }

  if (is.null(idlocgpl)) {
    # No information on this particular GPL file yet: among the first 1000
    # non-"!" rows, count the cells that start with a non-digit and subtract
    # one to get the number of lines to skip.
    idlocgpl <- (genena %>%
      read_delim(delim = "\t", col_names = FALSE, comment = "!", n_max = 1000) %>%
      t(.) %>%
      grep("^\\D", .) %>%
      length()) - 1
    gpl_entry <- cbind(as.integer(gplnum), as.integer(idlocgpl))
    if (fileex == 1) {
      # Append one more line to the existing cache.
      cat(gpl_entry, file = gpl_cache, sep = "\t", fill = TRUE, append = TRUE)
    } else {
      # We must create a file that we can access for later use.
      colnames(gpl_entry) <- c("GPL_FILE_NUM", "LOC_ID")
      write.table(gpl_entry, file = gpl_cache, sep = "\t",
                  row.names = FALSE, col.names = TRUE)
    }
  }

  geneIDNam <- genena %>%
    read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = idlocgpl) %>%
    dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))
} else {
  # Not a SOFT file: lines starting with "#" are skipped as comments.
  geneIDNam <- genena %>%
    read_delim(delim = "\t", comment = "#") %>%
    dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))
}
246 244
247 ##Labeling the gene IDs without names 245 ##Labeling the gene IDs without names
# Give the IDs that have no name their own ID as name.
geneIDNam <- NAFIXING(geneIDNam)

## Remove the whitespace around the first two columns; the result is a
## two-column character matrix.
gene_id_rows <- t(geneIDNam)
geneIDNam <- t(rbind(str_trim(gene_id_rows[1, ]), str_trim(gene_id_rows[2, ])))

## Changing the gene ID to gene name: translate the first row of the
## transposed data table and use it as the column names of ALZDAT.
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]


## Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)


# Full data: annotation columns first, then the data columns.
Fullalzdw <- cbind(ALZWORDF, as.data.frame(ALZDAT))
266 264
267 265
268 #nfna <- strsplit(alz,"[\\]") %>% 266 #nfna <- strsplit(alz,"[\\]") %>%
269 # .[[1]] %>% 267 # .[[1]] %>%
270 # .[length(.)] %>% 268 # .[length(.)] %>%
271 # gsub("\\D","",.) %>% 269 # gsub("\\D","",.) %>%
272 # c("GSE",.,"after.txt") %>% 270 # c("GSE",.,"after.txt") %>%
273 # paste(collapse = "") 271 # paste(collapse = "")
274 #write.matrix(Fullalzdw,file = nfna,sep = "\t") 272 #write.matrix(Fullalzdw,file = nfna,sep = "\t")
275 #Perfect for excel viewing 273 #Perfect for excel viewing
# Output file name: "GSE" + the digits of the series matrix file name +
# "aftexcel.txt". basename() isolates the file name on every platform; the
# earlier split on backslashes only worked for Windows paths and otherwise
# let digits from directory names leak into the output name.
nfnaex <- paste0("GSE", gsub("\\D", "", basename(alz)), "aftexcel.txt")
# The transposed table is written tab-separated, without row names.
write.table(t(Fullalzdw), file = nfnaex, sep = "\t", row.names = FALSE)
283 281
284 282
285 283