Commit fe3623221f234e5fe409237f4fedba8475f9e2db

Authored by Efrain Gonzalez
1 parent cae87f17c0
Exists in master

Added folder

1 #Libraries required to run the code 1 #Libraries required to run the code
2 library(MASS) 2 library(MASS)
3 library(pryr) 3 library(pryr)
4 library(dplyr) 4 library(dplyr)
5 library(tidyr) 5 library(tidyr)
6 library(readr) 6 library(readr)
7 library(stringr) 7 library(stringr)
8 8
9 9
10 #Necessary Functions 10 #Necessary Functions
#1# Function for handling the changing of row names and column names
#
# Renames the columns of `mat` according to what each column holds.
#   * Row 1 is compared against the literal labels "!Sample_source_name_ch1",
#     "!Sample_title" and "!Sample_geo_accession"; a match renames the column
#     to "Brain_Region", "Title" or "ID_REF" respectively.
#   * For every column that is not the accession column, row 2 is scanned for
#     keywords (sex, PMI, age, Braak, group/diagnosis). Each hit renames the
#     column to "<Kind><n>", where <n> is a running counter per kind.
#
# Args:
#   mat: a matrix (or data frame) with column names already set and at least
#        two rows. Presumably the transposed "!Sample" section of a GEO
#        series matrix file -- confirm against the caller.
#
# Returns:
#   `mat` with updated column names; cell contents are not modified.
chngrownm <- function(mat) {
  # Running counters used to number repeated annotation columns.
  n_sex <- 1
  n_pmi <- 1
  n_age <- 1
  n_braak <- 1
  n_group <- 1

  # seq_len() yields an empty sequence for a zero-column input, whereas
  # 1:0 would iterate over c(1, 0) and fail on an out-of-range subscript.
  for (j in seq_len(dim(mat)[2])) {
    if ("!Sample_source_name_ch1" == mat[1, j]) {
      colnames(mat)[j] <- "Brain_Region"
    }
    if ("!Sample_title" == mat[1, j]) {
      colnames(mat)[j] <- "Title"
    }
    if ("!Sample_geo_accession" == mat[1, j]) {
      colnames(mat)[j] <- "ID_REF"
    } else {
      # This `else` pairs with the accession test only, so "Brain_Region" and
      # "Title" columns are keyword-checked as well and may be renamed again.
      # The checks are independent: when several match, every matching
      # counter advances and the last matching name wins.
      if (grepl("Sex|gender|Gender|sex", mat[2, j])) {
        colnames(mat)[j] <- paste0("Sex", n_sex)
        n_sex <- n_sex + 1
      }
      if (grepl("postmorteminterval|PMI|pmi", mat[2, j])) {
        colnames(mat)[j] <- paste0("PMI", n_pmi)
        n_pmi <- n_pmi + 1
      }
      if (grepl("age|Age|AGE", mat[2, j])) {
        colnames(mat)[j] <- paste0("Age", n_age)
        n_age <- n_age + 1
      }
      if (grepl("braak|b&b", mat[2, j])) {
        colnames(mat)[j] <- paste0("Braak", n_braak)
        n_braak <- n_braak + 1
      }
      if (grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",
                mat[2, j])) {
        colnames(mat)[j] <- paste0("Group", n_group)
        n_group <- n_group + 1
      }
    }
  }
  mat
}
57 57
#2# Function for reorganizing information within the columns
#
# Cleans the cell contents of the annotation columns, dispatching on the
# column name. The first column is never rewritten.
#   * "Group": strips everything up to the last ": " and any trailing
#              " ...;..." remainder.
#   * "Age":   keeps digits only and converts to integer.
#   * "Sex":   strips everything up to the last ": ".
#   * "PMI":   keeps digits and dots only and converts to numeric.
#   * "Braak": strips the "label: " prefix, reads the rest as a Roman numeral
#              and converts it to integer.
#
# Args:
#   mat: a data frame (or matrix) with column names set.
#
# Returns:
#   `mat` with the matching columns rewritten.
cinfo <- function(mat) {
  # seq_len(n)[-1] is 2..n and is empty when n < 2. The literal 2:col would
  # count down to c(2, 1) for a single column and rewrite column 1.
  for (j in seq_len(dim(mat)[2])[-1]) {
    col_name <- colnames(mat)[j]
    if (grepl("Group", col_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", col_name)) {
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", col_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", col_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", col_name)) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
86 86
#3# Function for changing the gene ID to gene name
#
# Replaces the identifiers in the first row of `DATA` with the names found in
# the second row of `GeneName`.
#
# Identifiers are compared as whole strings. Using them as regular
# expressions would let "10_at" also rewrite "110_at", and would treat "."
# in an identifier as a wildcard. A single match() lookup also avoids one
# regex scan of the data per identifier.
#
# Args:
#   GeneName: matrix whose row 1 holds identifiers and row 2 the replacement
#             names (one column per identifier).
#   DATA:     matrix whose row 1 holds the identifiers to translate.
#
# Returns:
#   `DATA` with row 1 translated where an identifier was found. Cells with no
#   match are left as they are; if an identifier occurs more than once in
#   `GeneName`, its first occurrence is used.
cgeneID <- function(GeneName, DATA) {
  # Surrounding blanks are ignored on both sides: as.matrix()/t() on a data
  # frame format()s non-character columns, which can pad numeric identifiers.
  known_ids <- trimws(as.character(GeneName[1, ]))
  wanted <- trimws(as.character(DATA[1, ]))

  pos <- match(wanted, known_ids)
  # match() pairs NA with NA, so missing identifiers are excluded explicitly.
  found <- !is.na(pos) & !is.na(wanted)
  if (any(found)) {
    DATA[1, found] <- GeneName[2, pos[found]]
  }
  DATA
}
98 98
#4# Function for adjusting the gene names
#
# Shortens column names that list several alternatives separated by "///".
# For each column name the `usecol`-th alternative is returned; names with
# fewer than `usecol` alternatives fall back to their first one.
#
# Args:
#   DiData: matrix or data frame whose column names are to be shortened.
#   usecol: 1-based position of the alternative to keep (default 1).
#
# Returns:
#   A character vector with one entry per column of `DiData`. Pieces are
#   returned exactly as split, so blanks around "///" are kept.
gcnames <- function(DiData, usecol = 1) {
  n_cols <- dim(DiData)[2]
  # Nothing to split for a zero-column input (1:0 would iterate c(1, 0)).
  if (n_cols == 0) {
    return(character(0))
  }
  # Split every name once instead of twice per column.
  alternatives <- strsplit(colnames(DiData)[seq_len(n_cols)], "///")
  vapply(
    alternatives,
    function(parts) {
      if (length(parts) >= usecol) parts[usecol] else parts[1]
    },
    character(1)
  )
}
115 115
116 116
117 117
# The rest of this code is run every time you want to convert a data set.

# Getting the series matrix file
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

# Getting the GPL file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()


# Set working directory based on the directory of the series matrix file
# (kept disabled, as in the original script)
##strsplit(alz,"[\\]") %>%
##  .[[1]] %>%
##  .[-length(.)] %>%
##  paste(.,collapse="/") %>%
##  setwd()


# Working with the wordy part of the document:
# keep the "!Sample" rows except the "!Sample_contact" ones.
alzword <- alz %>%
  read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
  filter(grepl("!Sample", X1)) %>%
  filter(!grepl("!Sample_contact", X1))

## Changing row names and column names:
## after transposing, row 1 holds the field labels and each later row is one
## sample; placeholder names col1, col2, ... are assigned before renaming.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
# Drop the label row. drop = FALSE keeps a matrix even when a single sample
# row remains; otherwise the result collapses to a plain vector and
# as.data.frame() would build one column instead of one row.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
# Columns still carrying a placeholder name were not recognised; discard them.
ALZWORD <- ALZWORD %>%
  as.data.frame() %>%
  dplyr::select(-starts_with("col"))

## Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)
154 154
155 155
# Working with the actual data part of the file:
# lines starting with "!" are treated as comments and the first line is
# skipped, leaving the expression table with its header row.
alzdat <- read_delim(
  alz,
  delim = "\t",
  col_names = TRUE,
  comment = "!",
  skip = 1
)
# One row per sample, one column per probe; the first (ID) column is left out.
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL


## Gene ID to gene name:
## keep the ID column plus every column whose name contains "Symbol" or "ORF".
geneIDNam <- read_delim(genena, delim = "\t", comment = "#")
geneIDNam <- dplyr::select(
  geneIDNam,
  ID,
  grep("Symbol|ORF", colnames(geneIDNam))
)

## Changing the ID to a name
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]


## Adjusting the column names, aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)
175 175
176 176
# Full data: sample annotations followed by the expression values
Fullalzdw <- cbind(ALZWORDF, as.data.frame(ALZDAT))

## since the order in which the packages are added matters I moved this package to the top
## library(MASS)
# The output name is "GSE" + the digits of the chosen file's name + a suffix.
# basename() isolates the file name with either path separator; splitting on
# "\" alone leaves a "/"-separated path whole, so digits from every directory
# name would end up in the output file name.
nfna <- paste0("GSE", gsub("\\D", "", basename(alz)), "after.txt")
MASS::write.matrix(Fullalzdw, file = nfna, sep = "\t")
# Perfect for excel viewing (transposed: one column per sample)
nfnaex <- paste0("GSE", gsub("\\D", "", basename(alz)), "aftexcel.txt")
write.table(t(Fullalzdw), file = nfnaex, sep = "\t")
199 199