Commit 16b4f55de1bee74c9f9060e6884d4d27c04cfe45

Authored by Efrain Gonzalez
1 parent a66a63dc50
Exists in master

wrong extension

Showing 1 changed file with 0 additions and 198 deletions   Show diff stats
1 #Libraries required to run the code File was deleted
2 library(MASS)
3 library(pryr)
4 library(dplyr)
5 library(tidyr)
6 library(readr)
7 library(stringr)
8
9
10 #Necessary Functions
11 #1#Function for handling the changing of row names and column names
# Rename the columns of a transposed GEO "!Sample_*" metadata matrix.
#
# mat: character matrix with existing column names. Row 1 holds the GEO field
#      name of each column (e.g. "!Sample_title") and row 2 holds the first
#      sample's value for that field.
#
# Columns are renamed as follows:
#   * "!Sample_source_name_ch1" -> "Brain_Region"
#   * "!Sample_title"           -> "Title"
#   * "!Sample_geo_accession"   -> "ID_REF"
#   * any other column is named from keywords found in its row-2 value:
#     "Sex<k>", "PMI<k>", "Age<k>", "Braak<k>" or "Group<k>", where <k> is a
#     running counter per category. Columns matching nothing keep their name.
#
# Returns `mat` with updated column names; cell contents are untouched.
chngrownm <- function(mat) {
  n_col <- dim(mat)[2]
  sex_n <- 1
  pmi_n <- 1
  age_n <- 1
  braak_n <- 1
  group_n <- 1
  for (j in seq_len(n_col)) {
    header <- mat[1, j]
    # The three header-identified columns are mutually exclusive with the
    # keyword rules below. Without the `else if` chain a title such as
    # "AD case 1" would fall through and be relabelled as a Group column.
    if ("!Sample_source_name_ch1" == header) {
      colnames(mat)[j] <- "Brain_Region"
    } else if ("!Sample_title" == header) {
      colnames(mat)[j] <- "Title"
    } else if ("!Sample_geo_accession" == header) {
      colnames(mat)[j] <- "ID_REF"
    } else {
      value <- mat[2, j]
      # The keyword checks are intentionally independent: when a value matches
      # several patterns the last matching rule decides the final name.
      if (grepl("Sex|gender|Gender|sex", value)) {
        colnames(mat)[j] <- paste0("Sex", sex_n)
        sex_n <- sex_n + 1
      }
      if (grepl("postmorteminterval|PMI|pmi", value)) {
        colnames(mat)[j] <- paste0("PMI", pmi_n)
        pmi_n <- pmi_n + 1
      }
      if (grepl("age|Age|AGE", value)) {
        colnames(mat)[j] <- paste0("Age", age_n)
        age_n <- age_n + 1
      }
      if (grepl("braak|b&b", value)) {
        colnames(mat)[j] <- paste0("Braak", braak_n)
        braak_n <- braak_n + 1
      }
      if (grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",
                value)) {
        colnames(mat)[j] <- paste0("Group", group_n)
        group_n <- group_n + 1
      }
    }
  }
  mat
}
57
58 #2#Function for reorganizing information within the columns
# Clean up the sample-annotation columns produced by chngrownm().
#
# mat: data frame (or matrix) whose column names contain "Group", "Age",
#      "Sex", "PMI" or "Braak". The first column is always left untouched.
#
# Per column, depending on its name:
#   * Group: strips the "<label>: " prefix and any trailing " ...;..." text
#   * Age  : keeps digits only and converts to integer
#   * Sex  : strips the "<label>: " prefix
#   * PMI  : keeps digits and "." only and converts to numeric
#   * Braak: strips the "<label>: " prefix, reads the rest as a roman numeral
#            and converts it to integer
#
# Returns `mat` with the cleaned columns.
cinfo <- function(mat) {
  n_col <- dim(mat)[2]
  # seq_len(n)[-1] yields 2..n and is empty for n <= 1. The former `2:col`
  # counted down to 1 for a single-column input and processed the first column.
  for (j in seq_len(n_col)[-1]) {
    col_name <- colnames(mat)[j]
    if (grepl("Group", col_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", col_name)) {
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", col_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", col_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", col_name)) {
      mat[, j] <- as.integer(utils::as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
86
87 #3#Function for changing the gene ID to gene name
# Replace probe IDs in the first row of DATA with gene names.
#
# GeneName: matrix whose row 1 holds probe IDs and row 2 the matching gene
#           names (e.g. a transposed GPL annotation table).
# DATA    : matrix whose row 1 holds the probe IDs to translate.
#
# IDs are compared for exact equality after trimming surrounding whitespace.
# The previous grep()/gsub() approach treated every ID as a regular expression
# matched anywhere in the cell, so ID "1007_at" also rewrote part of
# "11007_at" and characters such as "." acted as wildcards.
#
# Returns DATA with row 1 translated; cells without a matching ID are left
# unchanged. When an ID occurs more than once in GeneName, the first
# occurrence is used.
cgeneID <- function(GeneName, DATA) {
  data_ids <- trimws(as.character(DATA[1, ]))
  gene_ids <- trimws(as.character(GeneName[1, ]))
  pos <- match(data_ids, gene_ids)
  # match() would pair NA with NA, so missing IDs are excluded explicitly.
  hit <- !is.na(pos) & !is.na(data_ids)
  if (!any(hit)) {
    return(DATA)
  }
  DATA[1, hit] <- GeneName[2, pos[hit]]
  DATA
}
98
99 #4#Function for adjusting the gene names
# Pick one gene name per column from "///"-separated column names.
#
# DiData: matrix or data frame whose column names may list several gene
#         names separated by "///".
# usecol: position of the name to keep (default 1). Columns with fewer than
#         `usecol` entries fall back to their first entry.
#
# Returns a character vector with one name per column of DiData, or
# character(0) when DiData has no columns.
gcnames <- function(DiData, usecol = 1) {
  n_cols <- dim(DiData)[2]
  # Guard the empty case; the former `1:nuruns` loop ran for i = 1 and i = 0.
  if (n_cols == 0) {
    return(character(0))
  }
  # Split every column name once instead of up to three times per column.
  parts <- strsplit(colnames(DiData), "///")
  vapply(
    parts,
    function(p) if (length(p) >= usecol) p[usecol] else p[1],
    character(1),
    USE.NAMES = FALSE
  )
}
115
116
117
118 #The Rest of this code will be used every time you want to change a data set
119
120 #Getting the series matrix file
# ---- Choose the input files (interactive) ----
# Series matrix file to analyze.
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

# GPL annotation file belonging to the series matrix file above.
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()

# NOTE: an earlier revision changed the working directory to the folder of the
# series matrix file here; that step was already disabled and stays disabled,
# so the output files are written to the current working directory.

# ---- Sample annotation ("wordy") part of the series matrix file ----
# Keep only the "!Sample" rows, dropping "!Series" lines (treated as comments)
# and the "!Sample_contact" rows.
alzword <- alz %>%
  read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
  filter(grepl("!Sample", X1)) %>%
  filter(!grepl("!Sample_contact", X1))

## Changing row names and column names:
## transpose so that each sample is a row and each "!Sample_*" field a column.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
# Give the columns default names ("col1", "col2", ...) so that chngrownm()
# can rename the recognised ones.
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
# Drop the header row after renaming; drop = FALSE keeps a matrix even when
# the file describes a single sample.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
# Columns that still carry a default "col<k>" name were not recognised and
# are discarded.
ALZWORD <- ALZWORD %>%
  as.data.frame() %>%
  dplyr::select(-starts_with("col"))

## Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)

# ---- Expression data part of the series matrix file ----
# Lines starting with "!" are treated as comments, leaving the data table.
alzdat <- alz %>%
  read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = 1)
# Transpose without the ID column: rows become samples, columns probes.
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL

## Gene ID to Gene Name: keep the ID column plus any Symbol/ORF columns.
geneIDNam <- genena %>%
  read_delim(delim = "\t", comment = "#") %>%
  dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))

## Changing the ID to a Name
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]

## Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)

# ---- Full data: sample annotation followed by expression values ----
Fullalzdw <- ALZDAT %>%
  as.data.frame() %>%
  cbind(ALZWORDF, .)

# ---- Write the results ----
# Build the output names from the digits in the input FILE NAME only.
# basename() strips the directory for "/" paths as well as Windows "\" paths;
# splitting on "\" alone let digits from directory names leak into the
# output name on non-Windows systems.
gse_digits <- gsub("\\D", "", basename(alz))

## MASS is loaded at the top of the file because package load order matters.
nfna <- paste0("GSE", gse_digits, "after.txt")
MASS::write.matrix(Fullalzdw, file = nfna, sep = "\t")

# Transposed copy, perfect for excel viewing
nfnaex <- paste0("GSE", gse_digits, "aftexcel.txt")
write.table(t(Fullalzdw), file = nfnaex, sep = "\t")
199 1 #Libraries required to run the code