Commit 22a75a38eb8c5a8df6acce96b9c4487874143168

Authored by Efrain Gonzalez
1 parent f378e57f40
Exists in master

Most Recent (UNTESTED)

Showing 1 changed file with 324 additions and 0 deletions   Show diff stats
... ... @@ -0,0 +1,324 @@
  1 +#Libraries required to run the code
  2 +library(pryr)
  3 +library(MASS)
  4 +library(dplyr)
  5 +library(tidyr)
  6 +library(readr)
  7 +library(stringr)
  8 +
  9 +
  10 +#Necessary Functions
  11 +#1#Function for handling the changing of row names and column names
  12 +chngrownm <- function(mat){
  13 + row <- dim(mat)[1]
  14 + col <- dim(mat)[2]
  15 + j <- 1
  16 + x <- 1
  17 + p <- 1
  18 + a <- 1
  19 + b <- 1
  20 + g <- 1
  21 + for(j in 1:col){
  22 + if("!Sample_source_name_ch1"==mat[1,j]){
  23 + colnames(mat)[j] <- "Brain_Region"
  24 + }
  25 + if("!Sample_title" == mat[1,j]){
  26 + colnames(mat)[j] <- "Title"
  27 + }
  28 + if("!Sample_geo_accession" == mat[1,j]){
  29 + colnames(mat)[j] <- "ID_REF"
  30 + } else{
  31 + if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){
  32 + colnames(mat)[j] <- paste0("Sex",x)
  33 + x = x + 1
  34 + }
  35 + if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){
  36 + colnames(mat)[j] <- paste0("PMI",p)
  37 + p = p + 1
  38 + }
  39 + if(grepl("age|Age|AGE",mat[2,j])==TRUE){
  40 + colnames(mat)[j] <- paste0("Age",a)
  41 + a = a + 1
  42 + }
  43 + if(grepl("braak|b&b",mat[2,j])==TRUE){
  44 + colnames(mat)[j] <- paste0("Braak",b)
  45 + b = b + 1
  46 + }
  47 + if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",mat[2,j])==TRUE){
  48 + colnames(mat)[j] <- paste0("Group",g)
  49 + g = g + 1
  50 + }
  51 +
  52 + }
  53 + j = j + 1
  54 + }
  55 + mat
  56 +}
  57 +
  58 +#2#Function for reorganizing information within the columns
  59 +cinfo <- function(mat){
  60 + col <- dim(mat)[2]
  61 + j <-2
  62 + for(j in 2:col){
  63 + if(grepl("Group",colnames(mat)[j]) == TRUE){
  64 + mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
  65 + }
  66 + if(grepl("Age",colnames(mat)[j])==TRUE){
  67 + mat[,j] <- gsub("\\D","",mat[,j])%>%
  68 + as.integer()
  69 + }
  70 + if(grepl("Sex",colnames(mat)[j])==TRUE){
  71 + mat[,j] <- gsub(".+:\\s","",mat[,j])
  72 + }
  73 + if(grepl("PMI",colnames(mat)[j])==TRUE){
  74 + mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
  75 + as.numeric()
  76 + }
  77 + if(grepl("Braak",colnames(mat)[j])==TRUE){
  78 + mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
  79 + as.roman()%>%
  80 + as.integer()
  81 + }
  82 + j=j+1
  83 + }
  84 + mat
  85 +}
  86 +
  87 +#3#Function for labeling the gene IDs without names
  88 +NAFIXING <- function(GIDNAM){
  89 + row <- dim(GIDNAM)[1]
  90 + i <- 1
  91 + for(i in 1:row){
  92 + if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){
  93 + GIDNAM[i,2] <- GIDNAM[i,1]
  94 + }
  95 + i <- i + 1
  96 + }
  97 + GIDNAM
  98 +}
  99 +
  100 +##4#Function for changing the gene ID to gene name
  101 +##cgeneID <- function(GeneName,DATA){
  102 +## colGene <- dim(GeneName)[2]
  103 +## j <- 1
  104 +## for(j in 1:colGene){
  105 +## chngsreq <- grep(GeneName[1,j],DATA[1,])
  106 +## if(sum(chngsreq) > 0){
  107 +## #DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
  108 +## DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
  109 +## }
  110 +## j = j+1
  111 +## }
  112 +## DATA
  113 +##}
  114 +#4#Function for changing the gene ID to gene name
  115 +cgeneID <- function(GeneName,DATA){
  116 + colGene <- dim(GeneName)[2]
  117 + j <- 1
  118 + for(j in 1:colGene){
  119 + chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
  120 + if(is.na(sum(chngsreq))==FALSE){
  121 + if(sum(chngsreq) > 0){
  122 + DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
  123 + }
  124 + }
  125 + #if(sum(chngsreq) > 0){
  126 + ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
  127 + #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
  128 + #}
  129 + j = j+1
  130 + }
  131 + DATA
  132 +}
  133 +
  134 +#5#Function for adjusting the gene names
  135 +gcnames <- function(DiData,usecol=1){
  136 + nuruns <- dim(DiData)[2]
  137 + i = 1
  138 + nwnam <- rep("0",length.out=nuruns)
  139 + for(i in 1:nuruns){
  140 + if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){
  141 + nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][usecol]
  142 + } else{
  143 + nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][1]
  144 + }
  145 +
  146 + }
  147 + nwnam
  148 +
  149 +}
  150 +
  151 +
  152 +
  153 +#The Rest of this code will be used every time you want to change a data set
  154 +
  155 +#Getting the series matrix file
  156 +print("Choose the series matrix file that you want to Analyze")
  157 +alz <- file.choose()
  158 +
  159 +#Getting the GPL file
  160 +print("Choose the GPL file that correlates with the above series matrix file")
  161 +genena <- file.choose()
  162 +
  163 +
  164 +#Set working directory based on the directory of the series matrix file Currently only works for windows
  165 +##strsplit(alz,"[\\]") %>%
  166 +## .[[1]] %>%
  167 +## .[-length(.)] %>%
  168 +## paste(.,collapse="/") %>%
  169 +## setwd()
  170 +
  171 +#Find out if it is a soft GPL file or not
  172 +soft <- strsplit(genena,"[\\|/]") %>%
  173 + .[[1]] %>%
  174 + .[length(.)] %>%
  175 + grepl("soft",.)
  176 +
  177 +#Working with the wordy part of the document
  178 +alzword <- alz %>%
  179 + read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
  180 + filter(grepl("!Sample",X1))%>%
  181 + filter(!grepl("!Sample_contact",X1))
  182 +
  183 +##Changing row names and column names:
  184 +ALZWORD <- t(alzword)
  185 +rownames(ALZWORD)=NULL
  186 +colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
  187 +ALZWORD <- chngrownm(ALZWORD)[-1,]
  188 +ALZWORD <- ALZWORD%>%
  189 + as.data.frame()%>%
  190 + dplyr::select(-starts_with("col"))
  191 +
  192 +##Reorganizing information within the columns
  193 +ALZWORDF <- cinfo(ALZWORD)
  194 +
  195 +
  196 +#Working with Actual Data part of file
  197 +alzdat <- alz %>%
  198 + read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
  199 +ALZDAT <- t(alzdat[,-1])
  200 +rownames(ALZDAT)=NULL
  201 +
  202 +##Is there a clean version of the GPL file available?
  203 +gplnum <- strsplit(genena,"[\\|/]") %>%
  204 + .[[1]] %>%
  205 + .[length(.)] %>%
  206 + gsub("\\D","",.)
  207 +clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
  208 +if(clfileex >= 1){
  209 +#use the clean version
  210 +geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
  211 + read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
  212 +
  213 +}
  214 +if(clfileex == 0){
  215 +##Lets Create a clean version
  216 +
  217 +##Gene ID to Gene Name
  218 +###geneIDNam <- genena %>%
  219 +### read_delim(delim="\t",comment = "#")%>%
  220 +### dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
  221 +###problems with the above for soft files
  222 + if(soft == TRUE){
  223 + #gplnum <- strsplit(genena,"[\\|/]") %>%
  224 + # .[[1]] %>%
  225 + # .[length(.)] %>%
  226 + # gsub("\\D","",.)
  227 + #Check to see if there is already a file containing information on soft files
  228 + fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
  229 + if(fileex == 1){
  230 + #Check to see if this GPL soft file has been used before
  231 + IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
  232 + .$GPL_FILE_NUM%>%
  233 + grepl(gplnum,.) %>%
  234 + sum()
  235 + if(IDF == 1){
  236 + IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
  237 + .$GPL_FILE_NUM%>%
  238 + grep(gplnum,.)
  239 + idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
  240 + .$LOC_ID %>%
  241 + .[IDLOCAL]
  242 + geneIDNam <- genena %>%
  243 + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
  244 + dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
  245 + }
  246 + if(IDF == 0){
  247 + #No information on this particular GPL file
  248 + idLOCGPL <- genena %>%
  249 + read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
  250 + t(.) %>%
  251 + grep("^\\D",.) %>%
  252 + length()-1
  253 + cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
  254 + cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
  255 + geneIDNam <- genena %>%
  256 + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
  257 + dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
  258 + }
  259 + }
  260 + if(fileex == 0){
  261 + #We must create a file that we can access for later use
  262 + idLOCGPL <- genena %>%
  263 + read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
  264 + t(.) %>%
  265 + grep("^\\D",.) %>%
  266 + length()-1
  267 + Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
  268 + colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
  269 + write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
  270 + geneIDNam <- genena %>%
  271 + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
  272 + dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
  273 + }
  274 + }
  275 + if(soft == FALSE){
  276 + geneIDNam <- genena %>%
  277 + read_delim(delim="\t",comment = "#")%>%
  278 + dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
  279 + }
  280 +
  281 + ##Labeling the gene IDs without names
  282 + geneIDNam <- NAFIXING(geneIDNam)
  283 +
  284 + ##remove the whitespace
  285 + geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
  286 +
  287 + ##Here is the clean version
  288 + write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
  289 +}
  290 +
  291 +
  292 +
  293 +##Changing the gene ID to gene name
  294 +ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
  295 +colnames(ALZDAT) = ALZDAT1[1,]
  296 +
  297 +
  298 +##Adjusting the column names aka the gene names
  299 +colnames(ALZDAT) <- gcnames(ALZDAT)
  300 +
  301 +
  302 +#Full Data
  303 +Fullalzdw <- ALZDAT %>%
  304 + as.data.frame() %>%
  305 + cbind(ALZWORDF,.)
  306 +
  307 +
  308 +#nfna <- strsplit(alz,"[\\|/]") %>%
  309 +# .[[1]] %>%
  310 +# .[length(.)] %>%
  311 +# gsub("\\D","",.) %>%
  312 +# c("GSE",.,"after.txt") %>%
  313 +# paste(collapse = "")
  314 +#write.matrix(Fullalzdw,file = nfna,sep = "\t")
  315 +
  316 +#Perfect for excel viewing
  317 +nfnaex <- strsplit(alz,"[\\]") %>%
  318 + .[[1]] %>%
  319 + .[length(.)] %>%
  320 + gsub("\\D","",.) %>%
  321 + c("GSE",.,"aftexcel.txt") %>%
  322 + paste(collapse = "")
  323 +write.table(t(Fullalzdw), file = nfnaex, sep = "\t")
  324 +