Commit 58ba5d0b06bb84f11c3fc28a88967881141849b5

Authored by Efrain Gonzalez
1 parent 18c7602e69
Exists in master

New version which includes the storing of the clean version of the GPL file (UNTESTED)

Showing 1 changed file with 302 additions and 0 deletions   Show diff stats
... ... @@ -0,0 +1,302 @@
  1 +#Libraries required to run the code
  2 +library(pryr)
  3 +library(MASS)
  4 +library(dplyr)
  5 +library(tidyr)
  6 +library(readr)
  7 +library(stringr)
  8 +
  9 +
  10 +#Necessary Functions
  11 +#1#Function for handling the changing of row names and column names
  12 +chngrownm <- function(mat){
  13 + row <- dim(mat)[1]
  14 + col <- dim(mat)[2]
  15 + j <- 1
  16 + x <- 1
  17 + p <- 1
  18 + a <- 1
  19 + b <- 1
  20 + g <- 1
  21 + for(j in 1:col){
  22 + if("!Sample_source_name_ch1"==mat[1,j]){
  23 + colnames(mat)[j] <- "Brain_Region"
  24 + }
  25 + if("!Sample_title" == mat[1,j]){
  26 + colnames(mat)[j] <- "Title"
  27 + }
  28 + if("!Sample_geo_accession" == mat[1,j]){
  29 + colnames(mat)[j] <- "ID_REF"
  30 + } else{
  31 + if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){
  32 + colnames(mat)[j] <- paste0("Sex",x)
  33 + x = x + 1
  34 + }
  35 + if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){
  36 + colnames(mat)[j] <- paste0("PMI",p)
  37 + p = p + 1
  38 + }
  39 + if(grepl("age|Age|AGE",mat[2,j])==TRUE){
  40 + colnames(mat)[j] <- paste0("Age",a)
  41 + a = a + 1
  42 + }
  43 + if(grepl("braak|b&b",mat[2,j])==TRUE){
  44 + colnames(mat)[j] <- paste0("Braak",b)
  45 + b = b + 1
  46 + }
  47 + if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",mat[2,j])==TRUE){
  48 + colnames(mat)[j] <- paste0("Group",g)
  49 + g = g + 1
  50 + }
  51 +
  52 + }
  53 + j = j + 1
  54 + }
  55 + mat
  56 +}
  57 +
  58 +#2#Function for reorganizing information within the columns
  59 +cinfo <- function(mat){
  60 + col <- dim(mat)[2]
  61 + j <-2
  62 + for(j in 2:col){
  63 + if(grepl("Group",colnames(mat)[j]) == TRUE){
  64 + mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
  65 + }
  66 + if(grepl("Age",colnames(mat)[j])==TRUE){
  67 + mat[,j] <- gsub("\\D","",mat[,j])%>%
  68 + as.integer()
  69 + }
  70 + if(grepl("Sex",colnames(mat)[j])==TRUE){
  71 + mat[,j] <- gsub(".+:\\s","",mat[,j])
  72 + }
  73 + if(grepl("PMI",colnames(mat)[j])==TRUE){
  74 + mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
  75 + as.numeric()
  76 + }
  77 + if(grepl("Braak",colnames(mat)[j])==TRUE){
  78 + mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
  79 + as.roman()%>%
  80 + as.integer()
  81 + }
  82 + j=j+1
  83 + }
  84 + mat
  85 +}
  86 +
  87 +#3#Function for labeling the gene IDs without names
  88 +NAFIXING <- function(GIDNAM){
  89 + row <- dim(GIDNAM)[1]
  90 + i <- 1
  91 + for(i in 1:row){
  92 + if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){
  93 + GIDNAM[i,2] <- GIDNAM[i,1]
  94 + }
  95 + i <- i + 1
  96 + }
  97 + GIDNAM
  98 +}
  99 +
  100 +#4#Function for changing the gene ID to gene name
  101 +cgeneID <- function(GeneName,DATA){
  102 + colGene <- dim(GeneName)[2]
  103 + j <- 1
  104 + for(j in 1:colGene){
  105 + chngsreq <- grep(GeneName[1,j],DATA[1,])
  106 + #DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
  107 + DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
  108 + j = j+1
  109 + }
  110 + DATA
  111 +}
  112 +
  113 +#5#Function for adjusting the gene names
  114 +gcnames <- function(DiData,usecol=1){
  115 + nuruns <- dim(DiData)[2]
  116 + i = 1
  117 + nwnam <- rep("0",length.out=nuruns)
  118 + for(i in 1:nuruns){
  119 + if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){
  120 + nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][usecol]
  121 + } else{
  122 + nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][1]
  123 + }
  124 +
  125 + }
  126 + nwnam
  127 +
  128 +}
  129 +
  130 +
  131 +
  132 +#The Rest of this code will be used every time you want to change a data set
  133 +
  134 +#Getting the series matrix file
  135 +print("Choose the series matrix file that you want to Analyze")
  136 +alz <- file.choose()
  137 +
  138 +#Getting the GPL file
  139 +print("Choose the GPL file that correlates with the above series matrix file")
  140 +genena <- file.choose()
  141 +
  142 +
  143 +#Set working directory based on the directory of the series matrix file Currently only works for windows
  144 +##strsplit(alz,"[\\]") %>%
  145 +## .[[1]] %>%
  146 +## .[-length(.)] %>%
  147 +## paste(.,collapse="/") %>%
  148 +## setwd()
  149 +
  150 +#Find out if it is a soft GPL file or not
  151 +soft <- strsplit(genena,"[\\|/]") %>%
  152 + .[[1]] %>%
  153 + .[length(.)] %>%
  154 + grepl("soft",.)
  155 +
  156 +#Working with the wordy part of the document
  157 +alzword <- alz %>%
  158 + read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
  159 + filter(grepl("!Sample",X1))%>%
  160 + filter(!grepl("!Sample_contact",X1))
  161 +
  162 +##Changing row names and column names:
  163 +ALZWORD <- t(alzword)
  164 +rownames(ALZWORD)=NULL
  165 +colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
  166 +ALZWORD <- chngrownm(ALZWORD)[-1,]
  167 +ALZWORD <- ALZWORD%>%
  168 + as.data.frame()%>%
  169 + dplyr::select(-starts_with("col"))
  170 +
  171 +##Reorganizing information within the columns
  172 +ALZWORDF <- cinfo(ALZWORD)
  173 +
  174 +
  175 +#Working with Actual Data part of file
  176 +alzdat <- alz %>%
  177 + read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
  178 +ALZDAT <- t(alzdat[,-1])
  179 +rownames(ALZDAT)=NULL
  180 +
  181 +##Is there a clean version of the GPL file available?
  182 +gplnum <- strsplit(genena,"[\\|/]") %>%
  183 + .[[1]] %>%
  184 + .[length(.)] %>%
  185 + gsub("\\D","",.)
  186 +clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
  187 +if(clfileex >= 1){
  188 +#use the clean version
  189 +geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
  190 + read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
  191 +
  192 +}
  193 +if(clfileex == 0){
  194 +##Lets Create a clean version
  195 +
  196 +##Gene ID to Gene Name
  197 +###geneIDNam <- genena %>%
  198 +### read_delim(delim="\t",comment = "#")%>%
  199 +### dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
  200 +###problems with the above for soft files
  201 + if(soft == TRUE){
  202 + #gplnum <- strsplit(genena,"[\\|/]") %>%
  203 + # .[[1]] %>%
  204 + # .[length(.)] %>%
  205 + # gsub("\\D","",.)
  206 + #Check to see if there is already a file containing information on soft files
  207 + fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
  208 + if(fileex == 1){
  209 + #Check to see if this GPL soft file has been used before
  210 + IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
  211 + .$GPL_FILE_NUM%>%
  212 + grepl(gplnum,.) %>%
  213 + sum()
  214 + if(IDF == 1){
  215 + IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
  216 + .$GPL_FILE_NUM%>%
  217 + grep(gplnum,.)
  218 + idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
  219 + .$LOC_ID %>%
  220 + .[IDLOCAL]
  221 + geneIDNam <- genena %>%
  222 + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
  223 + dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
  224 + }
  225 + if(IDF == 0){
  226 + #No information on this particular GPL file
  227 + idLOCGPL <- genena %>%
  228 + read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
  229 + t(.) %>%
  230 + grep("^\\D",.) %>%
  231 + length()-1
  232 + cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
  233 + cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
  234 + geneIDNam <- genena %>%
  235 + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
  236 + dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
  237 + }
  238 + }
  239 + if(fileex == 0){
  240 + #We must create a file that we can access for later use
  241 + idLOCGPL <- genena %>%
  242 + read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
  243 + t(.) %>%
  244 + grep("^\\D",.) %>%
  245 + length()-1
  246 + Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
  247 + colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
  248 + write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
  249 + geneIDNam <- genena %>%
  250 + read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
  251 + dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
  252 + }
  253 + }
  254 + if(soft == FALSE){
  255 + geneIDNam <- genena %>%
  256 + read_delim(delim="\t",comment = "#")%>%
  257 + dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
  258 + }
  259 +
  260 + ##Labeling the gene IDs without names
  261 + geneIDNam <- NAFIXING(geneIDNam)
  262 +
  263 + ##remove the whitespace
  264 + geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
  265 +
  266 + ##Here is the clean version
  267 + write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
  268 +}
  269 +
  270 +
  271 +
  272 +##Changing the gene ID to gene name
  273 +ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
  274 +colnames(ALZDAT) = ALZDAT1[1,]
  275 +
  276 +
  277 +##Adjusting the column names aka the gene names
  278 +colnames(ALZDAT) <- gcnames(ALZDAT)
  279 +
  280 +
  281 +#Full Data
  282 +Fullalzdw <- ALZDAT %>%
  283 + as.data.frame() %>%
  284 + cbind(ALZWORDF,.)
  285 +
  286 +
  287 +#nfna <- strsplit(alz,"[\\|/]") %>%
  288 +# .[[1]] %>%
  289 +# .[length(.)] %>%
  290 +# gsub("\\D","",.) %>%
  291 +# c("GSE",.,"after.txt") %>%
  292 +# paste(collapse = "")
  293 +#write.matrix(Fullalzdw,file = nfna,sep = "\t")
  294 +
  295 +#Perfect for excel viewing
  296 +nfnaex <- strsplit(alz,"[\\]") %>%
  297 + .[[1]] %>%
  298 + .[length(.)] %>%
  299 + gsub("\\D","",.) %>%
  300 + c("GSE",.,"aftexcel.txt") %>%
  301 + paste(collapse = "")
  302 +write.table(t(Fullalzdw), file = nfnaex, sep = "\t",row.names = FALSE)
0 303 \ No newline at end of file