Commit cc59b7f8323cecc33ca1facf20f024c2a1b5a73e

Authored by Efrain Gonzalez
1 parent c0625ba184
Exists in master

Second version of code (UNTESTED)

Showing 1 changed file with 284 additions and 0 deletions   Show diff stats
... ... @@ -0,0 +1,284 @@
  1 +#Libraries required to run the code
  2 +library(pryr)
  3 +library(MASS)
  4 +library(dplyr)
  5 +library(tidyr)
  6 +library(readr)
  7 +library(stringr)
  8 +
  9 +
  10 +#Necessary Functions
#1#Function for handling the changing of row names and column names
# Renames the columns of a transposed "!Sample_*" table.
#
# mat: character matrix; row 1 holds the field tags (e.g. "!Sample_title"),
#      row 2 holds the first value of each field (e.g. "age: 85").
# Returns `mat` with updated column names; cell contents are not modified.
#
# Naming rules:
#  * the three fixed tags below get a fixed name and are never re-labelled by
#    the characteristic patterns (previously only "!Sample_geo_accession" was
#    protected, so a title such as "AD_1" ended up named "Group1");
#  * any other column is tested against the characteristic patterns using its
#    row-2 value. When several patterns match, the LAST one in `patterns`
#    decides (so "braak stage: III", which contains "age", becomes a Braak
#    column), and only that label's counter advances, keeping the numbering
#    gap-free.
chngrownm <- function(mat){
  fixed_tags <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )
  patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control"
  )
  counters <- setNames(rep(1L, length(patterns)), names(patterns))

  # Fall back to "col1", "col2", ... when the matrix has no column names yet
  new_names <- colnames(mat, do.NULL = FALSE)
  # Row 2 is needed for the characteristic patterns
  has_values <- nrow(mat) >= 2

  for (j in seq_len(ncol(mat))) {
    fixed_hit <- match(mat[1, j], names(fixed_tags))
    if (!is.na(fixed_hit)) {
      new_names[j] <- fixed_tags[[fixed_hit]]
      next
    }
    if (!has_values) {
      next
    }
    is_match <- vapply(patterns, grepl, logical(1), x = mat[2, j])
    matched <- names(patterns)[is_match]
    if (length(matched) > 0) {
      label <- matched[length(matched)]
      new_names[j] <- paste0(label, counters[[label]])
      counters[[label]] <- counters[[label]] + 1L
    }
  }

  colnames(mat) <- new_names
  mat
}
  57 +
#2#Function for reorganizing information within the columns
# Cleans the "key: value" cells of the columns named by chngrownm().
#
# mat: data frame whose column names contain "Group", "Age", "Sex", "PMI" or
#      "Braak" for the columns to clean. Column 1 is never touched (the
#      original loop also started at column 2).
# Returns `mat` with:
#   Group* -> text after "key: " (and anything from " ...;..." on removed)
#   Age*   -> integer (a decimal age is truncated, e.g. "age: 85.5" -> 85;
#             previously every non-digit was stripped, giving 855)
#   Sex*   -> text after "key: "
#   PMI*   -> numeric
#   Braak* -> integer converted from a roman numeral
cinfo <- function(mat){
  col_names <- colnames(mat)

  # Drop the "key:" label first so digits or dots inside the label
  # (e.g. "pmi (hrs.): 12") cannot leak into the number, then keep only
  # digits and the decimal point.
  extract_number <- function(x) {
    value <- sub("^[^:]*:\\s*", "", x)
    as.numeric(gsub("[^0-9\\.]", "", value))
  }

  # seq_len(...)[-1] is empty for 0 or 1 columns, unlike 2:col
  for (j in seq_len(ncol(mat))[-1]) {
    nm <- col_names[j]
    if (grepl("Group", nm)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", nm)) {
      mat[, j] <- as.integer(extract_number(mat[, j]))
    }
    if (grepl("Sex", nm)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", nm)) {
      mat[, j] <- extract_number(mat[, j])
    }
    if (grepl("Braak", nm)) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
  86 +
#3#Function for labeling the gene IDs without names
# Gives every gene without a name a unique placeholder "NA1", "NA2", ...
#
# GIDNAM: data frame / tibble / character matrix; column 2 holds the names.
# Returns GIDNAM with the missing entries of column 2 replaced, numbered in
# row order. An entry counts as missing when it is a real NA or the text "NA"
# (optionally followed by whitespace). The real-NA case was previously only
# caught through implicit coercion of a one-cell tibble; it is now explicit,
# and zero-row input is returned unchanged instead of failing.
NAFIXING <- function(GIDNAM){
  stopifnot(ncol(GIDNAM) >= 2)
  is_df <- is.data.frame(GIDNAM)
  gene_names <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]

  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)
  if (any(unnamed)) {
    # as.character() also protects factor columns from "invalid level" NAs
    gene_names <- as.character(gene_names)
    gene_names[unnamed] <- paste0("NA", seq_len(sum(unnamed)))
    if (is_df) {
      GIDNAM[[2]] <- gene_names
    } else {
      GIDNAM[, 2] <- gene_names
    }
  }
  GIDNAM
}
  101 +
#4#Function for changing the gene ID to gene name
# Replaces the IDs in the first row of DATA by their gene names.
#
# GeneName: matrix with the IDs in row 1 and the matching names in row 2.
# DATA:     matrix whose first row holds the IDs to translate.
# Returns DATA with row 1 translated; IDs without an entry in GeneName are
# left as they are.
#
# IDs are matched exactly. The previous version used each ID as a regular
# expression and replaced substrings, so ID "1_at" also rewrote "11_at"
# (to "1<name>"), and it ran one grep per gene over all probes.
cgeneID <- function(GeneName,DATA){
  data_ids <- trimws(DATA[1, ])
  # NA and empty IDs must never be treated as a match
  hit <- match(data_ids, GeneName[1, ], incomparables = c(NA, ""))
  found <- !is.na(hit)
  DATA[1, found] <- GeneName[2, hit[found]]
  DATA
}
  113 +
#5#Function for adjusting the gene names
# Picks one name out of column names that list several names separated by
# "///" (e.g. "DDR1 /// MIR4640").
#
# DiData: matrix or data frame with column names.
# usecol: which of the "///"-separated names to keep (default: the first).
#         Columns with fewer than `usecol` names fall back to their first.
# Returns a character vector with one name per column, in column order.
# Whitespace around each name is removed (previously "DDR1 /// MIR4640"
# produced "DDR1 " with a trailing blank).
gcnames <- function(DiData,usecol=1){
  if (ncol(DiData) == 0) {
    return(character(0))
  }
  name_parts <- strsplit(colnames(DiData), "///", fixed = TRUE)
  pick_name <- function(parts) {
    parts <- trimws(parts)
    if (length(parts) >= usecol) parts[usecol] else parts[1]
  }
  vapply(name_parts, pick_name, character(1), USE.NAMES = FALSE)
}
  130 +
  131 +
  132 +
  133 +#The Rest of this code will be used every time you want to change a data set
  134 +
#Ask for the series matrix file
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

#Ask for the GPL file that belongs to it
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()


#Disabled: set the working directory to the folder of the series matrix file
#(splits on "\" only, so it currently only works for windows)
##strsplit(alz,"[\\]") %>%
## .[[1]] %>%
## .[-length(.)] %>%
## paste(.,collapse="/") %>%
## setwd()

#The GPL file is treated as a soft file when its file name (last path
#component) contains "soft"
soft <- local({
  path_parts <- strsplit(genena, "[\\|/]")[[1]]
  grepl("soft", path_parts[length(path_parts)])
})
  156 +
#Working with the wordy part of the document
#Lines starting with "!Series" are skipped by the reader; of the rest only the
#"!Sample" rows are kept, minus the "!Sample_contact" ones
alzword <- read_delim(alz, delim = "\t", comment = "!Series", col_names = FALSE)
alzword <- filter(alzword, grepl("!Sample", X1), !grepl("!Sample_contact", X1))

##Changing row names and column names:
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
#Placeholder names "col1", "col2", ...; columns still called "col*" after
#chngrownm() are dropped below
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
#Row 1 (the field tags) is removed; drop = FALSE keeps the matrix shape when
#only one row remains, which previously collapsed to a vector
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
#stringsAsFactors = FALSE gives character columns on every R version
ALZWORD <- ALZWORD %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  dplyr::select(-starts_with("col"))

##Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)
  174 +
  175 +
#Working with Actual Data part of file
#The first line is skipped and every line starting with "!" is treated as a
#comment; the first remaining line supplies the column names
alzdat <- read_delim(alz, delim = "\t", col_names = TRUE, comment = "!", skip = 1)
#Drop the first column and transpose the rest
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL
  181 +
  182 +
##Gene ID to Gene Name
###geneIDNam <- genena %>%
### read_delim(delim="\t",comment = "#")%>%
### dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
###problems with the above for soft files

#File that remembers, per GPL number, how many lines to skip in a soft file
gpl_cache_file <- "GPL_ID_LOC.txt"

#Read a GPL table and keep the ID column plus every column whose name
#contains "Symbol" or "ORF". Extra arguments go to read_delim().
read_gpl_ids <- function(path, ...) {
  gpl_table <- read_delim(path, delim = "\t", ...)
  dplyr::select(gpl_table, ID, grep("Symbol|ORF", colnames(gpl_table)))
}

#Number of lines to skip in a soft file (unchanged heuristic): read up to 1000
#rows ignoring "!" lines, count the cells that start with a non-digit, minus 1
find_gpl_skip <- function(path) {
  first_rows <- read_delim(path, delim = "\t", col_names = FALSE,
                           comment = "!", n_max = 1000)
  length(grep("^\\D", t(first_rows))) - 1
}

if (soft) {
  #GPL number = digits of the file name
  gpl_path_parts <- strsplit(genena, "[\\|/]")[[1]]
  gplnum <- gsub("\\D", "", gpl_path_parts[length(gpl_path_parts)])

  #file.exists() instead of counting regex matches in list.files(): a second
  #file such as "GPL_ID_LOC.txt.bak" used to leave geneIDNam undefined
  cache_exists <- file.exists(gpl_cache_file)
  known <- integer(0)
  if (cache_exists) {
    gpl_cache <- read_delim(gpl_cache_file, delim = "\t", col_names = TRUE)
    #Exact comparison: a substring test let GPL "57" match a stored "570"
    known <- which(as.integer(gpl_cache$GPL_FILE_NUM) == as.integer(gplnum))
  }

  if (length(known) > 0) {
    #This GPL soft file has been used before
    idlocgpl <- gpl_cache$LOC_ID[known[1]]
  } else {
    #No information on this GPL file yet: work it out and remember it. The
    #header line is only written when the cache file is created.
    idlocgpl <- find_gpl_skip(genena)
    new_entry <- data.frame(GPL_FILE_NUM = as.integer(gplnum),
                            LOC_ID = as.integer(idlocgpl))
    write.table(new_entry, file = gpl_cache_file, sep = "\t",
                row.names = FALSE, col.names = !cache_exists,
                append = cache_exists)
  }

  geneIDNam <- read_gpl_ids(genena, col_names = TRUE, comment = "!",
                            skip = idlocgpl)
} else {
  geneIDNam <- read_gpl_ids(genena, comment = "#")
}
  246 +
##Give the gene IDs without a name a placeholder name
geneIDNam <- NAFIXING(geneIDNam)

##Strip surrounding whitespace from the first two columns; the result is a
##two-column matrix
geneIDNam <- cbind(
  str_trim(t(geneIDNam)[1, ]),
  str_trim(t(geneIDNam)[2, ]),
  deparse.level = 0
)

##Translate the gene IDs to gene names and use them as column names
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]


##Keep a single name per column where several are listed
colnames(ALZDAT) <- gcnames(ALZDAT)
  260 +
  261 +
#Full Data
Fullalzdw <- cbind(ALZWORDF, as.data.frame(ALZDAT))


#Digits of the series matrix file NAME. The path is split on "\" and "/" (the
#same split used for the GPL file); splitting on "\" alone left "/" paths
#whole, so digits from directory names ended up in the output file name.
alz_path_parts <- strsplit(alz, "[\\|/]")[[1]]
gse_digits <- gsub("\\D", "", alz_path_parts[length(alz_path_parts)])

nfna <- paste0("GSE", gse_digits, "after.txt")
write.matrix(Fullalzdw, file = nfna, sep = "\t")
#Perfect for excel viewing
nfnaex <- paste0("GSE", gse_digits, "aftexcel.txt")
write.table(t(Fullalzdw), file = nfnaex, sep = "\t")
  283 +
  284 +