Commit a66a63dc504c3814768a1a79640a405d82ea0524

Authored by Efrain Gonzalez
1 parent 061033644f
Exists in master

First version of the cleaning process in R

Showing 1 changed file with 198 additions and 0 deletions   Show diff stats
... ... @@ -0,0 +1,198 @@
  1 +#Libraries required to run the code
  2 +library(MASS)
  3 +library(pryr)
  4 +library(dplyr)
  5 +library(tidyr)
  6 +library(readr)
  7 +library(stringr)
  8 +
  9 +
  10 +#Necessary Functions
  11 +#1#Function for handling the changing of row names and column names
  12 +chngrownm <- function(mat){
  13 + row <- dim(mat)[1]
  14 + col <- dim(mat)[2]
  15 + j <- 1
  16 + x <- 1
  17 + p <- 1
  18 + a <- 1
  19 + b <- 1
  20 + g <- 1
  21 + for(j in 1:col){
  22 + if("!Sample_source_name_ch1"==mat[1,j]){
  23 + colnames(mat)[j] <- "Brain_Region"
  24 + }
  25 + if("!Sample_title" == mat[1,j]){
  26 + colnames(mat)[j] <- "Title"
  27 + }
  28 + if("!Sample_geo_accession" == mat[1,j]){
  29 + colnames(mat)[j] <- "ID_REF"
  30 + } else{
  31 + if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){
  32 + colnames(mat)[j] <- paste0("Sex",x)
  33 + x = x + 1
  34 + }
  35 + if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){
  36 + colnames(mat)[j] <- paste0("PMI",p)
  37 + p = p + 1
  38 + }
  39 + if(grepl("age|Age|AGE",mat[2,j])==TRUE){
  40 + colnames(mat)[j] <- paste0("Age",a)
  41 + a = a + 1
  42 + }
  43 + if(grepl("braak|b&b",mat[2,j])==TRUE){
  44 + colnames(mat)[j] <- paste0("Braak",b)
  45 + b = b + 1
  46 + }
  47 + if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",mat[2,j])==TRUE){
  48 + colnames(mat)[j] <- paste0("Group",g)
  49 + g = g + 1
  50 + }
  51 +
  52 + }
  53 + j = j + 1
  54 + }
  55 + mat
  56 +}
  57 +
  58 +#2#Function for reorganizing information within the columns
  59 +cinfo <- function(mat){
  60 + col <- dim(mat)[2]
  61 + j <-2
  62 + for(j in 2:col){
  63 + if(grepl("Group",colnames(mat)[j]) == TRUE){
  64 + mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
  65 + }
  66 + if(grepl("Age",colnames(mat)[j])==TRUE){
  67 + mat[,j] <- gsub("\\D","",mat[,j])%>%
  68 + as.integer()
  69 + }
  70 + if(grepl("Sex",colnames(mat)[j])==TRUE){
  71 + mat[,j] <- gsub(".+:\\s","",mat[,j])
  72 + }
  73 + if(grepl("PMI",colnames(mat)[j])==TRUE){
  74 + mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
  75 + as.numeric()
  76 + }
  77 + if(grepl("Braak",colnames(mat)[j])==TRUE){
  78 + mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
  79 + as.roman()%>%
  80 + as.integer()
  81 + }
  82 + j=j+1
  83 + }
  84 + mat
  85 +}
  86 +
  87 +#3#Function for changing the gene ID to gene name
  88 +cgeneID <- function(GeneName,DATA){
  89 + colGene <- dim(GeneName)[2]
  90 + j <- 1
  91 + for(j in 1:colGene){
  92 + chngsreq <- grep(GeneName[1,j],DATA[1,])
  93 + DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
  94 + j = j+1
  95 + }
  96 + DATA
  97 +}
  98 +
  99 +#4#Function for adjusting the gene names
  100 +gcnames <- function(DiData,usecol=1){
  101 + nuruns <- dim(DiData)[2]
  102 + i = 1
  103 + nwnam <- rep("0",length.out=nuruns)
  104 + for(i in 1:nuruns){
  105 + if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){
  106 + nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][usecol]
  107 + } else{
  108 + nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][1]
  109 + }
  110 +
  111 + }
  112 + nwnam
  113 +
  114 +}
  115 +
  116 +
  117 +
  118 +#The Rest of this code will be used every time you want to change a data set
  119 +
  120 +#Getting the series matrix file
  121 +print("Choose the series matrix file that you want to Analyze")
  122 +alz <- file.choose()
  123 +
  124 +#Getting the GPL file
  125 +print("Choose the GPL file that correlates with the above series matrix file")
  126 +genena <- file.choose()
  127 +
  128 +
  129 +#Set working directory based on the directory of the series matrix file
  130 +##strsplit(alz,"[\\]") %>%
  131 +## .[[1]] %>%
  132 +## .[-length(.)] %>%
  133 +## paste(.,collapse="/") %>%
  134 +## setwd()
  135 +
  136 +
  137 +#Working with the wordy part of the document
  138 +alzword <- alz %>%
  139 + read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
  140 + filter(grepl("!Sample",X1))%>%
  141 + filter(!grepl("!Sample_contact",X1))
  142 +
  143 +##Changing row names and column names:
  144 +ALZWORD <- t(alzword)
  145 +rownames(ALZWORD)=NULL
  146 +colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
  147 +ALZWORD <- chngrownm(ALZWORD)[-1,]
  148 +ALZWORD <- ALZWORD%>%
  149 + as.data.frame()%>%
  150 + dplyr::select(-starts_with("col"))
  151 +
  152 +##Reorganizing information within the columns
  153 +ALZWORDF <- cinfo(ALZWORD)
  154 +
  155 +
  156 +#Working with Actual Data part of file
  157 +alzdat <- alz %>%
  158 + read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
  159 +ALZDAT <- t(alzdat[,-1])
  160 +rownames(ALZDAT)=NULL
  161 +
  162 +
  163 +##Gene ID to Gene Name
  164 +geneIDNam <- genena %>%
  165 + read_delim(delim="\t",comment = "#")%>%
  166 + dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
  167 +
  168 +##Changing the ID to a Name
  169 +ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
  170 +colnames(ALZDAT) = ALZDAT1[1,]
  171 +
  172 +
  173 +##Adjusting the column names aka the gene names
  174 +colnames(ALZDAT) <- gcnames(ALZDAT)
  175 +
  176 +
  177 +#Full Data
  178 +Fullalzdw <- ALZDAT %>%
  179 + as.data.frame() %>%
  180 + cbind(ALZWORDF,.)
  181 +
  182 +##since the order in which the packages are added matters I moved this package to the top
  183 +##library(MASS)
  184 +nfna <- strsplit(alz,"[\\]") %>%
  185 + .[[1]] %>%
  186 + .[length(.)] %>%
  187 + gsub("\\D","",.) %>%
  188 + c("GSE",.,"after.txt") %>%
  189 + paste(collapse = "")
  190 +MASS::write.matrix(Fullalzdw,file = nfna,sep = "\t")
  191 +#Perfect for excel viewing
  192 +nfnaex <- strsplit(alz,"[\\]") %>%
  193 + .[[1]] %>%
  194 + .[length(.)] %>%
  195 + gsub("\\D","",.) %>%
  196 + c("GSE",.,"aftexcel.txt") %>%
  197 + paste(collapse = "")
  198 +write.table(t(Fullalzdw), file = nfnaex, sep = "\t")