From cc59b7f8323cecc33ca1facf20f024c2a1b5a73e Mon Sep 17 00:00:00 2001
From: Efrain Gonzalez
Date: Tue, 30 May 2017 11:51:02 -0400
Subject: [PATCH] Second version of code (UNTESTED)

---
Review note: RClean2.R was revised during review; the diffstat and hunk size
below were recounted, the index blob id was not regenerated.

 RClean2.R | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 247 insertions(+)
 create mode 100644 RClean2.R

diff --git a/RClean2.R b/RClean2.R
new file mode 100644
index 0000000..cb8981d
--- /dev/null
+++ b/RClean2.R
@@ -0,0 +1,247 @@
+#Libraries required to run the code
+library(pryr)
+library(MASS)
+library(dplyr)
+library(tidyr)
+library(readr)
+library(stringr)
+
+
+#Necessary Functions
+#1#Function for handling the changing of row names and column names
+# mat: character matrix with column names set; row 1 holds the "!Sample_*"
+#      field labels and row 2 holds the first sample's values.
+# Returns mat with every recognised column renamed; other names are kept.
+chngrownm <- function(mat){
+  #Fields recognised by their row 1 label never reach the keyword search
+  fixednm <- c("!Sample_source_name_ch1" = "Brain_Region",
+               "!Sample_title" = "Title",
+               "!Sample_geo_accession" = "ID_REF")
+  #Keyword searches on row 2, highest priority first: the first pattern that
+  #matches names the column ("braak stage" contains "age" but is a Braak column)
+  keywrd <- c(Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control",
+              Braak = "braak|b&b",
+              Age = "age|Age|AGE",
+              PMI = "postmorteminterval|PMI|pmi",
+              Sex = "Sex|gender|Gender|sex")
+  #One running number per label; only the label actually used is advanced
+  nxt <- setNames(rep(1L,length(keywrd)),names(keywrd))
+  for(j in seq_len(ncol(mat))){
+    field <- mat[1,j]
+    if(field %in% names(fixednm)){
+      colnames(mat)[j] <- fixednm[[field]]
+      next
+    }
+    hits <- vapply(keywrd,grepl,logical(1),x = mat[2,j])
+    if(any(hits)){
+      label <- names(keywrd)[which(hits)[1]]
+      colnames(mat)[j] <- paste0(label,nxt[[label]])
+      nxt[[label]] <- nxt[[label]] + 1L
+    }
+  }
+  mat
+}
+
+#2#Function for reorganizing information within the columns
+# mat: sample table whose columns were named by chngrownm.
+# Returns mat with the Group/Age/Sex/PMI/Braak columns reduced to their values.
+# Column 1 is never rewritten; the loop has always started at column 2.
+cinfo <- function(mat){
+  #seq_len()[-1] is empty for a one-column table, where 2:col would count down
+  for(j in seq_len(ncol(mat))[-1]){
+    colnm <- colnames(mat)[j]
+    if(grepl("Group",colnm)){
+      #Drop the "label: " prefix and any trailing " ...;..." remainder
+      mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
+    }
+    if(grepl("Age",colnm)){
+      #NOTE(review): every non-digit is removed, so "85.5" would become 855
+      mat[,j] <- as.integer(gsub("\\D","",mat[,j]))
+    }
+    if(grepl("Sex",colnm)){
+      mat[,j] <- gsub(".+:\\s","",mat[,j])
+    }
+    if(grepl("PMI",colnm)){
+      mat[,j] <- as.numeric(gsub("[^0-9\\.]","",mat[,j]))
+    }
+    if(grepl("Braak",colnm)){
+      #The stage is read as a roman numeral and stored as an integer
+      mat[,j] <- as.integer(as.roman(gsub(".+:\\s","",mat[,j])))
+    }
+  }
+  mat
+}
+
+#3#Function for labeling the gene IDs without names
+# GIDNAM: table with the gene ID in column 1 and the gene name in column 2.
+# Returns GIDNAM with each missing name replaced by "NA1", "NA2", ... in row order.
+NAFIXING <- function(GIDNAM){
+  isdf <- is.data.frame(GIDNAM)
+  genenm <- as.character(if(isdf) GIDNAM[[2]] else GIDNAM[,2])
+  #A name is missing when it is a real NA or the text "NA" (trailing blanks allowed)
+  unnamed <- is.na(genenm) | grepl("^NA\\s*$",genenm)
+  genenm[unnamed] <- paste0("NA",seq_len(sum(unnamed)))
+  if(isdf){
+    GIDNAM[[2]] <- genenm
+  } else{
+    GIDNAM[,2] <- genenm
+  }
+  GIDNAM
+}
+
+#4#Function for changing the gene ID to gene name
+# GeneName: matrix with the gene IDs in row 1 and the gene names in row 2.
+# DATA: matrix whose first row holds the gene IDs to translate.
+# Returns DATA with each row 1 ID that is listed in GeneName replaced by its name.
+cgeneID <- function(GeneName,DATA){
+  #IDs are compared as whole strings; used as regular expressions, a short ID
+  #also rewrote part of every longer ID that contained it
+  found <- match(DATA[1,],GeneName[1,],incomparables = NA)
+  known <- !is.na(found)
+  DATA[1,known] <- GeneName[2,found[known]]
+  DATA
+}
+
+#5#Function for adjusting the gene names
+# DiData: matrix or data frame whose column names may list several gene names
+#         separated by "///".
+# usecol: position of the name to keep; a column with fewer names keeps its first.
+# Returns a character vector with one name per column.
+gcnames <- function(DiData,usecol=1){
+  allnm <- colnames(DiData)
+  if(length(allnm) == 0){
+    return(character(0))
+  }
+  vapply(strsplit(allnm,"///"),function(parts){
+    if(length(parts) >= usecol) parts[usecol] else parts[1]
+  },character(1))
+}
+
+
+
+#The rest of this code is run every time you want to change a data set
+
+#Getting the series matrix file
+print("Choose the series matrix file that you want to Analyze")
+alz <- file.choose()
+
+#Getting the GPL file
+print("Choose the GPL file that correlates with the above series matrix file")
+genena <- file.choose()
+
+
+#Set working directory based on the directory of the series matrix file (currently only works on Windows)
+##strsplit(alz,"[\\]") %>%
+##  .[[1]] %>%
+##  .[-length(.)] %>%
+##  paste(.,collapse="/") %>%
+##  setwd()
+
+#Find out if it is a soft GPL file or not
+#basename() splits the path on the separators of the running platform
+soft <- grepl("soft",basename(genena))
+
+#Working with the wordy part of the document
+alzword <- alz %>%
+  read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
+  filter(grepl("!Sample",X1))%>%
+  filter(!grepl("!Sample_contact",X1))
+
+##Changing row names and column names:
+ALZWORD <- t(alzword)
+rownames(ALZWORD) <- NULL
+colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
+#drop = FALSE keeps the matrix shape when the file holds a single sample
+ALZWORD <- chngrownm(ALZWORD)[-1,,drop = FALSE]
+ALZWORD <- ALZWORD%>%
+  as.data.frame()%>%
+  dplyr::select(-starts_with("col"))
+
+##Reorganizing information within the columns
+ALZWORDF <- cinfo(ALZWORD)
+
+
+#Working with Actual Data part of file
+alzdat <- alz %>%
+  read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
+ALZDAT <- t(alzdat[,-1])
+rownames(ALZDAT) <- NULL
+
+
+##Gene ID to Gene Name
+###geneIDNam <- genena %>%
+###  read_delim(delim="\t",comment = "#")%>%
+###  dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
+###problems with the above for soft files
+if(soft){
+  #Digits of the GPL file name identify the platform in GPL_ID_LOC.txt
+  gplnum <- gsub("\\D","",basename(genena))
+  gplloc <- "GPL_ID_LOC.txt"
+  #Number of lines to skip before the annotation table; NA until it is known
+  idLOCGPL <- NA
+  #Check to see if there is already a file containing information on soft files
+  fileex <- file.exists(gplloc)
+  if(fileex){
+    #Check to see if this GPL soft file has been used before. The number is
+    #compared as a whole so that GPL57 is never mistaken for GPL570.
+    gplseen <- read_delim(gplloc,delim = "\t",col_names = TRUE)
+    IDLOCAL <- which(gplseen$GPL_FILE_NUM == as.integer(gplnum))
+    if(length(IDLOCAL) > 0){
+      idLOCGPL <- gplseen$LOC_ID[IDLOCAL[1]]
+    }
+  }
+  if(is.na(idLOCGPL)){
+    #No information on this particular GPL file: count the cells in the first
+    #1000 rows that start with a non-digit (minus one) and store it for later use
+    idLOCGPL <- genena %>%
+      read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
+      t(.) %>%
+      grep("^\\D",.) %>%
+      length()-1
+    Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
+    colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
+    #The header row is only written when the file is created
+    write.table(Firstval,file = gplloc, sep = "\t",row.names = FALSE,
+                col.names = !fileex, append = fileex)
+  }
+  geneIDNam <- genena %>%
+    read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
+    dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
+} else{
+  geneIDNam <- genena %>%
+    read_delim(delim="\t",comment = "#")%>%
+    dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
+}
+
+##Labeling the gene IDs without names
+geneIDNam <- NAFIXING(geneIDNam)
+
+##remove the whitespace
+geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
+
+##Changing the gene ID to gene name
+ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
+colnames(ALZDAT) <- ALZDAT1[1,]
+
+
+##Adjusting the column names aka the gene names
+colnames(ALZDAT) <- gcnames(ALZDAT)
+
+
+#Full Data
+Fullalzdw <- ALZDAT %>%
+  as.data.frame() %>%
+  cbind(ALZWORDF,.)
+
+
+#Output names are built from the digits of the series matrix file name.
+#basename() replaces the "\"-only split, which kept the whole path (and every
+#digit in it) on systems that separate directories with "/".
+gsenum <- gsub("\\D","",basename(alz))
+nfna <- paste0("GSE",gsenum,"after.txt")
+write.matrix(Fullalzdw,file = nfna,sep = "\t")
+#Perfect for excel viewing
+nfnaex <- paste0("GSE",gsenum,"aftexcel.txt")
+write.table(t(Fullalzdw), file = nfnaex, sep = "\t")
+
+
-- 
2.29.0