From 16b4f55de1bee74c9f9060e6884d4d27c04cfe45 Mon Sep 17 00:00:00 2001 From: Efrain Gonzalez Date: Fri, 26 May 2017 12:18:30 -0400 Subject: [PATCH] wrong extension --- Rclean.txt | 198 ----------------------------------------------------- 1 file changed, 198 deletions(-) delete mode 100644 Rclean.txt diff --git a/Rclean.txt b/Rclean.txt deleted file mode 100644 index 788b9df..0000000 --- a/Rclean.txt +++ /dev/null @@ -1,198 +0,0 @@ -#Libraries required to run the code -library(MASS) -library(pryr) -library(dplyr) -library(tidyr) -library(readr) -library(stringr) - - -#Necessary Functions -#1#Function for handling the changing of row names and column names -chngrownm <- function(mat){ - row <- dim(mat)[1] - col <- dim(mat)[2] - j <- 1 - x <- 1 - p <- 1 - a <- 1 - b <- 1 - g <- 1 - for(j in 1:col){ - if("!Sample_source_name_ch1"==mat[1,j]){ - colnames(mat)[j] <- "Brain_Region" - } - if("!Sample_title" == mat[1,j]){ - colnames(mat)[j] <- "Title" - } - if("!Sample_geo_accession" == mat[1,j]){ - colnames(mat)[j] <- "ID_REF" - } else{ - if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){ - colnames(mat)[j] <- paste0("Sex",x) - x = x + 1 - } - if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){ - colnames(mat)[j] <- paste0("PMI",p) - p = p + 1 - } - if(grepl("age|Age|AGE",mat[2,j])==TRUE){ - colnames(mat)[j] <- paste0("Age",a) - a = a + 1 - } - if(grepl("braak|b&b",mat[2,j])==TRUE){ - colnames(mat)[j] <- paste0("Braak",b) - b = b + 1 - } - if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",mat[2,j])==TRUE){ - colnames(mat)[j] <- paste0("Group",g) - g = g + 1 - } - - } - j = j + 1 - } - mat -} - -#2#Function for reorganizing information within the columns -cinfo <- function(mat){ - col <- dim(mat)[2] - j <-2 - for(j in 2:col){ - if(grepl("Group",colnames(mat)[j]) == TRUE){ - mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j]) - } - if(grepl("Age",colnames(mat)[j])==TRUE){ - mat[,j] <- gsub("\\D","",mat[,j])%>% - as.integer() - } - if(grepl("Sex",colnames(mat)[j])==TRUE){ - mat[,j] <- gsub(".+:\\s","",mat[,j]) - } - if(grepl("PMI",colnames(mat)[j])==TRUE){ - mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>% - as.numeric() - } - if(grepl("Braak",colnames(mat)[j])==TRUE){ - mat[,j]<-gsub(".+:\\s","",mat[,j])%>% - as.roman()%>% - as.integer() - } - j=j+1 - } - mat -} - -#3#Function for changing the gene ID to gene name -cgeneID <- function(GeneName,DATA){ - colGene <- dim(GeneName)[2] - j <- 1 - for(j in 1:colGene){ - chngsreq <- grep(GeneName[1,j],DATA[1,]) - DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq]) - j = j+1 - } - DATA -} - -#4#Function for adjusting the gene names -gcnames <- function(DiData,usecol=1){ - nuruns <- dim(DiData)[2] - i = 1 - nwnam <- rep("0",length.out=nuruns) - for(i in 1:nuruns){ - if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){ - nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][usecol] - } else{ - nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][1] - } - - } - nwnam - -} - - - -#The Rest of this code will be used every time you want to change a data set - -#Getting the series matrix file -print("Choose the series matrix file that you want to Analyze") -alz <- file.choose() - -#Getting the GPL file -print("Choose the GPL file that correlates with the above series matrix file") -genena <- file.choose() - - -#Set working directory based on the directory of the series matrix file -##strsplit(alz,"[\\]") %>% -## .[[1]] %>% -## .[-length(.)] %>% -## paste(.,collapse="/") %>% -## setwd() - - -#Working with the wordy part of the document -alzword <- alz %>% - read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>% - filter(grepl("!Sample",X1))%>% - filter(!grepl("!Sample_contact",X1)) - -##Changing row names and column names: -ALZWORD <- t(alzword) -rownames(ALZWORD)=NULL -colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE) -ALZWORD <- chngrownm(ALZWORD)[-1,] -ALZWORD <- ALZWORD%>% - as.data.frame()%>% - dplyr::select(-starts_with("col")) - -##Reorganizing information within the columns -ALZWORDF <- cinfo(ALZWORD) - - -#Working with Actual Data part of file -alzdat <- alz %>% - read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1) -ALZDAT <- t(alzdat[,-1]) -rownames(ALZDAT)=NULL - - -##Gene ID to Gene Name -geneIDNam <- genena %>% - read_delim(delim="\t",comment = "#")%>% - dplyr::select(.,ID,grep("Symbol|ORF",colnames(.))) - -##Changing the ID to a Name -ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat)) -colnames(ALZDAT) = ALZDAT1[1,] - - -##Adjusting the column names aka the gene names -colnames(ALZDAT) <- gcnames(ALZDAT) - - -#Full Data -Fullalzdw <- ALZDAT %>% - as.data.frame() %>% - cbind(ALZWORDF,.) - -##since the order in which the packages are added matters I moved this package to the top -##library(MASS) -nfna <- strsplit(alz,"[\\]") %>% - .[[1]] %>% - .[length(.)] %>% - gsub("\\D","",.) %>% - c("GSE",.,"after.txt") %>% - paste(collapse = "") -MASS::write.matrix(Fullalzdw,file = nfna,sep = "\t") -#Perfect for excel viewing -nfnaex <- strsplit(alz,"[\\]") %>% - .[[1]] %>% - .[length(.)] %>% - gsub("\\D","",.) %>% - c("GSE",.,"aftexcel.txt") %>% - paste(collapse = "") -write.table(t(Fullalzdw), file = nfnaex, sep = "\t") -- 2.29.0