#Efrain Gonzalez
#8/25/2017
#Code for Markov Blanket

#The required libraries
library(pryr)
library(MASS)
library(dplyr)
library(tidyr)
library(readr)
library(stringr)

#Have the user choose an original Dot file that they want to use
DotFile <- file.choose()

# Read the Dot file once; with a tab delimiter and no header every line
# that contains no tab ends up as a single string in column X1.
DotLines <- read_delim(DotFile, delim = "\t", col_names = FALSE)

# ---- Part 1: node declarations ------------------------------------------
# Keep the lines that are neither edges ("->") nor braces, and that do not
# mention "Banjo" or "labeljust".
TheDotP1 <- DotLines %>%
  dplyr::filter(!grepl("->|[{}]", X1)) %>%
  dplyr::filter(!grepl("Banjo", X1)) %>%
  dplyr::filter(!grepl("labeljust", X1))

counterP1 <- 1
sizeDotP1 <- dim(TheDotP1)[1]
# Column 1 holds the variable name, column 2 the node number. Rows whose
# line cannot be parsed keep the "0" placeholder in both columns.
NewDotP1 <- matrix("0", ncol = 2, nrow = sizeDotP1)
# seq_len() so that a file without node lines does not iterate over c(1, 0).
for (counterP1 in seq_len(sizeDotP1)) {
  # Pull the scalar string out of the column instead of handing a 1x1
  # tibble to str_trim().
  coldataP1 <- str_trim(as.character(TheDotP1[[1]][counterP1]))
  if (is.na(coldataP1) || grepl("Banjo|labeljust|>", coldataP1)) {
    next
  }
  # Expected shape: <number> [label="<name>"];
  TokensP1 <- strsplit(coldataP1, " ")[[1]]
  NumberP1 <- TokensP1[1]
  LabelPartsP1 <- strsplit(TokensP1[2], "\"")[[1]]
  VarNameP1 <- LabelPartsP1[grep("^\\w|^\\d", LabelPartsP1)]
  # Skip lines that do not carry both a number and a quoted name; assigning
  # a zero-length value into the matrix would otherwise stop the script.
  if (is.na(NumberP1) || length(VarNameP1) == 0) {
    next
  }
  NewDotP1[counterP1, 1] <- VarNameP1[1]
  NewDotP1[counterP1, 2] <- NumberP1
}

# ---- Part 2: edges --------------------------------------------------------
TheDotP2 <- DotLines %>%
  dplyr::filter(grepl("->", X1))

counterP2 <- 1
sizeDotP2 <- dim(TheDotP2)[1]
NewDotP2 <- matrix("0", ncol = 2, nrow = sizeDotP2)
for (counterP2 in seq_len(sizeDotP2)) {
  coldataP2 <- str_trim(as.character(TheDotP2[[1]][counterP2]))
  # "<parent>-><child>;" : the text before the arrow is the parent, the text
  # between the arrow and the first ";" is the child.
  EndsP2 <- strsplit(coldataP2, "->")[[1]]
  # Trim both ends so that "0 -> 1;" yields "0" and "1"; untrimmed values
  # would never equal a node number in the renaming step below.
  ParentNumP2 <- str_trim(EndsP2[1])
  ChildNumP2 <- str_trim(strsplit(EndsP2[2], ";")[[1]][1])
  NewDotP2[counterP2, 1] <- ParentNumP2
  NewDotP2[counterP2, 2] <- ChildNumP2
}
colnames(NewDotP2) <- c("Parents", "Children")

#Matching numbers to variable names
NewDotP2_2 <- NewDotP2
# Rows where name and number are identical (this includes the untouched
# "0"/"0" placeholders) are no-op mappings; leaving them out keeps a
# placeholder from shadowing the real declaration of node 0.
IsMappingP1 <- NewDotP1[, 1] != NewDotP1[, 2]
NodeIdsP1 <- NewDotP1[IsMappingP1, 2]
NodeNamesP1 <- NewDotP1[IsMappingP1, 1]
# Look every edge endpoint up in the ORIGINAL numeric matrix. Renaming the
# already-renamed matrix row by row could rename a value twice when a
# variable name happens to equal another node's number. match() returns the
# first hit, so the first declaration of a number wins.
for (EdgeColP2 in 1:2) {
  HitP2 <- match(NewDotP2[, EdgeColP2], NodeIdsP1)
  KnownP2 <- !is.na(HitP2)
  NewDotP2_2[KnownP2, EdgeColP2] <- NodeNamesP1[HitP2[KnownP2]]
}
NewDotP2_2

lrgMarkov <- dim(NewDotP2_2)[1]

# Blanky: how many Markov-blanket expansions are needed to get from one
# variable to another.
#
# Degree 1 is the Markov blanket of VariableStartName: its parents, its
# children and the other parents of its children (the start variable itself
# is removed from that co-parent set). Every further degree takes all
# variables reached at the previous degree and collects their parents, their
# children and the parents of those children. The search stops at the first
# degree whose set contains VariableEndName.
#
# Arguments:
#   MarkovDegree      Maximum number of expansions to try; a single number
#                     >= 1 (fractions are rounded down).
#   VariableStartName Name of the variable the search starts from.
#   VariableEndName   Name of the variable that is searched for.
#   Edges             Two-column character matrix of edges, column 1 = parent,
#                     column 2 = child. Defaults to the script-level
#                     NewDotP2_2.
#
# Returns:
#   The degree (integer) at which VariableEndName was first reached. When it
#   is never reached the maximum degree is returned, so a result equal to the
#   maximum degree does not by itself prove that the variable was found.
Blanky <- function(MarkovDegree = 20, VariableStartName = "Alzheimer",
                   VariableEndName = "GRIN2A", Edges = NewDotP2_2) {
  stopifnot(
    is.numeric(MarkovDegree),
    length(MarkovDegree) == 1,
    !is.na(MarkovDegree),
    MarkovDegree >= 1
  )
  MaxDegree <- as.integer(floor(MarkovDegree))

  # Work on plain vectors; edges with a missing endpoint cannot be followed.
  ParentCol <- unname(Edges[, 1])
  ChildCol <- unname(Edges[, 2])
  Usable <- !is.na(ParentCol) & !is.na(ChildCol)
  ParentCol <- ParentCol[Usable]
  ChildCol <- ChildCol[Usable]

  # Names are compared for exact equality. Building a regular expression from
  # a name would mis-match names that contain characters such as "." or "(".
  ParentsOf <- function(Vars) unique(ParentCol[ChildCol %in% Vars])
  ChildrenOf <- function(Vars) unique(ChildCol[ParentCol %in% Vars])

  # Only the union of the variables reached at a degree decides whether the
  # end variable has been found, so one set per degree is sufficient; keeping
  # three sets per previous set would grow as 3^degree without changing the
  # result.
  Frontier <- VariableStartName
  d <- 1L
  for (d in seq_len(MaxDegree)) {
    ParentSet <- ParentsOf(Frontier)
    ChildSet <- ChildrenOf(Frontier)
    CoParentSet <- ParentsOf(ChildSet)
    if (d == 1L) {
      # The start variable is not its own co-parent.
      CoParentSet <- CoParentSet[CoParentSet != VariableStartName]
    }
    Frontier <- unique(c(ParentSet, ChildSet, CoParentSet))

    ##Stop if you have found the VariableEndName value
    if (VariableEndName %in% Frontier) {
      break
    }
    # Nothing left to expand: every later degree would be empty as well, so
    # report the same value the exhausted loop would have produced.
    if (length(Frontier) == 0) {
      d <- MaxDegree
      break
    }
  }
  ##The Markov Degree is that found below
  d
}