Commit 7e98cf9561ea9d37386684ba753ddff576130a1a

Authored by Efrain Gonzalez
1 parent 8f1c6201bb
Exists in master

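Fixed issue with similar variable names: variable lookups now match the whole name (patterns anchored with ^ and $), so a name that is contained in another name is no longer confused with it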

Showing 1 changed file with 36 additions and 17 deletions   Show diff stats
1 #Efrain Gonzalez 1 #Efrain Gonzalez
2 #8/25/2017 2 #8/25/2017
3 #Code for Markov Blanket 3 #Code for Markov Blanket
4 4
5 5
6 #The required libraries 6 #The required libraries
7 library(pryr) 7 library(pryr)
8 library(MASS) 8 library(MASS)
9 library(dplyr) 9 library(dplyr)
10 library(tidyr) 10 library(tidyr)
11 library(readr) 11 library(readr)
12 library(stringr) 12 library(stringr)
13 13
14 14
#Have the user choose an original Dot file that they want to use
DotFile <- file.choose()
#Keep only the node-declaration lines: drop the edge lines ("->"), the
#brace lines and the "Banjo" / "labeljust" header lines
TheDotP1 <- read_delim(DotFile,delim = "\t",col_names = FALSE) %>%
  dplyr::filter(!grepl("->|[{}]",X1)) %>%
  dplyr::filter(!grepl("Banjo",X1)) %>%
  dplyr::filter(!grepl("labeljust",X1))
counterP1 <- 1
sizeDotP1 <- dim(TheDotP1)[1]
#NewDotP1 holds one row per node line:
#  column 1 = variable name, column 2 = node number
#Rows that are skipped below keep the "0" placeholder in both columns
NewDotP1 <- matrix("0",ncol = 2, nrow = sizeDotP1)
#seq_len() so that a file without any node line skips the loop instead of
#iterating over c(1, 0)
for(counterP1 in seq_len(sizeDotP1)){
  #Index the first column as a plain vector rather than handing a 1x1
  #tibble to str_trim()
  coldataP1 <- str_trim(as.character(TheDotP1[[1]][counterP1]))
  if(grepl("Banjo|labeljust|>",coldataP1)==FALSE){
    #First space-separated token is the node number
    PiecesP1 <- strsplit(coldataP1," ")[[1]]
    NumberP1 <- PiecesP1[1]
    #The second token is split on the double quote; the piece that starts
    #with a word character is taken as the variable name
    #NOTE(review): assumes the name sits inside the second token and holds
    #no blank - confirm against the Dot files actually used
    VarNameP1 <- strsplit(PiecesP1[2],"\"")[[1]]
    VarNameP1 <- VarNameP1[grep("^\\w|^\\d",VarNameP1)]
    NewDotP1[counterP1,1] <- VarNameP1
    NewDotP1[counterP1,2] <- NumberP1
  }
  #No check for "->" is needed here: those lines were already removed by
  #the first filter above
}
44 44
45 45
#Read the Dot file again, this time keeping only the edge lines
#(the ones that contain "->")
TheDotP2 <- read_delim(DotFile,delim = "\t",col_names = FALSE) %>%
  dplyr::filter(grepl("->",X1))
counterP2 <- 1
sizeDotP2 <- dim(TheDotP2)[1]
#NewDotP2 holds one row per edge: column 1 = parent, column 2 = child
NewDotP2 <- matrix("0",ncol = 2, nrow = sizeDotP2)
#seq_len() so that a file without edges skips the loop instead of
#iterating over c(1, 0)
for(counterP2 in seq_len(sizeDotP2)){
  #Index the first column as a plain vector rather than handing a 1x1
  #tibble to str_trim()
  coldataP2 <- str_trim(as.character(TheDotP2[[1]][counterP2]))
  #Text before "->" is the parent, text after it (up to ";") is the child
  EdgeP2 <- strsplit(coldataP2,"->")[[1]]
  #Both ends are trimmed so that "0 -> 1;" gives the same result as "0->1;"
  ParentNumP2 <- str_trim(EdgeP2[1])
  ChildNumP2 <- str_trim(strsplit(EdgeP2[2],";")[[1]][1])
  NewDotP2[counterP2,1] <- ParentNumP2
  NewDotP2[counterP2,2] <- ChildNumP2
}

colnames(NewDotP2) <- c("Parents","Children")
68 68
#Matching numbers to variable names
#NewDotP1: column 1 = variable name, column 2 = node number
#Only rows that really change something are used for the lookup; this also
#leaves out the "0"/"0" placeholder rows of NewDotP1
#which() drops comparisons that come out as NA
IdNameP1 <- NewDotP1[which(NewDotP1[,1] != NewDotP1[,2]), , drop = FALSE]
NewDotP2_2 <- NewDotP2

#Column one (the parents): every number is looked up once against the
#original numbers, so a name that has just been written can never be
#replaced a second time (which could happen with one gsub() per node when
#a variable name is itself identical to another node number)
RowP1 <- match(NewDotP2_2[,1],IdNameP1[,2])
HitP1 <- !is.na(RowP1)
NewDotP2_2[HitP1,1] <- IdNameP1[RowP1[HitP1],1]
NewDotP2_2

#Column two (the children), same lookup
RowP1 <- match(NewDotP2_2[,2],IdNameP1[,2])
HitP1 <- !is.na(RowP1)
NewDotP2_2[HitP1,2] <- IdNameP1[RowP1[HitP1],1]

#Number of edges
lrgMarkov <- dim(NewDotP2_2)[1]
#Blanky: how many Markov blanket steps lie between two variables?
#
#Starting at VariableStartName the function collects, step by step, the
#parents, the children and the co-parents (the parents of the children) of
#everything that was collected in the step before, and stops as soon as
#VariableEndName shows up.
#
#Arguments
#  MarkovDegree      - the largest number of steps to try (a single number >= 1)
#  VariableStartName - name of the variable where the search starts
#  VariableEndName   - name of the variable that is searched for
#Value
#  The step at which VariableEndName was first collected.
#  If it is never collected the function returns MarkovDegree, so a result
#  equal to MarkovDegree can mean "found in the last step" or "not found".
#
#The edges are read from the matrix NewDotP2_2, which is not an argument:
#it has to exist in the environment the function was defined in
#(column one = parents, column two = children).
#Names are compared as whole strings, so similar names (one name contained
#in another, or names with characters such as ".") are never mixed up.
Blanky <- function(MarkovDegree = 20, VariableStartName = "Alzheimer", VariableEndName = "GRIN2A"){
  if(length(MarkovDegree) != 1 || is.na(MarkovDegree) || MarkovDegree < 1){
    stop("MarkovDegree must be a single number >= 1", call. = FALSE)
  }
  MaxDegree <- as.integer(MarkovDegree)

  ParentCol <- unname(NewDotP2_2[,1])
  ChildCol <- unname(NewDotP2_2[,2])

  ##Finding the Parents of a set of variables (missing names are ignored)
  ParentsOf <- function(vars){
    vars <- vars[!is.na(vars)]
    unique(ParentCol[ChildCol %in% vars])
  }
  ##Finding the Children of a set of variables (missing names are ignored)
  ChildrenOf <- function(vars){
    vars <- vars[!is.na(vars)]
    unique(ChildCol[ParentCol %in% vars])
  }

  ##Only the question "has VariableEndName been collected in this step?"
  ##decides the result, so one set of names per step is enough; there is
  ##no need to keep the parents, children and co-parents of every earlier
  ##group in their own list (that list grows threefold with every step)
  Found <- VariableStartName
  d <- 1
  for(d in seq_len(MaxDegree)){
    PofVar <- ParentsOf(Found)
    CofVar <- ChildrenOf(Found)
    ##Co-Parents are the parents of the children
    COPofVar <- ParentsOf(CofVar)
    if(d == 1){
      ##In the first step the start variable itself is not a Co-Parent
      COPofVar <- setdiff(COPofVar,VariableStartName)
    }
    Found <- unique(c(PofVar,CofVar,COPofVar))

    ##Stop if you have found the VariableEndName value
    if(VariableEndName %in% Found){
      break
    }
  }
  ##The Markov Degree is that found below
  d
}
240 259