# score_with_tree.py

import itertools
import math
from collections import Counter


class score(object):
    """Score a candidate graph against discrete data with the BDe metric.

    Variables are identified by their column position in ``data``.  The
    constructor caches, for every variable ``i``:

    * ``s[i]``    -- list of the distinct values observed in column ``i``
    * ``ri[i]``   -- number of distinct values, i.e. ``len(s[i])``
    * ``pi[i]``   -- sorted list of the parents of ``i`` taken from ``graph``
    * ``conf[i]`` -- every combination of the parents' values (present only
      when ``i`` has at least one parent)
    * ``qi[i]``   -- number of parent configurations (1 when ``i`` has no
      parents)
    """

    def __init__(self, graph, data):
        """
        graph -- object whose ``parents[i]`` yields the column positions of
                 the parents of variable ``i``
        data  -- pandas DataFrame with one column per variable and one row
                 per observation, e.g. loaded from a CSV file
        """
        self.graph = graph
        self.data = data
        self.n, self.m = data.shape
        self.s, self.ri = self.state()
        self.qi, self.pi, self.conf = self.get_parents()

    def state(self):
        """
        Return ``(s, ri)``.

        s  -- dict mapping column position ``i`` to the list of distinct
              values of that column, in order of first appearance
        ri -- dict mapping ``i`` to the number of distinct values
        """
        s = {}
        ri = {}
        for i in range(self.m):
            # ``iloc`` is purely positional; the old ``.ix`` indexer is gone
            # from current pandas and was label-based for integer labels.
            s[i] = self.data.iloc[:, i].unique().tolist()
            ri[i] = len(s[i])
        return s, ri

    def get_parents(self):
        """
        Return ``(qi, pi, conf)``.

        qi   -- dict mapping ``i`` to the number of parent configurations
                (1 when ``i`` has no parents)
        pi   -- dict mapping ``i`` to the sorted list of its parents
        conf -- dict mapping ``i`` to the list of parent-value tuples, one
                tuple per configuration, ordered like ``pi[i]``; variables
                without parents have no entry
        """
        qi = {}
        pi = {}
        conf = {}
        for i in range(self.m):
            pi[i] = sorted(self.graph.parents[i])
            if not pi[i]:
                # No parents: a single (empty) configuration.
                qi[i] = 1
                continue
            conf[i] = list(itertools.product(*[self.s[k] for k in pi[i]]))
            qi[i] = len(conf[i])
        return qi, pi, conf

    def get_Nijk(self):
        """
        Count the observations for every (variable, parent config, value).

        Returns a dict ``Nijk`` such that ``Nijk[i][j][k]`` is the number of
        rows in which variable ``i`` takes its ``k``-th value (``s[i][k]``)
        while its parents take their ``j``-th configuration
        (``conf[i][j]``).  Combinations that never occur are counted as 0.
        """
        # Materialise each column once as plain Python values so they compare
        # equal to the entries of ``self.s`` / ``self.conf``.
        columns = [self.data.iloc[:, c].tolist() for c in range(self.m)]

        Nijk = {}
        for i in range(self.m):
            parents = self.pi[i]
            if parents:
                parent_rows = zip(*[columns[a] for a in parents])
                configs = self.conf[i]
            else:
                # Without parents every row falls into the one empty config.
                parent_rows = itertools.repeat((), self.n)
                configs = [()]

            # One pass over the rows; a Counter yields 0 for unseen keys.
            tally = Counter(zip(parent_rows, columns[i]))

            Nijk[i] = {}
            for j, config in enumerate(configs):
                Nijk[i][j] = {
                    k: tally[(config, value)]
                    for k, value in enumerate(self.s[i])
                }
        return Nijk

    def BDe(self, ess=None, prior=None):
        """
        Return the BDe score of the graph as a sum of log-gamma terms.

        For every variable ``i`` and parent configuration ``j`` this adds
        ``lgamma(a_ij) - lgamma(N_ij + a_ij)`` plus, for every value ``k``,
        ``lgamma(N_ijk + a_ijk) - lgamma(a_ijk)``, where
        ``a_ijk = ess / (ri[i] * qi[i])`` and ``a_ij = ess / qi[i]``.

        ess   -- equivalent sample size; defaults to 1.0
        prior -- accepted for interface compatibility, currently unused
        """
        ess = 1.0 if ess is None else float(ess)

        result = 0.0
        Nijk = self.get_Nijk()

        for i in range(self.m):
            if not self.ri[i]:
                # A variable with no observed values only contributes
                # lgamma(a) - lgamma(a) == 0; skip it to avoid dividing by 0.
                continue
            alpha_ijk = ess / (self.ri[i] * self.qi[i])
            alpha_ij = ess / self.qi[i]

            for j in range(self.qi[i]):
                counts = Nijk[i][j]
                for k in range(self.ri[i]):
                    result += (math.lgamma(counts[k] + alpha_ijk)
                               - math.lgamma(alpha_ijk))

                Nij = sum(counts.values())
                result += math.lgamma(alpha_ij) - math.lgamma(Nij + alpha_ij)

        return result