1
2
3
4
5
6
7
8 from nltk_lite.contrib.classifier import decisionstump as ds, instances as ins, instance, format
9 from nltk_lite.contrib.classifier_tests import *
10 import math
11
19
# Fragment of a test body (its def line is not present in this listing).
# Asserts that outlook_stump.counts has 3 entries and that the count stored
# for every (attribute value, class value) pair is 0.
21 self.assertEqual(3, len(self.outlook_stump.counts))
22 for attr_value in self.outlook_attr.values:
23 for class_value in self.klass:
24 self.assertEqual(0, self.outlook_stump.counts[attr_value][class_value])
25
# Fragment of a test body (its def line is not present in this listing).
# After update_count() is called with the first instance, counts['sunny']['no']
# is expected to be 1 and every other (attribute value, class value) count 0.
27 self.outlook_stump.update_count(self.instances[0])
28 for attr_value in self.outlook_attr.values:
29 for class_value in self.klass:
# Skip the single cell expected to change; it is asserted on its own by the
# counts['sunny']['no'] check.
30 if attr_value == 'sunny' and class_value == 'no': continue
31 self.assertEqual(0, self.outlook_stump.counts[attr_value][class_value])
32 self.assertEqual(1, self.outlook_stump.counts['sunny']['no'])
33
38
42
48
# Fragment of a test body (its def line is not present in this listing).
# Calls self.__update_stump() (not shown in this listing), then checks the
# class returned by outlook_stump.klass() per outlook value:
# 'sunny' -> 'no', 'overcast' -> 'yes', 'rainy' -> 'yes'.
50 self.__update_stump()
# GoldInstance cases: the second constructor argument is 'yes' in all three,
# yet the 'sunny' instance is still expected to yield 'no'.
51 self.assertEqual('no', self.outlook_stump.klass(instance.GoldInstance(['sunny','mild','normal','true'],'yes')))
52 self.assertEqual('yes', self.outlook_stump.klass(instance.GoldInstance(['overcast','mild','normal','true'],'yes')))
53 self.assertEqual('yes', self.outlook_stump.klass(instance.GoldInstance(['rainy','mild','normal','true'],'yes')))
# TestInstance cases: same attribute values, constructed without a second
# argument; the expected results match the GoldInstance cases.
54 self.assertEqual('no', self.outlook_stump.klass(instance.TestInstance(['sunny','mild','normal','true'])))
55 self.assertEqual('yes', self.outlook_stump.klass(instance.TestInstance(['overcast','mild','normal','true'])))
56 self.assertEqual('yes', self.outlook_stump.klass(instance.TestInstance(['rainy','mild','normal','true'])))
57
# Fragment of a test body (its def line is not present in this listing).
# ds.total_counts() is given a dictionary keyed by 'yes'/'no' and is expected
# to return 2 for {yes: 2, no: 0} and 14 for {yes: 9, no: 5}, i.e. the sum of
# the stored counts in both cases.
59 dictionary_of_klass_counts = {}
60 dictionary_of_klass_counts['yes'] = 2
61 dictionary_of_klass_counts['no'] = 0
62 self.assertEqual(2, ds.total_counts(dictionary_of_klass_counts))
63
# The same dictionary is reused with both values overwritten.
64 dictionary_of_klass_counts['yes'] = 9
65 dictionary_of_klass_counts['no'] = 5
66 self.assertEqual(14, ds.total_counts(dictionary_of_klass_counts))
67
68
69
70
71
72
73
74
75
76
77
78
79
80
85
86
87
88
95
# Fragment of a test body (its def line is not present in this listing).
# Calls self.__update_stump() (not shown in this listing), then compares
# outlook_stump.entropy(value) with hand-written base-2 entropy expressions
# for 'sunny', 'overcast' and 'rainy'.
97 self.__update_stump()
98
99
100
101
# Two-term base-2 entropy with proportions 1/4 and 3/4; compared with an
# explicit third argument of 6 to assertAlmostEqual.
102 expected = -(1.0/4 * math.log(1.0/4, 2)) + -(3.0/4 * math.log(3.0/4, 2))
103 self.assertAlmostEqual(expected, self.outlook_stump.entropy('sunny'), 6)
104
# NOTE(review): `expected` is assigned here but not used by the assertion that
# follows, which compares against the literal 0. The expression itself
# evaluates to zero because math.log(2.0/2, 2) is 0.
105 expected = -(2.0/2 * math.log(2.0/2, 2)) + 0
106 self.assertAlmostEqual(0, self.outlook_stump.entropy('overcast'))
107
# Two-term base-2 entropy with proportions 2/3 and 1/3; no third argument is
# passed to assertAlmostEqual here.
108 expected = -(2.0/3 * math.log(2.0/3, 2)) + -(1.0/3 * math.log(1.0/3, 2))
109 self.assertAlmostEqual(expected, self.outlook_stump.entropy('rainy'))
110
118
# Fragment of a test body (its def line is not present in this listing).
# Calls self.__update_stump() (not shown in this listing), then compares
# outlook_stump.gain_ratio() with a hand-computed value:
# (entropy - mean_info) / split_info.
120 self.__update_stump()
121
# Two-term base-2 entropy with proportions 5/9 and 4/9.
122 entropy = -(5.0/9 * math.log(5.0/9, 2)) + -(4.0/9 * math.log(4.0/9, 2))
# Weighted sum of two entropies with weights 4/9 and 3/9. No term carries the
# remaining 2/9 weight - presumably the third outlook value contributes zero
# entropy; TODO confirm.
123 mean_info = 4.0/9 * (-(1.0/4 * math.log(1.0/4, 2)) + -(3.0/4 * math.log(3.0/4, 2))) + 3.0/9 * (-(2.0/3 * math.log(2.0/3, 2)) + -(1.0/3 * math.log(1.0/3, 2)))
124 info_gain = entropy - mean_info
# NOTE(review): three equal terms with proportion 1/3 each, whereas mean_info
# above uses weights 4/9 and 3/9 - confirm this matches what gain_ratio()
# is meant to compute.
125 split_info = -(1.0/3 * math.log(1.0/3, 2)) * 3
126 expected = float(info_gain) / split_info
127
128 self.assertAlmostEqual(expected, self.outlook_stump.gain_ratio(), 6)
129
# Fragment of a test body (its def line is not present in this listing).
# Builds one ds.DecisionStump per entry of self.attributes, passes every
# instance to update_count() on every stump, and checks information_gain()
# for the four stumps both before and after sorting the list.
131 stumps = []
132 for attribute in self.attributes:
133 stumps.append(ds.DecisionStump(attribute, self.klass))
# NOTE(review): the loop variable `instance` reuses the name of the `instance`
# module imported at the top of the file, hiding it in this scope.
134 for instance in self.instances:
135 for stump in stumps:
136 stump.update_count(instance)
137
# Expected gains for stumps[0]..stumps[3], in the order of self.attributes.
138 self.assertAlmostEqual(0.324409, stumps[0].information_gain(), 6)
139 self.assertAlmostEqual(0.102187, stumps[1].information_gain(), 6)
140 self.assertAlmostEqual(0.091091, stumps[2].information_gain(), 6)
141 self.assertAlmostEqual(0.072780, stumps[3].information_gain(), 6)
142
# NOTE(review): getattr(x, 'information_gain') fetches the attribute without
# calling it, while the surrounding assertions call information_gain() as a
# method. The comparator therefore compares the method objects rather than the
# gain values - confirm this is intended.
143 stumps.sort(lambda x, y: cmp(getattr(x, 'information_gain'), getattr(y, 'information_gain')))
144
# The same values are asserted at the same indices as before the sort, so
# these checks pass only if the sort leaves the list order unchanged.
145 self.assertAlmostEqual(0.324409, stumps[0].information_gain(), 6)
146 self.assertAlmostEqual(0.102187, stumps[1].information_gain(), 6)
147 self.assertAlmostEqual(0.091091, stumps[2].information_gain(), 6)
148 self.assertAlmostEqual(0.072780, stumps[3].information_gain(), 6)
149