1
2
3
4
5
6
7
8 from nltk_lite.contrib.classifier import instances as ins, instance, attribute as a, discretisedattribute as da, numrange as nr, format
9 from nltk_lite.contrib.classifier.exceptions import systemerror as system, invaliddataerror as inv
10 from nltk_lite.contrib.classifier_tests import *
11 import math
12
17
23
36
41
49
58
67
# Load the 'person' numerical dataset and its attribute definitions.
person_path = datasetsDir(self) + 'numerical' + SEP + 'person'
instances = format.C45_FORMAT.get_training_instances(person_path)
attrs = format.C45_FORMAT.get_attributes(person_path)

# Ask for the value ranges of the attributes at positions 0, 1, 4, 5 and 6.
selected = [attrs[position] for position in (0, 1, 4, 5, 6)]
ranges = instances.value_ranges(selected)
self.assertEqual(5, len(ranges))

# Expected (lower, upper) bounds for ranges[0] through ranges[4]; the upper
# bounds are fractional, so they are compared approximately.
expected_bounds = [
    (0, 5.000001),
    (19, 42.000001),
    (0, 2.000001),
    (0, 6.000001),
    (0, 120000.000001),
]
for slot, (lower, upper) in enumerate(expected_bounds):
    self.assertEqual(lower, ranges[slot].lower)
    self.assertAlmostEqual(upper, ranges[slot].upper)
84
93
# Load the 'person' numerical dataset and its attribute definitions.
person_path = datasetsDir(self) + 'numerical' + SEP + 'person'
instances = format.C45_FORMAT.get_training_instances(person_path)
attrs = format.C45_FORMAT.get_attributes(person_path)

# Before discretisation the first instance holds numeric values for the
# attributes at positions 4 and 6.
first = instances[0]
self.assertEqual(0.0, first.value(attrs[4]))
self.assertEqual(65000.0, first.value(attrs[6]))

# Build discretised replacements: [0, 2] split in 2 and [0, 120000] split in 5.
dependents_ranges = nr.Range(0, 2, True).split(2)
disc_dependents = da.DiscretisedAttribute('dependents', dependents_ranges, 4)
income_ranges = nr.Range(0, 120000, True).split(5)
disc_annual_income = da.DiscretisedAttribute('annualincome', income_ranges, 6)
instances.discretise([disc_dependents, disc_annual_income])

# After discretisation the same instance reports the discrete labels.
first = instances[0]
self.assertEqual('a', first.value(disc_dependents))
self.assertEqual('c', first.value(disc_annual_income))
106
# Load the 'weather' numerical dataset and its attribute definitions.
weather_path = datasetsDir(self) + 'numerical' + SEP + 'weather'
instances = format.C45_FORMAT.get_training_instances(weather_path)
attrs = format.C45_FORMAT.get_attributes(weather_path)

# Grouping by a single attribute yields one inner list of its values.
expected = [[27.5, 33.1, 32, 18, 12, 10.7, 6, 14.1, 9, 9, 12, 12]]
grouped = instances.values_grouped_by_attribute([attrs[1]])
self.assertEqual(expected, grouped)
112
121
# Load the 'person' numerical dataset and its attribute definitions.
person_path = datasetsDir(self) + 'numerical' + SEP + 'person'
instances = format.C45_FORMAT.get_training_instances(person_path)
attrs = format.C45_FORMAT.get_attributes(person_path)
sort_attr = attrs[1]

# Values and class labels before sorting.
grouped = instances.values_grouped_by_attribute([sort_attr])
self.assertEqual([25.0, 19.0, 21.0, 34.0, 31.0, 42.0], grouped[0])
labels = instances.klass_values()
self.assertEqual(['yes', 'no', 'yes', 'yes', 'yes', 'no'], labels)

instances.sort_by(sort_attr)

# After sorting, the values are ascending and the class labels have been
# reordered along with them.
grouped = instances.values_grouped_by_attribute([sort_attr])
self.assertEqual([19.0, 21.0, 25.0, 31.0, 34.0, 42.0], grouped[0])
labels = instances.klass_values()
self.assertEqual(['no', 'yes', 'yes', 'yes', 'yes', 'no'], labels)
136
# Find naive supervised breakpoints for six class labels paired with six
# ascending attribute values, then convert them to ranges.
brkpts = ins.SupervisedBreakpoints(['no', 'yes', 'yes', 'yes', 'yes', 'no'], [19.0, 21.0, 25.0, 31.0, 34.0, 42.0])
brkpts.find_naive()
ranges = brkpts.as_ranges()
self.assertEqual(3, len(ranges))

# Adjacent ranges share their boundary values (20.0 and 38.0).
self.assertEqual(19.0, ranges[0].lower)
self.assertEqual(20.0, ranges[0].upper)
self.assertEqual(20.0, ranges[1].lower)
self.assertEqual(38.0, ranges[1].upper)
self.assertEqual(38.0, ranges[2].lower)

# The final upper bound is fractional; compare it approximately instead of
# relying on exact float equality, matching how the value_ranges assertions
# in this file check the same kind of bound.
self.assertAlmostEqual(42.000001, ranges[2].upper)
148
# Load the 'person' numerical dataset and its attribute definitions.
person_path = datasetsDir(self) + 'numerical' + SEP + 'person'
instances = format.C45_FORMAT.get_training_instances(person_path)
attrs = format.C45_FORMAT.get_attributes(person_path)
target_attr = attrs[1]

found = instances.supervised_breakpoints(target_attr)
found.find_naive()

# The instances are expected to be ordered by the target attribute at this
# point, with the class labels following that order.
labels = instances.klass_values()
self.assertEqual(['no', 'yes', 'yes', 'yes', 'yes', 'no'], labels)
values = instances.attribute_values(target_attr)
self.assertEqual([19.0, 21.0, 25.0, 31.0, 34.0, 42.0], values)

# Two breakpoints are expected: 0 and 4.
self.assertEqual(2, len(found))
self.assertEqual([0,4], found)
160
# Load the attribute definitions and then the 'person' numerical dataset.
person_path = datasetsDir(self) + 'numerical' + SEP + 'person'
attrs = format.C45_FORMAT.get_attributes(person_path)
instances = format.C45_FORMAT.get_training_instances(person_path)
target_attr = attrs[4]

found = instances.supervised_breakpoints(target_attr)
found.find_naive()

# Class labels and attribute values expected after the breakpoint search.
labels = instances.klass_values()
self.assertEqual(['yes', 'no', 'yes', 'yes', 'yes', 'no'], labels)
values = instances.attribute_values(target_attr)
self.assertEqual([0.0, 0.0, 0.0, 2.0, 2.0, 2.0], values)

# A single breakpoint is expected: 2.
self.assertEqual(1, len(found))
self.assertEqual([2], found)
172
# Six class labels paired with six ascending attribute values.
klass_labels = ['yes', 'no', 'yes', 'yes', 'yes', 'no']
attr_values = [19.0, 21.0, 25.0, 31.0, 34.0, 42.0]
supervised = ins.SupervisedBreakpoints(klass_labels, attr_values)

# Three breakpoints are expected from the class-membership search: 0, 1 and 4.
membership_breaks = supervised.breakpoints_in_class_membership()
self.assertEqual(3, len(membership_breaks))
self.assertEqual([0, 1, 4], membership_breaks)
178
# Six class labels paired with six ascending attribute values.
klass_labels = ['yes', 'no', 'yes', 'yes', 'yes', 'no']
attr_values = [19.0, 21.0, 25.0, 31.0, 34.0, 42.0]
supervised = ins.SupervisedBreakpoints(klass_labels, attr_values)

# Entropy-based search called with an argument of 2; the expected result is
# the breakpoints 4 and 0, in that order, read from the underlying data list.
supervised.find_entropy_based_max_depth(2)
self.assertEqual(2, len(supervised))
self.assertEqual([4,0], supervised.data)
184
# Nine class labels paired with nine ascending attribute values (72 repeats).
klass_labels = ['yes', 'no', 'yes', 'yes', 'yes', 'no', 'no', 'yes', 'yes']
attr_values = [64, 65, 68, 69, 70, 71, 72, 72, 75]
supervised = ins.SupervisedBreakpoints(klass_labels, attr_values)

# The naive search is expected to find four breakpoints.
supervised.find_naive()
self.assertEqual(4, len(supervised))
self.assertEqual([0, 1, 4, 7], supervised)

# Adjusting with an argument of 4 is expected to leave only breakpoint 4.
supervised.adjust_for_min_freq(4)
self.assertEqual(1, len(supervised))
self.assertEqual([4], supervised)
194
# Nine class labels paired with nine ascending attribute values (72 repeats).
klass_labels = ['yes', 'no', 'yes', 'yes', 'yes', 'no', 'no', 'yes', 'yes']
attr_values = [64, 65, 68, 69, 70, 71, 72, 72, 75]
supervised = ins.SupervisedBreakpoints(klass_labels, attr_values)

# find_naive_v1 called with an argument of 3 is expected to yield breakpoint 3.
supervised.find_naive_v1(3)
self.assertEqual(1, len(supervised))
self.assertEqual([3], supervised)
200
# Nine class labels paired with nine ascending attribute values (72 repeats).
klass_labels = ['yes', 'no', 'yes', 'yes', 'yes', 'no', 'no', 'yes', 'yes']
attr_values = [64, 65, 68, 69, 70, 71, 72, 72, 75]
supervised = ins.SupervisedBreakpoints(klass_labels, attr_values)

# find_naive_v2 called with an argument of 3 is expected to yield
# breakpoints 4 and 7.
supervised.find_naive_v2(3)
self.assertEqual(2, len(supervised))
self.assertEqual([4, 7], supervised)
206
216
if __name__ == '__main__':
    # Run this module's InstancesTestCase with the plain-text runner when the
    # file is executed directly.
    # TestLoader.loadTestsFromTestCase replaces unittest.makeSuite, which is
    # deprecated and removed in Python 3.13; it already returns a TestSuite,
    # so no additional TestSuite wrapper is needed.
    suite = unittest.TestLoader().loadTestsFromTestCase(InstancesTestCase)
    unittest.TextTestRunner().run(suite)
220