Package nltk_lite :: Package contrib :: Package classifier :: Module instances
[hide private]
[frames] | [no frames]

Source Code for Module nltk_lite.contrib.classifier.instances

  1  # Natural Language Toolkit - Instances 
  2  #  Understands the creation and validation of instances from input file path 
  3  # 
  4  # Author: Sumukh Ghodke <sumukh dot ghodke at gmail dot com> 
  5  # 
  6  # URL: <http://nltk.sf.net> 
  7  # This software is distributed under GPL, for license information see LICENSE.TXT 
  8   
  9  from nltk_lite.contrib.classifier import instance as ins, item, cfile, confusionmatrix as cm, numrange as r 
 10  from nltk_lite.contrib.classifier.exceptions import systemerror as system, invaliddataerror as inv 
 11  from nltk_lite import probability as prob 
 12  import operator, UserList 
 13   
14 -class Instances(UserList.UserList):
15 - def __init__(self, instances):
16 UserList.UserList.__init__(self, instances)
17
18 - def are_valid(self, klass, attributes):
19 for instance in self.data: 20 if not instance.is_valid(klass, attributes): 21 return False 22 return True
23
24 - def discretise(self, discretised_attributes):
27
28 - def remove_attributes(self, attributes):
29 for instance in self.data: 30 instance.remove_attributes(attributes)
31
32 -class TrainingInstances(Instances):
33 - def __init__(self, instances):
35
36 - def filter(self, attribute, attr_value):
37 new_instances = TrainingInstances([]) 38 for instance in self.data: 39 if(instance.value(attribute) == attr_value): 40 new_instances.append(instance) 41 return new_instances
42
43 - def value_ranges(self, attributes):
44 """ 45 Returns an array of range objects, in which each corresponds to the range of values an 46 attribute in the attributes parameter can take. 47 len(returned range array) is equal to len(attributes) 48 """ 49 ranges = [] 50 for attribute in attributes: 51 if not attribute.is_continuous(): 52 raise inv.InvalidDataError('Cannot discretise non continuous attribute ' + attribute.name) 53 values = self.values_grouped_by_attribute(attributes) 54 for value in values: #each entry in values is the range of values for a particular attribute 55 value.sort() 56 ranges.append(r.Range(value[0], value[-1], True)) 57 return ranges
58
59 - def values_grouped_by_attribute(self, attributes):
60 """ 61 Returns an array where each element is an array of attribute values for a particular attribute 62 len(returned array) is equal to len(attributes) 63 """ 64 values = [] 65 for attribute in attributes: 66 _vals_in_attr = [] 67 for instance in self.data: 68 if attribute.is_continuous(): 69 _vals_in_attr.append(float(instance.value(attribute))) 70 else: 71 _vals_in_attr.append(instance.value(attribute)) 72 values.append(_vals_in_attr) 73 return values
74
75 - def __as_float(self, values):
76 floats = [] 77 for value in values: 78 floats.append(float(value)) 79 return floats
80
81 - def klass_values(self):
82 values = [] 83 for instance in self.data: 84 values.append(instance.klass_value) 85 return values
86
87 - def supervised_breakpoints(self, attribute):
88 self.sort_by(attribute) 89 attr_values = self.attribute_values(attribute) 90 return SupervisedBreakpoints(self.klass_values(), attr_values)
91
92 - def attribute_values(self, attribute):
93 values = [] 94 for instance in self.data: 95 values.append(instance.value(attribute)) 96 return values
97
98 - def sort_by(self, attribute):
99 self.data.sort(lambda x, y: cmp(x.value(attribute), y.value(attribute)))
100
101 -class TestInstances(Instances):
102 - def __init__(self, instances):
104
105 - def print_all(self):
106 for instance in self.data: 107 print instance
108
109 -class GoldInstances(Instances):
110 - def __init__(self, instances):
112
113 - def confusion_matrix(self, klass):
114 for i in self.data: 115 if i.classified_klass == None: 116 raise system.SystemError('Cannot calculate accuracy as one or more instance(s) are not classified') 117 matrix = cm.ConfusionMatrix(klass) 118 for i in self.data: 119 matrix.count(i.klass_value, i.classified_klass) 120 return matrix
121
122 -class SupervisedBreakpoints(UserList.UserList):
123 """ 124 Used to find breakpoints for discretisation 125 """
126 - def __init__(self, klass_values, attr_values):
127 UserList.UserList.__init__(self, []) 128 self.attr_values = attr_values 129 self.klass_values = klass_values
130
131 - def find_naive(self):
134
135 - def find_naive_v1(self, min_size):
136 frequencies = prob.FreqDist() 137 for index in range(len(self.klass_values) - 1): 138 frequencies.inc(self.klass_values[index]) 139 if frequencies.count(frequencies.max()) >= min_size: 140 self.append(index) 141 frequencies = prob.FreqDist()
142
143 - def find_naive_v2(self, min_size):
144 self.find_naive() 145 self.adjust_for_min_freq(min_size)
146
147 - def find_entropy_based_max_depth(self, max_depth):
148 self.max_depth = max_depth 149 self.extend(self.__find_breakpoints(self.klass_values))
150
    def __find_breakpoints(self, klass_values, depth = 0):
        # Recursively splits the class-value sequence at the position of
        # minimum entropy, stopping once self.max_depth levels are reached.
        # Returns breakpoint indices relative to the start of klass_values.
        breakpoints = []
        if len(klass_values) <= 1: return breakpoints
        from nltk_lite.contrib.classifier import min_entropy_breakpoint
        position, entropy = min_entropy_breakpoint(klass_values)
        # zero entropy: the segment is already pure, nothing left to split
        if abs(entropy) == 0: return breakpoints
        breakpoints.append(position)
        first, second = klass_values[:position+1], klass_values[position+1:]
        if depth < self.max_depth:
            breakpoints.extend(self.__find_breakpoints(first, depth + 1))
            # indices returned for the second half are relative to its own
            # start, so shift them by position + 1
            breakpoints.extend([position + 1 + x for x in self.__find_breakpoints(second, depth + 1)])
        return breakpoints
163
165 """ 166 Returns an array of indices where the class membership changes from one value to another 167 the indicies will always lie between 0 and one less than number of instance, both inclusive. 168 """ 169 breakpoints= [] 170 for index in range(len(self.klass_values) - 1): 171 if self.klass_values[index] != self.klass_values[index + 1]: 172 breakpoints.append(index) 173 return breakpoints
174
    def adjust_for_min_freq(self, min_size):
        # Removes breakpoints whose accumulated segment does not yet contain
        # min_size occurrences of its most frequent class; a removed
        # breakpoint's segment is merged into the next one because the
        # frequency tally is only reset when a breakpoint is kept.
        prev = -1
        self.sort()
        to_remove,frequencies = [], prob.FreqDist()
        for breakpoint in self.data:
            # weight the class value by the number of positions covered since
            # the previous breakpoint
            frequencies.inc(self.klass_values[breakpoint], breakpoint - prev)
            if frequencies.count(frequencies.max()) < min_size:
                to_remove.append(breakpoint)
            else:
                frequencies = prob.FreqDist()
            prev = breakpoint
        # NOTE(review): the loop variable shadows the module-level 'item'
        # import; harmless here since the module is not used in this method.
        for item in to_remove:
            self.remove(item)
188
    def adjust_for_equal_values(self):
        # Shifts breakpoints forward past runs of identical attribute values so
        # that no range boundary falls between two equal values; a breakpoint
        # that would collide with the next breakpoint or run off the end of
        # the data is collected and removed instead.
        # NOTE(review): self.attr_values is presumably sorted in step with
        # self.klass_values before this runs (see supervised_breakpoints) —
        # confirm against callers.
        to_be_removed = []
        for index in range(len(self.data)):
            i = index
            while i < len(self.data) - 1 and (self.attr_values[self.data[i]] == self.attr_values[self.data[i] + 1]):
                #The last and second last elements have the same attribute value or is equal to next breakpoint?
                if self.data[i] == len(self.attr_values) - 2 or (index < len(self.data) - 1 and self.data[i] == self.data[index + 1]):
                    to_be_removed.append(self.data[i])
                    break
                self.data[i] += 1
                i += 1
            if index == len(self.data) - 1:#last breakpoint
                # the final breakpoint has no successor to collide with; keep
                # moving it until the run of equal values ends or the end of
                # the attribute values is reached
                breakpoint = self.data[index]
                while breakpoint < len(self.attr_values) - 1 and self.attr_values[breakpoint] == self.attr_values[breakpoint + 1]:
                    self.data[index] += 1
                    if self.data[index] == len(self.attr_values) - 1:
                        to_be_removed.append(self.data[index])
                        break
                    breakpoint = self.data[index]
        for breakpoint in to_be_removed:
            self.data.remove(breakpoint)
210
211 - def as_ranges(self):
212 ranges, lower = [], self.attr_values[0] 213 self.sort() 214 for breakpoint in self.data: 215 mid = (self.attr_values[breakpoint] + self.attr_values[breakpoint + 1]) / 2.0 216 ranges.append(r.Range(lower, mid)) 217 lower = mid 218 ranges.append(r.Range(lower, self.attr_values[-1], True)) 219 return ranges
220