
Source Code for Module nltk_lite.contrib.mit.six863.semantics.featurechart

# Natural Language Toolkit: Chart Parser for Feature-Based Grammars
#
# Copyright (C) 2001-2007 University of Pennsylvania
# Author: Rob Speer <rspeer@mit.edu>
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT
#
# $Id: featurechart.py 4107 2007-02-01 00:07:42Z stevenbird $

"""
Extension of the chart parsing implementation to handle grammars with
feature structures as nodes.
"""

import yaml
from chart import *
from category import *
import cfg

from featurelite import *

def load_earley(filename, trace=1):
    """
    Load a grammar from a file, and build an Earley feature parser based on
    that grammar.

    You can optionally specify a tracing level, for how much output you
    want to see:

    0: No output.
    1: Show edges from scanner and completer rules (not predictor).
    2 (default): Show all edges as they are added to the chart.
    3: Show all edges, plus the results of successful unifications.
    4: Show all edges, plus the results of all attempted unifications.
    5: Show all edges, plus the results of all attempted unifications,
       including those with cached results.
    """
    grammar = GrammarFile.read_file(filename)
    return grammar.earley_parser(trace)
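# A minimal usage sketch for load_earley, assuming the parser returned by
# GrammarFile.earley_parser supports get_parse_list() (as the
# FeatureEarleyChartParse class defined below does).  The grammar file name
# used here is hypothetical.
def _load_earley_example(filename='toy_feature_grammar.fcfg'):
    from nltk_lite import tokenize
    parser = load_earley(filename, trace=1)               # build the feature parser
    tokens = list(tokenize.whitespace('I saw John with a dog'))
    return parser.get_parse_list(tokens)                  # complete parse trees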
class FeatureTreeEdge(TreeEdge):
    """
    FIXME: out of date documentation

    A modification of L{TreeEdge} to handle nonterminals with features
    (known as L{Categories<Category>}).

    In addition to the span, left-hand side, right-hand side, and dot position
    (described at L{TreeEdge}), a C{FeatureTreeEdge} includes X{vars}, a
    set of L{FeatureBindings} saying which L{FeatureVariable}s are set to which
    values.

    These values are applied when examining the C{lhs} or C{rhs} of a
    C{FeatureTreeEdge}.

    For more information about edges, see the L{EdgeI} interface.
    """
    def __init__(self, span, lhs, rhs, dot=0, vars=None):
        """
        Construct a new C{FeatureTreeEdge}.

        @type span: C{(int, int)}
        @param span: A tuple C{(s,e)}, where C{subtokens[s:e]} is the
            portion of the sentence that is consistent with the new
            edge's structure.
        @type lhs: L{Category}
        @param lhs: The new edge's left-hand side, specifying the
            hypothesized tree's node value.
        @type rhs: C{list} of (L{Category} and C{string})
        @param rhs: The new edge's right-hand side, specifying the
            hypothesized tree's children.
        @type dot: C{int}
        @param dot: The position of the new edge's dot.  This position
            specifies what prefix of the production's right hand side
            is consistent with the text.  In particular, if
            C{sentence} is the list of subtokens in the sentence, then
            C{subtokens[span[0]:span[1]]} can be spanned by the
            children specified by C{rhs[:dot]}.
        @type vars: L{FeatureBindings}
        @param vars: The bindings specifying what values certain variables in
            this edge must have.
        """
        TreeEdge.__init__(self, span, lhs, rhs, dot)
        if vars is None: vars = {}
        self._vars = vars

    @staticmethod
    def from_production(production, index, bindings=None):
        """
        @return: A new C{FeatureTreeEdge} formed from the given production.
            The new edge's left-hand side and right-hand side will
            be taken from C{production}; its span will be C{(index,
            index)}; its dot position will be C{0}; and it may have specified
            variables set.
        @rtype: L{FeatureTreeEdge}
        """
        return FeatureTreeEdge(span=(index, index), lhs=production.lhs(),
                               rhs=production.rhs(), dot=0, vars=bindings)

    # Accessors
    def vars(self):
        """
        @return: the L{VariableBindings} mapping L{FeatureVariable}s to values.
        @rtype: L{VariableBindings}
        """
        return self._vars

    def lhs(self):
        """
        @return: the value of the left-hand side with variables set.
        @rtype: C{Category}
        """
        return substitute_bindings(TreeEdge.lhs(self), self._vars)

    def orig_lhs(self):
        """
        @return: the value of the left-hand side with no variables set.
        @rtype: C{Category}
        """
        return TreeEdge.lhs(self)

    def rhs(self):
        """
        @return: the value of the right-hand side with variables set.
        @rtype: C{Category}
        """
        return tuple(apply(x, self._vars) for x in TreeEdge.rhs(self))

    def orig_rhs(self):
        """
        @return: the value of the right-hand side with no variables set.
        @rtype: C{Category}
        """
        return TreeEdge.rhs(self)

    # String representation
    def __str__(self):
        str = '%s ->' % self.lhs()

        for i in range(len(self._rhs)):
            if i == self._dot: str += ' *'
            str += ' %s' % (self.rhs()[i],)
        if len(self._rhs) == self._dot: str += ' *'
        return '%s %s' % (str, self._vars)
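# Illustrative sketch (an editorial assumption, not taken from the original
# code): building a FeatureTreeEdge from a production, in the spirit of the
# docstrings above.  The sample categories and production are hypothetical;
# GrammarCategory.parse and cfg.Production are used the same way as in demo()
# below.
def _feature_tree_edge_example():
    S = GrammarCategory.parse('S')
    NP = GrammarCategory.parse('NP')
    VP = GrammarCategory.parse('VP')
    prod = cfg.Production(S, (NP, VP))
    # A self-loop edge at position 0: span (0, 0), dot 0, no bindings yet,
    # printed by __str__ as something like "S -> * NP VP {}".
    return FeatureTreeEdge.from_production(prod, 0)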
class FeatureFundamentalRule(FundamentalRule):
    def __init__(self, trace=0):
        FundamentalRule.__init__(self)
        self.trace = trace
        self.unify_memo = {}

    def apply_iter(self, chart, grammar, left_edge, right_edge):
        # Make sure the rule is applicable.
        if not (left_edge.end() == right_edge.start() and
                left_edge.is_incomplete() and right_edge.is_complete() and
                isinstance(left_edge, FeatureTreeEdge) and
                isinstance(right_edge, FeatureTreeEdge)):
            return
        left_bindings = left_edge.vars().copy()
        right_bindings = right_edge.vars().copy()
        try:
            unified = unify(left_edge.next(), right_edge.lhs(), left_bindings,
                            right_bindings, memo=self.unify_memo,
                            trace=self.trace-2)
            if isinstance(unified, Category): unified.freeze()
        except UnificationFailure: return

        # Construct the new edge.
        new_edge = FeatureTreeEdge(span=(left_edge.start(), right_edge.end()),
                                   lhs=left_edge.lhs(), rhs=left_edge.rhs(),
                                   dot=left_edge.dot()+1, vars=left_bindings)

        # Add it to the chart, with appropriate child pointers.
        changed_chart = False
        for cpl1 in chart.child_pointer_lists(left_edge):
            if chart.insert(new_edge, cpl1+(right_edge,)):
                changed_chart = True
            assert chart.child_pointer_lists(new_edge), new_edge

        # If we changed the chart, then generate the edge.
        if changed_chart:
            yield new_edge
class SingleEdgeFeatureFundamentalRule(SingleEdgeFundamentalRule):
    def __init__(self, trace=0):
        SingleEdgeFundamentalRule.__init__(self)
        self._fundamental_rule = FeatureFundamentalRule(trace)

    def apply_iter(self, chart, grammar, edge1):
        fr = self._fundamental_rule
        if edge1.is_incomplete():
            # edge1 = left_edge; edge2 = right_edge
            for edge2 in chart.select(start=edge1.end(), is_complete=True):
                for new_edge in fr.apply_iter(chart, grammar, edge1, edge2):
                    yield new_edge
        else:
            # edge2 = left_edge; edge1 = right_edge
            for edge2 in chart.select(end=edge1.start(), is_complete=False):
                for new_edge in fr.apply_iter(chart, grammar, edge2, edge1):
                    yield new_edge
class FeatureTopDownExpandRule(TopDownExpandRule):
    """
    The C{TopDownExpandRule}, specialised for feature-based grammars.
    """
    def __init__(self, trace=0):
        TopDownExpandRule.__init__(self)
        self.unify_memo = {}
        self.trace = trace

    def apply_iter(self, chart, grammar, edge):
        if edge.is_complete(): return
        for prod in grammar.productions():
            bindings = edge.vars().copy()
            try:
                unified = unify(edge.next(), prod.lhs(), bindings, {},
                                memo=self.unify_memo, trace=self.trace-2)
                if isinstance(unified, Category): unified.freeze()
            except UnificationFailure:
                continue
            new_edge = FeatureTreeEdge.from_production(prod, edge.end())
            if chart.insert(new_edge, ()):
                yield new_edge
class FeatureEarleyChartParse(EarleyChartParse):
    """
    A chart parser implementing the Earley parsing algorithm, allowing
    nonterminals that have features (known as L{Categories<Category>}).

      - For each index I{end} in [0, 1, ..., N]:
        - For each I{edge} s.t. I{edge}.end = I{end}:
          - If I{edge} is incomplete, and I{edge}.next is not a part
            of speech:
            - Apply PredictorRule to I{edge}
          - If I{edge} is incomplete, and I{edge}.next is a part of
            speech:
            - Apply ScannerRule to I{edge}
          - If I{edge} is complete:
            - Apply CompleterRule to I{edge}
      - Return any complete parses in the chart

    C{FeatureEarleyChartParse} uses a X{lexicon} to decide whether a leaf
    has a given part of speech.  This lexicon is encoded as a
    dictionary that maps each word to a list of parts of speech that
    word can have.  Unlike in the L{EarleyChartParse}, this lexicon is
    case-insensitive.
    """
    def __init__(self, grammar, lexicon, trace=0):
        # Build a case-insensitive lexicon.
        #ci_lexicon = dict((k.upper(), v) for k, v in lexicon.iteritems())
        # Call the super constructor.
        EarleyChartParse.__init__(self, grammar, lexicon, trace)

    def get_parse_list(self, tokens):
        chart = Chart(tokens)
        grammar = self._grammar

        # Width, for printing trace edges.
        #w = 40/(chart.num_leaves()+1)
        w = 2
        if self._trace > 0: print ' '*9, chart.pp_leaves(w)

        # Initialize the chart with a special "starter" edge.
        root = GrammarCategory(pos='[INIT]')
        edge = FeatureTreeEdge((0, 0), root, (grammar.start(),), 0, {})
        chart.insert(edge, ())

        # Create the 3 rules:
        predictor = FeatureTopDownExpandRule(self._trace)
        completer = SingleEdgeFeatureFundamentalRule(self._trace)
        #scanner = FeatureScannerRule(self._lexicon)

        for end in range(chart.num_leaves()+1):
            if self._trace > 1: print 'Processing queue %d' % end

            # Scanner rule substitute, i.e. this is being used in place
            # of a proper FeatureScannerRule at the moment.
            if end > 0 and end-1 < chart.num_leaves():
                leaf = chart.leaf(end-1)
                for pos in self._lexicon(leaf):
                    new_leaf_edge = LeafEdge(leaf, end-1)
                    chart.insert(new_leaf_edge, ())
                    new_pos_edge = FeatureTreeEdge((end-1, end), pos, [leaf],
                                                   1, {})
                    chart.insert(new_pos_edge, (new_leaf_edge,))
                    if self._trace > 0:
                        print 'Scanner  ', chart.pp_edge(new_pos_edge, w)

            for edge in chart.select(end=end):
                if edge.is_incomplete():
                    for e in predictor.apply(chart, grammar, edge):
                        if self._trace > 1:
                            print 'Predictor', chart.pp_edge(e, w)
                #if edge.is_incomplete():
                #    for e in scanner.apply(chart, grammar, edge):
                #        if self._trace > 0:
                #            print 'Scanner  ', chart.pp_edge(e, w)
                if edge.is_complete():
                    for e in completer.apply(chart, grammar, edge):
                        if self._trace > 0:
                            print 'Completer', chart.pp_edge(e, w)

        # Output a list of complete parses.
        return chart.parses(root)
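# Illustrative sketch of the lexicon format described in the class docstring
# above: get_parse_list looks leaves up with self._lexicon(leaf), so the
# lexicon is passed in as a callable from a word to the list of categories it
# can have (demo() below builds one the same way).  The helper, words, and
# toy grammar here are hypothetical.
def _feature_earley_example():
    S = GrammarCategory.parse('S')
    NP = GrammarCategory.parse('NP')
    VP = GrammarCategory.parse('VP')
    V = GrammarCategory.parse('V')
    grammar = cfg.Grammar(S, [cfg.Production(S, (NP, VP)),
                              cfg.Production(VP, (V, NP))])
    table = {'JOHN': [NP], 'MARY': [NP], 'SAW': [V]}
    def lexicon(word):
        # Case-insensitive lookup, as in demo().
        return table.get(word.upper(), [])
    return FeatureEarleyChartParse(grammar, lexicon, trace=0)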
def demo():
    import sys, time

    S = GrammarCategory.parse('S')
    VP = GrammarCategory.parse('VP')
    NP = GrammarCategory.parse('NP')
    PP = GrammarCategory.parse('PP')
    V = GrammarCategory.parse('V')
    N = GrammarCategory.parse('N')
    P = GrammarCategory.parse('P')
    Name = GrammarCategory.parse('Name')
    Det = GrammarCategory.parse('Det')
    DetSg = GrammarCategory.parse('Det[-pl]')
    DetPl = GrammarCategory.parse('Det[+pl]')
    NSg = GrammarCategory.parse('N[-pl]')
    NPl = GrammarCategory.parse('N[+pl]')

    # Define some grammatical productions.
    grammatical_productions = [
        cfg.Production(S, (NP, VP)), cfg.Production(PP, (P, NP)),
        cfg.Production(NP, (NP, PP)),
        cfg.Production(VP, (VP, PP)), cfg.Production(VP, (V, NP)),
        cfg.Production(VP, (V,)), cfg.Production(NP, (DetPl, NPl)),
        cfg.Production(NP, (DetSg, NSg))]

    # Define some lexical productions.
    lexical_productions = [
        cfg.Production(NP, ('John',)), cfg.Production(NP, ('I',)),
        cfg.Production(Det, ('the',)), cfg.Production(Det, ('my',)),
        cfg.Production(Det, ('a',)),
        cfg.Production(NSg, ('dog',)), cfg.Production(NSg, ('cookie',)),
        cfg.Production(V, ('ate',)), cfg.Production(V, ('saw',)),
        cfg.Production(P, ('with',)), cfg.Production(P, ('under',)),
        ]

    earley_grammar = cfg.Grammar(S, grammatical_productions)
    earley_lexicon = {}
    for prod in lexical_productions:
        earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())
    def lexicon(word):
        return earley_lexicon.get(word.upper(), [])

    sent = 'I saw John with a dog with my cookie'
    print "Sentence:\n", sent
    from nltk_lite import tokenize
    tokens = list(tokenize.whitespace(sent))
    t = time.time()
    cp = FeatureEarleyChartParse(earley_grammar, lexicon, trace=1)
    trees = cp.get_parse_list(tokens)
    print "Time: %s" % (time.time() - t)
    for tree in trees: print tree
def run_profile():
    import profile
    profile.run('for i in range(1): demo()', '/tmp/profile.out')
    import pstats
    p = pstats.Stats('/tmp/profile.out')
    p.strip_dirs().sort_stats('time', 'cum').print_stats(60)
    p.strip_dirs().sort_stats('cum', 'time').print_stats(60)

if __name__ == '__main__':
    demo()