Package nltk_lite :: Package contrib :: Package mit :: Package six863 :: Package parse :: Module chart
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.contrib.mit.six863.parse.chart

   1  # Natural Language Toolkit: A Chart Parser 
   2  # 
   3  # Copyright (C) 2001-2007 University of Pennsylvania 
   4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
   5  #         Steven Bird <sb@csse.unimelb.edu.au> 
   6  #         Jean Mark Gawron <gawron@mail.sdsu.edu> 
   7  # URL: <http://nltk.sf.net> 
   8  # For license information, see LICENSE.TXT 
   9  # 
  10  # $Id: chart.py 4157 2007-02-28 09:56:25Z stevenbird $ 
  11   
  12  from __init__ import * 
  13  from tree import Tree 
  14  import cfg 
  15   
  16  """ 
  17  Data classes and parser implementations for \"chart parsers\", which 
  18  use dynamic programming to efficiently parse a text.  A X{chart 
  19  parser} derives parse trees for a text by iteratively adding \"edges\" 
  20  to a \"chart.\"  Each X{edge} represents a hypothesis about the tree 
  21  structure for a subsequence of the text.  The X{chart} is a 
  22  \"blackboard\" for composing and combining these hypotheses. 
  23   
  24  When a chart parser begins parsing a text, it creates a new (empty) 
  25  chart, spanning the text.  It then incrementally adds new edges to the 
  26  chart.  A set of X{chart rules} specifies the conditions under which 
  27  new edges should be added to the chart.  Once the chart reaches a 
  28  stage where none of the chart rules adds any new edges, parsing is 
  29  complete. 
  30   
  31  Charts are encoded with the L{Chart} class, and edges are encoded with 
  32  the L{TreeEdge} and L{LeafEdge} classes.  The chart parser module 
  33  defines three chart parsers: 
  34   
  35    - C{ChartParse} is a simple and flexible chart parser.  Given a 
  36      set of chart rules, it will apply those rules to the chart until 
  37      no more edges are added. 
  38   
  39    - C{SteppingChartParse} is a subclass of C{ChartParse} that can 
  40      be used to step through the parsing process. 
  41   
  42    - C{EarleyChartParse} is an implementation of the Earley chart parsing 
  43      algorithm.  It makes a single left-to-right pass through the 
  44      chart, and applies one of three rules (predictor, scanner, and 
  45      completer) to each edge it encounters. 
  46  """ 
  47   
  48  import re 
  49   
  50  ######################################################################## 
  51  ##  Edges 
  52  ######################################################################## 
  53   
54 -class EdgeI(object):
55 """ 56 A hypothesis about the structure of part of a sentence. 57 Each edge records the fact that a structure is (partially) 58 consistent with the sentence. An edge contains: 59 60 - A X{span}, indicating what part of the sentence is 61 consistent with the hypothesized structure. 62 63 - A X{left-hand side}, specifying what kind of structure is 64 hypothesized. 65 66 - A X{right-hand side}, specifying the contents of the 67 hypothesized structure. 68 69 - A X{dot position}, indicating how much of the hypothesized 70 structure is consistent with the sentence. 71 72 Every edge is either X{complete} or X{incomplete}: 73 74 - An edge is X{complete} if its structure is fully consistent 75 with the sentence. 76 77 - An edge is X{incomplete} if its structure is partially 78 consistent with the sentence. For every incomplete edge, the 79 span specifies a possible prefix for the edge's structure. 80 81 There are two kinds of edge: 82 83 - C{TreeEdges<TreeEdge>} record which trees have been found to 84 be (partially) consistent with the text. 85 86 - C{LeafEdges<leafEdge>} record the tokens occur in the text. 87 88 The C{EdgeI} interface provides a common interface to both types 89 of edge, allowing chart parsers to treat them in a uniform manner. 90 """
91 - def __init__(self):
92 if self.__class__ == EdgeI: 93 raise TypeError('Edge is an abstract interface')
94 95 #//////////////////////////////////////////////////////////// 96 # Span 97 #//////////////////////////////////////////////////////////// 98
99 - def span(self):
100 """ 101 @return: A tuple C{(s,e)}, where C{subtokens[s:e]} is the 102 portion of the sentence that is consistent with this 103 edge's structure. 104 @rtype: C{(int, int)} 105 """ 106 raise AssertionError('EdgeI is an abstract interface')
107
108 - def start(self):
109 """ 110 @return: The start index of this edge's span. 111 @rtype: C{int} 112 """ 113 raise AssertionError('EdgeI is an abstract interface')
114
115 - def end(self):
116 """ 117 @return: The end index of this edge's span. 118 @rtype: C{int} 119 """ 120 raise AssertionError('EdgeI is an abstract interface')
121
122 - def length(self):
123 """ 124 @return: The length of this edge's span. 125 @rtype: C{int} 126 """ 127 raise AssertionError('EdgeI is an abstract interface')
128 129 #//////////////////////////////////////////////////////////// 130 # Left Hand Side 131 #//////////////////////////////////////////////////////////// 132
133 - def lhs(self):
134 """ 135 @return: This edge's left-hand side, which specifies what kind 136 of structure is hypothesized by this edge. 137 @see: L{TreeEdge} and L{LeafEdge} for a description of 138 the left-hand side values for each edge type. 139 """ 140 raise AssertionError('EdgeI is an abstract interface')
141 142 #//////////////////////////////////////////////////////////// 143 # Right Hand Side 144 #//////////////////////////////////////////////////////////// 145
146 - def rhs(self):
147 """ 148 @return: This edge's right-hand side, which specifies 149 the content of the structure hypothesized by this 150 edge. 151 @see: L{TreeEdge} and L{LeafEdge} for a description of 152 the right-hand side values for each edge type. 153 """ 154 raise AssertionError('EdgeI is an abstract interface')
155
156 - def dot(self):
157 """ 158 @return: This edge's dot position, which indicates how much of 159 the hypothesized structure is consistent with the 160 sentence. In particular, C{self.rhs[:dot]} is consistent 161 with C{subtoks[self.start():self.end()]}. 162 @rtype: C{int} 163 """ 164 raise AssertionError('EdgeI is an abstract interface')
165
166 - def next(self):
167 """ 168 @return: The element of this edge's right-hand side that 169 immediately follows its dot. 170 @rtype: C{Nonterminal} or X{terminal} or C{None} 171 """ 172 raise AssertionError('EdgeI is an abstract interface')
173
174 - def is_complete(self):
175 """ 176 @return: True if this edge's structure is fully consistent 177 with the text. 178 @rtype: C{boolean} 179 """ 180 raise AssertionError('EdgeI is an abstract interface')
181
182 - def is_incomplete(self):
183 """ 184 @return: True if this edge's structure is partially consistent 185 with the text. 186 @rtype: C{boolean} 187 """ 188 raise AssertionError('EdgeI is an abstract interface')
189 190 #//////////////////////////////////////////////////////////// 191 # Comparisons 192 #////////////////////////////////////////////////////////////
193 - def __cmp__(self, other):
194 raise AssertionError('EdgeI is an abstract interface')
195
196 - def __hash__(self, other):
197 raise AssertionError('EdgeI is an abstract interface')
198
199 -class TreeEdge(EdgeI):
200 """ 201 An edge that records the fact that a tree is (partially) 202 consistent with the sentence. A tree edge consists of: 203 204 - A X{span}, indicating what part of the sentence is 205 consistent with the hypothesized tree. 206 207 - A X{left-hand side}, specifying the hypothesized tree's node 208 value. 209 210 - A X{right-hand side}, specifying the hypothesized tree's 211 children. Each element of the right-hand side is either a 212 terminal, specifying a token with that terminal as its leaf 213 value; or a nonterminal, specifying a subtree with that 214 nonterminal's symbol as its node value. 215 216 - A X{dot position}, indicating which children are consistent 217 with part of the sentence. In particular, if C{dot} is the 218 dot position, C{rhs} is the right-hand size, C{(start,end)} 219 is the span, and C{sentence} is the list of subtokens in the 220 sentence, then C{subtokens[start:end]} can be spanned by the 221 children specified by C{rhs[:dot]}. 222 223 For more information about edges, see the L{EdgeI} interface. 224 """
225 - def __init__(self, span, lhs, rhs, dot=0):
226 """ 227 Construct a new C{TreeEdge}. 228 229 @type span: C{(int, int)} 230 @param span: A tuple C{(s,e)}, where C{subtokens[s:e]} is the 231 portion of the sentence that is consistent with the new 232 edge's structure. 233 @type lhs: L{Nonterminal} 234 @param lhs: The new edge's left-hand side, specifying the 235 hypothesized tree's node value. 236 @type rhs: C{list} of (L{Nonterminal} and C{string}) 237 @param rhs: The new edge's right-hand side, specifying the 238 hypothesized tree's children. 239 @type dot: C{int} 240 @param dot: The position of the new edge's dot. This position 241 specifies what prefix of the production's right hand side 242 is consistent with the text. In particular, if 243 C{sentence} is the list of subtokens in the sentence, then 244 C{subtokens[span[0]:span[1]]} can be spanned by the 245 children specified by C{rhs[:dot]}. 246 """ 247 self._lhs = lhs 248 self._rhs = tuple(rhs) 249 self._span = span 250 self._dot = dot
251 252 # [staticmethod]
253 - def from_production(production, index):
254 """ 255 @return: A new C{TreeEdge} formed from the given production. 256 The new edge's left-hand side and right-hand side will 257 be taken from C{production}; its span will be C{(index, 258 index)}; and its dot position will be C{0}. 259 @rtype: L{TreeEdge} 260 """ 261 return TreeEdge(span=(index, index), lhs=production.lhs(), 262 rhs=production.rhs(), dot=0)
263 from_production = staticmethod(from_production) 264 265 # Accessors
266 - def lhs(self): return self._lhs
267 - def span(self): return self._span
268 - def start(self): return self._span[0]
269 - def end(self): return self._span[1]
270 - def length(self): return self._span[1] - self._span[0]
271 - def rhs(self): return self._rhs
272 - def dot(self): return self._dot
273 - def is_complete(self): return self._dot == len(self._rhs)
274 - def is_incomplete(self): return self._dot != len(self._rhs)
275 - def next(self):
276 if self._dot >= len(self._rhs): return None 277 else: return self._rhs[self._dot]
278 279 # Comparisons & hashing
280 - def __cmp__(self, other):
281 if self.__class__ != other.__class__: return -1 282 return cmp((self._span, self.lhs(), self.rhs(), self._dot), 283 (other._span, other.lhs(), other.rhs(), other._dot))
284 - def __hash__(self):
285 return hash((self.lhs(), self.rhs(), self._span, self._dot))
286 287 # String representation
288 - def __str__(self):
289 str = '[%s:%s] ' % (self._span[0], self._span[1]) 290 str += '%-2s ->' % (self._lhs.symbol(),) 291 292 for i in range(len(self._rhs)): 293 if i == self._dot: str += ' *' 294 if isinstance(self._rhs[i], cfg.Nonterminal): 295 str += ' %s' % (self._rhs[i].symbol(),) 296 else: 297 str += ' %r' % (self._rhs[i],) 298 if len(self._rhs) == self._dot: str += ' *' 299 return str
300
301 - def __repr__(self):
302 return '[Edge: %s]' % self
303
304 -class LeafEdge(EdgeI):
305 """ 306 An edge that records the fact that a leaf value is consistent with 307 a word in the sentence. A leaf edge consists of: 308 309 - An X{index}, indicating the position of the word. 310 - A X{leaf}, specifying the word's content. 311 312 A leaf edge's left-hand side is its leaf value, and its right hand 313 side is C{()}. Its span is C{[index, index+1]}, and its dot 314 position is C{0}. 315 """
316 - def __init__(self, leaf, index):
317 """ 318 Construct a new C{LeafEdge}. 319 320 @param leaf: The new edge's leaf value, specifying the word 321 that is recorded by this edge. 322 @param index: The new edge's index, specifying the position of 323 the word that is recorded by this edge. 324 """ 325 self._leaf = leaf 326 self._index = index
327 328 # Accessors
329 - def lhs(self): return self._leaf
330 - def span(self): return (self._index, self._index+1)
331 - def start(self): return self._index
332 - def end(self): return self._index+1
333 - def length(self): return 1
334 - def rhs(self): return ()
335 - def dot(self): return 0
336 - def is_complete(self): return True
337 - def is_incomplete(self): return False
338 - def next(self): return None
339 340 # Comparisons & hashing
341 - def __cmp__(self, other):
342 if not isinstance(other, LeafEdge): return -1 343 return cmp((self._index, self._leaf), (other._index, other._leaf))
344 - def __hash__(self):
345 return hash((self._index, self._leaf))
346 347 # String representations
348 - def __str__(self): return '[%s:%s] %r' % (self._index, self._index+1, self._leaf)
349 - def __repr__(self):
350 return '[Edge: %s]' % (self)
351 352 ######################################################################## 353 ## Chart 354 ######################################################################## 355
356 -class Chart(object):
357 """ 358 A blackboard for hypotheses about the syntactic constituents of a 359 sentence. A chart contains a set of edges, and each edge encodes 360 a single hypothesis about the structure of some portion of the 361 sentence. 362 363 The L{select} method can be used to select a specific collection 364 of edges. For example C{chart.select(is_complete=True, start=0)} 365 yields all complete edges whose start indices are 0. To ensure 366 the efficiency of these selection operations, C{Chart} dynamically 367 creates and maintains an index for each set of attributes that 368 have been selected on. 369 370 In order to reconstruct the trees that are represented by an edge, 371 the chart associates each edge with a set of child pointer lists. 372 A X{child pointer list} is a list of the edges that license an 373 edge's right-hand side. 374 375 @ivar _tokens: The sentence that the chart covers. 376 @ivar _num_leaves: The number of tokens. 377 @ivar _edges: A list of the edges in the chart 378 @ivar _edge_to_cpls: A dictionary mapping each edge to a set 379 of child pointer lists that are associated with that edge. 380 @ivar _indexes: A dictionary mapping tuples of edge attributes 381 to indices, where each index maps the corresponding edge 382 attribute values to lists of edges. 383 """
384 - def __init__(self, tokens):
385 """ 386 Construct a new empty chart. 387 388 @type tokens: L{list} 389 @param tokens: The sentence that this chart will be used to parse. 390 """ 391 # Record the sentence token and the sentence length. 392 self._tokens = list(tokens) 393 self._num_leaves = len(self._tokens) 394 395 # A list of edges contained in this chart. 396 self._edges = [] 397 398 # The set of child pointer lists associated with each edge. 399 self._edge_to_cpls = {} 400 401 # Indexes mapping attribute values to lists of edges (used by 402 # select()). 403 self._indexes = {}
404 405 #//////////////////////////////////////////////////////////// 406 # Sentence Access 407 #//////////////////////////////////////////////////////////// 408
409 - def num_leaves(self):
410 """ 411 @return: The number of words in this chart's sentence. 412 @rtype: C{int} 413 """ 414 return self._num_leaves
415
416 - def leaf(self, index):
417 """ 418 @return: The leaf value of the word at the given index. 419 @rtype: C{string} 420 """ 421 return self._tokens[index]
422
423 - def leaves(self):
424 """ 425 @return: A list of the leaf values of each word in the 426 chart's sentence. 427 @rtype: C{list} of C{string} 428 """ 429 return self._tokens
430 431 #//////////////////////////////////////////////////////////// 432 # Edge access 433 #//////////////////////////////////////////////////////////// 434
435 - def edges(self):
436 """ 437 @return: A list of all edges in this chart. New edges 438 that are added to the chart after the call to edges() 439 will I{not} be contained in this list. 440 @rtype: C{list} of L{EdgeI} 441 @see: L{iteredges}, L{select} 442 """ 443 return self._edges[:]
444
445 - def iteredges(self):
446 """ 447 @return: An iterator over the edges in this chart. Any 448 new edges that are added to the chart before the iterator 449 is exahusted will also be generated. 450 @rtype: C{iter} of L{EdgeI} 451 @see: L{edges}, L{select} 452 """ 453 return iter(self._edges)
454 455 # Iterating over the chart yields its edges. 456 __iter__ = iteredges 457
458 - def num_edges(self):
459 """ 460 @return: The number of edges contained in this chart. 461 @rtype: C{int} 462 """ 463 return len(self._edge_to_cpls)
464
465 - def select(self, **restrictions):
466 """ 467 @return: An iterator over the edges in this chart. Any 468 new edges that are added to the chart before the iterator 469 is exahusted will also be generated. C{restrictions} 470 can be used to restrict the set of edges that will be 471 generated. 472 @rtype: C{iter} of L{EdgeI} 473 474 @kwarg span: Only generate edges C{e} where C{e.span()==span} 475 @kwarg start: Only generate edges C{e} where C{e.start()==start} 476 @kwarg end: Only generate edges C{e} where C{e.end()==end} 477 @kwarg length: Only generate edges C{e} where C{e.length()==length} 478 @kwarg lhs: Only generate edges C{e} where C{e.lhs()==lhs} 479 @kwarg rhs: Only generate edges C{e} where C{e.rhs()==rhs} 480 @kwarg next: Only generate edges C{e} where C{e.next()==next} 481 @kwarg dot: Only generate edges C{e} where C{e.dot()==dot} 482 @kwarg is_complete: Only generate edges C{e} where 483 C{e.is_complete()==is_complete} 484 @kwarg is_incomplete: Only generate edges C{e} where 485 C{e.is_incomplete()==is_incomplete} 486 """ 487 # If there are no restrictions, then return all edges. 488 if restrictions=={}: return iter(self._edges) 489 490 # Find the index corresponding to the given restrictions. 491 restr_keys = restrictions.keys() 492 restr_keys.sort() 493 restr_keys = tuple(restr_keys) 494 495 # If it doesn't exist, then create it. 496 if not self._indexes.has_key(restr_keys): 497 self._add_index(restr_keys) 498 499 vals = [restrictions[k] for k in restr_keys] 500 return iter(self._indexes[restr_keys].get(tuple(vals), []))
501
502 - def _add_index(self, restr_keys):
503 """ 504 A helper function for L{select}, which creates a new index for 505 a given set of attributes (aka restriction keys). 506 """ 507 # Make sure it's a valid index. 508 for k in restr_keys: 509 if not hasattr(EdgeI, k): 510 raise ValueError, 'Bad restriction: %s' % k 511 512 # Create the index. 513 self._indexes[restr_keys] = {} 514 515 # Add all existing edges to the index. 516 for edge in self._edges: 517 vals = [getattr(edge, k)() for k in restr_keys] 518 index = self._indexes[restr_keys] 519 index.setdefault(tuple(vals),[]).append(edge)
520 521 #//////////////////////////////////////////////////////////// 522 # Edge Insertion 523 #//////////////////////////////////////////////////////////// 524
525 - def insert(self, edge, child_pointer_list):
526 """ 527 Add a new edge to the chart. 528 529 @type edge: L{Edge} 530 @param edge: The new edge 531 @type child_pointer_list: C{tuple} of L{Edge} 532 @param child_pointer_list: A list of the edges that were used to 533 form this edge. This list is used to reconstruct the trees 534 (or partial trees) that are associated with C{edge}. 535 @rtype: C{bool} 536 @return: True if this operation modified the chart. In 537 particular, return true iff the chart did not already 538 contain C{edge}, or if it did not already associate 539 C{child_pointer_list} with C{edge}. 540 """ 541 # Is it a new edge? 542 if not self._edge_to_cpls.has_key(edge): 543 # Add it to the list of edges. 544 self._edges.append(edge) 545 546 # Register with indexes 547 for (restr_keys, index) in self._indexes.items(): 548 vals = [getattr(edge, k)() for k in restr_keys] 549 index = self._indexes[restr_keys] 550 index.setdefault(tuple(vals),[]).append(edge) 551 552 # Get the set of child pointer lists for this edge. 553 cpls = self._edge_to_cpls.setdefault(edge,{}) 554 child_pointer_list = tuple(child_pointer_list) 555 556 if cpls.has_key(child_pointer_list): 557 # We've already got this CPL; return false. 558 return False 559 else: 560 # It's a new CPL; register it, and return true. 561 cpls[child_pointer_list] = True 562 return True
563 564 #//////////////////////////////////////////////////////////// 565 # Tree extraction & child pointer lists 566 #//////////////////////////////////////////////////////////// 567
568 - def parses(self, root, tree_class=Tree):
569 """ 570 @return: A list of the complete tree structures that span 571 the entire chart, and whose root node is C{root}. 572 """ 573 trees = [] 574 for edge in self.select(span=(0,self._num_leaves), lhs=root): 575 trees += self.trees(edge, tree_class=tree_class, complete=True) 576 return trees
577
578 - def trees(self, edge, tree_class=Tree, complete=False):
579 """ 580 @return: A list of the tree structures that are associated 581 with C{edge}. 582 583 If C{edge} is incomplete, then the unexpanded children will be 584 encoded as childless subtrees, whose node value is the 585 corresponding terminal or nonterminal. 586 587 @rtype: C{list} of L{Tree} 588 @note: If two trees share a common subtree, then the same 589 C{Tree} may be used to encode that subtree in 590 both trees. If you need to eliminate this subtree 591 sharing, then create a deep copy of each tree. 592 """ 593 return self._trees(edge, complete, memo={}, tree_class=tree_class)
594
595 - def _trees(self, edge, complete, memo, tree_class):
596 """ 597 A helper function for L{trees}. 598 @param memo: A dictionary used to record the trees that we've 599 generated for each edge, so that when we see an edge more 600 than once, we can reuse the same trees. 601 """ 602 # If we've seen this edge before, then reuse our old answer. 603 if memo.has_key(edge): return memo[edge] 604 605 trees = [] 606 607 # when we're reading trees off the chart, don't use incomplete edges 608 if complete and edge.is_incomplete(): 609 return trees 610 611 # Until we're done computing the trees for edge, set 612 # memo[edge] to be empty. This has the effect of filtering 613 # out any cyclic trees (i.e., trees that contain themselves as 614 # descendants), because if we reach this edge via a cycle, 615 # then it will appear that the edge doesn't generate any 616 # trees. 617 memo[edge] = [] 618 619 # Leaf edges. 620 if isinstance(edge, LeafEdge): 621 leaf = self._tokens[edge.start()] 622 memo[edge] = leaf 623 return [leaf] 624 625 # Each child pointer list can be used to form trees. 626 for cpl in self.child_pointer_lists(edge): 627 # Get the set of child choices for each child pointer. 628 # child_choices[i] is the set of choices for the tree's 629 # ith child. 630 child_choices = [self._trees(cp, complete, memo, tree_class) 631 for cp in cpl] 632 633 # Kludge to ensure child_choices is a doubly-nested list 634 if len(child_choices) > 0 and type(child_choices[0]) == type(""): 635 child_choices = [child_choices] 636 637 # For each combination of children, add a tree. 638 for children in self._choose_children(child_choices): 639 lhs = edge.lhs().symbol() 640 trees.append(tree_class(lhs, children)) 641 642 # If the edge is incomplete, then extend it with "partial trees": 643 if edge.is_incomplete(): 644 unexpanded = [tree_class(elt,[]) 645 for elt in edge.rhs()[edge.dot():]] 646 for tree in trees: 647 tree.extend(unexpanded) 648 649 # Update the memoization dictionary. 650 memo[edge] = trees 651 652 # Return the list of trees. 653 return trees
654
655 - def _choose_children(self, child_choices):
656 """ 657 A helper function for L{_trees} that finds the possible sets 658 of subtrees for a new tree. 659 660 @param child_choices: A list that specifies the options for 661 each child. In particular, C{child_choices[i]} is a list of 662 tokens and subtrees that can be used as the C{i}th child. 663 """ 664 children_lists = [[]] 665 for child_choice in child_choices: 666 children_lists = [child_list+[child] 667 for child in child_choice 668 for child_list in children_lists] 669 return children_lists
670
671 - def child_pointer_lists(self, edge):
672 """ 673 @rtype: C{list} of C{list} of C{Edge} 674 @return: The set of child pointer lists for the given edge. 675 Each child pointer list is a list of edges that have 676 been used to form this edge. 677 """ 678 # Make a copy, in case they modify it. 679 return self._edge_to_cpls.get(edge, {}).keys()
680 681 #//////////////////////////////////////////////////////////// 682 # Display 683 #////////////////////////////////////////////////////////////
684 - def pp_edge(self, edge, width=None):
685 """ 686 @return: A pretty-printed string representation of a given edge 687 in this chart. 688 @rtype: C{string} 689 @param width: The number of characters allotted to each 690 index in the sentence. 691 """ 692 if width is None: width = 50/(self.num_leaves()+1) 693 (start, end) = (edge.start(), edge.end()) 694 695 str = '|' + ('.'+' '*(width-1))*start 696 697 # Zero-width edges are "#" if complete, ">" if incomplete 698 if start == end: 699 if edge.is_complete(): str += '#' 700 else: str += '>' 701 702 # Spanning complete edges are "[===]"; Other edges are 703 # "[---]" if complete, "[--->" if incomplete 704 elif edge.is_complete() and edge.span() == (0,self._num_leaves): 705 str += '['+('='*width)*(end-start-1) + '='*(width-1)+']' 706 elif edge.is_complete(): 707 str += '['+('-'*width)*(end-start-1) + '-'*(width-1)+']' 708 else: 709 str += '['+('-'*width)*(end-start-1) + '-'*(width-1)+'>' 710 711 str += (' '*(width-1)+'.')*(self._num_leaves-end) 712 return str + '| %s ' % edge
713
714 - def pp_leaves(self, width=None):
715 """ 716 @return: A pretty-printed string representation of this 717 chart's leaves. This string can be used as a header 718 for calls to L{pp_edge}. 719 """ 720 if width is None: width = 50/(self.num_leaves()+1) 721 722 if self._tokens is not None and width>1: 723 header = '|.' 724 for tok in self._tokens: 725 header += tok[:width-1].center(width-1)+'.' 726 header += '|' 727 else: 728 header = '' 729 730 return header
731
732 - def pp(self, width=None):
733 """ 734 @return: A pretty-printed string representation of this chart. 735 @rtype: C{string} 736 @param width: The number of characters allotted to each 737 index in the sentence. 738 """ 739 if width is None: width = 50/(self.num_leaves()+1) 740 # sort edges: primary key=length, secondary key=start index. 741 # (and filter out the token edges) 742 edges = [(e.length(), e.start(), e) for e in self] 743 edges.sort() 744 edges = [e for (_,_,e) in edges] 745 746 return (self.pp_leaves(width) + '\n' + 747 '\n'.join(self.pp_edge(edge, width) for edge in edges))
748 749 #//////////////////////////////////////////////////////////// 750 # Display: Dot (AT&T Graphviz) 751 #//////////////////////////////////////////////////////////// 752
753 - def dot_digraph(self):
754 # Header 755 s = 'digraph nltk_chart {\n' 756 #s += ' size="5,5";\n' 757 s += ' rankdir=LR;\n' 758 s += ' node [height=0.1,width=0.1];\n' 759 s += ' node [style=filled, color="lightgray"];\n' 760 761 # Set up the nodes 762 for y in range(self.num_edges(), -1, -1): 763 if y == 0: 764 s += ' node [style=filled, color="black"];\n' 765 for x in range(self.num_leaves()+1): 766 if y == 0 or (x <= self._edges[y-1].start() or 767 x >= self._edges[y-1].end()): 768 s += ' %04d.%04d [label=""];\n' % (x,y) 769 770 # Add a spacer 771 s += ' x [style=invis]; x->0000.0000 [style=invis];\n' 772 773 # Declare ranks. 774 for x in range(self.num_leaves()+1): 775 s += ' {rank=same;' 776 for y in range(self.num_edges()+1): 777 if y == 0 or (x <= self._edges[y-1].start() or 778 x >= self._edges[y-1].end()): 779 s += ' %04d.%04d' % (x,y) 780 s += '}\n' 781 782 # Add the leaves 783 s += ' edge [style=invis, weight=100];\n' 784 s += ' node [shape=plaintext]\n' 785 s += ' 0000.0000' 786 for x in range(self.num_leaves()): 787 s += '->%s->%04d.0000' % (self.leaf(x), x+1) 788 s += ';\n\n' 789 790 # Add the edges 791 s += ' edge [style=solid, weight=1];\n' 792 for y, edge in enumerate(self): 793 for x in range(edge.start()): 794 s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' % 795 (x, y+1, x+1, y+1)) 796 s += (' %04d.%04d -> %04d.%04d [label="%s"];\n' % 797 (edge.start(), y+1, edge.end(), y+1, edge)) 798 for x in range(edge.end(), self.num_leaves()): 799 s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' % 800 (x, y+1, x+1, y+1)) 801 s += '}\n' 802 return s
803 804 ######################################################################## 805 ## Chart Rules 806 ######################################################################## 807
808 -class ChartRuleI(object):
809 """ 810 A rule that specifies what new edges are licensed by any given set 811 of existing edges. Each chart rule expects a fixed number of 812 edges, as indicated by the class variable L{NUM_EDGES}. In 813 particular: 814 815 - A chart rule with C{NUM_EDGES=0} specifies what new edges are 816 licensed, regardless of existing edges. 817 818 - A chart rule with C{NUM_EDGES=1} specifies what new edges are 819 licensed by a single existing edge. 820 821 - A chart rule with C{NUM_EDGES=2} specifies what new edges are 822 licensed by a pair of existing edges. 823 824 @type NUM_EDGES: C{int} 825 @cvar NUM_EDGES: The number of existing edges that this rule uses 826 to license new edges. Typically, this number ranges from zero 827 to two. 828 """
829 - def apply(self, chart, grammar, *edges):
830 """ 831 Add the edges licensed by this rule and the given edges to the 832 chart. 833 834 @type edges: C{list} of L{EdgeI} 835 @param edges: A set of existing edges. The number of edges 836 that should be passed to C{apply} is specified by the 837 L{NUM_EDGES} class variable. 838 @rtype: C{list} of L{EdgeI} 839 @return: A list of the edges that were added. 840 """ 841 raise AssertionError, 'ChartRuleI is an abstract interface'
842
843 - def apply_iter(self, chart, grammar, *edges):
844 """ 845 @return: A generator that will add edges licensed by this rule 846 and the given edges to the chart, one at a time. Each 847 time the generator is resumed, it will either add a new 848 edge and yield that edge; or return. 849 @rtype: C{iter} of L{EdgeI} 850 851 @type edges: C{list} of L{EdgeI} 852 @param edges: A set of existing edges. The number of edges 853 that should be passed to C{apply} is specified by the 854 L{NUM_EDGES} class variable. 855 """ 856 raise AssertionError, 'ChartRuleI is an abstract interface'
857
858 - def apply_everywhere(self, chart, grammar):
859 """ 860 Add all the edges licensed by this rule and the edges in the 861 chart to the chart. 862 863 @rtype: C{list} of L{EdgeI} 864 @return: A list of the edges that were added. 865 """ 866 raise AssertionError, 'ChartRuleI is an abstract interface'
867
868 - def apply_everywhere_iter(self, chart, grammar):
869 """ 870 @return: A generator that will add all edges licensed by 871 this rule, given the edges that are currently in the 872 chart, one at a time. Each time the generator is resumed, 873 it will either add a new edge and yield that edge; or 874 return. 875 @rtype: C{iter} of L{EdgeI} 876 """ 877 raise AssertionError, 'ChartRuleI is an abstract interface'
878
879 -class AbstractChartRule(object):
880 """ 881 An abstract base class for chart rules. C{AbstractChartRule} 882 provides: 883 - A default implementation for C{apply}, based on C{apply_iter}. 884 - A default implementation for C{apply_everywhere_iter}, 885 based on C{apply_iter}. 886 - A default implementation for C{apply_everywhere}, based on 887 C{apply_everywhere_iter}. Currently, this implementation 888 assumes that C{NUM_EDGES}<=3. 889 - A default implementation for C{__str__}, which returns a 890 name basd on the rule's class name. 891 """ 892 893 # Subclasses must define apply_iter.
894 - def apply_iter(self, chart, grammar, *edges):
895 raise AssertionError, 'AbstractChartRule is an abstract class'
896 897 # Default: loop through the given number of edges, and call 898 # self.apply() for each set of edges.
899 - def apply_everywhere_iter(self, chart, grammar):
900 if self.NUM_EDGES == 0: 901 for new_edge in self.apply_iter(chart, grammar): 902 yield new_edge 903 904 elif self.NUM_EDGES == 1: 905 for e1 in chart: 906 for new_edge in self.apply_iter(chart, grammar, e1): 907 yield new_edge 908 909 elif self.NUM_EDGES == 2: 910 for e1 in chart: 911 for e2 in chart: 912 for new_edge in self.apply_iter(chart, grammar, e1, e2): 913 yield new_edge 914 915 elif self.NUM_EDGES == 3: 916 for e1 in chart: 917 for e2 in chart: 918 for e3 in chart: 919 for new_edge in self.apply_iter(chart,grammar,e1,e2,e3): 920 yield new_edge 921 922 else: 923 raise AssertionError, 'NUM_EDGES>3 is not currently supported'
924 925 # Default: delegate to apply_iter.
926 - def apply(self, chart, grammar, *edges):
927 return list(self.apply_iter(chart, grammar, *edges))
928 929 # Default: delegate to apply_everywhere_iter.
930 - def apply_everywhere(self, chart, grammar):
932 933 # Default: return a name based on the class name.
934 - def __str__(self):
935 # Add spaces between InitialCapsWords. 936 return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__)
937 938 #//////////////////////////////////////////////////////////// 939 # Fundamental Rule 940 #////////////////////////////////////////////////////////////
941 -class FundamentalRule(AbstractChartRule):
942 """ 943 A rule that joins two adjacent edges to form a single combined 944 edge. In particular, this rule specifies that any pair of edges: 945 946 - [AS{->}S{alpha}*BS{beta}][i:j] 947 - [BS{->}S{gamma}*][j:k] 948 licenses the edge: 949 - [AS{->}S{alpha}B*S{beta}][i:j] 950 """ 951 NUM_EDGES = 2
952 - def apply_iter(self, chart, grammar, left_edge, right_edge):
953 # Make sure the rule is applicable. 954 if not (left_edge.end() == right_edge.start() and 955 left_edge.next() == right_edge.lhs() and 956 left_edge.is_incomplete() and right_edge.is_complete()): 957 return 958 959 # Construct the new edge. 960 new_edge = TreeEdge(span=(left_edge.start(), right_edge.end()), 961 lhs=left_edge.lhs(), rhs=left_edge.rhs(), 962 dot=left_edge.dot()+1) 963 964 # Add it to the chart, with appropriate child pointers. 965 changed_chart = False 966 for cpl1 in chart.child_pointer_lists(left_edge): 967 if chart.insert(new_edge, cpl1+(right_edge,)): 968 changed_chart = True 969 970 # If we changed the chart, then generate the edge. 971 if changed_chart: yield new_edge
972
973 -class SingleEdgeFundamentalRule(AbstractChartRule):
974 """ 975 A rule that joins a given edge with adjacent edges in the chart, 976 to form combined edges. In particular, this rule specifies that 977 either of the edges: 978 - [AS{->}S{alpha}*BS{beta}][i:j] 979 - [BS{->}S{gamma}*][j:k] 980 licenses the edge: 981 - [AS{->}S{alpha}B*S{beta}][i:j] 982 if the other edge is already in the chart. 983 @note: This is basically L{FundamentalRule}, with one edge is left 984 unspecified. 985 """ 986 NUM_EDGES = 1 987 988 _fundamental_rule = FundamentalRule() 989
990 - def apply_iter(self, chart, grammar, edge1):
991 fr = self._fundamental_rule 992 if edge1.is_incomplete(): 993 # edge1 = left_edge; edge2 = right_edge 994 for edge2 in chart.select(start=edge1.end(), is_complete=True, 995 lhs=edge1.next()): 996 for new_edge in fr.apply_iter(chart, grammar, edge1, edge2): 997 yield new_edge 998 else: 999 # edge2 = left_edge; edge1 = right_edge 1000 for edge2 in chart.select(end=edge1.start(), is_complete=False, 1001 next=edge1.lhs()): 1002 for new_edge in fr.apply_iter(chart, grammar, edge2, edge1): 1003 yield new_edge
1004
1005 - def __str__(self): return 'Fundamental Rule'
1006
1007 -class BottomUpInitRule(AbstractChartRule):
1008 """ 1009 A rule licensing any edges corresponding to terminals in the 1010 text. In particular, this rule licenses the leaf edge: 1011 - [wS{->}*][i:i+1] 1012 for C{w} is a word in the text, where C{i} is C{w}'s index. 1013 """ 1014 NUM_EDGES = 0
1015 - def apply_iter(self, chart, grammar):
1016 for index in range(chart.num_leaves()): 1017 new_edge = LeafEdge(chart.leaf(index), index) 1018 if chart.insert(new_edge, ()): 1019 yield new_edge
1020 1021 #//////////////////////////////////////////////////////////// 1022 # Top-Down Parsing 1023 #//////////////////////////////////////////////////////////// 1024 1025
1026 -class TopDownInitRule(AbstractChartRule):
1027 """ 1028 A rule licensing edges corresponding to the grammar productions for 1029 the grammar's start symbol. In particular, this rule specifies that: 1030 - [SS{->}*S{alpha}][0:i] 1031 is licensed for each grammar production C{SS{->}S{alpha}}, where 1032 C{S} is the grammar's start symbol. 1033 """ 1034 NUM_EDGES = 0
1035 - def apply_iter(self, chart, grammar):
1036 for prod in grammar.productions(lhs=grammar.start()): 1037 new_edge = TreeEdge.from_production(prod, 0) 1038 if chart.insert(new_edge, ()): 1039 yield new_edge
1040
1041 -class TopDownExpandRule(AbstractChartRule):
1042 """ 1043 A rule licensing edges corresponding to the grammar productions 1044 for the nonterminal following an incomplete edge's dot. In 1045 particular, this rule specifies that: 1046 - [AS{->}S{alpha}*BS{beta}][i:j] 1047 licenses the edge: 1048 - [BS{->}*S{gamma}][j:j] 1049 for each grammar production C{BS{->}S{gamma}}. 1050 """ 1051 NUM_EDGES = 1
1052 - def apply_iter(self, chart, grammar, edge):
1053 if edge.is_complete(): return 1054 for prod in grammar.productions(lhs=edge.next()): 1055 new_edge = TreeEdge.from_production(prod, edge.end()) 1056 if chart.insert(new_edge, ()): 1057 yield new_edge
1058
1059 -class TopDownMatchRule(AbstractChartRule):
1060 """ 1061 A rule licensing an edge corresponding to a terminal following an 1062 incomplete edge's dot. In particular, this rule specifies that: 1063 - [AS{->}S{alpha}*w{beta}][i:j] 1064 licenses the leaf edge: 1065 - [wS{->}*][j:j+1] 1066 if the C{j}th word in the text is C{w}. 1067 """ 1068 NUM_EDGES = 1
1069 - def apply_iter(self, chart, grammar, edge):
1070 if edge.is_complete() or edge.end() >= chart.num_leaves(): return 1071 index = edge.end() 1072 leaf = chart.leaf(index) 1073 if edge.next() == leaf: 1074 new_edge = LeafEdge(leaf, index) 1075 if chart.insert(new_edge, ()): 1076 yield new_edge
1077 1078 # Add a cache, to prevent recalculating.
1079 -class CachedTopDownInitRule(TopDownInitRule):
1080 """ 1081 A cached version of L{TopDownInitRule}. After the first time this 1082 rule is applied, it will not generate any more edges. 1083 1084 If C{chart} or C{grammar} are changed, then the cache is flushed. 1085 """
1086 - def __init__(self):
1087 AbstractChartRule.__init__(self) 1088 self._done = (None, None)
1089
1090 - def apply_iter(self, chart, grammar):
1091 # If we've already applied this rule, and the chart & grammar 1092 # have not changed, then just return (no new edges to add). 1093 if self._done[0] is chart and self._done[1] is grammar: return 1094 1095 # Add all the edges indicated by the top down init rule. 1096 for e in TopDownInitRule.apply_iter(self, chart, grammar): 1097 yield e 1098 1099 # Record the fact that we've applied this rule. 1100 self._done = (chart, grammar)
1101
1102 - def __str__(self): return 'Top Down Init Rule'
1103
1104 -class CachedTopDownExpandRule(TopDownExpandRule):
1105 """ 1106 A cached version of L{TopDownExpandRule}. After the first time 1107 this rule is applied to an edge with a given C{end} and C{next}, 1108 it will not generate any more edges for edges with that C{end} and 1109 C{next}. 1110 1111 If C{chart} or C{grammar} are changed, then the cache is flushed. 1112 """
1113 - def __init__(self):
1114 AbstractChartRule.__init__(self) 1115 self._done = {}
1116
1117 - def apply_iter(self, chart, grammar, edge):
1118 # If we've already applied this rule to an edge with the same 1119 # next & end, and the chart & grammar have not changed, then 1120 # just return (no new edges to add). 1121 done = self._done.get((edge.next(), edge.end()), (None,None)) 1122 if done[0] is chart and done[1] is grammar: return 1123 1124 # Add all the edges indicated by the top down expand rule. 1125 for e in TopDownExpandRule.apply_iter(self, chart, grammar, edge): 1126 yield e 1127 1128 # Record the fact that we've applied this rule. 1129 self._done[edge.next(), edge.end()] = (chart, grammar)
1130
1131 - def __str__(self): return 'Top Down Expand Rule'
1132 1133 #//////////////////////////////////////////////////////////// 1134 # Bottom-Up Parsing 1135 #//////////////////////////////////////////////////////////// 1136
1137 -class BottomUpInitRule(AbstractChartRule):
1138 """ 1139 A rule licensing any edges corresponding to terminals in the 1140 text. In particular, this rule licenses the leaf edge: 1141 - [wS{->}*][i:i+1] 1142 for C{w} is a word in the text, where C{i} is C{w}'s index. 1143 """ 1144 NUM_EDGES = 0
1145 - def apply_iter(self, chart, grammar):
1146 for index in range(chart.num_leaves()): 1147 new_edge = LeafEdge(chart.leaf(index), index) 1148 if chart.insert(new_edge, ()): 1149 yield new_edge
1150
1151 -class BottomUpPredictRule(AbstractChartRule):
1152 """ 1153 A rule licensing any edge corresponding to a production whose 1154 right-hand side begins with a complete edge's left-hand side. In 1155 particular, this rule specifies that: 1156 - [AS{->}S{alpha}*] 1157 licenses the edge: 1158 - [BS{->}*AS{beta}] 1159 for each grammar production C{BS{->}AS{beta}} 1160 """ 1161 NUM_EDGES = 1
1162 - def apply_iter(self, chart, grammar, edge):
1163 if edge.is_incomplete(): return 1164 for prod in grammar.productions(rhs=edge.lhs()): 1165 new_edge = TreeEdge.from_production(prod, edge.start()) 1166 if chart.insert(new_edge, ()): 1167 yield new_edge
1168 1169 #//////////////////////////////////////////////////////////// 1170 # Earley Parsing 1171 #//////////////////////////////////////////////////////////// 1172
1173 -class CompleterRule(AbstractChartRule):
1174 """ 1175 A rule that joins a given complete edge with adjacent incomplete 1176 edges in the chart, to form combined edges. In particular, this 1177 rule specifies that: 1178 - [BS{->}S{gamma}*][j:k] 1179 licenses the edge: 1180 - [AS{->}S{alpha}B*S{beta}][i:j] 1181 given that the chart contains: 1182 - [AS{->}S{alpha}*BS{beta}][i:j] 1183 @note: This is basically L{FundamentalRule}, with the left edge 1184 left unspecified. 1185 """ 1186 NUM_EDGES = 1 1187 1188 _fundamental_rule = FundamentalRule() 1189
1190 - def apply_iter(self, chart, grammar, right_edge):
1191 if right_edge.is_incomplete(): return 1192 fr = self._fundamental_rule 1193 for left_edge in chart.select(end=right_edge.start(), 1194 is_complete=False, 1195 next=right_edge.lhs()): 1196 for e in fr.apply_iter(chart, grammar, left_edge, right_edge): 1197 yield e
1198
1199 - def __str__(self): return 'Completer Rule'
1200
1201 -class ScannerRule(AbstractChartRule):
1202 """ 1203 A rule licensing a leaf edge corresponding to a part-of-speech 1204 terminal following an incomplete edge's dot. In particular, this 1205 rule specifies that: 1206 - [AS{->}S{alpha}*PS{beta}][i:j] 1207 licenses the edges: 1208 - [PS{->}w*][j:j+1] 1209 - [wS{->}*][j:j+1] 1210 if the C{j}th word in the text is C{w}; and C{P} is a valid part 1211 of speech for C{w}. 1212 """ 1213 NUM_EDGES = 1
1214 - def __init__(self, word_to_pos_lexicon):
1215 self._word_to_pos = word_to_pos_lexicon
1216
1217 - def apply_iter(self, chart, gramar, edge):
1218 if edge.is_complete() or edge.end()>=chart.num_leaves(): return 1219 index = edge.end() 1220 leaf = chart.leaf(index) 1221 if edge.next() in self._word_to_pos.get(leaf, []): 1222 new_leaf_edge = LeafEdge(leaf, index) 1223 if chart.insert(new_leaf_edge, ()): 1224 yield new_leaf_edge 1225 new_pos_edge = TreeEdge((index,index+1), edge.next(), 1226 [leaf], 1) 1227 if chart.insert(new_pos_edge, (new_leaf_edge,)): 1228 yield new_pos_edge
1229 1230 # This is just another name for TopDownExpandRule:
1231 -class PredictorRule(TopDownExpandRule): pass
1232 1233 ######################################################################## 1234 ## Simple Earley Chart Parser 1235 ######################################################################## 1236
1237 -class EarleyChartParse(AbstractParse):
1238 """ 1239 A chart parser implementing the Earley parsing algorithm: 1240 1241 - For each index I{end} in [0, 1, ..., N]: 1242 - For each I{edge} s.t. I{edge}.end = I{end}: 1243 - If I{edge} is incomplete, and I{edge}.next is not a part 1244 of speech: 1245 - Apply PredictorRule to I{edge} 1246 - If I{edge} is incomplete, and I{edge}.next is a part of 1247 speech: 1248 - Apply ScannerRule to I{edge} 1249 - If I{edge} is complete: 1250 - Apply CompleterRule to I{edge} 1251 - Return any complete parses in the chart 1252 1253 C{EarleyChartParse} uses a X{lexicon} to decide whether a leaf 1254 has a given part of speech. This lexicon is encoded as a 1255 dictionary that maps each word to a list of parts of speech that 1256 word can have. 1257 """
1258 - def __init__(self, grammar, lexicon, trace=0):
1259 """ 1260 Create a new Earley chart parser, that uses C{grammar} to 1261 parse texts. 1262 1263 @type grammar: C{cfg.Grammar} 1264 @param grammar: The grammar used to parse texts. 1265 @type lexicon: C{dict} from C{string} to (C{list} of C{string}) 1266 @param lexicon: A lexicon of words that records the parts of 1267 speech that each word can have. Each key is a word, and 1268 the corresponding value is a list of parts of speech. 1269 @type trace: C{int} 1270 @param trace: The level of tracing that should be used when 1271 parsing a text. C{0} will generate no tracing output; 1272 and higher numbers will produce more verbose tracing 1273 output. 1274 """ 1275 self._grammar = grammar 1276 self._lexicon = lexicon 1277 self._trace = trace 1278 AbstractParse.__init__(self)
1279
1280 - def get_parse_list(self, tokens, tree_class=Tree):
1281 chart = Chart(list(tokens)) 1282 grammar = self._grammar 1283 1284 # Width, for printing trace edges. 1285 w = 50/(chart.num_leaves()+1) 1286 if self._trace > 0: print ' ', chart.pp_leaves(w) 1287 1288 # Initialize the chart with a special "starter" edge. 1289 root = cfg.Nonterminal('[INIT]') 1290 edge = TreeEdge((0,0), root, (grammar.start(),)) 1291 chart.insert(edge, ()) 1292 1293 # Create the 3 rules: 1294 predictor = PredictorRule() 1295 completer = CompleterRule() 1296 scanner = ScannerRule(self._lexicon) 1297 1298 for end in range(chart.num_leaves()+1): 1299 if self._trace > 1: print 'Processing queue %d' % end 1300 for edge in chart.select(end=end): 1301 if edge.is_incomplete(): 1302 for e in predictor.apply(chart, grammar, edge): 1303 if self._trace > 0: 1304 print 'Predictor', chart.pp_edge(e,w) 1305 if edge.is_incomplete(): 1306 for e in scanner.apply(chart, grammar, edge): 1307 if self._trace > 0: 1308 print 'Scanner ', chart.pp_edge(e,w) 1309 if edge.is_complete(): 1310 for e in completer.apply(chart, grammar, edge): 1311 if self._trace > 0: 1312 print 'Completer', chart.pp_edge(e,w) 1313 1314 # Output a list of complete parses. 1315 return chart.parses(grammar.start(), tree_class=tree_class)
1316 1317 ######################################################################## 1318 ## Generic Chart Parser 1319 ######################################################################## 1320 1321 TD_STRATEGY = [CachedTopDownInitRule(), CachedTopDownExpandRule(), 1322 TopDownMatchRule(), SingleEdgeFundamentalRule()] 1323 BU_STRATEGY = [BottomUpInitRule(), BottomUpPredictRule(), 1324 SingleEdgeFundamentalRule()] 1325
1326 -class ChartParse(AbstractParse):
1327 """ 1328 A generic chart parser. A X{strategy}, or list of 1329 L{ChartRules<ChartRuleI>}, is used to decide what edges to add to 1330 the chart. In particular, C{ChartParse} uses the following 1331 algorithm to parse texts: 1332 1333 - Until no new edges are added: 1334 - For each I{rule} in I{strategy}: 1335 - Apply I{rule} to any applicable edges in the chart. 1336 - Return any complete parses in the chart 1337 """
1338 - def __init__(self, grammar, strategy, trace=0):
1339 """ 1340 Create a new chart parser, that uses C{grammar} to parse 1341 texts. 1342 1343 @type grammar: L{cfg.Grammar} 1344 @param grammar: The grammar used to parse texts. 1345 @type strategy: C{list} of L{ChartRuleI} 1346 @param strategy: A list of rules that should be used to decide 1347 what edges to add to the chart. 1348 @type trace: C{int} 1349 @param trace: The level of tracing that should be used when 1350 parsing a text. C{0} will generate no tracing output; 1351 and higher numbers will produce more verbose tracing 1352 output. 1353 """ 1354 self._grammar = grammar 1355 self._strategy = strategy 1356 self._trace = trace 1357 AbstractParse.__init__(self)
1358
1359 - def get_parse_list(self, tokens, tree_class=Tree):
1360 chart = Chart(list(tokens)) 1361 grammar = self._grammar 1362 1363 # Width, for printing trace edges. 1364 w = 50/(chart.num_leaves()+1) 1365 if self._trace > 0: print chart.pp_leaves(w) 1366 1367 edges_added = 1 1368 while edges_added > 0: 1369 edges_added = 0 1370 for rule in self._strategy: 1371 edges_added_by_rule = 0 1372 for e in rule.apply_everywhere(chart, grammar): 1373 if self._trace > 0 and edges_added_by_rule == 0: 1374 print '%s:' % rule 1375 edges_added_by_rule += 1 1376 if self._trace > 1: print chart.pp_edge(e,w) 1377 if self._trace == 1 and edges_added_by_rule > 0: 1378 print ' - Added %d edges' % edges_added_by_rule 1379 edges_added += edges_added_by_rule 1380 1381 # Return a list of complete parses. 1382 return chart.parses(grammar.start(), tree_class=tree_class)
1383 1384 ######################################################################## 1385 ## Stepping Chart Parser 1386 ######################################################################## 1387
1388 -class SteppingChartParse(ChartParse):
1389 """ 1390 A C{ChartParse} that allows you to step through the parsing 1391 process, adding a single edge at a time. It also allows you to 1392 change the parser's strategy or grammar midway through parsing a 1393 text. 1394 1395 The C{initialize} method is used to start parsing a text. C{step} 1396 adds a single edge to the chart. C{set_strategy} changes the 1397 strategy used by the chart parser. C{parses} returns the set of 1398 parses that has been found by the chart parser. 1399 1400 @ivar _restart: Records whether the parser's strategy, grammar, 1401 or chart has been changed. If so, then L{step} must restart 1402 the parsing algorithm. 1403 """
1404 - def __init__(self, grammar, strategy=None, trace=0):
1405 self._chart = None 1406 self._current_chartrule = None 1407 self._restart = False 1408 ChartParse.__init__(self, grammar, strategy, trace)
1409 1410 #//////////////////////////////////////////////////////////// 1411 # Initialization 1412 #//////////////////////////////////////////////////////////// 1413
1414 - def initialize(self, tokens):
1415 "Begin parsing the given tokens." 1416 self._chart = Chart(list(tokens)) 1417 self._restart = True
1418 1419 #//////////////////////////////////////////////////////////// 1420 # Stepping 1421 #//////////////////////////////////////////////////////////// 1422
1423 - def step(self):
1424 """ 1425 @return: A generator that adds edges to the chart, one at a 1426 time. Each time the generator is resumed, it adds a single 1427 edge and yields that edge. If no more edges can be added, 1428 then it yields C{None}. 1429 1430 If the parser's strategy, grammar, or chart is changed, then 1431 the generator will continue adding edges using the new 1432 strategy, grammar, or chart. 1433 1434 Note that this generator never terminates, since the grammar 1435 or strategy might be changed to values that would add new 1436 edges. Instead, it yields C{None} when no more edges can be 1437 added with the current strategy and grammar. 1438 """ 1439 if self._chart is None: 1440 raise ValueError, 'Parser must be initialized first' 1441 while 1: 1442 self._restart = False 1443 w = 50/(self._chart.num_leaves()+1) 1444 1445 for e in self._parse(): 1446 if self._trace > 1: print self._current_chartrule 1447 if self._trace > 0: print self._chart.pp_edge(e,w) 1448 yield e 1449 if self._restart: break 1450 else: 1451 yield None # No more edges.
1452
1453 - def _parse(self):
1454 """ 1455 A generator that implements the actual parsing algorithm. 1456 L{step} iterates through this generator, and restarts it 1457 whenever the parser's strategy, grammar, or chart is modified. 1458 """ 1459 chart = self._chart 1460 grammar = self._grammar 1461 edges_added = 1 1462 while edges_added > 0: 1463 edges_added = 0 1464 for rule in self._strategy: 1465 self._current_chartrule = rule 1466 for e in rule.apply_everywhere_iter(chart, grammar): 1467 edges_added += 1 1468 yield e
1469 1470 #//////////////////////////////////////////////////////////// 1471 # Accessors 1472 #//////////////////////////////////////////////////////////// 1473
1474 - def strategy(self):
1475 "@return: The strategy used by this parser." 1476 return self._strategy
1477
1478 - def grammar(self):
1479 "@return: The grammar used by this parser." 1480 return self._grammar
1481
1482 - def chart(self):
1483 "@return: The chart that is used by this parser." 1484 return self._chart
1485
1486 - def current_chartrule(self):
1487 "@return: The chart rule used to generate the most recent edge." 1488 return self._current_chartrule
1489
1490 - def parses(self, tree_class=Tree):
1491 "@return: The parse trees currently contained in the chart." 1492 return self._chart.parses(self._grammar.start(), tree_class)
1493 1494 #//////////////////////////////////////////////////////////// 1495 # Parser modification 1496 #//////////////////////////////////////////////////////////// 1497
1498 - def set_strategy(self, strategy):
1499 """ 1500 Change the startegy that the parser uses to decide which edges 1501 to add to the chart. 1502 @type strategy: C{list} of L{ChartRuleI} 1503 @param strategy: A list of rules that should be used to decide 1504 what edges to add to the chart. 1505 """ 1506 if strategy == self._strategy: return 1507 self._strategy = strategy[:] # Make a copy. 1508 self._restart = True
1509
1510 - def set_grammar(self, grammar):
1511 "Change the grammar used by the parser." 1512 if grammar is self._grammar: return 1513 self._grammar = grammar 1514 self._restart = True
1515
1516 - def set_chart(self, chart):
1517 "Load a given chart into the chart parser." 1518 if chart is self._chart: return 1519 self._chart = chart 1520 self._restart = True
1521 1522 #//////////////////////////////////////////////////////////// 1523 # Standard parser methods 1524 #//////////////////////////////////////////////////////////// 1525
1526 - def get_parse_list(self, token, tree_class=Tree):
1527 # Initialize ourselves. 1528 self.initialize(token) 1529 1530 # Step until no more edges are generated. 1531 for e in self.step(): 1532 if e is None: break 1533 1534 # Return a list of complete parses. 1535 return self.parses(tree_class=tree_class)
1536 1537 ######################################################################## 1538 ## Demo Code 1539 ######################################################################## 1540
1541 -def demo():
1542 """ 1543 A demonstration of the chart parsers. 1544 """ 1545 import sys, time 1546 1547 # Define some nonterminals 1548 S, VP, NP, PP = cfg.nonterminals('S, VP, NP, PP') 1549 V, N, P, Name, Det = cfg.nonterminals('V, N, P, Name, Det') 1550 1551 # Define some grammatical productions. 1552 grammatical_productions = [ 1553 cfg.Production(S, [NP, VP]), cfg.Production(PP, [P, NP]), 1554 cfg.Production(NP, [Det, N]), cfg.Production(NP, [NP, PP]), 1555 cfg.Production(VP, [VP, PP]), cfg.Production(VP, [V, NP]), 1556 cfg.Production(VP, [V]),] 1557 1558 # Define some lexical productions. 1559 lexical_productions = [ 1560 cfg.Production(NP, ['John']), cfg.Production(NP, ['I']), 1561 cfg.Production(Det, ['the']), cfg.Production(Det, ['my']), 1562 cfg.Production(Det, ['a']), 1563 cfg.Production(N, ['dog']), cfg.Production(N, ['cookie']), 1564 cfg.Production(V, ['ate']), cfg.Production(V, ['saw']), 1565 cfg.Production(P, ['with']), cfg.Production(P, ['under']), 1566 ] 1567 1568 # Convert the grammar productions to an earley-style lexicon. 1569 earley_lexicon = {} 1570 for prod in lexical_productions: 1571 earley_lexicon.setdefault(prod.rhs()[0], []).append(prod.lhs()) 1572 1573 # The grammar for ChartParse and SteppingChartParse: 1574 grammar = cfg.Grammar(S, grammatical_productions+lexical_productions) 1575 1576 # The grammar for EarleyChartParse: 1577 earley_grammar = cfg.Grammar(S, grammatical_productions) 1578 1579 # Tokenize a sample sentence. 1580 sent = 'I saw John with a dog with my cookie' 1581 print "Sentence:\n", sent 1582 from nltk_lite import tokenize 1583 tokens = list(tokenize.whitespace(sent)) 1584 1585 print tokens 1586 1587 # Ask the user which parser to test 1588 print ' 1: Top-down chart parser' 1589 print ' 2: Bottom-up chart parser' 1590 print ' 3: Earley parser' 1591 print ' 4: Stepping chart parser (alternating top-down & bottom-up)' 1592 print ' 5: All parsers' 1593 print '\nWhich parser (1-5)? ', 1594 choice = sys.stdin.readline().strip() 1595 print 1596 if choice not in '12345': 1597 print 'Bad parser number' 1598 return 1599 1600 # Keep track of how long each parser takes. 1601 times = {} 1602 1603 # Run the top-down parser, if requested. 1604 if choice in ('1', '5'): 1605 cp = ChartParse(grammar, TD_STRATEGY, trace=2) 1606 t = time.time() 1607 parses = cp.get_parse_list(tokens) 1608 times['top down'] = time.time()-t 1609 assert len(parses)==5, 'Not all parses found' 1610 for tree in parses: print tree 1611 1612 # Run the bottom-up parser, if requested. 1613 if choice in ('2', '5'): 1614 cp = ChartParse(grammar, BU_STRATEGY, trace=2) 1615 t = time.time() 1616 parses = cp.get_parse_list(tokens) 1617 times['bottom up'] = time.time()-t 1618 assert len(parses)==5, 'Not all parses found' 1619 for tree in parses: print tree 1620 1621 # Run the earley, if requested. 1622 if choice in ('3', '5'): 1623 cp = EarleyChartParse(earley_grammar, earley_lexicon, trace=1) 1624 t = time.time() 1625 parses = cp.get_parse_list(tokens) 1626 times['Earley parser'] = time.time()-t 1627 assert len(parses)==5, 'Not all parses found' 1628 for tree in parses: print tree 1629 1630 # Run the stepping parser, if requested. 1631 if choice in ('4', '5'): 1632 t = time.time() 1633 cp = SteppingChartParse(grammar, trace=1) 1634 cp.initialize(tokens) 1635 for i in range(5): 1636 print '*** SWITCH TO TOP DOWN' 1637 cp.set_strategy(TD_STRATEGY) 1638 for j, e in enumerate(cp.step()): 1639 if j>20 or e is None: break 1640 print '*** SWITCH TO BOTTOM UP' 1641 cp.set_strategy(BU_STRATEGY) 1642 for j, e in enumerate(cp.step()): 1643 if j>20 or e is None: break 1644 times['stepping'] = time.time()-t 1645 assert len(cp.parses())==5, 'Not all parses found' 1646 for parse in cp.parses(): print parse 1647 1648 # Print the times of all parsers: 1649 maxlen = max(len(key) for key in times.keys()) 1650 format = '%' + `maxlen` + 's parser: %6.3fsec' 1651 times_items = times.items() 1652 times_items.sort(lambda a,b:cmp(a[1], b[1])) 1653 for (parser, t) in times_items: 1654 print format % (parser, t)
1655 1656 if __name__ == '__main__': demo() 1657