1
2
3
4
5
6
7
8
9 from nltk_lite.chunk import *
10 from nltk_lite.parse import AbstractParse
11
13 """
14 A rule specifying how to modify the chunking in a C{ChunkString},
15 using a transformational regular expression. The
16 C{RegexpChunkRule} class itself can be used to implement any
17 transformational rule based on regular expressions. There are
18 also a number of subclasses, which can be used to implement
19 simpler types of rules, based on matching regular expressions.
20
21 Each C{RegexpChunkRule} has a regular expression and a
22 replacement expression. When a C{RegexpChunkRule} is X{applied}
23 to a C{ChunkString}, it searches the C{ChunkString} for any
24 substring that matches the regular expression, and replaces it
25 using the replacement expression. This search/replace operation
26 has the same semantics as C{re.sub}.
27
28 Each C{RegexpChunkRule} also has a description string, which
29 gives a short (typically less than 75 characters) description of
30 the purpose of the rule.
31
32 The transformation defined by this C{RegexpChunkRule} should
33 only add and remove braces; it should I{not} modify the sequence
34 of angle-bracket delimited tags. Furthermore, this transformation
35 may not result in nested or mismatched bracketing.
36 """
37 - def __init__(self, regexp, repl, descr):
38 """
39 Construct a new RegexpChunkRule.
40
41 @type regexp: C{regexp} or C{string}
42 @param regexp: This C{RegexpChunkRule}'s regular expression.
43 When this rule is applied to a C{ChunkString}, any
44 substring that matches C{regexp} will be replaced using
45 the replacement string C{repl}. Note that this must be a
46 normal regular expression, not a tag pattern.
47 @type repl: C{string}
48 @param repl: This C{RegexpChunkRule}'s replacement
49 expression. When this rule is applied to a
50 C{ChunkString}, any substring that matches C{regexp} will
51 be replaced using C{repl}.
52 @type descr: C{string}
53 @param descr: A short description of the purpose and/or effect
54 of this rule.
55 """
56 if type(regexp).__name__ == 'SRE_Pattern': regexp = regexp.pattern
57 self._repl = repl
58 self._descr = descr
59 if type(regexp) == types.StringType:
60 self._regexp = re.compile(regexp)
61 else:
62 self._regexp = regexp
63
def apply(self, chunkstr):
    """
    Run this rule's search-and-replace over a C{ChunkString}.

    The work is delegated to C{chunkstr.xform}, which receives this
    rule's regular expression and its replacement expression.

    @type chunkstr: C{ChunkString}
    @param chunkstr: The chunkstring that this rule should transform.
    @rtype: C{None}
    @raise ValueError: If this transformation generated an
        invalid chunkstring.
    """
    pattern, replacement = self._regexp, self._repl
    chunkstr.xform(pattern, replacement)
79
81 """
82 @rtype: C{string}
83 @return: a short description of the purpose and/or effect of
84 this rule.
85 """
86 return self._descr
87
89 """
90 @rtype: C{string}
91 @return: A string representation of this rule. This
92 string representation has the form::
93
94 <RegexpChunkRule: '{<IN|VB.*>}'->'<IN>'>
95
96 Note that this representation does not include the
97 description string; that string can be accessed
98 separately with the C{descr} method.
99 """
100 return ('<RegexpChunkRule: '+`self._regexp.pattern`+
101 '->'+`self._repl`+'>')
102
104 """
105 A rule specifying how to add chunks to a C{ChunkString}, using a
106 matching tag pattern. When applied to a C{ChunkString}, it will
107 find any substring that matches this tag pattern and that is not
108 already part of a chunk, and create a new chunk containing that
109 substring.
110 """
def __init__(self, tag_pattern, descr):
    """
    Construct a new C{ChunkRule}.

    @type tag_pattern: C{string}
    @param tag_pattern: This rule's tag pattern.  When
        applied to a C{ChunkString}, this rule will
        chunk any substring that matches this tag pattern and that
        is not already part of a chunk.
    @type descr: C{string}
    @param descr: A short description of the purpose and/or effect
        of this rule.
    @raise ValueError: If C{tag_pattern} is rejected by
        C{tag_pattern2re_pattern}.
    """
    self._pattern = tag_pattern
    # The match is named 'chunk'; the trailing IN_CHINK_PATTERN restricts
    # it to material that is not already inside a chunk.
    regexp = re.compile('(?P<chunk>%s)%s' %
                        (tag_pattern2re_pattern(tag_pattern),
                         ChunkString.IN_CHINK_PATTERN))
    # Raw string: '\g' is a regular-expression group reference, not a
    # string escape.
    RegexpChunkRule.__init__(self, regexp, r'{\g<chunk>}', descr)
130
132 """
133 @rtype: C{string}
134 @return: A string representation of this rule. This
135 string representation has the form::
136
137 <ChunkRule: '<IN|VB.*>'>
138
139 Note that this representation does not include the
140 description string; that string can be accessed
141 separately with the C{descr} method.
142 """
143 return '<ChunkRule: '+`self._pattern`+'>'
144
146 """
147 A rule specifying how to remove chinks from a C{ChunkString},
148 using a matching tag pattern. When applied to a
149 C{ChunkString}, it will find any substring that matches this
150 tag pattern and that is contained in a chunk, and remove it
151 from that chunk, thus creating two new chunks.
152 """
def __init__(self, tag_pattern, descr):
    """
    Construct a new C{ChinkRule}.

    @type tag_pattern: C{string}
    @param tag_pattern: This rule's tag pattern.  When
        applied to a C{ChunkString}, this rule will
        find any substring that matches this tag pattern and that
        is contained in a chunk, and remove it from that chunk,
        thus creating two new chunks.
    @type descr: C{string}
    @param descr: A short description of the purpose and/or effect
        of this rule.
    @raise ValueError: If C{tag_pattern} is rejected by
        C{tag_pattern2re_pattern}.
    """
    self._pattern = tag_pattern
    # The match is named 'chink'; the trailing IN_CHUNK_PATTERN restricts
    # it to material that currently lies inside a chunk.
    regexp = re.compile('(?P<chink>%s)%s' %
                        (tag_pattern2re_pattern(tag_pattern),
                         ChunkString.IN_CHUNK_PATTERN))
    # Raw string: '\g' is a regular-expression group reference, not a
    # string escape.  Closing and reopening the braces around the match
    # takes it out of the chunk.
    RegexpChunkRule.__init__(self, regexp, r'}\g<chink>{', descr)
172
174 """
175 @rtype: C{string}
176 @return: A string representation of this rule. This
177 string representation has the form::
178
179 <ChinkRule: '<IN|VB.*>'>
180
181 Note that this representation does not include the
182 description string; that string can be accessed
183 separately with the C{descr} method.
184 """
185 return '<ChinkRule: '+`self._pattern`+'>'
186
188 """
189 A rule specifying how to remove chunks from a C{ChunkString},
190 using a matching tag pattern. When applied to a
191 C{ChunkString}, it will find any complete chunk that matches this
192 tag pattern, and un-chunk it.
193 """
def __init__(self, tag_pattern, descr):
    """
    Construct a new C{UnChunkRule}.

    @type tag_pattern: C{string}
    @param tag_pattern: This rule's tag pattern.  When
        applied to a C{ChunkString}, this rule will
        find any complete chunk that matches this tag pattern,
        and un-chunk it.
    @type descr: C{string}
    @param descr: A short description of the purpose and/or effect
        of this rule.
    @raise ValueError: If C{tag_pattern} is rejected by
        C{tag_pattern2re_pattern}.
    """
    self._pattern = tag_pattern
    # Raw strings: '\{', '\}' and '\g' are regular-expression syntax, not
    # string escapes.  The braces are matched literally on both sides so
    # that only a complete chunk qualifies.
    regexp = re.compile(r'\{(?P<chunk>%s)\}' %
                        tag_pattern2re_pattern(tag_pattern))
    # Re-emitting the contents without the braces removes the chunk.
    RegexpChunkRule.__init__(self, regexp, r'\g<chunk>', descr)
211
213 """
214 @rtype: C{string}
215 @return: A string representation of this rule. This
216 string representation has the form::
217
218 <UnChunkRule: '<IN|VB.*>'>
219
220 Note that this representation does not include the
221 description string; that string can be accessed
222 separately with the C{descr} method.
223 """
224 return '<UnChunkRule: '+`self._pattern`+'>'
225
227 """
228 A rule specifying how to merge chunks in a C{ChunkString}, using
229 two matching tag patterns: a left pattern, and a right pattern.
230 When applied to a C{ChunkString}, it will find any chunk whose end
231 matches left pattern, and immediately followed by a chunk whose
232 beginning matches right pattern. It will then merge those two
233 chunks into a single chunk.
234 """
def __init__(self, left_tag_pattern, right_tag_pattern, descr):
    """
    Construct a new C{MergeRule}.

    @type right_tag_pattern: C{string}
    @param right_tag_pattern: This rule's right tag
        pattern.  When applied to a C{ChunkString}, this
        rule will find any chunk whose end matches
        C{left_tag_pattern}, and immediately followed by a chunk
        whose beginning matches this pattern.  It will
        then merge those two chunks into a single chunk.
    @type left_tag_pattern: C{string}
    @param left_tag_pattern: This rule's left tag
        pattern.  When applied to a C{ChunkString}, this
        rule will find any chunk whose end matches
        this pattern, and immediately followed by a chunk
        whose beginning matches C{right_tag_pattern}.  It will
        then merge those two chunks into a single chunk.
    @type descr: C{string}
    @param descr: A short description of the purpose and/or effect
        of this rule.
    @raise ValueError: If either tag pattern is rejected by
        C{tag_pattern2re_pattern}.
    """
    self._left_tag_pattern = left_tag_pattern
    self._right_tag_pattern = right_tag_pattern
    # The right pattern is only a lookahead, so the text it matches is
    # left in place; just the '}{' boundary is consumed.
    regexp = re.compile('(?P<left>%s)}{(?=%s)' %
                        (tag_pattern2re_pattern(left_tag_pattern),
                         tag_pattern2re_pattern(right_tag_pattern)))
    # Raw string: '\g' is a regular-expression group reference, not a
    # string escape.  Dropping the '}{' joins the two chunks.
    RegexpChunkRule.__init__(self, regexp, r'\g<left>', descr)
264
266 """
267 @rtype: C{string}
268 @return: A string representation of this rule. This
269 string representation has the form::
270
271 <MergeRule: '<NN|DT|JJ>', '<NN|JJ>'>
272
273 Note that this representation does not include the
274 description string; that string can be accessed
275 separately with the C{descr} method.
276 """
277 return ('<MergeRule: '+`self._left_tag_pattern`+', '+
278 `self._right_tag_pattern`+'>')
279
281 """
282 A rule specifying how to split chunks in a C{ChunkString}, using
283 two matching tag patterns: a left pattern, and a right pattern.
284 When applied to a C{ChunkString}, it will find any chunk that
285 matches the left pattern followed by the right pattern. It will
286 then split the chunk into two new chunks, at the point between the
287 two pattern matches.
288 """
def __init__(self, left_tag_pattern, right_tag_pattern, descr):
    """
    Construct a new C{SplitRule}.

    @type left_tag_pattern: C{string}
    @param left_tag_pattern: The tag pattern that must match
        immediately before the split point.  When this rule is
        applied to a C{ChunkString}, a chunk containing a match of
        this pattern followed by a match of C{right_tag_pattern} is
        divided into two chunks between the two matches.
    @type right_tag_pattern: C{string}
    @param right_tag_pattern: The tag pattern that must match
        immediately after the split point (see
        C{left_tag_pattern}).
    @type descr: C{string}
    @param descr: A short description of the purpose and/or effect
        of this rule.
    """
    self._left_tag_pattern = left_tag_pattern
    self._right_tag_pattern = right_tag_pattern
    left_re = tag_pattern2re_pattern(left_tag_pattern)
    right_re = tag_pattern2re_pattern(right_tag_pattern)
    # The right-hand side is a lookahead only; a chunk boundary is
    # inserted directly after the left-hand match.
    split_re = re.compile('(?P<left>%s)(?=%s)' % (left_re, right_re))
    RegexpChunkRule.__init__(self, split_re, r'\g<left>}{', descr)
317
319 """
320 @rtype: C{string}
321 @return: A string representation of this rule. This
322 string representation has the form::
323
324 <SplitRule: '<NN>', '<DT>'>
325
326 Note that this representation does not include the
327 description string; that string can be accessed
328 separately with the C{descr} method.
329 """
330 return ('<SplitRule: '+`self._left_tag_pattern`+', '+
331 `self._right_tag_pattern`+'>')
332
334 """
335 A rule specifying how to expand chunks in a C{ChunkString} to the left,
336 using two matching tag patterns: a left pattern, and a right pattern.
337 When applied to a C{ChunkString}, it will find any chunk whose beginning
338 matches right pattern, and immediately preceded by a chink whose
339 end matches left pattern. It will then expand the chunk to incorporate
340 the new material on the left.
341 """
def __init__(self, left_tag_pattern, right_tag_pattern, descr):
    """
    Construct a new C{ExpandLeftRule}.

    @type right_tag_pattern: C{string}
    @param right_tag_pattern: This rule's right tag
        pattern.  When applied to a C{ChunkString}, this
        rule will find any chunk whose beginning matches
        this pattern, and immediately preceded by a chink
        whose end matches C{left_tag_pattern}.  It will
        then expand the chunk to incorporate the new material on
        the left.
    @type left_tag_pattern: C{string}
    @param left_tag_pattern: This rule's left tag
        pattern.  When applied to a C{ChunkString}, this
        rule will find any chunk whose beginning matches
        C{right_tag_pattern}, and immediately preceded by a chink
        whose end matches this pattern.  It will
        then expand the chunk to incorporate the new material on
        the left.
    @type descr: C{string}
    @param descr: A short description of the purpose and/or effect
        of this rule.
    @raise ValueError: If either tag pattern is rejected by
        C{tag_pattern2re_pattern}.
    """
    self._left_tag_pattern = left_tag_pattern
    self._right_tag_pattern = right_tag_pattern
    # Raw strings: '\{' and '\g' are regular-expression syntax, not
    # string escapes.
    regexp = re.compile(r'(?P<left>%s)\{(?P<right>%s)' %
                        (tag_pattern2re_pattern(left_tag_pattern),
                         tag_pattern2re_pattern(right_tag_pattern)))
    # Moving the opening brace in front of the left match pulls that
    # material into the chunk.
    RegexpChunkRule.__init__(self, regexp, r'{\g<left>\g<right>', descr)
371
373 """
374 @rtype: C{string}
375 @return: A string representation of this rule. This
376 string representation has the form::
377
378 <ExpandLeftRule: '<NN|DT|JJ>', '<NN|JJ>'>
379
380 Note that this representation does not include the
381 description string; that string can be accessed
382 separately with the C{descr} method.
383 """
384 return ('<ExpandLeftRule: '+`self._left_tag_pattern`+', '+
385 `self._right_tag_pattern`+'>')
386
388 """
389 A rule specifying how to expand chunks in a C{ChunkString} to the right,
390 using two matching tag patterns: a left pattern, and a right pattern.
391 When applied to a C{ChunkString}, it will find any chunk whose end
392 matches left pattern, and immediately followed by a chink whose
393 beginning matches right pattern. It will then expand the chunk to incorporate
394 the new material on the right.
395 """
def __init__(self, left_tag_pattern, right_tag_pattern, descr):
    """
    Construct a new C{ExpandRightRule}.

    @type right_tag_pattern: C{string}
    @param right_tag_pattern: This rule's right tag
        pattern.  When applied to a C{ChunkString}, this
        rule will find any chunk whose end matches
        C{left_tag_pattern}, and immediately followed by a chink
        whose beginning matches this pattern.  It will
        then expand the chunk to incorporate the new material on
        the right.
    @type left_tag_pattern: C{string}
    @param left_tag_pattern: This rule's left tag
        pattern.  When applied to a C{ChunkString}, this
        rule will find any chunk whose end matches
        this pattern, and immediately followed by a chink
        whose beginning matches C{right_tag_pattern}.  It will
        then expand the chunk to incorporate the new material on
        the right.
    @type descr: C{string}
    @param descr: A short description of the purpose and/or effect
        of this rule.
    @raise ValueError: If either tag pattern is rejected by
        C{tag_pattern2re_pattern}.
    """
    self._left_tag_pattern = left_tag_pattern
    self._right_tag_pattern = right_tag_pattern
    # Raw strings: '\}' and '\g' are regular-expression syntax, not
    # string escapes.
    regexp = re.compile(r'(?P<left>%s)\}(?P<right>%s)' %
                        (tag_pattern2re_pattern(left_tag_pattern),
                         tag_pattern2re_pattern(right_tag_pattern)))
    # Moving the closing brace behind the right match pulls that
    # material into the chunk.
    RegexpChunkRule.__init__(self, regexp, r'\g<left>\g<right>}', descr)
425
427 """
428 @rtype: C{string}
429 @return: A string representation of this rule. This
430 string representation has the form::
431
432 <ExpandRightRule: '<NN|DT|JJ>', '<NN|JJ>'>
433
434 Note that this representation does not include the
435 description string; that string can be accessed
436 separately with the C{descr} method.
437 """
438 return ('<ExpandRightRule: '+`self._left_tag_pattern`+', '+
439 `self._right_tag_pattern`+'>')
440
441
# Validates a (pre-processed) tag pattern: a sequence of pieces, each of
# which is either a run of characters containing no braces or angle
# brackets, or such a run enclosed in one pair of angle brackets.  This
# rules out braces as well as nested or mismatched angle brackets.
# Raw strings are used because '\{' and '\}' are regular-expression
# escapes, not string escapes; the compiled pattern is unchanged.
CHUNK_TAG_PATTERN = re.compile(r'^((%s|<%s>)*)$' %
                               (r'[^\{\}<>]+',
                                r'[^\{\}<>]+'))
445
446
447
448
449
451 """
452 Convert a tag pattern to a regular expression pattern. A X{tag
453 pattern} is a modified version of a regular expression, designed
454 for matching sequences of tags. The differences between regular
455 expression patterns and tag patterns are:
456
457 - In tag patterns, C{'<'} and C{'>'} act as parentheses; so
458 C{'<NN>+'} matches one or more repetitions of C{'<NN>'}, not
459 C{'<NN'} followed by one or more repetitions of C{'>'}.
460 - Whitespace in tag patterns is ignored. So
461 C{'<DT> | <NN>'} is equivalent to C{'<DT>|<NN>'}
462 - In tag patterns, C{'.'} is equivalent to C{'[^{}<>]'}; so
463 C{'<NN.*>'} matches any single tag starting with C{'NN'}.
464
465 In particular, C{tag_pattern2re_pattern} performs the following
466 transformations on the given pattern:
467
468 - Replace '.' with '[^<>{}]'
469 - Remove any whitespace
470 - Add extra parens around '<' and '>', to make '<' and '>' act
471 like parentheses. E.g., so that in '<NN>+', the '+' has scope
472 over the entire '<NN>'; and so that in '<NN|IN>', the '|' has
473 scope over 'NN' and 'IN', but not '<' or '>'.
474 - Check to make sure the resulting pattern is valid.
475
476 @type tag_pattern: C{string}
477 @param tag_pattern: The tag pattern to convert to a regular
478 expression pattern.
479 @raise ValueError: If C{tag_pattern} is not a valid tag pattern.
480 In particular, C{tag_pattern} should not include braces; and it
481 should not contain nested or mismatched angle-brackets.
482 @rtype: C{string}
483 @return: A regular expression pattern corresponding to
484 C{tag_pattern}.
485 """
486
487 tag_pattern = re.sub(r'\s', '', tag_pattern)
488 tag_pattern = re.sub(r'<', '(<(', tag_pattern)
489 tag_pattern = re.sub(r'>', ')>)', tag_pattern)
490
491
492 if not CHUNK_TAG_PATTERN.match(tag_pattern):
493 raise ValueError('Bad tag pattern: %s' % tag_pattern)
494
495
496
497
498
499
500
def reverse_str(str):
    # Return a string made of the items of ``str`` in reverse order.
    chars = list(str)
    return ''.join(chars[::-1])
505 tc_rev = reverse_str(CHUNK_TAG_CHAR)
506 reversed = reverse_str(tag_pattern)
507 reversed = re.sub(r'\.(?!\\(\\\\)*($|[^\\]))', tc_rev, reversed)
508 tag_pattern = reverse_str(reversed)
509
510 return tag_pattern
511
512
513
514
515
516
518 """
519 A regular expression based chunk parser. C{RegexpChunk} uses a
520 sequence of X{rules} to find chunks of a single type within a
521 text. The chunking of the text is encoded using a C{ChunkString},
522 and each rule acts by modifying the chunking in the
523 C{ChunkString}. The rules are all implemented using regular
524 expression matching and substitution.
525
526 The C{RegexpChunkRule} class and its subclasses (C{ChunkRule},
527 C{ChinkRule}, C{UnChunkRule}, C{MergeRule}, and C{SplitRule})
528 define the rules that are used by C{RegexpChunk}. Each rule
529 defines an C{apply} method, which modifies the chunking encoded
530 by a given C{ChunkString}.
531
532 @type _rules: C{list} of C{RegexpChunkRule}
533 @ivar _rules: The list of rules that should be applied to a text.
534 @type _trace: C{int}
535 @ivar _trace: The default level of tracing.
536
537 """
def __init__(self, rules, chunk_node='NP', top_node='S', trace=0):
    """
    Construct a new C{RegexpChunk}.

    @type rules: C{list} of C{RegexpChunkRule}
    @param rules: The rules used, in sequence, to produce the
        chunking of a tagged text.
    @type chunk_node: C{string}
    @param chunk_node: The node value given to chunk subtrees;
        typically a short label for the kind of chunk, such as
        C{"NP"} for base noun phrases.
    @type top_node: C{string}
    @param top_node: The node value given to the top node of the
        chunk structure.
    @type trace: C{int}
    @param trace: The default tracing level for parsing.  C{0}
        produces no tracing output, C{1} produces normal tracing
        output, and C{2} or higher produces verbose tracing output.
    """
    self._rules, self._trace = rules, trace
    self._chunk_node, self._top_node = chunk_node, top_node
    # The base-class initialiser runs last, after the attributes above
    # have been set.
    AbstractParse.__init__(self)
564
566 """
567 Apply each of this C{RegexpChunk}'s rules to C{chunkstr}, in
568 turn. Generate trace output between each rule. If C{verbose}
569 is true, then generate verbose output.
570
571 @type chunkstr: C{ChunkString}
572 @param chunkstr: The chunk string to which each rule should be
573 applied.
574 @type verbose: C{boolean}
575 @param verbose: Whether output should be verbose.
576 @rtype: C{None}
577 """
578 print '# Input:'
579 print chunkstr
580 for rule in self._rules:
581 rule.apply(chunkstr)
582 if verbose:
583 print '#', rule.descr()+' ('+`rule`+'):'
584 else:
585 print '#', rule.descr()+':'
586 print chunkstr
587
589 """
590 Apply each of this C{RegexpChunk}'s rules to C{chunkstr}, in
591 turn.
592
593 @param chunkstr: The chunk string to which each rule should be
594 applied.
595 @type chunkstr: C{ChunkString}
596 @rtype: C{None}
597 """
598
599 for rule in self._rules:
600 rule.apply(chunkstr)
601
def parse(self, chunk_struct, trace=None):
    """
    Apply this parser's rules to a tagged sentence and return the
    resulting chunk structure.

    @type chunk_struct: C{Tree}
    @param chunk_struct: the chunk structure to be (further) chunked.
        An argument without a C{node} attribute (for example a plain
        list of tagged tokens) is first wrapped in a C{Tree} whose
        node is this parser's top node.
    @type trace: C{int}
    @param trace: The level of tracing that should be used when
        parsing a text.  C{0} will generate no tracing output;
        C{1} will generate normal tracing output; and C{2} or
        higher will generate verbose tracing output.  This value
        overrides the trace level value that was given to the
        constructor.
    @rtype: C{Tree}
    @return: a chunk structure that encodes the chunks in a given
        tagged sentence.  A chunk is a non-overlapping linguistic
        group, such as a noun phrase.  The set of chunks
        identified in the chunk structure depends on the rules
        used to define this C{RegexpChunk}.  For empty input a
        warning is printed and an empty tree is returned.
    """
    # Local import, kept from the original implementation; it now follows
    # the docstring so that the docstring is actually attached to the
    # method.
    from nltk_lite.parse.tree import Tree

    if len(chunk_struct) == 0:
        print('Warning: parsing empty text')
        return Tree(self._top_node, [])

    # Probe for the ``node`` attribute; input lacking it is wrapped.
    try:
        chunk_struct.node
    except AttributeError:
        chunk_struct = Tree(self._top_node, chunk_struct)

    # Fall back to the trace level given to the constructor.
    if trace is None:
        trace = self._trace

    chunkstr = ChunkString(chunk_struct)

    # Apply the rules, with tracing output if requested (verbose when
    # the trace level exceeds 1).
    if trace:
        verbose = (trace > 1)
        self._trace_apply(chunkstr, verbose)
    else:
        self._notrace_apply(chunkstr)

    # Convert the transformed chunk string back into a chunk structure.
    return chunkstr.to_chunkstruct(self._chunk_node)
644
646 """
647 @return: the sequence of rules used by C{RegexpChunk}.
648 @rtype: C{list} of C{RegexpChunkRule}
649 """
650 return self._rules
651
653 """
654 @return: a concise string representation of this
655 C{RegexpChunk}.
656 @rtype: C{string}
657 """
658 return "<RegexpChunk with %d rules>" % len(self._rules)
659
661 """
662 @return: a verbose string representation of this C{RegexpChunk}.
663 @rtype: C{string}
664 """
665 s = "RegexpChunk with %d rules:\n" % len(self._rules)
666 margin = 0
667 for rule in self._rules:
668 margin = max(margin, len(rule.descr()))
669 if margin < 35:
670 format = " %" + `-(margin+3)` + "s%s\n"
671 else:
672 format = " %s\n %s\n"
673 for rule in self._rules:
674 s += format % (rule.descr(), `rule`)
675 return s[:-1]
676
677
678
679
680
681 -class Regexp(ChunkParseI, AbstractParse):
682 """
683 A grammar based chunk parser. C{chunk.Regexp} uses a set of
684 regular expression patterns to specify the behavior of the parser.
685 The chunking of the text is encoded using a C{ChunkString}, and
686 each rule acts by modifying the chunking in the C{ChunkString}.
687 The rules are all implemented using regular expression matching
688 and substitution.
689
690 A grammar contains one or more clauses in the following form:
691
692 NP:
693 {<DT|JJ>} # chunk determiners and adjectives
694 }<[\.VI].*>+{ # chink any tag beginning with V, I, or .
695 <.*>}{<DT> # split a chunk at a determiner
696 <DT|JJ>{}<NN.*> # merge chunk ending with det/adj with one starting with a noun
697
698 The patterns of a clause are executed in order. An earlier
699 pattern may introduce a chunk boundary that prevents a later
700 pattern from executing. Sometimes an individual pattern will
701 match on multiple, overlapping extents of the input. As with
702 regular expression substitution more generally, the chunker will
703 identify the first match possible, then continue looking for matches
704 after this one has ended.
705
706 The clauses of a grammar are also executed in order. A cascaded
707 chunk parser is one having more than one clause. The maximum depth
708 of a parse tree created by this chunk parser is the same as the
709 number of clauses in the grammar.
710
711 When tracing is turned on, the comment portion of a line is displayed
712 each time the corresponding pattern is applied.
713
714 @type _start: C{string}
715 @ivar _start: The start symbol of the grammar (the root node of resulting trees)
716 @type _stages: C{list} of C{RegexpChunk}
717 @ivar _stages: The list of parsing stages corresponding to the grammar
718
719 """
def __init__(self, grammar, top_node='S', loop=1, trace=0):
    """
    Create a new chunk parser from the given grammar.

    The grammar is read line by line.  A line containing C{':'}
    starts a new stage: the text before the first colon is the stage's
    chunk node label, and anything after it is treated as the first
    pattern of the stage.  Every other non-empty line is a pattern:
    C{{...}} (chunk), C{}...{} (chink), C{left}{right} (split) or
    C{left{}right} (merge).  Text after an unescaped C{'#'} is the
    pattern's description; write C{'\\#'} for a literal hash.

    @param grammar: The grammar text, one pattern per line
    @type grammar: C{string}
    @param top_node: The top node of the tree being created; it is
        passed on to every stage
    @type top_node: L{string} or L{Nonterminal}
    @param loop: The number of times to run through the patterns
    @type loop: L{int}
    @type trace: C{int}
    @param trace: The level of tracing that should be used when
        parsing a text.  C{0} will generate no tracing output;
        C{1} will generate normal tracing output; and C{2} or
        higher will generate verbose tracing output.
    @raise ValueError: If a line is not a recognised chunk pattern,
        or if a pattern appears before any stage label.
    """
    self._trace = trace
    self._stages = []
    self._grammar = grammar
    self._loop = loop

    rules = []
    lhs = None
    for line in grammar.split('\n'):
        # Cut the line at the first '#' that is not escaped as '\#'.
        # Escaped hashes are left exactly as written on both sides.
        parts = re.split(r'(?<!\\)#', line, maxsplit=1)
        line = parts[0]
        if len(parts) > 1:
            comment = parts[1].strip()
        else:
            comment = ''

        # A stage label closes the previous stage and opens a new one.
        if ':' in line:
            if rules:
                self._stages.append(
                    RegexpChunk(rules, chunk_node=lhs,
                                top_node=top_node, trace=trace))
            # Split on the first colon only, so that the pattern that
            # follows the label may itself contain ':'.
            lhs, line = line.split(':', 1)
            lhs = lhs.strip()
            rules = []

        line = line.strip()
        if not line:
            continue

        if lhs is None:
            raise ValueError('Expected stage marker (eg NP:) before '
                             'chunk pattern: %s' % line)

        # Dispatch on the position of the braces.
        if line[0] == '{' and line[-1] == '}':
            rules.append(ChunkRule(line[1:-1], comment))
        elif line[0] == '}' and line[-1] == '{':
            rules.append(ChinkRule(line[1:-1], comment))
        elif '}{' in line:
            left, right = line.split('}{')
            rules.append(SplitRule(left, right, comment))
        elif '{}' in line:
            left, right = line.split('{}')
            rules.append(MergeRule(left, right, comment))
        else:
            raise ValueError('Illegal chunk pattern: %s' % line)

    # Close the final stage.
    if rules:
        self._stages.append(
            RegexpChunk(rules, chunk_node=lhs,
                        top_node=top_node, trace=trace))
781
def parse(self, chunk_struct, trace=None):
    """
    Apply the chunk parser to this input.

    Every stage is applied in order, each one receiving the output of
    the previous one, and the whole sequence of stages is repeated
    C{loop} times (as given to the constructor).

    @type chunk_struct: C{Tree}
    @param chunk_struct: the chunk structure to be (further) chunked
    @type trace: C{int}
    @param trace: The level of tracing that should be used when
        parsing a text.  C{0} will generate no tracing output;
        C{1} will generate normal tracing output; and C{2} or
        higher will generate verbose tracing output.  This value
        overrides the trace level value that was given to the
        constructor.
    @return: the chunked output, i.e. the result of the last stage
        (or C{chunk_struct} itself when nothing is applied).
    @rtype: C{Tree}
    """
    # Fall back to the trace level given to the constructor.
    if trace is None:
        trace = self._trace
    for _ in range(self._loop):
        for parser in self._stages:
            chunk_struct = parser.parse(chunk_struct, trace=trace)
    return chunk_struct
804
806 """
807 @return: a concise string representation of this C{chunk.Regexp}.
808 @rtype: C{string}
809 """
810 return "<chunk.Regexp with %d stages>" % len(self._stages)
811
813 """
814 @return: a verbose string representation of this
815 C{chunk.Regexp}.
816 @rtype: C{string}
817 """
818 s = "chunk.Regexp with %d stages:\n" % len(self._stages)
819 margin = 0
820 for parser in self._stages:
821 s += parser.__str__() + "\n"
822 return s[:-1]
823
824
825
826
827
829 """
830 Demonstration code for evaluating a chunk parser, using a
831 C{ChunkScore}. This function assumes that C{text} contains one
832 sentence per line, and that each sentence has the form expected by
833 C{tree.chunk}. It runs the given chunk parser on each sentence in
834 the text, and scores the result. It prints the final score
835 (precision, recall, and f-measure); and reports the set of chunks
836 that were missed and the set of chunks that were incorrect. (At
837 most 10 missing chunks and 10 incorrect chunks are reported).
838
839 @param chunkparser: The chunkparser to be tested
840 @type chunkparser: C{ChunkParseI}
841 @param text: The chunked tagged text that should be used for
842 evaluation.
843 @type text: C{string}
844 """
845
846 from nltk_lite import chunk
847
848
849 chunkscore = chunk.ChunkScore()
850
851 from nltk_lite.parse.tree import Tree
852
853 for sentence in text.split('\n'):
854 print sentence
855 sentence = sentence.strip()
856 if not sentence: continue
857 gold = chunk.tagstr2tree(sentence)
858 tokens = gold.leaves()
859 test = chunkparser.parse(Tree('S', tokens), trace=1)
860 chunkscore.score(gold, test)
861 print
862
863 print '/'+('='*75)+'\\'
864 print 'Scoring', chunkparser
865 print ('-'*77)
866 print 'Precision: %5.1f%%' % (chunkscore.precision()*100), ' '*4,
867 print 'Recall: %5.1f%%' % (chunkscore.recall()*100), ' '*6,
868 print 'F-Measure: %5.1f%%' % (chunkscore.f_measure()*100)
869
870
871
872 if chunkscore.missed():
873 print 'Missed:'
874 missed = chunkscore.missed()
875 for chunk in missed[:10]:
876 print ' ', chunk
877 if len(chunkscore.missed()) > 10:
878 print ' ...'
879
880
881 if chunkscore.incorrect():
882 print 'Incorrect:'
883 incorrect = chunkscore.incorrect()
884 for chunk in incorrect[:10]:
885 print ' ', chunk
886 if len(chunkscore.incorrect()) > 10:
887 print ' ...'
888
889 print '\\'+('='*75)+'/'
890 print
891
893 """
894 A demonstration for the C{RegexpChunk} class. A single text is
895 parsed with four different chunk parsers, using a variety of rules
896 and strategies.
897 """
898
899 from nltk_lite import chunk
900 from nltk_lite.tag import string2tags
901 from nltk_lite.parse.tree import Tree
902
903 text = """\
904 [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./.
905 [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./.
906 [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
907 """
908
909 print '*'*75
910 print 'Evaluation text:'
911 print text
912 print '*'*75
913 print
914
915 grammar = r"""
916 NP: # NP stage
917 {<DT>?<JJ>*<NN>} # chunk determiners, adjectives and nouns
918 {<NNP>+} # chunk proper nouns
919 """
920 cp = chunk.Regexp(grammar)
921 chunk.demo_eval(cp, text)
922
923 grammar = r"""
924 NP:
925 {<.*>} # start by chunking each tag
926 }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods
927 <DT|JJ>{}<NN.*> # merge det/adj with nouns
928 """
929 cp = chunk.Regexp(grammar)
930 chunk.demo_eval(cp, text)
931
932 grammar = r"""
933 NP: {<DT>?<JJ>*<NN>} # chunk determiners, adjectives and nouns
934 VP: {<TO>?<VB.*>} # VP = verb words
935 """
936 cp = chunk.Regexp(grammar)
937 chunk.demo_eval(cp, text)
938
939 grammar = r"""
940 NP: {<.*>*} # start by chunking everything
941 }<[\.VI].*>+{ # chink any verbs, prepositions or periods
942 <.*>}{<DT> # separate on determiners
943 PP: {<IN><NP>} # PP = preposition + noun phrase
944 VP: {<VB.*><NP|PP>*} # VP = verb words + NPs and PPs
945 """
946 cp = chunk.Regexp(grammar)
947 chunk.demo_eval(cp, text)
948
949
950
951 from nltk_lite.corpora import conll2000
952
953 print
954 print "Demonstration of empty grammar:"
955
956 cp = chunk.Regexp("")
957 print chunk.accuracy(cp, conll2000.chunked(files='test', chunk_types=('NP',)))
958
959 print
960 print "Demonstration of accuracy evaluation using CoNLL tags:"
961
962 from itertools import islice
963
964 grammar = r"""
965 NP:
966 {<.*>} # start by chunking each tag
967 }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods
968 <DT|JJ>{}<NN.*> # merge det/adj with nouns
969 """
970 cp = chunk.Regexp(grammar)
971 print chunk.accuracy(cp, islice(conll2000.chunked(chunk_types=('NP', 'PP', 'VP')), 0, 5))
972
973 print
974 print "Demonstration of tagged token input"
975
976 grammar = r"""
977 NP: {<.*>*} # start by chunking everything
978 }<[\.VI].*>+{ # chink any verbs, prepositions or periods
979 <.*>}{<DT> # separate on determiners
980 PP: {<IN><NP>} # PP = preposition + noun phrase
981 VP: {<VB.*><NP|PP>*} # VP = verb words + NPs and PPs
982 """
983 cp = chunk.Regexp(grammar)
984 print cp.parse([("the","DT"), ("little","JJ"), ("cat", "NN"), ("sat", "VBD"), ("on", "IN"), ("the", "DT"), ("mat", "NN"), (".", ".")])
985
# Run the demonstration when this file is executed as a script.
if __name__ == "__main__":
    demo()
988