Home | Trees | Indices | Help |
|
---|
|
# Natural Language Toolkit: Penn Treebank Reader
#
# Copyright (C) 2001-2007 University of Pennsylvania
# Author: Steven Bird <sb@ldc.upenn.edu>
#         Edward Loper <edloper@gradient.cis.upenn.edu>
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT

"""
Penn Treebank corpus sample: tagged, NP-chunked, and parsed data from
Wall Street Journal for 3700 sentences.

This is a ~10% fragment of the Wall Street Journal section of the Penn
Treebank, (C) LDC 1995.  It is distributed with the Natural Language Toolkit
under the terms of the Creative Commons Attribution-NonCommercial-ShareAlike License
[http://creativecommons.org/licenses/by-nc-sa/2.5/].

Raw:

    Pierre Vinken, 61 years old, will join the board as a nonexecutive
    director Nov. 29.

Tagged:

    Pierre/NNP Vinken/NNP ,/, 61/CD years/NNS old/JJ ,/, will/MD join/VB
    the/DT board/NN as/IN a/DT nonexecutive/JJ director/NN Nov./NNP 29/CD ./.

NP-Chunked:

    [ Pierre/NNP Vinken/NNP ]
    ,/,
    [ 61/CD years/NNS ]
    old/JJ ,/, will/MD join/VB
    [ the/DT board/NN ]
    as/IN
    [ a/DT nonexecutive/JJ director/NN Nov./NNP 29/CD ]
    ./.

Parsed:

    ( (S
      (NP-SBJ
        (NP (NNP Pierre) (NNP Vinken) )
        (, ,)
        (ADJP
          (NP (CD 61) (NNS years) )
          (JJ old) )
        (, ,) )
      (VP (MD will)
        (VP (VB join)
          (NP (DT the) (NN board) )
          (PP-CLR (IN as)
            (NP (DT a) (JJ nonexecutive) (NN director) ))
          (NP-TMP (NNP Nov.) (CD 29) )))
      (. .) ))
"""

from nltk_lite.corpora import get_basedir
from nltk_lite import tokenize, chunk
from nltk_lite.tag import tag2tuple
from nltk_lite.parse import tree
import os

# NOTE(review): the ``def`` header lines of the four readers and of demo()
# were missing from the extracted listing this module was recovered from.
# The function names come from the calls made in demo(), and the parameter
# names from their use in each body.  The default file names below are
# presumed, not visible in the listing -- confirm against the original module.


def _read_files(files, basedir):
    """
    Yield the complete text of each requested treebank file, in order.

    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @param basedir: Corpus base directory; C{get_basedir()} is used
        when this is empty or C{None}
    @type basedir: L{string}
    @rtype: iterator over L{string}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    if not basedir:
        basedir = get_basedir()

    for name in files:
        # Build the path from the resolved basedir, so that a directory
        # passed in by the caller is actually honoured.
        path = os.path.join(basedir, "treebank", name)
        stream = open(path)
        # The yield is kept outside the try/finally so this stays valid on
        # older Python 2 releases while still closing the file promptly.
        try:
            text = stream.read()
        finally:
            stream.close()
        yield text


def parsed(files = 'parsed', basedir = None):
    """
    Read parsed sentences, producing one tree per s-expression.

    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @param basedir: Corpus base directory; C{get_basedir()} is used
        when this is empty or C{None}
    @type basedir: L{string}
    @rtype: iterator over L{tree}
    """

    for s in _read_files(files, basedir):
        for t in tokenize.sexpr(s):
            try:
                yield tree.bracket_parse(t)
            except IndexError:
                # in case it's the real treebank format,
                # strip first and last brackets before parsing
                yield tree.bracket_parse(t[1:-1])


def chunked(files = 'chunked', basedir = None):
    """
    Read NP-chunked sentences, producing one chunk tree per
    blank-line-separated block.

    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @param basedir: Corpus base directory; C{get_basedir()} is used
        when this is empty or C{None}
    @type basedir: L{string}
    @rtype: iterator over L{tree}
    """

    for s in _read_files(files, basedir):
        for t in tokenize.blankline(s):
            yield chunk.tagstr2tree(t)


def tagged(files = 'chunked', basedir = None):
    """
    Read tagged sentences, producing one list of C{tag2tuple} results per
    blank-line-separated block.  The chunk bracket tokens C{[} and C{]}
    are skipped.

    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @param basedir: Corpus base directory; C{get_basedir()} is used
        when this is empty or C{None}
    @type basedir: L{string}
    @rtype: iterator over L{list(tuple)}
    """

    for s in _read_files(files, basedir):
        for sent in tokenize.blankline(s):
            yield [tag2tuple(t) for t in tokenize.whitespace(sent)
                   if t != '[' and t != ']']


def raw(files = 'raw', basedir = None):
    """
    Read raw sentences, producing one list of whitespace-separated tokens
    per blank-line-separated block.

    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @param basedir: Corpus base directory; C{get_basedir()} is used
        when this is empty or C{None}
    @type basedir: L{string}
    @rtype: iterator over L{list(string)}
    """

    for s in _read_files(files, basedir):
        for sent in tokenize.blankline(s):
            yield list(tokenize.whitespace(sent))


def demo():
    """
    Print the first three items produced by each of the four readers.
    """
    from nltk_lite.corpora import treebank
    from itertools import islice

    # Every print below takes a single parenthesised argument, which gives
    # the same output under the Python 2 print statement this module was
    # written for and under the print() function.
    print("Parsed:")
    for t in islice(treebank.parsed(), 3):
        print(t.pp())
    print("")

    print("Chunked:")
    for t in islice(treebank.chunked(), 3):
        print(t.pp())
    print("")

    print("Tagged:")
    for sent in islice(treebank.tagged(), 3):
        print(sent)
    print("")

    print("Raw:")
    for sent in islice(treebank.raw(), 3):
        print(sent)
    print("")


if __name__ == '__main__':
    demo()
Home | Trees | Indices | Help |
|
---|
Generated by Epydoc 3.0beta1 on Wed May 16 22:47:58 2007 | http://epydoc.sourceforge.net |