Package nltk_lite :: Package corpora :: Module conll2000
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.corpora.conll2000

 1  # Natural Language Toolkit: CONLL Corpus Reader 
 2  # 
 3  # Copyright (C) 2001-2007 University of Pennsylvania 
 4  # Author: Steven Bird <sb@ldc.upenn.edu> 
 5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
 6  # URL: <http://nltk.sf.net> 
 7  # For license information, see LICENSE.TXT 
 8   
 9  """ 
10  Read chunk structures from the CONLL-2000 Corpus 
11  """        
12   
13  from nltk_lite.corpora import get_basedir 
14  from nltk_lite import tokenize, chunk 
15  from nltk_lite.parse import tree 
16  import os 
17   
# Names of the corpus items; 'train' is the default used by the reader
# functions in this module.
items = ['train', 'test']

# Human-readable description for each item name.
item_name = dict(train='training set', test='test set')
24   
def _list_sent(sent):
    """
    Break one sentence block into per-line token sequences.

    Every line produced by ``tokenize.line(sent)`` is handed to
    ``tokenize.whitespace``; the results are returned as a list, in
    line order.
    """
    rows = []
    for line in tokenize.line(sent):
        rows.append(tokenize.whitespace(line))
    return rows
27
def raw(files=['train']):
    """
    Generate the sentences of the requested corpus items as word lists.

    files: a single item name (a string) or a sequence of item names.
        Each name ``n`` is read from ``<basedir>/conll2000/n.txt``, where
        ``<basedir>`` is the value of ``get_basedir()``.  The default list
        is only iterated, never modified.

    Yields, for every blank-line separated sentence of every file, the
    list of the first fields of its lines.  Each line is expected to
    split into exactly three whitespace-separated fields
    (word, tag, chunk tag).
    """
    # isinstance also accepts str subclasses, unlike an exact type check.
    if isinstance(files, str):
        files = (files,)
    for name in files:
        path = os.path.join(get_basedir(), "conll2000", name + ".txt")
        # Close the handle deterministically instead of leaving it to the
        # garbage collector; the yield stays outside the try block.
        stream = open(path)
        try:
            s = stream.read()
        finally:
            stream.close()
        for sent in tokenize.blankline(s):
            yield [word for (word, tag, chunk_tag) in _list_sent(sent)]
35
def tagged(files=['train']):
    """
    Generate the sentences of the requested corpus items as lists of
    (word, tag) pairs.

    files: a single item name (a string) or a sequence of item names.
        Each name ``n`` is read from ``<basedir>/conll2000/n.txt``, where
        ``<basedir>`` is the value of ``get_basedir()``.  The default list
        is only iterated, never modified.

    Yields, for every blank-line separated sentence of every file, a list
    holding the first two fields of each line.  Each line is expected to
    split into exactly three whitespace-separated fields
    (word, tag, chunk tag); the third is dropped.
    """
    # isinstance also accepts str subclasses, unlike an exact type check.
    if isinstance(files, str):
        files = (files,)
    for name in files:
        path = os.path.join(get_basedir(), "conll2000", name + ".txt")
        # Close the handle deterministically instead of leaving it to the
        # garbage collector; the yield stays outside the try block.
        stream = open(path)
        try:
            s = stream.read()
        finally:
            stream.close()
        for sent in tokenize.blankline(s):
            yield [(word, tag) for (word, tag, chunk_tag) in _list_sent(sent)]
43
def chunked(files=['train'], chunk_types=('NP','PP','VP')[0:0] + ('NP','VP','PP')):
    """
    Generate the sentences of the requested corpus items as chunk
    structures.

    files: a single item name (a string) or a sequence of item names.
        Each name ``n`` is read from ``<basedir>/conll2000/n.txt``, where
        ``<basedir>`` is the value of ``get_basedir()``.  The default list
        is only iterated, never modified.
    chunk_types: passed through unchanged to ``chunk.conllstr2tree``;
        defaults to ('NP', 'VP', 'PP').

    Yields, for every blank-line separated sentence of every file, the
    value returned by ``chunk.conllstr2tree(sent, chunk_types)``.
    """
    # isinstance also accepts str subclasses, unlike an exact type check.
    if isinstance(files, str):
        files = (files,)
    for name in files:
        path = os.path.join(get_basedir(), "conll2000", name + ".txt")
        # Close the handle deterministically instead of leaving it to the
        # garbage collector; the yield stays outside the try block.
        stream = open(path)
        try:
            s = stream.read()
        finally:
            stream.close()
        for sent in tokenize.blankline(s):
            yield chunk.conllstr2tree(sent, chunk_types)
51
def demo():
    """
    Print a short sample of the corpus: the first five sentences of the
    default item as raw words, as tagged words, and as chunk structures
    restricted to the NP and PP chunk types.
    """
    from nltk_lite.corpora import conll2000
    from itertools import islice

    limit = 5

    # Single-argument print(...) behaves the same as the print statement.
    print("CONLL Chunked data\n")

    print("Raw text:")
    for words in islice(conll2000.raw(), limit):
        print(words)
    print("")

    print("Tagged text:")
    for pairs in islice(conll2000.tagged(), limit):
        print(pairs)
    print("")

    print("Chunked text:")
    wanted = ('NP', 'PP')
    for structure in islice(conll2000.chunked(chunk_types=wanted), limit):
        print(structure.pp())
    print("")


# Run the demonstration when this file is executed as a script.
if __name__ == "__main__":
    demo()