Package nltk_lite :: Package corpora :: Module ppattach
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.corpora.ppattach

 1  # Natural Language Toolkit: PP Attachment Corpus Reader 
 2  # 
 3  # Copyright (C) 2001-2007 University of Pennsylvania 
 4  # Author: Steven Bird <sb@ldc.upenn.edu> 
 5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
 6  # URL: <http://nltk.sf.net> 
 7  # For license information, see LICENSE.TXT 
 8   
 9  """ 
10  Read lines from the Prepositional Phrase Attachment Corpus. 
11   
12  The PP Attachment Corpus contains several files having the format: 
13   
14  sentence_id verb noun1 preposition noun2 attachment 
15   
16  For example: 
17   
18  42960 gives authority to administration V 
19  46742 gives inventors of microchip N 
20   
21  The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.: 
22   
23  (VP gives (NP authority) (PP to administration)) 
24  (VP gives (NP inventors (PP of microchip))) 
25   
26  The corpus contains the following files: 
27   
28  training:   training set 
29  devset:     development test set, used for algorithm development. 
30  test:       test set, used to report results 
31  bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal. 
32   
33  Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional 
34  Phrase Attachment.  Proceedings of the ARPA Human Language Technology 
35  Conference.  [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps] 
36   
37  The PP Attachment Corpus is distributed with NLTK with the permission 
38  of the author. 
39  """        
40   
41  from nltk_lite.corpora import get_basedir 
42  from nltk_lite import tokenize 
43  from nltk_lite.tag import string2tags, string2words 
44  import os 
45   
# Names of the corpus files that the reader functions process by default.
items = ['training', 'devset', 'test']

# Human-readable description of each corpus file, keyed by file name.
item_name = {
    'training': 'training set',
    'devset': 'development test set',
    'test': 'test set',
}
53   
def raw(files=items):
    """
    Iterate over the entries of one or more PP Attachment Corpus files.

    Every line of every requested file is split on whitespace and yielded
    as a tuple of strings, in file order.

    @param files: the name of a single corpus file, or a sequence of such
        names.  Defaults to every file listed in C{items}.
    @return: a generator producing one tuple of strings per line.
    """
    # A bare string names a single file; wrap it so that the loop below
    # does not iterate over its individual characters.
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "ppattach", name)
        # The context manager closes the file once it is exhausted, and
        # also when the generator itself is closed before reaching the
        # end of the file (callers often take only the first few entries).
        with open(path) as stream:
            for line in stream:
                yield tuple(line.split())
61
def dictionary(files=items):
    """
    Iterate over the corpus entries as dictionaries.

    Each tuple produced by C{raw(files)} is turned into a dictionary whose
    keys are 'sent', 'verb', 'noun1', 'prep', 'noun2' and 'attachment',
    taken from the first six fields of the tuple in that order.  A tuple
    with fewer than six fields raises C{IndexError}.

    @param files: passed through unchanged to C{raw}.
    @return: a generator producing one dictionary per corpus line.
    """
    # Position i of each raw tuple is stored under field_names[i].
    field_names = ('sent', 'verb', 'noun1', 'prep', 'noun2', 'attachment')
    for entry in raw(files):
        yield {key: entry[pos] for pos, key in enumerate(field_names)}
72
def demo():
    """
    Pretty-print the first five entries of the 'training' file, first as
    raw tuples and then as dictionaries.
    """
    from nltk_lite.corpora import ppattach
    from itertools import islice
    from pprint import pprint

    # Only the first five entries are pulled from each generator.
    first_tuples = list(islice(ppattach.raw('training'), 5))
    pprint(first_tuples)

    first_dicts = list(islice(ppattach.dictionary('training'), 5))
    pprint(first_dicts)
if __name__ == "__main__":
    # Executed as a script rather than imported: run the demonstration.
    demo()