Package nltk_lite :: Package corpora :: Module genesis
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.corpora.genesis

 1  # Natural Language Toolkit: Genesis Corpus Reader 
 2  # 
 3  # Copyright (C) 2001-2007 University of Pennsylvania 
 4  # Author: Steven Bird <sb@ldc.upenn.edu> 
 5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
 6  # URL: <http://nltk.sf.net> 
 7  # For license information, see LICENSE.TXT 
 8   
 9  """ 
10  The Genesis Corpus. 
11   
12  This corpus has been prepared from several web sources; formatting, 
13  markup and verse numbers have been stripped. 
14   
15  english-kjv - Genesis, King James version (Project Gutenberg) 
16  english-web - Genesis, World English Bible (Project Gutenberg) 
17  french - Genesis, Louis Segond 1910 
18  german - Genesis, Luther Translation 
19  swedish - Genesis, Gamla och Nya Testamentet, 1917 (Project Runeberg) 
20  finnish - Genesis, Suomen evankelis-luterilaisen kirkon kirkolliskokouksen vuonna 1992 kayttoon ottama suomennos 
21  """ 
22   
23  from nltk_lite.corpora import get_basedir 
24  from nltk_lite import tokenize 
25  import os 
26   
# Names of the available Genesis texts, in presentation order.  Each name is
# the basename (without the ".txt" suffix) of a file in the "genesis" corpus
# directory and is a valid argument to raw().
items = [
    'english-kjv',
    'english-web',
    'french',
    'german',
    'swedish',
    'finnish',
]

# Human-readable description of each text, keyed by the names in `items`.
item_name = {
    'english-kjv': 'Genesis, King James version (Project Gutenberg)',
    'english-web': 'Genesis, World English Bible (Project Gutenberg)',
    'french': 'Genesis, Louis Segond 1910',
    'german': 'Genesis, Luther Translation',
    'swedish': 'Genesis, Gamla och Nya Testamentet, 1917 (Project Runeberg)',
    'finnish': 'Genesis, Suomen evankelis-luterilaisen kirkon kirkolliskokouksen vuonna 1992 kayttoon ottama suomennos',
}
43   
def raw(files='english-kjv'):
    """
    Iterate over the whitespace-separated tokens of one or more Genesis
    texts.

    Each name in C{files} is resolved to the file C{genesis/<name>.txt}
    below the directory returned by C{get_basedir()}.  The texts are
    processed in the order given; each file is read completely and closed
    before its first token is yielded.  Because this is a generator,
    nothing is opened until iteration starts.

    @param files: One or more Genesis corpus items to be processed
        (see the module-level C{items} list for the available names)
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over the tokens produced by C{tokenize.whitespace}
    @raise IOError: if a corpus file cannot be opened or read
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "genesis", name + ".txt")

        # Close the handle explicitly instead of leaving it to the garbage
        # collector.  The try/finally contains no yield, so this form is
        # valid on every Python version that supports generators.
        stream = open(path)
        try:
            text = stream.read()
        finally:
            stream.close()

        for token in tokenize.whitespace(text):
            yield token
59
def demo():
    """
    Print the first 27 tokens of the English (King James) and Finnish
    Genesis texts, each under its own heading.

    Every sample is written as one space-separated line, which is the
    same text the per-token C{print word,} loop used to produce.
    """
    from nltk_lite.corpora import genesis
    from itertools import islice

    # (heading, corpus item) pairs; 'english-kjv' is the item that
    # genesis.raw() reads when called without arguments.
    samples = (
        ('English:', 'english-kjv'),
        ('Finnish:', 'finnish'),
    )

    for heading, item in samples:
        # Single-argument print(...) emits the same output whether it is
        # parsed as a Python 2 print statement or a Python 3 function call.
        print(heading)
        print(' '.join(islice(genesis.raw(item), 27)))
# Run the demonstration only when this module is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    demo()