Package nltk_lite :: Package corpora :: Module ycoe
[hide private]
[frames] | [no frames]

Source Code for Module nltk_lite.corpora.ycoe

  1  # -*- coding: iso-8859-1 -*- 
  2   
  3  # Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE) 
  4  # 
  5  # Copyright (C) 2001-2007 University of Pennsylvania 
  6  # Author: Selina Dennis <selina@tranzfusion.net> 
  7  # URL: <http://nltk.sf.net> 
  8  # For license information, see LICENSE.TXT 
  9   
 10  """ 
 11  Reads tokens from the York-Toronto-Helsinki Parsed Corpus of  
 12  Old English Prose (YCOE), a 1.5 million word syntactically- 
 13  annotated corpus of Old English prose texts. The corpus is 
 14  distributed by the Oxford Text Archive: http://www.ota.ahds.ac.uk/ 
 15   
 16  The YCOE corpus is divided into 100 files, each representing 
  17  an Old English prose text. Tags used within each text comply 
  18  with the YCOE standard: http://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm  
 19   
 20  Output of the reader is as follows: 
 21   
 22  Raw: 
 23  ['+D+atte', 
 24    'on', 
 25    'o+dre', 
 26    'wisan', 
 27    'sint', 
 28    'to', 
 29    'manianne', 
 30    '+da', 
 31    'unge+dyldegan', 
 32    ',', 
 33    '&', 
 34    'on', 
 35    'o+dre', 
 36    '+da', 
 37    'ge+dyldegan', 
 38    '.'] 
 39   
 40  Tagged: 
 41  [('+D+atte', 'C'), 
 42    ('on', 'P'), 
 43    ('o+dre', 'ADJ'), 
 44    ('wisan', 'N'), 
 45    ('sint', 'BEPI'), 
 46    ('to', 'TO'), 
 47    ('manianne', 'VB^D'), 
 48    ('+da', 'D^N'), 
 49    ('unge+dyldegan', 'ADJ^N'), 
 50    (',', ','), 
 51    ('&', 'CONJ'), 
 52    ('on', 'P'), 
 53    ('o+dre', 'ADJ'), 
 54    ('+da', 'D^N'), 
 55    ('ge+dyldegan', 'ADJ^N'), 
 56    ('.', '.')] 
 57   
 58  Bracket Parse: 
 59  (CP-THT: (C: '+D+atte') (IP-SUB: (IP-SUB-0: (PP: (P: 'on') (NP: (ADJ: 'o+dre') (N: 'wisan')))  
 60  (BEPI: 'sint') (IP-INF: (TO: 'to') (VB^D: 'manianne') (NP: '*-1')) (NP-NOM-1: (D^N: '+da')  
 61  (ADJ^N: 'unge+dyldegan'))) (,: ',') (CONJP: (CONJ: '&') (IPX-SUB-CON=0: (PP: (P: 'on')  
 62  (NP: (ADJ: 'o+dre'))) (NP-NOM: (D^N: '+da') (ADJ^N: 'ge+dyldegan'))))) (.: '.')), 
 63   
 64  Chunk Parse: 
 65  [(S:  
 66      ('C', '+D+atte')  
 67      (PP: ('P', 'on') ('ADJ', 'o+dre') ('N', 'wisan'))  
 68      ('BEPI', 'sint') ('TO', 'to') ('VB^D', 'manianne')  
 69      (NP: ('NP', '*-1')) ('D^N', '+da') ('ADJ^N', 'unge+dyldegan') (',', ',') ('CONJ', '&')  
 70      (PP: ('P', 'on') ('ADJ', 'o+dre')) ('D^N', '+da') ('ADJ^N', 'ge+dyldegan') ('.', '.'))] 
 71   
 72  """ 
 73   
 74  from nltk_lite.corpora import get_basedir 
 75  from nltk_lite import tokenize 
 76  from nltk_lite.tag import string2tags, string2words 
 77  from nltk_lite.parse import tree 
 78  from string import split 
 79  import os 
 80  import re 
 81   
 82  """  
 83  All files within the corpora 
 84  """ 
 85  item_name = { 
 86      'coadrian.o34': 'Adrian and Ritheus', 
 87      'coaelhom.o3': 'Ælfric, Supplemental Homilies', 
 88      'coaelive.o3': 'Ælfric''s Lives of Saints', 
 89      'coalcuin': 'Alcuin De virtutibus et vitiis', 
 90      'coalex.o23': 'Alexander''s Letter to Aristotle', 
 91      'coapollo.o3': 'Apollonius of Tyre', 
 92      'coaugust': 'Augustine', 
 93      'cobede.o2': 'Bede''s History of the English Church', 
 94      'cobenrul.o3': 'Benedictine Rule', 
 95      'coblick.o23': 'Blickling Homilies', 
 96      'coboeth.o2': 'Boethius'' Consolation of Philosophy', 
 97      'cobyrhtf.o3': 'Byrhtferth''s Manual', 
 98      'cocanedgD': 'Canons of Edgar (D)', 
 99      'cocanedgX': 'Canons of Edgar (X)', 
100      'cocathom1.o3': 'Ælfric''s Catholic Homilies I', 
101      'cocathom2.o3': 'Ælfric''s Catholic Homilies II', 
102      'cochad.o24': 'Saint Chad', 
103      'cochdrul': 'Chrodegang of Metz, Rule', 
104      'cochristoph': 'Saint Christopher', 
105      'cochronA.o23': 'Anglo-Saxon Chronicle A', 
106      'cochronC': 'Anglo-Saxon Chronicle C', 
107      'cochronD': 'Anglo-Saxon Chronicle D', 
108      'cochronE.o34': 'Anglo-Saxon Chronicle E', 
109      'cocura.o2': 'Cura Pastoralis', 
110      'cocuraC': 'Cura Pastoralis (Cotton)', 
111      'codicts.o34': 'Dicts of Cato', 
112      'codocu1.o1': 'Documents 1 (O1)', 
113      'codocu2.o12': 'Documents 2 (O1/O2)', 
114      'codocu2.o2': 'Documents 2 (O2)', 
115      'codocu3.o23': 'Documents 3 (O2/O3)', 
116      'codocu3.o3': 'Documents 3 (O3)', 
117      'codocu4.o24': 'Documents 4 (O2/O4)', 
118      'coeluc1': 'Honorius of Autun, Elucidarium 1', 
119      'coeluc2': 'Honorius of Autun, Elucidarium 1', 
120      'coepigen.o3': 'Ælfric''s Epilogue to Genesis', 
121      'coeuphr': 'Saint Euphrosyne', 
122      'coeust': 'Saint Eustace and his companions', 
123      'coexodusP': 'Exodus (P)', 
124      'cogenesiC': 'Genesis (C)', 
125      'cogregdC.o24': 'Gregory''s Dialogues (C)', 
126      'cogregdH.o23': 'Gregory''s Dialogues (H)', 
127      'coherbar': 'Pseudo-Apuleius, Herbarium', 
128      'coinspolD.o34': 'Wulfstan''s Institute of Polity (D)', 
129      'coinspolX': 'Wulfstan''s Institute of Polity (X)', 
130      'cojames': 'Saint James', 
131      'colacnu.o23': 'Lacnunga', 
132      'colaece.o2': 'Leechdoms', 
133      'colaw1cn.o3': 'Laws, Cnut I', 
134      'colaw2cn.o3': 'Laws, Cnut II', 
135      'colaw5atr.o3': 'Laws, Æthelred V', 
136      'colaw6atr.o3': 'Laws, Æthelred VI', 
137      'colawaf.o2': 'Laws, Alfred', 
138      'colawafint.o2': 'Alfred''s Introduction to Laws', 
139      'colawger.o34': 'Laws, Gerefa', 
140      'colawine.ox2': 'Laws, Ine', 
141      'colawnorthu.o3': 'Northumbra Preosta Lagu', 
142      'colawwllad.o4': 'Laws, William I, Lad', 
143      'coleofri.o4': 'Leofric', 
144      'colsigef.o3': 'Ælfric''s Letter to Sigefyrth', 
145      'colsigewB': 'Ælfric''s Letter to Sigeweard (B)', 
146      'colsigewZ.o34': 'Ælfric''s Letter to Sigeweard (Z)', 
147      'colwgeat': 'Ælfric''s Letter to Wulfgeat', 
148      'colwsigeT': 'Ælfric''s Letter to Wulfsige (T)', 
149      'colwsigeXa.o34': 'Ælfric''s Letter to Wulfsige (Xa)', 
150      'colwstan1.o3': 'Ælfric''s Letter to Wulfstan I', 
151      'colwstan2.o3': 'Ælfric''s Letter to Wulfstan II', 
152      'comargaC.o34': 'Saint Margaret (C)', 
153      'comargaT': 'Saint Margaret (T)', 
154      'comart1': 'Martyrology, I', 
155      'comart2': 'Martyrology, II', 
156      'comart3.o23': 'Martyrology, III', 
157      'comarvel.o23': 'Marvels of the East', 
158      'comary': 'Mary of Egypt', 
159      'coneot': 'Saint Neot', 
160      'conicodA': 'Gospel of Nicodemus (A)', 
161      'conicodC': 'Gospel of Nicodemus (C)', 
162      'conicodD': 'Gospel of Nicodemus (D)', 
163      'conicodE': 'Gospel of Nicodemus (E)', 
164      'coorosiu.o2': 'Orosius', 
165      'cootest.o3': 'Heptateuch', 
166      'coprefcath1.o3': 'Ælfric''s Preface to Catholic Homilies I', 
167      'coprefcath2.o3': 'Ælfric''s Preface to Catholic Homilies II', 
168      'coprefcura.o2': 'Preface to the Cura Pastoralis', 
169      'coprefgen.o3': 'Ælfric''s Preface to Genesis', 
170      'copreflives.o3': 'Ælfric''s Preface to Lives of Saints', 
171      'coprefsolilo': 'Preface to Augustine''s Soliloquies', 
172      'coquadru.o23': 'Pseudo-Apuleius, Medicina de quadrupedibus', 
173      'corood': 'History of the Holy Rood-Tree', 
174      'cosevensl': 'Seven Sleepers', 
175      'cosolilo': 'St. Augustine''s Soliloquies', 
176      'cosolsat1.o4': 'Solomon and Saturn I', 
177      'cosolsat2': 'Solomon and Saturn II', 
178      'cotempo.o3': 'Ælfric''s De Temporibus Anni', 
179      'coverhom': 'Vercelli Homilies', 
180      'coverhomE': 'Vercelli Homilies (E)', 
181      'coverhomL': 'Vercelli Homilies (L)', 
182      'covinceB': 'Saint Vincent (Bodley 343)', 
183      'covinsal': 'Vindicta Salvatoris', 
184      'cowsgosp.o3': 'West-Saxon Gospels', 
185      'cowulf.o34': 'Wulfstan''s Homilies' 
186      } 
187   
188  items = item_name.keys() 
189   
190  """ 
191  Reads files from a given list, and converts them via the conversion_function. 
192  Can return raw or tagged read files. 
193  """ 
194 -def _read(files, conversion_function):
195 if type(files) is str: files = (files,) 196 197 for file in files: 198 path = os.path.join(get_basedir(), "ycoe/pos", file) 199 f = open(path).read() 200 rx_pattern = re.compile(r""" 201 <.*>_CODE 202 |\s.*_ID 203 """, re.VERBOSE|re.UNICODE) 204 mySents = tokenize.blankline(f) 205 for sent in mySents: 206 sent= re.sub(rx_pattern, '', sent) 207 if sent != "": 208 yield conversion_function(sent, sep="_")
209 210 """ 211 Returns the raw data without any tags. 212 """
213 -def raw(files = items):
214 return _read(files, string2words)
215 216 """ 217 Returns the tagged corpus data. 218 """
219 -def tagged(files = items):
220 return _read(files, string2tags)
221
222 -def chunked(files = items, chunk_types=('NP',), top_node="S", partial_match=False, collapse_partials=True, cascade=False):
223 return _chunk_parse(files, chunk_types, top_node, partial_match, collapse_partials, cascade)
224
225 -def bracket_parse(files = items):
226 if type(files) is str: files = (files,) 227 for file in files: 228 path = os.path.join(get_basedir(), "ycoe/psd", file + ".psd") 229 s = open(path).read() 230 data = _parse(s) 231 for sent in data: 232 yield tree.bracket_parse(sent)
233 234 """ 235 Rudimentary parsing, used by the bracket parser to obtain parsed raw data 236 """
237 -def _parse(s):
238 rx_pattern = re.compile(r""" 239 \(CODE .*\) 240 |\(ID .*\d\) 241 """, re.VERBOSE|re.UNICODE) 242 s = re.sub(rx_pattern, '', s) 243 s = split(s, '\n') 244 fullPhrase = "" 245 # loop through the sentences and parse each sentence 246 # every time a new sentence marker is found 247 for sent in s: 248 if list(tokenize.regexp(sent, r'^\(')) != []: 249 fullPhrase = _strip_spaces(fullPhrase) 250 if fullPhrase != "": 251 yield fullPhrase 252 fullPhrase = sent 253 else: 254 fullPhrase += sent 255 256 # Get the last of the buffer and output a yield 257 fullPhrase = _strip_spaces(fullPhrase) 258 if fullPhrase != "": 259 yield fullPhrase
260 261 """ 262 Helper function, strips tabs, extra spaces, and an erroneous leading 263 and ending bracket. 264 """ 265
266 -def _strip_spaces(s):
267 s = re.sub(r'^\(', '', s) 268 s = re.sub(r'\)\s*$', '', s) 269 s = re.sub(r'^\s*', '', s) 270 s = re.sub(r'\s*$', '', s) 271 s = re.sub(r'\t+', ' ', s) 272 s = re.sub(r'\s+', ' ', s) 273 274 return s
275 276 """ 277 Parses the files to return chunks of type chunk_types. Partial matching, collapsed 278 partials, and cascading are all supported. 279 """
280 -def _chunk_parse(files, chunk_types, top_node, partial_match, collapse_partials, cascade):
281 # allow any kind of bracketing for flexibility 282 283 L_BRACKET = re.compile(r'[\(\[\{<]') 284 R_BRACKET = re.compile(r'[\)\]\}>]') 285 286 if type(files) is str: files = (files,) 287 for file in files: 288 path = os.path.join(get_basedir(), "ycoe/psd", file + ".psd") 289 s = open(path).read() 290 data = _parse(s) 291 for s in data: 292 bracket = 0 293 itmType = None 294 stack = [tree.Tree(top_node, [])] 295 inTag = [] 296 for itm in list(tokenize.whitespace(s)): 297 if L_BRACKET.match(itm[0]): 298 bracket += 1 299 itm = itm[1:] 300 matched = False 301 if partial_match == True: 302 for eachItm in chunk_types: 303 if (len(eachItm) <= len(itm) and 304 eachItm == itm[:len(eachItm)]): 305 matched = True 306 if collapse_partials == True: 307 itm = eachItm 308 else: 309 if (chunk_types is not None and 310 itm in chunk_types): 311 matched = True 312 if matched == True: # and inTag == 0: 313 chunk = tree.Tree(itm, []) 314 if cascade == True: 315 stack.append(chunk) 316 inTag += [bracket] 317 else: 318 if len(inTag) == 0: 319 stack[-1].append(chunk) 320 inTag += [bracket] 321 itmType=itm 322 if R_BRACKET.match(itm[-1]): 323 tmpItm = split(itm, itm[-1]) 324 if tmpItm != "": 325 if len(inTag) > 0 and inTag[-1] <= bracket: #inTag <= bracket: 326 if cascade == True: 327 stack[-1].append( (itmType, tmpItm[0]) ) 328 else: 329 stack[-1][-1].append( (itmType, tmpItm[0]) ) 330 else: 331 if cascade == True: 332 if len(stack) > 1: 333 stack[-2].append(stack[-1]) 334 stack = stack[:-1] 335 stack[-1].append( (itmType, tmpItm[0]) ) 336 inTag = [] + inTag[:-2] 337 bracket -= (len(tmpItm)-1) 338 while( len(inTag) > 0 and bracket < inTag[-1] ): 339 if cascade == True: 340 if len(stack) > 1: 341 stack[-2].append(stack[-1]) 342 stack = stack[:-1] 343 inTag = [] + inTag[:-2] 344 yield stack
345 346 """ 347 Demonstrates the functionality available in the corpus reader. 348 """
349 -def demo():
350 from nltk_lite.corpora import ycoe 351 from itertools import islice 352 from pprint import pprint 353 354 print 'Raw Data:' 355 pprint(list(ycoe.raw('cocuraC'))[:4]) 356 357 print '\nTagged Data:' 358 pprint(list(ycoe.tagged('cocuraC'))[:4]) 359 360 print '\nBracket Parse:' 361 pprint(list(ycoe.bracket_parse('cocuraC'))[:4]) 362 363 print '\nChunk Parse:' 364 pprint(list(ycoe.chunked('cocuraC', chunk_types=('NP', 'PP')))[:4]) 365 366 print '\nChunk Parse (partials, cascaded):' 367 pprint(list(ycoe.chunked('cocuraC', chunk_types=('NP', 'PP'), \ 368 partial_match=True, collapse_partials=False, cascade=True))[:2]) 369 370 print '\nChunk Parse (partials, cascaded, collapsed):' 371 pprint(list(ycoe.chunked('cocuraC', chunk_types=('NP', 'PP'), \ 372 partial_match=True, collapse_partials=True, cascade=True))[:2])
373 374 if __name__ == '__main__': 375 demo() 376