Package Bio :: Package Motif :: Package Parsers :: Module MEME
[hide private]
[frames] | [no frames]

Source Code for Module Bio.Motif.Parsers.MEME

  1  # Copyright 2008 by Bartek Wilczynski 
  2  # Adapted from  Bio.MEME.Parser by Jason A. Hackney.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  from Bio.Alphabet import IUPAC 
  8  from Bio import Seq 
  9  import re 
 10  from math import sqrt 
 11  import sys 
 12  from Bio.Motif import Motif 
 13   
 14   
 15   
def read(handle):
    """Parse the text output of the MEME program into a MEMERecord object.

    Example:

    >>> f = open("meme.output.txt")
    >>> from Bio.Motif.Parsers import MEME
    >>> record = MEME.read(f)
    >>> for motif in record.motifs:
    ...     for instance in motif.instances:
    ...         print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue

    Arguments:
    handle - an iterator over the lines of the MEME text output (for
             example an open file). It is consumed up to and including
             the "SUMMARY OF MOTIFS" line.

    Returns the populated MEMERecord.

    Raises ValueError if the stream ends early or an expected line is
    missing.
    """
    record = MEMERecord()
    __read_version(record, handle)
    __read_datafile(record, handle)
    __read_alphabet(record, handle)
    __read_sequence_names(record, handle)
    __read_command(record, handle)
    # Skip ahead to the header line of the first motif.
    for line in handle:
        if line.startswith('MOTIF 1'):
            break
    else:
        raise ValueError('Unexpected end of stream')
    # Each pass handles one motif; `line` always holds its 'MOTIF' header.
    while True:
        motif = __create_motif(line)
        motif.alphabet = record.alphabet
        record.motifs.append(motif)
        __read_motif_name(motif, handle)
        # The site table has an extra strand column when MEME was run
        # with the 'revcomp' option, so the command line decides the layout.
        __read_motif_sequences(motif, handle, 'revcomp' in record.command)
        __skip_unused_lines(handle)
        try:
            # next() rather than handle.next(): works on any iterator.
            line = next(handle)
        except StopIteration:
            raise ValueError('Unexpected end of stream: Expected to find new motif, or the summary of motifs')
        if line.startswith("SUMMARY OF MOTIFS"):
            break
        if not line.startswith('MOTIF'):
            raise ValueError("Line does not start with 'MOTIF':\n%s" % line)
    return record
56 57
class MEMEMotif (Motif):
    """A subclass of Motif used in parsing MEME (and MAST) output.

    This subclass adds data specific to MEME motifs: the E-value of the
    motif (evalue) and, once set by the parser, its number of
    occurrences (num_occurrences).

    Methods:
    add_instance_from_values (name = 'default', pvalue = 1, sequence = 'ATA', start = 0, strand = '+'): create a new instance of the motif with the specified values.
    get_instance_by_name (name): return the first instance whose sequence_name equals name, or None if there is none.
    """
    # NOTE(review): earlier documentation also listed add_to_pssm,
    # add_to_logodds and compare_motifs. None of them is defined in this
    # class; whether Motif provides them is not visible here - confirm.

    def __init__ (self):
        Motif.__init__(self)
        self.evalue = 0.0

    def _numoccurrences (self, number):
        """Store the number of occurrences, converting it to int."""
        # int() returns an int unchanged, so no type check is needed.
        self.num_occurrences = int(number)

    def get_instance_by_name (self,name):
        """Return the first instance found in sequence `name`, else None."""
        for i in self.instances:
            if i.sequence_name == name:
                return i
        return None

    def add_instance_from_values (self, name = 'default', pvalue = 1, sequence = 'ATA', start = 0, strand = '+'):
        """Build a MEMEInstance from raw values and add it to this motif.

        The instance length is the motif length when that is set (truthy),
        otherwise the length of `sequence`. The instance's motif name is
        only filled in when this motif already has a name.
        """
        inst = MEMEInstance(sequence,self.alphabet)
        inst._pvalue(pvalue)
        inst._seqname(name)
        inst._start(start)
        inst._strand(strand)
        if self.length:
            inst._length(self.length)
        else:
            inst._length(len(sequence))
        if self.name:
            inst._motifname(self.name)
        self.add_instance(inst)

    def _evalue (self, evalue):
        """Store the E-value of the motif, converting it to float."""
        self.evalue = float(evalue)
107 108
class MEMEInstance(Seq.Seq):
    """One occurrence (site) of a MEME motif, with its associated data.

    Besides the sequence itself, an instance records the name of the
    sequence it was found in, its start position, p-value, strand,
    length and the name of the motif it belongs to.
    """
    def __init__ (self,*args,**kwds):
        # All arguments are forwarded untouched to the Seq constructor.
        Seq.Seq.__init__(self,*args,**kwds)
        self.sequence_name = ""
        self.start = 0
        self.pvalue = 1.0
        self.strand = 0
        self.length = 0
        self.motif_name = ""

    def _seqname (self, name):
        """Set the name of the sequence containing this instance."""
        self.sequence_name = name

    def _motifname (self, name):
        """Set the name of the motif this instance belongs to."""
        self.motif_name = name

    def _start (self,start):
        """Set the start position, converted to int."""
        self.start = int(start)

    def _pvalue (self,pval):
        """Set the p-value, converted to float."""
        self.pvalue = float(pval)

    def _score (self, score):
        """Set the score, converted to float."""
        self.score = float(score)

    def _strand (self, strand):
        """Set the strand, stored exactly as given."""
        self.strand = strand

    def _length (self, length):
        """Set the length, stored exactly as given."""
        self.length = length
145 146
class MEMERecord:
    """Container for the results of a MEME run.

    The record stores the MEME version, the data file name, the command
    line, the alphabet, the names of the input sequences and the list
    of motifs found.
    """
    def __init__ (self):
        """Create an empty record."""
        self.version = ""
        self.datafile = ""
        self.command = ""
        self.alphabet = None
        self.sequence_names = []
        self.motifs = []

    def get_motif_by_name (self, name):
        """Return the first motif whose name equals `name`, or None."""
        for motif in self.motifs:
            if motif.name != name:
                continue
            return motif
        return None
167 168 169 # Everything below is private 170 171
def __read_version(record, handle):
    """Advance to the 'MEME version' line and store the version.

    The version is the third whitespace-separated field of that line and
    is stored in record.version. Raises ValueError if no such line is
    found before the stream ends.
    """
    for text in handle:
        if text.startswith('MEME version'):
            record.version = text.strip().split()[2]
            return
    raise ValueError("Improper input file. File should contain a line starting MEME version.")
181 182
def __read_datafile(record, handle):
    """Read the 'TRAINING SET' section header and store the data file name.

    Skips ahead to the line starting with 'TRAINING SET', then expects a
    line starting with '****' followed by a 'DATAFILE' line. The text of
    that line, with 'DATAFILE= ' removed, is stored in record.datafile.

    Raises ValueError if the stream ends early or a line does not have
    the expected prefix.
    """
    for line in handle:
        if line.startswith('TRAINING SET'):
            break
    else:
        raise ValueError("Unexpected end of stream: 'TRAINING SET' not found.")
    try:
        # next() rather than handle.next(): works on any iterator.
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '****'")
    if not line.startswith('****'):
        raise ValueError("Line does not start with '****':\n%s" % line)
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'DATAFILE'")
    if not line.startswith('DATAFILE'):
        raise ValueError("Line does not start with 'DATAFILE':\n%s" % line)
    line = line.strip()
    line = line.replace('DATAFILE= ','')
    record.datafile = line
204 205
def __read_alphabet(record, handle):
    """Read the 'ALPHABET' line and set record.alphabet.

    The next line of the stream must start with 'ALPHABET'. If the rest
    of the line (after removing 'ALPHABET= ') is exactly 'ACGT' the
    alphabet is IUPAC.unambiguous_dna; anything else is treated as
    IUPAC.protein.

    Raises ValueError if the stream ends or the line has another prefix.
    """
    try:
        # next() rather than handle.next(): works on any iterator.
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'ALPHABET'")
    if not line.startswith('ALPHABET'):
        raise ValueError("Line does not start with 'ALPHABET':\n%s" % line)
    line = line.strip()
    line = line.replace('ALPHABET= ','')
    if line == 'ACGT':
        al = IUPAC.unambiguous_dna
    else:
        al = IUPAC.protein
    record.alphabet = al
220 221
def __read_sequence_names(record, handle):
    """Read the table of sequence names into record.sequence_names.

    Expects a 'Sequence name' header line and a '----' separator line,
    then reads rows until a line starting with '***'. The first field of
    every row is a sequence name; when a row has exactly six fields it
    holds two entries and the fourth field is a second name.

    Raises ValueError if the stream ends early or a header line does not
    have the expected prefix.
    """
    try:
        # next() rather than handle.next(): works on any iterator.
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'")
    if not line.startswith('Sequence name'):
        raise ValueError("Line does not start with 'Sequence name':\n%s" % line)
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '----'")
    if not line.startswith('----'):
        raise ValueError("Line does not start with '----':\n%s" % line)
    for line in handle:
        if line.startswith('***'):
            break
        line = line.strip()
        ls = line.split()
        record.sequence_names.append(ls[0])
        if len(ls) == 6:
            # Two (name, weight, length) triples share one row.
            record.sequence_names.append(ls[3])
    else:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
245 246
def __read_command(record, handle):
    """Advance to the 'command:' line and store the command line used.

    The stripped line, with 'command: ' removed, is stored in
    record.command. Raises ValueError if no such line is found before
    the stream ends.
    """
    for text in handle:
        if text.startswith('command:'):
            record.command = text.strip().replace('command: ','')
            return
    raise ValueError("Unexpected end of stream: Expected to find line starting with 'command'")
256 257
def __create_motif(line):
    """Build a MEMEMotif from a 'MOTIF' header line.

    The line is split on whitespace; field 4 is used as the motif length,
    field 7 as the number of occurrences and field 13 as the E-value.
    Returns the new motif.
    """
    fields = line.strip().split()
    new_motif = MEMEMotif()
    new_motif.length = int(fields[4])
    new_motif._numoccurrences(fields[7])
    new_motif._evalue(fields[13])
    return new_motif
266 267
def __read_motif_name(motif, handle):
    """Advance to the 'sorted by position p-value' line and name the motif.

    The name is the first two whitespace-separated fields of that line
    joined by a single space, stored in motif.name. Raises ValueError if
    no such line is found before the stream ends.
    """
    for text in handle:
        if 'sorted by position p-value' in text:
            words = text.strip().split()
            motif.name = " ".join(words[:2])
            return
    raise ValueError('Unexpected end of stream: Failed to find motif name')
278 279
def __read_motif_sequences(motif, handle, rv):
    """Read the table of motif sites and add each one to the motif.

    Expects, in order, a '---' line, a 'Sequence name' header line and
    another '---' line, then reads rows until the next line starting
    with '---'. Each row is split on whitespace and passed to
    motif.add_instance_from_values.

    Arguments:
    motif  - object receiving the instances.
    handle - iterator over the lines of the MEME output.
    rv     - true if the rows carry a strand column (second field);
             the remaining columns are then shifted right by one.

    Raises ValueError if the stream ends early or a header line does not
    have the expected prefix.
    """
    try:
        # next() rather than handle.next(): works on any iterator.
        line = next(handle)
    except StopIteration:
        raise ValueError('Unexpected end of stream: Failed to find motif sequences')
    if not line.startswith('---'):
        raise ValueError("Line does not start with '---':\n%s" % line)
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'")
    if not line.startswith('Sequence name'):
        raise ValueError("Line does not start with 'Sequence name':\n%s" % line)
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError('Unexpected end of stream: Failed to find motif sequences')
    if not line.startswith('---'):
        raise ValueError("Line does not start with '---':\n%s" % line)
    for line in handle:
        if line.startswith('---'):
            break
        line = line.strip()
        ls = line.split()
        if rv:
            # Columns: name, strand, start, p-value, flank, site, ...
            motif.add_instance_from_values(name = ls[0], sequence = ls[5], start = ls[2], pvalue = ls[3], strand = ls[1])
        else:
            # Columns: name, start, p-value, flank, site, ...
            motif.add_instance_from_values(name = ls[0], sequence = ls[4], start = ls[1], pvalue = ls[2])
    else:
        raise ValueError('Unexpected end of stream')
312 313
def __skip_unused_lines(handle):
    """Skip the parts of a motif section that this parser does not use.

    Consumes, in order: everything up to the 'log-odds matrix' line and
    the '---' line after it, everything up to the 'letter-probability
    matrix' line and the '---' line after it, everything up to the
    'Time' line, one blank line, one line starting with '***', any
    further blank lines and a final line starting with '***'.

    Raises ValueError if the stream ends early or one of the fixed lines
    does not have the expected form.
    """
    def skip_past(prefix):
        # Consume lines up to and including the first one starting with prefix.
        for text in handle:
            if text.startswith(prefix):
                return
        raise ValueError("Unexpected end of stream: Expected to find line starting with '%s'" % prefix)

    for prefix in ('log-odds matrix', '---', 'letter-probability matrix', '---', 'Time'):
        skip_past(prefix)
    try:
        # next() rather than handle.next(): works on any iterator.
        line = next(handle)
    except StopIteration:
        raise ValueError('Unexpected end of stream: Expected to find blank line')
    if line.strip():
        raise ValueError("Expected blank line, but got:\n%s" % line)
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
    if not line.startswith('***'):
        raise ValueError("Line does not start with '***':\n%s" % line)
    # Skip any blank lines; the first non-blank line must be the '***' rule.
    for line in handle:
        if line.strip():
            break
    else:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
    if not line.startswith('***'):
        raise ValueError("Line does not start with '***':\n%s" % line)
359 360 361 # Everything below is deprecated. 362 363 364 from Bio import File 365 from Bio.ParserSupport import * 366 import Bio 367 368
class MEMEParser (AbstractParser):
    """Parser for the text output of the MEME program (DEPRECATED).

    The output is parsed into an object of the MEMERecord class.

    Methods:
    parse (handle): parses the contents of the file handle passed to it.

    Example:

    >>>f = open("meme.output.txt")
    >>>parser = MEMEParser()
    >>>meme_record = parser.parse(f)
    >>>for motif in meme_record.motifs:
    ...    for instance in motif.instances:
    ...        print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue

    This class is DEPRECATED; please use the read() function in this module
    instead.
    """
    def __init__ (self):
        """Warn about the deprecation and create the scanner and consumer."""
        import warnings
        message = "Bio.Motif.Parsers.MEME.MEMEParser is deprecated; please use the read() function in this module instead"
        warnings.warn(message, Bio.BiopythonDeprecationWarning)
        # The scanner is created before the consumer, as each emits its
        # own deprecation warning on construction.
        self._scanner = _MEMEScanner()
        self._consumer = _MEMEConsumer()

    def parse (self, handle):
        """Feed `handle` through the scanner and return the consumer's data."""
        scanner, consumer = self._scanner, self._consumer
        scanner.feed(handle, consumer)
        return consumer.data
399 400 401
class _MEMEScanner:
    """Scanner for MEME output (DEPRECATED).

    Methods:
    feed

    This class is DEPRECATED; please use the read() function in this module
    instead.
    """

    def __init__(self):
        import warnings
        message = "Bio.Motif.Parsers.MEME._MEMEScanner is deprecated; please use the read() function in this module instead"
        warnings.warn(message, Bio.BiopythonDeprecationWarning)

    def feed (self, handle, consumer):
        """Scan MEME output and report the salient lines to `consumer`.

        `handle` is wrapped in a File.UndoHandle unless it already is
        one. The header is scanned first, then the motif sections.
        """
        uhandle = handle
        if not isinstance(uhandle, File.UndoHandle):
            uhandle = File.UndoHandle(handle)

        self._scan_header(uhandle, consumer)
        self._scan_motifs(uhandle, consumer)

    def _scan_header(self, uhandle, consumer):
        """Scan everything from the top of the file up to 'MOTIF 1'."""
        skip = consumer.noevent
        try:
            read_and_call_until(uhandle, skip, contains='MEME version')
        except ValueError:
            raise ValueError("Improper input file. File should contain a line starting MEME version.")
        read_and_call(uhandle, consumer._version, start='MEME version')
        read_and_call_until(uhandle, skip, start='TRAINING SET')
        read_and_call(uhandle, skip, start='TRAINING SET')
        read_and_call(uhandle, skip, start='****')
        read_and_call(uhandle, consumer._datafile, start='DATAFILE')
        read_and_call(uhandle, consumer._alphabet, start='ALPHABET')
        read_and_call(uhandle, skip, start='Sequence name')
        read_and_call(uhandle, skip, start='----')
        read_and_call_until(uhandle, consumer._sequence_name, start='***')
        read_and_call_until(uhandle, skip, start='command:')
        read_and_call(uhandle, consumer._commandline, start='command:')
        read_and_call_until(uhandle, skip, start='MOTIF 1')

    def _scan_motifs(self, uhandle, consumer):
        """Scan motif sections until 'SUMMARY OF MOTIFS' is the next line."""
        skip = consumer.noevent
        finished = False
        while not finished:
            read_and_call(uhandle, consumer._add_motif_with_info, start='MOTIF')
            read_and_call_until(uhandle, skip, contains='sorted by position p-value')
            read_and_call(uhandle, consumer.motif_name, contains='sorted by position p-value')
            read_and_call(uhandle, skip, start='---')
            read_and_call(uhandle, skip, start='Sequence name')
            read_and_call(uhandle, skip, start='---')
            read_and_call_until(uhandle, consumer.add_instance, start='---')
            read_and_call_until(uhandle, skip, start='log-odds matrix')
            read_and_call(uhandle, skip)
            read_and_call_until(uhandle, consumer.add_to_logodds, start='---')
            read_and_call_until(uhandle, skip, start='letter-probability matrix')
            read_and_call(uhandle, skip, start='letter-probability matrix')
            read_and_call_until(uhandle, consumer.add_to_pssm, start='---')
            read_and_call_until(uhandle, skip, start='Time')
            read_and_call(uhandle, skip, start='Time')
            read_and_call(uhandle, skip, blank=1)
            read_and_call(uhandle, skip, start='***')
            read_and_call_while(uhandle, skip, blank=1)
            read_and_call(uhandle, skip, start='***')
            # Peek without consuming: the summary marks the end of the motifs.
            finished = safe_peekline(uhandle).startswith("SUMMARY OF MOTIFS")
472 473 474
class _MEMEConsumer:
    """
    Consumer that can receive events from MEME Scanner (DEPRECATED).

    This is the Consumer object that should be passed to the
    MEME Scanner.

    This class is DEPRECATED; please use the read() function in this module
    instead.
    """

    def __init__ (self):
        import warnings
        message = "Bio.Motif.Parsers.MEME._MEMEConsumer is deprecated; please use the read() function in this module instead"
        warnings.warn(message, Bio.BiopythonDeprecationWarning)
        self.current_motif = None
        self.sequence_names = []
        self.data = MEMERecord()

    def _version (self, line):
        """Store the third field of the version line as the version."""
        self.data.version = line.strip().split()[2]

    def _datafile (self, line):
        """Store the data file name from a 'DATAFILE= ' line."""
        self.data.datafile = line.strip().replace('DATAFILE= ','')

    def _alphabet (self, line):
        """Set the alphabet: DNA for exactly 'ACGT', protein otherwise."""
        letters = line.strip().replace('ALPHABET= ','')
        if letters != 'ACGT':
            chosen = IUPAC.protein
        else:
            chosen = IUPAC.unambiguous_dna
        self.data.alphabet = chosen

    def _sequence_name (self, line):
        """Record the sequence name(s) found on one row of the name table."""
        fields = line.strip().split()
        names = self.data.sequence_names
        names.append(fields[0])
        # A six-field row carries two entries; the fourth field is a name.
        if len(fields) == 6:
            names.append(fields[3])

    def _commandline (self, line):
        """Store the command line from a 'command: ' line."""
        self.data.command = line.strip().replace('command: ','')

    def _add_motif_with_info (self, line):
        """Create a motif from a 'MOTIF' header line and make it current."""
        fields = line.strip().split()
        new_motif = MEMEMotif()
        # The motif length (field 4) is deliberately not set here.
        new_motif._numoccurrences(fields[7])
        new_motif._evalue(fields[13])
        new_motif.alphabet = self.data.alphabet
        self.data.motifs.append(new_motif)
        self.current_motif = new_motif

    def motif_name (self, line):
        """Name the current motif after the first two fields of `line`."""
        fields = line.strip().split()
        self.current_motif.name = " ".join(fields[:2])

    def add_instance (self, line):
        """Add one site row to the current motif.

        Rows carry an extra strand column when the stored command line
        contains 'revcomp'.
        """
        fields = line.strip().split()
        target = self.current_motif
        if 'revcomp' in self.data.command:
            target.add_instance_from_values(name = fields[0], sequence = fields[5], start = fields[2], pvalue = fields[3], strand = fields[1])
        else:
            target.add_instance_from_values(name = fields[0], sequence = fields[4], start = fields[1], pvalue = fields[2])

    def add_to_pssm (self, line):
        """Ignore a letter-probability matrix row."""
        pass

    def add_to_logodds (self, line):
        """Ignore a log-odds matrix row."""
        pass

    def noevent (self,line):
        """Ignore a line."""
        pass
559 560 561
class _MASTConsumer:
    """
    Consumer that can receive events from _MASTScanner (DEPRECATED).

    A _MASTConsumer parses lines from a mast text output file.
    The motif match diagrams are parsed using line buffering.
    Each of the buffering functions have a dummy variable that is
    required for testing using the Bio.ParserSupport.TaggingConsumer.
    If this variable isn't there, the TaggingConsumer barfs. In
    the _MASTScanner, None is passed in the place of this variable.

    This class is DEPRECATED; please use the read() function in the module
    Bio.Motif.Parsers.MAST instead.
    """
    # NOTE(review): the source this class was recovered from had lost its
    # indentation; the nesting below was rebuilt from the logic and should
    # be confirmed against the upstream file.

    def __init__ (self):
        """Warn about the deprecation and set up an empty record and buffer."""
        import warnings
        warnings.warn("Bio.Motif.Parsers.MEME._MASTConsumer is deprecated; please use the read() function in the module Bio.Motif.Parsers.MAST instead", Bio.BiopythonDeprecationWarning)
        self.data = MASTRecord()
        # Name of the sequence whose annotated match is being read.
        self._current_seq = ""
        # Lines collected for the current sequence, see _collapse_buffer.
        self._line_buffer = []
        # Number of distinct rows in the buffer; 0 until the first collapse.
        self._buffer_size = 0
        # Position reported at the start of the first buffered sequence row.
        self._buffered_seq_start = 0

    def _version (self, line):
        """Store the third whitespace-separated field as the version."""
        line = line.strip()
        ls = line.split()
        self.data._version(ls[2])

    def _database (self, line):
        """Store the database name (field 1) and choose the alphabet.

        The alphabet is DNA when field 2 is exactly '(nucleotide)' and
        protein otherwise.
        """
        line = line.strip()
        ls = line.split()
        self.data._database(ls[1])
        al = ""
        if ls[2] == '(nucleotide)':
            al = IUPAC.unambiguous_dna
            self.data._alphabet(al)
        else:
            al = IUPAC.protein
            self.data._alphabet(al)

    def _add_motif (self, line):
        """Create a motif from one row of the motif table and record it.

        Field 0 is used as the name, field 1 as the length and field 2 is
        added as an instance.
        """
        line = line.strip()
        ls = line.split()
        m = MEMEMotif()
        m.alphabet=self.data.alphabet
        # NOTE(review): length is stored as a string here, yet the diagram
        # parsers below do `start += motif.length` with an int start -
        # looks like a missing int(); confirm.
        m.length=ls[1]
        name = ls[0]
        m.name=name
        m.add_instance(ls[2])
        self.data._add_motif(m)

    def _add_match_diagram (self, line):
        """Parse a match diagram for the current sequence.

        Field 1 of the line is the diagram. It is stored on the record and
        split on '_'. Tokens containing '[' or '<' are treated as motif
        hits: the first run of digits names the motif and a '-' marks the
        minus strand. All other tokens are integer gap lengths that
        advance the running start position.
        """
        line = line.strip()
        ls = line.split()
        self.data._add_diagram_for_sequence(ls[1], self._current_seq)
        ds = ls[1].split('_')
        i = 0
        start = 0
        for i in range(0,len(ds)):
            if ds[i].find('[') != -1 or ds[i].find('<') != -1:
                # NOTE(review): MEMEInstance forwards its arguments to
                # Seq.Seq.__init__; calling it with none presumably fails
                # if Seq requires sequence data - confirm.
                inst = MEMEInstance()
                inst._seqname (self._current_seq)
                inst._start (start)
                r = re.compile('\d+')
                mn = r.findall(ds[i])[0]
                if ds[i].find('-') != -1:
                    inst.strand = '-'
                else:
                    inst.strand = '+'
                motif = self.data.get_motif_by_name(mn)
                motif.add_instance(inst)
                start += motif.length
            else:
                start += int(ds[i])

    def _add_sequence_match_with_diagram (self, line):
        """Parse a summary row holding a sequence name and its diagram.

        Field 0 is the sequence name and field 2 the diagram. Works like
        _add_match_diagram, except that motif hits are the tokens
        containing '+' or '-'.
        """
        line = line.strip()
        ls = line.split()
        self.data._add_sequence(ls[0])
        self.data._add_diagram_for_sequence(ls[2],ls[0])
        ds = ls[2].split('_')
        i = 0
        start = 0
        for i in range(0,len(ds)):
            if ds[i].find('+') != -1 or ds[i].find('-') != -1:
                inst = MEMEInstance()
                inst._seqname (ls[0])
                inst._start (start)
                r = re.compile('\d+')
                mn = r.findall(ds[i])[0]
                if ds[i].find('-') != -1:
                    inst.strand = '-'
                else:
                    inst.strand = '+'
                motif = self.data.get_motif_by_name(mn)
                motif.add_instance(inst)
                start += motif.length
            else:
                start += int(ds[i])

    def _add_diagram_from_buffer (self, dummy):
        """Parse a diagram that was wrapped over several buffered lines.

        The stripped buffer lines are concatenated and then handled
        exactly like the line given to _add_match_diagram. `dummy` is
        ignored.
        """
        line = ""
        for l in self._line_buffer:
            line += l.strip()
        ls = line.split()
        self.data._add_diagram_for_sequence(ls[1], self._current_seq)
        ds = ls[1].split('_')
        i = 0
        start = 0
        for i in range(0,len(ds)):
            if ds[i].find('[') != -1 or ds[i].find('<') != -1:
                inst = MEMEInstance()
                inst._seqname (self._current_seq)
                inst._start (start)
                r = re.compile('\d+')
                mn = r.findall(ds[i])[0]
                if ds[i].find('-') != -1:
                    inst.strand = '-'
                else:
                    inst.strand = '+'
                motif = self.data.get_motif_by_name(mn)
                motif.add_instance(inst)
                start += motif.length
            else:
                start += int(ds[i])

    def _set_current_seq (self, line):
        """Make the stripped line the current sequence name.

        The name is also appended to the record's sequences if it is not
        already there.
        """
        line = line.strip()
        self._current_seq = line
        if not self.data.sequences.count(line):
            self.data.sequences.append(line)

    def _add_line_to_buffer (self, line):
        """Append the stripped line to the buffer.

        Lines starting with '*****' are not buffered; -1 is returned for
        them instead.
        """
        line = line.strip()
        if not line.startswith('*****'):
            self._line_buffer.append(line)
        else:
            return -1

    def _parse_buffer (self, dummy):
        """Parses the line buffer to get e-values for each instance of a motif.
        This buffer parser is the most likely point of failure for the
        MASTParser.
        """
        insts = self.data.get_motif_matches_for_sequence(self._current_seq)
        if len(insts) > 0:

            # The last buffered row is the sequence text; row 1 holds the
            # p-values.
            fullSeq = self._line_buffer[self._buffer_size-1]
            pvals = self._line_buffer[1].split()
            p = 0
            lpval = len(pvals)
            while p < lpval:
                # More than one 'e' means several p-values ran together
                # without a separating space.
                if pvals[p].count('e') > 1:
                    #Break blocks up by e and parse into valid floats. This only
                    #works if there are no e-values greater than 1e-5.
                    pvs = []
                    spe = pvals[p].split('e')
                    # Work from the last exponent backwards.
                    spe.reverse()
                    dotind = spe[1].find('.')
                    if dotind == -1:
                        thispval = spe[1][-1] + 'e' + spe[0]
                    else:
                        thispval = spe[1][dotind-1:] + 'e' + spe[0]
                    pvs.append(thispval)
                    for spi in range(2,len(spe)):
                        dotind = spe[spi].find('.')
                        prevdotind = spe[spi-1].find('.')
                        if dotind != -1:
                            if prevdotind == -1:
                                thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][:-1]
                            else:
                                thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][0:prevdotind-1]
                        else:
                            if prevdotind == -1:
                                thispval = spe[spi][-1] + 'e' + spe[spi-1][:-1]
                            else:
                                thispval = spe[spi][-1] + 'e' + spe[spi-1][0:prevdotind-1]
                        pvs.append(thispval)
                    pvs.reverse()
                    # Replace the fused token by the separated values.
                    if p > 0:
                        pvals = pvals[0:p] + pvs + pvals[p+1:]
                    else:
                        pvals = pvs + pvals[p+1:]
                    lpval = len(pvals)
                p += 1
            i = 0
            if len(pvals) != len(insts):
                # Give up on the p-values for this sequence, but still set
                # the instance sequences below.
                sys.stderr.write("Failure to parse p-values for " + self._current_seq + ": " + self._line_buffer[1] + " to: " + str(pvals) + "\n")
                pvals = []
#            else:
#                sys.stderr.write('These are just fine' + self._current_seq + ': ' + self._line_buffer[1] + " to: " + str(pvals) + "\n")
            for i in range(0,len(insts)):
                inst = insts[i]
                start = inst.start - self._buffered_seq_start + 1
                thisSeq = fullSeq[start:start+inst.length]
                thisSeq = Seq.Seq(thisSeq, self.data.alphabet)
                # NOTE(review): MEMEInstance in this module defines no
                # _sequence method; unless Seq.Seq provides one this call
                # raises AttributeError - confirm.
                inst._sequence(thisSeq)
                if pvals:
                    inst._pvalue(float(pvals[i]))

    def _blank_buffer (self, dummy):
        """Empty the line buffer and reset its recorded size."""
        self._line_buffer = []
        self._buffer_size = 0

    def _collapse_buffer(self, dummy):
        """Merge newly buffered lines into the rows already held.

        On the first call (buffer size 0) the current number of lines
        becomes the buffer size and the last line is split into a start
        position and the sequence text. On later calls each new line
        except the last is appended to the corresponding earlier row and
        the last line is appended to the sequence row, padded with 'N'
        when its reported position leaves a gap. The buffer is then cut
        back to its original number of rows.
        """
        if self._buffer_size == 0:
            if len(self._line_buffer) > 0:
                self._buffer_size = len(self._line_buffer)
                ll = self._line_buffer[self._buffer_size-1].split()
                self._line_buffer[self._buffer_size-1] = ll[1]
                self._buffered_seq_start = int(ll[0])
        else:
            i = 0
            for i in range(self._buffer_size, len(self._line_buffer)-1):
                self._line_buffer[i-self._buffer_size] = self._line_buffer[i-self._buffer_size] + self._line_buffer[i].strip()
            ll = self._line_buffer[len(self._line_buffer)-1].split()
            if int(ll[0]) == self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1]):
                self._line_buffer[self._buffer_size-1] += ll[1]
            else:
                differ = int(ll[0]) - (self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1]))
                self._line_buffer[self._buffer_size-1] += "N"*differ
                self._line_buffer[self._buffer_size-1] += ll[1]
            self._line_buffer = self._line_buffer[0:self._buffer_size]

    def _add_motif_match (self, line):
        """Classify a line by the characters it contains; currently a no-op."""
        line = line.strip()
        if line.find('[') != -1 or line.find('<') != -1:
            pass
        elif line.find('e') != -1:
            pass
        elif line.find('+') != -1:
            pass

    def noevent (self, line):
        """Ignore a line."""
        pass
797 798
class MASTParser(AbstractParser):
    """
    Parser for MAST text output (DEPRECATED).
    HTML output cannot be parsed, yet. Returns a MASTRecord

    A MASTParser takes a file handle for a MAST text output file and
    returns a MASTRecord, containing the hits between motifs and
    sequences. The parser does some unusual line buffering to parse out
    match diagrams. Really complex diagrams often lead to an error message
    and p-values not being parsed for a given line.

    Methods:
    parse (handle): parses the data from the file handle passed to it.

    Example:

    >>>f = open("mast_file.txt")
    >>>parser = MASTParser()
    >>>mast_record = parser.parse(f)
    >>>for motif in mast_record.motifs:
    ...    for instance in motif.instances:
    ...        print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue

    This class is DEPRECATED; please use the read() function in the module
    Bio.Motif.Parsers.MAST instead.
    """
    def __init__ (self):
        import warnings
        message = "Bio.Motif.Parsers.MEME.MASTParser is deprecated; please use the read() function in the module Bio.Motif.Parsers.MAST instead"
        warnings.warn(message, Bio.BiopythonDeprecationWarning)
        # The consumer is created before the scanner, as each emits its
        # own deprecation warning on construction.
        self._consumer = _MASTConsumer()
        self._scanner = _MASTScanner()

    def parse (self, handle):
        """Feed `handle` through the scanner and return the consumer's data."""
        scanner, consumer = self._scanner, self._consumer
        scanner.feed(handle, consumer)
        return consumer.data
834 835 836
class _MASTScanner:
    """
    Scanner for MAST text output (DEPRECATED).

    This class is DEPRECATED; please use the read() function in the module
    Bio.Motif.Parsers.MAST instead.
    """
    # NOTE(review): the source this class was recovered from had lost its
    # indentation; the nesting in _scan_annotated_matches was rebuilt from
    # the logic and should be confirmed against the upstream file.

    def __init__(self):
        """Warn that this class is deprecated."""
        import warnings
        warnings.warn("Bio.Motif.Parsers.MEME._MASTScanner is deprecated; please use the read() function in the module Bio.Motif.Parsers.MAST instead", Bio.BiopythonDeprecationWarning)

    def feed (self, handle, consumer):
        """Scan MAST output and report the salient lines to `consumer`.

        `handle` is wrapped in a File.UndoHandle unless it already is
        one. The header, the match summary and the annotated matches are
        scanned in that order.
        """
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)

        self._scan_header(uhandle, consumer)
        self._scan_matches(uhandle, consumer)
        self._scan_annotated_matches(uhandle, consumer)

    def _scan_header (self, uhandle, consumer):
        """Scan version, database and motif table, stopping at 'SECTION II:'."""
        try:
            read_and_call_until(uhandle, consumer.noevent, contains = "MAST version")
        except ValueError:
            raise ValueError("Improper input file. Does not begin with a line with 'MAST version'")
        read_and_call(uhandle, consumer._version, contains = 'MAST version')
        read_and_call_until(uhandle, consumer.noevent, start = 'DATABASE AND MOTIFS')
        read_and_call(uhandle, consumer.noevent, start = 'DATABASE')
        read_and_call(uhandle, consumer.noevent, start = '****')
        read_and_call(uhandle, consumer._database, contains = 'DATABASE')
        read_and_call_until(uhandle, consumer.noevent, contains = 'MOTIF WIDTH')
        read_and_call(uhandle, consumer.noevent, contains = 'MOTIF')
        read_and_call(uhandle, consumer.noevent, contains = '----')
        # One motif per row until the first blank line.
        read_and_call_until(uhandle, consumer._add_motif, blank = 1)
        read_and_call_until(uhandle, consumer.noevent, start = 'SECTION II:')

    def _scan_matches (self, uhandle, consumer):
        """Skip over the 'SEQUENCE NAME' summary table without parsing it."""
        read_and_call_until(uhandle, consumer.noevent, start = 'SEQUENCE NAME')
        read_and_call(uhandle, consumer.noevent, start = 'SEQUENCE NAME')
        read_and_call(uhandle, consumer.noevent, start = '---')
#        read_and_call_until(uhandle, consumer._add_sequence_match_with_diagram, blank = 1)
        read_and_call_until(uhandle, consumer.noevent, blank = 1)
        read_and_call(uhandle, consumer.noevent, blank = 1)

    def _scan_annotated_matches (self, uhandle, consumer):
        """Scan the per-sequence annotated matches of 'SECTION III:'.

        For each sequence the name line is reported, the diagram lines are
        buffered and parsed, and the following blocks of annotation lines
        are buffered and collapsed until the sequence ends. Scanning stops
        when a line of asterisks is reached.
        """
        read_and_call_until(uhandle, consumer.noevent, start = 'SECTION III:')
        read_and_call(uhandle, consumer.noevent, start = 'SECTION III:')
        read_and_call_until(uhandle, consumer.noevent, start = '****')
        read_and_call(uhandle, consumer.noevent, start = '****')
        read_and_call_until(uhandle, consumer.noevent, start = '*****')
        read_and_call(uhandle, consumer.noevent)
        read_and_call_while(uhandle, consumer.noevent, blank = 1)
        # Flag: 1 while there are more sequences to read.
        readMatches = 1
        while readMatches == 1:
            # Finish off the previous sequence, if there was one.
            if consumer._current_seq:
                if consumer._buffer_size != 0:
                    consumer._parse_buffer(None)
                consumer._blank_buffer(None)
            read_and_call(uhandle, consumer._set_current_seq)
            read_and_call_until(uhandle, consumer.noevent, start = ' DIAGRAM')
            read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1)
            consumer._add_diagram_from_buffer(None)
            consumer._blank_buffer(None)
            read_and_call(uhandle, consumer.noevent, blank = 1)
            # Inner loop: one pass per block of annotation lines.
            while 1:
                line = safe_peekline(uhandle)
                if line.startswith('****'):
                    consumer._parse_buffer(None)
                    readMatches = 0
                    break
                read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1)
                read_and_call(uhandle, consumer.noevent, blank = 1)
                consumer._collapse_buffer(None)
                # A second blank line ends this sequence; a line of
                # asterisks ends the whole section.
                if attempt_read_and_call(uhandle, consumer.noevent, blank = 1):
                    break
                elif attempt_read_and_call(uhandle, consumer.noevent, start = '*****'):
                    consumer._parse_buffer(None)
                    consumer._blank_buffer(None)
                    readMatches = 0
                    break
918 919 920
class MASTRecord:
    """The class for holding the results from a MAST run (DEPRECATED).

    A MASTRecord holds data about matches between motifs and sequences.
    The motifs held by the MASTRecord are objects of the class MEMEMotif.

    Methods:
    get_motif_matches_for_sequence(sequence_name): returns all of the
        motif matches within a given sequence. The matches are objects of
        the class MEMEInstance
    get_motif_matches (motif_name): returns all of the matches for a motif
        in the sequences searched. The matches returned are of class
        MEMEInstance
    get_motif_by_name (motif_name): returns a MEMEMotif with the given
        name.

    This class is DEPRECATED; please use the read() function in the module
    Bio.Motif.Parsers.MAST instead.
    """
    def __init__ (self):
        """Warn about the deprecation and create an empty record."""
        import warnings
        warnings.warn("Bio.Motif.Parsers.MEME.MASTRecord is deprecated; please use the read() function in the module Bio.Motif.Parsers.MAST instead", Bio.BiopythonDeprecationWarning)
        self.sequences = []
        self.version = ""
        self.matches = []
        self.database = ""
        self.diagrams = {}
        self.alphabet = None
        self.motifs = []

    def _version (self, version):
        """Store the MAST version."""
        self.version = version

    def _alphabet (self, alphabet):
        """Store the alphabet if it is one of the three accepted ones.

        Accepted are IUPAC.protein, IUPAC.ambiguous_dna and
        IUPAC.unambiguous_dna. Any other value is not stored and -1 is
        returned instead.
        """
        if alphabet == IUPAC.protein or alphabet == IUPAC.ambiguous_dna or alphabet == IUPAC.unambiguous_dna:
            self.alphabet = alphabet
        else:
            return -1

    def _database(self, database):
        """Store the name of the database searched."""
        self.database = database

    def get_motif_matches_for_sequence (self, seq):
        """Return all motif instances found in sequence `seq`.

        Instances of all motifs are collected and returned as a new list
        sorted by start position. The list is empty if there are none.
        """
        insts = []
        for m in self.motifs:
            for i in m.instances:
                if i.sequence_name == seq:
                    insts.append(i)
        # key= replaces the cmp-function form, which newer Pythons reject.
        insts.sort(key=lambda inst: inst.start)
        return insts

    def get_motif_matches (self, motif):
        """Return the instances of the stored motif named like `motif`."""
        m = self.get_motif_by_name (motif.name)
        return m.instances

    def _add_diagram_for_sequence (self, diagram, seq):
        """Store the match diagram for sequence `seq`, replacing any old one."""
        self.diagrams[seq] = diagram

    def _add_match (self, match):
        """Append a match to the list of matches."""
        self.matches.append(match)

    def _add_sequence (self, sequence):
        """Append a sequence name to the list of sequences."""
        # NOTE(review): the body of this method was missing from the
        # recovered source. Appending to self.sequences is the presumed
        # behaviour, by analogy with _add_match and _add_motif - confirm
        # against the upstream file.
        self.sequences.append(sequence)

    def _add_motif (self, motif):
        """Append a motif to the list of motifs."""
        self.motifs.append(motif)

    def get_motif_by_name (self, name):
        """Return the first motif whose name equals `name`, or None."""
        for m in self.motifs:
            if m.name == name:
                return m
        return None
992