1
2
3
4
5
6
7 """This module provides code to work with the BLAST XML output
8 following the DTD available on the NCBI FTP
9 ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd
10
11 Classes:
12 BlastParser Parses XML output from BLAST (direct use discouraged).
13 This (now) returns a list of Blast records.
14 Historically it returned a single Blast record.
15 You are expected to use this via the parse or read functions.
16
17 _XMLParser Generic SAX parser (private).
18
19 Functions:
20 parse Incremental parser, this is an iterator that returns
21 Blast records. It uses the BlastParser internally.
22 read Returns a single Blast record. Uses the BlastParser internally.
23 """
24 from Bio.Blast import Record
25 import xml.sax
26 from xml.sax.handler import ContentHandler
27
29 """Generic SAX Parser
30
31 Just a very basic SAX parser.
32
33 Redefine the methods startElement, characters and endElement.
34 """
36 """Constructor
37
38 debug - integer, amount of debug information to print
39 """
40 self._tag = []
41 self._value = ''
42 self._debug = debug
43 self._debug_ignore_list = []
44
46 """Removes 'dangerous' from tag names
47
48 name -- name to be 'secured'
49 """
50
51 return name.replace('-', '_')
52
54 """Found XML start tag
55
56 No real need of attr, BLAST DTD doesn't use them
57
58 name -- name of the tag
59
60 attr -- tag attributes
61 """
62 self._tag.append(name)
63
64
65 method = self._secure_name('_start_' + name)
66
67
68
69 if hasattr(self, method):
70 eval("self.%s()" % method)
71 if self._debug > 4:
72 print "NCBIXML: Parsed: " + method
73 else:
74
75 if method not in self._debug_ignore_list:
76 if self._debug > 3:
77 print "NCBIXML: Ignored: " + method
78 self._debug_ignore_list.append(method)
79
80
81
82 if self._value.strip():
83 raise ValueError("What should we do with %s before the %s tag?" \
84 % (repr(self._value), name))
85 self._value = ""
86
88 """Found some text
89
90 ch -- characters read
91 """
92 self._value += ch
93
95 """Found XML end tag
96
97 name -- tag name
98 """
99
100
101
102 method = self._secure_name('_end_' + name)
103
104
105 if hasattr(self, method):
106 eval("self.%s()" % method)
107 if self._debug > 2:
108 print "NCBIXML: Parsed: " + method, self._value
109 else:
110
111 if method not in self._debug_ignore_list:
112 if self._debug > 1:
113 print "NCBIXML: Ignored: " + method, self._value
114 self._debug_ignore_list.append(method)
115
116
117 self._value = ''
118
120 """Parse XML BLAST data into a Record.Blast object
121
122 All XML 'action' methods are private methods and may be:
123 _start_TAG called when the start tag is found
124 _end_TAG called when the end tag is found
125 """
126
128 """Constructor
129
130 debug - integer, amount of debug information to print
131 """
132
133 _XMLparser.__init__(self, debug)
134
135 self._parser = xml.sax.make_parser()
136 self._parser.setContentHandler(self)
137
138
139 self._parser.setFeature(xml.sax.handler.feature_validation, 0)
140 self._parser.setFeature(xml.sax.handler.feature_namespaces, 0)
141 self._parser.setFeature(xml.sax.handler.feature_external_pes, 0)
142 self._parser.setFeature(xml.sax.handler.feature_external_ges, 0)
143
144 self.reset()
145
147 """Reset all the data allowing reuse of the BlastParser() object"""
148 self._records = []
149 self._header = Record.Header()
150 self._parameters = Record.Parameters()
151 self._parameters.filter = None
152
156
158
159
160 self._blast.reference = self._header.reference
161 self._blast.date = self._header.date
162 self._blast.version = self._header.version
163 self._blast.database = self._header.database
164 self._blast.application = self._header.application
165
166
167
168
169
170
171 if not hasattr(self._blast, "query") \
172 or not self._blast.query:
173 self._blast.query = self._header.query
174 if not hasattr(self._blast, "query_id") \
175 or not self._blast.query_id:
176 self._blast.query_id = self._header.query_id
177 if not hasattr(self._blast, "query_letters") \
178 or not self._blast.query_letters:
179 self._blast.query_letters = self._header.query_letters
180
181
182
183
184 self._blast.query_length = self._blast.query_letters
185
186
187
188
189
190
191 self._blast.database_length = self._blast.num_letters_in_database
192
193
194
195 self._blast.database_sequences = self._blast.num_sequences_in_database
196
197
198 self._blast.matrix = self._parameters.matrix
199 self._blast.num_seqs_better_e = self._parameters.num_seqs_better_e
200 self._blast.gap_penalties = self._parameters.gap_penalties
201 self._blast.filter = self._parameters.filter
202 self._blast.expect = self._parameters.expect
203 self._blast.sc_match = self._parameters.sc_match
204 self._blast.sc_mismatch = self._parameters.sc_mismatch
205
206
207 self._records.append(self._blast)
208
209 self._blast = None
210
211 if self._debug : "NCBIXML: Added Blast record to results"
212
213
215 """BLAST program, e.g., blastp, blastn, etc.
216
217 Save this to put on each blast record object
218 """
219 self._header.application = self._value.upper()
220
222 """version number and date of the BLAST engine.
223
224 e.g. "BLASTX 2.2.12 [Aug-07-2005]" but there can also be
225 variants like "BLASTP 2.2.18+" without the date.
226
227 Save this to put on each blast record object
228 """
229 parts = self._value.split()
230
231
232
233 self._header.version = parts[1]
234
235
236 if len(parts) >= 3:
237 if parts[2][0] == "[" and parts[2][-1] == "]":
238 self._header.date = parts[2][1:-1]
239 else:
240
241
242 self._header.date = parts[2]
243
245 """a reference to the article describing the algorithm
246
247 Save this to put on each blast record object
248 """
249 self._header.reference = self._value
250
252 """the database(s) searched
253
254 Save this to put on each blast record object
255 """
256 self._header.database = self._value
257
259 """the identifier of the query
260
261 Important in old pre 2.2.14 BLAST, for recent versions
262 <Iteration_query-ID> is enough
263 """
264 self._header.query_id = self._value
265
267 """the definition line of the query
268
269 Important in old pre 2.2.14 BLAST, for recent versions
270 <Iteration_query-def> is enough
271 """
272 self._header.query = self._value
273
275 """the length of the query
276
277 Important in old pre 2.2.14 BLAST, for recent versions
278 <Iteration_query-len> is enough
279 """
280 self._header.query_letters = int(self._value)
281
283 """the identifier of the query
284 """
285 self._blast.query_id = self._value
286
288 """the definition line of the query
289 """
290 self._blast.query = self._value
291
293 """the length of the query
294 """
295 self._blast.query_letters = int(self._value)
296
297
298
299
300
301
302
303
304
305
306
308 """hits to the database sequences, one for every sequence
309 """
310 self._blast.num_hits = int(self._value)
311
312
313
314
315
316
317
319 """matrix used (-M)
320 """
321 self._parameters.matrix = self._value
322
324 """expect values cutoff (-e)
325 """
326
327
328
329
330
331
332
333 self._parameters.expect = self._value
334
335
336
337
338
339
341 """match score for nucleotide-nucleotide comparaison (-r)
342 """
343 self._parameters.sc_match = int(self._value)
344
346 """mismatch penalty for nucleotide-nucleotide comparaison (-r)
347 """
348 self._parameters.sc_mismatch = int(self._value)
349
351 """gap existence cost (-G)
352 """
353 self._parameters.gap_penalties = int(self._value)
354
356 """gap extension cose (-E)
357 """
358 self._parameters.gap_penalties = (self._parameters.gap_penalties,
359 int(self._value))
360
362 """filtering options (-F)
363 """
364 self._parameters.filter = self._value
365
366
367
368
369
370
371
372
373
374
375
376
378 self._blast.alignments.append(Record.Alignment())
379 self._blast.descriptions.append(Record.Description())
380 self._blast.multiple_alignment = []
381 self._hit = self._blast.alignments[-1]
382 self._descr = self._blast.descriptions[-1]
383 self._descr.num_alignments = 0
384
386
387 self._blast.multiple_alignment = None
388 self._hit = None
389 self._descr = None
390
392 """identifier of the database sequence
393 """
394 self._hit.hit_id = self._value
395 self._hit.title = self._value + ' '
396
398 """definition line of the database sequence
399 """
400 self._hit.hit_def = self._value
401 self._hit.title += self._value
402 self._descr.title = self._hit.title
403
405 """accession of the database sequence
406 """
407 self._hit.accession = self._value
408 self._descr.accession = self._value
409
411 self._hit.length = int(self._value)
412
413
415
416
417 self._hit.hsps.append(Record.HSP())
418 self._hsp = self._hit.hsps[-1]
419 self._descr.num_alignments += 1
420 self._blast.multiple_alignment.append(Record.MultipleAlignment())
421 self._mult_al = self._blast.multiple_alignment[-1]
422
423
425 """raw score of HSP
426 """
427 self._hsp.score = float(self._value)
428 if self._descr.score == None:
429 self._descr.score = float(self._value)
430
432 """bit score of HSP
433 """
434 self._hsp.bits = float(self._value)
435 if self._descr.bits == None:
436 self._descr.bits = float(self._value)
437
439 """expect value value of the HSP
440 """
441 self._hsp.expect = float(self._value)
442 if self._descr.e == None:
443 self._descr.e = float(self._value)
444
446 """offset of query at the start of the alignment (one-offset)
447 """
448 self._hsp.query_start = int(self._value)
449
451 """offset of query at the end of the alignment (one-offset)
452 """
453 self._hsp.query_end = int(self._value)
454
456 """offset of the database at the start of the alignment (one-offset)
457 """
458 self._hsp.sbjct_start = int(self._value)
459
461 """offset of the database at the end of the alignment (one-offset)
462 """
463 self._hsp.sbjct_end = int(self._value)
464
465
466
467
468
469
470
471
472
473
474
476 """frame of the query if applicable
477 """
478 self._hsp.frame = (int(self._value),)
479
481 """frame of the database sequence if applicable
482 """
483 self._hsp.frame += (int(self._value),)
484
486 """number of identities in the alignment
487 """
488 self._hsp.identities = int(self._value)
489
491 """number of positive (conservative) substitutions in the alignment
492 """
493 self._hsp.positives = int(self._value)
494
496 """number of gaps in the alignment
497 """
498 self._hsp.gaps = int(self._value)
499
501 """length of the alignment
502 """
503 self._hsp.align_length = int(self._value)
504
505
506
507
508
509
511 """alignment string for the query
512 """
513 self._hsp.query = self._value
514
516 """alignment string for the database
517 """
518 self._hsp.sbjct = self._value
519
521 """Formatting middle line as normally seen in BLAST report
522 """
523 self._hsp.match = self._value
524 assert len(self._hsp.match)==len(self._hsp.query)
525 assert len(self._hsp.match)==len(self._hsp.sbjct)
526
527
532
537
542
547
549 """Karlin-Altschul parameter K
550 """
551 self._blast.ka_params = float(self._value)
552
554 """Karlin-Altschul parameter Lambda
555 """
556 self._blast.ka_params = (float(self._value),
557 self._blast.ka_params)
558
560 """Karlin-Altschul parameter H
561 """
562 self._blast.ka_params = self._blast.ka_params + (float(self._value),)
563
564 -def read(handle, debug=0):
565 """Returns a single Blast record (assumes just one query).
566
567 This function is for use when there is one and only one BLAST
568 result in your XML file.
569
570 Use the Bio.Blast.NCBIXML.parse() function if you expect more than
571 one BLAST record (i.e. if you have more than one query sequence).
572
573 """
574 iterator = parse(handle, debug)
575 try:
576 first = iterator.next()
577 except StopIteration:
578 first = None
579 if first is None:
580 raise ValueError("No records found in handle")
581 try:
582 second = iterator.next()
583 except StopIteration:
584 second = None
585 if second is not None:
586 raise ValueError("More than one record found in handle")
587 return first
588
589
590 -def parse(handle, debug=0):
591 """Returns an iterator a Blast record for each query.
592
593 handle - file handle to and XML file to parse
594 debug - integer, amount of debug information to print
595
596 This is a generator function that returns multiple Blast records
597 objects - one for each query sequence given to blast. The file
598 is read incrementally, returning complete records as they are read
599 in.
600
601 Should cope with new BLAST 2.2.14+ which gives a single XML file
602 for mutliple query records.
603
604 Should also cope with XML output from older versions BLAST which
605 gave multiple XML files concatenated together (giving a single file
606 which strictly speaking wasn't valid XML)."""
607 from xml.parsers import expat
608 BLOCK = 1024
609 MARGIN = 10
610 XML_START = "<?xml"
611
612 text = handle.read(BLOCK)
613 pending = ""
614
615 if not text:
616
617 raise ValueError("Your XML file was empty")
618
619 while text:
620
621 if not text.startswith(XML_START):
622 raise ValueError("Your XML file did not start with %s... "
623 "but instead %s" \
624 % (XML_START, repr(text[:20])))
625
626 expat_parser = expat.ParserCreate()
627 blast_parser = BlastParser(debug)
628 expat_parser.StartElementHandler = blast_parser.startElement
629 expat_parser.EndElementHandler = blast_parser.endElement
630 expat_parser.CharacterDataHandler = blast_parser.characters
631
632 expat_parser.Parse(text, False)
633 while blast_parser._records:
634 record = blast_parser._records[0]
635 blast_parser._records = blast_parser._records[1:]
636 yield record
637
638 while True:
639
640 text, pending = pending + handle.read(BLOCK), ""
641 if not text:
642
643 expat_parser.Parse("", True)
644 break
645
646
647
648 pending = handle.read(MARGIN)
649
650 if (text+pending).find("\n" + XML_START) == -1:
651
652 expat_parser.Parse(text, False)
653 while blast_parser._records:
654 yield blast_parser._records.pop(0)
655 else:
656
657
658
659
660 text, pending = (text+pending).split("\n" + XML_START,1)
661 pending = XML_START + pending
662
663 expat_parser.Parse(text, True)
664 while blast_parser._records:
665 yield blast_parser._records.pop(0)
666
667
668
669 text, pending = pending, ""
670 break
671
672
673
674 while blast_parser._records:
675 yield blast_parser._records.pop(0)
676
677
678
679
680 assert pending==""
681 assert len(blast_parser._records) == 0
682
683
684 assert text==""
685 assert pending==""
686 assert len(blast_parser._records) == 0
687
688 if __name__ == '__main__':
689 import sys
690 import os
691 handle = open(sys.argv[1])
692 r_list = parse(handle)
693
694 for r in r_list:
695
696 print 'Blast of', r.query
697 print 'Found %s alignments with a total of %s HSPs' % (len(r.alignments),
698 reduce(lambda a,b: a+b,
699 [len(a.hsps) for a in r.alignments]))
700
701 for al in r.alignments:
702 print al.title[:50], al.length, 'bp', len(al.hsps), 'HSPs'
703
704
705 E_VALUE_THRESH = 0.04
706 for alignment in r.alignments:
707 for hsp in alignment.hsps:
708 if hsp.expect < E_VALUE_THRESH:
709 print '*****'
710 print 'sequence', alignment.title
711 print 'length', alignment.length
712 print 'e value', hsp.expect
713 print hsp.query[:75] + '...'
714 print hsp.match[:75] + '...'
715 print hsp.sbjct[:75] + '...'
716