7 """Code to work with GenBank formatted files.
8
9 Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with
10 the "genbank" or "embl" format names to parse GenBank or EMBL files into
11 SeqRecord and SeqFeature objects (see the Biopython tutorial for details).
12
13 Also, rather than using Bio.GenBank to search or download files from the NCBI,
14 you are now encouraged to use Bio.Entrez instead (again, see the Biopython
15 tutorial for details).
16
17 Currently the ONLY reason to use Bio.GenBank directly is for the RecordParser
18 which turns a GenBank file into GenBank-specific Record objects. This is a
19 much closer representation to the raw file contents that the SeqRecord
20 alternative from the FeatureParser (used in Bio.SeqIO).
21
22 Classes:
23 Iterator Iterate through a file of GenBank entries
24 ErrorFeatureParser Catch errors caused during parsing.
25 FeatureParser Parse GenBank data in SeqRecord and SeqFeature objects.
26 RecordParser Parse GenBank data into a Record object.
27
28 Exceptions:
29 ParserFailureError Exception indicating a failure in the parser (ie.
30 scanner or consumer)
31 LocationParserError Exception indiciating a problem with the spark based
32 location parser.
33
34
35 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records.
36 These are GenBank files that summarize the content of a project, and provide lists of
37 scaffold and contig files in the project. These will be in annotations['wgs'] and
38 annotations['wgs_scafld']. These GenBank files do not have sequences. See
39 http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36
40
41 http://is.gd/nNgk
42 for more details of this format, and an example.
43 Added by Ying Huang & Iddo Friedberg
44 """
import cStringIO
import re


from Bio import SeqFeature
from Bio.ParserSupport import AbstractConsumer
from Bio import Entrez


from utils import FeatureValueCleaner
from Scanner import GenBankScanner


GENBANK_INDENT = 12
GENBANK_SPACER = " " * GENBANK_INDENT


FEATURE_KEY_INDENT = 5
FEATURE_QUALIFIER_INDENT = 21
FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
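# For illustration (hypothetical record lines), these widths describe the
# GenBank flat file layout: header keywords such as DEFINITION occupy a
# 12 character column before their data, while in the feature table the
# feature key is indented by 5 spaces and its qualifiers by 21 spaces:
#
#     DEFINITION  Example sequence.
#     FEATURES             Location/Qualifiers
#          CDS             join(104..160,320..390)
#                          /gene="example"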


_solo_location = r"[<>]?\d+"
_pair_location = r"[<>]?\d+\.\.[<>]?\d+"
_between_location = r"\d+\^\d+"

_within_position = r"\(\d+\.\d+\)"
_re_within_position = re.compile(_within_position)
_within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \
                   % (_within_position, _within_position)
assert _re_within_position.match("(3.9)")
assert re.compile(_within_location).match("(3.9)..10")
assert re.compile(_within_location).match("26..(30.33)")
assert re.compile(_within_location).match("(13.19)..(20.28)")

_oneof_position = r"one\-of\(\d+(,\d+)+\)"
_re_oneof_position = re.compile(_oneof_position)
_oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \
                  % (_oneof_position, _oneof_position)
assert _re_oneof_position.match("one-of(6,9)")
assert re.compile(_oneof_location).match("one-of(6,9)..101")
assert re.compile(_oneof_location).match("one-of(6,9)..one-of(101,104)")
assert re.compile(_oneof_location).match("6..one-of(101,104)")

assert not _re_oneof_position.match("one-of(3)")
assert _re_oneof_position.match("one-of(3,6)")
assert _re_oneof_position.match("one-of(3,6,9)")


_simple_location = r"\d+\.\.\d+"
_re_simple_location = re.compile(_simple_location)
_re_simple_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" \
                                 % (_simple_location, _simple_location))
_complex_location = r"([a-zA-Z][a-zA-Z0-9]*(\.[a-zA-Z0-9]+)?\:)?(%s|%s|%s|%s|%s)" \
                    % (_pair_location, _solo_location, _between_location,
                       _within_location, _oneof_location)
_re_complex_location = re.compile(r"^%s$" % _complex_location)
_possibly_complemented_complex_location = r"(%s|complement\(%s\))" \
                                          % (_complex_location, _complex_location)
_re_complex_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" \
                                  % (_possibly_complemented_complex_location,
                                     _possibly_complemented_complex_location))

assert _re_simple_location.match("104..160")
assert not _re_simple_location.match("<104..>160")
assert not _re_simple_location.match("104")
assert not _re_simple_location.match("<1")
assert not _re_simple_location.match(">99999")
assert not _re_simple_location.match("join(104..160,320..390,504..579)")
assert not _re_simple_compound.match("bond(12,63)")
assert _re_simple_compound.match("join(104..160,320..390,504..579)")
assert _re_simple_compound.match("order(1..69,1308..1465)")
assert not _re_simple_compound.match("order(1..69,1308..1465,1524)")
assert not _re_simple_compound.match("join(<1..442,992..1228,1524..>1983)")
assert not _re_simple_compound.match("join(<1..181,254..336,422..497,574..>590)")
assert not _re_simple_compound.match("join(1475..1577,2841..2986,3074..3193,3314..3481,4126..>4215)")
assert not _re_simple_compound.match("test(1..69,1308..1465)")
assert not _re_simple_compound.match("complement(1..69)")
assert not _re_simple_compound.match("(1..69)")
assert _re_complex_location.match("(3.9)..10")
assert _re_complex_location.match("26..(30.33)")
assert _re_complex_location.match("(13.19)..(20.28)")
assert _re_complex_location.match("41^42")
assert _re_complex_location.match("AL121804:41^42")
assert _re_complex_location.match("AL121804:41..610")
assert _re_complex_location.match("AL121804.2:41..610")
assert _re_complex_location.match("one-of(3,6)..101")
assert _re_complex_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)")
assert not _re_simple_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)")
assert _re_complex_compound.match("join(complement(69611..69724),139856..140650)")

def _pos(pos_str, offset=0):
    """Build a Position object (PRIVATE).

    For an end position, leave offset as zero (default):

    >>> _pos("5")
    ExactPosition(5)

    For a start position, set offset to minus one (for Python counting):

    >>> _pos("5", -1)
    ExactPosition(4)

    This also covers fuzzy positions:

    >>> _pos("<5")
    BeforePosition(5)
    >>> _pos(">5")
    AfterPosition(5)
    >>> _pos("one-of(5,8,11)")
    OneOfPosition([ExactPosition(5), ExactPosition(8), ExactPosition(11)])
    >>> _pos("(8.10)")
    WithinPosition(8,2)
    """
    if pos_str.startswith("<"):
        return SeqFeature.BeforePosition(int(pos_str[1:])+offset)
    elif pos_str.startswith(">"):
        return SeqFeature.AfterPosition(int(pos_str[1:])+offset)
    elif _re_within_position.match(pos_str):
        s, e = pos_str[1:-1].split(".")
        return SeqFeature.WithinPosition(int(s)+offset, int(e)-int(s))
    elif _re_oneof_position.match(pos_str):
        assert pos_str.startswith("one-of(")
        assert pos_str[-1]==")"
        parts = [SeqFeature.ExactPosition(int(pos)+offset) \
                 for pos in pos_str[7:-1].split(",")]
        return SeqFeature.OneOfPosition(parts)
    else:
        return SeqFeature.ExactPosition(int(pos_str)+offset)

def _loc(loc_str, expected_seq_length):
    """FeatureLocation from non-compound non-complement location (PRIVATE).

    Simple examples,

    >>> _loc("123..456", 1000)
    FeatureLocation(ExactPosition(122),ExactPosition(456))
    >>> _loc("<123..>456", 1000)
    FeatureLocation(BeforePosition(122),AfterPosition(456))

    A more complex location using within positions,

    >>> _loc("(9.10)..(20.25)", 1000)
    FeatureLocation(WithinPosition(8,1),WithinPosition(20,5))

    Zero length between feature,

    >>> _loc("123^124", 1000)
    FeatureLocation(ExactPosition(123),ExactPosition(123))

    The expected sequence length is needed for a special case, a between
    position at the start/end of a circular genome:

    >>> _loc("1000^1", 1000)
    FeatureLocation(ExactPosition(1000),ExactPosition(1000))

    Apart from this special case, between positions P^Q must have P+1==Q,

    >>> _loc("123^456", 1000)
    Traceback (most recent call last):
       ...
    ValueError: Invalid between location '123^456'
    """
    try:
        s, e = loc_str.split("..")
    except ValueError:
        assert ".." not in loc_str
        if "^" in loc_str:
            # A between location like "67^68" marks the boundary between two
            # adjacent bases (it has zero length). As a special case, on a
            # circular genome of length N the location "N^1" marks the origin.
            s, e = loc_str.split("^")
            if int(s)+1==int(e):
                pos = _pos(s)
            elif int(s)==expected_seq_length and e=="1":
                pos = _pos(s)
            else:
                raise ValueError("Invalid between location %s" % repr(loc_str))
            return SeqFeature.FeatureLocation(pos, pos)
        else:
            # e.g. a single position like "123"
            s = loc_str
            e = loc_str
    return SeqFeature.FeatureLocation(_pos(s,-1), _pos(e))

237 """Split a tricky compound location string (PRIVATE).
238
239 >>> list(_split_compound_loc("123..145"))
240 ['123..145']
241 >>> list(_split_compound_loc("123..145,200..209"))
242 ['123..145', '200..209']
243 >>> list(_split_compound_loc("one-of(200,203)..300"))
244 ['one-of(200,203)..300']
245 >>> list(_split_compound_loc("complement(123..145),200..209"))
246 ['complement(123..145)', '200..209']
247 >>> list(_split_compound_loc("123..145,one-of(200,203)..209"))
248 ['123..145', 'one-of(200,203)..209']
249 >>> list(_split_compound_loc("123..145,one-of(200,203)..one-of(209,211),300"))
250 ['123..145', 'one-of(200,203)..one-of(209,211)', '300']
251 >>> list(_split_compound_loc("123..145,complement(one-of(200,203)..one-of(209,211)),300"))
252 ['123..145', 'complement(one-of(200,203)..one-of(209,211))', '300']
253 >>> list(_split_compound_loc("123..145,200..one-of(209,211),300"))
254 ['123..145', '200..one-of(209,211)', '300']
255 >>> list(_split_compound_loc("123..145,200..one-of(209,211)"))
256 ['123..145', '200..one-of(209,211)']
257 """
258 if "one-of(" in compound_loc:
259
260 while "," in compound_loc:
261 assert compound_loc[0] != ","
262 assert compound_loc[0:2] != ".."
263 i = compound_loc.find(",")
264 part = compound_loc[:i]
265 compound_loc = compound_loc[i:]
266 while part.count("(") > part.count(")"):
267 assert "one-of(" in part, (part, compound_loc)
268 i = compound_loc.find(")")
269 part += compound_loc[:i+1]
270 compound_loc = compound_loc[i+1:]
271 if compound_loc.startswith(".."):
272 i = compound_loc.find(",")
273 if i==-1:
274 part += compound_loc
275 compound_loc = ""
276 else:
277 part += compound_loc[:i]
278 compound_loc = compound_loc[i:]
279 while part.count("(") > part.count(")"):
280 assert part.count("one-of(") == 2
281 i = compound_loc.find(")")
282 part += compound_loc[:i+1]
283 compound_loc = compound_loc[i+1:]
284 if compound_loc.startswith(","):
285 compound_loc = compound_loc[1:]
286 assert part
287 yield part
288 if compound_loc:
289 yield compound_loc
290 else:
291
292 for part in compound_loc.split(","):
293 yield part
294
296 """Iterator interface to move over a file of GenBank entries one at a time.
297 """
298 - def __init__(self, handle, parser = None):
299 """Initialize the iterator.
300
301 Arguments:
302 o handle - A handle with GenBank entries to iterate through.
303 o parser - An optional parser to pass the entries through before
304 returning them. If None, then the raw entry will be returned.
305 """
306 self.handle = handle
307 self._parser = parser
308
310 """Return the next GenBank record from the handle.
311
312 Will return None if we ran out of records.
313 """
314 if self._parser is None:
315 lines = []
316 while True:
317 line = self.handle.readline()
318 if not line : return None
319 lines.append(line)
320 if line.rstrip() == "//" : break
321 return "".join(lines)
322 try:
323 return self._parser.parse(self.handle)
324 except StopIteration:
325 return None
326
328 return iter(self.next, None)
329
331 """Failure caused by some kind of problem in the parser.
332 """
333 pass
334
336 """Could not Properly parse out a location from a GenBank file.
337 """
338 pass
339
341 """Parse GenBank files into Seq + Feature objects.
342 """
345 """Initialize a GenBank parser and Feature consumer.
346
347 Arguments:
348 o debug_level - An optional argument that species the amount of
349 debugging information the parser should spit out. By default we have
350 no debugging info (the fastest way to do things), but if you want
351 you can set this as high as two and see exactly where a parse fails.
352 o use_fuzziness - Specify whether or not to use fuzzy representations.
353 The default is 1 (use fuzziness).
354 o feature_cleaner - A class which will be used to clean out the
355 values of features. This class must implement the function
356 clean_value. GenBank.utils has a "standard" cleaner class, which
357 is used by default.
358 """
359 self._scanner = GenBankScanner(debug_level)
360 self.use_fuzziness = use_fuzziness
361 self._cleaner = feature_cleaner
362
363 - def parse(self, handle):
364 """Parse the specified handle.
365 """
366 self._consumer = _FeatureConsumer(self.use_fuzziness,
367 self._cleaner)
368 self._scanner.feed(handle, self._consumer)
369 return self._consumer.data
370
372 """Parse GenBank files into Record objects
373 """
375 """Initialize the parser.
376
377 Arguments:
378 o debug_level - An optional argument that species the amount of
379 debugging information the parser should spit out. By default we have
380 no debugging info (the fastest way to do things), but if you want
381 you can set this as high as two and see exactly where a parse fails.
382 """
383 self._scanner = GenBankScanner(debug_level)
384
385 - def parse(self, handle):
386 """Parse the specified handle into a GenBank record.
387 """
388 self._consumer = _RecordConsumer()
389 self._scanner.feed(handle, self._consumer)
390 return self._consumer.data
391
393 """Abstract GenBank consumer providing useful general functions.
394
395 This just helps to eliminate some duplication in things that most
396 GenBank consumers want to do.
397 """
398
399
400
401
402 remove_space_keys = ["translation"]
403
406
408 """Split a string of keywords into a nice clean list.
409 """
410
411 if keyword_string == "" or keyword_string == ".":
412 keywords = ""
413 elif keyword_string[-1] == '.':
414 keywords = keyword_string[:-1]
415 else:
416 keywords = keyword_string
417 keyword_list = keywords.split(';')
418 clean_keyword_list = [x.strip() for x in keyword_list]
419 return clean_keyword_list
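    # For example (hypothetical KEYWORDS value), _split_keywords would turn
    # "antifreeze protein; cold-regulated." into
    # ['antifreeze protein', 'cold-regulated'].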

    def _split_accessions(self, accession_string):
        """Split a string of accession numbers into a list.
        """
        # Replace all line feeds and semi-colons with spaces,
        # then split on whitespace
        accession = accession_string.replace("\n", " ").replace(";", " ")

        return [x.strip() for x in accession.split() if x.strip()]
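    # For example (hypothetical ACCESSION value), _split_accessions would
    # turn "X56734; S46826;" into ['X56734', 'S46826'].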

    def _split_taxonomy(self, taxonomy_string):
        """Split a string with taxonomy info into a list.
        """
        if not taxonomy_string or taxonomy_string == ".":
            # Missing data, no taxonomy
            return []

        if taxonomy_string[-1] == '.':
            tax_info = taxonomy_string[:-1]
        else:
            tax_info = taxonomy_string
        tax_list = tax_info.split(';')
        new_tax_list = []
        for tax_item in tax_list:
            new_items = tax_item.split("\n")
            new_tax_list.extend(new_items)
        while '' in new_tax_list:
            new_tax_list.remove('')
        clean_tax_list = [x.strip() for x in new_tax_list]

        return clean_tax_list
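    # For example (hypothetical ORGANISM lineage), _split_taxonomy would turn
    # "Eukaryota; Viridiplantae;\nStreptophyta." into
    # ['Eukaryota', 'Viridiplantae', 'Streptophyta'].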

    def _clean_location(self, location_string):
        """Clean whitespace out of a location string.

        The location parser isn't a fan of whitespace, so we clean it out
        before feeding it into the parser.
        """
        return ''.join(location_string.split())

    def _remove_newlines(self, text):
        """Remove any newlines in the passed text, returning the new string.
        """
        newlines = ["\n", "\r"]
        for ws in newlines:
            text = text.replace(ws, "")

        return text

    def _remove_multiple_spaces(self, text):
        """Replace multiple spaces in the passed text with single spaces.
        """
        text_parts = text.split(" ")
        text_parts = filter(None, text_parts)
        return ' '.join(text_parts)

    def _remove_spaces(self, text):
        """Remove all spaces from the passed text.
        """
        return text.replace(" ", "")

487 """Convert a start and end range to python notation.
488
489 In GenBank, starts and ends are defined in "biological" coordinates,
490 where 1 is the first base and [i, j] means to include both i and j.
491
492 In python, 0 is the first base and [i, j] means to include i, but
493 not j.
494
495 So, to convert "biological" to python coordinates, we need to
496 subtract 1 from the start, and leave the end and things should
497 be converted happily.
498 """
499 new_start = start - 1
500 new_end = end
501
502 return new_start, new_end
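    # For example, the one-based inclusive range 1..10 becomes the Python
    # zero-based half-open range (0, 10), i.e.
    # _convert_to_python_numbers(1, 10) == (0, 10).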


class _FeatureConsumer(_BaseGenBankConsumer):
    """Create a SeqRecord object with Features to return.

    Attributes:
    o use_fuzziness - specify whether or not to parse with fuzziness in
    feature locations.
    o feature_cleaner - a class that will be used to provide specialized
    cleaning-up of feature values.
    """
    def __init__(self, use_fuzziness, feature_cleaner = None):
        from Bio.SeqRecord import SeqRecord
        _BaseGenBankConsumer.__init__(self)
        self.data = SeqRecord(None, id = None)
        self.data.id = None
        self.data.description = ""

        self._use_fuzziness = use_fuzziness
        self._feature_cleaner = feature_cleaner

        self._seq_type = ''
        self._seq_data = []
        self._cur_reference = None
        self._cur_feature = None
        self._cur_qualifier_key = None
        self._cur_qualifier_value = None
        self._expected_size = None

    def locus(self, locus_name):
        """Set the locus name as the name of the sequence record.
        """
        self.data.name = locus_name

    def size(self, content):
        """Record the sequence length."""
        self._expected_size = int(content)

541 """Record the sequence type so we can choose an appropriate alphabet.
542 """
543 self._seq_type = type
544
547
548 - def date(self, submit_date):
550
560
562 """Set the accession number as the id of the sequence.
563
564 If we have multiple accession numbers, the first one passed is
565 used.
566 """
567 new_acc_nums = self._split_accessions(acc_num)
568
569
570 try:
571
572 for acc in new_acc_nums:
573
574 if acc not in self.data.annotations['accessions']:
575 self.data.annotations['accessions'].append(acc)
576 except KeyError:
577 self.data.annotations['accessions'] = new_acc_nums
578
579
580 if self.data.id is None:
581 if len(new_acc_nums) > 0:
582
583
584 self.data.id = self.data.annotations['accessions'][0]
585
    def wgs(self, content):


    def nid(self, content):

    def pid(self, content):


612 """Handle the information from the PROJECT line as a list of projects.
613
614 e.g.
615 PROJECT GenomeProject:28471
616
617 or:
618 PROJECT GenomeProject:13543 GenomeProject:99999
619
620 This is stored as dbxrefs in the SeqRecord to be consistent with the
621 projected switch of this line to DBLINK in future GenBank versions.
622 Note the NCBI plan to replace "GenomeProject:28471" with the shorter
623 "Project:28471" as part of this transition.
624 """
625 content = content.replace("GenomeProject:", "Project:")
626 self.data.dbxrefs.extend([p for p in content.split() if p])

    def dblink(self, content):
        """Store DBLINK cross references as dbxrefs in our record object.

        This line type is expected to replace the PROJECT line in 2009. e.g.

        During transition:

            PROJECT     GenomeProject:28471
            DBLINK      Project:28471
                        Trace Assembly Archive:123456

        Once the project line is dropped:

            DBLINK      Project:28471
                        Trace Assembly Archive:123456

        Note GenomeProject -> Project.

        We'll have to see some real examples to be sure, but based on the
        above example we can expect one reference per line.
        """
        # Avoid duplicate entries
        if content.strip() not in self.data.dbxrefs:
            self.data.dbxrefs.append(content.strip())

    def version_suffix(self, version):
        """Set the version to overwrite the id.

        Since the version provides the same information as the accession
        number, plus some extra info, we set this as the id if we have
        a version.
        """
        # e.g. from a line such as "VERSION  U49845.1  GI:1293613" the
        # version suffix is the "1" after the accession number
        assert version.isdigit()
        self.data.annotations['sequence_version'] = int(version)

    def gi(self, content):




    def source(self, content):
        # Remove any trailing period from the SOURCE line
        if content == "":
            source_info = ""
        elif content[-1] == '.':
            source_info = content[:-1]
        else:
            source_info = content
        self.data.annotations['source'] = source_info


    def reference_num(self, content):
        """Signal the beginning of a new reference object.
        """
        # If we have a reference in progress, save it before starting the
        # next one
        if self._cur_reference is not None:
            self.data.annotations['references'].append(self._cur_reference)
        else:
            self.data.annotations['references'] = []

        self._cur_reference = SeqFeature.Reference()

    def reference_bases(self, content):
        """Attempt to determine the sequence region the reference entails.

        Possible types of information we may have to deal with:

        (bases 1 to 86436)
        (sites)
        (bases 1 to 105654; 110423 to 111122)
        1  (residues 1 to 182)
        """
        # First remove the parentheses
        ref_base_info = content[1:-1]

        all_locations = []
        # Parse if we've got 'bases' and 'to'
        if ref_base_info.find('bases') != -1 and \
            ref_base_info.find('to') != -1:
            # Get rid of the beginning 'bases'
            ref_base_info = ref_base_info[5:]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)
        elif (ref_base_info.find("residues") >= 0 and
              ref_base_info.find("to") >= 0):
            residues_start = ref_base_info.find("residues")
            # Get only the information after "residues"
            ref_base_info = ref_base_info[(residues_start + len("residues ")):]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)

        # If we are not finding location information, make sure we have
        # the string 'sites' or just 'bases'
        elif (ref_base_info == 'sites' or
              ref_base_info.strip() == 'bases'):
            pass
        # Otherwise raise an error
        else:
            raise ValueError("Could not parse base info %s in record %s" %
                             (ref_base_info, self.data.id))

        self._cur_reference.location = all_locations

    def _split_reference_locations(self, location_string):
        """Get reference locations out of a string of reference information.

        The passed string should be of the form:

            1 to 20; 20 to 100

        This splits the information out and returns a list of location objects
        based on the reference locations.
        """
        # Split possibly multiple locations using the ';'
        all_base_info = location_string.split(';')

        new_locations = []
        for base_info in all_base_info:
            start, end = base_info.split('to')
            new_start, new_end = \
                self._convert_to_python_numbers(int(start.strip()),
                                                int(end.strip()))
            this_location = SeqFeature.FeatureLocation(new_start, new_end)
            new_locations.append(this_location)
        return new_locations
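    # For example (hypothetical REFERENCE bases), "1 to 105654; 110423 to 111122"
    # would give [FeatureLocation(0, 105654), FeatureLocation(110422, 111122)].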

    def authors(self, content):
        if self._cur_reference.authors:
            self._cur_reference.authors += ' ' + content
        else:
            self._cur_reference.authors = content

    def consrtm(self, content):
        if self._cur_reference.consrtm:
            self._cur_reference.consrtm += ' ' + content
        else:
            self._cur_reference.consrtm = content

    def title(self, content):
        if self._cur_reference is None:
            import warnings
            warnings.warn("GenBank TITLE line without REFERENCE line.")
        elif self._cur_reference.title:
            self._cur_reference.title += ' ' + content
        else:
            self._cur_reference.title = content

    def journal(self, content):
        if self._cur_reference.journal:
            self._cur_reference.journal += ' ' + content
        else:
            self._cur_reference.journal = content



    def remark(self, content):
        """Deal with a reference comment."""
        if self._cur_reference.comment:
            self._cur_reference.comment += ' ' + content
        else:
            self._cur_reference.comment = content


    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line.
        """
        self.start_feature_table()

    def start_feature_table(self):
        """Indicate we've got to the start of the feature table.
        """
        # Make sure we've added on our last reference object
        if self._cur_reference is not None:
            self.data.annotations['references'].append(self._cur_reference)
            self._cur_reference = None

    def _add_feature(self):
        """Utility function to add a feature to the SeqRecord.

        This does all of the appropriate checking to make sure we haven't
        left any info behind, and that we are only adding info if it
        exists.
        """
        if self._cur_feature:
            # Add on the last qualifier before saving the feature
            self._add_qualifier()

            self._cur_qualifier_key = ''
            self._cur_qualifier_value = ''
            self.data.features.append(self._cur_feature)


    def location(self, content):
        """Parse out location information from the location string.

        This uses simple Python code with some regular expressions to do the
        parsing, and then translates the results into appropriate objects.
        """
        # The location should have no whitespace in it at all
        location_line = self._clean_location(content)

        # Handle the obsolete replace(location,"text") form by keeping just
        # the location part (the text between "replace(" and the comma)
        if location_line.find('replace') != -1:
            comma_pos = location_line.find(',')
            location_line = location_line[8:comma_pos]

        cur_feature = self._cur_feature

        # Handle a top level complement(...) wrapper here
        if location_line.startswith("complement("):
            assert location_line.endswith(")")
            location_line = location_line[11:-1]
            cur_feature.strand = -1

        # Try the simple cases first for speed
        if _re_simple_location.match(location_line):
            # e.g. "123..456"
            s, e = location_line.split("..")
            cur_feature.location = SeqFeature.FeatureLocation(int(s)-1,
                                                              int(e))
            return
        if _re_simple_compound.match(location_line):
            # e.g. join(123..456,480..500)
            i = location_line.find("(")
            cur_feature.location_operator = location_line[:i]
            # Make a sub-feature for each part of the compound location
            for part in location_line[i+1:-1].split(","):
                s, e = part.split("..")
                f = SeqFeature.SeqFeature(SeqFeature.FeatureLocation(int(s)-1,
                                                                     int(e)),
                        location_operator=cur_feature.location_operator,
                        strand=cur_feature.strand, type=cur_feature.type)
                cur_feature.sub_features.append(f)
            s = cur_feature.sub_features[0].location.start
            e = cur_feature.sub_features[-1].location.end
            cur_feature.location = SeqFeature.FeatureLocation(s,e)
            return

        # Fall back on the more general (and slower) regular expressions
        if _re_complex_location.match(location_line):
            # e.g. "AL121804.2:41..610" or one of the fuzzy position forms
            if ":" in location_line:
                cur_feature.ref, location_line = location_line.split(":")
            cur_feature.location = _loc(location_line, self._expected_size)
            return
        if _re_complex_compound.match(location_line):
            i = location_line.find("(")
            cur_feature.location_operator = location_line[:i]
            # Can't just split on commas because of entries like one-of(1,2,3)
            for part in _split_compound_loc(location_line[i+1:-1]):
                if part.startswith("complement("):
                    assert part[-1]==")"
                    part = part[11:-1]
                    assert cur_feature.strand != -1, "Double complement?"
                    strand = -1
                else:
                    strand = cur_feature.strand
                if ":" in part:
                    ref, part = part.split(":")
                else:
                    ref = None
                try:
                    loc = _loc(part, self._expected_size)
                except ValueError, err:
                    print location_line
                    print part
                    raise err
                f = SeqFeature.SeqFeature(location=loc, ref=ref,
                        location_operator=cur_feature.location_operator,
                        strand=strand, type=cur_feature.type)
                cur_feature.sub_features.append(f)

            # If all the sub-features are on the same strand, use that for
            # the parent feature; otherwise record the strand as None (mixed)
            strands = set(sf.strand for sf in cur_feature.sub_features)
            if len(strands)==1:
                cur_feature.strand = cur_feature.sub_features[0].strand
            else:
                cur_feature.strand = None # i.e. mixed strands
            s = cur_feature.sub_features[0].location.start
            e = cur_feature.sub_features[-1].location.end
            cur_feature.location = SeqFeature.FeatureLocation(s,e)
            return

        raise LocationParserError(location_line)

    def _add_qualifier(self):
        """Add a qualifier to the current feature without loss of info.

        If there are multiple qualifier keys with the same name, the values
        are all stored in a list under that key, so no information is lost.
        """
        # Append the qualifier we were working on (if any) to the
        # current feature
        if self._cur_qualifier_key:
            key = self._cur_qualifier_key
            value = "".join(self._cur_qualifier_value)
            if self._feature_cleaner is not None:
                value = self._feature_cleaner.clean_value(key, value)
            # If the qualifier name exists, append the value
            if key in self._cur_feature.qualifiers:
                self._cur_feature.qualifiers[key].append(value)
            # Otherwise start a new list of values under this key
            else:
                self._cur_feature.qualifiers[key] = [value]

    def feature_qualifier_name(self, content_list):
        """When we get a qualifier key, use it as a dictionary key.

        We receive a list of keys, since you can have valueless keys such as
        /pseudo which would be passed in with the next key (since no other
        tags separate them in the file).
        """
        for content in content_list:
            # Add on the qualifier we were working on, since we're moving
            # on to the next one
            self._add_qualifier()
            # The scanner should have stripped the leading slash and any
            # trailing equals sign from the key
            assert '/' not in content and '=' not in content \
                   and content == content.strip(), content
            self._cur_qualifier_key = content
            self._cur_qualifier_value = []

    def feature_qualifier_description(self, content):
        # Strip off any quote characters in the value
        qual_value = content.replace('"', '')

        self._cur_qualifier_value.append(qual_value)

    def contig_location(self, content):
        """Deal with CONTIG information."""




        self.data.annotations["contig"] = content




    def sequence(self, content):
        """Add up sequence information as we get it.

        To try and make things speedier, this puts all of the strings
        into a list of strings, and then joins them together at the end.
        This is much faster than adding onto the string each time.
        """
        assert ' ' not in content
        self._seq_data.append(content.upper())



class _RecordConsumer(_BaseGenBankConsumer):
    """Create a GenBank Record object from scanner generated information.
    """

    def wgs(self, content):


    def locus(self, content):

    def size(self, content):


    def date(self, content):


    def nid(self, content):

    def pid(self, content):


    def gi(self, content):



1205 """Grab the reference number and signal the start of a new reference.
1206 """
1207
1208 if self._cur_reference is not None:
1209 self.data.references.append(self._cur_reference)
1210
1211 import Record
1212 self._cur_reference = Record.Reference()
1213 self._cur_reference.number = content
1214
1216 self._cur_reference.bases = content
1217
1219 self._cur_reference.authors = content
1220
1222 self._cur_reference.consrtm = content
1223
1224 - def title(self, content):
1225 if self._cur_reference is None:
1226 import warnings
1227 warnings.warn("GenBank TITLE line without REFERENCE line.")
1228 return
1229 self._cur_reference.title = content
1230
1232 self._cur_reference.journal = content
1233
1236
1239
1241 self._cur_reference.remark = content
1242



    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line.
        """
        self.start_feature_table()

    def start_feature_table(self):
        """Signal the start of the feature table.
        """
        # Make sure we add on the last reference object before the features
        if self._cur_reference is not None:
            self.data.references.append(self._cur_reference)

    def feature_key(self, content):
        """Grab the key of the feature and signal the start of a new feature.
        """
        # If we already have a feature in progress, add it to the record
        self._add_feature()

        import Record
        self._cur_feature = Record.Feature()
        self._cur_feature.key = content

    def _add_feature(self):
        """Utility function to add a feature to the Record.

        This does all of the appropriate checking to make sure we haven't
        left any info behind, and that we are only adding info if it
        exists.
        """
        if self._cur_feature is not None:
            # Add on the last qualifier (if any) before saving the feature
            if self._cur_qualifier is not None:
                self._cur_feature.qualifiers.append(self._cur_qualifier)

            self._cur_qualifier = None
            self.data.features.append(self._cur_feature)


    def feature_qualifier_name(self, content_list):
        """Deal with qualifier names.

        We receive a list of keys, since you can have valueless keys such as
        /pseudo which would be passed in with the next key (since no other
        tags separate them in the file).
        """
        import Record
        for content in content_list:
            # The record representation keeps the leading slash on qualifier
            # keys, so add it back if it is missing
            if content.find("/") != 0:
                content = "/%s" % content
            # Add on the qualifier we were working on, since we're moving
            # on to the next one
            if self._cur_qualifier is not None:
                self._cur_feature.qualifiers.append(self._cur_qualifier)

            self._cur_qualifier = Record.Qualifier()
            self._cur_qualifier.key = content


    def base_count(self, content):
        self.data.base_counts = content

    def origin_name(self, content):
        self.data.origin = content

    def contig_location(self, content):
        """Signal that we have contig information to add to the record.
        """
        self.data.contig = self._clean_location(content)

    def sequence(self, content):
        """Add sequence information to a list of sequence strings.

        This uppercases the sequence chunk (the scanner has already removed
        any spaces) and adds it to a list of sequences. Later on we'll join
        this list together to make the final sequence. This is faster than
        adding onto the string each time.
        """
        assert ' ' not in content
        self._seq_data.append(content.upper())

    def record_end(self, content):
        """Signal the end of the record and do any necessary clean-up.
        """
        # Add together all of the sequence parts to create the full sequence
        self.data.sequence = "".join(self._seq_data)
        # Add on the last feature (if any)
        self._add_feature()


1358 """Run the Bio.GenBank module's doctests."""
1359 print "Runing doctests..."
1360 import doctest
1361 doctest.testmod()
1362 print "Done"
1363
1364 if __name__ == "__main__":
1365 _test()
1366