1
2
3
4
5
6
7
8 """Represent a Sequence Record, a sequence with annotation."""
9 __docformat__ = "epytext en"
10
11
12
13
14
15
17 """Dict which only allows sequences of given length as values (PRIVATE).
18
19 This simple subclass of the Python dictionary is used in the SeqRecord
20 object for holding per-letter-annotations. This class is intended to
21 prevent simple errors by only allowing python sequences (e.g. lists,
22 strings and tuples) to be stored, and only if their length matches that
23 expected (the length of the SeqRecord's seq object). It cannot however
24 prevent the entries being edited in situ (for example appending entries
25 to a list).
26 """
32 if not hasattr(value,"__len__") or not hasattr(value,"__getitem__") \
33 or len(value) != self._length:
34 raise TypeError("We only allow python sequences (lists, tuples or "
35 "strings) of length %i." % self._length)
36 dict.__setitem__(self, key, value)
41
43 """A SeqRecord object holds a sequence and information about it.
44
45 Main attributes:
46 - id - Identifier such as a locus tag (string)
47 - seq - The sequence itself (Seq object or similar)
48
49 Additional attributes:
50 - name - Sequence name, e.g. gene name (string)
51 - description - Additional text (string)
52 - dbxrefs - List of database cross references (list of strings)
53 - features - Any (sub)features defined (list of SeqFeature objects)
54 - annotations - Further information about the whole sequence (dictionary)
55 Most entries are strings, or lists of strings.
56 - letter_annotations - Per letter/symbol annotation (restricted
57 dictionary). This holds Python sequences (lists, strings
58 or tuples) whose length matches that of the sequence.
59 A typical use would be to hold a list of integers
60 representing sequencing quality scores, or a string
61 representing the secondary structure.
62
63 You will typically use Bio.SeqIO to read in sequences from files as
64 SeqRecord objects. However, you may want to create your own SeqRecord
65 objects directly (see the __init__ method for further details):
66
67 >>> from Bio.Seq import Seq
68 >>> from Bio.SeqRecord import SeqRecord
69 >>> from Bio.Alphabet import IUPAC
70 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
71 ... IUPAC.protein),
72 ... id="YP_025292.1", name="HokC",
73 ... description="toxic membrane protein")
74 >>> print record
75 ID: YP_025292.1
76 Name: HokC
77 Description: toxic membrane protein
78 Number of features: 0
79 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
80
81 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO
82 for this. For the special case where you want the SeqRecord turned into
83 a string in a particular file format there is a format method which uses
84 Bio.SeqIO internally:
85
86 >>> print record.format("fasta")
87 >YP_025292.1 toxic membrane protein
88 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
89 <BLANKLINE>
90
91 You can also do things like slicing a SeqRecord, checking its length, etc
92
93 >>> len(record)
94 44
95 >>> edited = record[:10] + record[11:]
96 >>> print edited.seq
97 MKQHKAMIVAIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
98 >>> print record.seq
99 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
100
101 """
102 - def __init__(self, seq, id = "<unknown id>", name = "<unknown name>",
103 description = "<unknown description>", dbxrefs = None,
104 features = None, annotations = None,
105 letter_annotations = None):
106 """Create a SeqRecord.
107
108 Arguments:
109 - seq - Sequence, required (Seq, MutableSeq or UnknownSeq)
110 - id - Sequence identifier, recommended (string)
111 - name - Sequence name, optional (string)
112 - description - Sequence description, optional (string)
113 - dbxrefs - Database cross references, optional (list of strings)
114 - features - Any (sub)features, optional (list of SeqFeature objects)
115 - annotations - Dictionary of annotations for the whole sequence
116 - letter_annotations - Dictionary of per-letter-annotations, values
117 should be strings, list or tuples of the same
118 length as the full sequence.
119
120 You will typically use Bio.SeqIO to read in sequences from files as
121 SeqRecord objects. However, you may want to create your own SeqRecord
122 objects directly.
123
124 Note that while an id is optional, we strongly recommend you supply a
125 unique id string for each record. This is especially important
126 if you wish to write your sequences to a file.
127
128 If you don't have the actual sequence, but you do know its length,
129 then using the UnknownSeq object from Bio.Seq is appropriate.
130
131 You can create a 'blank' SeqRecord object, and then populate the
132 attributes later.
133 """
134 if id is not None and not isinstance(id, basestring):
135
136 raise TypeError("id argument should be a string")
137 if not isinstance(name, basestring):
138 raise TypeError("name argument should be a string")
139 if not isinstance(description, basestring):
140 raise TypeError("description argument should be a string")
141 self._seq = seq
142 self.id = id
143 self.name = name
144 self.description = description
145
146
147 if dbxrefs is None:
148 dbxrefs = []
149 elif not isinstance(dbxrefs, list):
150 raise TypeError("dbxrefs argument should be a list (of strings)")
151 self.dbxrefs = dbxrefs
152
153
154 if annotations is None:
155 annotations = {}
156 elif not isinstance(annotations, dict):
157 raise TypeError("annotations argument should be a dict")
158 self.annotations = annotations
159
160 if letter_annotations is None:
161
162 if seq is None:
163
164 self._per_letter_annotations = _RestrictedDict(length=0)
165 else:
166 try:
167 self._per_letter_annotations = \
168 _RestrictedDict(length=len(seq))
169 except:
170 raise TypeError("seq argument should be a Seq object or similar")
171 else:
172
173
174
175 self.letter_annotations = letter_annotations
176
177
178 if features is None:
179 features = []
180 elif not isinstance(features, list):
181 raise TypeError("features argument should be a list (of SeqFeature objects)")
182 self.features = features
183
184
186 if not isinstance(value, dict):
187 raise TypeError("The per-letter-annotations should be a "
188 "(restricted) dictionary.")
189
190 try:
191 self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
192 except AttributeError:
193
194 self._per_letter_annotations = _RestrictedDict(length=0)
195 self._per_letter_annotations.update(value)
196 letter_annotations = property( \
197 fget=lambda self : self._per_letter_annotations,
198 fset=_set_per_letter_annotations,
199 doc="""Dictionary of per-letter-annotation for the sequence.
200
201 For example, this can hold quality scores used in FASTQ or QUAL files.
202 Consider this example using Bio.SeqIO to read in an example Solexa
203 variant FASTQ file as a SeqRecord:
204
205 >>> from Bio import SeqIO
206 >>> handle = open("Quality/solexa_faked.fastq", "rU")
207 >>> record = SeqIO.read(handle, "fastq-solexa")
208 >>> handle.close()
209 >>> print record.id, record.seq
210 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
211 >>> print record.letter_annotations.keys()
212 ['solexa_quality']
213 >>> print record.letter_annotations["solexa_quality"]
214 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
215
216 The letter_annotations get sliced automatically if you slice the
217 parent SeqRecord, for example taking the last ten bases:
218
219 >>> sub_record = record[-10:]
220 >>> print sub_record.id, sub_record.seq
221 slxa_0001_1_0001_01 ACGTNNNNNN
222 >>> print sub_record.letter_annotations["solexa_quality"]
223 [4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
224
225 Any python sequence (i.e. list, tuple or string) can be recorded in
226 the SeqRecord's letter_annotations dictionary as long as the length
227 matches that of the SeqRecord's sequence. e.g.
228
229 >>> len(sub_record.letter_annotations)
230 1
231 >>> sub_record.letter_annotations["dummy"] = "abcdefghij"
232 >>> len(sub_record.letter_annotations)
233 2
234
235 You can delete entries from the letter_annotations dictionary as usual:
236
237 >>> del sub_record.letter_annotations["solexa_quality"]
238 >>> sub_record.letter_annotations
239 {'dummy': 'abcdefghij'}
240
241 You can completely clear the dictionary easily as follows:
242
243 >>> sub_record.letter_annotations = {}
244 >>> sub_record.letter_annotations
245 {}
246 """)
247
249
250 if self._per_letter_annotations:
251
252 raise ValueError("You must empty the letter annotations first!")
253 self._seq = value
254 try:
255 self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
256 except AttributeError:
257
258 self._per_letter_annotations = _RestrictedDict(length=0)
259
260 seq = property(fget=lambda self : self._seq,
261 fset=_set_seq,
262 doc="The sequence itself, as a Seq or MutableSeq object.")
263
265 """Returns a sub-sequence or an individual letter.
266
267 Slicing, e.g. my_record[5:10], returns a new SeqRecord for
268 that sub-sequence with approriate annotation preserved. The
269 name, id and description are kept.
270
271 Any per-letter-annotations are sliced to match the requested
272 sub-sequence. Unless a stride is used, all those features
273 which fall fully within the subsequence are included (with
274 their locations adjusted accordingly).
275
276 However, the annotations dictionary and the dbxrefs list are
277 not used for the new SeqRecord, as in general they may not
278 apply to the subsequence. If you want to preserve them, you
279 must explictly copy them to the new SeqRecord yourself.
280
281 Using an integer index, e.g. my_record[5] is shorthand for
282 extracting that letter from the sequence, my_record.seq[5].
283
284 For example, consider this short protein and its secondary
285 structure as encoded by the PDB (e.g. H for alpha helices),
286 plus a simple feature for its histidine self phosphorylation
287 site:
288
289 >>> from Bio.Seq import Seq
290 >>> from Bio.SeqRecord import SeqRecord
291 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
292 >>> from Bio.Alphabet import IUPAC
293 >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT"
294 ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR",
295 ... IUPAC.protein),
296 ... id="1JOY", name="EnvZ",
297 ... description="Homodimeric domain of EnvZ from E. coli")
298 >>> rec.letter_annotations["secondary_structure"] = " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT "
299 >>> rec.features.append(SeqFeature(FeatureLocation(20,21),
300 ... type = "Site"))
301
302 Now let's have a quick look at the full record,
303
304 >>> print rec
305 ID: 1JOY
306 Name: EnvZ
307 Description: Homodimeric domain of EnvZ from E. coli
308 Number of features: 1
309 Per letter annotation for: secondary_structure
310 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
311 >>> print rec.letter_annotations["secondary_structure"]
312 S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT
313 >>> print rec.features[0].location
314 [20:21]
315
316 Now let's take a sub sequence, here chosen as the first (fractured)
317 alpha helix which includes the histidine phosphorylation site:
318
319 >>> sub = rec[11:41]
320 >>> print sub
321 ID: 1JOY
322 Name: EnvZ
323 Description: Homodimeric domain of EnvZ from E. coli
324 Number of features: 1
325 Per letter annotation for: secondary_structure
326 Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein())
327 >>> print sub.letter_annotations["secondary_structure"]
328 HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH
329 >>> print sub.features[0].location
330 [9:10]
331
332 You can also of course omit the start or end values, for
333 example to get the first ten letters only:
334
335 >>> print rec[:10]
336 ID: 1JOY
337 Name: EnvZ
338 Description: Homodimeric domain of EnvZ from E. coli
339 Number of features: 0
340 Per letter annotation for: secondary_structure
341 Seq('MAAGVKQLAD', IUPACProtein())
342
343 Or for the last ten letters:
344
345 >>> print rec[-10:]
346 ID: 1JOY
347 Name: EnvZ
348 Description: Homodimeric domain of EnvZ from E. coli
349 Number of features: 0
350 Per letter annotation for: secondary_structure
351 Seq('IIEQFIDYLR', IUPACProtein())
352
353 If you omit both, then you get a copy of the original record (although
354 lacking the annotations and dbxrefs):
355
356 >>> print rec[:]
357 ID: 1JOY
358 Name: EnvZ
359 Description: Homodimeric domain of EnvZ from E. coli
360 Number of features: 1
361 Per letter annotation for: secondary_structure
362 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
363
364 Finally, indexing with a simple integer is shorthand for pulling out
365 that letter from the sequence directly:
366
367 >>> rec[5]
368 'K'
369 >>> rec.seq[5]
370 'K'
371 """
372 if isinstance(index, int):
373
374
375
376 return self.seq[index]
377 elif isinstance(index, slice):
378 if self.seq is None:
379 raise ValueError("If the sequence is None, we cannot slice it.")
380 parent_length = len(self)
381 answer = self.__class__(self.seq[index],
382 id=self.id,
383 name=self.name,
384 description=self.description)
385
386
387
388
389
390
391
392
393
394
395
396 if index.step is None or index.step == 1:
397
398 if index.start is None:
399 start = 0
400 else:
401 start = index.start
402 if index.stop is None:
403 stop = -1
404 else:
405 stop = index.stop
406 if (start < 0 or stop < 0) and parent_length == 0:
407 raise ValueError, \
408 "Cannot support negative indices without the sequence length"
409 if start < 0:
410 start = parent_length + start
411 if stop < 0:
412 stop = parent_length + stop + 1
413
414 for f in self.features:
415 if f.ref or f.ref_db:
416
417 import warnings
418 warnings.warn("When slicing SeqRecord objects, any "
419 "SeqFeature referencing other sequences (e.g. "
420 "from segmented GenBank records) is ignored.")
421 continue
422 if start <= f.location.nofuzzy_start \
423 and f.location.nofuzzy_end <= stop:
424 answer.features.append(f._shift(-start))
425
426
427
428 for key, value in self.letter_annotations.iteritems():
429 answer._per_letter_annotations[key] = value[index]
430
431 return answer
432 raise ValueError, "Invalid index"
433
435 """Iterate over the letters in the sequence.
436
437 For example, using Bio.SeqIO to read in a protein FASTA file:
438
439 >>> from Bio import SeqIO
440 >>> record = SeqIO.read(open("Fasta/loveliesbleeding.pro"),"fasta")
441 >>> for amino in record:
442 ... print amino
443 ... if amino == "L" : break
444 X
445 A
446 G
447 L
448 >>> print record.seq[3]
449 L
450
451 This is just a shortcut for iterating over the sequence directly:
452
453 >>> for amino in record.seq:
454 ... print amino
455 ... if amino == "L" : break
456 X
457 A
458 G
459 L
460 >>> print record.seq[3]
461 L
462
463 Note that this does not facilitate iteration together with any
464 per-letter-annotation. However, you can achieve that using the
465 python zip function on the record (or its sequence) and the relevant
466 per-letter-annotation:
467
468 >>> from Bio import SeqIO
469 >>> rec = SeqIO.read(open("Quality/solexa_faked.fastq", "rU"),
470 ... "fastq-solexa")
471 >>> print rec.id, rec.seq
472 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
473 >>> print rec.letter_annotations.keys()
474 ['solexa_quality']
475 >>> for nuc, qual in zip(rec,rec.letter_annotations["solexa_quality"]):
476 ... if qual > 35:
477 ... print nuc, qual
478 A 40
479 C 39
480 G 38
481 T 37
482 A 36
483
484 You may agree that using zip(rec.seq, ...) is more explicit than using
485 zip(rec, ...) as shown above.
486 """
487 return iter(self.seq)
488
490 """Implements the 'in' keyword, searches the sequence.
491
492 e.g.
493
494 >>> from Bio import SeqIO
495 >>> record = SeqIO.read(open("Fasta/sweetpea.nu"), "fasta")
496 >>> "GAATTC" in record
497 False
498 >>> "AAA" in record
499 True
500
501 This essentially acts as a proxy for using "in" on the sequence:
502
503 >>> "GAATTC" in record.seq
504 False
505 >>> "AAA" in record.seq
506 True
507
508 Note that you can also use Seq objects as the query,
509
510 >>> from Bio.Seq import Seq
511 >>> from Bio.Alphabet import generic_dna
512 >>> Seq("AAA") in record
513 True
514 >>> Seq("AAA", generic_dna) in record
515 True
516
517 See also the Seq object's __contains__ method.
518 """
519 return char in self.seq
520
521
523 """A human readable summary of the record and its annotation (string).
524
525 The python built in function str works by calling the object's ___str__
526 method. e.g.
527
528 >>> from Bio.Seq import Seq
529 >>> from Bio.SeqRecord import SeqRecord
530 >>> from Bio.Alphabet import IUPAC
531 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
532 ... IUPAC.protein),
533 ... id="YP_025292.1", name="HokC",
534 ... description="toxic membrane protein, small")
535 >>> print str(record)
536 ID: YP_025292.1
537 Name: HokC
538 Description: toxic membrane protein, small
539 Number of features: 0
540 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
541
542 In this example you don't actually need to call str explicity, as the
543 print command does this automatically:
544
545 >>> print record
546 ID: YP_025292.1
547 Name: HokC
548 Description: toxic membrane protein, small
549 Number of features: 0
550 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
551
552 Note that long sequences are shown truncated.
553 """
554 lines = []
555 if self.id : lines.append("ID: %s" % self.id)
556 if self.name : lines.append("Name: %s" % self.name)
557 if self.description : lines.append("Description: %s" % self.description)
558 if self.dbxrefs : lines.append("Database cross-references: " \
559 + ", ".join(self.dbxrefs))
560 lines.append("Number of features: %i" % len(self.features))
561 for a in self.annotations:
562 lines.append("/%s=%s" % (a, str(self.annotations[a])))
563 if self.letter_annotations:
564 lines.append("Per letter annotation for: " \
565 + ", ".join(self.letter_annotations.keys()))
566
567
568 lines.append(repr(self.seq))
569 return "\n".join(lines)
570
572 """A concise summary of the record for debugging (string).
573
574 The python built in function repr works by calling the object's ___repr__
575 method. e.g.
576
577 >>> from Bio.Seq import Seq
578 >>> from Bio.SeqRecord import SeqRecord
579 >>> from Bio.Alphabet import generic_protein
580 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT"
581 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ"
582 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ"
583 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF",
584 ... generic_protein),
585 ... id="NP_418483.1", name="b4059",
586 ... description="ssDNA-binding protein",
587 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"])
588 >>> print repr(rec)
589 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
590
591 At the python prompt you can also use this shorthand:
592
593 >>> rec
594 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
595
596 Note that long sequences are shown truncated. Also note that any
597 annotations, letter_annotations and features are not shown (as they
598 would lead to a very long string).
599 """
600 return self.__class__.__name__ \
601 + "(seq=%s, id=%s, name=%s, description=%s, dbxrefs=%s)" \
602 % tuple(map(repr, (self.seq, self.id, self.name,
603 self.description, self.dbxrefs)))
604
638
666
668 """Returns the length of the sequence.
669
670 For example, using Bio.SeqIO to read in a FASTA nucleotide file:
671
672 >>> from Bio import SeqIO
673 >>> record = SeqIO.read(open("Fasta/sweetpea.nu"),"fasta")
674 >>> len(record)
675 309
676 >>> len(record.seq)
677 309
678 """
679 return len(self.seq)
680
682 """Returns True regardless of the length of the sequence.
683
684 This behaviour is for backwards compatibility, since until the
685 __len__ method was added, a SeqRecord always evaluated as True.
686
687 Note that in comparison, a Seq object will evaluate to False if it
688 has a zero length sequence.
689
690 WARNING: The SeqRecord may in future evaluate to False when its
691 sequence is of zero length (in order to better match the Seq
692 object behaviour)!
693 """
694 return True
695
697 """Add another sequence or string to this sequence.
698
699 The other sequence can be a SeqRecord object, a Seq object (or
700 similar, e.g. a MutableSeq) or a plain Python string. If you add
701 a plain string or a Seq (like) object, the new SeqRecord will simply
702 have this appended to the existing data. However, any per letter
703 annotation will be lost:
704
705 >>> from Bio import SeqIO
706 >>> handle = open("Quality/solexa_faked.fastq", "rU")
707 >>> record = SeqIO.read(handle, "fastq-solexa")
708 >>> handle.close()
709 >>> print record.id, record.seq
710 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
711 >>> print record.letter_annotations.keys()
712 ['solexa_quality']
713
714 >>> new = record + "ACT"
715 >>> print new.id, new.seq
716 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNACT
717 >>> print new.letter_annotations.keys()
718 []
719
720 The new record will attempt to combine the annotation, but for any
721 ambiguities (e.g. different names) it defaults to omitting that
722 annotation.
723
724 >>> from Bio import SeqIO
725 >>> handle = open("GenBank/pBAD30.gb")
726 >>> plasmid = SeqIO.read(handle, "gb")
727 >>> handle.close()
728 >>> print plasmid.id, len(plasmid)
729 pBAD30 4923
730
731 Now let's cut the plasmid into two pieces, and join them back up the
732 other way round (i.e. shift the starting point on this plasmid, have
733 a look at the annotated features in the original file to see why this
734 particular split point might make sense):
735
736 >>> left = plasmid[:3765]
737 >>> right = plasmid[3765:]
738 >>> new = right + left
739 >>> print new.id, len(new)
740 pBAD30 4923
741 >>> str(new.seq) == str(right.seq + left.seq)
742 True
743 >>> len(new.features) == len(left.features) + len(right.features)
744 True
745
746 When we add the left and right SeqRecord objects, their annotation
747 is all consistent, so it is all conserved in the new SeqRecord:
748
749 >>> new.id == left.id == right.id == plasmid.id
750 True
751 >>> new.name == left.name == right.name == plasmid.name
752 True
753 >>> new.description == plasmid.description
754 True
755 >>> new.annotations == left.annotations == right.annotations
756 True
757 >>> new.letter_annotations == plasmid.letter_annotations
758 True
759 >>> new.dbxrefs == left.dbxrefs == right.dbxrefs
760 True
761
762 However, we should point out that when we sliced the SeqRecord,
763 any annotations dictionary or dbxrefs list entries were lost.
764 You can explicitly copy them like this:
765
766 >>> new.annotations = plasmid.annotations.copy()
767 >>> new.dbxrefs = plasmid.dbxrefs[:]
768 """
769 if not isinstance(other, SeqRecord):
770
771
772 return SeqRecord(self.seq + other,
773 id = self.id, name = self.name,
774 description = self.description,
775 features = self.features[:],
776 annotations = self.annotations.copy(),
777 dbxrefs = self.dbxrefs[:])
778
779 answer = SeqRecord(self.seq + other.seq,
780 features = self.features[:],
781 dbxrefs = self.dbxrefs[:])
782
783 l = len(self)
784 for f in other.features:
785 answer.features.append(f._shift(l))
786 del l
787 for ref in other.dbxrefs:
788 if ref not in answer.dbxrefs:
789 answer.dbxrefs.append(ref)
790
791 if self.id == other.id:
792 answer.id = self.id
793 if self.name == other.name:
794 answer.name = self.name
795 if self.description == other.description:
796 answer.description = self.description
797 for k,v in self.annotations.iteritems():
798 if k in other.annotations and other.annotations[k] == v:
799 answer.annotations[k] = v
800
801 for k,v in self.letter_annotations.iteritems():
802 if k in other.letter_annotations:
803 answer.letter_annotations[k] = v + other.letter_annotations[k]
804 return answer
805
807 """Add another sequence or string to this sequence (from the left).
808
809 This method handles adding a Seq object (or similar, e.g. MutableSeq)
810 or a plain Python string (on the left) to a SeqRecord (on the right).
811 See the __add__ method for more details, but for example:
812
813 >>> from Bio import SeqIO
814 >>> handle = open("Quality/solexa_faked.fastq", "rU")
815 >>> record = SeqIO.read(handle, "fastq-solexa")
816 >>> handle.close()
817 >>> print record.id, record.seq
818 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
819 >>> print record.letter_annotations.keys()
820 ['solexa_quality']
821
822 >>> new = "ACT" + record
823 >>> print new.id, new.seq
824 slxa_0001_1_0001_01 ACTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
825 >>> print new.letter_annotations.keys()
826 []
827 """
828 if isinstance(other, SeqRecord):
829 raise RuntimeError("This should have happened via the __add__ of "
830 "the other SeqRecord being added!")
831
832
833 offset = len(other)
834 return SeqRecord(other + self.seq,
835 id = self.id, name = self.name,
836 description = self.description,
837 features = [f._shift(offset) for f in self.features],
838 annotations = self.annotations.copy(),
839 dbxrefs = self.dbxrefs[:])
840
842 """Returns a copy of the record with an upper case sequence.
843
844 All the annotation is preserved unchanged. e.g.
845
846 >>> from Bio.Alphabet import generic_dna
847 >>> from Bio.Seq import Seq
848 >>> from Bio.SeqRecord import SeqRecord
849 >>> record = SeqRecord(Seq("acgtACGT", generic_dna), id="Test",
850 ... description = "Made up for this example")
851 >>> record.letter_annotations["phred_quality"] = [1,2,3,4,5,6,7,8]
852 >>> print record.upper().format("fastq")
853 @Test Made up for this example
854 ACGTACGT
855 +
856 "#$%&'()
857 <BLANKLINE>
858
859 Naturally, there is a matching lower method:
860
861 >>> print record.lower().format("fastq")
862 @Test Made up for this example
863 acgtacgt
864 +
865 "#$%&'()
866 <BLANKLINE>
867 """
868 return SeqRecord(self.seq.upper(),
869 id = self.id, name = self.name,
870 description = self.description,
871 dbxrefs = self.dbxrefs[:],
872 features = self.features[:],
873 annotations = self.annotations.copy(),
874 letter_annotations=self.letter_annotations.copy())
875
877 """Returns a copy of the record with a lower case sequence.
878
879 All the annotation is preserved unchanged. e.g.
880
881 >>> from Bio import SeqIO
882 >>> record = SeqIO.read("Fasta/aster.pro", "fasta")
883 >>> print record.format("fasta")
884 >gi|3298468|dbj|BAA31520.1| SAMIPF
885 GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVG
886 VTNALVFEIVMTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI
887 <BLANKLINE>
888 >>> print record.lower().format("fasta")
889 >gi|3298468|dbj|BAA31520.1| SAMIPF
890 gghvnpavtfgafvggnitllrgivyiiaqllgstvaclllkfvtndmavgvfslsagvg
891 vtnalvfeivmtfglvytvyataidpkkgslgtiapiaigfivgani
892 <BLANKLINE>
893
894 To take a more annotation rich example,
895
896 >>> from Bio import SeqIO
897 >>> old = SeqIO.read("EMBL/TRBG361.embl", "embl")
898 >>> len(old.features)
899 3
900 >>> new = old.lower()
901 >>> len(old.features) == len(new.features)
902 True
903 >>> old.annotations["organism"] == new.annotations["organism"]
904 True
905 >>> old.dbxrefs == new.dbxrefs
906 True
907 """
908 return SeqRecord(self.seq.lower(),
909 id = self.id, name = self.name,
910 description = self.description,
911 dbxrefs = self.dbxrefs[:],
912 features = self.features[:],
913 annotations = self.annotations.copy(),
914 letter_annotations=self.letter_annotations.copy())
915
917 """Run the Bio.SeqRecord module's doctests (PRIVATE).
918
919 This will try and locate the unit tests directory, and run the doctests
920 from there in order that the relative paths used in the examples work.
921 """
922 import doctest
923 import os
924 if os.path.isdir(os.path.join("..","Tests")):
925 print "Runing doctests..."
926 cur_dir = os.path.abspath(os.curdir)
927 os.chdir(os.path.join("..","Tests"))
928 doctest.testmod()
929 os.chdir(cur_dir)
930 del cur_dir
931 print "Done"
932 elif os.path.isdir(os.path.join("Tests")) :
933 print "Runing doctests..."
934 cur_dir = os.path.abspath(os.curdir)
935 os.chdir(os.path.join("Tests"))
936 doctest.testmod()
937 os.chdir(cur_dir)
938 del cur_dir
939 print "Done"
940
941 if __name__ == "__main__":
942 _test()
943