Package Bio :: Module SeqRecord
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqRecord

  1  # Copyright 2000-2002 Andrew Dalke. 
  2  # Copyright 2002-2004 Brad Chapman. 
  3  # Copyright 2006-2009 by Peter Cock. 
  4  # All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  """Represent a Sequence Record, a sequence with annotation.""" 
  9  __docformat__ = "epytext en" #Simple markup to show doctests nicely 
 10   
 11  # NEEDS TO BE SYNCH WITH THE REST OF BIOPYTHON AND BIOPERL 
 12  # In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes 
 13  # need to be in sync (this is the BioSQL "Database SeqRecord", see 
 14  # also BioSQL.BioSeq.DBSeq which is the "Database Seq" class) 
 15   
16 -class _RestrictedDict(dict):
17 """Dict which only allows sequences of given length as values (PRIVATE). 18 19 This simple subclass of the Python dictionary is used in the SeqRecord 20 object for holding per-letter-annotations. This class is intended to 21 prevent simple errors by only allowing python sequences (e.g. lists, 22 strings and tuples) to be stored, and only if their length matches that 23 expected (the length of the SeqRecord's seq object). It cannot however 24 prevent the entries being edited in situ (for example appending entries 25 to a list). 26 """
27 - def __init__(self, length):
28 """Create an EMPTY restricted dictionary.""" 29 dict.__init__(self) 30 self._length = int(length)
31 - def __setitem__(self, key, value):
32 if not hasattr(value,"__len__") or not hasattr(value,"__getitem__") \ 33 or len(value) != self._length: 34 raise TypeError("We only allow python sequences (lists, tuples or " 35 "strings) of length %i." % self._length) 36 dict.__setitem__(self, key, value)
37 - def update(self, new_dict):
38 #Force this to go via our strict __setitem__ method 39 for (key, value) in new_dict.iteritems(): 40 self[key] = value
41
42 -class SeqRecord(object):
43 """A SeqRecord object holds a sequence and information about it. 44 45 Main attributes: 46 - id - Identifier such as a locus tag (string) 47 - seq - The sequence itself (Seq object or similar) 48 49 Additional attributes: 50 - name - Sequence name, e.g. gene name (string) 51 - description - Additional text (string) 52 - dbxrefs - List of database cross references (list of strings) 53 - features - Any (sub)features defined (list of SeqFeature objects) 54 - annotations - Further information about the whole sequence (dictionary) 55 Most entries are strings, or lists of strings. 56 - letter_annotations - Per letter/symbol annotation (restricted 57 dictionary). This holds Python sequences (lists, strings 58 or tuples) whose length matches that of the sequence. 59 A typical use would be to hold a list of integers 60 representing sequencing quality scores, or a string 61 representing the secondary structure. 62 63 You will typically use Bio.SeqIO to read in sequences from files as 64 SeqRecord objects. However, you may want to create your own SeqRecord 65 objects directly (see the __init__ method for further details): 66 67 >>> from Bio.Seq import Seq 68 >>> from Bio.SeqRecord import SeqRecord 69 >>> from Bio.Alphabet import IUPAC 70 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 71 ... IUPAC.protein), 72 ... id="YP_025292.1", name="HokC", 73 ... description="toxic membrane protein") 74 >>> print record 75 ID: YP_025292.1 76 Name: HokC 77 Description: toxic membrane protein 78 Number of features: 0 79 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 80 81 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO 82 for this. For the special case where you want the SeqRecord turned into 83 a string in a particular file format there is a format method which uses 84 Bio.SeqIO internally: 85 86 >>> print record.format("fasta") 87 >YP_025292.1 toxic membrane protein 88 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 89 <BLANKLINE> 90 91 You can also do things like slicing a SeqRecord, checking its length, etc 92 93 >>> len(record) 94 44 95 >>> edited = record[:10] + record[11:] 96 >>> print edited.seq 97 MKQHKAMIVAIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 98 >>> print record.seq 99 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 100 101 """
102 - def __init__(self, seq, id = "<unknown id>", name = "<unknown name>", 103 description = "<unknown description>", dbxrefs = None, 104 features = None, annotations = None, 105 letter_annotations = None):
106 """Create a SeqRecord. 107 108 Arguments: 109 - seq - Sequence, required (Seq, MutableSeq or UnknownSeq) 110 - id - Sequence identifier, recommended (string) 111 - name - Sequence name, optional (string) 112 - description - Sequence description, optional (string) 113 - dbxrefs - Database cross references, optional (list of strings) 114 - features - Any (sub)features, optional (list of SeqFeature objects) 115 - annotations - Dictionary of annotations for the whole sequence 116 - letter_annotations - Dictionary of per-letter-annotations, values 117 should be strings, list or tuples of the same 118 length as the full sequence. 119 120 You will typically use Bio.SeqIO to read in sequences from files as 121 SeqRecord objects. However, you may want to create your own SeqRecord 122 objects directly. 123 124 Note that while an id is optional, we strongly recommend you supply a 125 unique id string for each record. This is especially important 126 if you wish to write your sequences to a file. 127 128 If you don't have the actual sequence, but you do know its length, 129 then using the UnknownSeq object from Bio.Seq is appropriate. 130 131 You can create a 'blank' SeqRecord object, and then populate the 132 attributes later. 133 """ 134 if id is not None and not isinstance(id, basestring): 135 #Lots of existing code uses id=None... this may be a bad idea. 136 raise TypeError("id argument should be a string") 137 if not isinstance(name, basestring): 138 raise TypeError("name argument should be a string") 139 if not isinstance(description, basestring): 140 raise TypeError("description argument should be a string") 141 self._seq = seq 142 self.id = id 143 self.name = name 144 self.description = description 145 146 # database cross references (for the whole sequence) 147 if dbxrefs is None: 148 dbxrefs = [] 149 elif not isinstance(dbxrefs, list): 150 raise TypeError("dbxrefs argument should be a list (of strings)") 151 self.dbxrefs = dbxrefs 152 153 # annotations about the whole sequence 154 if annotations is None: 155 annotations = {} 156 elif not isinstance(annotations, dict): 157 raise TypeError("annotations argument should be a dict") 158 self.annotations = annotations 159 160 if letter_annotations is None: 161 # annotations about each letter in the sequence 162 if seq is None: 163 #Should we allow this and use a normal unrestricted dict? 164 self._per_letter_annotations = _RestrictedDict(length=0) 165 else: 166 try: 167 self._per_letter_annotations = \ 168 _RestrictedDict(length=len(seq)) 169 except: 170 raise TypeError("seq argument should be a Seq object or similar") 171 else: 172 #This will be handled via the property set function, which will 173 #turn this into a _RestrictedDict and thus ensure all the values 174 #in the dict are the right length 175 self.letter_annotations = letter_annotations 176 177 # annotations about parts of the sequence 178 if features is None: 179 features = [] 180 elif not isinstance(features, list): 181 raise TypeError("features argument should be a list (of SeqFeature objects)") 182 self.features = features
183 184 #TODO - Just make this a read only property?
185 - def _set_per_letter_annotations(self, value):
186 if not isinstance(value, dict): 187 raise TypeError("The per-letter-annotations should be a " 188 "(restricted) dictionary.") 189 #Turn this into a restricted-dictionary (and check the entries) 190 try: 191 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 192 except AttributeError: 193 #e.g. seq is None 194 self._per_letter_annotations = _RestrictedDict(length=0) 195 self._per_letter_annotations.update(value)
196 letter_annotations = property( \ 197 fget=lambda self : self._per_letter_annotations, 198 fset=_set_per_letter_annotations, 199 doc="""Dictionary of per-letter-annotation for the sequence. 200 201 For example, this can hold quality scores used in FASTQ or QUAL files. 202 Consider this example using Bio.SeqIO to read in an example Solexa 203 variant FASTQ file as a SeqRecord: 204 205 >>> from Bio import SeqIO 206 >>> handle = open("Quality/solexa_faked.fastq", "rU") 207 >>> record = SeqIO.read(handle, "fastq-solexa") 208 >>> handle.close() 209 >>> print record.id, record.seq 210 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 211 >>> print record.letter_annotations.keys() 212 ['solexa_quality'] 213 >>> print record.letter_annotations["solexa_quality"] 214 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 215 216 The letter_annotations get sliced automatically if you slice the 217 parent SeqRecord, for example taking the last ten bases: 218 219 >>> sub_record = record[-10:] 220 >>> print sub_record.id, sub_record.seq 221 slxa_0001_1_0001_01 ACGTNNNNNN 222 >>> print sub_record.letter_annotations["solexa_quality"] 223 [4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 224 225 Any python sequence (i.e. list, tuple or string) can be recorded in 226 the SeqRecord's letter_annotations dictionary as long as the length 227 matches that of the SeqRecord's sequence. e.g. 228 229 >>> len(sub_record.letter_annotations) 230 1 231 >>> sub_record.letter_annotations["dummy"] = "abcdefghij" 232 >>> len(sub_record.letter_annotations) 233 2 234 235 You can delete entries from the letter_annotations dictionary as usual: 236 237 >>> del sub_record.letter_annotations["solexa_quality"] 238 >>> sub_record.letter_annotations 239 {'dummy': 'abcdefghij'} 240 241 You can completely clear the dictionary easily as follows: 242 243 >>> sub_record.letter_annotations = {} 244 >>> sub_record.letter_annotations 245 {} 246 """) 247
248 - def _set_seq(self, value):
249 #TODO - Add a deprecation warning that the seq should be write only? 250 if self._per_letter_annotations: 251 #TODO - Make this a warning? Silently empty the dictionary? 252 raise ValueError("You must empty the letter annotations first!") 253 self._seq = value 254 try: 255 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 256 except AttributeError: 257 #e.g. seq is None 258 self._per_letter_annotations = _RestrictedDict(length=0)
259 260 seq = property(fget=lambda self : self._seq, 261 fset=_set_seq, 262 doc="The sequence itself, as a Seq or MutableSeq object.") 263
264 - def __getitem__(self, index):
265 """Returns a sub-sequence or an individual letter. 266 267 Slicing, e.g. my_record[5:10], returns a new SeqRecord for 268 that sub-sequence with approriate annotation preserved. The 269 name, id and description are kept. 270 271 Any per-letter-annotations are sliced to match the requested 272 sub-sequence. Unless a stride is used, all those features 273 which fall fully within the subsequence are included (with 274 their locations adjusted accordingly). 275 276 However, the annotations dictionary and the dbxrefs list are 277 not used for the new SeqRecord, as in general they may not 278 apply to the subsequence. If you want to preserve them, you 279 must explictly copy them to the new SeqRecord yourself. 280 281 Using an integer index, e.g. my_record[5] is shorthand for 282 extracting that letter from the sequence, my_record.seq[5]. 283 284 For example, consider this short protein and its secondary 285 structure as encoded by the PDB (e.g. H for alpha helices), 286 plus a simple feature for its histidine self phosphorylation 287 site: 288 289 >>> from Bio.Seq import Seq 290 >>> from Bio.SeqRecord import SeqRecord 291 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation 292 >>> from Bio.Alphabet import IUPAC 293 >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT" 294 ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR", 295 ... IUPAC.protein), 296 ... id="1JOY", name="EnvZ", 297 ... description="Homodimeric domain of EnvZ from E. coli") 298 >>> rec.letter_annotations["secondary_structure"] = " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT " 299 >>> rec.features.append(SeqFeature(FeatureLocation(20,21), 300 ... type = "Site")) 301 302 Now let's have a quick look at the full record, 303 304 >>> print rec 305 ID: 1JOY 306 Name: EnvZ 307 Description: Homodimeric domain of EnvZ from E. coli 308 Number of features: 1 309 Per letter annotation for: secondary_structure 310 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 311 >>> print rec.letter_annotations["secondary_structure"] 312 S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT 313 >>> print rec.features[0].location 314 [20:21] 315 316 Now let's take a sub sequence, here chosen as the first (fractured) 317 alpha helix which includes the histidine phosphorylation site: 318 319 >>> sub = rec[11:41] 320 >>> print sub 321 ID: 1JOY 322 Name: EnvZ 323 Description: Homodimeric domain of EnvZ from E. coli 324 Number of features: 1 325 Per letter annotation for: secondary_structure 326 Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein()) 327 >>> print sub.letter_annotations["secondary_structure"] 328 HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH 329 >>> print sub.features[0].location 330 [9:10] 331 332 You can also of course omit the start or end values, for 333 example to get the first ten letters only: 334 335 >>> print rec[:10] 336 ID: 1JOY 337 Name: EnvZ 338 Description: Homodimeric domain of EnvZ from E. coli 339 Number of features: 0 340 Per letter annotation for: secondary_structure 341 Seq('MAAGVKQLAD', IUPACProtein()) 342 343 Or for the last ten letters: 344 345 >>> print rec[-10:] 346 ID: 1JOY 347 Name: EnvZ 348 Description: Homodimeric domain of EnvZ from E. coli 349 Number of features: 0 350 Per letter annotation for: secondary_structure 351 Seq('IIEQFIDYLR', IUPACProtein()) 352 353 If you omit both, then you get a copy of the original record (although 354 lacking the annotations and dbxrefs): 355 356 >>> print rec[:] 357 ID: 1JOY 358 Name: EnvZ 359 Description: Homodimeric domain of EnvZ from E. coli 360 Number of features: 1 361 Per letter annotation for: secondary_structure 362 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 363 364 Finally, indexing with a simple integer is shorthand for pulling out 365 that letter from the sequence directly: 366 367 >>> rec[5] 368 'K' 369 >>> rec.seq[5] 370 'K' 371 """ 372 if isinstance(index, int): 373 #NOTE - The sequence level annotation like the id, name, etc 374 #do not really apply to a single character. However, should 375 #we try and expose any per-letter-annotation here? If so how? 376 return self.seq[index] 377 elif isinstance(index, slice): 378 if self.seq is None: 379 raise ValueError("If the sequence is None, we cannot slice it.") 380 parent_length = len(self) 381 answer = self.__class__(self.seq[index], 382 id=self.id, 383 name=self.name, 384 description=self.description) 385 #TODO - The desription may no longer apply. 386 #It would be safer to change it to something 387 #generic like "edited" or the default value. 388 389 #Don't copy the annotation dict and dbxefs list, 390 #they may not apply to a subsequence. 391 #answer.annotations = dict(self.annotations.iteritems()) 392 #answer.dbxrefs = self.dbxrefs[:] 393 #TODO - Review this in light of adding SeqRecord objects? 394 395 #TODO - Cope with strides by generating ambiguous locations? 396 if index.step is None or index.step == 1: 397 #Select relevant features, add them with shifted locations 398 if index.start is None: 399 start = 0 400 else: 401 start = index.start 402 if index.stop is None: 403 stop = -1 404 else: 405 stop = index.stop 406 if (start < 0 or stop < 0) and parent_length == 0: 407 raise ValueError, \ 408 "Cannot support negative indices without the sequence length" 409 if start < 0: 410 start = parent_length + start 411 if stop < 0: 412 stop = parent_length + stop + 1 413 #assert str(self.seq)[index] == str(self.seq)[start:stop] 414 for f in self.features: 415 if f.ref or f.ref_db: 416 #TODO - Implement this (with lots of tests)? 417 import warnings 418 warnings.warn("When slicing SeqRecord objects, any " 419 "SeqFeature referencing other sequences (e.g. " 420 "from segmented GenBank records) is ignored.") 421 continue 422 if start <= f.location.nofuzzy_start \ 423 and f.location.nofuzzy_end <= stop: 424 answer.features.append(f._shift(-start)) 425 426 #Slice all the values to match the sliced sequence 427 #(this should also work with strides, even negative strides): 428 for key, value in self.letter_annotations.iteritems(): 429 answer._per_letter_annotations[key] = value[index] 430 431 return answer 432 raise ValueError, "Invalid index"
433
434 - def __iter__(self):
435 """Iterate over the letters in the sequence. 436 437 For example, using Bio.SeqIO to read in a protein FASTA file: 438 439 >>> from Bio import SeqIO 440 >>> record = SeqIO.read(open("Fasta/loveliesbleeding.pro"),"fasta") 441 >>> for amino in record: 442 ... print amino 443 ... if amino == "L" : break 444 X 445 A 446 G 447 L 448 >>> print record.seq[3] 449 L 450 451 This is just a shortcut for iterating over the sequence directly: 452 453 >>> for amino in record.seq: 454 ... print amino 455 ... if amino == "L" : break 456 X 457 A 458 G 459 L 460 >>> print record.seq[3] 461 L 462 463 Note that this does not facilitate iteration together with any 464 per-letter-annotation. However, you can achieve that using the 465 python zip function on the record (or its sequence) and the relevant 466 per-letter-annotation: 467 468 >>> from Bio import SeqIO 469 >>> rec = SeqIO.read(open("Quality/solexa_faked.fastq", "rU"), 470 ... "fastq-solexa") 471 >>> print rec.id, rec.seq 472 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 473 >>> print rec.letter_annotations.keys() 474 ['solexa_quality'] 475 >>> for nuc, qual in zip(rec,rec.letter_annotations["solexa_quality"]): 476 ... if qual > 35: 477 ... print nuc, qual 478 A 40 479 C 39 480 G 38 481 T 37 482 A 36 483 484 You may agree that using zip(rec.seq, ...) is more explicit than using 485 zip(rec, ...) as shown above. 486 """ 487 return iter(self.seq)
488
489 - def __contains__(self, char):
490 """Implements the 'in' keyword, searches the sequence. 491 492 e.g. 493 494 >>> from Bio import SeqIO 495 >>> record = SeqIO.read(open("Fasta/sweetpea.nu"), "fasta") 496 >>> "GAATTC" in record 497 False 498 >>> "AAA" in record 499 True 500 501 This essentially acts as a proxy for using "in" on the sequence: 502 503 >>> "GAATTC" in record.seq 504 False 505 >>> "AAA" in record.seq 506 True 507 508 Note that you can also use Seq objects as the query, 509 510 >>> from Bio.Seq import Seq 511 >>> from Bio.Alphabet import generic_dna 512 >>> Seq("AAA") in record 513 True 514 >>> Seq("AAA", generic_dna) in record 515 True 516 517 See also the Seq object's __contains__ method. 518 """ 519 return char in self.seq
520 521
522 - def __str__(self):
523 """A human readable summary of the record and its annotation (string). 524 525 The python built in function str works by calling the object's ___str__ 526 method. e.g. 527 528 >>> from Bio.Seq import Seq 529 >>> from Bio.SeqRecord import SeqRecord 530 >>> from Bio.Alphabet import IUPAC 531 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 532 ... IUPAC.protein), 533 ... id="YP_025292.1", name="HokC", 534 ... description="toxic membrane protein, small") 535 >>> print str(record) 536 ID: YP_025292.1 537 Name: HokC 538 Description: toxic membrane protein, small 539 Number of features: 0 540 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 541 542 In this example you don't actually need to call str explicity, as the 543 print command does this automatically: 544 545 >>> print record 546 ID: YP_025292.1 547 Name: HokC 548 Description: toxic membrane protein, small 549 Number of features: 0 550 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 551 552 Note that long sequences are shown truncated. 553 """ 554 lines = [] 555 if self.id : lines.append("ID: %s" % self.id) 556 if self.name : lines.append("Name: %s" % self.name) 557 if self.description : lines.append("Description: %s" % self.description) 558 if self.dbxrefs : lines.append("Database cross-references: " \ 559 + ", ".join(self.dbxrefs)) 560 lines.append("Number of features: %i" % len(self.features)) 561 for a in self.annotations: 562 lines.append("/%s=%s" % (a, str(self.annotations[a]))) 563 if self.letter_annotations: 564 lines.append("Per letter annotation for: " \ 565 + ", ".join(self.letter_annotations.keys())) 566 #Don't want to include the entire sequence, 567 #and showing the alphabet is useful: 568 lines.append(repr(self.seq)) 569 return "\n".join(lines)
570
571 - def __repr__(self):
572 """A concise summary of the record for debugging (string). 573 574 The python built in function repr works by calling the object's ___repr__ 575 method. e.g. 576 577 >>> from Bio.Seq import Seq 578 >>> from Bio.SeqRecord import SeqRecord 579 >>> from Bio.Alphabet import generic_protein 580 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT" 581 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ" 582 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ" 583 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF", 584 ... generic_protein), 585 ... id="NP_418483.1", name="b4059", 586 ... description="ssDNA-binding protein", 587 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"]) 588 >>> print repr(rec) 589 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 590 591 At the python prompt you can also use this shorthand: 592 593 >>> rec 594 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 595 596 Note that long sequences are shown truncated. Also note that any 597 annotations, letter_annotations and features are not shown (as they 598 would lead to a very long string). 599 """ 600 return self.__class__.__name__ \ 601 + "(seq=%s, id=%s, name=%s, description=%s, dbxrefs=%s)" \ 602 % tuple(map(repr, (self.seq, self.id, self.name, 603 self.description, self.dbxrefs)))
604
605 - def format(self, format):
606 r"""Returns the record as a string in the specified file format. 607 608 The format should be a lower case string supported as an output 609 format by Bio.SeqIO, which is used to turn the SeqRecord into a 610 string. e.g. 611 612 >>> from Bio.Seq import Seq 613 >>> from Bio.SeqRecord import SeqRecord 614 >>> from Bio.Alphabet import IUPAC 615 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 616 ... IUPAC.protein), 617 ... id="YP_025292.1", name="HokC", 618 ... description="toxic membrane protein") 619 >>> record.format("fasta") 620 '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n' 621 >>> print record.format("fasta") 622 >YP_025292.1 toxic membrane protein 623 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 624 <BLANKLINE> 625 626 The python print command automatically appends a new line, meaning 627 in this example a blank line is shown. If you look at the string 628 representation you can see there is a trailing new line (shown as 629 slash n) which is important when writing to a file or if 630 concatenating mutliple sequence strings together. 631 632 Note that this method will NOT work on every possible file format 633 supported by Bio.SeqIO (e.g. some are for multiple sequences only). 634 """ 635 #See also the __format__ added for Python 2.6 / 3.0, PEP 3101 636 #See also the Bio.Align.Generic.Alignment class and its format() 637 return self.__format__(format)
638
639 - def __format__(self, format_spec):
640 """Returns the record as a string in the specified file format. 641 642 This method supports the python format() function added in 643 Python 2.6/3.0. The format_spec should be a lower case string 644 supported by Bio.SeqIO as an output file format. See also the 645 SeqRecord's format() method. 646 """ 647 if not format_spec: 648 #Follow python convention and default to using __str__ 649 return str(self) 650 from Bio import SeqIO 651 if format_spec in SeqIO._BinaryFormats: 652 #Return bytes on Python 3 653 try: 654 #This is in Python 2.6+, but we need it on Python 3 655 from io import BytesIO 656 handle = BytesIO() 657 except ImportError: 658 #Must be on Python 2.5 or older 659 from StringIO import StringIO 660 handle = StringIO() 661 else: 662 from StringIO import StringIO 663 handle = StringIO() 664 SeqIO.write(self, handle, format_spec) 665 return handle.getvalue()
666
667 - def __len__(self):
668 """Returns the length of the sequence. 669 670 For example, using Bio.SeqIO to read in a FASTA nucleotide file: 671 672 >>> from Bio import SeqIO 673 >>> record = SeqIO.read(open("Fasta/sweetpea.nu"),"fasta") 674 >>> len(record) 675 309 676 >>> len(record.seq) 677 309 678 """ 679 return len(self.seq)
680
681 - def __nonzero__(self):
682 """Returns True regardless of the length of the sequence. 683 684 This behaviour is for backwards compatibility, since until the 685 __len__ method was added, a SeqRecord always evaluated as True. 686 687 Note that in comparison, a Seq object will evaluate to False if it 688 has a zero length sequence. 689 690 WARNING: The SeqRecord may in future evaluate to False when its 691 sequence is of zero length (in order to better match the Seq 692 object behaviour)! 693 """ 694 return True
695
696 - def __add__(self, other):
697 """Add another sequence or string to this sequence. 698 699 The other sequence can be a SeqRecord object, a Seq object (or 700 similar, e.g. a MutableSeq) or a plain Python string. If you add 701 a plain string or a Seq (like) object, the new SeqRecord will simply 702 have this appended to the existing data. However, any per letter 703 annotation will be lost: 704 705 >>> from Bio import SeqIO 706 >>> handle = open("Quality/solexa_faked.fastq", "rU") 707 >>> record = SeqIO.read(handle, "fastq-solexa") 708 >>> handle.close() 709 >>> print record.id, record.seq 710 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 711 >>> print record.letter_annotations.keys() 712 ['solexa_quality'] 713 714 >>> new = record + "ACT" 715 >>> print new.id, new.seq 716 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNACT 717 >>> print new.letter_annotations.keys() 718 [] 719 720 The new record will attempt to combine the annotation, but for any 721 ambiguities (e.g. different names) it defaults to omitting that 722 annotation. 723 724 >>> from Bio import SeqIO 725 >>> handle = open("GenBank/pBAD30.gb") 726 >>> plasmid = SeqIO.read(handle, "gb") 727 >>> handle.close() 728 >>> print plasmid.id, len(plasmid) 729 pBAD30 4923 730 731 Now let's cut the plasmid into two pieces, and join them back up the 732 other way round (i.e. shift the starting point on this plasmid, have 733 a look at the annotated features in the original file to see why this 734 particular split point might make sense): 735 736 >>> left = plasmid[:3765] 737 >>> right = plasmid[3765:] 738 >>> new = right + left 739 >>> print new.id, len(new) 740 pBAD30 4923 741 >>> str(new.seq) == str(right.seq + left.seq) 742 True 743 >>> len(new.features) == len(left.features) + len(right.features) 744 True 745 746 When we add the left and right SeqRecord objects, their annotation 747 is all consistent, so it is all conserved in the new SeqRecord: 748 749 >>> new.id == left.id == right.id == plasmid.id 750 True 751 >>> new.name == left.name == right.name == plasmid.name 752 True 753 >>> new.description == plasmid.description 754 True 755 >>> new.annotations == left.annotations == right.annotations 756 True 757 >>> new.letter_annotations == plasmid.letter_annotations 758 True 759 >>> new.dbxrefs == left.dbxrefs == right.dbxrefs 760 True 761 762 However, we should point out that when we sliced the SeqRecord, 763 any annotations dictionary or dbxrefs list entries were lost. 764 You can explicitly copy them like this: 765 766 >>> new.annotations = plasmid.annotations.copy() 767 >>> new.dbxrefs = plasmid.dbxrefs[:] 768 """ 769 if not isinstance(other, SeqRecord): 770 #Assume it is a string or a Seq. 771 #Note can't transfer any per-letter-annotations 772 return SeqRecord(self.seq + other, 773 id = self.id, name = self.name, 774 description = self.description, 775 features = self.features[:], 776 annotations = self.annotations.copy(), 777 dbxrefs = self.dbxrefs[:]) 778 #Adding two SeqRecord objects... must merge annotation. 779 answer = SeqRecord(self.seq + other.seq, 780 features = self.features[:], 781 dbxrefs = self.dbxrefs[:]) 782 #Will take all the features and all the db cross refs, 783 l = len(self) 784 for f in other.features: 785 answer.features.append(f._shift(l)) 786 del l 787 for ref in other.dbxrefs: 788 if ref not in answer.dbxrefs: 789 answer.dbxrefs.append(ref) 790 #Take common id/name/description/annotation 791 if self.id == other.id: 792 answer.id = self.id 793 if self.name == other.name: 794 answer.name = self.name 795 if self.description == other.description: 796 answer.description = self.description 797 for k,v in self.annotations.iteritems(): 798 if k in other.annotations and other.annotations[k] == v: 799 answer.annotations[k] = v 800 #Can append matching per-letter-annotation 801 for k,v in self.letter_annotations.iteritems(): 802 if k in other.letter_annotations: 803 answer.letter_annotations[k] = v + other.letter_annotations[k] 804 return answer
805
806 - def __radd__(self, other):
807 """Add another sequence or string to this sequence (from the left). 808 809 This method handles adding a Seq object (or similar, e.g. MutableSeq) 810 or a plain Python string (on the left) to a SeqRecord (on the right). 811 See the __add__ method for more details, but for example: 812 813 >>> from Bio import SeqIO 814 >>> handle = open("Quality/solexa_faked.fastq", "rU") 815 >>> record = SeqIO.read(handle, "fastq-solexa") 816 >>> handle.close() 817 >>> print record.id, record.seq 818 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 819 >>> print record.letter_annotations.keys() 820 ['solexa_quality'] 821 822 >>> new = "ACT" + record 823 >>> print new.id, new.seq 824 slxa_0001_1_0001_01 ACTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 825 >>> print new.letter_annotations.keys() 826 [] 827 """ 828 if isinstance(other, SeqRecord): 829 raise RuntimeError("This should have happened via the __add__ of " 830 "the other SeqRecord being added!") 831 #Assume it is a string or a Seq. 832 #Note can't transfer any per-letter-annotations 833 offset = len(other) 834 return SeqRecord(other + self.seq, 835 id = self.id, name = self.name, 836 description = self.description, 837 features = [f._shift(offset) for f in self.features], 838 annotations = self.annotations.copy(), 839 dbxrefs = self.dbxrefs[:])
840
841 - def upper(self):
842 """Returns a copy of the record with an upper case sequence. 843 844 All the annotation is preserved unchanged. e.g. 845 846 >>> from Bio.Alphabet import generic_dna 847 >>> from Bio.Seq import Seq 848 >>> from Bio.SeqRecord import SeqRecord 849 >>> record = SeqRecord(Seq("acgtACGT", generic_dna), id="Test", 850 ... description = "Made up for this example") 851 >>> record.letter_annotations["phred_quality"] = [1,2,3,4,5,6,7,8] 852 >>> print record.upper().format("fastq") 853 @Test Made up for this example 854 ACGTACGT 855 + 856 "#$%&'() 857 <BLANKLINE> 858 859 Naturally, there is a matching lower method: 860 861 >>> print record.lower().format("fastq") 862 @Test Made up for this example 863 acgtacgt 864 + 865 "#$%&'() 866 <BLANKLINE> 867 """ 868 return SeqRecord(self.seq.upper(), 869 id = self.id, name = self.name, 870 description = self.description, 871 dbxrefs = self.dbxrefs[:], 872 features = self.features[:], 873 annotations = self.annotations.copy(), 874 letter_annotations=self.letter_annotations.copy())
875
876 - def lower(self):
877 """Returns a copy of the record with a lower case sequence. 878 879 All the annotation is preserved unchanged. e.g. 880 881 >>> from Bio import SeqIO 882 >>> record = SeqIO.read("Fasta/aster.pro", "fasta") 883 >>> print record.format("fasta") 884 >gi|3298468|dbj|BAA31520.1| SAMIPF 885 GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVG 886 VTNALVFEIVMTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI 887 <BLANKLINE> 888 >>> print record.lower().format("fasta") 889 >gi|3298468|dbj|BAA31520.1| SAMIPF 890 gghvnpavtfgafvggnitllrgivyiiaqllgstvaclllkfvtndmavgvfslsagvg 891 vtnalvfeivmtfglvytvyataidpkkgslgtiapiaigfivgani 892 <BLANKLINE> 893 894 To take a more annotation rich example, 895 896 >>> from Bio import SeqIO 897 >>> old = SeqIO.read("EMBL/TRBG361.embl", "embl") 898 >>> len(old.features) 899 3 900 >>> new = old.lower() 901 >>> len(old.features) == len(new.features) 902 True 903 >>> old.annotations["organism"] == new.annotations["organism"] 904 True 905 >>> old.dbxrefs == new.dbxrefs 906 True 907 """ 908 return SeqRecord(self.seq.lower(), 909 id = self.id, name = self.name, 910 description = self.description, 911 dbxrefs = self.dbxrefs[:], 912 features = self.features[:], 913 annotations = self.annotations.copy(), 914 letter_annotations=self.letter_annotations.copy())
915
916 -def _test():
917 """Run the Bio.SeqRecord module's doctests (PRIVATE). 918 919 This will try and locate the unit tests directory, and run the doctests 920 from there in order that the relative paths used in the examples work. 921 """ 922 import doctest 923 import os 924 if os.path.isdir(os.path.join("..","Tests")): 925 print "Runing doctests..." 926 cur_dir = os.path.abspath(os.curdir) 927 os.chdir(os.path.join("..","Tests")) 928 doctest.testmod() 929 os.chdir(cur_dir) 930 del cur_dir 931 print "Done" 932 elif os.path.isdir(os.path.join("Tests")) : 933 print "Runing doctests..." 934 cur_dir = os.path.abspath(os.curdir) 935 os.chdir(os.path.join("Tests")) 936 doctest.testmod() 937 os.chdir(cur_dir) 938 del cur_dir 939 print "Done"
940 941 if __name__ == "__main__": 942 _test() 943