Package Bio :: Module Seq
[hide private]
[frames] | no frames]

Source Code for Module Bio.Seq

   1  # Copyright 2000-2002 Brad Chapman. 
   2  # Copyright 2004-2005 by M de Hoon. 
   3  # Copyright 2007-2010 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Provides objects to represent biological sequences with alphabets. 
   9   
  10  See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial: 
  11   - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html} 
  12   - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf} 
  13  """ 
  14  __docformat__ ="epytext en" #Don't just use plain text in epydoc API pages! 
  15   
  16  import string #for maketrans only 
  17  import array 
  18  import sys 
  19   
  20  from Bio import Alphabet 
  21  from Bio.Alphabet import IUPAC 
  22  from Bio.SeqRecord import SeqRecord 
  23  from Bio.Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement 
  24  from Bio.Data import CodonTable 
25 26 -def _maketrans(complement_mapping):
27 """Makes a python string translation table (PRIVATE). 28 29 Arguments: 30 - complement_mapping - a dictionary such as ambiguous_dna_complement 31 and ambiguous_rna_complement from Data.IUPACData. 32 33 Returns a translation table (a string of length 256) for use with the 34 python string's translate method to use in a (reverse) complement. 35 36 Compatible with lower case and upper case sequences. 37 38 For internal use only. 39 """ 40 before = ''.join(complement_mapping.keys()) 41 after = ''.join(complement_mapping.values()) 42 before = before + before.lower() 43 after = after + after.lower() 44 if sys.version_info[0] == 3 : 45 return str.maketrans(before, after) 46 else: 47 return string.maketrans(before, after)
48 49 _dna_complement_table = _maketrans(ambiguous_dna_complement) 50 _rna_complement_table = _maketrans(ambiguous_rna_complement)
51 52 -class Seq(object):
53 """A read-only sequence object (essentially a string with an alphabet). 54 55 Like normal python strings, our basic sequence object is immutable. 56 This prevents you from doing my_seq[5] = "A" for example, but does allow 57 Seq objects to be used as dictionary keys. 58 59 The Seq object provides a number of string like methods (such as count, 60 find, split and strip), which are alphabet aware where appropriate. 61 62 In addition to the string like sequence, the Seq object has an alphabet 63 property. This is an instance of an Alphabet class from Bio.Alphabet, 64 for example generic DNA, or IUPAC DNA. This describes the type of molecule 65 (e.g. RNA, DNA, protein) and may also indicate the expected symbols 66 (letters). 67 68 The Seq object also provides some biological methods, such as complement, 69 reverse_complement, transcribe, back_transcribe and translate (which are 70 not applicable to sequences with a protein alphabet). 71 """
72 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
73 """Create a Seq object. 74 75 Arguments: 76 - seq - Sequence, required (string) 77 - alphabet - Optional argument, an Alphabet object from Bio.Alphabet 78 79 You will typically use Bio.SeqIO to read in sequences from files as 80 SeqRecord objects, whose sequence will be exposed as a Seq object via 81 the seq property. 82 83 However, will often want to create your own Seq objects directly: 84 85 >>> from Bio.Seq import Seq 86 >>> from Bio.Alphabet import IUPAC 87 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 88 ... IUPAC.protein) 89 >>> my_seq 90 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 91 >>> print my_seq 92 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 93 >>> my_seq.alphabet 94 IUPACProtein() 95 96 """ 97 # Enforce string storage 98 if not isinstance(data, basestring): 99 raise TypeError("The sequence data given to a Seq object should " 100 "be a string (not another Seq object etc)") 101 self._data = data 102 self.alphabet = alphabet # Seq API requirement
103 104 # A data property is/was a Seq API requirement 105 # Note this is read only since the Seq object is meant to be imutable 106 @property
107 - def data(self) :
108 """Sequence as a string (DEPRECATED). 109 110 This is a read only property provided for backwards compatility with 111 older versions of Biopython (as is the tostring() method). We now 112 encourage you to use str(my_seq) instead of my_seq.data or the method 113 my_seq.tostring(). 114 115 In recent releases of Biopython it was possible to change a Seq object 116 by updating its data property, but this triggered a deprecation warning. 117 Now the data property is read only, since Seq objects are meant to be 118 immutable: 119 120 >>> from Bio.Seq import Seq 121 >>> from Bio.Alphabet import generic_dna 122 >>> my_seq = Seq("ACGT", generic_dna) 123 >>> str(my_seq) == my_seq.tostring() == "ACGT" 124 True 125 >>> my_seq.data = "AAAA" 126 Traceback (most recent call last): 127 ... 128 AttributeError: can't set attribute 129 """ 130 import warnings 131 import Bio 132 warnings.warn("Accessing the .data attribute is deprecated. Please " 133 "use str(my_seq) or my_seq.tostring() instead of " 134 "my_seq.data.", Bio.BiopythonDeprecationWarning) 135 return str(self)
136
137 - def __repr__(self):
138 """Returns a (truncated) representation of the sequence for debugging.""" 139 if len(self) > 60: 140 #Shows the last three letters as it is often useful to see if there 141 #is a stop codon at the end of a sequence. 142 #Note total length is 54+3+3=60 143 return "%s('%s...%s', %s)" % (self.__class__.__name__, 144 str(self)[:54], str(self)[-3:], 145 repr(self.alphabet)) 146 else: 147 return "%s(%s, %s)" % (self.__class__.__name__, 148 repr(self._data), 149 repr(self.alphabet))
150 - def __str__(self):
151 """Returns the full sequence as a python string, use str(my_seq). 152 153 Note that Biopython 1.44 and earlier would give a truncated 154 version of repr(my_seq) for str(my_seq). If you are writing code 155 which need to be backwards compatible with old Biopython, you 156 should continue to use my_seq.tostring() rather than str(my_seq). 157 """ 158 return self._data
159
160 - def __hash__(self):
161 """Hash for comparison. 162 163 See the __cmp__ documentation - we plan to change this! 164 """ 165 return id(self) #Currently use object identity for equality testing
166
167 - def __cmp__(self, other):
168 """Compare the sequence to another sequence or a string (README). 169 170 Historically comparing Seq objects has done Python object comparison. 171 After considerable discussion (keeping in mind constraints of the 172 Python language, hashes and dictionary support) a future release of 173 Biopython will change this to use simple string comparison. The plan is 174 that comparing incompatible alphabets (e.g. DNA to RNA) will trigger a 175 warning. 176 177 This version of Biopython still does Python object comparison, but with 178 a warning about this future change. During this transition period, 179 please just do explicit comparisons: 180 181 >>> seq1 = Seq("ACGT") 182 >>> seq2 = Seq("ACGT") 183 >>> id(seq1) == id(seq2) 184 False 185 >>> str(seq1) == str(seq2) 186 True 187 188 Note - This method indirectly supports ==, < , etc. 189 """ 190 if hasattr(other, "alphabet"): 191 #other should be a Seq or a MutableSeq 192 import warnings 193 warnings.warn("In future comparing Seq objects will use string " 194 "comparison (not object comparison). Incompatible " 195 "alphabets will trigger a warning (not an exception). " 196 "In the interim please use id(seq1)==id(seq2) or " 197 "str(seq1)==str(seq2) to make your code explicit " 198 "and to avoid this warning.", FutureWarning) 199 return cmp(id(self), id(other))
200
201 - def __len__(self):
202 """Returns the length of the sequence, use len(my_seq).""" 203 return len(self._data) # Seq API requirement
204
205 - def __getitem__(self, index) : # Seq API requirement
206 """Returns a subsequence of single letter, use my_seq[index].""" 207 #Note since Python 2.0, __getslice__ is deprecated 208 #and __getitem__ is used instead. 209 #See http://docs.python.org/ref/sequence-methods.html 210 if isinstance(index, int): 211 #Return a single letter as a string 212 return self._data[index] 213 else: 214 #Return the (sub)sequence as another Seq object 215 return Seq(self._data[index], self.alphabet)
216
217 - def __add__(self, other):
218 """Add another sequence or string to this sequence. 219 220 If adding a string to a Seq, the alphabet is preserved: 221 222 >>> from Bio.Seq import Seq 223 >>> from Bio.Alphabet import generic_protein 224 >>> Seq("MELKI", generic_protein) + "LV" 225 Seq('MELKILV', ProteinAlphabet()) 226 227 When adding two Seq (like) objects, the alphabets are important. 228 Consider this example: 229 230 >>> from Bio.Seq import Seq 231 >>> from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna 232 >>> unamb_dna_seq = Seq("ACGT", unambiguous_dna) 233 >>> ambig_dna_seq = Seq("ACRGT", ambiguous_dna) 234 >>> unamb_dna_seq 235 Seq('ACGT', IUPACUnambiguousDNA()) 236 >>> ambig_dna_seq 237 Seq('ACRGT', IUPACAmbiguousDNA()) 238 239 If we add the ambiguous and unambiguous IUPAC DNA alphabets, we get 240 the more general ambiguous IUPAC DNA alphabet: 241 242 >>> unamb_dna_seq + ambig_dna_seq 243 Seq('ACGTACRGT', IUPACAmbiguousDNA()) 244 245 However, if the default generic alphabet is included, the result is 246 a generic alphabet: 247 248 >>> Seq("") + ambig_dna_seq 249 Seq('ACRGT', Alphabet()) 250 251 You can't add RNA and DNA sequences: 252 253 >>> from Bio.Alphabet import generic_dna, generic_rna 254 >>> Seq("ACGT", generic_dna) + Seq("ACGU", generic_rna) 255 Traceback (most recent call last): 256 ... 257 TypeError: Incompatable alphabets DNAAlphabet() and RNAAlphabet() 258 259 You can't add nucleotide and protein sequences: 260 261 >>> from Bio.Alphabet import generic_dna, generic_protein 262 >>> Seq("ACGT", generic_dna) + Seq("MELKI", generic_protein) 263 Traceback (most recent call last): 264 ... 265 TypeError: Incompatable alphabets DNAAlphabet() and ProteinAlphabet() 266 """ 267 if hasattr(other, "alphabet"): 268 #other should be a Seq or a MutableSeq 269 if not Alphabet._check_type_compatible([self.alphabet, 270 other.alphabet]): 271 raise TypeError("Incompatable alphabets %s and %s" \ 272 % (repr(self.alphabet), repr(other.alphabet))) 273 #They should be the same sequence type (or one of them is generic) 274 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 275 return self.__class__(str(self) + str(other), a) 276 elif isinstance(other, basestring): 277 #other is a plain string - use the current alphabet 278 return self.__class__(str(self) + other, self.alphabet) 279 elif isinstance(other, SeqRecord): 280 #Get the SeqRecord's __radd__ to handle this 281 return NotImplemented 282 else : 283 raise TypeError
284
285 - def __radd__(self, other):
286 """Adding a sequence on the left. 287 288 If adding a string to a Seq, the alphabet is preserved: 289 290 >>> from Bio.Seq import Seq 291 >>> from Bio.Alphabet import generic_protein 292 >>> "LV" + Seq("MELKI", generic_protein) 293 Seq('LVMELKI', ProteinAlphabet()) 294 295 Adding two Seq (like) objects is handled via the __add__ method. 296 """ 297 if hasattr(other, "alphabet"): 298 #other should be a Seq or a MutableSeq 299 if not Alphabet._check_type_compatible([self.alphabet, 300 other.alphabet]): 301 raise TypeError("Incompatable alphabets %s and %s" \ 302 % (repr(self.alphabet), repr(other.alphabet))) 303 #They should be the same sequence type (or one of them is generic) 304 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 305 return self.__class__(str(other) + str(self), a) 306 elif isinstance(other, basestring): 307 #other is a plain string - use the current alphabet 308 return self.__class__(other + str(self), self.alphabet) 309 else: 310 raise TypeError
311
312 - def tostring(self): # Seq API requirement
313 """Returns the full sequence as a python string (semi-obsolete). 314 315 Although not formally deprecated, you are now encouraged to use 316 str(my_seq) instead of my_seq.tostring().""" 317 #TODO - Fix all places elsewhere in Biopython using this method, 318 #then start deprecation process? 319 #import warnings 320 #warnings.warn("This method is obsolete; please use str(my_seq) " 321 # "instead of my_seq.tostring().", 322 # PendingDeprecationWarning) 323 return str(self) 324
325 - def tomutable(self): # Needed? Or use a function?
326 """Returns the full sequence as a MutableSeq object. 327 328 >>> from Bio.Seq import Seq 329 >>> from Bio.Alphabet import IUPAC 330 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL", 331 ... IUPAC.protein) 332 >>> my_seq 333 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 334 >>> my_seq.tomutable() 335 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 336 337 Note that the alphabet is preserved. 338 """ 339 return MutableSeq(str(self), self.alphabet) 340
341 - def _get_seq_str_and_check_alphabet(self, other_sequence):
342 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE). 343 344 For a string argument, returns the string. 345 346 For a Seq or MutableSeq, it checks the alphabet is compatible 347 (raising an exception if it isn't), and then returns a string. 348 """ 349 try: 350 other_alpha = other_sequence.alphabet 351 except AttributeError: 352 #Assume other_sequence is a string 353 return other_sequence 354 355 #Other should be a Seq or a MutableSeq 356 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]): 357 raise TypeError("Incompatable alphabets %s and %s" \ 358 % (repr(self.alphabet), repr(other_alpha))) 359 #Return as a string 360 return str(other_sequence)
361
362 - def count(self, sub, start=0, end=sys.maxint):
363 """Non-overlapping count method, like that of a python string. 364 365 This behaves like the python string method of the same name, 366 which does a non-overlapping count! 367 368 Returns an integer, the number of occurrences of substring 369 argument sub in the (sub)sequence given by [start:end]. 370 Optional arguments start and end are interpreted as in slice 371 notation. 372 373 Arguments: 374 - sub - a string or another Seq object to look for 375 - start - optional integer, slice start 376 - end - optional integer, slice end 377 378 e.g. 379 380 >>> from Bio.Seq import Seq 381 >>> my_seq = Seq("AAAATGA") 382 >>> print my_seq.count("A") 383 5 384 >>> print my_seq.count("ATG") 385 1 386 >>> print my_seq.count(Seq("AT")) 387 1 388 >>> print my_seq.count("AT", 2, -1) 389 1 390 391 HOWEVER, please note because python strings and Seq objects (and 392 MutableSeq objects) do a non-overlapping search, this may not give 393 the answer you expect: 394 395 >>> "AAAA".count("AA") 396 2 397 >>> print Seq("AAAA").count("AA") 398 2 399 400 A non-overlapping search would give the answer as three! 401 """ 402 #If it has one, check the alphabet: 403 sub_str = self._get_seq_str_and_check_alphabet(sub) 404 return str(self).count(sub_str, start, end)
405
406 - def __contains__(self, char):
407 """Implements the 'in' keyword, like a python string. 408 409 e.g. 410 411 >>> from Bio.Seq import Seq 412 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein 413 >>> my_dna = Seq("ATATGAAATTTGAAAA", generic_dna) 414 >>> "AAA" in my_dna 415 True 416 >>> Seq("AAA") in my_dna 417 True 418 >>> Seq("AAA", generic_dna) in my_dna 419 True 420 421 Like other Seq methods, this will raise a type error if another Seq 422 (or Seq like) object with an incompatible alphabet is used: 423 424 >>> Seq("AAA", generic_rna) in my_dna 425 Traceback (most recent call last): 426 ... 427 TypeError: Incompatable alphabets DNAAlphabet() and RNAAlphabet() 428 >>> Seq("AAA", generic_protein) in my_dna 429 Traceback (most recent call last): 430 ... 431 TypeError: Incompatable alphabets DNAAlphabet() and ProteinAlphabet() 432 """ 433 #If it has one, check the alphabet: 434 sub_str = self._get_seq_str_and_check_alphabet(char) 435 return sub_str in str(self)
436
437 - def find(self, sub, start=0, end=sys.maxint):
438 """Find method, like that of a python string. 439 440 This behaves like the python string method of the same name. 441 442 Returns an integer, the index of the first occurrence of substring 443 argument sub in the (sub)sequence given by [start:end]. 444 445 Arguments: 446 - sub - a string or another Seq object to look for 447 - start - optional integer, slice start 448 - end - optional integer, slice end 449 450 Returns -1 if the subsequence is NOT found. 451 452 e.g. Locating the first typical start codon, AUG, in an RNA sequence: 453 454 >>> from Bio.Seq import Seq 455 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 456 >>> my_rna.find("AUG") 457 3 458 """ 459 #If it has one, check the alphabet: 460 sub_str = self._get_seq_str_and_check_alphabet(sub) 461 return str(self).find(sub_str, start, end)
462
463 - def rfind(self, sub, start=0, end=sys.maxint):
464 """Find from right method, like that of a python string. 465 466 This behaves like the python string method of the same name. 467 468 Returns an integer, the index of the last (right most) occurrence of 469 substring argument sub in the (sub)sequence given by [start:end]. 470 471 Arguments: 472 - sub - a string or another Seq object to look for 473 - start - optional integer, slice start 474 - end - optional integer, slice end 475 476 Returns -1 if the subsequence is NOT found. 477 478 e.g. Locating the last typical start codon, AUG, in an RNA sequence: 479 480 >>> from Bio.Seq import Seq 481 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 482 >>> my_rna.rfind("AUG") 483 15 484 """ 485 #If it has one, check the alphabet: 486 sub_str = self._get_seq_str_and_check_alphabet(sub) 487 return str(self).rfind(sub_str, start, end)
488
489 - def startswith(self, prefix, start=0, end=sys.maxint):
490 """Does the Seq start with the given prefix? Returns True/False. 491 492 This behaves like the python string method of the same name. 493 494 Return True if the sequence starts with the specified prefix 495 (a string or another Seq object), False otherwise. 496 With optional start, test sequence beginning at that position. 497 With optional end, stop comparing sequence at that position. 498 prefix can also be a tuple of strings to try. e.g. 499 500 >>> from Bio.Seq import Seq 501 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 502 >>> my_rna.startswith("GUC") 503 True 504 >>> my_rna.startswith("AUG") 505 False 506 >>> my_rna.startswith("AUG", 3) 507 True 508 >>> my_rna.startswith(("UCC","UCA","UCG"),1) 509 True 510 """ 511 #If it has one, check the alphabet: 512 if isinstance(prefix, tuple): 513 #TODO - Once we drop support for Python 2.4, instead of this 514 #loop offload to the string method (requires Python 2.5+). 515 #Check all the alphabets first... 516 prefix_strings = [self._get_seq_str_and_check_alphabet(p) \ 517 for p in prefix] 518 for prefix_str in prefix_strings: 519 if str(self).startswith(prefix_str, start, end): 520 return True 521 return False 522 else: 523 prefix_str = self._get_seq_str_and_check_alphabet(prefix) 524 return str(self).startswith(prefix_str, start, end)
525
526 - def endswith(self, suffix, start=0, end=sys.maxint):
527 """Does the Seq end with the given suffix? Returns True/False. 528 529 This behaves like the python string method of the same name. 530 531 Return True if the sequence ends with the specified suffix 532 (a string or another Seq object), False otherwise. 533 With optional start, test sequence beginning at that position. 534 With optional end, stop comparing sequence at that position. 535 suffix can also be a tuple of strings to try. e.g. 536 537 >>> from Bio.Seq import Seq 538 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 539 >>> my_rna.endswith("UUG") 540 True 541 >>> my_rna.endswith("AUG") 542 False 543 >>> my_rna.endswith("AUG", 0, 18) 544 True 545 >>> my_rna.endswith(("UCC","UCA","UUG")) 546 True 547 """ 548 #If it has one, check the alphabet: 549 if isinstance(suffix, tuple): 550 #TODO - Once we drop support for Python 2.4, instead of this 551 #loop offload to the string method (requires Python 2.5+). 552 #Check all the alphabets first... 553 suffix_strings = [self._get_seq_str_and_check_alphabet(p) \ 554 for p in suffix] 555 for suffix_str in suffix_strings: 556 if str(self).endswith(suffix_str, start, end): 557 return True 558 return False 559 else: 560 suffix_str = self._get_seq_str_and_check_alphabet(suffix) 561 return str(self).endswith(suffix_str, start, end)
562 563
564 - def split(self, sep=None, maxsplit=-1):
565 """Split method, like that of a python string. 566 567 This behaves like the python string method of the same name. 568 569 Return a list of the 'words' in the string (as Seq objects), 570 using sep as the delimiter string. If maxsplit is given, at 571 most maxsplit splits are done. If maxsplit is ommited, all 572 splits are made. 573 574 Following the python string method, sep will by default be any 575 white space (tabs, spaces, newlines) but this is unlikely to 576 apply to biological sequences. 577 578 e.g. 579 580 >>> from Bio.Seq import Seq 581 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 582 >>> my_aa = my_rna.translate() 583 >>> my_aa 584 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*')) 585 >>> my_aa.split("*") 586 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 587 >>> my_aa.split("*",1) 588 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 589 590 See also the rsplit method: 591 592 >>> my_aa.rsplit("*",1) 593 [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 594 """ 595 #If it has one, check the alphabet: 596 sep_str = self._get_seq_str_and_check_alphabet(sep) 597 #TODO - If the sep is the defined stop symbol, or gap char, 598 #should we adjust the alphabet? 599 return [Seq(part, self.alphabet) \ 600 for part in str(self).split(sep_str, maxsplit)]
601
602 - def rsplit(self, sep=None, maxsplit=-1):
603 """Right split method, like that of a python string. 604 605 This behaves like the python string method of the same name. 606 607 Return a list of the 'words' in the string (as Seq objects), 608 using sep as the delimiter string. If maxsplit is given, at 609 most maxsplit splits are done COUNTING FROM THE RIGHT. 610 If maxsplit is ommited, all splits are made. 611 612 Following the python string method, sep will by default be any 613 white space (tabs, spaces, newlines) but this is unlikely to 614 apply to biological sequences. 615 616 e.g. print my_seq.rsplit("*",1) 617 618 See also the split method. 619 """ 620 #If it has one, check the alphabet: 621 sep_str = self._get_seq_str_and_check_alphabet(sep) 622 return [Seq(part, self.alphabet) \ 623 for part in str(self).rsplit(sep_str, maxsplit)]
624
625 - def strip(self, chars=None):
626 """Returns a new Seq object with leading and trailing ends stripped. 627 628 This behaves like the python string method of the same name. 629 630 Optional argument chars defines which characters to remove. If 631 ommitted or None (default) then as for the python string method, 632 this defaults to removing any white space. 633 634 e.g. print my_seq.strip("-") 635 636 See also the lstrip and rstrip methods. 637 """ 638 #If it has one, check the alphabet: 639 strip_str = self._get_seq_str_and_check_alphabet(chars) 640 return Seq(str(self).strip(strip_str), self.alphabet)
641
642 - def lstrip(self, chars=None):
643 """Returns a new Seq object with leading (left) end stripped. 644 645 This behaves like the python string method of the same name. 646 647 Optional argument chars defines which characters to remove. If 648 ommitted or None (default) then as for the python string method, 649 this defaults to removing any white space. 650 651 e.g. print my_seq.lstrip("-") 652 653 See also the strip and rstrip methods. 654 """ 655 #If it has one, check the alphabet: 656 strip_str = self._get_seq_str_and_check_alphabet(chars) 657 return Seq(str(self).lstrip(strip_str), self.alphabet)
658
659 - def rstrip(self, chars=None):
660 """Returns a new Seq object with trailing (right) end stripped. 661 662 This behaves like the python string method of the same name. 663 664 Optional argument chars defines which characters to remove. If 665 ommitted or None (default) then as for the python string method, 666 this defaults to removing any white space. 667 668 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail): 669 670 >>> from Bio.Alphabet import IUPAC 671 >>> from Bio.Seq import Seq 672 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna) 673 >>> my_seq 674 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA()) 675 >>> my_seq.rstrip("A") 676 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA()) 677 678 See also the strip and lstrip methods. 679 """ 680 #If it has one, check the alphabet: 681 strip_str = self._get_seq_str_and_check_alphabet(chars) 682 return Seq(str(self).rstrip(strip_str), self.alphabet)
683
684 - def upper(self):
685 """Returns an upper case copy of the sequence. 686 687 >>> from Bio.Alphabet import HasStopCodon, generic_protein 688 >>> from Bio.Seq import Seq 689 >>> my_seq = Seq("VHLTPeeK*", HasStopCodon(generic_protein)) 690 >>> my_seq 691 Seq('VHLTPeeK*', HasStopCodon(ProteinAlphabet(), '*')) 692 >>> my_seq.lower() 693 Seq('vhltpeek*', HasStopCodon(ProteinAlphabet(), '*')) 694 >>> my_seq.upper() 695 Seq('VHLTPEEK*', HasStopCodon(ProteinAlphabet(), '*')) 696 697 This will adjust the alphabet if required. See also the lower method. 698 """ 699 return Seq(str(self).upper(), self.alphabet._upper())
700
701 - def lower(self):
702 """Returns a lower case copy of the sequence. 703 704 This will adjust the alphabet if required. Note that the IUPAC alphabets 705 are upper case only, and thus a generic alphabet must be substituted. 706 707 >>> from Bio.Alphabet import Gapped, generic_dna 708 >>> from Bio.Alphabet import IUPAC 709 >>> from Bio.Seq import Seq 710 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAG*AAAAAA", Gapped(IUPAC.unambiguous_dna, "*")) 711 >>> my_seq 712 Seq('CGGTACGCTTATGTCACGTAG*AAAAAA', Gapped(IUPACUnambiguousDNA(), '*')) 713 >>> my_seq.lower() 714 Seq('cggtacgcttatgtcacgtag*aaaaaa', Gapped(DNAAlphabet(), '*')) 715 716 See also the upper method. 717 """ 718 return Seq(str(self).lower(), self.alphabet._lower())
719
720 - def complement(self):
721 """Returns the complement sequence. New Seq object. 722 723 >>> from Bio.Seq import Seq 724 >>> from Bio.Alphabet import IUPAC 725 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna) 726 >>> my_dna 727 Seq('CCCCCGATAG', IUPACUnambiguousDNA()) 728 >>> my_dna.complement() 729 Seq('GGGGGCTATC', IUPACUnambiguousDNA()) 730 731 You can of course used mixed case sequences, 732 733 >>> from Bio.Seq import Seq 734 >>> from Bio.Alphabet import generic_dna 735 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna) 736 >>> my_dna 737 Seq('CCCCCgatA-GD', DNAAlphabet()) 738 >>> my_dna.complement() 739 Seq('GGGGGctaT-CH', DNAAlphabet()) 740 741 Note in the above example, ambiguous character D denotes 742 G, A or T so its complement is H (for C, T or A). 743 744 Trying to complement a protein sequence raises an exception. 745 746 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 747 >>> my_protein.complement() 748 Traceback (most recent call last): 749 ... 750 ValueError: Proteins do not have complements! 751 """ 752 base = Alphabet._get_base_alphabet(self.alphabet) 753 if isinstance(base, Alphabet.ProteinAlphabet): 754 raise ValueError("Proteins do not have complements!") 755 if isinstance(base, Alphabet.DNAAlphabet): 756 ttable = _dna_complement_table 757 elif isinstance(base, Alphabet.RNAAlphabet): 758 ttable = _rna_complement_table 759 elif ('U' in self._data or 'u' in self._data) \ 760 and ('T' in self._data or 't' in self._data): 761 #TODO - Handle this cleanly? 762 raise ValueError("Mixed RNA/DNA found") 763 elif 'U' in self._data or 'u' in self._data: 764 ttable = _rna_complement_table 765 else: 766 ttable = _dna_complement_table 767 #Much faster on really long sequences than the previous loop based one. 768 #thx to Michael Palmer, University of Waterloo 769 return Seq(str(self).translate(ttable), self.alphabet)
770
771 - def reverse_complement(self):
772 """Returns the reverse complement sequence. New Seq object. 773 774 >>> from Bio.Seq import Seq 775 >>> from Bio.Alphabet import IUPAC 776 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna) 777 >>> my_dna 778 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA()) 779 >>> my_dna.reverse_complement() 780 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA()) 781 782 Note in the above example, since R = G or A, its complement 783 is Y (which denotes C or T). 784 785 You can of course used mixed case sequences, 786 787 >>> from Bio.Seq import Seq 788 >>> from Bio.Alphabet import generic_dna 789 >>> my_dna = Seq("CCCCCgatA-G", generic_dna) 790 >>> my_dna 791 Seq('CCCCCgatA-G', DNAAlphabet()) 792 >>> my_dna.reverse_complement() 793 Seq('C-TatcGGGGG', DNAAlphabet()) 794 795 Trying to complement a protein sequence raises an exception: 796 797 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 798 >>> my_protein.reverse_complement() 799 Traceback (most recent call last): 800 ... 801 ValueError: Proteins do not have complements! 802 """ 803 #Use -1 stride/step to reverse the complement 804 return self.complement()[::-1]
805
806 - def transcribe(self):
807 """Returns the RNA sequence from a DNA sequence. New Seq object. 808 809 >>> from Bio.Seq import Seq 810 >>> from Bio.Alphabet import IUPAC 811 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", 812 ... IUPAC.unambiguous_dna) 813 >>> coding_dna 814 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 815 >>> coding_dna.transcribe() 816 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 817 818 Trying to transcribe a protein or RNA sequence raises an exception: 819 820 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 821 >>> my_protein.transcribe() 822 Traceback (most recent call last): 823 ... 824 ValueError: Proteins cannot be transcribed! 825 """ 826 base = Alphabet._get_base_alphabet(self.alphabet) 827 if isinstance(base, Alphabet.ProteinAlphabet): 828 raise ValueError("Proteins cannot be transcribed!") 829 if isinstance(base, Alphabet.RNAAlphabet): 830 raise ValueError("RNA cannot be transcribed!") 831 832 if self.alphabet==IUPAC.unambiguous_dna: 833 alphabet = IUPAC.unambiguous_rna 834 elif self.alphabet==IUPAC.ambiguous_dna: 835 alphabet = IUPAC.ambiguous_rna 836 else: 837 alphabet = Alphabet.generic_rna 838 return Seq(str(self).replace('T','U').replace('t','u'), alphabet)
839
840 - def back_transcribe(self):
841 """Returns the DNA sequence from an RNA sequence. New Seq object. 842 843 >>> from Bio.Seq import Seq 844 >>> from Bio.Alphabet import IUPAC 845 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", 846 ... IUPAC.unambiguous_rna) 847 >>> messenger_rna 848 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 849 >>> messenger_rna.back_transcribe() 850 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 851 852 Trying to back-transcribe a protein or DNA sequence raises an 853 exception: 854 855 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 856 >>> my_protein.back_transcribe() 857 Traceback (most recent call last): 858 ... 859 ValueError: Proteins cannot be back transcribed! 860 """ 861 base = Alphabet._get_base_alphabet(self.alphabet) 862 if isinstance(base, Alphabet.ProteinAlphabet): 863 raise ValueError("Proteins cannot be back transcribed!") 864 if isinstance(base, Alphabet.DNAAlphabet): 865 raise ValueError("DNA cannot be back transcribed!") 866 867 if self.alphabet==IUPAC.unambiguous_rna: 868 alphabet = IUPAC.unambiguous_dna 869 elif self.alphabet==IUPAC.ambiguous_rna: 870 alphabet = IUPAC.ambiguous_dna 871 else: 872 alphabet = Alphabet.generic_dna 873 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
874
875 - def translate(self, table="Standard", stop_symbol="*", to_stop=False, 876 cds=False):
877 """Turns a nucleotide sequence into a protein sequence. New Seq object. 878 879 This method will translate DNA or RNA sequences, and those with a 880 nucleotide or generic alphabet. Trying to translate a protein 881 sequence raises an exception. 882 883 Arguments: 884 - table - Which codon table to use? This can be either a name 885 (string), an NCBI identifier (integer), or a CodonTable 886 object (useful for non-standard genetic codes). This 887 defaults to the "Standard" table. 888 - stop_symbol - Single character string, what to use for terminators. 889 This defaults to the asterisk, "*". 890 - to_stop - Boolean, defaults to False meaning do a full translation 891 continuing on past any stop codons (translated as the 892 specified stop_symbol). If True, translation is 893 terminated at the first in frame stop codon (and the 894 stop_symbol is not appended to the returned protein 895 sequence). 896 - cds - Boolean, indicates this is a complete CDS. If True, 897 this checks the sequence starts with a valid alternative start 898 codon (which will be translated as methionine, M), that the 899 sequence length is a multiple of three, and that there is a 900 single in frame stop codon at the end (this will be excluded 901 from the protein sequence, regardless of the to_stop option). 902 If these tests fail, an exception is raised. 903 904 e.g. Using the standard table: 905 906 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") 907 >>> coding_dna.translate() 908 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 909 >>> coding_dna.translate(stop_symbol="@") 910 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@')) 911 >>> coding_dna.translate(to_stop=True) 912 Seq('VAIVMGR', ExtendedIUPACProtein()) 913 914 Now using NCBI table 2, where TGA is not a stop codon: 915 916 >>> coding_dna.translate(table=2) 917 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 918 >>> coding_dna.translate(table=2, to_stop=True) 919 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein()) 920 921 In fact, GTG is an alternative start codon under NCBI table 2, meaning 922 this sequence could be a complete CDS: 923 924 >>> coding_dna.translate(table=2, cds=True) 925 Seq('MAIVMGRWKGAR', ExtendedIUPACProtein()) 926 927 It isn't a valid CDS under NCBI table 1, due to both the start codon and 928 also the in frame stop codons: 929 930 >>> coding_dna.translate(table=1, cds=True) 931 Traceback (most recent call last): 932 ... 933 TranslationError: First codon 'GTG' is not a start codon 934 935 If the sequence has no in-frame stop codon, then the to_stop argument 936 has no effect: 937 938 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC") 939 >>> coding_dna2.translate() 940 Seq('LAIVMGR', ExtendedIUPACProtein()) 941 >>> coding_dna2.translate(to_stop=True) 942 Seq('LAIVMGR', ExtendedIUPACProtein()) 943 944 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 945 or a stop codon. These are translated as "X". Any invalid codon 946 (e.g. "TA?" or "T-A") will throw a TranslationError. 947 948 NOTE - Does NOT support gapped sequences. 949 950 NOTE - This does NOT behave like the python string's translate 951 method. For that use str(my_seq).translate(...) instead. 952 """ 953 if isinstance(table, str) and len(table)==256: 954 raise ValueError("The Seq object translate method DOES NOT take " \ 955 + "a 256 character string mapping table like " \ 956 + "the python string object's translate method. " \ 957 + "Use str(my_seq).translate(...) instead.") 958 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 959 Alphabet.ProteinAlphabet): 960 raise ValueError("Proteins cannot be translated!") 961 try: 962 table_id = int(table) 963 except ValueError: 964 #Assume its a table name 965 if self.alphabet==IUPAC.unambiguous_dna: 966 #Will use standard IUPAC protein alphabet, no need for X 967 codon_table = CodonTable.unambiguous_dna_by_name[table] 968 elif self.alphabet==IUPAC.unambiguous_rna: 969 #Will use standard IUPAC protein alphabet, no need for X 970 codon_table = CodonTable.unambiguous_rna_by_name[table] 971 else: 972 #This will use the extended IUPAC protein alphabet with X etc. 973 #The same table can be used for RNA or DNA (we use this for 974 #translating strings). 975 codon_table = CodonTable.ambiguous_generic_by_name[table] 976 except (AttributeError, TypeError): 977 #Assume its a CodonTable object 978 if isinstance(table, CodonTable.CodonTable): 979 codon_table = table 980 else: 981 raise ValueError('Bad table argument') 982 else: 983 #Assume its a table ID 984 if self.alphabet==IUPAC.unambiguous_dna: 985 #Will use standard IUPAC protein alphabet, no need for X 986 codon_table = CodonTable.unambiguous_dna_by_id[table_id] 987 elif self.alphabet==IUPAC.unambiguous_rna: 988 #Will use standard IUPAC protein alphabet, no need for X 989 codon_table = CodonTable.unambiguous_rna_by_id[table_id] 990 else: 991 #This will use the extended IUPAC protein alphabet with X etc. 992 #The same table can be used for RNA or DNA (we use this for 993 #translating strings). 994 codon_table = CodonTable.ambiguous_generic_by_id[table_id] 995 protein = _translate_str(str(self), codon_table, \ 996 stop_symbol, to_stop, cds) 997 if stop_symbol in protein: 998 alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet, 999 stop_symbol = stop_symbol) 1000 else: 1001 alphabet = codon_table.protein_alphabet 1002 return Seq(protein, alphabet)
1003
1004 - def ungap(self, gap=None):
1005 """Return a copy of the sequence without the gap character(s). 1006 1007 The gap character can be specified in two ways - either as an explicit 1008 argument, or via the sequence's alphabet. For example: 1009 1010 >>> from Bio.Seq import Seq 1011 >>> from Bio.Alphabet import generic_dna 1012 >>> my_dna = Seq("-ATA--TGAAAT-TTGAAAA", generic_dna) 1013 >>> my_dna 1014 Seq('-ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 1015 >>> my_dna.ungap("-") 1016 Seq('ATATGAAATTTGAAAA', DNAAlphabet()) 1017 1018 If the gap character is not given as an argument, it will be taken from 1019 the sequence's alphabet (if defined). Notice that the returned sequence's 1020 alphabet is adjusted since it no longer requires a gapped alphabet: 1021 1022 >>> from Bio.Seq import Seq 1023 >>> from Bio.Alphabet import IUPAC, Gapped, HasStopCodon 1024 >>> my_pro = Seq("MVVLE=AD*", HasStopCodon(Gapped(IUPAC.protein, "="))) 1025 >>> my_pro 1026 Seq('MVVLE=AD*', HasStopCodon(Gapped(IUPACProtein(), '='), '*')) 1027 >>> my_pro.ungap() 1028 Seq('MVVLEAD*', HasStopCodon(IUPACProtein(), '*')) 1029 1030 Or, with a simpler gapped DNA example: 1031 1032 >>> from Bio.Seq import Seq 1033 >>> from Bio.Alphabet import IUPAC, Gapped 1034 >>> my_seq = Seq("CGGGTAG=AAAAAA", Gapped(IUPAC.unambiguous_dna, "=")) 1035 >>> my_seq 1036 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1037 >>> my_seq.ungap() 1038 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1039 1040 As long as it is consistent with the alphabet, although it is redundant, 1041 you can still supply the gap character as an argument to this method: 1042 1043 >>> my_seq 1044 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1045 >>> my_seq.ungap("=") 1046 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1047 1048 However, if the gap character given as the argument disagrees with that 1049 declared in the alphabet, an exception is raised: 1050 1051 >>> my_seq 1052 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1053 >>> my_seq.ungap("-") 1054 Traceback (most recent call last): 1055 ... 1056 ValueError: Gap '-' does not match '=' from alphabet 1057 1058 Finally, if a gap character is not supplied, and the alphabet does not 1059 define one, an exception is raised: 1060 1061 >>> from Bio.Seq import Seq 1062 >>> from Bio.Alphabet import generic_dna 1063 >>> my_dna = Seq("ATA--TGAAAT-TTGAAAA", generic_dna) 1064 >>> my_dna 1065 Seq('ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 1066 >>> my_dna.ungap() 1067 Traceback (most recent call last): 1068 ... 1069 ValueError: Gap character not given and not defined in alphabet 1070 1071 """ 1072 if hasattr(self.alphabet, "gap_char"): 1073 if not gap: 1074 gap = self.alphabet.gap_char 1075 elif gap != self.alphabet.gap_char: 1076 raise ValueError("Gap %s does not match %s from alphabet" \ 1077 % (repr(gap), repr(self.alphabet.gap_char))) 1078 alpha = Alphabet._ungap(self.alphabet) 1079 elif not gap: 1080 raise ValueError("Gap character not given and not defined in alphabet") 1081 else: 1082 alpha = self.alphabet #modify! 1083 if len(gap)!=1 or not isinstance(gap, str): 1084 raise ValueError("Unexpected gap character, %s" % repr(gap)) 1085 return Seq(str(self).replace(gap, ""), alpha)
1086
1087 -class UnknownSeq(Seq):
1088 """A read-only sequence object of known length but unknown contents. 1089 1090 If you have an unknown sequence, you can represent this with a normal 1091 Seq object, for example: 1092 1093 >>> my_seq = Seq("N"*5) 1094 >>> my_seq 1095 Seq('NNNNN', Alphabet()) 1096 >>> len(my_seq) 1097 5 1098 >>> print my_seq 1099 NNNNN 1100 1101 However, this is rather wasteful of memory (especially for large 1102 sequences), which is where this class is most usefull: 1103 1104 >>> unk_five = UnknownSeq(5) 1105 >>> unk_five 1106 UnknownSeq(5, alphabet = Alphabet(), character = '?') 1107 >>> len(unk_five) 1108 5 1109 >>> print(unk_five) 1110 ????? 1111 1112 You can add unknown sequence together, provided their alphabets and 1113 characters are compatible, and get another memory saving UnknownSeq: 1114 1115 >>> unk_four = UnknownSeq(4) 1116 >>> unk_four 1117 UnknownSeq(4, alphabet = Alphabet(), character = '?') 1118 >>> unk_four + unk_five 1119 UnknownSeq(9, alphabet = Alphabet(), character = '?') 1120 1121 If the alphabet or characters don't match up, the addition gives an 1122 ordinary Seq object: 1123 1124 >>> unk_nnnn = UnknownSeq(4, character = "N") 1125 >>> unk_nnnn 1126 UnknownSeq(4, alphabet = Alphabet(), character = 'N') 1127 >>> unk_nnnn + unk_four 1128 Seq('NNNN????', Alphabet()) 1129 1130 Combining with a real Seq gives a new Seq object: 1131 1132 >>> known_seq = Seq("ACGT") 1133 >>> unk_four + known_seq 1134 Seq('????ACGT', Alphabet()) 1135 >>> known_seq + unk_four 1136 Seq('ACGT????', Alphabet()) 1137 """
1138 - def __init__(self, length, alphabet = Alphabet.generic_alphabet, character = None):
1139 """Create a new UnknownSeq object. 1140 1141 If character is ommited, it is determed from the alphabet, "N" for 1142 nucleotides, "X" for proteins, and "?" otherwise. 1143 """ 1144 self._length = int(length) 1145 if self._length < 0: 1146 #TODO - Block zero length UnknownSeq? You can just use a Seq! 1147 raise ValueError("Length must not be negative.") 1148 self.alphabet = alphabet 1149 if character: 1150 if len(character) != 1: 1151 raise ValueError("character argument should be a single letter string.") 1152 self._character = character 1153 else: 1154 base = Alphabet._get_base_alphabet(alphabet) 1155 #TODO? Check the case of the letters in the alphabet? 1156 #We may have to use "n" instead of "N" etc. 1157 if isinstance(base, Alphabet.NucleotideAlphabet): 1158 self._character = "N" 1159 elif isinstance(base, Alphabet.ProteinAlphabet): 1160 self._character = "X" 1161 else: 1162 self._character = "?"
1163
1164 - def __len__(self):
1165 """Returns the stated length of the unknown sequence.""" 1166 return self._length
1167
1168 - def __str__(self):
1169 """Returns the unknown sequence as full string of the given length.""" 1170 return self._character * self._length
1171
1172 - def __repr__(self):
1173 return "UnknownSeq(%i, alphabet = %s, character = %s)" \ 1174 % (self._length, repr(self.alphabet), repr(self._character))
1175
1176 - def __add__(self, other):
1177 """Add another sequence or string to this sequence. 1178 1179 Adding two UnknownSeq objects returns another UnknownSeq object 1180 provided the character is the same and the alphabets are compatible. 1181 1182 >>> from Bio.Seq import UnknownSeq 1183 >>> from Bio.Alphabet import generic_protein 1184 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein) 1185 UnknownSeq(15, alphabet = ProteinAlphabet(), character = 'X') 1186 1187 If the characters differ, an UnknownSeq object cannot be used, so a 1188 Seq object is returned: 1189 1190 >>> from Bio.Seq import UnknownSeq 1191 >>> from Bio.Alphabet import generic_protein 1192 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein, 1193 ... character="x") 1194 Seq('XXXXXXXXXXxxxxx', ProteinAlphabet()) 1195 1196 If adding a string to an UnknownSeq, a new Seq is returned with the 1197 same alphabet: 1198 1199 >>> from Bio.Seq import UnknownSeq 1200 >>> from Bio.Alphabet import generic_protein 1201 >>> UnknownSeq(5, generic_protein) + "LV" 1202 Seq('XXXXXLV', ProteinAlphabet()) 1203 """ 1204 if isinstance(other, UnknownSeq) \ 1205 and other._character == self._character: 1206 #TODO - Check the alphabets match 1207 return UnknownSeq(len(self)+len(other), 1208 self.alphabet, self._character) 1209 #Offload to the base class... 1210 return Seq(str(self), self.alphabet) + other
1211
1212 - def __radd__(self, other):
1213 #If other is an UnknownSeq, then __add__ would be called. 1214 #Offload to the base class... 1215 return other + Seq(str(self), self.alphabet)
1216
1217 - def __getitem__(self, index):
1218 """Get a subsequence from the UnknownSeq object. 1219 1220 >>> unk = UnknownSeq(8, character="N") 1221 >>> print unk[:] 1222 NNNNNNNN 1223 >>> print unk[5:3] 1224 <BLANKLINE> 1225 >>> print unk[1:-1] 1226 NNNNNN 1227 >>> print unk[1:-1:2] 1228 NNN 1229 """ 1230 if isinstance(index, int): 1231 #TODO - Check the bounds without wasting memory 1232 return str(self)[index] 1233 old_length = self._length 1234 step = index.step 1235 if step is None or step == 1: 1236 #This calculates the length you'd get from ("N"*old_length)[index] 1237 start = index.start 1238 end = index.stop 1239 if start is None: 1240 start = 0 1241 elif start < 0: 1242 start = max(0, old_length + start) 1243 elif start > old_length: 1244 start = old_length 1245 if end is None: 1246 end = old_length 1247 elif end < 0: 1248 end = max(0, old_length + end) 1249 elif end > old_length: 1250 end = old_length 1251 new_length = max(0, end-start) 1252 elif step == 0: 1253 raise ValueError("slice step cannot be zero") 1254 else: 1255 #TODO - handle step efficiently 1256 new_length = len(("X"*old_length)[index]) 1257 #assert new_length == len(("X"*old_length)[index]), \ 1258 # (index, start, end, step, old_length, 1259 # new_length, len(("X"*old_length)[index])) 1260 return UnknownSeq(new_length, self.alphabet, self._character)
1261
1262 - def count(self, sub, start=0, end=sys.maxint):
1263 """Non-overlapping count method, like that of a python string. 1264 1265 This behaves like the python string (and Seq object) method of the 1266 same name, which does a non-overlapping count! 1267 1268 Returns an integer, the number of occurrences of substring 1269 argument sub in the (sub)sequence given by [start:end]. 1270 Optional arguments start and end are interpreted as in slice 1271 notation. 1272 1273 Arguments: 1274 - sub - a string or another Seq object to look for 1275 - start - optional integer, slice start 1276 - end - optional integer, slice end 1277 1278 >>> "NNNN".count("N") 1279 4 1280 >>> Seq("NNNN").count("N") 1281 4 1282 >>> UnknownSeq(4, character="N").count("N") 1283 4 1284 >>> UnknownSeq(4, character="N").count("A") 1285 0 1286 >>> UnknownSeq(4, character="N").count("AA") 1287 0 1288 1289 HOWEVER, please note because that python strings and Seq objects (and 1290 MutableSeq objects) do a non-overlapping search, this may not give 1291 the answer you expect: 1292 1293 >>> UnknownSeq(4, character="N").count("NN") 1294 2 1295 >>> UnknownSeq(4, character="N").count("NNN") 1296 1 1297 """ 1298 sub_str = self._get_seq_str_and_check_alphabet(sub) 1299 if len(sub_str) == 1: 1300 if str(sub_str) == self._character: 1301 if start==0 and end >= self._length: 1302 return self._length 1303 else: 1304 #This could be done more cleverly... 1305 return str(self).count(sub_str, start, end) 1306 else: 1307 return 0 1308 else: 1309 if set(sub_str) == set(self._character): 1310 if start==0 and end >= self._length: 1311 return self._length // len(sub_str) 1312 else: 1313 #This could be done more cleverly... 1314 return str(self).count(sub_str, start, end) 1315 else: 1316 return 0
1317
1318 - def complement(self):
1319 """The complement of an unknown nucleotide equals itself. 1320 1321 >>> my_nuc = UnknownSeq(8) 1322 >>> my_nuc 1323 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1324 >>> print my_nuc 1325 ???????? 1326 >>> my_nuc.complement() 1327 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1328 >>> print my_nuc.complement() 1329 ???????? 1330 """ 1331 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1332 Alphabet.ProteinAlphabet): 1333 raise ValueError("Proteins do not have complements!") 1334 return self
1335
1336 - def reverse_complement(self):
1337 """The reverse complement of an unknown nucleotide equals itself. 1338 1339 >>> my_nuc = UnknownSeq(10) 1340 >>> my_nuc 1341 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1342 >>> print my_nuc 1343 ?????????? 1344 >>> my_nuc.reverse_complement() 1345 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1346 >>> print my_nuc.reverse_complement() 1347 ?????????? 1348 """ 1349 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1350 Alphabet.ProteinAlphabet): 1351 raise ValueError("Proteins do not have complements!") 1352 return self
1353
1354 - def transcribe(self):
1355 """Returns unknown RNA sequence from an unknown DNA sequence. 1356 1357 >>> my_dna = UnknownSeq(10, character="N") 1358 >>> my_dna 1359 UnknownSeq(10, alphabet = Alphabet(), character = 'N') 1360 >>> print my_dna 1361 NNNNNNNNNN 1362 >>> my_rna = my_dna.transcribe() 1363 >>> my_rna 1364 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N') 1365 >>> print my_rna 1366 NNNNNNNNNN 1367 """ 1368 #Offload the alphabet stuff 1369 s = Seq(self._character, self.alphabet).transcribe() 1370 return UnknownSeq(self._length, s.alphabet, self._character)
1371
1372 - def back_transcribe(self):
1373 """Returns unknown DNA sequence from an unknown RNA sequence. 1374 1375 >>> my_rna = UnknownSeq(20, character="N") 1376 >>> my_rna 1377 UnknownSeq(20, alphabet = Alphabet(), character = 'N') 1378 >>> print my_rna 1379 NNNNNNNNNNNNNNNNNNNN 1380 >>> my_dna = my_rna.back_transcribe() 1381 >>> my_dna 1382 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1383 >>> print my_dna 1384 NNNNNNNNNNNNNNNNNNNN 1385 """ 1386 #Offload the alphabet stuff 1387 s = Seq(self._character, self.alphabet).back_transcribe() 1388 return UnknownSeq(self._length, s.alphabet, self._character)
1389
1390 - def upper(self):
1391 """Returns an upper case copy of the sequence. 1392 1393 >>> from Bio.Alphabet import generic_dna 1394 >>> from Bio.Seq import UnknownSeq 1395 >>> my_seq = UnknownSeq(20, generic_dna, character="n") 1396 >>> my_seq 1397 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'n') 1398 >>> print my_seq 1399 nnnnnnnnnnnnnnnnnnnn 1400 >>> my_seq.upper() 1401 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1402 >>> print my_seq.upper() 1403 NNNNNNNNNNNNNNNNNNNN 1404 1405 This will adjust the alphabet if required. See also the lower method. 1406 """ 1407 return UnknownSeq(self._length, self.alphabet._upper(), self._character.upper())
1408
1409 - def lower(self):
1410 """Returns a lower case copy of the sequence. 1411 1412 This will adjust the alphabet if required: 1413 1414 >>> from Bio.Alphabet import IUPAC 1415 >>> from Bio.Seq import UnknownSeq 1416 >>> my_seq = UnknownSeq(20, IUPAC.extended_protein) 1417 >>> my_seq 1418 UnknownSeq(20, alphabet = ExtendedIUPACProtein(), character = 'X') 1419 >>> print my_seq 1420 XXXXXXXXXXXXXXXXXXXX 1421 >>> my_seq.lower() 1422 UnknownSeq(20, alphabet = ProteinAlphabet(), character = 'x') 1423 >>> print my_seq.lower() 1424 xxxxxxxxxxxxxxxxxxxx 1425 1426 See also the upper method. 1427 """ 1428 return UnknownSeq(self._length, self.alphabet._lower(), self._character.lower())
1429
1430 - def translate(self, **kwargs):
1431 """Translate an unknown nucleotide sequence into an unknown protein. 1432 1433 e.g. 1434 1435 >>> my_seq = UnknownSeq(11, character="N") 1436 >>> print my_seq 1437 NNNNNNNNNNN 1438 >>> my_protein = my_seq.translate() 1439 >>> my_protein 1440 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X') 1441 >>> print my_protein 1442 XXX 1443 1444 In comparison, using a normal Seq object: 1445 1446 >>> my_seq = Seq("NNNNNNNNNNN") 1447 >>> print my_seq 1448 NNNNNNNNNNN 1449 >>> my_protein = my_seq.translate() 1450 >>> my_protein 1451 Seq('XXX', ExtendedIUPACProtein()) 1452 >>> print my_protein 1453 XXX 1454 1455 """ 1456 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1457 Alphabet.ProteinAlphabet): 1458 raise ValueError("Proteins cannot be translated!") 1459 return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
1460
1461 - def ungap(self, gap=None):
1462 """Return a copy of the sequence without the gap character(s). 1463 1464 The gap character can be specified in two ways - either as an explicit 1465 argument, or via the sequence's alphabet. For example: 1466 1467 >>> from Bio.Seq import UnknownSeq 1468 >>> from Bio.Alphabet import Gapped, generic_dna 1469 >>> my_dna = UnknownSeq(20, Gapped(generic_dna,"-")) 1470 >>> my_dna 1471 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = 'N') 1472 >>> my_dna.ungap() 1473 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1474 >>> my_dna.ungap("-") 1475 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1476 1477 If the UnknownSeq is using the gap character, then an empty Seq is 1478 returned: 1479 1480 >>> my_gap = UnknownSeq(20, Gapped(generic_dna,"-"), character="-") 1481 >>> my_gap 1482 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = '-') 1483 >>> my_gap.ungap() 1484 Seq('', DNAAlphabet()) 1485 >>> my_gap.ungap("-") 1486 Seq('', DNAAlphabet()) 1487 1488 Notice that the returned sequence's alphabet is adjusted to remove any 1489 explicit gap character declaration. 1490 """ 1491 #Offload the alphabet stuff 1492 s = Seq(self._character, self.alphabet).ungap() 1493 if s : 1494 return UnknownSeq(self._length, s.alphabet, self._character) 1495 else : 1496 return Seq("", s.alphabet)
1497
1498 -class MutableSeq(object):
1499 """An editable sequence object (with an alphabet). 1500 1501 Unlike normal python strings and our basic sequence object (the Seq class) 1502 which are immuatable, the MutableSeq lets you edit the sequence in place. 1503 However, this means you cannot use a MutableSeq object as a dictionary key. 1504 1505 >>> from Bio.Seq import MutableSeq 1506 >>> from Bio.Alphabet import generic_dna 1507 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna) 1508 >>> my_seq 1509 MutableSeq('ACTCGTCGTCG', DNAAlphabet()) 1510 >>> my_seq[5] 1511 'T' 1512 >>> my_seq[5] = "A" 1513 >>> my_seq 1514 MutableSeq('ACTCGACGTCG', DNAAlphabet()) 1515 >>> my_seq[5] 1516 'A' 1517 >>> my_seq[5:8] = "NNN" 1518 >>> my_seq 1519 MutableSeq('ACTCGNNNTCG', DNAAlphabet()) 1520 >>> len(my_seq) 1521 11 1522 1523 Note that the MutableSeq object does not support as many string-like 1524 or biological methods as the Seq object. 1525 """
1526 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
1527 if sys.version_info[0] == 3: 1528 self.array_indicator = "u" 1529 else: 1530 self.array_indicator = "c" 1531 if isinstance(data, str): #TODO - What about unicode? 1532 self.data = array.array(self.array_indicator, data) 1533 else: 1534 self.data = data # assumes the input is an array 1535 self.alphabet = alphabet
1536
1537 - def __repr__(self):
1538 """Returns a (truncated) representation of the sequence for debugging.""" 1539 if len(self) > 60: 1540 #Shows the last three letters as it is often useful to see if there 1541 #is a stop codon at the end of a sequence. 1542 #Note total length is 54+3+3=60 1543 return "%s('%s...%s', %s)" % (self.__class__.__name__, 1544 str(self[:54]), str(self[-3:]), 1545 repr(self.alphabet)) 1546 else: 1547 return "%s('%s', %s)" % (self.__class__.__name__, 1548 str(self), 1549 repr(self.alphabet))
1550
1551 - def __str__(self):
1552 """Returns the full sequence as a python string. 1553 1554 Note that Biopython 1.44 and earlier would give a truncated 1555 version of repr(my_seq) for str(my_seq). If you are writing code 1556 which needs to be backwards compatible with old Biopython, you 1557 should continue to use my_seq.tostring() rather than str(my_seq). 1558 """ 1559 #See test_GAQueens.py for an historic usage of a non-string alphabet! 1560 return "".join(self.data)
1561
1562 - def __cmp__(self, other):
1563 """Compare the sequence to another sequence or a string (README). 1564 1565 Currently if compared to another sequence the alphabets must be 1566 compatible. Comparing DNA to RNA, or Nucleotide to Protein will raise 1567 an exception. Otherwise only the sequence itself is compared, not the 1568 precise alphabet. 1569 1570 A future release of Biopython will change this (and the Seq object etc) 1571 to use simple string comparison. The plan is that comparing sequences 1572 with incompatible alphabets (e.g. DNA to RNA) will trigger a warning 1573 but not an exception. 1574 1575 During this transition period, please just do explicit comparisons: 1576 1577 >>> seq1 = MutableSeq("ACGT") 1578 >>> seq2 = MutableSeq("ACGT") 1579 >>> id(seq1) == id(seq2) 1580 False 1581 >>> str(seq1) == str(seq2) 1582 True 1583 1584 This method indirectly supports ==, < , etc. 1585 """ 1586 if hasattr(other, "alphabet"): 1587 #other should be a Seq or a MutableSeq 1588 import warnings 1589 warnings.warn("In future comparing incompatible alphabets will " 1590 "only trigger a warning (not an exception). In " 1591 "the interim please use id(seq1)==id(seq2) or " 1592 "str(seq1)==str(seq2) to make your code explicit " 1593 "and to avoid this warning.", FutureWarning) 1594 if not Alphabet._check_type_compatible([self.alphabet, 1595 other.alphabet]): 1596 raise TypeError("Incompatable alphabets %s and %s" \ 1597 % (repr(self.alphabet), repr(other.alphabet))) 1598 #They should be the same sequence type (or one of them is generic) 1599 if isinstance(other, MutableSeq): 1600 #See test_GAQueens.py for an historic usage of a non-string 1601 #alphabet! Comparing the arrays supports this. 1602 return cmp(self.data, other.data) 1603 else: 1604 return cmp(str(self), str(other)) 1605 elif isinstance(other, basestring): 1606 return cmp(str(self), other) 1607 else: 1608 raise TypeError
1609
1610 - def __len__(self): return len(self.data)
1611
1612 - def __getitem__(self, index):
1613 #Note since Python 2.0, __getslice__ is deprecated 1614 #and __getitem__ is used instead. 1615 #See http://docs.python.org/ref/sequence-methods.html 1616 if isinstance(index, int): 1617 #Return a single letter as a string 1618 return self.data[index] 1619 else: 1620 #Return the (sub)sequence as another Seq object 1621 return MutableSeq(self.data[index], self.alphabet)
1622
1623 - def __setitem__(self, index, value):
1624 #Note since Python 2.0, __setslice__ is deprecated 1625 #and __setitem__ is used instead. 1626 #See http://docs.python.org/ref/sequence-methods.html 1627 if isinstance(index, int): 1628 #Replacing a single letter with a new string 1629 self.data[index] = value 1630 else: 1631 #Replacing a sub-sequence 1632 if isinstance(value, MutableSeq): 1633 self.data[index] = value.data 1634 elif isinstance(value, type(self.data)): 1635 self.data[index] = value 1636 else: 1637 self.data[index] = array.array(self.array_indicator, 1638 str(value))
1639
1640 - def __delitem__(self, index):
1641 #Note since Python 2.0, __delslice__ is deprecated 1642 #and __delitem__ is used instead. 1643 #See http://docs.python.org/ref/sequence-methods.html 1644 1645 #Could be deleting a single letter, or a slice 1646 del self.data[index]
1647
1648 - def __add__(self, other):
1649 """Add another sequence or string to this sequence. 1650 1651 Returns a new MutableSeq object.""" 1652 if hasattr(other, "alphabet"): 1653 #other should be a Seq or a MutableSeq 1654 if not Alphabet._check_type_compatible([self.alphabet, 1655 other.alphabet]): 1656 raise TypeError("Incompatable alphabets %s and %s" \ 1657 % (repr(self.alphabet), repr(other.alphabet))) 1658 #They should be the same sequence type (or one of them is generic) 1659 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1660 if isinstance(other, MutableSeq): 1661 #See test_GAQueens.py for an historic usage of a non-string 1662 #alphabet! Adding the arrays should support this. 1663 return self.__class__(self.data + other.data, a) 1664 else: 1665 return self.__class__(str(self) + str(other), a) 1666 elif isinstance(other, basestring): 1667 #other is a plain string - use the current alphabet 1668 return self.__class__(str(self) + str(other), self.alphabet) 1669 else: 1670 raise TypeError
1671
1672 - def __radd__(self, other):
1673 if hasattr(other, "alphabet"): 1674 #other should be a Seq or a MutableSeq 1675 if not Alphabet._check_type_compatible([self.alphabet, 1676 other.alphabet]): 1677 raise TypeError("Incompatable alphabets %s and %s" \ 1678 % (repr(self.alphabet), repr(other.alphabet))) 1679 #They should be the same sequence type (or one of them is generic) 1680 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1681 if isinstance(other, MutableSeq): 1682 #See test_GAQueens.py for an historic usage of a non-string 1683 #alphabet! Adding the arrays should support this. 1684 return self.__class__(other.data + self.data, a) 1685 else: 1686 return self.__class__(str(other) + str(self), a) 1687 elif isinstance(other, basestring): 1688 #other is a plain string - use the current alphabet 1689 return self.__class__(str(other) + str(self), self.alphabet) 1690 else: 1691 raise TypeError
1692
1693 - def append(self, c):
1694 self.data.append(c)
1695
1696 - def insert(self, i, c):
1697 self.data.insert(i, c)
1698
1699 - def pop(self, i = (-1)):
1700 c = self.data[i] 1701 del self.data[i] 1702 return c
1703
1704 - def remove(self, item):
1705 for i in range(len(self.data)): 1706 if self.data[i] == item: 1707 del self.data[i] 1708 return 1709 raise ValueError("MutableSeq.remove(x): x not in list")
1710
1711 - def count(self, sub, start=0, end=sys.maxint):
1712 """Non-overlapping count method, like that of a python string. 1713 1714 This behaves like the python string method of the same name, 1715 which does a non-overlapping count! 1716 1717 Returns an integer, the number of occurrences of substring 1718 argument sub in the (sub)sequence given by [start:end]. 1719 Optional arguments start and end are interpreted as in slice 1720 notation. 1721 1722 Arguments: 1723 - sub - a string or another Seq object to look for 1724 - start - optional integer, slice start 1725 - end - optional integer, slice end 1726 1727 e.g. 1728 1729 >>> from Bio.Seq import MutableSeq 1730 >>> my_mseq = MutableSeq("AAAATGA") 1731 >>> print my_mseq.count("A") 1732 5 1733 >>> print my_mseq.count("ATG") 1734 1 1735 >>> print my_mseq.count(Seq("AT")) 1736 1 1737 >>> print my_mseq.count("AT", 2, -1) 1738 1 1739 1740 HOWEVER, please note because that python strings, Seq objects and 1741 MutableSeq objects do a non-overlapping search, this may not give 1742 the answer you expect: 1743 1744 >>> "AAAA".count("AA") 1745 2 1746 >>> print MutableSeq("AAAA").count("AA") 1747 2 1748 1749 A non-overlapping search would give the answer as three! 1750 """ 1751 try: 1752 #TODO - Should we check the alphabet? 1753 search = sub.tostring() 1754 except AttributeError: 1755 search = sub 1756 1757 if not isinstance(search, basestring): 1758 raise TypeError("expected a string, Seq or MutableSeq") 1759 1760 if len(search) == 1: 1761 #Try and be efficient and work directly from the array. 1762 count = 0 1763 for c in self.data[start:end]: 1764 if c == search: count += 1 1765 return count 1766 else: 1767 #TODO - Can we do this more efficiently? 1768 return self.tostring().count(search, start, end)
1769
1770 - def index(self, item):
1771 for i in range(len(self.data)): 1772 if self.data[i] == item: 1773 return i 1774 raise ValueError("MutableSeq.index(x): x not in list")
1775
1776 - def reverse(self):
1777 """Modify the mutable sequence to reverse itself. 1778 1779 No return value. 1780 """ 1781 self.data.reverse()
1782
1783 - def complement(self):
1784 """Modify the mutable sequence to take on its complement. 1785 1786 Trying to complement a protein sequence raises an exception. 1787 1788 No return value. 1789 """ 1790 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1791 Alphabet.ProteinAlphabet): 1792 raise ValueError("Proteins do not have complements!") 1793 if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna): 1794 d = ambiguous_dna_complement 1795 elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna): 1796 d = ambiguous_rna_complement 1797 elif 'U' in self.data and 'T' in self.data: 1798 #TODO - Handle this cleanly? 1799 raise ValueError("Mixed RNA/DNA found") 1800 elif 'U' in self.data: 1801 d = ambiguous_rna_complement 1802 else: 1803 d = ambiguous_dna_complement 1804 c = dict([(x.lower(), y.lower()) for x,y in d.iteritems()]) 1805 d.update(c) 1806 self.data = map(lambda c: d[c], self.data) 1807 self.data = array.array(self.array_indicator, self.data)
1808
1809 - def reverse_complement(self):
1810 """Modify the mutable sequence to take on its reverse complement. 1811 1812 Trying to reverse complement a protein sequence raises an exception. 1813 1814 No return value. 1815 """ 1816 self.complement() 1817 self.data.reverse()
1818 1819 ## Sorting a sequence makes no sense. 1820 # def sort(self, *args): self.data.sort(*args) 1821
1822 - def extend(self, other):
1823 if isinstance(other, MutableSeq): 1824 for c in other.data: 1825 self.data.append(c) 1826 else: 1827 for c in other: 1828 self.data.append(c)
1829
1830 - def tostring(self):
1831 """Returns the full sequence as a python string (semi-obsolete). 1832 1833 Although not formally deprecated, you are now encouraged to use 1834 str(my_seq) instead of my_seq.tostring(). 1835 1836 Because str(my_seq) will give you the full sequence as a python string, 1837 there is often no need to make an explicit conversion. For example, 1838 1839 print "ID={%s}, sequence={%s}" % (my_name, my_seq) 1840 1841 On Biopython 1.44 or older you would have to have done this: 1842 1843 print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring()) 1844 """ 1845 return "".join(self.data)
1846
1847 - def toseq(self):
1848 """Returns the full sequence as a new immutable Seq object. 1849 1850 >>> from Bio.Seq import Seq 1851 >>> from Bio.Alphabet import IUPAC 1852 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", 1853 ... IUPAC.protein) 1854 >>> my_mseq 1855 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1856 >>> my_mseq.toseq() 1857 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1858 1859 Note that the alphabet is preserved. 1860 """ 1861 return Seq("".join(self.data), self.alphabet)
1862
1863 # The transcribe, backward_transcribe, and translate functions are 1864 # user-friendly versions of the corresponding functions in Bio.Transcribe 1865 # and Bio.Translate. The functions work both on Seq objects, and on strings. 1866 1867 -def transcribe(dna):
1868 """Transcribes a DNA sequence into RNA. 1869 1870 If given a string, returns a new string object. 1871 1872 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1873 1874 Trying to transcribe a protein or RNA sequence raises an exception. 1875 1876 e.g. 1877 1878 >>> transcribe("ACTGN") 1879 'ACUGN' 1880 """ 1881 if isinstance(dna, Seq): 1882 return dna.transcribe() 1883 elif isinstance(dna, MutableSeq): 1884 return dna.toseq().transcribe() 1885 else: 1886 return dna.replace('T','U').replace('t','u')
1887
1888 -def back_transcribe(rna):
1889 """Back-transcribes an RNA sequence into DNA. 1890 1891 If given a string, returns a new string object. 1892 1893 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1894 1895 Trying to transcribe a protein or DNA sequence raises an exception. 1896 1897 e.g. 1898 1899 >>> back_transcribe("ACUGN") 1900 'ACTGN' 1901 """ 1902 if isinstance(rna, Seq): 1903 return rna.back_transcribe() 1904 elif isinstance(rna, MutableSeq): 1905 return rna.toseq().back_transcribe() 1906 else: 1907 return rna.replace('U','T').replace('u','t')
1908
1909 -def _translate_str(sequence, table, stop_symbol="*", to_stop=False, 1910 cds=False, pos_stop="X"):
1911 """Helper function to translate a nucleotide string (PRIVATE). 1912 1913 Arguments: 1914 - sequence - a string 1915 - table - a CodonTable object (NOT a table name or id number) 1916 - stop_symbol - a single character string, what to use for terminators. 1917 - to_stop - boolean, should translation terminate at the first 1918 in frame stop codon? If there is no in-frame stop codon 1919 then translation continues to the end. 1920 - pos_stop - a single character string for a possible stop codon 1921 (e.g. TAN or NNN) 1922 - cds - Boolean, indicates this is a complete CDS. If True, this 1923 checks the sequence starts with a valid alternative start 1924 codon (which will be translated as methionine, M), that the 1925 sequence length is a multiple of three, and that there is a 1926 single in frame stop codon at the end (this will be excluded 1927 from the protein sequence, regardless of the to_stop option). 1928 If these tests fail, an exception is raised. 1929 1930 Returns a string. 1931 1932 e.g. 1933 1934 >>> from Bio.Data import CodonTable 1935 >>> table = CodonTable.ambiguous_dna_by_id[1] 1936 >>> _translate_str("AAA", table) 1937 'K' 1938 >>> _translate_str("TAR", table) 1939 '*' 1940 >>> _translate_str("TAN", table) 1941 'X' 1942 >>> _translate_str("TAN", table, pos_stop="@") 1943 '@' 1944 >>> _translate_str("TA?", table) 1945 Traceback (most recent call last): 1946 ... 1947 TranslationError: Codon 'TA?' is invalid 1948 >>> _translate_str("ATGCCCTAG", table, cds=True) 1949 'MP' 1950 >>> _translate_str("AAACCCTAG", table, cds=True) 1951 Traceback (most recent call last): 1952 ... 1953 TranslationError: First codon 'AAA' is not a start codon 1954 >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True) 1955 Traceback (most recent call last): 1956 ... 1957 TranslationError: Extra in frame stop codon found. 1958 """ 1959 sequence = sequence.upper() 1960 amino_acids = [] 1961 forward_table = table.forward_table 1962 stop_codons = table.stop_codons 1963 if table.nucleotide_alphabet.letters is not None: 1964 valid_letters = set(table.nucleotide_alphabet.letters.upper()) 1965 else: 1966 #Assume the worst case, ambiguous DNA or RNA: 1967 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + \ 1968 IUPAC.ambiguous_rna.letters.upper()) 1969 if cds: 1970 if str(sequence[:3]).upper() not in table.start_codons: 1971 raise CodonTable.TranslationError(\ 1972 "First codon '%s' is not a start codon" % sequence[:3]) 1973 if len(sequence) % 3 != 0: 1974 raise CodonTable.TranslationError(\ 1975 "Sequence length %i is not a multiple of three" % len(sequence)) 1976 if str(sequence[-3:]).upper() not in stop_codons: 1977 raise CodonTable.TranslationError(\ 1978 "Final codon '%s' is not a stop codon" % sequence[-3:]) 1979 #Don't translate the stop symbol, and manually translate the M 1980 sequence = sequence[3:-3] 1981 amino_acids = ["M"] 1982 n = len(sequence) 1983 for i in xrange(0,n-n%3,3): 1984 codon = sequence[i:i+3] 1985 try: 1986 amino_acids.append(forward_table[codon]) 1987 except (KeyError, CodonTable.TranslationError): 1988 #Todo? Treat "---" as a special case (gapped translation) 1989 if codon in table.stop_codons: 1990 if cds: 1991 raise CodonTable.TranslationError(\ 1992 "Extra in frame stop codon found.") 1993 if to_stop : break 1994 amino_acids.append(stop_symbol) 1995 elif valid_letters.issuperset(set(codon)): 1996 #Possible stop codon (e.g. NNN or TAN) 1997 amino_acids.append(pos_stop) 1998 else: 1999 raise CodonTable.TranslationError(\ 2000 "Codon '%s' is invalid" % codon) 2001 return "".join(amino_acids)
2002
2003 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False, 2004 cds=False):
2005 """Translate a nucleotide sequence into amino acids. 2006 2007 If given a string, returns a new string object. Given a Seq or 2008 MutableSeq, returns a Seq object with a protein alphabet. 2009 2010 Arguments: 2011 - table - Which codon table to use? This can be either a name (string), 2012 an NCBI identifier (integer), or a CodonTable object (useful 2013 for non-standard genetic codes). Defaults to the "Standard" 2014 table. 2015 - stop_symbol - Single character string, what to use for any 2016 terminators, defaults to the asterisk, "*". 2017 - to_stop - Boolean, defaults to False meaning do a full 2018 translation continuing on past any stop codons 2019 (translated as the specified stop_symbol). If 2020 True, translation is terminated at the first in 2021 frame stop codon (and the stop_symbol is not 2022 appended to the returned protein sequence). 2023 - cds - Boolean, indicates this is a complete CDS. If True, this 2024 checks the sequence starts with a valid alternative start 2025 codon (which will be translated as methionine, M), that the 2026 sequence length is a multiple of three, and that there is a 2027 single in frame stop codon at the end (this will be excluded 2028 from the protein sequence, regardless of the to_stop option). 2029 If these tests fail, an exception is raised. 2030 2031 A simple string example using the default (standard) genetic code: 2032 2033 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG" 2034 >>> translate(coding_dna) 2035 'VAIVMGR*KGAR*' 2036 >>> translate(coding_dna, stop_symbol="@") 2037 'VAIVMGR@KGAR@' 2038 >>> translate(coding_dna, to_stop=True) 2039 'VAIVMGR' 2040 2041 Now using NCBI table 2, where TGA is not a stop codon: 2042 2043 >>> translate(coding_dna, table=2) 2044 'VAIVMGRWKGAR*' 2045 >>> translate(coding_dna, table=2, to_stop=True) 2046 'VAIVMGRWKGAR' 2047 2048 In fact this example uses an alternative start codon valid under NCBI table 2, 2049 GTG, which means this example is a complete valid CDS which when translated 2050 should really start with methionine (not valine): 2051 2052 >>> translate(coding_dna, table=2, cds=True) 2053 'MAIVMGRWKGAR' 2054 2055 Note that if the sequence has no in-frame stop codon, then the to_stop 2056 argument has no effect: 2057 2058 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC" 2059 >>> translate(coding_dna2) 2060 'VAIVMGR' 2061 >>> translate(coding_dna2, to_stop=True) 2062 'VAIVMGR' 2063 2064 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 2065 or a stop codon. These are translated as "X". Any invalid codon 2066 (e.g. "TA?" or "T-A") will throw a TranslationError. 2067 2068 NOTE - Does NOT support gapped sequences. 2069 2070 It will however translate either DNA or RNA. 2071 """ 2072 if isinstance(sequence, Seq): 2073 return sequence.translate(table, stop_symbol, to_stop, cds) 2074 elif isinstance(sequence, MutableSeq): 2075 #Return a Seq object 2076 return sequence.toseq().translate(table, stop_symbol, to_stop, cds) 2077 else: 2078 #Assume its a string, return a string 2079 try: 2080 codon_table = CodonTable.ambiguous_generic_by_id[int(table)] 2081 except ValueError: 2082 codon_table = CodonTable.ambiguous_generic_by_name[table] 2083 except (AttributeError, TypeError): 2084 if isinstance(table, CodonTable.CodonTable): 2085 codon_table = table 2086 else: 2087 raise ValueError('Bad table argument') 2088 return _translate_str(sequence, codon_table, stop_symbol, to_stop, cds)
2089
2090 -def reverse_complement(sequence):
2091 """Returns the reverse complement sequence of a nucleotide string. 2092 2093 If given a string, returns a new string object. 2094 Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet. 2095 2096 Supports unambiguous and ambiguous nucleotide sequences. 2097 2098 e.g. 2099 2100 >>> reverse_complement("ACTG-NH") 2101 'DN-CAGT' 2102 """ 2103 if isinstance(sequence, Seq): 2104 #Return a Seq 2105 return sequence.reverse_complement() 2106 elif isinstance(sequence, MutableSeq): 2107 #Return a Seq 2108 #Don't use the MutableSeq reverse_complement method as it is 'in place'. 2109 return sequence.toseq().reverse_complement() 2110 2111 #Assume its a string. 2112 #In order to avoid some code duplication, the old code would turn the string 2113 #into a Seq, use the reverse_complement method, and convert back to a string. 2114 #This worked, but is over five times slower on short sequences! 2115 if ('U' in sequence or 'u' in sequence) \ 2116 and ('T' in sequence or 't' in sequence): 2117 raise ValueError("Mixed RNA/DNA found") 2118 elif 'U' in sequence or 'u' in sequence: 2119 ttable = _rna_complement_table 2120 else: 2121 ttable = _dna_complement_table 2122 return sequence.translate(ttable)[::-1]
2123
2124 -def _test():
2125 """Run the Bio.Seq module's doctests (PRIVATE).""" 2126 if sys.version_info[0:2] == (3,1): 2127 print "Not running Bio.Seq doctest on Python 3.1" 2128 print "See http://bugs.python.org/issue7490" 2129 else: 2130 print "Runing doctests..." 2131 import doctest 2132 doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL) 2133 print "Done"
2134 2135 if __name__ == "__main__": 2136 _test() 2137