Package Bio :: Package Align :: Module Generic
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Generic

  1  # Copyright 2000-2004 Brad Chapman. 
  2  # Copyright 2001 Iddo Friedberg. 
  3  # Copyright 2007-2010 by Peter Cock. 
  4  # All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  """ 
  9  Contains classes to deal with generic sequence alignment stuff not 
 10  specific to a particular program or format. 
 11   
 12  classes: 
 13  o Alignment 
 14  """ 
 15   
 16  # biopython 
 17  from Bio.Seq import Seq 
 18  from Bio.SeqRecord import SeqRecord 
 19  from Bio import Alphabet 
 20   
21 -class Alignment:
22 """Represent a set of alignments (OBSOLETE?). 23 24 This is a base class to represent alignments, which can be subclassed 25 to deal with an alignment in a specific format. 26 27 With the introduction of the MultipleSeqAlignment class in Bio.Align, 28 this base class is effectively obsolete and will likely be deprecated and 29 later removed in future releases of Biopython. 30 """
31 - def __init__(self, alphabet):
32 """Initialize a new Alignment object. 33 34 Arguments: 35 o alphabet - The alphabet to use for the sequence objects that are 36 created. This alphabet must be a gapped type. 37 38 e.g. 39 >>> from Bio.Alphabet import IUPAC, Gapped 40 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 41 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 42 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 43 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 44 >>> print align 45 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 46 ACTGCTAGCTAG Alpha 47 ACT-CTAGCTAG Beta 48 ACTGCTAGATAG Gamma 49 """ 50 import warnings 51 warnings.warn("With the introduction of the MultipleSeqAlignment class in Bio.Align, this base class is effectively obsolete and will likely be deprecated and later removed in future releases of Biopython.", PendingDeprecationWarning) 52 if not (isinstance(alphabet, Alphabet.Alphabet) \ 53 or isinstance(alphabet, Alphabet.AlphabetEncoder)): 54 raise ValueError("Invalid alphabet argument") 55 self._alphabet = alphabet 56 # hold everything at a list of SeqRecord objects 57 self._records = []
58
59 - def _str_line(self, record):
60 """Returns a truncated string representation of a SeqRecord (PRIVATE). 61 62 This is a PRIVATE function used by the __str__ method. 63 """ 64 if len(record.seq) <= 50: 65 return "%s %s" % (record.seq, record.id) 66 else: 67 return "%s...%s %s" \ 68 % (record.seq[:44], record.seq[-3:], record.id)
69
70 - def __str__(self):
71 """Returns a multi-line string summary of the alignment. 72 73 This output is intended to be readable, but large alignments are 74 shown truncated. A maximum of 20 rows (sequences) and 50 columns 75 are shown, with the record identifiers. This should fit nicely on a 76 single screen. e.g. 77 78 >>> from Bio.Alphabet import IUPAC, Gapped 79 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 80 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 81 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 82 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 83 >>> print align 84 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 85 ACTGCTAGCTAG Alpha 86 ACT-CTAGCTAG Beta 87 ACTGCTAGATAG Gamma 88 89 See also the alignment's format method. 90 """ 91 rows = len(self._records) 92 lines = ["%s alignment with %i rows and %i columns" \ 93 % (str(self._alphabet), rows, self.get_alignment_length())] 94 if rows <= 20: 95 lines.extend([self._str_line(rec) for rec in self._records]) 96 else: 97 lines.extend([self._str_line(rec) for rec in self._records[:18]]) 98 lines.append("...") 99 lines.append(self._str_line(self._records[-1])) 100 return "\n".join(lines)
101
102 - def __repr__(self):
103 """Returns a representation of the object for debugging. 104 105 The representation cannot be used with eval() to recreate the object, 106 which is usually possible with simple python ojects. For example: 107 108 <Bio.Align.Generic.Alignment instance (2 records of length 14, 109 SingleLetterAlphabet()) at a3c184c> 110 111 The hex string is the memory address of the object, see help(id). 112 This provides a simple way to visually distinguish alignments of 113 the same size. 114 """ 115 #A doctest for __repr__ would be nice, but __class__ comes out differently 116 #if run via the __main__ trick. 117 return "<%s instance (%i records of length %i, %s) at %x>" % \ 118 (self.__class__, len(self._records), 119 self.get_alignment_length(), repr(self._alphabet), id(self))
120 #This version is useful for doing eval(repr(alignment)), 121 #but it can be VERY long: 122 #return "%s(%s, %s)" \ 123 # % (self.__class__, repr(self._records), repr(self._alphabet)) 124
125 - def format(self, format):
126 """Returns the alignment as a string in the specified file format. 127 128 The format should be a lower case string supported as an output 129 format by Bio.AlignIO (such as "fasta", "clustal", "phylip", 130 "stockholm", etc), which is used to turn the alignment into a 131 string. 132 133 e.g. 134 >>> from Bio.Alphabet import IUPAC, Gapped 135 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 136 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 137 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 138 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 139 >>> print align.format("fasta") 140 >Alpha 141 ACTGCTAGCTAG 142 >Beta 143 ACT-CTAGCTAG 144 >Gamma 145 ACTGCTAGATAG 146 <BLANKLINE> 147 >>> print align.format("phylip") 148 3 12 149 Alpha ACTGCTAGCT AG 150 Beta ACT-CTAGCT AG 151 Gamma ACTGCTAGAT AG 152 <BLANKLINE> 153 154 For Python 2.6, 3.0 or later see also the built in format() function. 155 """ 156 #See also the __format__ added for Python 2.6 / 3.0, PEP 3101 157 #See also the SeqRecord class and its format() method using Bio.SeqIO 158 return self.__format__(format)
159 160
161 - def __format__(self, format_spec):
162 """Returns the alignment as a string in the specified file format. 163 164 This method supports the python format() function added in 165 Python 2.6/3.0. The format_spec should be a lower case 166 string supported by Bio.AlignIO as an output file format. 167 See also the alignment's format() method.""" 168 if format_spec: 169 from StringIO import StringIO 170 from Bio import AlignIO 171 handle = StringIO() 172 AlignIO.write([self], handle, format_spec) 173 return handle.getvalue() 174 else: 175 #Follow python convention and default to using __str__ 176 return str(self)
177
178 - def get_all_seqs(self):
179 """Return all of the sequences involved in the alignment (DEPRECATED). 180 181 The return value is a list of SeqRecord objects. 182 183 This method is deprecated, as the Alignment object itself now offers 184 much of the functionality of a list of SeqRecord objects (e.g. 185 iteration or slicing to create a sub-alignment). Instead use the 186 Python builtin function list, i.e. my_list = list(my_align) 187 """ 188 import warnings 189 import Bio 190 warnings.warn("This method is deprecated, since the alignment object" 191 "now acts more like a list. Instead of calling " 192 "align.get_all_seqs() you can use list(align)", 193 Bio.BiopythonDeprecationWarning) 194 return self._records
195
196 - def __iter__(self):
197 """Iterate over alignment rows as SeqRecord objects. 198 199 e.g. 200 >>> from Bio.Alphabet import IUPAC, Gapped 201 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 202 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 203 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 204 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 205 >>> for record in align: 206 ... print record.id 207 ... print record.seq 208 Alpha 209 ACTGCTAGCTAG 210 Beta 211 ACT-CTAGCTAG 212 Gamma 213 ACTGCTAGATAG 214 """ 215 return iter(self._records)
216
217 - def get_seq_by_num(self, number):
218 """Retrieve a sequence by row number (OBSOLETE). 219 220 Returns: 221 o A Seq object for the requested sequence. 222 223 Raises: 224 o IndexError - If the specified number is out of range. 225 226 NOTE: This is a legacy method. In new code where you need to access 227 the rows of the alignment (i.e. the sequences) consider iterating 228 over them or accessing them as SeqRecord objects. e.g. 229 230 for record in alignment: 231 print record.id 232 print record.seq 233 first_record = alignment[0] 234 last_record = alignment[-1] 235 """ 236 import warnings 237 warnings.warn("This is a legacy method. In new code where you need to access the rows of the alignment (i.e. the sequences) consider iterating over them or accessing them as SeqRecord objects.", PendingDeprecationWarning) 238 return self._records[number].seq
239
240 - def __len__(self):
241 """Returns the number of sequences in the alignment. 242 243 Use len(alignment) to get the number of sequences (i.e. the number of 244 rows), and alignment.get_alignment_length() to get the length of the 245 longest sequence (i.e. the number of columns). 246 247 This is easy to remember if you think of the alignment as being like a 248 list of SeqRecord objects. 249 """ 250 return len(self._records)
251
252 - def get_alignment_length(self):
253 """Return the maximum length of the alignment. 254 255 All objects in the alignment should (hopefully) have the same 256 length. This function will go through and find this length 257 by finding the maximum length of sequences in the alignment. 258 259 >>> from Bio.Alphabet import IUPAC, Gapped 260 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 261 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 262 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 263 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 264 >>> align.get_alignment_length() 265 12 266 267 If you want to know the number of sequences in the alignment, 268 use len(align) instead: 269 270 >>> len(align) 271 3 272 273 """ 274 max_length = 0 275 276 for record in self._records: 277 if len(record.seq) > max_length: 278 max_length = len(record.seq) 279 280 return max_length
281
282 - def add_sequence(self, descriptor, sequence, start = None, end = None, 283 weight = 1.0):
284 """Add a sequence to the alignment. 285 286 This doesn't do any kind of alignment, it just adds in the sequence 287 object, which is assumed to be prealigned with the existing 288 sequences. 289 290 Arguments: 291 o descriptor - The descriptive id of the sequence being added. 292 This will be used as the resulting SeqRecord's 293 .id property (and, for historical compatibility, 294 also the .description property) 295 o sequence - A string with sequence info. 296 o start - You can explicitly set the start point of the sequence. 297 This is useful (at least) for BLAST alignments, which can just 298 be partial alignments of sequences. 299 o end - Specify the end of the sequence, which is important 300 for the same reason as the start. 301 o weight - The weight to place on the sequence in the alignment. 302 By default, all sequences have the same weight. (0.0 => no weight, 303 1.0 => highest weight) 304 """ 305 new_seq = Seq(sequence, self._alphabet) 306 307 #We are now effectively using the SeqRecord's .id as 308 #the primary identifier (e.g. in Bio.SeqIO) so we should 309 #populate it with the descriptor. 310 #For backwards compatibility, also store this in the 311 #SeqRecord's description property. 312 new_record = SeqRecord(new_seq, 313 id = descriptor, 314 description = descriptor) 315 316 # hack! We really need to work out how to deal with annotations 317 # and features in biopython. Right now, I'll just use the 318 # generic annotations dictionary we've got to store the start 319 # and end, but we should think up something better. I don't know 320 # if I'm really a big fan of the LocatableSeq thing they've got 321 # in BioPerl, but I'm not positive what the best thing to do on 322 # this is... 323 if start: 324 new_record.annotations['start'] = start 325 if end: 326 new_record.annotations['end'] = end 327 328 # another hack to add weight information to the sequence 329 new_record.annotations['weight'] = weight 330 331 self._records.append(new_record)
332
333 - def get_column(self,col):
334 """Returns a string containing a given column. 335 336 e.g. 337 >>> from Bio.Alphabet import IUPAC, Gapped 338 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 339 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 340 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 341 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 342 >>> align.get_column(0) 343 'AAA' 344 >>> align.get_column(3) 345 'G-G' 346 """ 347 #TODO - Support negative indices? 348 col_str = '' 349 assert col >= 0 and col <= self.get_alignment_length() 350 for rec in self._records: 351 col_str += rec.seq[col] 352 return col_str
353
354 - def __getitem__(self, index):
355 """Access part of the alignment. 356 357 We'll use the following example alignment here for illustration: 358 359 >>> from Bio.Alphabet import IUPAC, Gapped 360 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 361 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 362 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 363 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 364 >>> align.add_sequence("Delta", "ACTGCTTGCTAG") 365 >>> align.add_sequence("Epsilon","ACTGCTTGATAG") 366 367 You can access a row of the alignment as a SeqRecord using an integer 368 index (think of the alignment as a list of SeqRecord objects here): 369 370 >>> first_record = align[0] 371 >>> print first_record.id, first_record.seq 372 Alpha ACTGCTAGCTAG 373 >>> last_record = align[-1] 374 >>> print last_record.id, last_record.seq 375 Epsilon ACTGCTTGATAG 376 377 You can also access use python's slice notation to create a sub-alignment 378 containing only some of the SeqRecord objects: 379 380 >>> sub_alignment = align[2:5] 381 >>> print sub_alignment 382 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 383 ACTGCTAGATAG Gamma 384 ACTGCTTGCTAG Delta 385 ACTGCTTGATAG Epsilon 386 387 This includes support for a step, i.e. align[start:end:step], which 388 can be used to select every second sequence: 389 390 >>> sub_alignment = align[::2] 391 >>> print sub_alignment 392 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 393 ACTGCTAGCTAG Alpha 394 ACTGCTAGATAG Gamma 395 ACTGCTTGATAG Epsilon 396 397 Or to get a copy of the alignment with the rows in reverse order: 398 399 >>> rev_alignment = align[::-1] 400 >>> print rev_alignment 401 Gapped(IUPACUnambiguousDNA(), '-') alignment with 5 rows and 12 columns 402 ACTGCTTGATAG Epsilon 403 ACTGCTTGCTAG Delta 404 ACTGCTAGATAG Gamma 405 ACT-CTAGCTAG Beta 406 ACTGCTAGCTAG Alpha 407 408 Right now, these are the ONLY indexing operations supported. The use of 409 a second column based index is under discussion for a future update. 410 """ 411 if isinstance(index, int): 412 #e.g. result = align[x] 413 #Return a SeqRecord 414 return self._records[index] 415 elif isinstance(index, slice): 416 #e.g. sub_aling = align[i:j:k] 417 #Return a new Alignment using only the specified records. 418 #TODO - See Bug 2554 for changing the __init__ method 419 #to allow us to do this more cleanly. 420 sub_align = Alignment(self._alphabet) 421 sub_align._records = self._records[index] 422 return sub_align 423 elif len(index)==2: 424 raise TypeError("Row and Column indexing is not currently supported,"\ 425 +"but may be in future.") 426 else: 427 raise TypeError("Invalid index type.")
428
429 -def _test():
430 """Run the Bio.Align.Generic module's doctests.""" 431 print "Running doctests..." 432 import doctest 433 doctest.testmod() 434 print "Done"
435 436 if __name__ == "__main__": 437 _test() 438