1
2
3
4
5
6 """Multiple sequence alignment input/output as alignment objects.
7
8 The Bio.AlignIO interface is deliberately very similar to Bio.SeqIO, and in
9 fact the two are connected internally. Both modules use the same set of file
10 format names (lower case strings). From the user's perspective, you can read
11 in a PHYLIP file containing one or more alignments using Bio.AlignIO, or you
12 can read in the sequences within these alignmenta using Bio.SeqIO.
13
14 Bio.AlignIO is also documented at U{http://biopython.org/wiki/AlignIO} and by
15 a whole chapter in our tutorial:
16 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
17 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
18
19 Input
20 =====
21 For the typical special case when your file or handle contains one and only
22 one alignment, use the function Bio.AlignIO.read(). This takes an input file
23 handle (or in recent versions of Biopython a filename as a string), format
24 string and optional number of sequences per alignment. It will return a single
25 MultipleSeqAlignment object (or raise an exception if there isn't just one
26 alignment):
27
28 >>> from Bio import AlignIO
29 >>> align = AlignIO.read("Phylip/interlaced.phy", "phylip")
30 >>> print align
31 SingleLetterAlphabet() alignment with 3 rows and 384 columns
32 -----MKVILLFVLAVFTVFVSS---------------RGIPPE...I-- CYS1_DICDI
33 MAHARVLLLALAVLATAAVAVASSSSFADSNPIRPVTDRAASTL...VAA ALEU_HORVU
34 ------MWATLPLLCAGAWLLGV--------PVCGAAELSVNSL...PLV CATH_HUMAN
35
36 For the general case, when the handle could contain any number of alignments,
37 use the function Bio.AlignIO.parse(...) which takes the same arguments, but
38 returns an iterator giving MultipleSeqAlignment objects (typically used in a
39 for loop). If you want random access to the alignments by number, turn this
40 into a list:
41
42 >>> from Bio import AlignIO
43 >>> alignments = list(AlignIO.parse("Emboss/needle.txt", "emboss"))
44 >>> print alignments[2]
45 SingleLetterAlphabet() alignment with 2 rows and 120 columns
46 -KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQALDIVTKER...--- ref_rec
47 LHIVVVDDDPGTCVYIESVFAELGHTCKSFVRPEAAEEYILTHP...HKE gi|94967506|receiver
48
49 Most alignment file formats can be concatenated so as to hold as many
50 different multiple sequence alignments as possible. One common example
51 is the output of the tool seqboot in the PHLYIP suite. Sometimes there
52 can be a file header and footer, as seen in the EMBOSS alignment output.
53
54 Output
55 ======
56 Use the function Bio.AlignIO.write(...), which takes a complete set of
57 Alignment objects (either as a list, or an iterator), an output file handle
58 (or filename in recent versions of Biopython) and of course the file format::
59
60 from Bio import AlignIO
61 alignments = ...
62 count = SeqIO.write(alignments, "example.faa", "fasta")
63
64 If using a handle make sure to close it to flush the data to the disk::
65
66 from Bio import AlignIO
67 alignments = ...
68 handle = open("example.faa", "w")
69 count = SeqIO.write(alignments, handle, "fasta")
70 handle.close()
71
72 In general, you are expected to call this function once (with all your
73 alignments) and then close the file handle. However, for file formats
74 like PHYLIP where multiple alignments are stored sequentially (with no file
75 header and footer), then multiple calls to the write function should work as
76 expected when using handles.
77
78 If you are using a filename, the repeated calls to the write functions will
79 overwrite the existing file each time.
80
81 Conversion
82 ==========
83 The Bio.AlignIO.convert(...) function allows an easy interface for simple
84 alignnment file format conversions. Additionally, it may use file format
85 specific optimisations so this should be the fastest way too.
86
87 In general however, you can combine the Bio.AlignIO.parse(...) function with
88 the Bio.AlignIO.write(...) function for sequence file conversion. Using
89 generator expressions provides a memory efficient way to perform filtering or
90 other extra operations as part of the process.
91
92 File Formats
93 ============
94 When specifying the file format, use lowercase strings. The same format
95 names are also used in Bio.SeqIO and include the following:
96
97 - clustal - Ouput from Clustal W or X, see also the module Bio.Clustalw
98 which can be used to run the command line tool from Biopython.
99 - emboss - EMBOSS tools' "pairs" and "simple" alignment formats.
100 - fasta - The generic sequence file format where each record starts with
101 an identifer line starting with a ">" character, followed by
102 lines of sequence.
103 - fasta-m10 - For the pairswise alignments output by Bill Pearson's FASTA
104 tools when used with the -m 10 command line option for machine
105 readable output.
106 - ig - The IntelliGenetics file format, apparently the same as the
107 MASE alignment format.
108 - nexus - Output from NEXUS, see also the module Bio.Nexus which can also
109 read any phylogenetic trees in these files.
110 - phylip - Used by the PHLIP tools.
111 - stockholm - A richly annotated alignment file format used by PFAM.
112
113 Note that while Bio.AlignIO can read all the above file formats, it cannot
114 write to all of them.
115
116 You can also use any file format supported by Bio.SeqIO, such as "fasta" or
117 "ig" (which are listed above), PROVIDED the sequences in your file are all the
118 same length.
119 """
120 __docformat__ = "epytext en"
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137 from StringIO import StringIO
138 from Bio.Seq import Seq
139 from Bio.SeqRecord import SeqRecord
140 from Bio.Align import MultipleSeqAlignment
141 from Bio.Align.Generic import Alignment
142 from Bio.Alphabet import Alphabet, AlphabetEncoder, _get_base_alphabet
143
144 import StockholmIO
145 import ClustalIO
146 import NexusIO
147 import PhylipIO
148 import EmbossIO
149 import FastaIO
150
151
152
153
154 _FormatToIterator = {
155 "clustal" : ClustalIO.ClustalIterator,
156 "emboss" : EmbossIO.EmbossIterator,
157 "fasta-m10" : FastaIO.FastaM10Iterator,
158 "nexus" : NexusIO.NexusIterator,
159 "phylip" : PhylipIO.PhylipIterator,
160 "stockholm" : StockholmIO.StockholmIterator,
161 }
162
163 _FormatToWriter = {
164
165 "nexus" : NexusIO.NexusWriter,
166 "phylip" : PhylipIO.PhylipWriter,
167 "stockholm" : StockholmIO.StockholmWriter,
168 "clustal" : ClustalIO.ClustalWriter,
169 }
170
171 -def write(alignments, handle, format):
172 """Write complete set of alignments to a file.
173
174 Arguments:
175 - alignments - A list (or iterator) of Alignment objects (ideally the
176 new MultipleSeqAlignment objects), or (if using Biopython
177 1.54 or later) a single alignment object.
178 - handle - File handle object to write to, or filename as string
179 (note older versions of Biopython only took a handle).
180 - format - lower case string describing the file format to write.
181
182 You should close the handle after calling this function.
183
184 Returns the number of alignments written (as an integer).
185 """
186 from Bio import SeqIO
187
188
189 if not isinstance(format, basestring):
190 raise TypeError("Need a string for the file format (lower case)")
191 if not format:
192 raise ValueError("Format required (lower case string)")
193 if format != format.lower():
194 raise ValueError("Format string '%s' should be lower case" % format)
195
196 if isinstance(alignments, Alignment):
197
198 alignments = [alignments]
199
200 if isinstance(handle, basestring):
201 handle = open(handle, "w")
202 handle_close = True
203 else:
204 handle_close = False
205
206
207 if format in _FormatToIterator:
208 writer_class = _FormatToWriter[format]
209 count = writer_class(handle).write_file(alignments)
210 elif format in SeqIO._FormatToWriter:
211
212
213 count = 0
214 for alignment in alignments:
215 if not isinstance(alignment, Alignment):
216 raise TypeError(\
217 "Expect a list or iterator of Alignment objects.")
218 SeqIO.write(alignment, handle, format)
219 count += 1
220 elif format in _FormatToIterator or format in SeqIO._FormatToIterator:
221 raise ValueError("Reading format '%s' is supported, but not writing" \
222 % format)
223 else:
224 raise ValueError("Unknown format '%s'" % format)
225
226 assert isinstance(count, int), "Internal error - the underlying %s " \
227 "writer should have returned the alignment count, not %s" \
228 % (format, repr(count))
229
230 if handle_close:
231 handle.close()
232
233 return count
234
235
237 """Uses Bio.SeqIO to create an MultipleSeqAlignment iterator (PRIVATE).
238
239 Arguments:
240 - handle - handle to the file.
241 - format - string describing the file format.
242 - alphabet - optional Alphabet object, useful when the sequence type
243 cannot be automatically inferred from the file itself
244 (e.g. fasta, phylip, clustal)
245 - seq_count - Optional integer, number of sequences expected in each
246 alignment. Recommended for fasta format files.
247
248 If count is omitted (default) then all the sequences in the file are
249 combined into a single MultipleSeqAlignment.
250 """
251 from Bio import SeqIO
252 assert format in SeqIO._FormatToIterator
253
254 if seq_count:
255
256 seq_record_iterator = SeqIO.parse(handle, format, alphabet)
257
258 records = []
259 for record in seq_record_iterator:
260 records.append(record)
261 if len(records) == seq_count:
262 yield MultipleSeqAlignment(records, alphabet)
263 records = []
264 if len(records) > 0:
265 raise ValueError("Check seq_count argument, not enough sequences?")
266 else:
267
268
269 records = list(SeqIO.parse(handle, format, alphabet))
270 if records:
271 yield MultipleSeqAlignment(records, alphabet)
272 raise StopIteration
273
293
294 -def parse(handle, format, seq_count=None, alphabet=None):
295 """Iterate over an alignment file as MultipleSeqAlignment objects.
296
297 Arguments:
298 - handle - handle to the file, or the filename as a string
299 (note older verions of Biopython only took a handle).
300 - format - string describing the file format.
301 - alphabet - optional Alphabet object, useful when the sequence type
302 cannot be automatically inferred from the file itself
303 (e.g. fasta, phylip, clustal)
304 - seq_count - Optional integer, number of sequences expected in each
305 alignment. Recommended for fasta format files.
306
307 If you have the file name in a string 'filename', use:
308
309 >>> from Bio import AlignIO
310 >>> filename = "Emboss/needle.txt"
311 >>> format = "emboss"
312 >>> for alignment in AlignIO.parse(filename, format):
313 ... print "Alignment of length", alignment.get_alignment_length()
314 Alignment of length 124
315 Alignment of length 119
316 Alignment of length 120
317 Alignment of length 118
318 Alignment of length 125
319
320 If you have a string 'data' containing the file contents, use:
321
322 from Bio import AlignIO
323 from StringIO import StringIO
324 my_iterator = AlignIO.parse(StringIO(data), format)
325
326 Use the Bio.AlignIO.read() function when you expect a single record only.
327 """
328 from Bio import SeqIO
329
330 handle_close = False
331
332 if isinstance(handle, basestring):
333 handle = open(handle, "rU")
334
335 handle_close = True
336
337
338 if not isinstance(format, basestring):
339 raise TypeError("Need a string for the file format (lower case)")
340 if not format:
341 raise ValueError("Format required (lower case string)")
342 if format != format.lower():
343 raise ValueError("Format string '%s' should be lower case" % format)
344 if alphabet is not None and not (isinstance(alphabet, Alphabet) or \
345 isinstance(alphabet, AlphabetEncoder)):
346 raise ValueError("Invalid alphabet, %s" % repr(alphabet))
347 if seq_count is not None and not isinstance(seq_count, int):
348 raise TypeError("Need integer for seq_count (sequences per alignment)")
349
350
351 if format in _FormatToIterator:
352 iterator_generator = _FormatToIterator[format]
353 if alphabet is None :
354 i = iterator_generator(handle, seq_count)
355 else:
356 try:
357
358 i = iterator_generator(handle, seq_count, alphabet=alphabet)
359 except TypeError:
360
361 i = _force_alphabet(iterator_generator(handle, seq_count),
362 alphabet)
363
364 elif format in SeqIO._FormatToIterator:
365
366 i = _SeqIO_to_alignment_iterator(handle, format,
367 alphabet=alphabet,
368 seq_count=seq_count)
369 else:
370 raise ValueError("Unknown format '%s'" % format)
371
372
373 for a in i:
374 yield a
375 if handle_close:
376 handle.close()
377
378 -def read(handle, format, seq_count=None, alphabet=None):
379 """Turns an alignment file into a single MultipleSeqAlignment object.
380
381 Arguments:
382 - handle - handle to the file, or the filename as a string
383 (note older verions of Biopython only took a handle).
384 - format - string describing the file format.
385 - alphabet - optional Alphabet object, useful when the sequence type
386 cannot be automatically inferred from the file itself
387 (e.g. fasta, phylip, clustal)
388 - seq_count - Optional integer, number of sequences expected in each
389 alignment. Recommended for fasta format files.
390
391 If the handle contains no alignments, or more than one alignment,
392 an exception is raised. For example, using a PFAM/Stockholm file
393 containing one alignment:
394
395 >>> from Bio import AlignIO
396 >>> filename = "Clustalw/protein.aln"
397 >>> format = "clustal"
398 >>> alignment = AlignIO.read(filename, format)
399 >>> print "Alignment of length", alignment.get_alignment_length()
400 Alignment of length 411
401
402 If however you want the first alignment from a file containing
403 multiple alignments this function would raise an exception.
404
405 >>> from Bio import AlignIO
406 >>> filename = "Emboss/needle.txt"
407 >>> format = "emboss"
408 >>> alignment = AlignIO.read(filename, format)
409 Traceback (most recent call last):
410 ...
411 ValueError: More than one record found in handle
412
413 Instead use:
414
415 >>> from Bio import AlignIO
416 >>> filename = "Emboss/needle.txt"
417 >>> format = "emboss"
418 >>> alignment = AlignIO.parse(filename, format).next()
419 >>> print "First alignment has length", alignment.get_alignment_length()
420 First alignment has length 124
421
422 You must use the Bio.AlignIO.parse() function if you want to read multiple
423 records from the handle.
424 """
425 iterator = parse(handle, format, seq_count, alphabet)
426 try:
427 first = iterator.next()
428 except StopIteration:
429 first = None
430 if first is None:
431 raise ValueError("No records found in handle")
432 try:
433 second = iterator.next()
434 except StopIteration:
435 second = None
436 if second is not None:
437 raise ValueError("More than one record found in handle")
438 if seq_count:
439 assert len(first)==seq_count
440 return first
441
442 -def convert(in_file, in_format, out_file, out_format, alphabet=None):
443 """Convert between two alignment files, returns number of alignments.
444
445 - in_file - an input handle or filename
446 - in_format - input file format, lower case string
447 - output - an output handle or filename
448 - out_file - output file format, lower case string
449 - alphabet - optional alphabet to assume
450
451 NOTE - If you provide an output filename, it will be opened which will
452 overwrite any existing file without warning. This may happen if even the
453 conversion is aborted (e.g. an invalid out_format name is given).
454 """
455
456
457 if isinstance(in_file, basestring):
458 in_handle = open(in_file, "rU")
459 in_close = True
460 else:
461 in_handle = in_file
462 in_close = False
463
464 alignments = parse(in_handle, in_format, None, alphabet)
465
466 if isinstance(out_file, basestring):
467 out_handle = open(out_file, "w")
468 out_close = True
469 else:
470 out_handle = out_file
471 out_close = False
472
473
474 count = write(alignments, out_handle, out_format)
475
476 if in_close:
477 in_handle.close()
478 if out_close:
479 out_handle.close()
480 return count
481
483 """Run the Bio.AlignIO module's doctests.
484
485 This will try and locate the unit tests directory, and run the doctests
486 from there in order that the relative paths used in the examples work.
487 """
488 import doctest
489 import os
490 if os.path.isdir(os.path.join("..", "..", "Tests")):
491 print "Runing doctests..."
492 cur_dir = os.path.abspath(os.curdir)
493 os.chdir(os.path.join("..", "..", "Tests"))
494 doctest.testmod()
495 os.chdir(cur_dir)
496 del cur_dir
497 print "Done"
498 elif os.path.isdir(os.path.join("Tests", "Fasta")):
499 print "Runing doctests..."
500 cur_dir = os.path.abspath(os.curdir)
501 os.chdir(os.path.join("Tests"))
502 doctest.testmod()
503 os.chdir(cur_dir)
504 del cur_dir
505 print "Done"
506
507 if __name__ == "__main__":
508 _test()
509