1
2
3
4
5
6 """
7 Bio.AlignIO support for the "clustal" output from CLUSTAL W and other tools.
8
9 You are expected to use this module via the Bio.AlignIO functions (or the
10 Bio.SeqIO functions if you want to work directly with the gapped sequences).
11 """
12
13 from Bio.Seq import Seq
14 from Bio.SeqRecord import SeqRecord
15 from Bio.Align import MultipleSeqAlignment
16 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
17
19 """Clustalw alignment writer."""
21 """Use this to write (another) single alignment to an open file."""
22
23 if len(alignment) == 0:
24 raise ValueError("Must have at least one sequence")
25 if alignment.get_alignment_length() == 0:
26
27 raise ValueError("Non-empty sequences are required")
28
29
30 try:
31 version = str(alignment._version)
32 except AttributeError:
33 version = ""
34 if not version:
35 version = '1.81'
36 if version.startswith("2."):
37
38 output = "CLUSTAL %s multiple sequence alignment\n\n\n" % version
39 else:
40
41 output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % version
42
43 cur_char = 0
44 max_length = len(alignment[0])
45
46 if max_length <= 0:
47 raise ValueError("Non-empty sequences are required")
48
49
50 while cur_char != max_length:
51
52
53 if (cur_char + 50) > max_length:
54 show_num = max_length - cur_char
55 else:
56 show_num = 50
57
58
59
60
61 for record in alignment:
62
63
64
65 line = record.id[0:30].replace(" ","_").ljust(36)
66 line += record.seq[cur_char:(cur_char + show_num)].tostring()
67 output += line + "\n"
68
69
70
71 if hasattr(alignment, "_star_info") and alignment._star_info != '':
72 output += (" " * 36) + \
73 alignment._star_info[cur_char:(cur_char + show_num)] + "\n"
74
75 output += "\n"
76 cur_char += show_num
77
78
79 self.handle.write(output + "\n")
80
82 """Clustalw alignment iterator."""
83
85 handle = self.handle
86 try:
87
88
89 line = self._header
90 del self._header
91 except AttributeError:
92 line = handle.readline()
93 if not line:
94 raise StopIteration
95
96
97 known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE']
98 if line.strip().split()[0] not in known_headers:
99 raise ValueError("%s is not a known CLUSTAL header: %s" % \
100 (line.strip().split()[0],
101 ", ".join(known_headers)))
102
103
104 version = None
105 for word in line.split():
106 if word[0]=='(' and word[-1]==')':
107 word = word[1:-1]
108 if word[0] in '0123456789':
109 version = word
110 break
111
112
113 line = handle.readline()
114 while line.strip() == "":
115 line = handle.readline()
116
117
118
119
120 ids = []
121 seqs = []
122 consensus = ""
123 seq_cols = None
124
125
126 while True:
127 if line[0] != " " and line.strip() != "":
128
129 fields = line.rstrip().split()
130
131
132
133 if len(fields) < 2 or len(fields) > 3:
134 raise ValueError("Could not parse line:\n%s" % line)
135
136 ids.append(fields[0])
137 seqs.append(fields[1])
138
139
140 if seq_cols is None:
141 start = len(fields[0]) + line[len(fields[0]):].find(fields[1])
142 end = start + len(fields[1])
143 seq_cols = slice(start, end)
144 del start, end
145 assert fields[1] == line[seq_cols]
146
147 if len(fields) == 3:
148
149 try:
150 letters = int(fields[2])
151 except ValueError:
152 raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
153 if len(fields[1].replace("-","")) != letters:
154 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)
155 elif line[0] == " ":
156
157 assert len(ids) == len(seqs)
158 assert len(ids) > 0
159 assert seq_cols is not None
160 consensus = line[seq_cols]
161 assert not line[:seq_cols.start].strip()
162 assert not line[seq_cols.stop:].strip()
163
164 line = handle.readline()
165 assert line.strip() == ""
166 break
167 else:
168
169 break
170 line = handle.readline()
171 if not line : break
172
173 assert line.strip() == ""
174 assert seq_cols is not None
175
176
177 for s in seqs:
178 assert len(s) == len(seqs[0])
179 if consensus:
180 assert len(consensus) == len(seqs[0])
181
182
183 done = False
184 while not done:
185
186
187
188 while (not line) or line.strip() == "":
189 line = handle.readline()
190 if not line : break
191 if not line : break
192
193 if line.split(None,1)[0] in known_headers:
194
195 done = True
196 self._header = line
197 break
198
199 for i in range(len(ids)):
200 assert line[0] != " ", "Unexpected line:\n%s" % repr(line)
201 fields = line.rstrip().split()
202
203
204
205 if len(fields) < 2 or len(fields) > 3:
206 raise ValueError("Could not parse line:\n%s" % repr(line))
207
208 if fields[0] != ids[i]:
209 raise ValueError("Identifiers out of order? Got '%s' but expected '%s'" \
210 % (fields[0], ids[i]))
211
212 if fields[1] != line[seq_cols]:
213 start = len(fields[0]) + line[len(fields[0]):].find(fields[1])
214 assert start == seq_cols.start, 'Old location %s -> %i:XX' % (seq_cols, start)
215 end = start + len(fields[1])
216 seq_cols = slice(start, end)
217 del start, end
218
219
220 seqs[i] += fields[1]
221 assert len(seqs[i]) == len(seqs[0])
222
223 if len(fields) == 3:
224
225 try:
226 letters = int(fields[2])
227 except ValueError:
228 raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
229 if len(seqs[i].replace("-","")) != letters:
230 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)
231
232
233 line = handle.readline()
234
235 if consensus:
236 assert line[0] == " "
237 assert seq_cols is not None
238 consensus += line[seq_cols]
239 assert len(consensus) == len(seqs[0])
240 assert not line[:seq_cols.start].strip()
241 assert not line[seq_cols.stop:].strip()
242
243 line = handle.readline()
244
245
246 assert len(ids) == len(seqs)
247 if len(seqs) == 0 or len(seqs[0]) == 0:
248 raise StopIteration
249
250 if self.records_per_alignment is not None \
251 and self.records_per_alignment != len(ids):
252 raise ValueError("Found %i records in this alignment, told to expect %i" \
253 % (len(ids), self.records_per_alignment))
254
255 records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i) \
256 for (i,s) in zip(ids, seqs))
257 alignment = MultipleSeqAlignment(records, self.alphabet)
258
259
260 if version:
261 alignment._version = version
262 if consensus:
263 alignment_length = len(seqs[0])
264 assert len(consensus) == alignment_length, \
265 "Alignment length is %i, consensus length is %i, '%s'" \
266 % (alignment_length, len(consensus), consensus)
267 alignment._star_info = consensus
268 return alignment
269
270 if __name__ == "__main__":
271 print "Running a quick self-test"
272
273
274
275 aln_example1 = \
276 """CLUSTAL W (1.81) multiple sequence alignment
277
278
279 gi|4959044|gb|AAD34209.1|AF069 MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN 50
280 gi|671626|emb|CAA85685.1| ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR 41
281 * *: :: :. :* : :. : . :* :: .
282
283 gi|4959044|gb|AAD34209.1|AF069 LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW 100
284 gi|671626|emb|CAA85685.1| VTPQPG-----------------VPPEEAGAAVAAESSTGT--------- 65
285 : ** **:... *.*** ..
286
287 gi|4959044|gb|AAD34209.1|AF069 LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT 150
288 gi|671626|emb|CAA85685.1| WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------ 92
289 .:* * *: .* :* : :* .*
290
291 gi|4959044|gb|AAD34209.1|AF069 SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE 200
292 gi|671626|emb|CAA85685.1| -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP 141
293 *::. . .:: :*..* :* .* .. . : . :
294
295 gi|4959044|gb|AAD34209.1|AF069 VPTTRAQRRA 210
296 gi|671626|emb|CAA85685.1| VAYVKTFQGP 151
297 *. .:: : .
298
299 """
300
301
302
303
304 aln_example2 = \
305 """CLUSTAL X (1.83) multiple sequence alignment
306
307
308 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG
309 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG
310 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG
311 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG
312 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG
313 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA
314 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA
315 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
316 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
317 : . : :.
318
319 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL
320 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE
321 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG
322 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG
323 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS
324 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA
325 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG
326 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
327 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
328 ** .: *::::. : :. . ..:
329
330 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI
331 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI
332 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV
333 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI
334 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL
335 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV
336 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII
337 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
338 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
339 *.: . * . * *: :
340
341 """
342
343 from StringIO import StringIO
344
345 alignments = list(ClustalIterator(StringIO(aln_example1)))
346 assert 1 == len(alignments)
347 assert alignments[0]._version == "1.81"
348 alignment = alignments[0]
349 assert 2 == len(alignment)
350 assert alignment[0].id == "gi|4959044|gb|AAD34209.1|AF069"
351 assert alignment[1].id == "gi|671626|emb|CAA85685.1|"
352 assert alignment[0].seq.tostring() == \
353 "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN" + \
354 "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW" + \
355 "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT" + \
356 "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE" + \
357 "VPTTRAQRRA"
358
359 alignments = list(ClustalIterator(StringIO(aln_example2)))
360 assert 1 == len(alignments)
361 assert alignments[0]._version == "1.83"
362 alignment = alignments[0]
363 assert 9 == len(alignment)
364 assert alignment[-1].id == "HISJ_E_COLI"
365 assert alignment[-1].seq.tostring() == \
366 "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG" + \
367 "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS" + \
368 "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV"
369
370 for alignment in ClustalIterator(StringIO(aln_example2 + aln_example1)):
371 print "Alignment with %i records of length %i" \
372 % (len(alignment),
373 alignment.get_alignment_length())
374
375 print "Checking empty file..."
376 assert 0 == len(list(ClustalIterator(StringIO(""))))
377
378 print "Checking write/read..."
379 alignments = list(ClustalIterator(StringIO(aln_example1))) \
380 + list(ClustalIterator(StringIO(aln_example2)))*2
381 handle = StringIO()
382 ClustalWriter(handle).write_file(alignments)
383 handle.seek(0)
384 for i,a in enumerate(ClustalIterator(handle)):
385 assert a.get_alignment_length() == alignments[i].get_alignment_length()
386 handle.seek(0)
387
388 print "Testing write/read when there is only one sequence..."
389 alignment = alignment[0:1]
390 handle = StringIO()
391 ClustalWriter(handle).write_file([alignment])
392 handle.seek(0)
393 for i,a in enumerate(ClustalIterator(handle)):
394 assert a.get_alignment_length() == alignment.get_alignment_length()
395 assert len(a) == 1
396
397 aln_example3 = \
398 """CLUSTAL 2.0.9 multiple sequence alignment
399
400
401 Test1seq ------------------------------------------------------------
402 AT3G20900.1-SEQ ATGAACAAAGTAGCGAGGAAGAACAAAACATCAGGTGAACAAAAAAAAAACTCAATCCAC
403 AT3G20900.1-CDS ------------------------------------------------------------
404
405
406 Test1seq -----AGTTACAATAACTGACGAAGCTAAGTAGGCTACTAATTAACGTCATCAACCTAAT
407 AT3G20900.1-SEQ ATCAAAGTTACAATAACTGACGAAGCTAAGTAGGCTAGAAATTAAAGTCATCAACCTAAT
408 AT3G20900.1-CDS ------------------------------------------------------------
409
410
411 Test1seq ACATAGCACTTAGAAAAAAGTGAAGTAAGAAAATATAAAATAATAAAAGGGTGGGTTATC
412 AT3G20900.1-SEQ ACATAGCACTTAGAAAAAAGTGAAGCAAGAAAATATAAAATAATAAAAGGGTGGGTTATC
413 AT3G20900.1-CDS ------------------------------------------------------------
414
415
416 Test1seq AATTGATAGTGTAAATCATCGTATTCCGGTGATATACCCTACCACAAAAACTCAAACCGA
417 AT3G20900.1-SEQ AATTGATAGTGTAAATCATAGTTGATTTTTGATATACCCTACCACAAAAACTCAAACCGA
418 AT3G20900.1-CDS ------------------------------------------------------------
419
420
421 Test1seq CTTGATTCAAATCATCTCAATAAATTAGCGCCAAAATAATGAAAAAAATAATAACAAACA
422 AT3G20900.1-SEQ CTTGATTCAAATCATCTCAAAAAACAAGCGCCAAAATAATGAAAAAAATAATAACAAAAA
423 AT3G20900.1-CDS ------------------------------------------------------------
424
425
426 Test1seq AAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT
427 AT3G20900.1-SEQ CAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT
428 AT3G20900.1-CDS ------------------------------------------------------------
429
430
431 Test1seq GTATTAACAAATCAAAGAGCTGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT
432 AT3G20900.1-SEQ GTATTAACAAATCAAAGAGATGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT
433 AT3G20900.1-CDS ------------------------------------------------------------
434
435
436 Test1seq CCTATATCAACGTAAACAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT
437 AT3G20900.1-SEQ CCTATATCAAAAAAAAAAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT
438 AT3G20900.1-CDS ------------------------------------------------------ATGAAC
439 *
440
441 Test1seq TCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT
442 AT3G20900.1-SEQ GCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT
443 AT3G20900.1-CDS AAAGTAGCGAGGAAGAACAAAACATC------AGCAAAGAAAACGATCTGTCTCCGTCGT
444 * *** ***** * * ** ****************************
445
446 Test1seq AACACACGGTCGCTAGAGAAACTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
447 AT3G20900.1-SEQ AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
448 AT3G20900.1-CDS AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
449 ******* ** * **** ***************************************
450
451 Test1seq GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCGTGGTGACGTCAGCACCGCT
452 AT3G20900.1-SEQ GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT
453 AT3G20900.1-CDS GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT
454 **************************************** *******************
455
456 Test1seq GCTGGGGATGGAGAGGGAACAGAGTT-
457 AT3G20900.1-SEQ GCTGGGGATGGAGAGGGAACAGAGTAG
458 AT3G20900.1-CDS GCTGGGGATGGAGAGGGAACAGAGTAG
459 *************************
460 """
461 alignments = list(ClustalIterator(StringIO(aln_example3)))
462 assert 1 == len(alignments)
463 assert alignments[0]._version == "2.0.9"
464
465 print "The End"
466