Package Bio :: Package AlignIO :: Module PhylipIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.PhylipIO

  1  # Copyright 2006-2010 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  AlignIO support for the "phylip" format used in Joe Felsenstein's PHYLIP tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10   
 11  Note 
 12  ==== 
 13  In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003) 
 14  a dot/period (".") in a sequence is interpreted as meaning the same 
 15  character as in the first sequence.  The PHYLIP 3.6 documentation says: 
 16   
 17     "a period was also previously allowed but it is no longer allowed, 
 18     because it sometimes is used in different senses in other programs" 
 19   
 20  At the time of writing, we do nothing special with a dot/period. 
 21  """ 
 22   
 23  from Bio.Seq import Seq 
 24  from Bio.SeqRecord import SeqRecord   
 25  from Bio.Alphabet import single_letter_alphabet 
 26  from Bio.Align import MultipleSeqAlignment 
 27  from Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 28   
29 -class PhylipWriter(SequentialAlignmentWriter):
30 """Phylip alignment writer."""
31 - def write_alignment(self, alignment):
32 """Use this to write (another) single alignment to an open file. 33 34 This code will write interlaced alignments (when the sequences are 35 longer than 50 characters). 36 37 Note that record identifiers are strictly truncated at 10 characters. 38 39 For more information on the file format, please see: 40 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 41 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 42 """ 43 truncate=10 44 handle = self.handle 45 46 if len(alignment)==0: 47 raise ValueError("Must have at least one sequence") 48 length_of_seqs = alignment.get_alignment_length() 49 for record in alignment: 50 if length_of_seqs != len(record.seq): 51 raise ValueError("Sequences must all be the same length") 52 if length_of_seqs <= 0: 53 raise ValueError("Non-empty sequences are required") 54 55 if len(alignment) > len(set([r.id[:truncate] for r in alignment])): 56 raise ValueError("Repeated identifier, possibly due to truncation") 57 58 59 # From experimentation, the use of tabs is not understood by the 60 # EMBOSS suite. The nature of the expected white space is not 61 # defined in the PHYLIP documentation, simply "These are in free 62 # format, separated by blanks". We'll use spaces to keep EMBOSS 63 # happy. 64 handle.write(" %i %s\n" % (len(alignment), length_of_seqs)) 65 block=0 66 while True: 67 for record in alignment: 68 if block==0: 69 #Write name (truncated/padded to 10 characters) 70 """ 71 Quoting the PHYLIP version 3.6 documentation: 72 73 The name should be ten characters in length, filled out to 74 the full ten characters by blanks if shorter. Any printable 75 ASCII/ISO character is allowed in the name, except for 76 parentheses ("(" and ")"), square brackets ("[" and "]"), 77 colon (":"), semicolon (";") and comma (","). If you forget 78 to extend the names to ten characters in length by blanks, 79 the program [i.e. PHYLIP] will get out of synchronization 80 with the contents of the data file, and an error message will 81 result. 82 83 Note that Tab characters count as only one character in the 84 species names. Their inclusion can cause trouble. 85 """ 86 name = record.id.strip() 87 #Either remove the banned characters, or map them to something 88 #else like an underscore "_" or pipe "|" character... 89 for char in "[](),": 90 name = name.replace(char,"") 91 for char in ":;": 92 name = name.replace(char,"|") 93 94 #Now truncate and right pad to expected length. 95 handle.write(name[:truncate].ljust(truncate)) 96 else: 97 #write 10 space indent 98 handle.write(" "*truncate) 99 #Write five chunks of ten letters per line... 100 for chunk in range(0,5): 101 i = block*50 + chunk*10 102 seq_segment = record.seq.tostring()[i:i+10] 103 #TODO - Force any gaps to be '-' character? Look at the alphabet... 104 #TODO - How to cope with '?' or '.' in the sequence? 105 handle.write(" %s" % seq_segment) 106 if i+10 > length_of_seqs : break 107 handle.write("\n") 108 block=block+1 109 if block*50 > length_of_seqs : break 110 handle.write("\n")
111
112 -class PhylipIterator(AlignmentIterator):
113 """Reads a Phylip alignment file returning a MultipleSeqAlignment iterator. 114 115 Record identifiers are limited to at most 10 characters. 116 117 It only copes with interlaced phylip files! Sequential files won't work 118 where the sequences are split over multiple lines. 119 120 For more information on the file format, please see: 121 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 122 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 123 """ 124
125 - def _is_header(self, line):
126 line = line.strip() 127 parts = filter(None, line.split()) 128 if len(parts)!=2: 129 return False # First line should have two integers 130 try: 131 number_of_seqs = int(parts[0]) 132 length_of_seqs = int(parts[1]) 133 return True 134 except ValueError: 135 return False # First line should have two integers
136
137 - def next(self):
138 handle = self.handle 139 140 try: 141 #Header we saved from when we were parsing 142 #the previous alignment. 143 line = self._header 144 del self._header 145 except AttributeError: 146 line = handle.readline() 147 148 if not line: 149 raise StopIteration 150 line = line.strip() 151 parts = filter(None, line.split()) 152 if len(parts)!=2: 153 raise ValueError("First line should have two integers") 154 try: 155 number_of_seqs = int(parts[0]) 156 length_of_seqs = int(parts[1]) 157 except ValueError: 158 raise ValueError("First line should have two integers") 159 160 assert self._is_header(line) 161 162 if self.records_per_alignment is not None \ 163 and self.records_per_alignment != number_of_seqs: 164 raise ValueError("Found %i records in this alignment, told to expect %i" \ 165 % (number_of_seqs, self.records_per_alignment)) 166 167 ids = [] 168 seqs = [] 169 170 #Expects STRICT truncation/padding to 10 characters 171 #Does not require any white space between name and seq. 172 for i in range(0,number_of_seqs): 173 line = handle.readline().rstrip() 174 ids.append(line[:10].strip()) #first ten characters 175 seqs.append([line[10:].strip().replace(" ","")]) 176 177 #Look for further blocks 178 line="" 179 while True: 180 #Skip any blank lines between blocks... 181 while ""==line.strip(): 182 line = handle.readline() 183 if not line : break #end of file 184 if not line : break #end of file 185 186 if self._is_header(line): 187 #Looks like the start of a concatenated alignment 188 self._header = line 189 break 190 191 #print "New block..." 192 for i in range(0,number_of_seqs): 193 seqs[i].append(line.strip().replace(" ","")) 194 line = handle.readline() 195 if (not line) and i+1 < number_of_seqs: 196 raise ValueError("End of file mid-block") 197 if not line : break #end of file 198 199 records = (SeqRecord(Seq("".join(s), self.alphabet), \ 200 id=i, name=i, description=i) \ 201 for (i,s) in zip(ids, seqs)) 202 return MultipleSeqAlignment(records, self.alphabet)
203 204 if __name__=="__main__": 205 print "Running short mini-test" 206 207 phylip_text=""" 8 286 208 V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG 209 B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG 210 B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG 211 YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG 212 FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG 213 E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA 214 Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA 215 HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG 216 217 MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL 218 ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE 219 TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG 220 TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG 221 LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS 222 TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA 223 MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG 224 TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS 225 226 LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE 227 LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD 228 LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA 229 LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE 230 LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD 231 LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK 232 LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA 233 LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE 234 235 DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA 236 DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA 237 DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM 238 DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA 239 DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA 240 DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA 241 DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA 242 SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA 243 244 FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK 245 YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK 246 TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE 247 TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA 248 ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED 249 VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE 250 AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA 251 AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE 252 253 LQAEVNKALA EMRADGTVEK ISVKWFGADI TK---- 254 LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH 255 VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK---- 256 LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ---- 257 LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK---- 258 LRDKVNGALK TLRENGTYNE IYKKWFGTEP K----- 259 LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP--- 260 LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG--- 261 """ 262 263 from cStringIO import StringIO 264 handle = StringIO(phylip_text) 265 count=0 266 for alignment in PhylipIterator(handle): 267 for record in alignment: 268 count=count+1 269 print record.id 270 #print record.seq.tostring() 271 assert count == 8 272 273 expected="""mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg 274 fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly 275 aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys 276 dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre 277 alnkafaemradgtyeklak kyfdfdvygg""".replace(" ","").replace("\n","").upper() 278 assert record.seq.tostring().replace("-","") == expected 279 280 #From here: 281 #http://atgc.lirmm.fr/phyml/usersguide.html 282 phylip_text2="""5 60 283 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAG 284 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGG 285 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGG 286 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGG 287 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGG 288 289 GAAATGGTCAATATTACAAGGT 290 GAAATGGTCAACATTAAAAGAT 291 GAAATCGTCAATATTAAAAGGT 292 GAAATGGTCAATCTTAAAAGGT 293 GAAATGGTCAATATTAAAAGGT""" 294 295 phylip_text3="""5 60 296 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAGGAAATGGTCAATATTACAAGGT 297 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAACATTAAAAGAT 298 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGGGAAATCGTCAATATTAAAAGGT 299 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGGGAAATGGTCAATCTTAAAAGGT 300 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAATATTAAAAGGT""" 301 302 handle = StringIO(phylip_text2) 303 list2 = list(PhylipIterator(handle)) 304 handle.close() 305 assert len(list2)==1 306 assert len(list2[0])==5 307 308 handle = StringIO(phylip_text3) 309 list3 = list(PhylipIterator(handle)) 310 handle.close() 311 assert len(list3)==1 312 assert len(list3[0])==5 313 314 for i in range(0,5): 315 list2[0][i].id == list3[0][i].id 316 list2[0][i].seq.tostring() == list3[0][i].seq.tostring() 317 318 #From here: 319 #http://evolution.genetics.washington.edu/phylip/doc/sequence.html 320 #Note the lack of any white space between names 2 and 3 and their seqs. 321 phylip_text4=""" 5 42 322 Turkey AAGCTNGGGC ATTTCAGGGT 323 Salmo gairAAGCCTTGGC AGTGCAGGGT 324 H. SapiensACCGGTTGGC CGTTCAGGGT 325 Chimp AAACCCTTGC CGTTACGCTT 326 Gorilla AAACCCTTGC CGGTACGCTT 327 328 GAGCCCGGGC AATACAGGGT AT 329 GAGCCGTGGC CGGGCACGGT AT 330 ACAGGTTGGC CGTTCAGGGT AA 331 AAACCGAGGC CGGGACACTC AT 332 AAACCATTGC CGGTACGCTT AA""" 333 334 #From here: 335 #http://evolution.genetics.washington.edu/phylip/doc/sequence.html 336 phylip_text5=""" 5 42 337 Turkey AAGCTNGGGC ATTTCAGGGT 338 GAGCCCGGGC AATACAGGGT AT 339 Salmo gairAAGCCTTGGC AGTGCAGGGT 340 GAGCCGTGGC CGGGCACGGT AT 341 H. SapiensACCGGTTGGC CGTTCAGGGT 342 ACAGGTTGGC CGTTCAGGGT AA 343 Chimp AAACCCTTGC CGTTACGCTT 344 AAACCGAGGC CGGGACACTC AT 345 Gorilla AAACCCTTGC CGGTACGCTT 346 AAACCATTGC CGGTACGCTT AA""" 347 348 phylip_text5a=""" 5 42 349 Turkey AAGCTNGGGC ATTTCAGGGT GAGCCCGGGC AATACAGGGT AT 350 Salmo gairAAGCCTTGGC AGTGCAGGGT GAGCCGTGGC CGGGCACGGT AT 351 H. SapiensACCGGTTGGC CGTTCAGGGT ACAGGTTGGC CGTTCAGGGT AA 352 Chimp AAACCCTTGC CGTTACGCTT AAACCGAGGC CGGGACACTC AT 353 Gorilla AAACCCTTGC CGGTACGCTT AAACCATTGC CGGTACGCTT AA""" 354 355 handle = StringIO(phylip_text4) 356 list4 = list(PhylipIterator(handle)) 357 handle.close() 358 assert len(list4)==1 359 assert len(list4[0])==5 360 361 handle = StringIO(phylip_text5) 362 try: 363 list5 = list(PhylipIterator(handle)) 364 assert len(list5)==1 365 assert len(list5[0])==5 366 print "That should have failed..." 367 except ValueError: 368 print "Evil multiline non-interlaced example failed as expected" 369 handle.close() 370 371 handle = StringIO(phylip_text5a) 372 list5 = list(PhylipIterator(handle)) 373 handle.close() 374 assert len(list5)==1 375 assert len(list4[0])==5 376 377 print "Concatenation" 378 handle = StringIO(phylip_text4 + "\n" + phylip_text4) 379 assert len(list(PhylipIterator(handle))) == 2 380 381 handle = StringIO(phylip_text3 + "\n" + phylip_text4 + "\n\n\n" + phylip_text) 382 assert len(list(PhylipIterator(handle))) == 3 383 384 print "OK" 385 386 print "Checking write/read" 387 handle = StringIO() 388 PhylipWriter(handle).write_file(list5) 389 handle.seek(0) 390 list6 = list(PhylipIterator(handle)) 391 assert len(list5) == len(list6) 392 for a1,a2 in zip(list5, list6): 393 assert len(a1) == len(a2) 394 for r1, r2 in zip(a1, a2): 395 assert r1.id == r2.id 396 assert r1.seq.tostring() == r2.seq.tostring() 397 print "Done" 398