Package Bio :: Package Nexus :: Module Nexus
[hide private]
[frames] | no frames]

Source Code for Module Bio.Nexus.Nexus

   1  # Copyright 2005-2008 by Frank Kauff & Cymon J. Cox. All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # 
   6  # Bug reports welcome: fkauff@biologie.uni-kl.de or on Biopython's bugzilla. 
   7  """Nexus class. Parse the contents of a NEXUS file. 
   8   
   9  Based upon 'NEXUS: An extensible file format for systematic information' 
  10  Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621 
  11  """ 
  12   
  13  import os,sys, math, random, copy 
  14   
  15  from Bio.Alphabet import IUPAC 
  16  from Bio.Data import IUPACData 
  17  from Bio.Seq import Seq 
  18   
  19  from Trees import Tree,NodeData 
  20   
  21  try: 
  22      import cnexus 
  23  except ImportError: 
  24      C=False 
  25  else: 
  26      C=True 
  27   
  28  INTERLEAVE=70 
  29  SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\ 
  30          'matrix','tree', 'utree','translate','codonposset','title'] 
  31  KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets','codons'] 
  32  PUNCTUATION='()[]{}/\,;:=*\'"`+-<>' 
  33  MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_' 
  34  WHITESPACE=' \t\n' 
  35  #SPECIALCOMMENTS=['!','&','%','/','\\','@'] #original list of special comments 
  36  SPECIALCOMMENTS=['&'] # supported special comment ('tree' command), all others are ignored 
  37  CHARSET='chars' 
  38  TAXSET='taxa' 
  39  CODONPOSITIONS='codonpositions' 
  40  DEFAULTNEXUS='#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; ' 
41 -class NexusError(Exception): pass
42
43 -class CharBuffer:
44 """Helps reading NEXUS-words and characters from a buffer."""
45 - def __init__(self,string):
46 if string: 47 self.buffer=list(string) 48 else: 49 self.buffer=[]
50
51 - def peek(self):
52 if self.buffer: 53 return self.buffer[0] 54 else: 55 return None
56
57 - def peek_nonwhitespace(self):
58 b=''.join(self.buffer).strip() 59 if b: 60 return b[0] 61 else: 62 return None
63
64 - def next(self):
65 if self.buffer: 66 return self.buffer.pop(0) 67 else: 68 return None
69
70 - def next_nonwhitespace(self):
71 while True: 72 p=self.next() 73 if p is None: 74 break 75 if p not in WHITESPACE: 76 return p 77 return None
78
79 - def skip_whitespace(self):
80 while self.buffer[0] in WHITESPACE: 81 self.buffer=self.buffer[1:]
82
83 - def next_until(self,target):
84 for t in target: 85 try: 86 pos=self.buffer.index(t) 87 except ValueError: 88 pass 89 else: 90 found=''.join(self.buffer[:pos]) 91 self.buffer=self.buffer[pos:] 92 return found 93 else: 94 return None
95
96 - def peek_word(self,word):
97 return ''.join(self.buffer[:len(word)])==word
98
99 - def next_word(self):
100 """Return the next NEXUS word from a string. 101 102 This deals with single and double quotes, whitespace and punctuation. 103 """ 104 105 word=[] 106 quoted=False 107 first=self.next_nonwhitespace() # get first character 108 if not first: # return empty if only whitespace left 109 return None 110 word.append(first) 111 if first=="'": # word starts with a quote 112 quoted="'" 113 elif first=='"': 114 quoted='"' 115 elif first in PUNCTUATION: # if it's punctuation, return immediately 116 return first 117 while True: 118 c=self.peek() 119 if c==quoted: # a quote? 120 word.append(self.next()) # store quote 121 if self.peek()==quoted: # double quote 122 skip=self.next() # skip second quote 123 elif quoted: # second single quote ends word 124 break 125 elif quoted: 126 word.append(self.next()) # if quoted, then add anything 127 elif not c or c in PUNCTUATION or c in WHITESPACE: # if not quoted and special character, stop 128 break 129 else: 130 word.append(self.next()) # standard character 131 return ''.join(word)
132
133 - def rest(self):
134 """Return the rest of the string without parsing.""" 135 return ''.join(self.buffer)
136
137 -class StepMatrix:
138 """Calculate a stepmatrix for weighted parsimony. 139 140 See Wheeler (1990), Cladistics 6:269-275. 141 """ 142
143 - def __init__(self,symbols,gap):
144 self.data={} 145 self.symbols=[s for s in symbols] 146 self.symbols.sort() 147 if gap: 148 self.symbols.append(gap) 149 for x in self.symbols: 150 for y in [s for s in self.symbols if s!=x]: 151 self.set(x,y,0)
152
153 - def set(self,x,y,value):
154 if x>y: 155 x,y=y,x 156 self.data[x+y]=value
157
158 - def add(self,x,y,value):
159 if x>y: 160 x,y=y,x 161 self.data[x+y]+=value
162
163 - def sum(self):
164 return reduce(lambda x,y:x+y,self.data.values())
165
166 - def transformation(self):
167 total=self.sum() 168 if total!=0: 169 for k in self.data: 170 self.data[k]=self.data[k]/float(total) 171 return self
172
173 - def weighting(self):
174 for k in self.data: 175 if self.data[k]!=0: 176 self.data[k]=-math.log(self.data[k]) 177 return self
178
179 - def smprint(self,name='your_name_here'):
180 matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols)) 181 matrix+=' %s\n' % ' '.join(self.symbols) 182 for x in self.symbols: 183 matrix+='[%s]'.ljust(8) % x 184 for y in self.symbols: 185 if x==y: 186 matrix+=' . ' 187 else: 188 if x>y: 189 x1,y1=y,x 190 else: 191 x1,y1=x,y 192 if self.data[x1+y1]==0: 193 matrix+='inf. ' 194 else: 195 matrix+='%2.2f'.ljust(10) % (self.data[x1+y1]) 196 matrix+='\n' 197 matrix+=';\n' 198 return matrix
199
200 -def safename(name,mrbayes=False):
201 """Return a taxon identifier according to NEXUS standard. 202 203 Wrap quotes around names with punctuation or whitespace, and double 204 single quotes. 205 206 mrbayes=True: write names without quotes, whitespace or punctuation 207 for the mrbayes software package. 208 """ 209 if mrbayes: 210 safe=name.replace(' ','_') 211 safe=''.join([c for c in safe if c in MRBAYESSAFE]) 212 else: 213 safe=name.replace("'","''") 214 if set(safe).intersection(set(WHITESPACE+PUNCTUATION)): 215 safe="'"+safe+"'" 216 return safe
217
218 -def quotestrip(word):
219 """Remove quotes and/or double quotes around identifiers.""" 220 if not word: 221 return None 222 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')): 223 word=word[1:-1] 224 return word
225
226 -def get_start_end(sequence, skiplist=['-','?']):
227 """Return position of first and last character which is not in skiplist. 228 229 Skiplist defaults to ['-','?']).""" 230 231 length=len(sequence) 232 if length==0: 233 return None,None 234 end=length-1 235 while end>=0 and (sequence[end] in skiplist): 236 end-=1 237 start=0 238 while start<length and (sequence[start] in skiplist): 239 start+=1 240 if start==length and end==-1: # empty sequence 241 return -1,-1 242 else: 243 return start,end
244
245 -def _sort_keys_by_values(p):
246 """Returns a sorted list of keys of p sorted by values of p.""" 247 startpos=[(p[pn],pn) for pn in p if p[pn]] 248 startpos.sort() 249 # parenthisis added because of py3k 250 return (zip(*startpos))[1]
251
252 -def _make_unique(l):
253 """Check that all values in list are unique and return a pruned and sorted list.""" 254 l=list(set(l)) 255 l.sort() 256 return l
257
258 -def _unique_label(previous_labels,label):
259 """Returns a unique name if label is already in previous_labels.""" 260 while label in previous_labels: 261 if label.split('.')[-1].startswith('copy'): 262 label='.'.join(label.split('.')[:-1])+'.copy'+str(eval('0'+label.split('.')[-1][4:])+1) 263 else: 264 label+='.copy' 265 return label
266
267 -def _seqmatrix2strmatrix(matrix):
268 """Converts a Seq-object matrix to a plain sequence-string matrix.""" 269 return dict([(t,matrix[t].tostring()) for t in matrix])
270
271 -def _compact4nexus(orig_list):
272 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class) 273 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.). 274 """ 275 276 if not orig_list: 277 return '' 278 orig_list=list(set(orig_list)) 279 orig_list.sort() 280 shortlist=[] 281 clist=orig_list[:] 282 clist.append(clist[-1]+.5) # dummy value makes it easier 283 while len(clist)>1: 284 step=1 285 for i,x in enumerate(clist): 286 if x==clist[0]+i*step: # are we still in the right step? 287 continue 288 elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]: 289 # second element, and possibly at least 3 elements to link, 290 # and the next one is in the right step 291 step=x-clist[0] 292 else: # pattern broke, add all values before current position to new list 293 sub=clist[:i] 294 if len(sub)==1: 295 shortlist.append(str(sub[0]+1)) 296 else: 297 if step==1: 298 shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1)) 299 else: 300 shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step)) 301 clist=clist[i:] 302 break 303 return ' '.join(shortlist)
304
305 -def combine(matrices):
306 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance. 307 308 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...] 309 Character sets, character partitions and taxon sets are prefixed, readjusted 310 and present in the combined matrix. 311 """ 312 313 if not matrices: 314 return None 315 name=matrices[0][0] 316 combined=copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix 317 mixed_datatypes=(len(set([n[1].datatype for n in matrices]))>1) 318 if mixed_datatypes: 319 combined.datatype='None' # dealing with mixed matrices is application specific. You take care of that yourself! 320 # raise NexusError('Matrices must be of same datatype') 321 combined.charlabels=None 322 combined.statelabels=None 323 combined.interleave=False 324 combined.translate=None 325 326 # rename taxon sets and character sets and name them with prefix 327 for cn,cs in combined.charsets.iteritems(): 328 combined.charsets['%s.%s' % (name,cn)]=cs 329 del combined.charsets[cn] 330 for tn,ts in combined.taxsets.iteritems(): 331 combined.taxsets['%s.%s' % (name,tn)]=ts 332 del combined.taxsets[tn] 333 # previous partitions usually don't make much sense in combined matrix 334 # just initiate one new partition parted by single matrices 335 combined.charpartitions={'combined':{name:range(combined.nchar)}} 336 for n,m in matrices[1:]: # add all other matrices 337 both=[t for t in combined.taxlabels if t in m.taxlabels] 338 combined_only=[t for t in combined.taxlabels if t not in both] 339 m_only=[t for t in m.taxlabels if t not in both] 340 for t in both: 341 # concatenate sequences and unify gap and missing character symbols 342 combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet) 343 # replace date of missing taxa with symbol for missing data 344 for t in combined_only: 345 combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet) 346 for t in m_only: 347 combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\ 348 Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet) 349 combined.taxlabels.extend(m_only) # new taxon list 350 for cn,cs in m.charsets.iteritems(): # adjust character sets for new matrix 351 combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs] 352 if m.taxsets: 353 if not combined.taxsets: 354 combined.taxsets={} 355 # update taxon sets 356 combined.taxsets.update(dict(('%s.%s' % (n,tn),ts) \ 357 for tn,ts in m.taxsets.iteritems())) 358 # update new charpartition 359 combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar) 360 # update charlabels 361 if m.charlabels: 362 if not combined.charlabels: 363 combined.charlabels={} 364 combined.charlabels.update(dict((combined.nchar+i,label) \ 365 for (i,label) in m.charlabels.iteritems())) 366 combined.nchar+=m.nchar # update nchar and ntax 367 combined.ntax+=len(m_only) 368 369 # some prefer partitions, some charsets: 370 # make separate charset for ecah initial dataset 371 for c in combined.charpartitions['combined']: 372 combined.charsets[c]=combined.charpartitions['combined'][c] 373 374 return combined
375
376 -def _kill_comments_and_break_lines(text):
377 """Delete []-delimited comments out of a file and break into lines separated by ';'. 378 379 stripped_text=_kill_comments_and_break_lines(text): 380 Nested and multiline comments are allowed. [ and ] symbols within single 381 or double quotes are ignored, newline ends a quote, all symbols with quotes are 382 treated the same (thus not quoting inside comments like [this character ']' ends a comment]) 383 Special [&...] and [\...] comments remain untouched, if not inside standard comment. 384 Quotes inside special [& and [\ are treated as normal characters, 385 but no nesting inside these special comments allowed (like [& [\ ]]). 386 ';' ist deleted from end of line. 387 388 NOTE: this function is very slow for large files, and obsolete when using C extension cnexus 389 """ 390 import warnings 391 warnings.warn("This function is very slow for large files, and obsolete when using C extension cnexus", PendingDeprecationWarning) 392 contents=CharBuffer(text) 393 newtext=[] 394 newline=[] 395 quotelevel='' 396 speciallevel=False 397 commlevel=0 398 while True: 399 #plain=contents.next_until(["'",'"','[',']','\n',';']) # search for next special character 400 #if not plain: 401 # newline.append(contents.rest) # not found, just add the rest 402 # break 403 #newline.append(plain) # add intermediate text 404 t=contents.next() # and get special character 405 if t is None: 406 break 407 if t==quotelevel and not (commlevel or speciallevel): # matching quote ends quotation 408 quotelevel='' 409 elif not quotelevel and not (commlevel or speciallevel) and (t=='"' or t=="'"): # single or double quote starts quotation 410 quotelevel=t 411 elif not quotelevel and t=='[': # opening bracket outside a quote 412 if contents.peek() in SPECIALCOMMENTS and commlevel==0 and not speciallevel: 413 speciallevel=True 414 else: 415 commlevel+=1 416 elif not quotelevel and t==']': # closing bracket ioutside a quote 417 if speciallevel: 418 speciallevel=False 419 else: 420 commlevel-=1 421 if commlevel<0: 422 raise NexusError('Nexus formatting error: unmatched ]') 423 continue 424 if commlevel==0: # copy if we're not in comment 425 if t==';' and not quotelevel: 426 newtext.append(''.join(newline)) 427 newline=[] 428 else: 429 newline.append(t) 430 #level of comments should be 0 at the end of the file 431 if newline: 432 newtext.append('\n'.join(newline)) 433 if commlevel>0: 434 raise NexusError('Nexus formatting error: unmatched [') 435 return newtext
436 437
438 -def _adjust_lines(lines):
439 """Adjust linebreaks to match ';', strip leading/trailing whitespace. 440 441 list_of_commandlines=_adjust_lines(input_text) 442 Lines are adjusted so that no linebreaks occur within a commandline 443 (except matrix command line) 444 """ 445 formatted_lines=[] 446 for l in lines: 447 #Convert line endings 448 l=l.replace('\r\n','\n').replace('\r','\n').strip() 449 if l.lower().startswith('matrix'): 450 formatted_lines.append(l) 451 else: 452 l=l.replace('\n',' ') 453 if l: 454 formatted_lines.append(l) 455 return formatted_lines
456
457 -def _replace_parenthesized_ambigs(seq,rev_ambig_values):
458 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code.""" 459 460 opening=seq.find('(') 461 while opening>-1: 462 closing=seq.find(')') 463 if closing<0: 464 raise NexusError('Missing closing parenthesis in: '+seq) 465 elif closing<opening: 466 raise NexusError('Missing opening parenthesis in: '+seq) 467 ambig=[x for x in seq[opening+1:closing]] 468 ambig.sort() 469 ambig=''.join(ambig) 470 ambig_code=rev_ambig_values[ambig.upper()] 471 if ambig!=ambig.upper(): 472 ambig_code=ambig_code.lower() 473 seq=seq[:opening]+ambig_code+seq[closing+1:] 474 opening=seq.find('(') 475 return seq
476
477 -class Commandline:
478 """Represent a commandline as command and options.""" 479
480 - def __init__(self, line, title):
481 self.options={} 482 options=[] 483 self.command=None 484 try: 485 #Assume matrix (all other command lines have been stripped of \n) 486 self.command, options = line.strip().split('\n', 1) 487 except ValueError: #Not matrix 488 #self.command,options=line.split(' ',1) #no: could be tab or spaces (translate...) 489 self.command=line.split()[0] 490 options=' '.join(line.split()[1:]) 491 self.command = self.command.strip().lower() 492 if self.command in SPECIAL_COMMANDS: # special command that need newlines and order of options preserved 493 self.options=options.strip() 494 else: 495 if len(options) > 0: 496 try: 497 options = options.replace('=', ' = ').split() 498 valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))] 499 indices = [] 500 for sl in valued_indices: 501 indices.extend(sl) 502 token_indices = [n for n in range(len(options)) if n not in indices] 503 for opt in valued_indices: 504 #self.options[options[opt[0]].lower()] = options[opt[2]].lower() 505 self.options[options[opt[0]].lower()] = options[opt[2]] 506 for token in token_indices: 507 self.options[options[token].lower()] = None 508 except ValueError: 509 raise NexusError('Incorrect formatting in line: %s' % line)
510
511 -class Block:
512 """Represent a NEXUS block with block name and list of commandlines."""
513 - def __init__(self,title=None):
514 self.title=title 515 self.commandlines=[]
516
517 -class Nexus(object):
518 519 __slots__=['original_taxon_order','__dict__'] 520
521 - def __init__(self, input=None):
522 self.ntax=0 # number of taxa 523 self.nchar=0 # number of characters 524 self.unaltered_taxlabels=[] # taxlabels as the appear in the input file (incl. duplicates, etc.) 525 self.taxlabels=[] # labels for taxa, ordered by their id 526 self.charlabels=None # ... and for characters 527 self.statelabels=None # ... and for states 528 self.datatype='dna' # (standard), dna, rna, nucleotide, protein 529 self.respectcase=False # case sensitivity 530 self.missing='?' # symbol for missing characters 531 self.gap='-' # symbol for gap 532 self.symbols=None # set of symbols 533 self.equate=None # set of symbol synonyms 534 self.matchchar=None # matching char for matrix representation 535 self.labels=None # left, right, no 536 self.transpose=False # whether matrix is transposed 537 self.interleave=False # whether matrix is interleaved 538 self.tokens=False # unsupported 539 self.eliminate=None # unsupported 540 self.matrix=None # ... 541 self.unknown_blocks=[] # blocks we don't care about 542 self.taxsets={} 543 self.charsets={} 544 self.charpartitions={} 545 self.taxpartitions={} 546 self.trees=[] # list of Trees (instances of Tree class) 547 self.translate=None # Dict to translate taxon <-> taxon numbers 548 self.structured=[] # structured input representation 549 self.set={} # dict of the set command to set various options 550 self.options={} # dict of the options command in the data block 551 self.codonposset=None # name of the charpartition that defines codon positions 552 553 # some defaults 554 self.options['gapmode']='missing' 555 556 if input: 557 self.read(input) 558 else: 559 self.read(DEFAULTNEXUS)
560
561 - def get_original_taxon_order(self):
562 """Included for backwards compatibility (DEPRECATED).""" 563 return self.taxlabels
564 - def set_original_taxon_order(self,value):
565 """Included for backwards compatibility (DEPRECATED).""" 566 self.taxlabels=value
567 original_taxon_order=property(get_original_taxon_order,set_original_taxon_order) 568
569 - def read(self,input):
570 """Read and parse NEXUS imput (a filename, file-handle, or string).""" 571 572 # 1. Assume we have the name of a file in the execution dir 573 # Note we need to add parsing of the path to dir/filename 574 try: 575 file_contents = open(os.path.expanduser(input),'rU').read() 576 self.filename=input 577 except (TypeError,IOError,AttributeError): 578 #2 Assume we have a string from a fh.read() 579 if isinstance(input, str): 580 file_contents = input 581 self.filename='input_string' 582 #3 Assume we have a file object 583 elif hasattr(input,'read'): # file objects or StringIO objects 584 file_contents=input.read() 585 if hasattr(input,"name") and input.name: 586 self.filename=input.name 587 else: 588 self.filename='Unknown_nexus_file' 589 else: 590 print input.strip()[:50] 591 raise NexusError('Unrecognized input: %s ...' % input[:100]) 592 file_contents=file_contents.strip() 593 if file_contents.startswith('#NEXUS'): 594 file_contents=file_contents[6:] 595 if C: 596 decommented=cnexus.scanfile(file_contents) 597 #check for unmatched parentheses 598 if decommented=='[' or decommented==']': 599 raise NexusError('Unmatched %s' % decommented) 600 # cnexus can't return lists, so in analogy we separate commandlines with chr(7) 601 # (a character that shoudn't be part of a nexus file under normal circumstances) 602 commandlines=_adjust_lines(decommented.split(chr(7))) 603 else: 604 commandlines=_adjust_lines(_kill_comments_and_break_lines(file_contents)) 605 # get rid of stupid 'NEXUS token - in merged treefiles, this might appear multiple times' 606 for i,cl in enumerate(commandlines): 607 try: 608 if cl[:6].upper()=='#NEXUS': 609 commandlines[i]=cl[6:].strip() 610 except: 611 pass 612 # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands 613 nexus_block_gen = self._get_nexus_block(commandlines) 614 while 1: 615 try: 616 title, contents = nexus_block_gen.next() 617 except StopIteration: 618 break 619 if title in KNOWN_NEXUS_BLOCKS: 620 self._parse_nexus_block(title, contents) 621 else: 622 self._unknown_nexus_block(title, contents)
623
624 - def _get_nexus_block(self,file_contents):
625 """Generator for looping through Nexus blocks.""" 626 inblock=False 627 blocklines=[] 628 while file_contents: 629 cl=file_contents.pop(0) 630 if cl.lower().startswith('begin'): 631 if not inblock: 632 inblock=True 633 title=cl.split()[1].lower() 634 else: 635 raise NexusError('Illegal block nesting in block %s' % title) 636 elif cl.lower().startswith('end'): 637 if inblock: 638 inblock=False 639 yield title,blocklines 640 blocklines=[] 641 else: 642 raise NexusError('Unmatched \'end\'.') 643 elif inblock: 644 blocklines.append(cl)
645
646 - def _unknown_nexus_block(self,title, contents):
647 block = Block() 648 block.commandlines.append(contents) 649 block.title = title 650 self.unknown_blocks.append(block)
651
652 - def _parse_nexus_block(self,title, contents):
653 """Parse a known Nexus Block (PRIVATE).""" 654 # attached the structered block representation 655 self._apply_block_structure(title, contents) 656 #now check for taxa,characters,data blocks. If this stuff is defined more than once 657 #the later occurences will override the previous ones. 658 block=self.structured[-1] 659 for line in block.commandlines: 660 try: 661 getattr(self,'_'+line.command)(line.options) 662 except AttributeError: 663 raise 664 raise NexusError('Unknown command: %s ' % line.command)
665
666 - def _title(self,options):
667 pass
668
669 - def _dimensions(self,options):
670 if 'ntax' in options: 671 self.ntax=eval(options['ntax']) 672 if 'nchar' in options: 673 self.nchar=eval(options['nchar'])
674
675 - def _format(self,options):
676 # print options 677 # we first need to test respectcase, then symbols (which depends on respectcase) 678 # then datatype (which, if standard, depends on symbols and respectcase in order to generate 679 # dicts for ambiguous values and alphabet 680 if 'respectcase' in options: 681 self.respectcase=True 682 # adjust symbols to for respectcase 683 if 'symbols' in options: 684 self.symbols=options['symbols'] 685 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\ 686 (self.symbold.startswith("'") and self.symbols.endswith("'")): 687 self.symbols=self.symbols[1:-1].replace(' ','') 688 if not self.respectcase: 689 self.symbols=self.symbols.lower()+self.symbols.upper() 690 self.symbols=list(set(self.symbols)) 691 if 'datatype' in options: 692 self.datatype=options['datatype'].lower() 693 if self.datatype=='dna' or self.datatype=='nucleotide': 694 self.alphabet=copy.deepcopy(IUPAC.ambiguous_dna) 695 self.ambiguous_values=copy.deepcopy(IUPACData.ambiguous_dna_values) 696 self.unambiguous_letters=copy.deepcopy(IUPACData.unambiguous_dna_letters) 697 elif self.datatype=='rna': 698 self.alphabet=copy.deepcopy(IUPAC.ambiguous_rna) 699 self.ambiguous_values=copy.deepcopy(IUPACData.ambiguous_rna_values) 700 self.unambiguous_letters=copy.deepcopy(IUPACData.unambiguous_rna_letters) 701 elif self.datatype=='protein': 702 self.alphabet=copy.deepcopy(IUPAC.protein) 703 self.ambiguous_values={'B':'DN','Z':'EQ','X':copy.deepcopy(IUPACData.protein_letters)} # that's how PAUP handles it 704 self.unambiguous_letters=copy.deepcopy(IUPACData.protein_letters)+'*' # stop-codon 705 elif self.datatype=='standard': 706 raise NexusError('Datatype standard is not yet supported.') 707 #self.alphabet=None 708 #self.ambiguous_values={} 709 #if not self.symbols: 710 # self.symbols='01' # if nothing else defined, then 0 and 1 are the default states 711 #self.unambiguous_letters=self.symbols 712 else: 713 raise NexusError('Unsupported datatype: '+self.datatype) 714 self.valid_characters=''.join(self.ambiguous_values)+self.unambiguous_letters 715 if not self.respectcase: 716 self.valid_characters=self.valid_characters.lower()+self.valid_characters.upper() 717 #we have to sort the reverse ambig coding dict key characters: 718 #to be sure that it's 'ACGT':'N' and not 'GTCA':'N' 719 rev=dict((i[1],i[0]) for i in self.ambiguous_values.iteritems() if i[0]!='X') 720 self.rev_ambiguous_values={} 721 for (k,v) in rev.iteritems(): 722 key=[c for c in k] 723 key.sort() 724 self.rev_ambiguous_values[''.join(key)]=v 725 #overwrite symbols for datype rna,dna,nucleotide 726 if self.datatype in ['dna','rna','nucleotide']: 727 self.symbols=self.alphabet.letters 728 if self.missing not in self.ambiguous_values: 729 self.ambiguous_values[self.missing]=self.unambiguous_letters+self.gap 730 self.ambiguous_values[self.gap]=self.gap 731 elif self.datatype=='standard': 732 if not self.symbols: 733 self.symbols=['1','0'] 734 if 'missing' in options: 735 self.missing=options['missing'][0] 736 if 'gap' in options: 737 self.gap=options['gap'][0] 738 if 'equate' in options: 739 self.equate=options['equate'] 740 if 'matchchar' in options: 741 self.matchchar=options['matchchar'][0] 742 if 'labels' in options: 743 self.labels=options['labels'] 744 if 'transpose' in options: 745 raise NexusError('TRANSPOSE is not supported!') 746 self.transpose=True 747 if 'interleave' in options: 748 if options['interleave']==None or options['interleave'].lower()=='yes': 749 self.interleave=True 750 if 'tokens' in options: 751 self.tokens=True 752 if 'notokens' in options: 753 self.tokens=False
754 755
756 - def _set(self,options):
757 self.set=options;
758
759 - def _options(self,options):
760 self.options=options;
761
762 - def _eliminate(self,options):
763 self.eliminate=options
764
765 - def _taxlabels(self,options):
766 """Get taxon labels (PRIVATE). 767 768 As the taxon names are already in the matrix, this is superfluous 769 except for transpose matrices, which are currently unsupported anyway. 770 Thus, we ignore the taxlabels command to make handling of duplicate 771 taxon names easier. 772 """ 773 pass
774 #self.taxlabels=[] 775 #opts=CharBuffer(options) 776 #while True: 777 # taxon=quotestrip(opts.next_word()) 778 # if not taxon: 779 # break 780 # self.taxlabels.append(taxon) 781
782 - def _check_taxlabels(self,taxon):
783 """Check for presence of taxon in self.taxlabels.""" 784 # According to NEXUS standard, underscores shall be treated as spaces..., 785 # so checking for identity is more difficult 786 nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels]) 787 nexid=taxon.replace(' ','_') 788 return nextaxa.get(nexid)
789
790 - def _charlabels(self,options):
791 self.charlabels={} 792 opts=CharBuffer(options) 793 while True: 794 try: 795 # get id and state 796 w=opts.next_word() 797 if w is None: # McClade saves and reads charlabel-lists with terminal comma?! 798 break 799 identifier=self._resolve(w,set_type=CHARSET) 800 state=quotestrip(opts.next_word()) 801 self.charlabels[identifier]=state 802 # check for comma or end of command 803 c=opts.next_nonwhitespace() 804 if c is None: 805 break 806 elif c!=',': 807 raise NexusError('Missing \',\' in line %s.' % options) 808 except NexusError: 809 raise 810 except: 811 raise NexusError('Format error in line %s.' % options)
812
813 - def _charstatelabels(self,options):
814 # warning: charstatelabels supports only charlabels-syntax! 815 self._charlabels(options)
816
817 - def _statelabels(self,options):
818 #self.charlabels=options 819 #print 'Command statelabels is not supported and will be ignored.' 820 pass
821
822 - def _matrix(self,options):
823 if not self.ntax or not self.nchar: 824 raise NexusError('Dimensions must be specified before matrix!') 825 self.matrix={} 826 taxcount=0 827 first_matrix_block=True 828 829 #eliminate empty lines and leading/trailing whitespace 830 lines=[l.strip() for l in options.split('\n') if l.strip()!=''] 831 lineiter=iter(lines) 832 while 1: 833 try: 834 l=lineiter.next() 835 except StopIteration: 836 if taxcount<self.ntax: 837 raise NexusError('Not enough taxa in matrix.') 838 elif taxcount>self.ntax: 839 raise NexusError('Too many taxa in matrix.') 840 else: 841 break 842 # count the taxa and check for interleaved matrix 843 taxcount+=1 844 ##print taxcount 845 if taxcount>self.ntax: 846 if not self.interleave: 847 raise NexusError('Too many taxa in matrix - should matrix be interleaved?') 848 else: 849 taxcount=1 850 first_matrix_block=False 851 #get taxon name and sequence 852 linechars=CharBuffer(l) 853 id=quotestrip(linechars.next_word()) 854 l=linechars.rest().strip() 855 chars='' 856 if self.interleave: 857 #interleaved matrix 858 #print 'In interleave' 859 if l: 860 chars=''.join(l.split()) 861 else: 862 chars=''.join(lineiter.next().split()) 863 else: 864 #non-interleaved matrix 865 chars=''.join(l.split()) 866 while len(chars)<self.nchar: 867 l=lineiter.next() 868 chars+=''.join(l.split()) 869 iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet) 870 #first taxon has the reference sequence if matchhar is used 871 if taxcount==1: 872 refseq=iupac_seq 873 else: 874 if self.matchchar: 875 while 1: 876 p=iupac_seq.tostring().find(self.matchchar) 877 if p==-1: 878 break 879 iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet) 880 #check for invalid characters 881 for i,c in enumerate(iupac_seq.tostring()): 882 if c not in self.valid_characters and c!=self.gap and c!=self.missing: 883 raise NexusError( \ 884 ('Taxon %s: Illegal character %s in sequence %s ' + \ 885 '(check dimensions/interleaving)') % (id,c, iupac_seq)) 886 #add sequence to matrix 887 if first_matrix_block: 888 self.unaltered_taxlabels.append(id) 889 id=_unique_label(self.matrix.keys(),id) 890 self.matrix[id]=iupac_seq 891 self.taxlabels.append(id) 892 else: 893 # taxon names need to be in the same order in each interleaved block 894 id=_unique_label(self.taxlabels[:taxcount-1],id) 895 taxon_present=self._check_taxlabels(id) 896 if taxon_present: 897 self.matrix[taxon_present]+=iupac_seq 898 else: 899 raise NexusError('Taxon %s not in first block of interleaved matrix. Check matrix dimensions and interleave.' % id) 900 #check all sequences for length according to nchar 901 for taxon in self.matrix: 902 if len(self.matrix[taxon])!=self.nchar: 903 raise NexusError('Matrx Nchar %d does not match data length (%d) for taxon %s' \ 904 % (self.nchar, len(self.matrix[taxon]),taxon)) 905 #check that taxlabels is identical with matrix.keys. If not, it's a problem 906 matrixkeys=sorted(self.matrix) 907 taxlabelssort=self.taxlabels[:] 908 taxlabelssort.sort() 909 assert matrixkeys==taxlabelssort,"ERROR: TAXLABELS must be identical with MATRIX. Please Report this as a bug, and send in data file."
910
911 - def _translate(self,options):
912 self.translate={} 913 opts=CharBuffer(options) 914 while True: 915 try: 916 # get id and state 917 identifier=int(opts.next_word()) 918 label=quotestrip(opts.next_word()) 919 self.translate[identifier]=label 920 # check for comma or end of command 921 c=opts.next_nonwhitespace() 922 if c is None: 923 break 924 elif c!=',': 925 raise NexusError('Missing \',\' in line %s.' % options) 926 except NexusError: 927 raise 928 except: 929 raise NexusError('Format error in line %s.' % options)
930
931 - def _utree(self,options):
932 """Some software (clustalx) uses 'utree' to denote an unrooted tree.""" 933 self._tree(options)
934
935 - def _tree(self,options):
936 opts=CharBuffer(options) 937 if opts.peek_nonwhitespace()=='*': # a star can be used to make it the default tree in some software packages 938 dummy=opts.next_nonwhitespace() 939 name=opts.next_word() 940 if opts.next_nonwhitespace()!='=': 941 raise NexusError('Syntax error in tree description: %s' \ 942 % options[:50]) 943 rooted=False 944 weight=1.0 945 while opts.peek_nonwhitespace()=='[': 946 open=opts.next_nonwhitespace() 947 symbol=opts.next() 948 if symbol!='&': 949 raise NexusError('Illegal special comment [%s...] in tree description: %s' \ 950 % (symbol, options[:50])) 951 special=opts.next() 952 value=opts.next_until(']') 953 closing=opts.next() 954 if special=='R': 955 rooted=True 956 elif special=='U': 957 rooted=False 958 elif special=='W': 959 weight=float(value) 960 tree=Tree(name=name,weight=weight,rooted=rooted,tree=opts.rest().strip()) 961 # if there's an active translation table, translate 962 if self.translate: 963 for n in tree.get_terminals(): 964 try: 965 tree.node(n).data.taxon=safename(self.translate[int(tree.node(n).data.taxon)]) 966 except (ValueError,KeyError): 967 raise NexusError('Unable to substitue %s using \'translate\' data.' \ 968 % tree.node(n).data.taxon) 969 self.trees.append(tree)
970
971 - def _apply_block_structure(self,title,lines):
972 block=Block('') 973 block.title = title 974 for line in lines: 975 block.commandlines.append(Commandline(line, title)) 976 self.structured.append(block)
977
978 - def _taxset(self, options):
979 name,taxa=self._get_indices(options,set_type=TAXSET) 980 self.taxsets[name]=_make_unique(taxa)
981
982 - def _charset(self, options):
983 name,sites=self._get_indices(options,set_type=CHARSET) 984 self.charsets[name]=_make_unique(sites)
985
986 - def _taxpartition(self, options):
987 taxpartition={} 988 quotelevel=False 989 opts=CharBuffer(options) 990 name=self._name_n_vector(opts) 991 if not name: 992 raise NexusError('Formatting error in taxpartition: %s ' % options) 993 # now collect thesubbpartitions and parse them 994 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 995 # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words 996 sub='' 997 while True: 998 w=opts.next() 999 if w is None or (w==',' and not quotelevel): 1000 subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':') 1001 taxpartition[subname]=_make_unique(subindices) 1002 sub='' 1003 if w is None: 1004 break 1005 else: 1006 if w=="'": 1007 quotelevel=not quotelevel 1008 sub+=w 1009 self.taxpartitions[name]=taxpartition
1010
1011 - def _codonposset(self,options):
1012 """Read codon positions from a codons block as written from McClade. 1013 1014 Here codonposset is just a fancy name for a character partition with 1015 the name CodonPositions and the partitions N,1,2,3 1016 """ 1017 1018 prev_partitions=self.charpartitions 1019 self._charpartition(options) 1020 # mcclade calls it CodonPositions, but you never know... 1021 codonname=[n for n in self.charpartitions if n not in prev_partitions] 1022 if codonname==[] or len(codonname)>1: 1023 raise NexusError('Formatting Error in codonposset: %s ' % options) 1024 else: 1025 self.codonposset=codonname[0]
1026
1027 - def _codeset(self,options):
1028 pass
1029
1030 - def _charpartition(self, options):
1031 charpartition={} 1032 quotelevel=False 1033 opts=CharBuffer(options) 1034 name=self._name_n_vector(opts) 1035 if not name: 1036 raise NexusError('Formatting error in charpartition: %s ' % options) 1037 # now collect thesubbpartitions and parse them 1038 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 1039 sub='' 1040 while True: 1041 w=opts.next() 1042 if w is None or (w==',' and not quotelevel): 1043 subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':') 1044 charpartition[subname]=_make_unique(subindices) 1045 sub='' 1046 if w is None: 1047 break 1048 else: 1049 if w=="'": 1050 quotelevel=not quotelevel 1051 sub+=w 1052 self.charpartitions[name]=charpartition
1053
1054 - def _get_indices(self,options,set_type=CHARSET,separator='='):
1055 """Parse the taxset/charset specification (PRIVATE). 1056 1057 e.g. '1 2 3 - 5 dog cat 10 - 20 \\ 3' 1058 --> [0,1,2,3,4,'dog','cat',9,12,15,18] 1059 """ 1060 opts=CharBuffer(options) 1061 name=self._name_n_vector(opts,separator=separator) 1062 indices=self._parse_list(opts,set_type=set_type) 1063 if indices is None: 1064 raise NexusError('Formatting error in line: %s ' % options) 1065 return name,indices
1066
1067 - def _name_n_vector(self,opts,separator='='):
1068 """Extract name and check that it's not in vector format.""" 1069 rest=opts.rest() 1070 name=opts.next_word() 1071 # we ignore * before names 1072 if name=='*': 1073 name=opts.next_word() 1074 if not name: 1075 raise NexusError('Formatting error in line: %s ' % rest) 1076 name=quotestrip(name) 1077 if opts.peek_nonwhitespace=='(': 1078 open=opts.next_nonwhitespace() 1079 qualifier=open.next_word() 1080 close=opts.next_nonwhitespace() 1081 if qualifier.lower()=='vector': 1082 raise NexusError('Unsupported VECTOR format in line %s' \ 1083 % (opts)) 1084 elif qualifier.lower()!='standard': 1085 raise NexusError('Unknown qualifier %s in line %s' \ 1086 % (qualifier, opts)) 1087 if opts.next_nonwhitespace()!=separator: 1088 raise NexusError('Formatting error in line: %s ' % rest) 1089 return name
1090
1091 - def _parse_list(self,options_buffer,set_type):
1092 """Parse a NEXUS list (PRIVATE). 1093 1094 e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21], 1095 (assuming dog is taxon no. 17 and cat is taxon no. 21). 1096 """ 1097 plain_list=[] 1098 if options_buffer.peek_nonwhitespace(): 1099 try: # capture all possible exceptions and treat them as formatting erros, if they are not NexusError 1100 while True: 1101 identifier=options_buffer.next_word() # next list element 1102 if not identifier: # end of list? 1103 break 1104 start=self._resolve(identifier,set_type=set_type) 1105 if options_buffer.peek_nonwhitespace()=='-': # followd by - 1106 end=start 1107 step=1 1108 # get hyphen and end of range 1109 hyphen=options_buffer.next_nonwhitespace() 1110 end=self._resolve(options_buffer.next_word(),set_type=set_type) 1111 if set_type==CHARSET: 1112 if options_buffer.peek_nonwhitespace()=='\\': # followd by \ 1113 backslash=options_buffer.next_nonwhitespace() 1114 step=int(options_buffer.next_word()) # get backslash and step 1115 plain_list.extend(range(start,end+1,step)) 1116 else: 1117 if type(start)==list or type(end)==list: 1118 raise NexusError('Name if character sets not allowed in range definition: %s'\ 1119 % identifier) 1120 start=self.taxlabels.index(start) 1121 end=self.taxlabels.index(end) 1122 taxrange=self.taxlabels[start:end+1] 1123 plain_list.extend(taxrange) 1124 else: 1125 if type(start)==list: # start was the name of charset or taxset 1126 plain_list.extend(start) 1127 else: # start was an ordinary identifier 1128 plain_list.append(start) 1129 except NexusError: 1130 raise 1131 except: 1132 return None 1133 return plain_list
1134
1135 - def _resolve(self,identifier,set_type=None):
1136 """Translate identifier in list into character/taxon index. 1137 1138 Characters (which are referred to by their index in Nexus.py): 1139 Plain numbers are returned minus 1 (Nexus indices to python indices) 1140 Text identifiers are translaterd into their indices (if plain character indentifiers), 1141 the first hit in charlabels is returned (charlabels don't need to be unique) 1142 or the range of indices is returned (if names of character sets). 1143 Taxa (which are referred to by their unique name in Nexus.py): 1144 Plain numbers are translated in their taxon name, underscores and spaces are considered equal. 1145 Names are returned unchanged (if plain taxon identifiers), or the names in 1146 the corresponding taxon set is returned 1147 """ 1148 identifier=quotestrip(identifier) 1149 if not set_type: 1150 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.') 1151 if set_type==CHARSET: 1152 try: 1153 n=int(identifier) 1154 except ValueError: 1155 if self.charlabels and identifier in self.charlabels.itervalues(): 1156 for k in self.charlabels: 1157 if self.charlabels[k]==identifier: 1158 return k 1159 elif self.charsets and identifier in self.charsets: 1160 return self.charsets[identifier] 1161 else: 1162 raise NexusError('Unknown character identifier: %s' \ 1163 % identifier) 1164 else: 1165 if n<=self.nchar: 1166 return n-1 1167 else: 1168 raise NexusError('Illegal character identifier: %d>nchar (=%d).' \ 1169 % (identifier,self.nchar)) 1170 elif set_type==TAXSET: 1171 try: 1172 n=int(identifier) 1173 except ValueError: 1174 taxlabels_id=self._check_taxlabels(identifier) 1175 if taxlabels_id: 1176 return taxlabels_id 1177 elif self.taxsets and identifier in self.taxsets: 1178 return self.taxsets[identifier] 1179 else: 1180 raise NexusError('Unknown taxon identifier: %s' \ 1181 % identifier) 1182 else: 1183 if n>0 and n<=self.ntax: 1184 return self.taxlabels[n-1] 1185 else: 1186 raise NexusError('Illegal taxon identifier: %d>ntax (=%d).' \ 1187 % (identifier,self.ntax)) 1188 else: 1189 raise NexusError('Unknown set specification: %s.'% set_type)
1190
1191 - def _stateset(self, options):
1192 #Not implemented 1193 pass
1194
1195 - def _changeset(self, options):
1196 #Not implemented 1197 pass
1198
1199 - def _treeset(self, options):
1200 #Not implemented 1201 pass
1202
1203 - def _treepartition(self, options):
1204 #Not implemented 1205 pass
1206
1207 - def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, interleave=False, 1208 exclude=[], delete=[], charpartition=None, comment='',mrbayes=False):
1209 """Writes a nexus file for each partition in charpartition. 1210 1211 Only non-excluded characters and non-deleted taxa are included, 1212 just the data block is written. 1213 """ 1214 1215 if not matrix: 1216 matrix=self.matrix 1217 if not matrix: 1218 return 1219 if not filename: 1220 filename=self.filename 1221 if charpartition: 1222 pfilenames={} 1223 for p in charpartition: 1224 total_exclude=[]+exclude 1225 total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]]) 1226 total_exclude=_make_unique(total_exclude) 1227 pcomment=comment+'\nPartition: '+p+'\n' 1228 dot=filename.rfind('.') 1229 if dot>0: 1230 pfilename=filename[:dot]+'_'+p+'.data' 1231 else: 1232 pfilename=filename+'_'+p 1233 pfilenames[p]=pfilename 1234 self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize, 1235 interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False, 1236 mrbayes=mrbayes) 1237 return pfilenames 1238 else: 1239 fn=self.filename+'.data' 1240 self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave, 1241 exclude=exclude,delete=delete,comment=comment,append_sets=False, 1242 mrbayes=mrbayes) 1243 return fn
1244
1245 - def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\ 1246 blocksize=None, interleave=False, interleave_by_partition=False,\ 1247 comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False,\ 1248 codons_block=True):
1249 """Writes a nexus file with data and sets block to a file or handle. 1250 1251 Character sets and partitions are appended by default, and are 1252 adjusted according to excluded characters (i.e. character sets 1253 still point to the same sites (not necessarily same positions), 1254 without including the deleted characters. 1255 1256 filename - Either a filename as a string (which will be opened, 1257 written to and closed), or a handle object (which will 1258 be written to but NOT closed). 1259 omit_NEXUS - Boolean. If true, the '#NEXUS' line normally at the 1260 start of the file is ommited. 1261 1262 Returns the filename/handle used to write the data. 1263 """ 1264 if not matrix: 1265 matrix=self.matrix 1266 if not matrix: 1267 return 1268 if not filename: 1269 filename=self.filename 1270 if [t for t in delete if not self._check_taxlabels(t)]: 1271 raise NexusError('Unknown taxa: %s' \ 1272 % ', '.join(set(delete).difference(set(self.taxlabels)))) 1273 if interleave_by_partition: 1274 if not interleave_by_partition in self.charpartitions: 1275 raise NexusError('Unknown partition: '+interleave_by_partition) 1276 else: 1277 partition=self.charpartitions[interleave_by_partition] 1278 # we need to sort the partition names by starting position before we exclude characters 1279 names=_sort_keys_by_values(partition) 1280 newpartition={} 1281 for p in partition: 1282 newpartition[p]=[c for c in partition[p] if c not in exclude] 1283 # how many taxa and how many characters are left? 1284 undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete] 1285 cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete)) 1286 ntax_adjusted=len(undelete) 1287 nchar_adjusted=len(cropped_matrix[undelete[0]]) 1288 if not undelete or (undelete and undelete[0]==''): 1289 return 1290 if isinstance(filename,basestring): 1291 try: 1292 fh=open(filename,'w',encoding="utf-8") 1293 except IOError: 1294 raise NexusError('Could not open %s for writing.' % filename) 1295 elif hasattr(filename, 'write'): 1296 #e.g. StringIO or a real file handle 1297 fh=filename 1298 else: 1299 raise ValueError("Neither a filename nor a handle was supplied") 1300 if not omit_NEXUS: 1301 fh.write('#NEXUS\n') 1302 if comment: 1303 fh.write('['+comment+']\n') 1304 fh.write('begin data;\n') 1305 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted)) 1306 fh.write('\tformat datatype='+self.datatype) 1307 if self.respectcase: 1308 fh.write(' respectcase') 1309 if self.missing: 1310 fh.write(' missing='+self.missing) 1311 if self.gap: 1312 fh.write(' gap='+self.gap) 1313 if self.matchchar: 1314 fh.write(' matchchar='+self.matchchar) 1315 if self.labels: 1316 fh.write(' labels='+self.labels) 1317 if self.equate: 1318 fh.write(' equate='+self.equate) 1319 if interleave or interleave_by_partition: 1320 fh.write(' interleave') 1321 fh.write(';\n') 1322 #if self.taxlabels: 1323 # fh.write('taxlabels '+' '.join(self.taxlabels)+';\n') 1324 if self.charlabels: 1325 newcharlabels=self._adjust_charlabels(exclude=exclude) 1326 clkeys=sorted(newcharlabels) 1327 fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n') 1328 fh.write('matrix\n') 1329 if not blocksize: 1330 if interleave: 1331 blocksize=70 1332 else: 1333 blocksize=self.nchar 1334 # delete deleted taxa and ecxclude excluded characters... 1335 namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete]) 1336 if interleave_by_partition: 1337 # interleave by partitions, but adjust partitions with regard to excluded characters 1338 seek=0 1339 for p in names: 1340 fh.write('[%s: %s]\n' % (interleave_by_partition,p)) 1341 if len(newpartition[p])>0: 1342 for taxon in undelete: 1343 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1344 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n') 1345 fh.write('\n') 1346 else: 1347 fh.write('[empty]\n\n') 1348 seek+=len(newpartition[p]) 1349 elif interleave: 1350 for seek in range(0,nchar_adjusted,blocksize): 1351 for taxon in undelete: 1352 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1353 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') 1354 fh.write('\n') 1355 else: 1356 for taxon in undelete: 1357 if blocksize<nchar_adjusted: 1358 fh.write(safename(taxon,mrbayes=mrbayes)+'\n') 1359 else: 1360 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1361 for seek in range(0,nchar_adjusted,blocksize): 1362 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') 1363 fh.write(';\nend;\n') 1364 if append_sets: 1365 if codons_block: 1366 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,include_codons=False)) 1367 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,codons_only=True)) 1368 else: 1369 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes)) 1370 1371 if fh == filename: 1372 #We were given the handle, don't close it. 1373 return filename 1374 else: 1375 #We opened the handle, so we should close it. 1376 fh.close() 1377 return filename
1378
1379 - def append_sets(self,exclude=[],delete=[],mrbayes=False,include_codons=True,codons_only=False):
1380 """Returns a sets block.""" 1381 if not self.charsets and not self.taxsets and not self.charpartitions: 1382 return '' 1383 if codons_only: 1384 setsb=['\nbegin codons'] 1385 else: 1386 setsb=['\nbegin sets'] 1387 # - now if characters have been excluded, the character sets need to be adjusted, 1388 # so that they still point to the right character positions 1389 # calculate a list of offsets: for each deleted character, the following character position 1390 # in the new file will have an additional offset of -1 1391 offset=0 1392 offlist=[] 1393 for c in range(self.nchar): 1394 if c in exclude: 1395 offset+=1 1396 offlist.append(-1) # dummy value as these character positions are excluded 1397 else: 1398 offlist.append(c-offset) 1399 # now adjust each of the character sets 1400 if not codons_only: 1401 for n,ns in self.charsets.iteritems(): 1402 cset=[offlist[c] for c in ns if c not in exclude] 1403 if cset: 1404 setsb.append('charset %s = %s' % (safename(n),_compact4nexus(cset))) 1405 for n,s in self.taxsets.iteritems(): 1406 tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete] 1407 if tset: 1408 setsb.append('taxset %s = %s' % (safename(n),' '.join(tset))) 1409 for n,p in self.charpartitions.iteritems(): 1410 if not include_codons and n==CODONPOSITIONS: 1411 continue 1412 elif codons_only and n!=CODONPOSITIONS: 1413 continue 1414 # as characters have been excluded, the partitions must be adjusted 1415 # if a partition is empty, it will be omitted from the charpartition command 1416 # (although paup allows charpartition part=t1:,t2:,t3:1-100) 1417 names=_sort_keys_by_values(p) 1418 newpartition={} 1419 for sn in names: 1420 nsp=[offlist[c] for c in p[sn] if c not in exclude] 1421 if nsp: 1422 newpartition[sn]=nsp 1423 if newpartition: 1424 if include_codons and n==CODONPOSITIONS: 1425 command='codonposset' 1426 else: 1427 command='charpartition' 1428 setsb.append('%s %s = %s' % (command,safename(n),\ 1429 ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition]))) 1430 # now write charpartititions, much easier than charpartitions 1431 for n,p in self.taxpartitions.iteritems(): 1432 names=_sort_keys_by_values(p) 1433 newpartition={} 1434 for sn in names: 1435 nsp=[t for t in p[sn] if t not in delete] 1436 if nsp: 1437 newpartition[sn]=nsp 1438 if newpartition: 1439 setsb.append('taxpartition %s = %s' % (safename(n),\ 1440 ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition]))) 1441 # add 'end' and return everything 1442 setsb.append('end;\n') 1443 if len(setsb)==2: # begin and end only 1444 return '' 1445 else: 1446 return ';\n'.join(setsb)
1447
1448 - def export_fasta(self, filename=None, width=70):
1449 """Writes matrix into a fasta file: (self, filename=None, width=70).""" 1450 if not filename: 1451 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']: 1452 filename='.'.join(self.filename.split('.')[:-1])+'.fas' 1453 else: 1454 filename=self.filename+'.fas' 1455 fh=open(filename,'w') 1456 for taxon in self.taxlabels: 1457 fh.write('>'+safename(taxon)+'\n') 1458 for i in range(0, len(self.matrix[taxon].tostring()), width): 1459 fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n') 1460 fh.close() 1461 return filename
1462
1463 - def export_phylip(self, filename=None):
1464 """Writes matrix into a PHYLIP file: (self, filename=None). 1465 1466 Note that this writes a relaxed PHYLIP format file, where the names 1467 are not truncated, nor checked for invalid characters.""" 1468 if not filename: 1469 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']: 1470 filename='.'.join(self.filename.split('.')[:-1])+'.phy' 1471 else: 1472 filename=self.filename+'.phy' 1473 fh=open(filename,'w') 1474 fh.write('%d %d\n' % (self.ntax,self.nchar)) 1475 for taxon in self.taxlabels: 1476 fh.write('%s %s\n' % (safename(taxon),self.matrix[taxon].tostring())) 1477 fh.close() 1478 return filename
1479
1480 - def constant(self,matrix=None,delete=[],exclude=[]):
1481 """Return a list with all constant characters.""" 1482 if not matrix: 1483 matrix=self.matrix 1484 undelete=[t for t in self.taxlabels if t in matrix and t not in delete] 1485 if not undelete: 1486 return None 1487 elif len(undelete)==1: 1488 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude] 1489 # get the first sequence and expand all ambiguous values 1490 constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for 1491 x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude] 1492 for taxon in undelete[1:]: 1493 newconstant=[] 1494 for site in constant: 1495 #print '%d (paup=%d)' % (site[0],site[0]+1), 1496 seqsite=matrix[taxon][site[0]].upper() 1497 #print seqsite,'checked against',site[1],'\t', 1498 if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]: 1499 # missing or same as before -> ok 1500 newconstant.append(site) 1501 elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap): 1502 # subset of an ambig or only missing in previous -> take subset 1503 newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite))) 1504 elif seqsite in self.ambiguous_values: # is it an ambig: check the intersection with prev. values 1505 intersect = set(self.ambiguous_values[seqsite]).intersection(set(site[1])) 1506 if intersect: 1507 newconstant.append((site[0],''.join(intersect))) 1508 # print 'ok' 1509 #else: 1510 # print 'failed' 1511 #else: 1512 # print 'failed' 1513 constant=newconstant 1514 cpos=[s[0] for s in constant] 1515 return cpos
1516
1517 - def cstatus(self,site,delete=[],narrow=True):
1518 """Summarize character. 1519 1520 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?) 1521 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -) 1522 """ 1523 undelete=[t for t in self.taxlabels if t not in delete] 1524 if not undelete: 1525 return None 1526 cstatus=[] 1527 for t in undelete: 1528 c=self.matrix[t][site].upper() 1529 if self.options.get('gapmode')=='missing' and c==self.gap: 1530 c=self.missing 1531 if narrow and c==self.missing: 1532 if c not in cstatus: 1533 cstatus.append(c) 1534 else: 1535 cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus]) 1536 if self.missing in cstatus and narrow and len(cstatus)>1: 1537 cstatus=[c for c in cstatus if c!=self.missing] 1538 cstatus.sort() 1539 return cstatus
1540
1541 - def weighted_stepmatrix(self,name='your_name_here',exclude=[],delete=[]):
1542 """Calculates a stepmatrix for weighted parsimony. 1543 1544 See Wheeler (1990), Cladistics 6:269-275 and 1545 Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196 1546 """ 1547 m=StepMatrix(self.unambiguous_letters,self.gap) 1548 for site in [s for s in range(self.nchar) if s not in exclude]: 1549 cstatus=self.cstatus(site,delete) 1550 for i,b1 in enumerate(cstatus[:-1]): 1551 for b2 in cstatus[i+1:]: 1552 m.add(b1.upper(),b2.upper(),1) 1553 return m.transformation().weighting().smprint(name=name)
1554
1555 - def crop_matrix(self,matrix=None, delete=[], exclude=[]):
1556 """Return a matrix without deleted taxa and excluded characters.""" 1557 if not matrix: 1558 matrix=self.matrix 1559 if [t for t in delete if not self._check_taxlabels(t)]: 1560 raise NexusError('Unknown taxa: %s' \ 1561 % ', '.join(set(delete).difference(self.taxlabels))) 1562 if exclude!=[]: 1563 undelete=[t for t in self.taxlabels if t in matrix and t not in delete] 1564 if not undelete: 1565 return {} 1566 m=[matrix[k].tostring() for k in undelete] 1567 zipped_m=zip(*m) 1568 sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude] 1569 if sitesm==[]: 1570 return dict([(t,Seq('',self.alphabet)) for t in undelete]) 1571 else: 1572 zipped_sitesm=zip(*sitesm) 1573 m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)] 1574 return dict(zip(undelete,m)) 1575 else: 1576 return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete])
1577
1578 - def bootstrap(self,matrix=None,delete=[],exclude=[]):
1579 """Return a bootstrapped matrix.""" 1580 if not matrix: 1581 matrix=self.matrix 1582 seqobjects=isinstance(matrix[matrix.keys()[0]],Seq) # remember if Seq objects 1583 cm=self.crop_matrix(delete=delete,exclude=exclude) # crop data out 1584 if not cm: # everything deleted? 1585 return {} 1586 elif len(cm[cm.keys()[0]])==0: # everything excluded? 1587 return cm 1588 undelete=[t for t in self.taxlabels if t in cm] 1589 if seqobjects: 1590 sitesm=zip(*[cm[t].tostring() for t in undelete]) 1591 alphabet=matrix[matrix.keys()[0]].alphabet 1592 else: 1593 sitesm=zip(*[cm[t] for t in undelete]) 1594 bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))] 1595 bootstrapseqs=map(''.join,zip(*bootstrapsitesm)) 1596 if seqobjects: 1597 bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs] 1598 return dict(zip(undelete,bootstrapseqs))
1599
1600 - def add_sequence(self,name,sequence):
1601 """Adds a sequence (string) to the matrix.""" 1602 1603 if not name: 1604 raise NexusError('New sequence must have a name') 1605 1606 diff=self.nchar-len(sequence) 1607 if diff<0: 1608 self.insert_gap(self.nchar,-diff) 1609 elif diff>0: 1610 sequence+=self.missing*diff 1611 1612 if name in self.taxlabels: 1613 unique_name=_unique_label(self.taxlabels,name) 1614 #print "WARNING: Sequence name %s is already present. Sequence was added as %s." % (name,unique_name) 1615 else: 1616 unique_name=name 1617 1618 assert unique_name not in self.matrix, "ERROR. There is a discrepancy between taxlabels and matrix keys. Report this as a bug." 1619 1620 self.matrix[unique_name]=Seq(sequence,self.alphabet) 1621 self.ntax+=1 1622 self.taxlabels.append(unique_name) 1623 self.unaltered_taxlabels.append(name)
1624
1625 - def insert_gap(self,pos,n=1,leftgreedy=False):
1626 """Add a gap into the matrix and adjust charsets and partitions. 1627 1628 pos=0: first position 1629 pos=nchar: last position 1630 """ 1631 1632 def _adjust(set,x,d,leftgreedy=False): 1633 """Adjusts chartacter sets if gaps are inserted, taking care of 1634 new gaps within a coherent character set.""" 1635 # if 3 gaps are inserted at pos. 9 in a set that looks like 1 2 3 8 9 10 11 13 14 15 1636 # then the adjusted set will be 1 2 3 8 9 10 11 12 13 14 15 16 17 18 1637 # but inserting into position 8 it will stay like 1 2 3 11 12 13 14 15 16 17 18 1638 set.sort() 1639 addpos=0 1640 for i,c in enumerate(set): 1641 if c>=x: 1642 set[i]=c+d 1643 # if we add gaps within a group of characters, we want the gap position included in this group 1644 if c==x: 1645 if leftgreedy or (i>0 and set[i-1]==c-1): 1646 addpos=i 1647 if addpos>0: 1648 set[addpos:addpos]=range(x,x+d) 1649 return set
1650 1651 if pos<0 or pos>self.nchar: 1652 raise NexusError('Illegal gap position: %d' % pos) 1653 if n==0: 1654 return 1655 if self.taxlabels: 1656 #python 2.3 does not support zip(*[]) 1657 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels]) 1658 else: 1659 sitesm=[] 1660 sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n 1661 # #self.matrix=dict([(taxon,Seq(map(''.join,zip(*sitesm))[i],self.alphabet)) for\ 1662 # i,taxon in enumerate(self.taxlabels)]) 1663 zipped=zip(*sitesm) 1664 mapped=map(''.join,zipped) 1665 listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)] 1666 self.matrix=dict(listed) 1667 self.nchar+=n 1668 # now adjust character sets 1669 for i,s in self.charsets.iteritems(): 1670 self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy) 1671 for p in self.charpartitions: 1672 for sp,s in self.charpartitions[p].iteritems(): 1673 self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy) 1674 # now adjust character state labels 1675 self.charlabels=self._adjust_charlabels(insert=[pos]*n) 1676 return self.charlabels 1677
1678 - def _adjust_charlabels(self,exclude=None,insert=None):
1679 """Return adjusted indices of self.charlabels if characters are excluded or inserted.""" 1680 if exclude and insert: 1681 raise NexusError('Can\'t exclude and insert at the same time') 1682 if not self.charlabels: 1683 return None 1684 labels=sorted(self.charlabels) 1685 newcharlabels={} 1686 if exclude: 1687 exclude.sort() 1688 exclude.append(sys.maxint) 1689 excount=0 1690 for c in labels: 1691 if not c in exclude: 1692 while c>exclude[excount]: 1693 excount+=1 1694 newcharlabels[c-excount]=self.charlabels[c] 1695 elif insert: 1696 insert.sort() 1697 insert.append(sys.maxint) 1698 icount=0 1699 for c in labels: 1700 while c>=insert[icount]: 1701 icount+=1 1702 newcharlabels[c+icount]=self.charlabels[c] 1703 else: 1704 return self.charlabels 1705 return newcharlabels
1706
1707 - def invert(self,charlist):
1708 """Returns all character indices that are not in charlist.""" 1709 return [c for c in range(self.nchar) if c not in charlist]
1710
1711 - def gaponly(self,include_missing=False):
1712 """Return gap-only sites.""" 1713 gap=set(self.gap) 1714 if include_missing: 1715 gap.add(self.missing) 1716 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels]) 1717 gaponly=[i for i,site in enumerate(sitesm) if set(site).issubset(gap)] 1718 return gaponly
1719
1720 - def terminal_gap_to_missing(self,missing=None,skip_n=True):
1721 """Replaces all terminal gaps with missing character. 1722 1723 Mixtures like ???------??------- are properly resolved.""" 1724 1725 if not missing: 1726 missing=self.missing 1727 replace=[self.missing,self.gap] 1728 if not skip_n: 1729 replace.extend(['n','N']) 1730 for taxon in self.taxlabels: 1731 sequence=self.matrix[taxon].tostring() 1732 length=len(sequence) 1733 start,end=get_start_end(sequence,skiplist=replace) 1734 if start==-1 and end==-1: 1735 sequence=missing*length 1736 else: 1737 sequence=sequence[:end+1]+missing*(length-end-1) 1738 sequence=start*missing+sequence[start:] 1739 assert length==len(sequence), 'Illegal sequence manipulation in Nexus.termial_gap_to_missing in taxon %s' % taxon 1740 self.matrix[taxon]=Seq(sequence,self.alphabet)
1741