Package Bio :: Package SeqUtils
[hide private]
[frames] | no frames]

Source Code for Package Bio.SeqUtils

  1  #!/usr/bin/env python 
  2  # Created: Wed May 29 08:07:18 2002 
  3  # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se 
  4  # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark. 
  5  # All rights reserved. 
  6  # This code is part of the Biopython distribution and governed by its 
  7  # license.  Please see the LICENSE file that should have been included 
  8  # as part of this package. 
  9   
 10  """Miscellaneous functions for dealing with sequences.""" 
 11   
 12  import re, time 
 13  from Bio import SeqIO 
 14  from Bio.Seq import Seq 
 15  from Bio import Alphabet 
 16  from Bio.Alphabet import IUPAC 
 17  from Bio.Data import IUPACData, CodonTable 
 18   
 19   
 20  ###################################### 
 21  # DNA 
 22  ###################### 
 23  # {{{  
 24   
25 -def reverse(seq):
26 """Reverse the sequence. Works on string sequences (DEPRECATED). 27 28 This function is now deprecated, instead use the string's built in slice 29 method with a step of minus one: 30 31 >>> "ACGGT"[::-1] 32 'TGGCA' 33 """ 34 import warnings 35 import Bio 36 warnings.warn("Bio.SeqUtils.reverse() is deprecated, use the string's " 37 "slice method with a step of minus one instead.", 38 Bio.BiopythonDeprecationWarning) 39 r = list(seq) 40 r.reverse() 41 return ''.join(r)
42
43 -def GC(seq):
44 """Calculates G+C content, returns the percentage (float between 0 and 100). 45 46 Copes mixed case seuqneces, and with the ambiguous nucleotide S (G or C) 47 when counting the G and C content. The percentage is calculated against 48 the full length, e.g.: 49 50 >>> from Bio.SeqUtils import GC 51 >>> GC("ACTGN") 52 40.0 53 54 Note that this will return zero for an empty sequence. 55 """ 56 try: 57 gc = sum(map(seq.count,['G','C','g','c','S','s'])) 58 return gc*100.0/len(seq) 59 except ZeroDivisionError: 60 return 0.0
61 62
63 -def GC123(seq):
64 """Calculates total G+C content plus first, second and third positions. 65 66 Returns a tuple of four floats (percentages between 0 and 100) for the 67 entire sequence, and the three codon positions. e.g. 68 69 >>> from Bio.SeqUtils import GC123 70 >>> GC123("ACTGTN") 71 (40.0, 50.0, 50.0, 0.0) 72 73 Copes with mixed case sequences, but does NOT deal with ambiguous 74 nucleotides. 75 """ 76 d= {} 77 for nt in ['A','T','G','C']: 78 d[nt] = [0,0,0] 79 80 for i in range(0,len(seq),3): 81 codon = seq[i:i+3] 82 if len(codon) <3: codon += ' ' 83 for pos in range(0,3): 84 for nt in ['A','T','G','C']: 85 if codon[pos] == nt or codon[pos] == nt.lower(): 86 d[nt][pos] += 1 87 gc = {} 88 gcall = 0 89 nall = 0 90 for i in range(0,3): 91 try: 92 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i] 93 gc[i] = (d['G'][i] + d['C'][i])*100.0/n 94 except: 95 gc[i] = 0 96 97 gcall = gcall + d['G'][i] + d['C'][i] 98 nall = nall + n 99 100 gcall = 100.0*gcall/nall 101 return gcall, gc[0], gc[1], gc[2]
102
103 -def GC_skew(seq, window = 100):
104 """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence. 105 106 Returns a list of ratios (floats), controlled by the length of the sequence 107 and the size of the window. 108 109 Does NOT look at any ambiguous nucleotides. 110 """ 111 # 8/19/03: Iddo: added lowercase 112 values = [] 113 for i in range(0, len(seq), window): 114 s = seq[i: i + window] 115 g = s.count('G') + s.count('g') 116 c = s.count('C') + s.count('c') 117 skew = (g-c)/float(g+c) 118 values.append(skew) 119 return values
120 121 from math import pi, sin, cos, log
122 -def xGC_skew(seq, window = 1000, zoom = 100, 123 r = 300, px = 100, py = 100):
124 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!).""" 125 from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \ 126 VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y 127 yscroll = Scrollbar(orient = VERTICAL) 128 xscroll = Scrollbar(orient = HORIZONTAL) 129 canvas = Canvas(yscrollcommand = yscroll.set, 130 xscrollcommand = xscroll.set, background = 'white') 131 win = canvas.winfo_toplevel() 132 win.geometry('700x700') 133 134 yscroll.config(command = canvas.yview) 135 xscroll.config(command = canvas.xview) 136 yscroll.pack(side = RIGHT, fill = Y) 137 xscroll.pack(side = BOTTOM, fill = X) 138 canvas.pack(fill=BOTH, side = LEFT, expand = 1) 139 canvas.update() 140 141 X0, Y0 = r + px, r + py 142 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r 143 144 ty = Y0 145 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq))) 146 ty +=20 147 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq))) 148 ty +=20 149 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue') 150 ty +=20 151 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta') 152 ty +=20 153 canvas.create_oval(x1,y1, x2, y2) 154 155 acc = 0 156 start = 0 157 for gc in GC_skew(seq, window): 158 r1 = r 159 acc+=gc 160 # GC skew 161 alpha = pi - (2*pi*start)/len(seq) 162 r2 = r1 - gc*zoom 163 x1 = X0 + r1 * sin(alpha) 164 y1 = Y0 + r1 * cos(alpha) 165 x2 = X0 + r2 * sin(alpha) 166 y2 = Y0 + r2 * cos(alpha) 167 canvas.create_line(x1,y1,x2,y2, fill = 'blue') 168 # accumulated GC skew 169 r1 = r - 50 170 r2 = r1 - acc 171 x1 = X0 + r1 * sin(alpha) 172 y1 = Y0 + r1 * cos(alpha) 173 x2 = X0 + r2 * sin(alpha) 174 y2 = Y0 + r2 * cos(alpha) 175 canvas.create_line(x1,y1,x2,y2, fill = 'magenta') 176 177 canvas.update() 178 start += window 179 180 canvas.configure(scrollregion = canvas.bbox(ALL))
181
182 -def molecular_weight(seq):
183 """Calculate the molecular weight of a DNA sequence.""" 184 if type(seq) == type(''): seq = Seq(seq, IUPAC.unambiguous_dna) 185 weight_table = IUPACData.unambiguous_dna_weights 186 return sum(weight_table[x] for x in seq)
187
188 -def nt_search(seq, subseq):
189 """Search for a DNA subseq in sequence. 190 191 use ambiguous values (like N = A or T or C or G, R = A or G etc.) 192 searches only on forward strand 193 """ 194 pattern = '' 195 for nt in subseq: 196 value = IUPACData.ambiguous_dna_values[nt] 197 if len(value) == 1: 198 pattern += value 199 else: 200 pattern += '[%s]' % value 201 202 pos = -1 203 result = [pattern] 204 l = len(seq) 205 while True: 206 pos+=1 207 s = seq[pos:] 208 m = re.search(pattern, s) 209 if not m: break 210 pos += int(m.start(0)) 211 result.append(pos) 212 return result
213 214 # }}} 215 216 ###################################### 217 # Protein 218 ###################### 219 # {{{ 220 221 # temporary hack for exception free translation of "dirty" DNA 222 # should be moved to ??? 223
224 -class ProteinX(Alphabet.ProteinAlphabet):
225 """Variant of the extended IUPAC extended protein alphabet (DEPRECATED).""" 226 letters = IUPACData.extended_protein_letters + "X"
227 228 #Can't add a deprecation warning to the class due to the following line: 229 proteinX = ProteinX() 230
231 -class MissingTable:
232 - def __init__(self, table):
233 self._table = table 234 import warnings 235 import Bio 236 warnings.warn("Function Bio.SeqUtils.makeTableX() and related classes ProteinX " 237 "and MissingTable are deprecated.", Bio.BiopythonDeprecationWarning)
238 - def get(self, codon, stop_symbol):
239 try: 240 return self._table.get(codon, stop_symbol) 241 except CodonTable.TranslationError: 242 return 'X'
243
244 -def makeTableX(table):
245 assert table.protein_alphabet == IUPAC.extended_protein 246 return CodonTable.CodonTable(table.nucleotide_alphabet, proteinX, 247 MissingTable(table.forward_table), 248 table.back_table, table.start_codons, 249 table.stop_codons)
250 251 # end of hacks 252
253 -def seq3(seq):
254 """Turn a one letter code protein sequence into one with three letter codes. 255 256 The single input argument 'seq' should be a protein sequence using single 257 letter codes, either as a python string or as a Seq or MutableSeq object. 258 259 This function returns the amino acid sequence as a string using the three 260 letter amino acid codes. Output follows the IUPAC standard (including 261 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U 262 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. Any unknown 263 character (including possible gap characters), is changed into 'Xaa'. 264 265 e.g. 266 >>> from Bio.SeqUtils import seq3 267 >>> seq3("MAIVMGRWKGAR*") 268 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer' 269 270 This function was inspired by BioPerl's seq3. 271 """ 272 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp', 273 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His', 274 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met', 275 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg', 276 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp', 277 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter', 278 'U':'Sel', 'O':'Pyl', 'J':'Xle', 279 } 280 #We use a default of 'Xaa' for undefined letters 281 #Note this will map '-' to 'Xaa' which may be undesirable! 282 return ''.join([threecode.get(aa,'Xaa') for aa in seq])
283 284 285 # }}} 286 287 ###################################### 288 # Mixed ??? 289 ###################### 290 # {{{ 291
292 -def GC_Frame(seq, genetic_code = 1):
293 """Just an alias for six_frame_translations (DEPRECATED). 294 295 Please use six_frame_translations directly, as this function is deprecated.""" 296 import warnings 297 import Bio 298 warnings.warn("GC_Frame is deprecated; please use six_frame_translations instead", 299 Bio.BiopythonDeprecationWarning) 300 return six_frame_translations(seq, genetic_code)
301
302 -def six_frame_translations(seq, genetic_code = 1):
303 """Formatted string showing the 6 frame translations and GC content. 304 305 nice looking 6 frame translation with GC content - code from xbbtools 306 similar to DNA Striders six-frame translation 307 308 e.g. 309 from Bio.SeqUtils import six_frame_translations 310 print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA") 311 """ 312 from Bio.Seq import reverse_complement, translate 313 anti = reverse_complement(seq) 314 comp = anti[::-1] 315 length = len(seq) 316 frames = {} 317 for i in range(0,3): 318 frames[i+1] = translate(seq[i:], genetic_code) 319 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code)) 320 321 # create header 322 if length > 20: 323 short = '%s ... %s' % (seq[:10], seq[-10:]) 324 else: 325 short = seq 326 #TODO? Remove the date as this would spoil any unit test... 327 date = time.strftime('%y %b %d, %X', time.localtime(time.time())) 328 header = 'GC_Frame: %s, ' % date 329 for nt in ['a','t','g','c']: 330 header += '%s:%d ' % (nt, seq.count(nt.upper())) 331 332 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq)) 333 res = header 334 335 for i in range(0,length,60): 336 subseq = seq[i:i+60] 337 csubseq = comp[i:i+60] 338 p = i/3 339 res = res + '%d/%d\n' % (i+1, i/3+1) 340 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n' 341 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n' 342 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n' 343 # seq 344 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq)) 345 res = res + csubseq.lower() + '\n' 346 # - frames 347 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n' 348 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n' 349 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n' 350 return res
351 352 # }}} 353 354 ###################################### 355 # FASTA file utilities 356 ###################### 357 # {{{ 358
359 -def fasta_uniqids(filename):
360 """Checks and changes the name/ID's to be unique identifiers by adding numbers (DEPRECATED). 361 362 file - a FASTA format filename to read in. 363 364 No return value, the output is written to screen. 365 """ 366 import warnings 367 import Bio 368 warnings.warn("fasta_uniqids is deprecated", Bio.BiopythonDeprecationWarning) 369 370 mydict = {} 371 txt = open(filename).read() 372 entries = [] 373 for entry in txt.split('>')[1:]: 374 name, seq= entry.split('\n',1) 375 name = name.split()[0].split(',')[0] 376 377 if name in mydict: 378 n = 1 379 while 1: 380 n = n + 1 381 _name = name + str(n) 382 if _name not in mydict: 383 name = _name 384 break 385 386 mydict[name] = seq 387 388 for name, seq in mydict.iteritems(): 389 print '>%s\n%s' % (name, seq)
390
391 -def quick_FASTA_reader(file):
392 """Simple FASTA reader, returning a list of string tuples. 393 394 The single argument 'file' should be the filename of a FASTA format file. 395 This function will open and read in the entire file, constructing a list 396 of all the records, each held as a tuple of strings (the sequence name or 397 title, and its sequence). 398 399 This function was originally intended for use on large files, where its 400 low overhead makes it very fast. However, because it returns the data as 401 a single in memory list, this can require a lot of RAM on large files. 402 403 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which 404 allows you to iterate over the records one by one (avoiding having all the 405 records in memory at once). Using Bio.SeqIO also makes it easy to switch 406 between different input file formats. However, please note that rather 407 than simple strings, Bio.SeqIO uses SeqRecord objects for each record. 408 """ 409 #Want to split on "\n>" not just ">" in case there are any extra ">" 410 #in the name/description. So, in order to make sure we also split on 411 #the first entry, prepend a "\n" to the start of the file. 412 handle = open(file) 413 txt = "\n" + handle.read() 414 handle.close() 415 entries = [] 416 for entry in txt.split('\n>')[1:]: 417 name,seq= entry.split('\n',1) 418 seq = seq.replace('\n','').replace(' ','').upper() 419 entries.append((name, seq)) 420 return entries
421
422 -def apply_on_multi_fasta(file, function, *args):
423 """Apply a function on each sequence in a multiple FASTA file (DEPRECATED). 424 425 file - filename of a FASTA format file 426 function - the function you wish to invoke on each record 427 *args - any extra arguments you want passed to the function 428 429 This function will iterate over each record in a FASTA file as SeqRecord 430 objects, calling your function with the record (and supplied args) as 431 arguments. 432 433 This function returns a list. For those records where your function 434 returns a value, this is taken as a sequence and used to construct a 435 FASTA format string. If your function never has a return value, this 436 means apply_on_multi_fasta will return an empty list. 437 """ 438 import warnings 439 import Bio 440 warnings.warn("apply_on_multi_fasta is deprecated", Bio.BiopythonDeprecationWarning) 441 try: 442 f = globals()[function] 443 except: 444 raise NotImplementedError("%s not implemented" % function) 445 446 handle = open(file, 'r') 447 records = SeqIO.parse(handle, "fasta") 448 results = [] 449 for record in records: 450 arguments = [record.sequence] 451 for arg in args: arguments.append(arg) 452 result = f(*arguments) 453 if result: 454 results.append('>%s\n%s' % (record.name, result)) 455 handle.close() 456 return results
457
458 -def quicker_apply_on_multi_fasta(file, function, *args):
459 """Apply a function on each sequence in a multiple FASTA file (DEPRECATED). 460 461 file - filename of a FASTA format file 462 function - the function you wish to invoke on each record 463 *args - any extra arguments you want passed to the function 464 465 This function will use quick_FASTA_reader to load every record in the 466 FASTA file into memory as a list of tuples. For each record, it will 467 call your supplied function with the record as a tuple of the name and 468 sequence as strings (plus any supplied args). 469 470 This function returns a list. For those records where your function 471 returns a value, this is taken as a sequence and used to construct a 472 FASTA format string. If your function never has a return value, this 473 means quicker_apply_on_multi_fasta will return an empty list. 474 """ 475 import warnings 476 import Bio 477 warnings.warn("quicker_apply_on_multi_fasta is deprecated", Bio.BiopythonDeprecationWarning) 478 try: 479 f = globals()[function] 480 except: 481 raise NotImplementedError("%s not implemented" % function) 482 483 entries = quick_FASTA_reader(file) 484 results = [] 485 for name, seq in entries: 486 arguments = [seq] 487 for arg in args: arguments.append(arg) 488 result = f(*arguments) 489 if result: 490 results.append('>%s\n%s' % (name, result)) 491 file.close() 492 return results
493 494 # }}} 495 496 ###################################### 497 # Main 498 ##################### 499 # {{{ 500 501 if __name__ == '__main__': 502 import sys, getopt 503 # crude command line options to use most functions directly on a FASTA file 504 options = {'apply_on_multi_fasta':0, 505 'quick':0, 506 'uniq_ids':0, 507 } 508 509 optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=', 510 'help', 'quick', 'uniq_ids', 'search=']) 511 for arg in optlist: 512 if arg[0] in ['-h', '--help']: 513 pass 514 elif arg[0] in ['--describe']: 515 # get all new functions from this file 516 mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)] 517 mol_funcs.sort() 518 print 'available functions:' 519 for f in mol_funcs: print '\t--%s' % f 520 print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas' 521 522 sys.exit(0) 523 elif arg[0] in ['--apply_on_multi_fasta']: 524 options['apply_on_multi_fasta'] = arg[1] 525 elif arg[0] in ['--search']: 526 options['search'] = arg[1] 527 else: 528 key = re.search('-*(.+)', arg[0]).group(1) 529 options[key] = 1 530 531 532 if options.get('apply_on_multi_fasta'): 533 file = args[0] 534 function = options['apply_on_multi_fasta'] 535 arguments = [] 536 if options.get('search'): 537 arguments = options['search'] 538 if function == 'xGC_skew': 539 arguments = 1000 540 if options.get('quick'): 541 results = quicker_apply_on_multi_fasta(file, function, arguments) 542 else: 543 results = apply_on_multi_fasta(file, function, arguments) 544 for result in results: print result 545 546 elif options.get('uniq_ids'): 547 file = args[0] 548 fasta_uniqids(file) 549 550 # }}} 551
552 -def _test():
553 """Run the Bio.SeqUtils module's doctests (PRIVATE).""" 554 print "Runing doctests..." 555 import doctest 556 doctest.testmod() 557 print "Done"
558 559 if __name__ == "__main__": 560 _test() 561