1
2
3
4
5
6
7
8
9
10 """Miscellaneous functions for dealing with sequences."""
11
12 import re, time
13 from Bio import SeqIO
14 from Bio.Seq import Seq
15 from Bio import Alphabet
16 from Bio.Alphabet import IUPAC
17 from Bio.Data import IUPACData, CodonTable
18
19
20
21
22
23
24
26 """Reverse the sequence. Works on string sequences (DEPRECATED).
27
28 This function is now deprecated, instead use the string's built in slice
29 method with a step of minus one:
30
31 >>> "ACGGT"[::-1]
32 'TGGCA'
33 """
34 import warnings
35 import Bio
36 warnings.warn("Bio.SeqUtils.reverse() is deprecated, use the string's "
37 "slice method with a step of minus one instead.",
38 Bio.BiopythonDeprecationWarning)
39 r = list(seq)
40 r.reverse()
41 return ''.join(r)
42
44 """Calculates G+C content, returns the percentage (float between 0 and 100).
45
46 Copes mixed case seuqneces, and with the ambiguous nucleotide S (G or C)
47 when counting the G and C content. The percentage is calculated against
48 the full length, e.g.:
49
50 >>> from Bio.SeqUtils import GC
51 >>> GC("ACTGN")
52 40.0
53
54 Note that this will return zero for an empty sequence.
55 """
56 try:
57 gc = sum(map(seq.count,['G','C','g','c','S','s']))
58 return gc*100.0/len(seq)
59 except ZeroDivisionError:
60 return 0.0
61
62
64 """Calculates total G+C content plus first, second and third positions.
65
66 Returns a tuple of four floats (percentages between 0 and 100) for the
67 entire sequence, and the three codon positions. e.g.
68
69 >>> from Bio.SeqUtils import GC123
70 >>> GC123("ACTGTN")
71 (40.0, 50.0, 50.0, 0.0)
72
73 Copes with mixed case sequences, but does NOT deal with ambiguous
74 nucleotides.
75 """
76 d= {}
77 for nt in ['A','T','G','C']:
78 d[nt] = [0,0,0]
79
80 for i in range(0,len(seq),3):
81 codon = seq[i:i+3]
82 if len(codon) <3: codon += ' '
83 for pos in range(0,3):
84 for nt in ['A','T','G','C']:
85 if codon[pos] == nt or codon[pos] == nt.lower():
86 d[nt][pos] += 1
87 gc = {}
88 gcall = 0
89 nall = 0
90 for i in range(0,3):
91 try:
92 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i]
93 gc[i] = (d['G'][i] + d['C'][i])*100.0/n
94 except:
95 gc[i] = 0
96
97 gcall = gcall + d['G'][i] + d['C'][i]
98 nall = nall + n
99
100 gcall = 100.0*gcall/nall
101 return gcall, gc[0], gc[1], gc[2]
102
104 """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence.
105
106 Returns a list of ratios (floats), controlled by the length of the sequence
107 and the size of the window.
108
109 Does NOT look at any ambiguous nucleotides.
110 """
111
112 values = []
113 for i in range(0, len(seq), window):
114 s = seq[i: i + window]
115 g = s.count('G') + s.count('g')
116 c = s.count('C') + s.count('c')
117 skew = (g-c)/float(g+c)
118 values.append(skew)
119 return values
120
121 from math import pi, sin, cos, log
122 -def xGC_skew(seq, window = 1000, zoom = 100,
123 r = 300, px = 100, py = 100):
124 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!)."""
125 from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \
126 VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y
127 yscroll = Scrollbar(orient = VERTICAL)
128 xscroll = Scrollbar(orient = HORIZONTAL)
129 canvas = Canvas(yscrollcommand = yscroll.set,
130 xscrollcommand = xscroll.set, background = 'white')
131 win = canvas.winfo_toplevel()
132 win.geometry('700x700')
133
134 yscroll.config(command = canvas.yview)
135 xscroll.config(command = canvas.xview)
136 yscroll.pack(side = RIGHT, fill = Y)
137 xscroll.pack(side = BOTTOM, fill = X)
138 canvas.pack(fill=BOTH, side = LEFT, expand = 1)
139 canvas.update()
140
141 X0, Y0 = r + px, r + py
142 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r
143
144 ty = Y0
145 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq)))
146 ty +=20
147 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq)))
148 ty +=20
149 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue')
150 ty +=20
151 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta')
152 ty +=20
153 canvas.create_oval(x1,y1, x2, y2)
154
155 acc = 0
156 start = 0
157 for gc in GC_skew(seq, window):
158 r1 = r
159 acc+=gc
160
161 alpha = pi - (2*pi*start)/len(seq)
162 r2 = r1 - gc*zoom
163 x1 = X0 + r1 * sin(alpha)
164 y1 = Y0 + r1 * cos(alpha)
165 x2 = X0 + r2 * sin(alpha)
166 y2 = Y0 + r2 * cos(alpha)
167 canvas.create_line(x1,y1,x2,y2, fill = 'blue')
168
169 r1 = r - 50
170 r2 = r1 - acc
171 x1 = X0 + r1 * sin(alpha)
172 y1 = Y0 + r1 * cos(alpha)
173 x2 = X0 + r2 * sin(alpha)
174 y2 = Y0 + r2 * cos(alpha)
175 canvas.create_line(x1,y1,x2,y2, fill = 'magenta')
176
177 canvas.update()
178 start += window
179
180 canvas.configure(scrollregion = canvas.bbox(ALL))
181
187
189 """Search for a DNA subseq in sequence.
190
191 use ambiguous values (like N = A or T or C or G, R = A or G etc.)
192 searches only on forward strand
193 """
194 pattern = ''
195 for nt in subseq:
196 value = IUPACData.ambiguous_dna_values[nt]
197 if len(value) == 1:
198 pattern += value
199 else:
200 pattern += '[%s]' % value
201
202 pos = -1
203 result = [pattern]
204 l = len(seq)
205 while True:
206 pos+=1
207 s = seq[pos:]
208 m = re.search(pattern, s)
209 if not m: break
210 pos += int(m.start(0))
211 result.append(pos)
212 return result
213
214
215
216
217
218
219
220
221
222
223
224 -class ProteinX(Alphabet.ProteinAlphabet):
227
228
229 proteinX = ProteinX()
230
233 self._table = table
234 import warnings
235 import Bio
236 warnings.warn("Function Bio.SeqUtils.makeTableX() and related classes ProteinX "
237 "and MissingTable are deprecated.", Bio.BiopythonDeprecationWarning)
238 - def get(self, codon, stop_symbol):
243
250
251
252
254 """Turn a one letter code protein sequence into one with three letter codes.
255
256 The single input argument 'seq' should be a protein sequence using single
257 letter codes, either as a python string or as a Seq or MutableSeq object.
258
259 This function returns the amino acid sequence as a string using the three
260 letter amino acid codes. Output follows the IUPAC standard (including
261 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
262 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. Any unknown
263 character (including possible gap characters), is changed into 'Xaa'.
264
265 e.g.
266 >>> from Bio.SeqUtils import seq3
267 >>> seq3("MAIVMGRWKGAR*")
268 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'
269
270 This function was inspired by BioPerl's seq3.
271 """
272 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp',
273 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His',
274 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met',
275 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg',
276 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp',
277 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter',
278 'U':'Sel', 'O':'Pyl', 'J':'Xle',
279 }
280
281
282 return ''.join([threecode.get(aa,'Xaa') for aa in seq])
283
284
285
286
287
288
289
290
291
293 """Just an alias for six_frame_translations (DEPRECATED).
294
295 Please use six_frame_translations directly, as this function is deprecated."""
296 import warnings
297 import Bio
298 warnings.warn("GC_Frame is deprecated; please use six_frame_translations instead",
299 Bio.BiopythonDeprecationWarning)
300 return six_frame_translations(seq, genetic_code)
301
303 """Formatted string showing the 6 frame translations and GC content.
304
305 nice looking 6 frame translation with GC content - code from xbbtools
306 similar to DNA Striders six-frame translation
307
308 e.g.
309 from Bio.SeqUtils import six_frame_translations
310 print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")
311 """
312 from Bio.Seq import reverse_complement, translate
313 anti = reverse_complement(seq)
314 comp = anti[::-1]
315 length = len(seq)
316 frames = {}
317 for i in range(0,3):
318 frames[i+1] = translate(seq[i:], genetic_code)
319 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code))
320
321
322 if length > 20:
323 short = '%s ... %s' % (seq[:10], seq[-10:])
324 else:
325 short = seq
326
327 date = time.strftime('%y %b %d, %X', time.localtime(time.time()))
328 header = 'GC_Frame: %s, ' % date
329 for nt in ['a','t','g','c']:
330 header += '%s:%d ' % (nt, seq.count(nt.upper()))
331
332 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq))
333 res = header
334
335 for i in range(0,length,60):
336 subseq = seq[i:i+60]
337 csubseq = comp[i:i+60]
338 p = i/3
339 res = res + '%d/%d\n' % (i+1, i/3+1)
340 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n'
341 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n'
342 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n'
343
344 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq))
345 res = res + csubseq.lower() + '\n'
346
347 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n'
348 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n'
349 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n'
350 return res
351
352
353
354
355
356
357
358
360 """Checks and changes the name/ID's to be unique identifiers by adding numbers (DEPRECATED).
361
362 file - a FASTA format filename to read in.
363
364 No return value, the output is written to screen.
365 """
366 import warnings
367 import Bio
368 warnings.warn("fasta_uniqids is deprecated", Bio.BiopythonDeprecationWarning)
369
370 mydict = {}
371 txt = open(filename).read()
372 entries = []
373 for entry in txt.split('>')[1:]:
374 name, seq= entry.split('\n',1)
375 name = name.split()[0].split(',')[0]
376
377 if name in mydict:
378 n = 1
379 while 1:
380 n = n + 1
381 _name = name + str(n)
382 if _name not in mydict:
383 name = _name
384 break
385
386 mydict[name] = seq
387
388 for name, seq in mydict.iteritems():
389 print '>%s\n%s' % (name, seq)
390
392 """Simple FASTA reader, returning a list of string tuples.
393
394 The single argument 'file' should be the filename of a FASTA format file.
395 This function will open and read in the entire file, constructing a list
396 of all the records, each held as a tuple of strings (the sequence name or
397 title, and its sequence).
398
399 This function was originally intended for use on large files, where its
400 low overhead makes it very fast. However, because it returns the data as
401 a single in memory list, this can require a lot of RAM on large files.
402
403 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which
404 allows you to iterate over the records one by one (avoiding having all the
405 records in memory at once). Using Bio.SeqIO also makes it easy to switch
406 between different input file formats. However, please note that rather
407 than simple strings, Bio.SeqIO uses SeqRecord objects for each record.
408 """
409
410
411
412 handle = open(file)
413 txt = "\n" + handle.read()
414 handle.close()
415 entries = []
416 for entry in txt.split('\n>')[1:]:
417 name,seq= entry.split('\n',1)
418 seq = seq.replace('\n','').replace(' ','').upper()
419 entries.append((name, seq))
420 return entries
421
423 """Apply a function on each sequence in a multiple FASTA file (DEPRECATED).
424
425 file - filename of a FASTA format file
426 function - the function you wish to invoke on each record
427 *args - any extra arguments you want passed to the function
428
429 This function will iterate over each record in a FASTA file as SeqRecord
430 objects, calling your function with the record (and supplied args) as
431 arguments.
432
433 This function returns a list. For those records where your function
434 returns a value, this is taken as a sequence and used to construct a
435 FASTA format string. If your function never has a return value, this
436 means apply_on_multi_fasta will return an empty list.
437 """
438 import warnings
439 import Bio
440 warnings.warn("apply_on_multi_fasta is deprecated", Bio.BiopythonDeprecationWarning)
441 try:
442 f = globals()[function]
443 except:
444 raise NotImplementedError("%s not implemented" % function)
445
446 handle = open(file, 'r')
447 records = SeqIO.parse(handle, "fasta")
448 results = []
449 for record in records:
450 arguments = [record.sequence]
451 for arg in args: arguments.append(arg)
452 result = f(*arguments)
453 if result:
454 results.append('>%s\n%s' % (record.name, result))
455 handle.close()
456 return results
457
459 """Apply a function on each sequence in a multiple FASTA file (DEPRECATED).
460
461 file - filename of a FASTA format file
462 function - the function you wish to invoke on each record
463 *args - any extra arguments you want passed to the function
464
465 This function will use quick_FASTA_reader to load every record in the
466 FASTA file into memory as a list of tuples. For each record, it will
467 call your supplied function with the record as a tuple of the name and
468 sequence as strings (plus any supplied args).
469
470 This function returns a list. For those records where your function
471 returns a value, this is taken as a sequence and used to construct a
472 FASTA format string. If your function never has a return value, this
473 means quicker_apply_on_multi_fasta will return an empty list.
474 """
475 import warnings
476 import Bio
477 warnings.warn("quicker_apply_on_multi_fasta is deprecated", Bio.BiopythonDeprecationWarning)
478 try:
479 f = globals()[function]
480 except:
481 raise NotImplementedError("%s not implemented" % function)
482
483 entries = quick_FASTA_reader(file)
484 results = []
485 for name, seq in entries:
486 arguments = [seq]
487 for arg in args: arguments.append(arg)
488 result = f(*arguments)
489 if result:
490 results.append('>%s\n%s' % (name, result))
491 file.close()
492 return results
493
494
495
496
497
498
499
500
501 if __name__ == '__main__':
502 import sys, getopt
503
504 options = {'apply_on_multi_fasta':0,
505 'quick':0,
506 'uniq_ids':0,
507 }
508
509 optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=',
510 'help', 'quick', 'uniq_ids', 'search='])
511 for arg in optlist:
512 if arg[0] in ['-h', '--help']:
513 pass
514 elif arg[0] in ['--describe']:
515
516 mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)]
517 mol_funcs.sort()
518 print 'available functions:'
519 for f in mol_funcs: print '\t--%s' % f
520 print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas'
521
522 sys.exit(0)
523 elif arg[0] in ['--apply_on_multi_fasta']:
524 options['apply_on_multi_fasta'] = arg[1]
525 elif arg[0] in ['--search']:
526 options['search'] = arg[1]
527 else:
528 key = re.search('-*(.+)', arg[0]).group(1)
529 options[key] = 1
530
531
532 if options.get('apply_on_multi_fasta'):
533 file = args[0]
534 function = options['apply_on_multi_fasta']
535 arguments = []
536 if options.get('search'):
537 arguments = options['search']
538 if function == 'xGC_skew':
539 arguments = 1000
540 if options.get('quick'):
541 results = quicker_apply_on_multi_fasta(file, function, arguments)
542 else:
543 results = apply_on_multi_fasta(file, function, arguments)
544 for result in results: print result
545
546 elif options.get('uniq_ids'):
547 file = args[0]
548 fasta_uniqids(file)
549
550
551
553 """Run the Bio.SeqUtils module's doctests (PRIVATE)."""
554 print "Runing doctests..."
555 import doctest
556 doctest.testmod()
557 print "Done"
558
559 if __name__ == "__main__":
560 _test()
561