1
2
3
4
5
6
7 """Bio.SeqIO support for the binary Standard Flowgram Format (SFF) file format.
8
9 SFF was designed by 454 Life Sciences (Roche), the Whitehead Institute for
10 Biomedical Research and the Wellcome Trust Sanger Institute. You are expected
11 to use this module via the Bio.SeqIO functions under the format name "sff" (or
12 "sff-trim" as described below).
13
14 For example, to iterate over the records in an SFF file,
15
16 >>> from Bio import SeqIO
17 >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff"):
18 ... print record.id, len(record), record.seq[:20]+"..."
19 E3MFGYR02JWQ7T 265 tcagGGTCTACATGTTGGTT...
20 E3MFGYR02JA6IL 271 tcagTTTTTTTTGGAAAGGA...
21 E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC...
22 E3MFGYR02GFKUC 299 tcagCGGCCGGGCCTCTCAT...
23 E3MFGYR02FTGED 281 tcagTGGTAATGGGGGGAAA...
24 E3MFGYR02FR9G7 261 tcagCTCCGTAAGAAGGTGC...
25 E3MFGYR02GAZMS 278 tcagAAAGAAGTAAGGTAAA...
26 E3MFGYR02HHZ8O 221 tcagACTTTCTTCTTTACCG...
27 E3MFGYR02GPGB1 269 tcagAAGCAGTGGTATCAAC...
28 E3MFGYR02F7Z7G 219 tcagAATCATCCACTTTTTA...
29
30 Each SeqRecord object will contain all the annotation from the SFF file,
31 including the PHRED quality scores.
32
33 >>> print record.id, len(record)
34 E3MFGYR02F7Z7G 219
35 >>> print record.seq[:10], "..."
36 tcagAATCAT ...
37 >>> print record.letter_annotations["phred_quality"][:10], "..."
38 [22, 21, 23, 28, 26, 15, 12, 21, 28, 21] ...
39
40 Notice that the sequence is given in mixed case, the central upper case region
41 corresponds to the trimmed sequence. This matches the output of the Roche
42 tools (and the 3rd party tool sff_extract) for SFF to FASTA.
43
44 >>> print record.annotations["clip_qual_left"]
45 4
46 >>> print record.annotations["clip_qual_right"]
47 134
48 >>> print record.seq[:4]
49 tcag
50 >>> print record.seq[4:20], "...", record.seq[120:134]
51 AATCATCCACTTTTTA ... CAAAACACAAACAG
52 >>> print record.seq[134:]
53 atcttatcaacaaaactcaaagttcctaactgagacacgcaacaggggataagacaaggcacacaggggataggnnnnnnnnnnn
54
55 The annotations dictionary also contains any adapter clip positions
56 (usually zero), and information about the flows. e.g.
57
58 >>> print record.annotations["flow_key"]
59 TCAG
60 >>> print record.annotations["flow_values"][:10], "..."
61 (83, 1, 128, 7, 4, 84, 6, 106, 3, 172) ...
62 >>> print len(record.annotations["flow_values"])
63 400
64 >>> print record.annotations["flow_index"][:10], "..."
65 (1, 2, 3, 2, 2, 0, 3, 2, 3, 3) ...
66 >>> print len(record.annotations["flow_index"])
67 219
68
69 As a convenience method, you can read the file with SeqIO format name "sff-trim"
70 instead of "sff" to get just the trimmed sequences (without any annotation
71 except for the PHRED quality scores):
72
73 >>> from Bio import SeqIO
74 >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim"):
75 ... print record.id, len(record), record.seq[:20]+"..."
76 E3MFGYR02JWQ7T 260 GGTCTACATGTTGGTTAACC...
77 E3MFGYR02JA6IL 265 TTTTTTTTGGAAAGGAAAAC...
78 E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG...
79 E3MFGYR02GFKUC 295 CGGCCGGGCCTCTCATCGGT...
80 E3MFGYR02FTGED 277 TGGTAATGGGGGGAAATTTA...
81 E3MFGYR02FR9G7 256 CTCCGTAAGAAGGTGCTGCC...
82 E3MFGYR02GAZMS 271 AAAGAAGTAAGGTAAATAAC...
83 E3MFGYR02HHZ8O 150 ACTTTCTTCTTTACCGTAAC...
84 E3MFGYR02GPGB1 221 AAGCAGTGGTATCAACGCAG...
85 E3MFGYR02F7Z7G 130 AATCATCCACTTTTTAACGT...
86
87 Looking at the final record in more detail, note how this differs to the
88 example above:
89
90 >>> print record.id, len(record)
91 E3MFGYR02F7Z7G 130
92 >>> print record.seq[:10], "..."
93 AATCATCCAC ...
94 >>> print record.letter_annotations["phred_quality"][:10], "..."
95 [26, 15, 12, 21, 28, 21, 36, 28, 27, 27] ...
96 >>> print record.annotations
97 {}
98
99 You might use the Bio.SeqIO.convert() function to convert the (trimmed) SFF
100 reads into a FASTQ file (or a FASTA file and a QUAL file), e.g.
101
102 >>> from Bio import SeqIO
103 >>> from StringIO import StringIO
104 >>> out_handle = StringIO()
105 >>> count = SeqIO.convert("Roche/E3MFGYR02_random_10_reads.sff", "sff",
106 ... out_handle, "fastq")
107 >>> print "Converted %i records" % count
108 Converted 10 records
109
110 The output FASTQ file would start like this:
111
112 >>> print "%s..." % out_handle.getvalue()[:50]
113 @E3MFGYR02JWQ7T
114 tcagGGTCTACATGTTGGTTAACCCGTACTGATT...
115
116 Bio.SeqIO.index() provides memory efficient random access to the reads in an
117 SFF file by name. SFF files can include an index within the file, which can
118 be read in making this very fast. If the index is missing (or in a format not
119 yet supported in Biopython) the file is indexed by scanning all the reads -
120 which is a little slower. For example,
121
122 >>> from Bio import SeqIO
123 >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff")
124 >>> record = reads["E3MFGYR02JHD4H"]
125 >>> print record.id, len(record), record.seq[:20]+"..."
126 E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC...
127
128 Or, using the trimmed reads:
129
130 >>> from Bio import SeqIO
131 >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim")
132 >>> record = reads["E3MFGYR02JHD4H"]
133 >>> print record.id, len(record), record.seq[:20]+"..."
134 E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG...
135
136 You can also use the Bio.SeqIO.write() function with the "sff" format. Note
137 that this requires all the flow information etc, and thus is probably only
138 useful for SeqRecord objects originally from reading another SFF file (and
139 not the trimmed SeqRecord objects from parsing an SFF file as "sff-trim").
140
141 As an example, let's pretend this example SFF file represents some DNA which
142 was pre-amplified with a PCR primers AAAGANNNNN. The following script would
143 produce a sub-file containing all those reads whose post-quality clipping
144 region (i.e. the sequence after trimming) starts with AAAGA exactly (the non-
145 degenerate bit of this pretend primer):
146
147 >>> from Bio import SeqIO
148 >>> records = (record for record in
149 ... SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff","sff")
150 ... if record.seq[record.annotations["clip_qual_left"]:].startswith("AAAGA"))
151 >>> count = SeqIO.write(records, "temp_filtered.sff", "sff")
152 >>> print "Selected %i records" % count
153 Selected 2 records
154
155 Of course, for an assembly you would probably want to remove these primers.
156 If you want FASTA or FASTQ output, you could just slice the SeqRecord. However,
157 if you want SFF output we have to preserve all the flow information - the trick
158 is just to adjust the left clip position!
159
160 >>> from Bio import SeqIO
161 >>> def filter_and_trim(records, primer):
162 ... for record in records:
163 ... if record.seq[record.annotations["clip_qual_left"]:].startswith(primer):
164 ... record.annotations["clip_qual_left"] += len(primer)
165 ... yield record
166 >>> records = SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff")
167 >>> count = SeqIO.write(filter_and_trim(records,"AAAGA"),
168 ... "temp_filtered.sff", "sff")
169 >>> print "Selected %i records" % count
170 Selected 2 records
171
172 We can check the results, note the lower case clipped region now includes the "AAAGA"
173 sequence:
174
175 >>> for record in SeqIO.parse("temp_filtered.sff", "sff"):
176 ... print record.id, len(record), record.seq[:20]+"..."
177 E3MFGYR02JHD4H 310 tcagaaagaCAAGTGGTATC...
178 E3MFGYR02GAZMS 278 tcagaaagaAGTAAGGTAAA...
179 >>> for record in SeqIO.parse("temp_filtered.sff", "sff-trim"):
180 ... print record.id, len(record), record.seq[:20]+"..."
181 E3MFGYR02JHD4H 287 CAAGTGGTATCAACGCAGAG...
182 E3MFGYR02GAZMS 266 AGTAAGGTAAATAACAAACG...
183 >>> import os
184 >>> os.remove("temp_filtered.sff")
185
186 For a description of the file format, please see the Roche manuals and:
187 http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=formats
188
189 """
190 from Bio.SeqIO.Interfaces import SequenceWriter
191 from Bio import Alphabet
192 from Bio.Seq import Seq
193 from Bio.SeqRecord import SeqRecord
194 import struct
195 import sys
196
197 from Bio._py3k import _bytes_to_string, _as_bytes
198 _null = _as_bytes("\0")
199 _sff = _as_bytes(".sff")
200 _hsh = _as_bytes(".hsh")
201 _srt = _as_bytes(".srt")
202 _mft = _as_bytes(".mft")
203
204 try:
205
206 _flag = eval(r'b"\xff"')
207 except SyntaxError:
208
209 _flag = "\xff"
210
212 """Read in an SFF file header (PRIVATE).
213
214 Assumes the handle is at the start of the file, will read forwards
215 though the header and leave the handle pointing at the first record.
216 Returns a tuple of values from the header (header_length, index_offset,
217 index_length, number_of_reads, flows_per_read, flow_chars, key_sequence)
218
219 >>> handle = open("Roche/greek.sff", "rb")
220 >>> values = _sff_file_header(handle)
221 >>> print values[0]
222 840
223 >>> print values[1]
224 65040
225 >>> print values[2]
226 256
227 >>> print values[3]
228 24
229 >>> print values[4]
230 800
231 >>> values[-1]
232 'TCAG'
233
234 """
235 if hasattr(handle,"mode") and "U" in handle.mode.upper():
236 raise ValueError("SFF files must NOT be opened in universal new "
237 "lines mode. Binary mode is recommended (although "
238 "on Unix the default mode is also fine).")
239 elif hasattr(handle,"mode") and "B" not in handle.mode.upper() \
240 and sys.platform == "win32":
241 raise ValueError("SFF files must be opened in binary mode on Windows")
242
243
244
245
246
247
248
249
250
251
252
253
254 fmt = '>4s4BQIIHHHB'
255 assert 31 == struct.calcsize(fmt)
256 data = handle.read(31)
257 if not data:
258 raise ValueError("Empty file.")
259 elif len(data) < 13:
260 raise ValueError("File too small to hold a valid SFF header.")
261 magic_number, ver0, ver1, ver2, ver3, index_offset, index_length, \
262 number_of_reads, header_length, key_length, number_of_flows_per_read, \
263 flowgram_format = struct.unpack(fmt, data)
264 if magic_number in [_hsh, _srt, _mft]:
265
266 raise ValueError("Handle seems to be at SFF index block, not start")
267 if magic_number != _sff:
268 raise ValueError("SFF file did not start '.sff', but %s" \
269 % repr(magic_number))
270 if (ver0, ver1, ver2, ver3) != (0, 0, 0, 1):
271 raise ValueError("Unsupported SFF version in header, %i.%i.%i.%i" \
272 % (ver0, ver1, ver2, ver3))
273 if flowgram_format != 1:
274 raise ValueError("Flowgram format code %i not supported" \
275 % flowgram_format)
276 if (index_offset!=0) ^ (index_length!=0):
277 raise ValueError("Index offset %i but index length %i" \
278 % (index_offset, index_length))
279 flow_chars = _bytes_to_string(handle.read(number_of_flows_per_read))
280 key_sequence = _bytes_to_string(handle.read(key_length))
281
282
283
284
285 assert header_length % 8 == 0
286 padding = header_length - number_of_flows_per_read - key_length - 31
287 assert 0 <= padding < 8, padding
288 if handle.read(padding).count(_null) != padding:
289 raise ValueError("Post header %i byte padding region contained data" \
290 % padding)
291 return header_length, index_offset, index_length, \
292 number_of_reads, number_of_flows_per_read, \
293 flow_chars, key_sequence
294
295
297 """Generates an index by scanning though all the reads in an SFF file (PRIVATE).
298
299 This is a slow but generic approach if we can't parse the provided index
300 (if present).
301
302 Will use the handle seek/tell functions.
303 """
304 handle.seek(0)
305 header_length, index_offset, index_length, number_of_reads, \
306 number_of_flows_per_read, flow_chars, key_sequence \
307 = _sff_file_header(handle)
308
309 read_header_fmt = '>2HI4H'
310 read_header_size = struct.calcsize(read_header_fmt)
311
312 read_flow_fmt = ">%iH" % number_of_flows_per_read
313 read_flow_size = struct.calcsize(read_flow_fmt)
314 assert 1 == struct.calcsize(">B")
315 assert 1 == struct.calcsize(">s")
316 assert 1 == struct.calcsize(">c")
317 assert read_header_size % 8 == 0
318 for read in range(number_of_reads):
319 record_offset = handle.tell()
320 if record_offset == index_offset:
321
322 offset = index_offset + index_length
323 if offset % 8:
324 offset += 8 - (offset % 8)
325 assert offset % 8 == 0
326 handle.seek(offset)
327 record_offset = offset
328
329
330 data = handle.read(read_header_size)
331 read_header_length, name_length, seq_len, clip_qual_left, \
332 clip_qual_right, clip_adapter_left, clip_adapter_right \
333 = struct.unpack(read_header_fmt, data)
334 if read_header_length < 10 or read_header_length % 8 != 0:
335 raise ValueError("Malformed read header, says length is %i:\n%s" \
336 % (read_header_length, repr(data)))
337
338 name = _bytes_to_string(handle.read(name_length))
339 padding = read_header_length - read_header_size - name_length
340 if handle.read(padding).count(_null) != padding:
341 raise ValueError("Post name %i byte padding region contained data" \
342 % padding)
343 assert record_offset + read_header_length == handle.tell()
344
345 size = read_flow_size + 3*seq_len
346 handle.seek(size, 1)
347
348 padding = size % 8
349 if padding:
350 padding = 8 - padding
351 if handle.read(padding).count(_null) != padding:
352 raise ValueError("Post quality %i byte padding region contained data" \
353 % padding)
354
355 yield name, record_offset
356 if handle.tell() % 8 != 0:
357 raise ValueError("After scanning reads, did not end on a multiple of 8")
358
360 """Locate any existing Roche style XML meta data and read index (PRIVATE).
361
362 Makes a number of hard coded assumptions based on reverse engineered SFF
363 files from Roche 454 machines.
364
365 Returns a tuple of read count, SFF "index" offset and size, XML offset
366 and size, and the actual read index offset and size.
367
368 Raises a ValueError for unsupported or non-Roche index blocks.
369 """
370 handle.seek(0)
371 header_length, index_offset, index_length, number_of_reads, \
372 number_of_flows_per_read, flow_chars, key_sequence \
373 = _sff_file_header(handle)
374 assert handle.tell() == header_length
375 if not index_offset or not index_offset:
376 raise ValueError("No index present in this SFF file")
377
378 handle.seek(index_offset)
379 fmt = ">4s4B"
380 fmt_size = struct.calcsize(fmt)
381 data = handle.read(fmt_size)
382 if not data:
383 raise ValueError("Premature end of file? Expected index of size %i at offest %i, found nothing" \
384 % (index_length, index_offset))
385 if len(data) < fmt_size:
386 raise ValueError("Premature end of file? Expected index of size %i at offest %i, found %s" \
387 % (index_length, index_offset, repr(data)))
388 magic_number, ver0, ver1, ver2, ver3 = struct.unpack(fmt, data)
389 if magic_number == _mft:
390
391
392
393 if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
394
395 raise ValueError("Unsupported version in .mft index header, %i.%i.%i.%i" \
396 % (ver0, ver1, ver2, ver3))
397 fmt2 = ">LL"
398 fmt2_size = struct.calcsize(fmt2)
399 xml_size, data_size = struct.unpack(fmt2, handle.read(fmt2_size))
400 if index_length != fmt_size + fmt2_size + xml_size + data_size:
401 raise ValueError("Problem understanding .mft index header, %i != %i + %i + %i + %i" \
402 % (index_length, fmt_size, fmt2_size, xml_size, data_size))
403 return number_of_reads, header_length, \
404 index_offset, index_length, \
405 index_offset + fmt_size + fmt2_size, xml_size, \
406 index_offset + fmt_size + fmt2_size + xml_size, data_size
407 elif magic_number == _srt:
408
409
410
411 if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
412
413 raise ValueError("Unsupported version in .srt index header, %i.%i.%i.%i" \
414 % (ver0, ver1, ver2, ver3))
415 data = handle.read(4)
416 if data != _null*4:
417 raise ValueError("Did not find expected null four bytes in .srt index")
418 return number_of_reads, header_length, \
419 index_offset, index_length, \
420 0, 0, \
421 index_offset + fmt_size + 4, index_length - fmt_size - 4
422 elif magic_number == _hsh:
423 raise ValueError("Hash table style indexes (.hsh) in SFF files are "
424 "not (yet) supported")
425 else:
426 raise ValueError("Unknown magic number %s in SFF index header:\n%s" \
427 % (repr(magic_number), repr(data)))
428
430 """Reads any existing Roche style XML manifest data in the SFF "index" (PRIVATE, DEPRECATED).
431
432 Will use the handle seek/tell functions. Returns a string.
433
434 This has been replaced by ReadRocheXmlManifest. We would normally just
435 delete an old private function without warning, but I believe some people
436 are using this so we'll handle this with a deprecation warning.
437 """
438 import warnings
439 warnings.warn("Private function _sff_read_roche_index_xml is deprecated. "
440 "Use new public function ReadRocheXmlManifest instead",
441 DeprecationWarning)
442 return ReadRocheXmlManifest(handle)
443
444
446 """Reads any Roche style XML manifest data in the SFF "index".
447
448 The SFF file format allows for multiple different index blocks, and Roche
449 took advantage of this to define their own index block wich also embeds
450 an XML manifest string. This is not a publically documented extension to
451 the SFF file format, this was reverse engineered.
452
453 The handle should be to an SFF file opened in binary mode. This function
454 will use the handle seek/tell functions and leave the handle in an
455 arbitrary location.
456
457 Any XML manifest found is returned as a Python string, which you can then
458 parse as appropriate, or reuse when writing out SFF files with the
459 SffWriter class.
460
461 Returns a string, or raises a ValueError if an Roche manifest could not be
462 found.
463 """
464 number_of_reads, header_length, index_offset, index_length, xml_offset, \
465 xml_size, read_index_offset, read_index_size = _sff_find_roche_index(handle)
466 if not xml_offset or not xml_size:
467 raise ValueError("No XML manifest found")
468 handle.seek(xml_offset)
469 return _bytes_to_string(handle.read(xml_size))
470
471
472
474 """Reads any existing Roche style read index provided in the SFF file (PRIVATE).
475
476 Will use the handle seek/tell functions.
477
478 This works on ".srt1.00" and ".mft1.00" style Roche SFF index blocks.
479
480 Roche SFF indices use base 255 not 256, meaning we see bytes in range the
481 range 0 to 254 only. This appears to be so that byte 0xFF (character 255)
482 can be used as a marker character to separate entries (required if the
483 read name lengths vary).
484
485 Note that since only four bytes are used for the read offset, this is
486 limited to 255^4 bytes (nearly 4GB). If you try to use the Roche sfffile
487 tool to combine SFF files beyound this limit, they issue a warning and
488 omit the index (and manifest).
489 """
490 number_of_reads, header_length, index_offset, index_length, xml_offset, \
491 xml_size, read_index_offset, read_index_size = _sff_find_roche_index(handle)
492
493 handle.seek(read_index_offset)
494 fmt = ">5B"
495 for read in range(number_of_reads):
496
497 data = handle.read(6)
498 while True:
499 more = handle.read(1)
500 if not more:
501 raise ValueError("Premature end of file!")
502 data += more
503 if more == _flag: break
504 assert data[-1:] == _flag, data[-1:]
505 name = _bytes_to_string(data[:-6])
506 off4, off3, off2, off1, off0 = struct.unpack(fmt, data[-6:-1])
507 offset = off0 + 255*off1 + 65025*off2 + 16581375*off3
508 if off4:
509
510
511
512 raise ValueError("Expected a null terminator to the read name.")
513 yield name, offset
514 if handle.tell() != read_index_offset + read_index_size:
515 raise ValueError("Problem with index length? %i vs %i" \
516 % (handle.tell(), read_index_offset + read_index_size))
517
518 -def _sff_read_seq_record(handle, number_of_flows_per_read, flow_chars,
519 key_sequence, alphabet, trim=False):
520 """Parse the next read in the file, return data as a SeqRecord (PRIVATE)."""
521
522
523
524
525
526
527
528
529
530
531 read_header_fmt = '>2HI4H'
532 read_header_size = struct.calcsize(read_header_fmt)
533 read_flow_fmt = ">%iH" % number_of_flows_per_read
534 read_flow_size = struct.calcsize(read_flow_fmt)
535
536 read_header_length, name_length, seq_len, clip_qual_left, \
537 clip_qual_right, clip_adapter_left, clip_adapter_right \
538 = struct.unpack(read_header_fmt, handle.read(read_header_size))
539 if clip_qual_left:
540 clip_qual_left -= 1
541 if clip_adapter_left:
542 clip_adapter_left -= 1
543 if read_header_length < 10 or read_header_length % 8 != 0:
544 raise ValueError("Malformed read header, says length is %i" \
545 % read_header_length)
546
547 name = _bytes_to_string(handle.read(name_length))
548 padding = read_header_length - read_header_size - name_length
549 if handle.read(padding).count(_null) != padding:
550 raise ValueError("Post name %i byte padding region contained data" \
551 % padding)
552
553
554 flow_values = handle.read(read_flow_size)
555 temp_fmt = ">%iB" % seq_len
556 flow_index = handle.read(seq_len)
557 seq = _bytes_to_string(handle.read(seq_len))
558 quals = list(struct.unpack(temp_fmt, handle.read(seq_len)))
559
560 padding = (read_flow_size + seq_len*3)%8
561 if padding:
562 padding = 8 - padding
563 if handle.read(padding).count(_null) != padding:
564 raise ValueError("Post quality %i byte padding region contained data" \
565 % padding)
566
567 if trim:
568 seq = seq[clip_qual_left:clip_qual_right].upper()
569 quals = quals[clip_qual_left:clip_qual_right]
570
571 annotations = {}
572 else:
573
574 seq = seq[:clip_qual_left].lower() + \
575 seq[clip_qual_left:clip_qual_right].upper() + \
576 seq[clip_qual_right:].lower()
577 annotations = {"flow_values":struct.unpack(read_flow_fmt, flow_values),
578 "flow_index":struct.unpack(temp_fmt, flow_index),
579 "flow_chars":flow_chars,
580 "flow_key":key_sequence,
581 "clip_qual_left":clip_qual_left,
582 "clip_qual_right":clip_qual_right,
583 "clip_adapter_left":clip_adapter_left,
584 "clip_adapter_right":clip_adapter_right}
585 record = SeqRecord(Seq(seq, alphabet),
586 id=name,
587 name=name,
588 description="",
589 annotations=annotations)
590
591
592 dict.__setitem__(record._per_letter_annotations,
593 "phred_quality", quals)
594
595
596 return record
597
598
600 """Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects).
601
602 handle - input file, an SFF file, e.g. from Roche 454 sequencing.
603 This must NOT be opened in universal read lines mode!
604 alphabet - optional alphabet, defaults to generic DNA.
605 trim - should the sequences be trimmed?
606
607 The resulting SeqRecord objects should match those from a paired FASTA
608 and QUAL file converted from the SFF file using the Roche 454 tool
609 ssfinfo. i.e. The sequence will be mixed case, with the trim regions
610 shown in lower case.
611
612 This function is used internally via the Bio.SeqIO functions:
613
614 >>> from Bio import SeqIO
615 >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
616 >>> for record in SeqIO.parse(handle, "sff"):
617 ... print record.id, len(record)
618 E3MFGYR02JWQ7T 265
619 E3MFGYR02JA6IL 271
620 E3MFGYR02JHD4H 310
621 E3MFGYR02GFKUC 299
622 E3MFGYR02FTGED 281
623 E3MFGYR02FR9G7 261
624 E3MFGYR02GAZMS 278
625 E3MFGYR02HHZ8O 221
626 E3MFGYR02GPGB1 269
627 E3MFGYR02F7Z7G 219
628 >>> handle.close()
629
630 You can also call it directly:
631
632 >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
633 >>> for record in SffIterator(handle):
634 ... print record.id, len(record)
635 E3MFGYR02JWQ7T 265
636 E3MFGYR02JA6IL 271
637 E3MFGYR02JHD4H 310
638 E3MFGYR02GFKUC 299
639 E3MFGYR02FTGED 281
640 E3MFGYR02FR9G7 261
641 E3MFGYR02GAZMS 278
642 E3MFGYR02HHZ8O 221
643 E3MFGYR02GPGB1 269
644 E3MFGYR02F7Z7G 219
645 >>> handle.close()
646
647 Or, with the trim option:
648
649 >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
650 >>> for record in SffIterator(handle, trim=True):
651 ... print record.id, len(record)
652 E3MFGYR02JWQ7T 260
653 E3MFGYR02JA6IL 265
654 E3MFGYR02JHD4H 292
655 E3MFGYR02GFKUC 295
656 E3MFGYR02FTGED 277
657 E3MFGYR02FR9G7 256
658 E3MFGYR02GAZMS 271
659 E3MFGYR02HHZ8O 150
660 E3MFGYR02GPGB1 221
661 E3MFGYR02F7Z7G 130
662 >>> handle.close()
663
664 """
665 if isinstance(Alphabet._get_base_alphabet(alphabet),
666 Alphabet.ProteinAlphabet):
667 raise ValueError("Invalid alphabet, SFF files do not hold proteins.")
668 if isinstance(Alphabet._get_base_alphabet(alphabet),
669 Alphabet.RNAAlphabet):
670 raise ValueError("Invalid alphabet, SFF files do not hold RNA.")
671 header_length, index_offset, index_length, number_of_reads, \
672 number_of_flows_per_read, flow_chars, key_sequence \
673 = _sff_file_header(handle)
674
675
676
677
678
679
680
681
682
683
684 read_header_fmt = '>2HI4H'
685 read_header_size = struct.calcsize(read_header_fmt)
686 read_flow_fmt = ">%iH" % number_of_flows_per_read
687 read_flow_size = struct.calcsize(read_flow_fmt)
688 assert 1 == struct.calcsize(">B")
689 assert 1 == struct.calcsize(">s")
690 assert 1 == struct.calcsize(">c")
691 assert read_header_size % 8 == 0
692
693
694
695 for read in range(number_of_reads):
696 if index_offset and handle.tell() == index_offset:
697 offset = index_offset + index_length
698 if offset % 8:
699 offset += 8 - (offset % 8)
700 assert offset % 8 == 0
701 handle.seek(offset)
702
703
704 index_offset = 0
705 yield _sff_read_seq_record(handle,
706 number_of_flows_per_read,
707 flow_chars,
708 key_sequence,
709 alphabet,
710 trim)
711
712
713 if index_offset and handle.tell() == index_offset:
714 offset = index_offset + index_length
715 if offset % 8:
716 offset += 8 - (offset % 8)
717 assert offset % 8 == 0
718 handle.seek(offset)
719
720 if handle.read(1):
721 raise ValueError("Additional data at end of SFF file")
722
723
724
726 """Iterate over SFF reads (as SeqRecord objects) with trimming (PRIVATE)."""
727 return SffIterator(handle, alphabet, trim=True)
728
729
731 """SFF file writer."""
732
733 - def __init__(self, handle, index=True, xml=None):
734 """Creates the writer object.
735
736 handle - Output handle, ideally in binary write mode.
737 index - Boolean argument, should we try and write an index?
738 xml - Optional string argument, xml manifest to be recorded in the index
739 block (see function ReadRocheXmlManifest for reading this data).
740 """
741 if hasattr(handle,"mode") and "U" in handle.mode.upper():
742 raise ValueError("SFF files must NOT be opened in universal new "
743 "lines mode. Binary mode is required")
744 elif hasattr(handle,"mode") and "B" not in handle.mode.upper():
745 raise ValueError("SFF files must be opened in binary mode")
746 self.handle = handle
747 self._xml = xml
748 if index:
749 self._index = []
750 else:
751 self._index = None
752
754 """Use this to write an entire file containing the given records."""
755 try:
756 self._number_of_reads = len(records)
757 except TypeError:
758 self._number_of_reads = 0
759 if not hasattr(self.handle, "seek") \
760 or not hasattr(self.handle, "tell"):
761 raise ValueError("A handle with a seek/tell methods is "
762 "required in order to record the total "
763 "record count in the file header (once it "
764 "is known at the end).")
765 if self._index is not None and \
766 not (hasattr(self.handle, "seek") and hasattr(self.handle, "tell")):
767 import warnings
768 warnings.warn("A handle with a seek/tell methods is required in "
769 "order to record an SFF index.")
770 self._index = None
771 self._index_start = 0
772 self._index_length = 0
773 if not hasattr(records, "next"):
774 records = iter(records)
775
776
777 try:
778 record = records.next()
779 except StopIteration:
780 record = None
781 if record is None:
782
783
784
785 raise ValueError("Need at least one record for SFF output")
786 try:
787 self._key_sequence = _as_bytes(record.annotations["flow_key"])
788 self._flow_chars = _as_bytes(record.annotations["flow_chars"])
789 self._number_of_flows_per_read = len(self._flow_chars)
790 except KeyError:
791 raise ValueError("Missing SFF flow information")
792 self.write_header()
793 self.write_record(record)
794 count = 1
795 for record in records:
796 self.write_record(record)
797 count += 1
798 if self._number_of_reads == 0:
799
800 offset = self.handle.tell()
801 self.handle.seek(0)
802 self._number_of_reads = count
803 self.write_header()
804 self.handle.seek(offset)
805 else:
806 assert count == self._number_of_reads
807 if self._index is not None:
808 self._write_index()
809 return count
810
812 assert len(self._index)==self._number_of_reads
813 handle = self.handle
814 self._index.sort()
815 self._index_start = handle.tell()
816
817 if self._xml is not None:
818 xml = _as_bytes(self._xml)
819 else:
820 from Bio import __version__
821 xml = "<!-- This file was output with Biopython %s -->\n" % __version__
822 xml += "<!-- This XML and index block attempts to mimic Roche SFF files -->\n"
823 xml += "<!-- This file may be a combination of multiple SFF files etc -->\n"
824 xml = _as_bytes(xml)
825 xml_len = len(xml)
826
827 fmt = ">I4BLL"
828 fmt_size = struct.calcsize(fmt)
829 handle.write(_null*fmt_size + xml)
830 fmt2 = ">6B"
831 assert 6 == struct.calcsize(fmt2)
832 self._index.sort()
833 index_len = 0
834 for name, offset in self._index:
835
836
837
838 off3 = offset
839 off0 = off3 % 255
840 off3 -= off0
841 off1 = off3 % 65025
842 off3 -= off1
843 off2 = off3 % 16581375
844 off3 -= off2
845 assert offset == off0 + off1 + off2 + off3, \
846 "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
847 off3, off2, off1, off0 = off3//16581375, off2//65025, \
848 off1//255, off0
849 assert off0 < 255 and off1 < 255 and off2 < 255 and off3 < 255, \
850 "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
851 handle.write(name + struct.pack(fmt2, 0, \
852 off3, off2, off1, off0, 255))
853 index_len += len(name) + 6
854
855 self._index_length = fmt_size + xml_len + index_len
856
857
858
859 if self._index_length % 8:
860 padding = 8 - (self._index_length%8)
861 handle.write(_null*padding)
862 else:
863 padding = 0
864 offset = handle.tell()
865 assert offset == self._index_start + self._index_length + padding, \
866 "%i vs %i + %i + %i" % (offset, self._index_start, \
867 self._index_length, padding)
868
869 handle.seek(self._index_start)
870 handle.write(struct.pack(fmt, 778921588,
871 49,46,48,48,
872 xml_len, index_len) + xml)
873
874 handle.seek(0)
875 self.write_header()
876 handle.seek(offset)
877
879
880 key_length = len(self._key_sequence)
881
882
883
884
885
886
887
888
889
890
891
892
893 fmt = '>I4BQIIHHHB%is%is' % (self._number_of_flows_per_read, key_length)
894
895
896
897
898 if struct.calcsize(fmt) % 8 == 0:
899 padding = 0
900 else:
901 padding = 8 - (struct.calcsize(fmt) % 8)
902 header_length = struct.calcsize(fmt) + padding
903 assert header_length % 8 == 0
904 header = struct.pack(fmt, 779314790,
905 0, 0, 0, 1,
906 self._index_start, self._index_length,
907 self._number_of_reads,
908 header_length, key_length,
909 self._number_of_flows_per_read,
910 1,
911 self._flow_chars, self._key_sequence)
912 self.handle.write(header + _null*padding)
913
915 """Write a single additional record to the output file.
916
917 This assumes the header has been done.
918 """
919
920 name = _as_bytes(record.id)
921 name_len = len(name)
922 seq = _as_bytes(str(record.seq).upper())
923 seq_len = len(seq)
924
925 try:
926 quals = record.letter_annotations["phred_quality"]
927 except KeyError:
928 raise ValueError("Missing PHRED qualities information")
929
930 try:
931 flow_values = record.annotations["flow_values"]
932 flow_index = record.annotations["flow_index"]
933 if self._key_sequence != _as_bytes(record.annotations["flow_key"]) \
934 or self._flow_chars != _as_bytes(record.annotations["flow_chars"]):
935 raise ValueError("Records have inconsistent SFF flow data")
936 except KeyError:
937 raise ValueError("Missing SFF flow information")
938 except AttributeError:
939 raise ValueError("Header not written yet?")
940
941 try:
942 clip_qual_left = record.annotations["clip_qual_left"]
943 if clip_qual_left:
944 clip_qual_left += 1
945 clip_qual_right = record.annotations["clip_qual_right"]
946 clip_adapter_left = record.annotations["clip_adapter_left"]
947 if clip_adapter_left:
948 clip_adapter_left += 1
949 clip_adapter_right = record.annotations["clip_adapter_right"]
950 except KeyError:
951 raise ValueError("Missing SFF clipping information")
952
953
954 if self._index is not None:
955 offset = self.handle.tell()
956
957
958
959 if offset > 4244897280:
960 import warnings
961 warnings.warn("Read %s has file offset %i, which is too large "
962 "to store in the Roche SFF index structure. No "
963 "index block will be recorded." % (name, offset))
964
965 self._index = None
966 else:
967 self._index.append((name, self.handle.tell()))
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983 read_header_fmt = '>2HI4H%is' % name_len
984 if struct.calcsize(read_header_fmt) % 8 == 0:
985 padding = 0
986 else:
987 padding = 8 - (struct.calcsize(read_header_fmt) % 8)
988 read_header_length = struct.calcsize(read_header_fmt) + padding
989 assert read_header_length % 8 == 0
990 data = struct.pack(read_header_fmt,
991 read_header_length,
992 name_len, seq_len,
993 clip_qual_left, clip_qual_right,
994 clip_adapter_left, clip_adapter_right,
995 name) + _null*padding
996 assert len(data) == read_header_length
997
998
999 read_flow_fmt = ">%iH" % self._number_of_flows_per_read
1000 read_flow_size = struct.calcsize(read_flow_fmt)
1001 temp_fmt = ">%iB" % seq_len
1002 data += struct.pack(read_flow_fmt, *flow_values) \
1003 + struct.pack(temp_fmt, *flow_index) \
1004 + seq \
1005 + struct.pack(temp_fmt, *quals)
1006
1007 padding = (read_flow_size + seq_len*3)%8
1008 if padding:
1009 padding = 8 - padding
1010 self.handle.write(data + _null*padding)
1011
1012
1013 if __name__ == "__main__":
1014 print "Running quick self test"
1015 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.sff"
1016 metadata = ReadRocheXmlManifest(open(filename, "rb"))
1017 index1 = sorted(_sff_read_roche_index(open(filename, "rb")))
1018 index2 = sorted(_sff_do_slow_index(open(filename, "rb")))
1019 assert index1 == index2
1020 assert len(index1) == len(list(SffIterator(open(filename, "rb"))))
1021 from StringIO import StringIO
1022 try:
1023
1024 from io import BytesIO
1025 except ImportError:
1026 BytesIO = StringIO
1027 assert len(index1) == len(list(SffIterator(BytesIO(open(filename,"rb").read()))))
1028
1029 if sys.platform != "win32":
1030 assert len(index1) == len(list(SffIterator(open(filename, "r"))))
1031 index2 = sorted(_sff_read_roche_index(open(filename)))
1032 assert index1 == index2
1033 index2 = sorted(_sff_do_slow_index(open(filename)))
1034 assert index1 == index2
1035 assert len(index1) == len(list(SffIterator(open(filename))))
1036 assert len(index1) == len(list(SffIterator(BytesIO(open(filename,"r").read()))))
1037 assert len(index1) == len(list(SffIterator(BytesIO(open(filename).read()))))
1038
1039 sff = list(SffIterator(open(filename, "rb")))
1040
1041 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
1042 assert len(sff) == len(sff2)
1043 for old, new in zip(sff, sff2):
1044 assert old.id == new.id
1045 assert str(old.seq) == str(new.seq)
1046
1047 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
1048 assert len(sff) == len(sff2)
1049 for old, new in zip(sff, sff2):
1050 assert old.id == new.id
1051 assert str(old.seq) == str(new.seq)
1052
1053 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
1054 assert len(sff) == len(sff2)
1055 for old, new in zip(sff, sff2):
1056 assert old.id == new.id
1057 assert str(old.seq) == str(new.seq)
1058
1059 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_index_at_start.sff", "rb")))
1060 assert len(sff) == len(sff2)
1061 for old, new in zip(sff, sff2):
1062 assert old.id == new.id
1063 assert str(old.seq) == str(new.seq)
1064
1065 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_index_in_middle.sff", "rb")))
1066 assert len(sff) == len(sff2)
1067 for old, new in zip(sff, sff2):
1068 assert old.id == new.id
1069 assert str(old.seq) == str(new.seq)
1070
1071 sff_trim = list(SffIterator(open(filename, "rb"), trim=True))
1072
1073 print ReadRocheXmlManifest(open(filename, "rb"))
1074
1075 from Bio import SeqIO
1076 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads_no_trim.fasta"
1077 fasta_no_trim = list(SeqIO.parse(open(filename,"rU"), "fasta"))
1078 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads_no_trim.qual"
1079 qual_no_trim = list(SeqIO.parse(open(filename,"rU"), "qual"))
1080
1081 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.fasta"
1082 fasta_trim = list(SeqIO.parse(open(filename,"rU"), "fasta"))
1083 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.qual"
1084 qual_trim = list(SeqIO.parse(open(filename,"rU"), "qual"))
1085
1086 for s, sT, f, q, fT, qT in zip(sff, sff_trim, fasta_no_trim,
1087 qual_no_trim, fasta_trim, qual_trim):
1088
1089 print s.id
1090
1091
1092
1093 assert s.id == f.id == q.id
1094 assert str(s.seq) == str(f.seq)
1095 assert s.letter_annotations["phred_quality"] == q.letter_annotations["phred_quality"]
1096
1097 assert s.id == sT.id == fT.id == qT.id
1098 assert str(sT.seq) == str(fT.seq)
1099 assert sT.letter_annotations["phred_quality"] == qT.letter_annotations["phred_quality"]
1100
1101
1102 print "Writing with a list of SeqRecords..."
1103 handle = StringIO()
1104 w = SffWriter(handle, xml=metadata)
1105 w.write_file(sff)
1106 data = handle.getvalue()
1107 print "And again with an iterator..."
1108 handle = StringIO()
1109 w = SffWriter(handle, xml=metadata)
1110 w.write_file(iter(sff))
1111 assert data == handle.getvalue()
1112
1113 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.sff"
1114 original = open(filename,"rb").read()
1115 assert len(data) == len(original)
1116 assert data == original
1117 del data
1118 handle.close()
1119
1120 print "-"*50
1121 filename = "../../Tests/Roche/greek.sff"
1122 for record in SffIterator(open(filename,"rb")):
1123 print record.id
1124 index1 = sorted(_sff_read_roche_index(open(filename, "rb")))
1125 index2 = sorted(_sff_do_slow_index(open(filename, "rb")))
1126 assert index1 == index2
1127 try:
1128 print ReadRocheXmlManifest(open(filename, "rb"))
1129 assert False, "Should fail!"
1130 except ValueError:
1131 pass
1132
1133
1134 handle = open(filename, "rb")
1135 for record in SffIterator(handle):
1136 pass
1137 try:
1138 for record in SffIterator(handle):
1139 print record.id
1140 assert False, "Should have failed"
1141 except ValueError, err:
1142 print "Checking what happens on re-reading a handle:"
1143 print err
1144
1145
1146 """
1147 #Ugly code to make test files...
1148 index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
1149 padding = len(index)%8
1150 if padding:
1151 padding = 8 - padding
1152 index += chr(0)*padding
1153 assert len(index)%8 == 0
1154
1155 #Ugly bit of code to make a fake index at start
1156 records = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
1157 out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "w")
1158 index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
1159 padding = len(index)%8
1160 if padding:
1161 padding = 8 - padding
1162 index += chr(0)*padding
1163 w = SffWriter(out_handle, index=False, xml=None)
1164 #Fake the header...
1165 w._number_of_reads = len(records)
1166 w._index_start = 0
1167 w._index_length = 0
1168 w._key_sequence = records[0].annotations["flow_key"]
1169 w._flow_chars = records[0].annotations["flow_chars"]
1170 w._number_of_flows_per_read = len(w._flow_chars)
1171 w.write_header()
1172 w._index_start = out_handle.tell()
1173 w._index_length = len(index)
1174 out_handle.seek(0)
1175 w.write_header() #this time with index info
1176 w.handle.write(index)
1177 for record in records:
1178 w.write_record(record)
1179 out_handle.close()
1180 records2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
1181 for old, new in zip(records, records2):
1182 assert str(old.seq)==str(new.seq)
1183 i = list(_sff_do_slow_index(open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
1184
1185 #Ugly bit of code to make a fake index in middle
1186 records = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
1187 out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "w")
1188 index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
1189 padding = len(index)%8
1190 if padding:
1191 padding = 8 - padding
1192 index += chr(0)*padding
1193 w = SffWriter(out_handle, index=False, xml=None)
1194 #Fake the header...
1195 w._number_of_reads = len(records)
1196 w._index_start = 0
1197 w._index_length = 0
1198 w._key_sequence = records[0].annotations["flow_key"]
1199 w._flow_chars = records[0].annotations["flow_chars"]
1200 w._number_of_flows_per_read = len(w._flow_chars)
1201 w.write_header()
1202 for record in records[:5]:
1203 w.write_record(record)
1204 w._index_start = out_handle.tell()
1205 w._index_length = len(index)
1206 w.handle.write(index)
1207 for record in records[5:]:
1208 w.write_record(record)
1209 out_handle.seek(0)
1210 w.write_header() #this time with index info
1211 out_handle.close()
1212 records2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
1213 for old, new in zip(records, records2):
1214 assert str(old.seq)==str(new.seq)
1215 j = list(_sff_do_slow_index(open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
1216
1217 #Ugly bit of code to make a fake index at end
1218 records = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
1219 out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "w")
1220 w = SffWriter(out_handle, index=False, xml=None)
1221 #Fake the header...
1222 w._number_of_reads = len(records)
1223 w._index_start = 0
1224 w._index_length = 0
1225 w._key_sequence = records[0].annotations["flow_key"]
1226 w._flow_chars = records[0].annotations["flow_chars"]
1227 w._number_of_flows_per_read = len(w._flow_chars)
1228 w.write_header()
1229 for record in records:
1230 w.write_record(record)
1231 w._index_start = out_handle.tell()
1232 w._index_length = len(index)
1233 out_handle.write(index)
1234 out_handle.seek(0)
1235 w.write_header() #this time with index info
1236 out_handle.close()
1237 records2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
1238 for old, new in zip(records, records2):
1239 assert str(old.seq)==str(new.seq)
1240 try:
1241 print ReadRocheXmlManifest(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb"))
1242 assert False, "Should fail!"
1243 except ValueError:
1244 pass
1245 k = list(_sff_do_slow_index(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
1246 """
1247
1248 print "Done"
1249