1
2
3
4
5 """Dictionary like indexing of sequence files (PRIVATE).
6
7 You are not expected to access this module, or any of its code, directly. This
8 is all handled internally by the Bio.SeqIO.index(...) function which is the
9 public interface for this functionality.
10
11 The basic idea is that we scan over a sequence file, looking for new record
12 markers. We then try and extract the string that Bio.SeqIO.parse/read would
13 use as the record id, ideally without actually parsing the full record. We
14 then use a subclassed Python dictionary to record the file offset for the
15 record start against the record id.
16
17 Note that this means full parsing is on demand, so any invalid or problem
18 record may not trigger an exception until it is accessed. This is by design.
19
20 This means our dictionary like objects have in memory ALL the keys (all the
21 record identifiers), which shouldn't be a problem even with second generation
22 sequencing. If this is an issue later on, storing the keys and offsets in a
23 temp lookup file might be one idea (e.g. using SQLite or an OBDA style index).
24 """
25
26 import re
27 from Bio import SeqIO
28 from Bio import Alphabet
29
31 """Read only dictionary interface to a sequential sequence file.
32
33 Keeps the keys in memory, reads the file to access entries as
34 SeqRecord objects using Bio.SeqIO for parsing them. This approach
35 is memory limited, but will work even with millions of sequences.
36
37 Note - as with the Bio.SeqIO.to_dict() function, duplicate keys
38 (record identifiers by default) are not allowed. If this happens,
39 a ValueError exception is raised.
40
41 By default the SeqRecord's id string is used as the dictionary
42 key. This can be changed by supplying an optional key_function,
43 a callback function which will be given the record id and must
44 return the desired key. For example, this allows you to parse
45 NCBI style FASTA identifiers, and extract the GI number to use
46 as the dictionary key.
47
48 Note that this dictionary is essentially read only. You cannot
49 add or change values, pop values, nor clear the dictionary.
50 """
def __init__(self, filename, format, alphabet, key_function):
    """Open the sequence file and index every record offset (PRIVATE).

    Scans the file once via self._build(), storing each record key
    against its file offset. Duplicate keys raise a ValueError.
    """
    dict.__init__(self)
    # Binary formats (e.g. SFF) must not be opened in universal
    # newline mode, which would corrupt the data.
    if format in SeqIO._BinaryFormats:
        handle_mode = "rb"
    else:
        handle_mode = "rU"
    self._handle = open(filename, handle_mode)
    self._alphabet = alphabet
    self._format = format
    self._key_function = key_function
    # Optionally remap each record id through the caller's key_function.
    if key_function:
        pairs = ((key_function(identifier), pos)
                 for (identifier, pos) in self._build())
    else:
        pairs = self._build()
    for identifier, pos in pairs:
        if identifier in self:
            raise ValueError("Duplicate key '%s'" % identifier)
        dict.__setitem__(self, identifier, pos)
71
73 """Actually scan the file identifying records and offsets (PRIVATE).
74
75 Returns an iterator giving tuples of record names and their offsets.
76 """
77 pass
78
80 return "SeqIO.index('%s', '%s', alphabet=%s, key_function=%s)" \
81 % (self._handle.name, self._format,
82 repr(self._alphabet), self._key_function)
83
85 if self:
86 return "{%s : SeqRecord(...), ...}" % repr(self.keys()[0])
87 else:
88 return "{}"
89
90 if hasattr(dict, "iteritems"):
91
93 """Would be a list of the SeqRecord objects, but not implemented.
94
95 In general you can be indexing very very large files, with millions
96 of sequences. Loading all these into memory at once as SeqRecord
97 objects would (probably) use up all the RAM. Therefore we simply
98 don't support this dictionary method.
99 """
100 raise NotImplementedError("Due to memory concerns, when indexing a "
101 "sequence file you cannot access all the "
102 "records at once.")
103
105 """Would be a list of the (key, SeqRecord) tuples, but not implemented.
106
107 In general you can be indexing very very large files, with millions
108 of sequences. Loading all these into memory at once as SeqRecord
109 objects would (probably) use up all the RAM. Therefore we simply
110 don't support this dictionary method.
111 """
112 raise NotImplementedError("Due to memory concerns, when indexing a "
113 "sequence file you cannot access all the "
114 "records at once.")
115
120 else:
121
126
131
132
134 """x.__getitem__(y) <==> x[y]"""
135
136 raise NotImplementedError("Not implemented for this file format (yet).")
137
def get(self, k, d=None):
    """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
    # EAFP: attempt the lookup and fall back to the default on a miss.
    try:
        return self[k]
    except KeyError:
        return d
144
146 """Similar to the get method, but returns the record as a raw string.
147
148 If the key is not found, a KeyError exception is raised.
149
150 NOTE - This functionality is not supported for every file format.
151 """
152
153 raise NotImplementedError("Not available for this file format.")
154
156 """Would allow setting or replacing records, but not implemented."""
157 raise NotImplementedError("An indexed a sequence file is read only.")
158
160 """Would allow adding more values, but not implemented."""
161 raise NotImplementedError("An indexed a sequence file is read only.")
162
163
def pop(self, key, default=None):
    """Would remove specified record, but not implemented.

    Raises:
        NotImplementedError: always; an indexed sequence file is
            read only so records cannot be removed.
    """
    # Fixed the duplicated article in the original message
    # ("An indexed a sequence file").
    raise NotImplementedError("An indexed sequence file is read only.")
167
169 """Would remove and return a SeqRecord, but not implemented."""
170 raise NotImplementedError("An indexed a sequence file is read only.")
171
172
174 """Would clear dictionary, but not implemented."""
175 raise NotImplementedError("An indexed a sequence file is read only.")
176
178 """A dictionary method which we don't implement."""
179 raise NotImplementedError("An indexed a sequence file doesn't "
180 "support this.")
181
183 """A dictionary method which we don't implement."""
184 raise NotImplementedError("An indexed a sequence file doesn't "
185 "support this.")
186
187
188
189
190
191
192
193
194
195
196
197
class SffDict(_IndexedSeqFileDict):
    """Indexed dictionary like access to a Standard Flowgram Format (SFF) file."""

    def _build(self):
        """Load any index block in the file, or build it the slow way (PRIVATE)."""
        if self._alphabet is None:
            self._alphabet = Alphabet.generic_dna
        handle = self._handle
        # Parse the SFF header, caching flow information on self for
        # later use when parsing individual reads.
        (header_length, index_offset, index_length, number_of_reads,
         self._flows_per_read, self._flow_chars,
         self._key_sequence) = SeqIO.SffIO._sff_file_header(handle)
        if index_offset and index_length:
            # The file contains a built-in (Roche) index block - try it first.
            try:
                for name, offset in SeqIO.SffIO._sff_read_roche_index(handle):
                    yield name, offset
                assert len(self) == number_of_reads, \
                    "Indexed %i records, expected %i" \
                    % (len(self), number_of_reads)
                return
            except ValueError as err:
                import warnings
                warnings.warn("Could not parse the SFF index: %s" % err)
                assert len(self) == 0, "Partially populated index"
                handle.seek(0)
        else:
            import warnings
            warnings.warn("No SFF index, doing it the slow way")
        # Fall back on scanning every read to build the index.
        for name, offset in SeqIO.SffIO._sff_do_slow_index(handle):
            yield name, offset
        assert len(self) == number_of_reads, \
            "Indexed %i records, expected %i" % (len(self), number_of_reads)
232
241
242
253
254
255
256
257
258
260 """Indexed dictionary like access to most sequential sequence files."""
261 - def __init__(self, filename, format, alphabet, key_function):
272 else:
273
274 def _parse():
275 """Dynamically generated parser function (PRIVATE)."""
276 try:
277 return i(self._handle, alphabet=alphabet).next()
278 except TypeError:
279 return SeqIO._force_alphabet(i(self._handle),
280 alphabet).next()
281 self._parse = _parse
282
def _build(self):
    """Scan for record marker lines, yielding (id, offset) tuples (PRIVATE).

    Uses a per-format marker string to spot the start of each record;
    the record id is taken as the first word after the marker.
    """
    handle = self._handle
    markers = {"ace": "CO ",
               "fasta": ">",
               "phd": "BEGIN_SEQUENCE",
               "pir": ">..;",
               "qual": ">"}
    marker = markers[self._format]
    marker_re = re.compile("^%s" % marker)
    marker_offset = len(marker)
    # Saved on self for use by other methods (e.g. raw record access).
    self._marker_re = marker_re
    while True:
        offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line):
            # New record - id is the first word after the marker.
            yield line[marker_offset:].strip().split(None, 1)[0], offset
        elif not line:
            # End of file
            break
304
320
335
336
337
338
339
340
342 """Indexed dictionary like access to a GenBank file."""
def _build(self):
    """Scan GenBank records, yielding (accession/version, offset) (PRIVATE).

    For each LOCUS line found, reads ahead for the ACCESSION and
    VERSION lines to pick the record key, preferring a versioned
    accession (e.g. "AB12345.1") when present.
    """
    handle = self._handle
    marker_re = re.compile("^LOCUS ")
    self._marker_re = marker_re
    while True:
        offset = handle.tell()
        line = handle.readline()
        if not line:
            # End of file
            break
        if not marker_re.match(line):
            continue
        # New record - now hunt for the key in the header lines.
        key = None
        while True:
            line = handle.readline()
            if line.startswith("ACCESSION "):
                key = line.rstrip().split()[1]
            elif line.startswith("VERSION "):
                version_id = line.rstrip().split()[1]
                if version_id.count(".") == 1 \
                        and version_id.split(".")[1].isdigit():
                    # Looks like a versioned accession; prefer it as key.
                    key = version_id
                    break
            elif line.startswith("FEATURES ") \
                    or line.startswith("ORIGIN ") \
                    or line.startswith("//") \
                    or marker_re.match(line) \
                    or not line:
                # Past the header section (or hit EOF/next record).
                break
        if not key:
            raise ValueError("Did not find ACCESSION/VERSION lines")
        yield key, offset
379
380
382 """Indexed dictionary like access to an EMBL file."""
423
424
426 """Indexed dictionary like access to a SwissProt file."""
444
445
447 """Indexed dictionary like access to a UniProt XML file."""
def _build(self):
    """Scan UniProt XML for entries, yielding (accession, offset) (PRIVATE).

    Each <entry ...> element starts a record; the key is the value of
    its first <accession> element.
    """
    handle = self._handle
    marker_re = re.compile("^<entry ")
    self._marker_re = marker_re
    while True:
        offset = handle.tell()
        line = handle.readline()
        if not line:
            # End of file
            break
        if not marker_re.match(line):
            continue
        # New record - find the first accession.
        key = None
        while True:
            line = handle.readline()
            if line.startswith("<accession>"):
                assert "</accession>" in line, line
                key = line[11:].split("<")[0]
                break
            elif "</entry>" in line:
                break
            elif marker_re.match(line) or not line:
                # Hit the next entry or EOF before this one closed.
                raise ValueError("Didn't find end of record")
        if not key:
            raise ValueError("Did not find <accession> line")
        yield key, offset
477
def get_raw(self, key):
    """Similar to the get method, but returns the record as a raw string."""
    handle = self._handle
    marker_re = self._marker_re
    # Jump straight to the stored offset for this key.
    handle.seek(dict.__getitem__(self, key))
    data = handle.readline()
    while True:
        line = handle.readline()
        end = line.find("</entry>")
        if end >= 0:
            # Keep everything up to and including the closing tag.
            data += line[:end + len("</entry>")]
            break
        if marker_re.match(line) or not line:
            # Hit the next entry or EOF before this one closed.
            raise ValueError("Didn't find end of record")
        data += line
    return data
495
def __getitem__(self, key):
    """Return the record for the given key as a SeqRecord object."""
    # Wrap the raw <entry> XML in a minimal uniprot document so the
    # UniprotIO parser can handle it as a complete file.
    data = """<?xml version='1.0' encoding='UTF-8'?>
<uniprot xmlns="http://uniprot.org/uniprot"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://uniprot.org/uniprot
http://www.uniprot.org/support/docs/uniprot.xsd">
%s
</uniprot>
""" % self.get_raw(key)
    return SeqIO.UniprotIO.UniprotIterator(data).next()
510
511
513 """Indexed dictionary like access to a IntelliGenetics file."""
def _build(self):
    """Scan IntelliGenetics records, yielding (id, offset) (PRIVATE).

    Records start with one or more ';' comment lines; the record id is
    the first word on the first following non-comment, non-blank line.

    Raises:
        ValueError: if the file ends inside a record's comment block.
    """
    handle = self._handle
    marker_re = re.compile("^;")
    self._marker_re = marker_re
    while True:
        offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line):
            # Skip the remaining comment lines to find the title line.
            while True:
                line = handle.readline()
                # Bug fix: check for EOF *before* indexing line[0];
                # the original evaluated line[0] first, so a truncated
                # file raised IndexError instead of this ValueError.
                if not line:
                    raise ValueError("Premature end of file?")
                if line[0] != ";" and line.strip():
                    key = line.split()[0]
                    yield key, offset
                    break
        elif not line:
            # End of file
            break
534
535
class TabDict(SequentialSeqFileDict):
    """Indexed dictionary like access to a simple tabbed file."""

    def _build(self):
        """Yield (key, offset) for each line of the tab file (PRIVATE).

        The key is everything before the first tab on each line.
        """
        handle = self._handle
        while True:
            offset = handle.tell()
            line = handle.readline()
            if not line:
                break  # End of file
            try:
                key = line.split("\t")[0]
            except ValueError as err:
                if not line.strip():
                    # Ignore blank lines
                    continue
                else:
                    raise err
            else:
                yield key, offset
554
560
561
562
563
564
565
567 """Indexed dictionary like access to a FASTQ file (any supported variant).
568
569 With FASTQ the records all start with a "@" line, but so can quality lines.
570 Note this will cope with line-wrapped FASTQ files.
571 """
def _build(self):
    """Scan the FASTQ file, yielding (id, offset) tuples (PRIVATE).

    Copes with line-wrapped FASTQ: since quality lines can also start
    with "@", the end of each record is found by matching the total
    sequence length against the total quality length.

    Raises:
        ValueError: on a malformed record or premature end of file.
    """
    handle = self._handle
    pos = handle.tell()
    line = handle.readline()
    if not line:
        # Empty file!
        return
    if line[0] != "@":
        raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
    while line:
        # Line starts with "@" - this record seems OK (so far).
        yield line[1:].rstrip().split(None, 1)[0], pos
        # Find the seq line(s), stopping at the "+" separator.
        seq_len = 0
        while line:
            line = handle.readline()
            if line.startswith("+"):
                break
            seq_len += len(line.strip())
            if not line:
                raise ValueError("Premature end of file in seq section")
        # Now the quality line(s); record ends once lengths match.
        qual_len = 0
        while line:
            if seq_len == qual_len:
                # Should be end of record...
                pos = handle.tell()
                line = handle.readline()
                if line and line[0] != "@":
                    # Bug fix: the original constructed this ValueError
                    # without raising it, silently accepting bad data.
                    raise ValueError("Problem with line %s" % repr(line))
                break
            else:
                line = handle.readline()
                qual_len += len(line.strip())
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
609
610
def get_raw(self, key):
    """Similar to the get method, but returns the record as a raw string.

    Seeks to the stored offset for the key, re-parses the record's
    extent (handling line-wrapped FASTQ) and returns its exact text.

    Raises:
        ValueError: if the key does not match the record found at the
            stored offset, or the record is malformed.
    """
    handle = self._handle
    handle.seek(dict.__getitem__(self, key))
    line = handle.readline()
    data = line
    if line[0] != "@":
        raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
    identifier = line[1:].rstrip().split(None, 1)[0]
    if self._key_function:
        identifier = self._key_function(identifier)
    if key != identifier:
        raise ValueError("Key did not match")
    # Find the seq line(s), stopping at the "+" separator.
    seq_len = 0
    while line:
        line = handle.readline()
        data += line
        if line.startswith("+"):
            break
        seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
    assert line[0] == "+"
    # Now the quality line(s); record ends once lengths match.
    qual_len = 0
    while line:
        if seq_len == qual_len:
            # Should be end of record...
            pos = handle.tell()
            line = handle.readline()
            if line and line[0] != "@":
                # Bug fix: the original constructed this ValueError
                # without raising it, silently accepting bad data.
                raise ValueError("Problem with line %s" % repr(line))
            break
        else:
            line = handle.readline()
            data += line
            qual_len += len(line.strip())
    if seq_len != qual_len:
        raise ValueError("Problem with quality section")
    return data
652
653
654
655
# Lookup table mapping each supported file format name to the dict
# subclass that knows how to index it.
_FormatToIndexedDict = {
    "ace": SequentialSeqFileDict,
    "embl": EmblDict,
    "fasta": SequentialSeqFileDict,
    "fastq": FastqDict,
    "fastq-sanger": FastqDict,
    "fastq-solexa": FastqDict,
    "fastq-illumina": FastqDict,
    "genbank": GenBankDict,
    "gb": GenBankDict,
    "ig": IntelliGeneticsDict,
    "imgt": EmblDict,
    "phd": SequentialSeqFileDict,
    "pir": SequentialSeqFileDict,
    "sff": SffDict,
    "sff-trim": SffTrimmedDict,
    "swiss": SwissDict,
    "tab": TabDict,
    "qual": SequentialSeqFileDict,
    "uniprot-xml": UniprotDict,
}
676