Package Bio :: Package SeqIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.FastaIO

  1  # Copyright 2006-2009 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # This module is for reading and writing FASTA format files as SeqRecord 
  7  # objects.  The code is partly inspired  by earlier Biopython modules, 
  8  # Bio.Fasta.* and the now deprecated Bio.SeqIO.FASTA 
  9   
 10  """Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format. 
 11   
 12  You are expected to use this module via the Bio.SeqIO functions.""" 
 13   
 14  from Bio.Alphabet import single_letter_alphabet 
 15  from Bio.Seq import Seq 
 16  from Bio.SeqRecord import SeqRecord 
 17  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
 18   
 19  #This is a generator function! 
20 -def FastaIterator(handle, alphabet = single_letter_alphabet, title2ids = None):
21 """Generator function to iterate over Fasta records (as SeqRecord objects). 22 23 handle - input file 24 alphabet - optional alphabet 25 title2ids - A function that, when given the title of the FASTA 26 file (without the beginning >), will return the id, name and 27 description (in that order) for the record as a tuple of strings. 28 29 If this is not given, then the entire title line will be used 30 as the description, and the first word as the id and name. 31 32 Note that use of title2ids matches that of Bio.Fasta.SequenceParser 33 but the defaults are slightly different. 34 """ 35 #Skip any text before the first record (e.g. blank lines, comments) 36 while True: 37 line = handle.readline() 38 if line == "" : return #Premature end of file, or just empty? 39 if line[0] == ">": 40 break 41 42 while True: 43 if line[0]!=">": 44 raise ValueError("Records in Fasta files should start with '>' character") 45 if title2ids: 46 id, name, descr = title2ids(line[1:].rstrip()) 47 else: 48 descr = line[1:].rstrip() 49 id = descr.split()[0] 50 name = id 51 52 lines = [] 53 line = handle.readline() 54 while True: 55 if not line : break 56 if line[0] == ">": break 57 #Remove trailing whitespace, and any internal spaces 58 #(and any embedded \r which are possible in mangled files 59 #when not opened in universal read lines mode) 60 lines.append(line.rstrip().replace(" ","").replace("\r","")) 61 line = handle.readline() 62 63 #Return the record and then continue... 64 yield SeqRecord(Seq("".join(lines), alphabet), 65 id = id, name = name, description = descr) 66 67 if not line : return #StopIteration 68 assert False, "Should not reach this line"
69
70 -class FastaWriter(SequentialSequenceWriter):
71 """Class to write Fasta format files."""
72 - def __init__(self, handle, wrap=60, record2title=None):
73 """Create a Fasta writer. 74 75 handle - Handle to an output file, e.g. as returned 76 by open(filename, "w") 77 wrap - Optional line length used to wrap sequence lines. 78 Defaults to wrapping the sequence at 60 characters 79 Use zero (or None) for no wrapping, giving a single 80 long line for the sequence. 81 record2title - Optional function to return the text to be 82 used for the title line of each record. By default the 83 a combination of the record.id and record.description 84 is used. If the record.description starts with the 85 record.id, then just the record.description is used. 86 87 You can either use: 88 89 myWriter = FastaWriter(open(filename,"w")) 90 writer.write_file(myRecords) 91 92 Or, follow the sequential file writer system, for example: 93 94 myWriter = FastaWriter(open(filename,"w")) 95 writer.write_header() # does nothing for Fasta files 96 ... 97 Multiple calls to writer.write_record() and/or writer.write_records() 98 ... 99 writer.write_footer() # does nothing for Fasta files 100 writer.close() 101 """ 102 SequentialSequenceWriter.__init__(self, handle) 103 #self.handle = handle 104 self.wrap = None 105 if wrap: 106 if wrap < 1: 107 raise ValueError 108 self.wrap = wrap 109 self.record2title = record2title
110
111 - def write_record(self, record):
112 """Write a single Fasta record to the file.""" 113 assert self._header_written 114 assert not self._footer_written 115 self._record_written = True 116 117 if self.record2title: 118 title=self.clean(self.record2title(record)) 119 else: 120 id = self.clean(record.id) 121 description = self.clean(record.description) 122 123 #if description[:len(id)]==id: 124 if description and description.split(None,1)[0]==id: 125 #The description includes the id at the start 126 title = description 127 elif description: 128 title = "%s %s" % (id, description) 129 else: 130 title = id 131 132 assert "\n" not in title 133 assert "\r" not in title 134 self.handle.write(">%s\n" % title) 135 136 data = self._get_seq_string(record) #Catches sequence being None 137 138 assert "\n" not in data 139 assert "\r" not in data 140 141 if self.wrap: 142 for i in range(0, len(data), self.wrap): 143 self.handle.write(data[i:i+self.wrap] + "\n") 144 else: 145 self.handle.write(data + "\n")
146 147 if __name__ == "__main__": 148 print "Running quick self test" 149 150 import os 151 from Bio.Alphabet import generic_protein, generic_nucleotide 152 153 #Download the files from here: 154 #ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Nanoarchaeum_equitans 155 fna_filename = "NC_005213.fna" 156 faa_filename = "NC_005213.faa" 157
158 - def genbank_name_function(text):
159 text, descr = text.split(None,1) 160 id = text.split("|")[3] 161 name = id.split(".",1)[0] 162 return id, name, descr
163 176 177 if os.path.isfile(fna_filename): 178 print "--------" 179 print "FastaIterator (single sequence)" 180 iterator = FastaIterator(open(fna_filename, "r"), alphabet=generic_nucleotide, title2ids=genbank_name_function) 181 count=0 182 for record in iterator: 183 count=count+1 184 print_record(record) 185 assert count == 1 186 print str(record.__class__) 187 188 if os.path.isfile(faa_filename): 189 print "--------" 190 print "FastaIterator (multiple sequences)" 191 iterator = FastaIterator(open(faa_filename, "r"), alphabet=generic_protein, title2ids=genbank_name_function) 192 count=0 193 for record in iterator: 194 count=count+1 195 print_record(record) 196 break 197 assert count>0 198 print str(record.__class__) 199 200 from cStringIO import StringIO 201 print "--------" 202 print "FastaIterator (empty input file)" 203 #Just to make sure no errors happen 204 iterator = FastaIterator(StringIO("")) 205 count = 0 206 for record in iterator: 207 count = count+1 208 assert count==0 209 210 print "Done" 211