Package Bio :: Package GFF
[hide private]
[frames] | no frames]

Source Code for Package Bio.GFF

  1  #!/usr/bin/env python 
  2  # 
  3  # Copyright 2002 by Michael Hoffman.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Access to General Feature Format databases created with BioPerl (DEPRECATED). 
  9   
 10  This is the "old" Bio.GFF module by Michael Hoffman, which offers access to 
 11  a MySQL database holding GFF data loaded by BioPerl. This code has now been 
 12  deprecated, and will be removed (or at best, relocated) in order to free the 
 13  Bio.GFF namespace for a new GFF parser in Biopython (including GFF3 support). 
 14   
 15  Based on documentation for Lincoln Stein's Perl Bio::DB::GFF 
 16   
 17  >>> import os 
 18  >>> import Bio.GFF 
 19  >>> PASSWORD = os.environ['MYSQLPASS'] 
 20  >>> DATABASE_GFF = 'wormbase_ws93' 
 21  >>> FASTADIR = '/local/wormbase_ws93/' 
 22  >>> gff = Bio.GFF.Connection(passwd=PASSWORD, db=DATABASE_GFF, fastadir=FASTADIR) 
 23   
 24  """ 
 25   
 26  import warnings 
 27  import Bio 
 28  warnings.warn("The old Bio.GFF module for access to a MySQL GFF database " 
 29                "created with BioPerl is deprecated, and will be removed (or " 
 30                "possibly just moved) in a future release of Biopython.  If you " 
 31                "want to continue to use this code, please get in contact with " 
 32                "the developers via the mailing lists to avoid its permanent " 
 33                "removal from Biopython. The plan is to re-use the Bio.GFF " 
 34                "namespace for a new GFF parsing module.", Bio.BiopythonDeprecationWarning) 
 35   
 36  __version__ = "$Revision: 1.10 $" 
 37  # $Source: /home/bartek/cvs2bzr/biopython_fastimport/cvs_repo/biopython/Bio/GFF/__init__.py,v $ 
 38   
 39  import operator 
 40  import os.path 
 41  import sys 
 42  import types 
 43   
 44  from Bio import MissingPythonDependencyError 
 45   
 46  try: 
 47      import MySQLdb 
 48  except: 
 49      raise MissingPythonDependencyError("Install MySQLdb if you want to use " 
 50                                         "Bio.GFF.") 
 51   
 52  from Bio.Alphabet import IUPAC 
 53  from Bio import DocSQL 
 54  from Bio.Seq import Seq 
 55  from Bio.SeqRecord import SeqRecord 
 56   
 57  import binning 
 58  import easy 
 59  import GenericTools 
 60   
 61   
 62  DEFAULT_ALPHABET = IUPAC.unambiguous_dna 
 63   
64 -class Segment(object):
65 """ 66 this will only work for the simplest of easy.Location objects 67 """
68 - def __init__(self, gff, location=None):
69 self.gff = gff 70 if location is None: 71 self.seqname = None 72 self.coords = None 73 self.complement = None 74 else: 75 self.seqname = location.seqname 76 self.coords = [location.start(), location.end()] 77 self.complement = location.complement
78
79 - def features(self, *args, **keywds):
80 return FeatureQuery(self.seqname, self.coords, self.complement, connection=self.gff.db, *args, **keywds)
81
82 - def single_feature(self, *args, **keywds):
83 features = self.features(*args, **keywds) 84 all_features = GenericTools.all(features) 85 assert len(all_features) == 1 86 return all_features[0]
87
88 -class Connection(Segment):
89 """ 90 Connection to GFF database 91 92 also functions as whole segment 93 """ 94
95 - def __init__(self, *args, **keywds):
96 try: 97 RetrieveSeqname._dir = keywds['fastadir'] 98 del keywds['fastadir'] 99 except KeyError: 100 RetrieveSeqname._dir = '.' 101 self.db = MySQLdb.connect(*args, **keywds) 102 Segment.__init__(self, self)
103
104 - def segment(self, *args, **keywds):
105 return Segment(self, *args, **keywds)
106
107 -class RetrieveSeqname(GenericTools.Surrogate, SeqRecord):
108 """ 109 Singleton: contain records of loaded FASTA files 110 111 >>> RetrieveSeqname._dir = 'GFF' 112 >>> RetrieveSeqname._diagnostics = 1 113 >>> sys.stderr = sys.stdout # dump diagnostics to stdout so doctest can see them 114 >>> record1 = RetrieveSeqname('NC_001802.fna') 115 Loading GFF/NC_001802.fna... Done. 116 >>> record1.id 117 'gi|9629357|ref|NC_001802.1|' 118 >>> record2 = RetrieveSeqname('NC_001802.fna') 119 >>> record1.seq is record2.seq # should be same space in memory 120 1 121 """ 122 __records = {} 123 _diagnostics = 0 124
125 - def __init__(self, seqname):
126 try: 127 GenericTools.Surrogate.__init__(self, self.__records[seqname]) 128 except KeyError: 129 filename = os.path.join(self._dir, seqname) 130 if self._diagnostics: 131 sys.stderr.write("Loading %s..." % filename) 132 record = easy.fasta_single(filename) 133 self.__records[seqname] = record 134 GenericTools.Surrogate.__init__(self, self.__records[seqname]) 135 if self._diagnostics: 136 print >>sys.stderr, " Done."
137
138 -class Feature(object):
139 """ 140 strand may be: 141 +/0 = Watson 142 -/1 = Crick 143 144 I propose that we start calling these the Rosalind and Franklin strands 145 146 >>> RetrieveSeqname._dir = 'GFF' 147 >>> feature = Feature("NC_001802x.fna", 73, 78) # not having the x will interfere with the RetrieveSequence test 148 >>> feature.seq() 149 Seq('AATAAA', Alphabet()) 150 >>> print feature.location() 151 NC_001802x.fna:73..78 152 >>> from Bio import SeqIO 153 >>> record = feature.record() 154 >>> records = [record] 155 >>> SeqIO.write(records, sys.stdout, 'fasta') 156 > NC_001802x.fna:73..78 157 AATAAA 158 >>> feature2 = Feature(location=easy.LocationFromString("NC_001802x.fna:73..78")) 159 >>> writer.write(feature2.record()) 160 > NC_001802x.fna:73..78 161 AATAAA 162 >>> location3 = easy.LocationFromString("NC_001802x.fna:complement(73..78)") 163 >>> feature3 = Feature(location=location3) 164 >>> writer.write(feature3.record()) 165 > NC_001802x.fna:complement(73..78) 166 TTTATT 167 >>> location4 = easy.LocationFromString("NC_001802x.fna:336..1631") 168 >>> feature4 = Feature(location=location4, frame=0) 169 >>> feature4.frame 170 0 171 >>> feature4.translate()[:7] 172 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*')) 173 >>> feature4.frame = 6 # can't happen, but a useful demonstration 174 >>> feature4.translate()[:5] 175 Seq('ARASV', HasStopCodon(IUPACProtein(), '*')) 176 >>> feature4.frame = 1 177 >>> feature4.translate()[:5] 178 Seq('WVRER', HasStopCodon(IUPACProtein(), '*')) 179 >>> location5 = easy.LocationFromString("NC_001802lc.fna:336..1631") # lowercase data 180 >>> feature5 = Feature(location=location5, frame=0) 181 >>> feature5.translate()[:7] 182 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*')) 183 >>> location6 = easy.LocationFromString("NC_001802lc.fna:335..351") 184 >>> feature6 = Feature(location=location6, frame=1) 185 >>> feature6.translate() 186 Seq('MGARA', HasStopCodon(IUPACProtein(), '*')) 187 """
188 - def __init__(self, 189 seqname=None, 190 start=None, 191 end=None, 192 strand="+", 193 frame=None, 194 location=None, 195 alphabet=DEFAULT_ALPHABET):
196 if not isinstance(seqname, (types.StringType, types.NoneType)): 197 raise TypeError("seqname needs to be string") 198 self.frame = frame 199 self.alphabet = alphabet 200 if location is None: 201 self.seqname = seqname 202 self.start = start 203 self.end = end 204 self.strand = strand 205 return 206 else: 207 self.seqname = location.seqname 208 self.start = location.start() + 1 209 self.end = location.end() + 1 210 self.strand = location.complement
211
212 - def __hash__(self):
213 return hash((self.seqname, 214 self.start, 215 self.end, 216 self.strand, 217 self.frame, 218 self.alphabet))
219
220 - def seq(self):
221 rec = RetrieveSeqname(self.seqname) 222 return easy.record_subseq(rec, self.location(), upper=1)
223
224 - def translate(self):
225 seq = self.seq() 226 try: 227 seq = Seq(seq.tostring()[self.frame:], self.alphabet) 228 except TypeError: 229 seq.alphabet = self.alphabet 230 try: 231 return seq.translate() #default table 232 except AssertionError: 233 # if the feature was pickled then we have problems 234 import cPickle 235 if cPickle.dumps(seq.alphabet) == cPickle.dumps(DEFAULT_ALPHABET): 236 seq.alphabet = DEFAULT_ALPHABET 237 return seq.translate() 238 else: 239 raise
240
241 - def location(self):
242 return easy.LocationFromCoords(self.start-1, self.end-1, self.strand, seqname=self.seqname)
243
244 - def target_location(self):
245 if self.target_start <= self.target_end: 246 return easy.LocationFromCoords(self.target_start-1, self.target_end-1, 0, seqname=self.gname) 247 else: 248 # switch start and end and make it complement: 249 return easy.LocationFromCoords(self.target_end-1, self.target_start-1, 1, seqname=self.gname)
250
251 - def id(self):
252 try: 253 return "%s:%s" % (self.gclass, self.gname) 254 except AttributeError: 255 return ""
256
257 - def record(self):
258 return SeqRecord(self.seq(), self.id(), "", self.location())
259
260 - def xrange(self):
261 return xrange(self.start, self.end)
262
263 - def __str__(self):
264 return "Feature(%s)" % self.location()
265
266 -class FeatureQueryRow(DocSQL.QueryRow, Feature):
267 """ 268 row of FeatureQuery results 269 works like a Feature 270 """
271 - def __init__(self, *args, **keywds):
272 DocSQL.QueryRow.__init__(self, *args, **keywds) 273 try: 274 self.frame = int(self.frame) 275 except ValueError: 276 self.frame = '' 277 except TypeError: 278 self.frame = None 279 self.alphabet = DEFAULT_ALPHABET
280
281 -class FeatureQuery(DocSQL.Query):
282 """ 283 SELECT fdata.fref AS seqname, 284 ftype.fsource AS source, 285 ftype.fmethod AS feature, 286 fdata.fstart AS start, 287 fdata.fstop AS end, 288 fdata.fscore AS score, 289 fdata.fstrand AS strand, 290 fdata.fphase AS frame, 291 fdata.ftarget_start AS target_start, 292 fdata.ftarget_stop AS target_end, 293 fdata.gid, 294 fgroup.gclass, 295 fgroup.gname 296 FROM fdata, ftype, fgroup 297 WHERE fdata.ftypeid = ftype.ftypeid 298 AND fdata.gid = fgroup.gid 299 """ 300 301 row_class = FeatureQueryRow
302 - def __init__(self, 303 seqname=None, 304 coords=None, 305 complement=0, 306 method=None, 307 source=None, 308 object=None, # "class:name" 309 gid=None, 310 *args, 311 **keywds):
312 313 DocSQL.Query.__init__(self, *args, **keywds) 314 315 if seqname is not None: 316 self.statement += ' AND fref="%s"\n' % seqname 317 if coords is not None: 318 self.statement += " AND (%s)\n" % binning.query(coords[0], coords[1]) 319 if coords[0] is not None: 320 self.statement += (" AND fstop >= %s\n" % coords[0]) 321 if coords[1] is not None: 322 self.statement += (" AND fstart <= %s\n" % coords[1]) 323 if method is not None: 324 self.statement += ' AND fmethod LIKE "%s"\n' % method # LIKE allows SQL "%" wildcards 325 if source is not None: 326 self.statement += ' AND fsource LIKE "%s"\n' % source 327 if object is not None: 328 self.statement += ' AND %s\n' % object2fgroup_sql(object) 329 if gid is not None: 330 self.statement += ' AND fgroup.gid = %s\n' % gid 331 332 if complement: 333 self.statement += " ORDER BY 0-fstart, 0-fstop" 334 else: 335 self.statement += " ORDER BY fstart, fstop"
336
337 - def aggregate(self):
338 return FeatureAggregate(self)
339
340 -def object2fgroup_sql(object):
341 return 'gclass LIKE "%s" AND gname LIKE "%s"' % object_split(object)
342
343 -class FeatureAggregate(list, Feature):
344 """ 345 >>> feature1_1 = Feature(location=easy.LocationFromString("NC_001802x.fna:336..1631"), frame=0) # gag-pol 346 >>> feature1_2 = Feature(location=easy.LocationFromString("NC_001802x.fna:1631..4642"), frame=0) # slippage 347 >>> aggregate = FeatureAggregate([feature1_1, feature1_2]) 348 >>> print aggregate.location() 349 join(NC_001802x.fna:336..1631,NC_001802x.fna:1631..4642) 350 >>> xlate_str = aggregate.translate().tostring() 351 >>> xlate_str[:5], xlate_str[-5:] 352 ('MGARA', 'RQDED') 353 354 >>> location1 = easy.LocationFromString("NC_001802x.fna:complement(1..6)") 355 >>> location2 = easy.LocationFromString("NC_001802x.fna:complement(7..12)") 356 >>> feature2_1 = Feature(location=location1, frame=0) 357 >>> feature2_2 = Feature(location=location2, frame=0) 358 >>> aggregate2 = FeatureAggregate([feature2_1, feature2_2]) 359 >>> print aggregate2.location() 360 complement(join(NC_001802x.fna:1..6,NC_001802x.fna:7..12)) 361 >>> print aggregate2.translate() 362 Seq('TRET', HasStopCodon(IUPACProtein(), '*')) 363 >>> location1.reverse() 364 >>> location2.reverse() 365 >>> aggregate3 = FeatureAggregate([Feature(location=x, frame=0) for x in [location1, location2]]) 366 >>> print aggregate3.location() 367 join(NC_001802x.fna:1..6,NC_001802x.fna:7..12) 368 >>> print aggregate3.translate() 369 Seq('GLSG', HasStopCodon(IUPACProtein(), '*')) 370 >>> aggregate3[0].frame = 3 371 >>> print aggregate3.translate() 372 Seq('LSG', HasStopCodon(IUPACProtein(), '*')) 373 374 >>> aggregate4 = FeatureAggregate() 375 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:1..5"), frame=0)) 376 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:6..12"), frame=2)) 377 >>> aggregate4.seq() 378 Seq('GGTCTCTCTGGT', Alphabet()) 379 >>> aggregate4.translate() 380 Seq('GLSG', HasStopCodon(IUPACProtein(), '*')) 381 """
382 - def __init__(self, feature_query=None):
383 if feature_query is None: 384 list.__init__(self, []) 385 else: 386 list.__init__(self, map(lambda x: x, feature_query))
387
388 - def location(self):
389 loc = easy.LocationJoin(map(lambda x: x.location(), self)) 390 loc.reorient() 391 return loc
392
393 - def map(self, func):
394 mapped = map(func, self) 395 if self.location().complement: 396 mapped.reverse() 397 return reduce(operator.add, mapped)
398
399 - def seq(self):
400 return self.map(lambda x: x.seq())
401
402 - def translate(self):
403 attributes = ['alphabet', 'frame'] 404 index = [0, -1][self.location().complement] 405 for attribute in attributes: 406 self.__dict__[attribute] = self[index].__dict__[attribute] 407 translation = Feature.translate(self) 408 try: 409 assert translation.tostring().index("*") == len(translation) - 1 410 return translation[:translation.tostring().index("*")] 411 except ValueError: 412 return translation
413
414 -def object_split(object):
415 """ 416 >>> object_split("Sequence:F02E9.2a") 417 ('Sequence', 'F02E9.2a') 418 """ 419 return tuple(object.split(":"))
420
421 -def _test(*args, **keywds):
422 import doctest, sys 423 doctest.testmod(sys.modules[__name__], *args, **keywds)
424 425 if __name__ == "__main__": 426 if __debug__: 427 _test() 428