Package BioSQL :: Module Loader
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.Loader

   1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
   2  # Revisions 2007-2009 copyright by Peter Cock.  All rights reserved. 
   3  # Revisions 2008 copyright by Cymon J. Cox.  All rights reserved. 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license.  Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7  # 
   8  # Note that BioSQL (including the database schema and scripts) is 
   9  # available and licensed separately.  Please consult www.biosql.org 
  10   
  11  """Load biopython objects into a BioSQL database for persistent storage. 
  12   
  13  This code makes it possible to store biopython objects in a relational 
  14  database and then retrieve them back. You shouldn't use any of the 
  15  classes in this module directly. Rather, call the load() method on 
  16  a database object. 
  17  """ 
  18  # standard modules 
  19  from time import gmtime, strftime 
  20   
  21  # biopython 
  22  from Bio import Alphabet 
  23  from Bio.SeqUtils.CheckSum import crc64 
  24  from Bio import Entrez 
  25  from Bio.Seq import UnknownSeq 
  26   
  27  from Bio._py3k import _is_int_or_long 
  28   
29 -class DatabaseLoader:
30 """Object used to load SeqRecord objects into a BioSQL database."""
31 - def __init__(self, adaptor, dbid, fetch_NCBI_taxonomy=False):
32 """Initialize with connection information for the database. 33 34 Creating a DatabaseLoader object is normally handled via the 35 BioSeqDatabase DBServer object, for example: 36 37 from BioSQL import BioSeqDatabase 38 server = BioSeqDatabase.open_database(driver="MySQLdb", user="gbrowse", 39 passwd = "biosql", host = "localhost", db="test_biosql") 40 try: 41 db = server["test"] 42 except KeyError: 43 db = server.new_database("test", description="For testing GBrowse") 44 """ 45 self.adaptor = adaptor 46 self.dbid = dbid 47 self.fetch_NCBI_taxonomy = fetch_NCBI_taxonomy
48
49 - def load_seqrecord(self, record):
50 """Load a Biopython SeqRecord into the database. 51 """ 52 bioentry_id = self._load_bioentry_table(record) 53 self._load_bioentry_date(record, bioentry_id) 54 self._load_biosequence(record, bioentry_id) 55 self._load_comment(record, bioentry_id) 56 self._load_dbxrefs(record, bioentry_id) 57 references = record.annotations.get('references', ()) 58 for reference, rank in zip(references, range(len(references))): 59 self._load_reference(reference, rank, bioentry_id) 60 self._load_annotations(record, bioentry_id) 61 for seq_feature_num in range(len(record.features)): 62 seq_feature = record.features[seq_feature_num] 63 self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
64
65 - def _get_ontology_id(self, name, definition=None):
66 """Returns the identifier for the named ontology (PRIVATE). 67 68 This looks through the onotology table for a the given entry name. 69 If it is not found, a row is added for this ontology (using the 70 definition if supplied). In either case, the id corresponding to 71 the provided name is returned, so that you can reference it in 72 another table. 73 """ 74 oids = self.adaptor.execute_and_fetch_col0( 75 "SELECT ontology_id FROM ontology WHERE name = %s", 76 (name,)) 77 if oids: 78 return oids[0] 79 self.adaptor.execute( 80 "INSERT INTO ontology(name, definition) VALUES (%s, %s)", 81 (name, definition)) 82 return self.adaptor.last_id("ontology")
83 84
85 - def _get_term_id(self, 86 name, 87 ontology_id=None, 88 definition=None, 89 identifier=None):
90 """Get the id that corresponds to a term (PRIVATE). 91 92 This looks through the term table for a the given term. If it 93 is not found, a new id corresponding to this term is created. 94 In either case, the id corresponding to that term is returned, so 95 that you can reference it in another table. 96 97 The ontology_id should be used to disambiguate the term. 98 """ 99 100 # try to get the term id 101 sql = r"SELECT term_id FROM term " \ 102 r"WHERE name = %s" 103 fields = [name] 104 if ontology_id: 105 sql += ' AND ontology_id = %s' 106 fields.append(ontology_id) 107 id_results = self.adaptor.execute_and_fetchall(sql, fields) 108 # something is wrong 109 if len(id_results) > 1: 110 raise ValueError("Multiple term ids for %s: %r" % 111 (name, id_results)) 112 elif len(id_results) == 1: 113 return id_results[0][0] 114 else: 115 sql = r"INSERT INTO term (name, definition," \ 116 r" identifier, ontology_id)" \ 117 r" VALUES (%s, %s, %s, %s)" 118 self.adaptor.execute(sql, (name, definition, 119 identifier, ontology_id)) 120 return self.adaptor.last_id("term")
121
122 - def _add_dbxref(self, dbname, accession, version):
123 """Insert a dbxref and return its id.""" 124 125 self.adaptor.execute( 126 "INSERT INTO dbxref(dbname, accession, version)" \ 127 " VALUES (%s, %s, %s)", (dbname, accession, version)) 128 return self.adaptor.last_id("dbxref")
129
130 - def _get_taxon_id(self, record):
131 """Get the taxon id for this record (PRIVATE). 132 133 record - a SeqRecord object 134 135 This searches the taxon/taxon_name tables using the 136 NCBI taxon ID, scientific name and common name to find 137 the matching taxon table entry's id. 138 139 If the species isn't in the taxon table, and we have at 140 least the NCBI taxon ID, scientific name or common name, 141 at least a minimal stub entry is created in the table. 142 143 Returns the taxon id (database key for the taxon table, 144 not an NCBI taxon ID), or None if the taxonomy information 145 is missing. 146 147 See also the BioSQL script load_ncbi_taxonomy.pl which 148 will populate and update the taxon/taxon_name tables 149 with the latest information from the NCBI. 150 """ 151 152 # To find the NCBI taxid, first check for a top level annotation 153 ncbi_taxon_id = None 154 if "ncbi_taxid" in record.annotations: 155 #Could be a list of IDs. 156 if isinstance(record.annotations["ncbi_taxid"],list): 157 if len(record.annotations["ncbi_taxid"])==1: 158 ncbi_taxon_id = record.annotations["ncbi_taxid"][0] 159 else: 160 ncbi_taxon_id = record.annotations["ncbi_taxid"] 161 if not ncbi_taxon_id: 162 # Secondly, look for a source feature 163 for f in record.features: 164 if f.type == 'source': 165 quals = getattr(f, 'qualifiers', {}) 166 if "db_xref" in quals: 167 for db_xref in f.qualifiers["db_xref"]: 168 if db_xref.startswith("taxon:"): 169 ncbi_taxon_id = int(db_xref[6:]) 170 break 171 if ncbi_taxon_id: break 172 173 try: 174 scientific_name = record.annotations["organism"][:255] 175 except KeyError: 176 scientific_name = None 177 try: 178 common_name = record.annotations["source"][:255] 179 except KeyError: 180 common_name = None 181 # Note: The maximum length for taxon names in the schema is 255. 182 # Cropping it now should help in getting a match when searching, 183 # and avoids an error if we try and add these to the database. 184 185 186 if ncbi_taxon_id: 187 #Good, we have the NCBI taxon to go on - this is unambiguous :) 188 #Note that the scientific name and common name will only be 189 #used if we have to record a stub entry. 190 return self._get_taxon_id_from_ncbi_taxon_id(ncbi_taxon_id, 191 scientific_name, 192 common_name) 193 194 if not common_name and not scientific_name: 195 # Nothing to go on... and there is no point adding 196 # a new entry to the database. We'll just leave this 197 # sequence's taxon as a NULL in the database. 198 return None 199 200 # Next, we'll try to find a match based on the species name 201 # (stored in GenBank files as the organism and/or the source). 202 if scientific_name: 203 taxa = self.adaptor.execute_and_fetch_col0( 204 "SELECT taxon_id FROM taxon_name" \ 205 " WHERE name_class = 'scientific name' AND name = %s", 206 (scientific_name,)) 207 if taxa: 208 #Good, mapped the scientific name to a taxon table entry 209 return taxa[0] 210 211 # Last chance... 212 if common_name: 213 taxa = self.adaptor.execute_and_fetch_col0( 214 "SELECT DISTINCT taxon_id FROM taxon_name" \ 215 " WHERE name = %s", 216 (common_name,)) 217 #Its natural that several distinct taxa will have the same common 218 #name - in which case we can't resolve the taxon uniquely. 219 if len(taxa) > 1: 220 raise ValueError("Taxa: %d species have name %r" % ( 221 len(taxa), 222 common_name)) 223 if taxa: 224 #Good, mapped the common name to a taxon table entry 225 return taxa[0] 226 227 # At this point, as far as we can tell, this species isn't 228 # in the taxon table already. So we'll have to add it. 229 # We don't have an NCBI taxonomy ID, so if we do record just 230 # a stub entry, there is no simple way to fix this later. 231 # 232 # TODO - Should we try searching the NCBI taxonomy using the 233 # species name? 234 # 235 # OK, let's try inserting the species. 236 # Chances are we don't have enough information ... 237 # Furthermore, it won't be in the hierarchy. 238 239 lineage = [] 240 for c in record.annotations.get("taxonomy", []): 241 lineage.append([None, None, c]) 242 if lineage: 243 lineage[-1][1] = "genus" 244 lineage.append([None, "species", record.annotations["organism"]]) 245 # XXX do we have them? 246 if "subspecies" in record.annotations: 247 lineage.append([None, "subspecies", 248 record.annotations["subspecies"]]) 249 if "variant" in record.annotations: 250 lineage.append([None, "varietas", 251 record.annotations["variant"]]) 252 lineage[-1][0] = ncbi_taxon_id 253 254 left_value = self.adaptor.execute_one( 255 "SELECT MAX(left_value) FROM taxon")[0] 256 if not left_value: 257 left_value = 0 258 left_value += 1 259 260 # XXX -- Brad: Fixing this for now in an ugly way because 261 # I am getting overlaps for right_values. I need to dig into this 262 # more to actually understand how it works. I'm not sure it is 263 # actually working right anyhow. 264 right_start_value = self.adaptor.execute_one( 265 "SELECT MAX(right_value) FROM taxon")[0] 266 if not right_start_value: 267 right_start_value = 0 268 right_value = right_start_value + 2 * len(lineage) - 1 269 270 parent_taxon_id = None 271 for taxon in lineage: 272 self.adaptor.execute( 273 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\ 274 " left_value, right_value)" \ 275 " VALUES (%s, %s, %s, %s, %s)", (parent_taxon_id, 276 taxon[0], 277 taxon[1], 278 left_value, 279 right_value)) 280 taxon_id = self.adaptor.last_id("taxon") 281 self.adaptor.execute( 282 "INSERT INTO taxon_name(taxon_id, name, name_class)" \ 283 "VALUES (%s, %s, 'scientific name')", (taxon_id, taxon[2][:255])) 284 #Note the name field is limited to 255, some SwissProt files 285 #have a multi-species name which can be longer. So truncate this. 286 left_value += 1 287 right_value -= 1 288 parent_taxon_id = taxon_id 289 if common_name: 290 self.adaptor.execute( 291 "INSERT INTO taxon_name(taxon_id, name, name_class)" \ 292 "VALUES (%s, %s, 'common name')", ( 293 taxon_id, common_name)) 294 295 return taxon_id
296
297 - def _fix_name_class(self, entrez_name):
298 """Map Entrez name terms to those used in taxdump (PRIVATE). 299 300 We need to make this conversion to match the taxon_name.name_class 301 values used by the BioSQL load_ncbi_taxonomy.pl script. 302 303 e.g. 304 "ScientificName" -> "scientific name", 305 "EquivalentName" -> "equivalent name", 306 "Synonym" -> "synonym", 307 """ 308 #Add any special cases here: 309 # 310 #known = {} 311 #try: 312 # return known[entrez_name] 313 #except KeyError: 314 # pass 315 316 #Try automatically by adding spaces before each capital 317 def add_space(letter): 318 if letter.isupper(): 319 return " "+letter.lower() 320 else: 321 return letter
322 answer = "".join([add_space(letter) for letter in entrez_name]).strip() 323 assert answer == answer.lower() 324 return answer
325
326 - def _get_taxon_id_from_ncbi_taxon_id(self, ncbi_taxon_id, 327 scientific_name = None, 328 common_name = None):
329 """Get the taxon id for this record from the NCBI taxon ID (PRIVATE). 330 331 ncbi_taxon_id - string containing an NCBI taxon id 332 scientific_name - string, used if a stub entry is recorded 333 common_name - string, used if a stub entry is recorded 334 335 This searches the taxon table using ONLY the NCBI taxon ID 336 to find the matching taxon table entry's ID (database key). 337 338 If the species isn't in the taxon table, and the fetch_NCBI_taxonomy 339 flag is true, Biopython will attempt to go online using Bio.Entrez 340 to fetch the official NCBI lineage, recursing up the tree until an 341 existing entry is found in the database or the full lineage has been 342 fetched. 343 344 Otherwise the NCBI taxon ID, scientific name and common name are 345 recorded as a minimal stub entry in the taxon and taxon_name tables. 346 Any partial information about the lineage from the SeqRecord is NOT 347 recorded. This should mean that (re)running the BioSQL script 348 load_ncbi_taxonomy.pl can fill in the taxonomy lineage. 349 350 Returns the taxon id (database key for the taxon table, not 351 an NCBI taxon ID). 352 """ 353 assert ncbi_taxon_id 354 355 taxon_id = self.adaptor.execute_and_fetch_col0( 356 "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s", 357 (ncbi_taxon_id,)) 358 if taxon_id: 359 #Good, we have mapped the NCBI taxid to a taxon table entry 360 return taxon_id[0] 361 362 # At this point, as far as we can tell, this species isn't 363 # in the taxon table already. So we'll have to add it. 364 365 parent_taxon_id = None 366 rank = "species" 367 genetic_code = None 368 mito_genetic_code = None 369 species_names = [] 370 if scientific_name: 371 species_names.append(("scientific name", scientific_name)) 372 if common_name: 373 species_names.append(("common name", common_name)) 374 375 if self.fetch_NCBI_taxonomy: 376 #Go online to get the parent taxon ID! 377 handle = Entrez.efetch(db="taxonomy",id=ncbi_taxon_id,retmode="XML") 378 taxonomic_record = Entrez.read(handle) 379 if len(taxonomic_record) == 1: 380 assert taxonomic_record[0]["TaxId"] == str(ncbi_taxon_id), \ 381 "%s versus %s" % (taxonomic_record[0]["TaxId"], 382 ncbi_taxon_id) 383 parent_taxon_id = self._get_taxon_id_from_ncbi_lineage( \ 384 taxonomic_record[0]["LineageEx"]) 385 rank = taxonomic_record[0]["Rank"] 386 genetic_code = taxonomic_record[0]["GeneticCode"]["GCId"] 387 mito_genetic_code = taxonomic_record[0]["MitoGeneticCode"]["MGCId"] 388 species_names = [("scientific name", 389 taxonomic_record[0]["ScientificName"])] 390 try: 391 for name_class, names in taxonomic_record[0]["OtherNames"].iteritems(): 392 name_class = self._fix_name_class(name_class) 393 if not isinstance(names, list): 394 #The Entrez parser seems to return single entry 395 #lists as just a string which is annoying. 396 names = [names] 397 for name in names: 398 #Want to ignore complex things like ClassCDE entries 399 if isinstance(name, basestring): 400 species_names.append((name_class, name)) 401 except KeyError: 402 #OtherNames isn't always present, 403 #e.g. NCBI taxon 41205, Bromheadia finlaysoniana 404 pass 405 else: 406 pass 407 # If we are not allowed to go online, we will record the bare minimum; 408 # as long as the NCBI taxon id is present, then (re)running 409 # load_ncbi_taxonomy.pl should fill in the taxonomomy lineage 410 # (and update the species names). 411 # 412 # I am NOT going to try and record the lineage, even if it 413 # is in the record annotation as a list of names, as we won't 414 # know the NCBI taxon IDs for these parent nodes. 415 416 self.adaptor.execute( 417 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\ 418 " genetic_code, mito_genetic_code, left_value, right_value)" \ 419 " VALUES (%s, %s, %s, %s, %s, %s, %s)", (parent_taxon_id, 420 ncbi_taxon_id, 421 rank, 422 genetic_code, 423 mito_genetic_code, 424 None, 425 None)) 426 taxon_id = self.adaptor.last_id("taxon") 427 428 #Record the scientific name, common name, etc 429 for name_class, name in species_names: 430 self.adaptor.execute( 431 "INSERT INTO taxon_name(taxon_id, name, name_class)" \ 432 " VALUES (%s, %s, %s)", (taxon_id, 433 name[:255], 434 name_class)) 435 return taxon_id
436
437 - def _get_taxon_id_from_ncbi_lineage(self, taxonomic_lineage):
438 """This is recursive! (PRIVATE). 439 440 taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez 441 442 First dictionary in list is the taxonomy root, highest would be the species. 443 Each dictionary includes: 444 - TaxID (string, NCBI taxon id) 445 - Rank (string, e.g. "species", "genus", ..., "phylum", ...) 446 - ScientificName (string) 447 (and that is all at the time of writing) 448 449 This method will record all the lineage given, returning the the taxon id 450 (database key, not NCBI taxon id) of the final entry (the species). 451 """ 452 ncbi_taxon_id = taxonomic_lineage[-1]["TaxId"] 453 454 #Is this in the database already? Check the taxon table... 455 taxon_id = self.adaptor.execute_and_fetch_col0( 456 "SELECT taxon_id FROM taxon" \ 457 " WHERE ncbi_taxon_id=%s" % ncbi_taxon_id) 458 if taxon_id: 459 # we could verify that the Scientific Name etc in the database 460 # is the same and update it or print a warning if not... 461 if isinstance(taxon_id, list): 462 assert len(taxon_id)==1 463 return taxon_id[0] 464 else: 465 return taxon_id 466 467 #We have to record this. 468 if len(taxonomic_lineage) > 1: 469 #Use recursion to find out the taxon id (database key) of the parent. 470 parent_taxon_id = self._get_taxon_id_from_ncbi_lineage(taxonomic_lineage[:-1]) 471 assert _is_int_or_long(parent_taxon_id), repr(parent_taxon_id) 472 else: 473 parent_taxon_id = None 474 475 # INSERT new taxon 476 rank = taxonomic_lineage[-1].get("Rank", None) 477 self.adaptor.execute( 478 "INSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank)"\ 479 " VALUES (%s, %s, %s)", (ncbi_taxon_id, parent_taxon_id, rank)) 480 taxon_id = self.adaptor.last_id("taxon") 481 assert isinstance(taxon_id, int) or isinstance(taxon_id, long), repr(taxon_id) 482 # ... and its name in taxon_name 483 scientific_name = taxonomic_lineage[-1].get("ScientificName", None) 484 if scientific_name: 485 self.adaptor.execute( 486 "INSERT INTO taxon_name(taxon_id, name, name_class)" \ 487 " VALUES (%s, %s, 'scientific name')", (taxon_id, 488 scientific_name[:255])) 489 return taxon_id
490 491
492 - def _load_bioentry_table(self, record):
493 """Fill the bioentry table with sequence information (PRIVATE). 494 495 record - SeqRecord object to add to the database. 496 """ 497 # get the pertinent info and insert it 498 499 if record.id.count(".") == 1: # try to get a version from the id 500 #This assumes the string is something like "XXXXXXXX.123" 501 accession, version = record.id.split('.') 502 try: 503 version = int(version) 504 except ValueError: 505 accession = record.id 506 version = 0 507 else: # otherwise just use a version of 0 508 accession = record.id 509 version = 0 510 511 if "accessions" in record.annotations \ 512 and isinstance(record.annotations["accessions"],list) \ 513 and record.annotations["accessions"]: 514 #Take the first accession (one if there is more than one) 515 accession = record.annotations["accessions"][0] 516 517 #Find the taxon id (this is not just the NCBI Taxon ID) 518 #NOTE - If the species isn't defined in the taxon table, 519 #a new minimal entry is created. 520 taxon_id = self._get_taxon_id(record) 521 522 if "gi" in record.annotations: 523 identifier = record.annotations["gi"] 524 else: 525 identifier = record.id 526 527 #Allow description and division to default to NULL as in BioPerl. 528 description = getattr(record, 'description', None) 529 division = record.annotations.get("data_file_division", None) 530 531 sql = """ 532 INSERT INTO bioentry ( 533 biodatabase_id, 534 taxon_id, 535 name, 536 accession, 537 identifier, 538 division, 539 description, 540 version) 541 VALUES ( 542 %s, 543 %s, 544 %s, 545 %s, 546 %s, 547 %s, 548 %s, 549 %s)""" 550 #print self.dbid, taxon_id, record.name, accession, identifier, \ 551 # division, description, version 552 self.adaptor.execute(sql, (self.dbid, 553 taxon_id, 554 record.name, 555 accession, 556 identifier, 557 division, 558 description, 559 version)) 560 # now retrieve the id for the bioentry 561 bioentry_id = self.adaptor.last_id('bioentry') 562 563 return bioentry_id
564
565 - def _load_bioentry_date(self, record, bioentry_id):
566 """Add the effective date of the entry into the database. 567 568 record - a SeqRecord object with an annotated date 569 bioentry_id - corresponding database identifier 570 """ 571 # dates are GenBank style, like: 572 # 14-SEP-2000 573 date = record.annotations.get("date", 574 strftime("%d-%b-%Y", gmtime()).upper()) 575 if isinstance(date, list) : date = date[0] 576 annotation_tags_id = self._get_ontology_id("Annotation Tags") 577 date_id = self._get_term_id("date_changed", annotation_tags_id) 578 sql = r"INSERT INTO bioentry_qualifier_value" \ 579 r" (bioentry_id, term_id, value, rank)" \ 580 r" VALUES (%s, %s, %s, 1)" 581 self.adaptor.execute(sql, (bioentry_id, date_id, date))
582
583 - def _load_biosequence(self, record, bioentry_id):
584 """Record a SeqRecord's sequence and alphabet in the database (PRIVATE). 585 586 record - a SeqRecord object with a seq property 587 bioentry_id - corresponding database identifier 588 """ 589 if record.seq is None: 590 #The biosequence table entry is optional, so if we haven't 591 #got a sequence, we don't need to write to the table. 592 return 593 594 # determine the string representation of the alphabet 595 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet): 596 alphabet = "dna" 597 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet): 598 alphabet = "rna" 599 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet): 600 alphabet = "protein" 601 else: 602 alphabet = "unknown" 603 604 if isinstance(record.seq, UnknownSeq): 605 seq_str = None 606 else: 607 seq_str = str(record.seq) 608 609 sql = r"INSERT INTO biosequence (bioentry_id, version, " \ 610 r"length, seq, alphabet) " \ 611 r"VALUES (%s, 0, %s, %s, %s)" 612 self.adaptor.execute(sql, (bioentry_id, 613 len(record.seq), 614 seq_str, 615 alphabet))
616
617 - def _load_comment(self, record, bioentry_id):
618 """Record a SeqRecord's annotated comment in the database (PRIVATE). 619 620 record - a SeqRecord object with an annotated comment 621 bioentry_id - corresponding database identifier 622 """ 623 comments = record.annotations.get('comment') 624 if not comments: 625 return 626 if not isinstance(comments, list): 627 #It should be a string then... 628 comments = [comments] 629 630 for index, comment in enumerate(comments): 631 comment = comment.replace('\n', ' ') 632 #TODO - Store each line as a separate entry? This would preserve 633 #the newlines, but we should check BioPerl etc to be consistent. 634 sql = "INSERT INTO comment (bioentry_id, comment_text, rank)" \ 635 " VALUES (%s, %s, %s)" 636 self.adaptor.execute(sql, (bioentry_id, comment, index+1))
637
638 - def _load_annotations(self, record, bioentry_id):
639 """Record a SeqRecord's misc annotations in the database (PRIVATE). 640 641 The annotation strings are recorded in the bioentry_qualifier_value 642 table, except for special cases like the reference, comment and 643 taxonomy which are handled with their own tables. 644 645 record - a SeqRecord object with an annotations dictionary 646 bioentry_id - corresponding database identifier 647 """ 648 mono_sql = "INSERT INTO bioentry_qualifier_value" \ 649 "(bioentry_id, term_id, value)" \ 650 " VALUES (%s, %s, %s)" 651 many_sql = "INSERT INTO bioentry_qualifier_value" \ 652 "(bioentry_id, term_id, value, rank)" \ 653 " VALUES (%s, %s, %s, %s)" 654 tag_ontology_id = self._get_ontology_id('Annotation Tags') 655 for key, value in record.annotations.iteritems(): 656 if key in ["references", "comment", "ncbi_taxid", "date"]: 657 #Handled separately 658 continue 659 term_id = self._get_term_id(key, ontology_id=tag_ontology_id) 660 if isinstance(value, list) or isinstance(value, tuple): 661 rank = 0 662 for entry in value: 663 if isinstance(entry, str) or isinstance(entry, int): 664 #Easy case 665 rank += 1 666 self.adaptor.execute(many_sql, \ 667 (bioentry_id, term_id, str(entry), rank)) 668 else: 669 pass 670 #print "Ignoring annotation '%s' sub-entry of type '%s'" \ 671 # % (key, str(type(entry))) 672 elif isinstance(value, str) or isinstance(value, int): 673 #Have a simple single entry, leave rank as the DB default 674 self.adaptor.execute(mono_sql, \ 675 (bioentry_id, term_id, str(value))) 676 else: 677 pass
678 #print "Ignoring annotation '%s' entry of type '%s'" \ 679 # % (key, type(value)) 680 681
682 - def _load_reference(self, reference, rank, bioentry_id):
683 """Record a SeqRecord's annotated references in the database (PRIVATE). 684 685 record - a SeqRecord object with annotated references 686 bioentry_id - corresponding database identifier 687 """ 688 689 refs = None 690 if reference.medline_id: 691 refs = self.adaptor.execute_and_fetch_col0( 692 "SELECT reference_id" \ 693 " FROM reference JOIN dbxref USING (dbxref_id)" \ 694 " WHERE dbname = 'MEDLINE' AND accession = %s", 695 (reference.medline_id,)) 696 if not refs and reference.pubmed_id: 697 refs = self.adaptor.execute_and_fetch_col0( 698 "SELECT reference_id" \ 699 " FROM reference JOIN dbxref USING (dbxref_id)" \ 700 " WHERE dbname = 'PUBMED' AND accession = %s", 701 (reference.pubmed_id,)) 702 if not refs: 703 s = [] 704 for f in reference.authors, reference.title, reference.journal: 705 s.append(f or "<undef>") 706 crc = crc64("".join(s)) 707 refs = self.adaptor.execute_and_fetch_col0( 708 "SELECT reference_id FROM reference" \ 709 r" WHERE crc = %s", (crc,)) 710 if not refs: 711 if reference.medline_id: 712 dbxref_id = self._add_dbxref("MEDLINE", 713 reference.medline_id, 0) 714 elif reference.pubmed_id: 715 dbxref_id = self._add_dbxref("PUBMED", 716 reference.pubmed_id, 0) 717 else: 718 dbxref_id = None 719 authors = reference.authors or None 720 title = reference.title or None 721 #The location/journal field cannot be Null, so default 722 #to an empty string rather than None: 723 journal = reference.journal or "" 724 self.adaptor.execute( 725 "INSERT INTO reference (dbxref_id, location," \ 726 " title, authors, crc)" \ 727 " VALUES (%s, %s, %s, %s, %s)", 728 (dbxref_id, journal, title, 729 authors, crc)) 730 reference_id = self.adaptor.last_id("reference") 731 else: 732 reference_id = refs[0] 733 734 if reference.location: 735 start = 1 + int(str(reference.location[0].start)) 736 end = int(str(reference.location[0].end)) 737 else: 738 start = None 739 end = None 740 741 sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \ 742 " start_pos, end_pos, rank)" \ 743 " VALUES (%s, %s, %s, %s, %s)" 744 self.adaptor.execute(sql, (bioentry_id, reference_id, 745 start, end, rank + 1))
746
747 - def _load_seqfeature(self, feature, feature_rank, bioentry_id):
748 """Load a biopython SeqFeature into the database (PRIVATE). 749 """ 750 seqfeature_id = self._load_seqfeature_basic(feature.type, feature_rank, 751 bioentry_id) 752 self._load_seqfeature_locations(feature, seqfeature_id) 753 self._load_seqfeature_qualifiers(feature.qualifiers, seqfeature_id)
754
755 - def _load_seqfeature_basic(self, feature_type, feature_rank, bioentry_id):
756 """Load the first tables of a seqfeature and returns the id (PRIVATE). 757 758 This loads the "key" of the seqfeature (ie. CDS, gene) and 759 the basic seqfeature table itself. 760 """ 761 ontology_id = self._get_ontology_id('SeqFeature Keys') 762 seqfeature_key_id = self._get_term_id(feature_type, 763 ontology_id = ontology_id) 764 # XXX source is always EMBL/GenBank/SwissProt here; it should depend on 765 # the record (how?) 766 source_cat_id = self._get_ontology_id('SeqFeature Sources') 767 source_term_id = self._get_term_id('EMBL/GenBank/SwissProt', 768 ontology_id = source_cat_id) 769 770 sql = r"INSERT INTO seqfeature (bioentry_id, type_term_id, " \ 771 r"source_term_id, rank) VALUES (%s, %s, %s, %s)" 772 self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id, 773 source_term_id, feature_rank + 1)) 774 seqfeature_id = self.adaptor.last_id('seqfeature') 775 776 return seqfeature_id
777
778 - def _load_seqfeature_locations(self, feature, seqfeature_id):
779 """Load all of the locations for a SeqFeature into tables (PRIVATE). 780 781 This adds the locations related to the SeqFeature into the 782 seqfeature_location table. Fuzzies are not handled right now. 783 For a simple location, ie (1..2), we have a single table row 784 with seq_start = 1, seq_end = 2, location_rank = 1. 785 786 For split locations, ie (1..2, 3..4, 5..6) we would have three 787 row tables with: 788 start = 1, end = 2, rank = 1 789 start = 3, end = 4, rank = 2 790 start = 5, end = 6, rank = 3 791 """ 792 # TODO - Record an ontology for the locations (using location.term_id) 793 # which for now as in BioPerl we leave defaulting to NULL. 794 if feature.location_operator and feature.location_operator != "join": 795 # e.g. order locations... we don't record "order" so it 796 # will become a "join" on reloading. What does BioPerl do? 797 import warnings 798 warnings.warn("%s location operators are not fully supported" \ 799 % feature.location_operator) 800 801 # two cases, a simple location or a split location 802 if not feature.sub_features: # simple location 803 self._insert_seqfeature_location(feature, 1, seqfeature_id) 804 else: # split location 805 for rank, cur_feature in enumerate(feature.sub_features): 806 self._insert_seqfeature_location(cur_feature, 807 rank + 1, 808 seqfeature_id)
809
810 - def _insert_seqfeature_location(self, feature, rank, seqfeature_id):
811 """Add a location of a SeqFeature to the seqfeature_location table (PRIVATE). 812 813 TODO - Add location_operators to location_qualifier_value. 814 """ 815 # convert biopython locations to the 1-based location system 816 # used in bioSQL 817 # XXX This could also handle fuzzies 818 start = feature.location.nofuzzy_start + 1 819 end = feature.location.nofuzzy_end 820 821 # Biopython uses None when we don't know strand information but 822 # BioSQL requires something (non null) and sets this as zero 823 # So we'll use the strand or 0 if Biopython spits out None 824 strand = feature.strand or 0 825 826 # TODO - Record an ontology term for the location (location.term_id) 827 # which for now like BioPerl we'll leave as NULL. 828 # This might allow us to record "between" positions properly, but I 829 # doesn't really see how it could work for before/after fuzzy positions 830 loc_term_id = None 831 832 if feature.ref: 833 # sub_feature remote locations when they are in the same db as the current 834 # record do not have a value for ref_db, which the SeqFeature object 835 # stores as None. BioSQL schema requires a varchar and is not NULL 836 dbxref_id = self._get_dbxref_id(feature.ref_db or "", feature.ref) 837 else: 838 dbxref_id = None 839 840 sql = r"INSERT INTO location (seqfeature_id, dbxref_id, term_id," \ 841 r"start_pos, end_pos, strand, rank) " \ 842 r"VALUES (%s, %s, %s, %s, %s, %s, %s)" 843 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, loc_term_id, 844 start, end, strand, rank)) 845 846 """ 847 # See Bug 2677 848 # TODO - Record the location_operator (e.g. "join" or "order") 849 # using the location_qualifier_value table (which we and BioPerl 850 # have historically left empty). 851 # Note this will need an ontology term for the location qualifer 852 # (location_qualifier_value.term_id) for which oddly the schema 853 # does not allow NULL. 854 if feature.location_operator: 855 #e.g. "join" (common), 856 #or "order" (see Tests/GenBank/protein_refseq2.gb) 857 location_id = self.adaptor.last_id('location') 858 loc_qual_term_id = None # Not allowed in BioSQL v1.0.1 859 sql = r"INSERT INTO location_qualifier_value" \ 860 r"(location_id, term_id, value)" \ 861 r"VALUES (%s, %s, %s)" 862 self.adaptor.execute(sql, (location_id, loc_qual_term_id, 863 feature.location_operator)) 864 """
865
866 - def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
867 """Insert the (key, value) pair qualifiers relating to a feature (PRIVATE). 868 869 Qualifiers should be a dictionary of the form: 870 {key : [value1, value2]} 871 """ 872 tag_ontology_id = self._get_ontology_id('Annotation Tags') 873 for qualifier_key in qualifiers: 874 # Treat db_xref qualifiers differently to sequence annotation 875 # qualifiers by populating the seqfeature_dbxref and dbxref 876 # tables. Other qualifiers go into the seqfeature_qualifier_value 877 # and (if new) term tables. 878 if qualifier_key != 'db_xref': 879 qualifier_key_id = self._get_term_id(qualifier_key, 880 ontology_id=tag_ontology_id) 881 # now add all of the values to their table 882 entries = qualifiers[qualifier_key] 883 if not isinstance(entries, list): 884 # Could be a plain string, or an int or a float. 885 # However, we exect a list of strings here. 886 entries = [entries] 887 for qual_value_rank in range(len(entries)): 888 qualifier_value = entries[qual_value_rank] 889 sql = r"INSERT INTO seqfeature_qualifier_value "\ 890 r" (seqfeature_id, term_id, rank, value) VALUES"\ 891 r" (%s, %s, %s, %s)" 892 self.adaptor.execute(sql, (seqfeature_id, 893 qualifier_key_id, 894 qual_value_rank + 1, 895 qualifier_value)) 896 else: 897 # The dbxref_id qualifier/value sets go into the dbxref table 898 # as dbname, accession, version tuples, with dbxref.dbxref_id 899 # being automatically assigned, and into the seqfeature_dbxref 900 # table as seqfeature_id, dbxref_id, and rank tuples 901 self._load_seqfeature_dbxref(qualifiers[qualifier_key], 902 seqfeature_id)
903 904
905 - def _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id):
906 """Add database crossreferences of a SeqFeature to the database (PRIVATE). 907 908 o dbxrefs List, dbxref data from the source file in the 909 format <database>:<accession> 910 911 o seqfeature_id Int, the identifier for the seqfeature in the 912 seqfeature table 913 914 Insert dbxref qualifier data for a seqfeature into the 915 seqfeature_dbxref and, if required, dbxref tables. 916 The dbxref_id qualifier/value sets go into the dbxref table 917 as dbname, accession, version tuples, with dbxref.dbxref_id 918 being automatically assigned, and into the seqfeature_dbxref 919 table as seqfeature_id, dbxref_id, and rank tuples 920 """ 921 # NOTE - In older versions of Biopython, we would map the GenBank 922 # db_xref "name", for example "GI" to "GeneIndex", and give a warning 923 # for any unknown terms. This was a long term maintainance problem, 924 # and differed from BioPerl and BioJava's implementation. See bug 2405 925 for rank, value in enumerate(dbxrefs): 926 # Split the DB:accession format string at colons. We have to 927 # account for multiple-line and multiple-accession entries 928 try: 929 dbxref_data = value.replace(' ','').replace('\n','').split(':') 930 db = dbxref_data[0] 931 accessions = dbxref_data[1:] 932 except: 933 raise ValueError("Parsing of db_xref failed: '%s'" % value) 934 # Loop over all the grabbed accessions, and attempt to fill the 935 # table 936 for accession in accessions: 937 # Get the dbxref_id value for the dbxref data 938 dbxref_id = self._get_dbxref_id(db, accession) 939 # Insert the seqfeature_dbxref data 940 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank+1)
941
942 - def _get_dbxref_id(self, db, accession):
943 """ _get_dbxref_id(self, db, accession) -> Int 944 945 o db String, the name of the external database containing 946 the accession number 947 948 o accession String, the accession of the dbxref data 949 950 Finds and returns the dbxref_id for the passed data. The method 951 attempts to find an existing record first, and inserts the data 952 if there is no record. 953 """ 954 # Check for an existing record 955 sql = r'SELECT dbxref_id FROM dbxref WHERE dbname = %s ' \ 956 r'AND accession = %s' 957 dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession)) 958 # If there was a record, return the dbxref_id, else create the 959 # record and return the created dbxref_id 960 if dbxref_id: 961 return dbxref_id[0] 962 return self._add_dbxref(db, accession, 0)
963
964 - def _get_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
965 """ Check for a pre-existing seqfeature_dbxref entry with the passed 966 seqfeature_id and dbxref_id. If one does not exist, insert new 967 data 968 969 """ 970 # Check for an existing record 971 sql = r"SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref " \ 972 r"WHERE seqfeature_id = %s AND dbxref_id = %s" 973 result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id, 974 dbxref_id)) 975 # If there was a record, return without executing anything, else create 976 # the record and return 977 if result: 978 return result 979 return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
980
981 - def _add_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
982 """ Insert a seqfeature_dbxref row and return the seqfeature_id and 983 dbxref_id 984 """ 985 sql = r'INSERT INTO seqfeature_dbxref ' \ 986 '(seqfeature_id, dbxref_id, rank) VALUES' \ 987 r'(%s, %s, %s)' 988 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank)) 989 return (seqfeature_id, dbxref_id)
990
991 - def _load_dbxrefs(self, record, bioentry_id):
992 """Load any sequence level cross references into the database (PRIVATE). 993 994 See table bioentry_dbxref.""" 995 for rank, value in enumerate(record.dbxrefs): 996 # Split the DB:accession string at first colon. 997 # We have to cope with things like: 998 # "MGD:MGI:892" (db="MGD", accession="MGI:892") 999 # "GO:GO:123" (db="GO", accession="GO:123") 1000 # 1001 # Annoyingly I have seen the NCBI use both the style 1002 # "GO:GO:123" and "GO:123" in different vintages. 1003 assert value.count("\n")==0 1004 try: 1005 db, accession = value.split(':',1) 1006 db = db.strip() 1007 accession = accession.strip() 1008 except: 1009 raise ValueError("Parsing of dbxrefs list failed: '%s'" % value) 1010 # Get the dbxref_id value for the dbxref data 1011 dbxref_id = self._get_dbxref_id(db, accession) 1012 # Insert the bioentry_dbxref data 1013 self._get_bioentry_dbxref(bioentry_id, dbxref_id, rank+1)
1014
1015 - def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1016 """ Check for a pre-existing bioentry_dbxref entry with the passed 1017 seqfeature_id and dbxref_id. If one does not exist, insert new 1018 data 1019 1020 """ 1021 # Check for an existing record 1022 sql = r"SELECT bioentry_id, dbxref_id FROM bioentry_dbxref " \ 1023 r"WHERE bioentry_id = %s AND dbxref_id = %s" 1024 result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id, 1025 dbxref_id)) 1026 # If there was a record, return without executing anything, else create 1027 # the record and return 1028 if result: 1029 return result 1030 return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
1031
1032 - def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1033 """ Insert a bioentry_dbxref row and return the seqfeature_id and 1034 dbxref_id 1035 """ 1036 sql = r'INSERT INTO bioentry_dbxref ' \ 1037 '(bioentry_id,dbxref_id,rank) VALUES ' \ 1038 '(%s, %s, %s)' 1039 self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank)) 1040 return (bioentry_id, dbxref_id)
1041
1042 -class DatabaseRemover:
1043 """Complement the Loader functionality by fully removing a database. 1044 1045 This probably isn't really useful for normal purposes, since you 1046 can just do a: 1047 DROP DATABASE db_name 1048 and then recreate the database. But, it's really useful for testing 1049 purposes. 1050 1051 YB: now use the cascaded deletions 1052 """
1053 - def __init__(self, adaptor, dbid):
1054 """Initialize with a database id and adaptor connection. 1055 """ 1056 self.adaptor = adaptor 1057 self.dbid = dbid
1058
1059 - def remove(self):
1060 """Remove everything related to the given database id. 1061 """ 1062 sql = r"DELETE FROM bioentry WHERE biodatabase_id = %s" 1063 self.adaptor.execute(sql, (self.dbid,)) 1064 sql = r"DELETE FROM biodatabase WHERE biodatabase_id = %s" 1065 self.adaptor.execute(sql, (self.dbid,))
1066