1
2
3
4
5
6
7
8
9
10 """Connect with a BioSQL database and load Biopython like objects from it.
11
12 This provides interfaces for loading biological objects from a relational
13 database, and is compatible with the BioSQL standards.
14 """
15 import BioSeq
16 import Loader
17 import DBUtils
18
19 _POSTGRES_RULES_PRESENT = False
20
22 """Main interface for loading a existing BioSQL-style database.
23
24 This function is the easiest way to retrieve a connection to a
25 database, doing something like:
26
27 >>> from BioSeq import BioSeqDatabase
28 >>> server = BioSeqDatabase.open_database(user="root", db="minidb")
29
30 the various options are:
31 driver -> The name of the database driver to use for connecting. The
32 driver should implement the python DB API. By default, the MySQLdb
33 driver is used.
34 user -> the username to connect to the database with.
35 password, passwd -> the password to connect with
36 host -> the hostname of the database
37 database or db -> the name of the database
38 """
39 if driver == "psycopg":
40 raise ValueError("Using BioSQL with psycopg (version one) is no "
41 "longer supported. Use psycopg2 instead.")
42
43 module = __import__(driver)
44 connect = getattr(module, "connect")
45
46
47 kw = kwargs.copy()
48 if driver == "MySQLdb":
49 if "database" in kw:
50 kw["db"] = kw["database"]
51 del kw["database"]
52 if "password" in kw:
53 kw["passwd"] = kw["password"]
54 del kw["password"]
55 else:
56
57 if "db" in kw:
58 kw["database"] = kw["db"]
59 del kw["db"]
60 if "passwd" in kw:
61 kw["password"] = kw["passwd"]
62 del kw["passwd"]
63 if driver in ["psycopg2", "pgdb"] and not kw.get("database"):
64 kw["database"] = "template1"
65
66 if driver in ["sqlite3"]:
67 conn = connect(kw["database"])
68 else:
69 try:
70 conn = connect(**kw)
71 except module.InterfaceError:
72
73
74 if "database" in kw:
75 kw["dbname"] = kw["database"]
76 del kw["database"]
77 elif "db" in kw:
78 kw["dbname"] = kw["db"]
79 del kw["db"]
80 dsn = ' '.join(['='.join(i) for i in kw.items()])
81 conn = connect(dsn)
82
83 server = DBServer(conn, module)
84
85
86
87 if driver in ["psycopg2", "pgdb"]:
88 sql = "SELECT ev_class FROM pg_rewrite WHERE " + \
89 "rulename='rule_bioentry_i1' OR " + \
90 "rulename='rule_bioentry_i2';"
91 if server.adaptor.execute_and_fetchall(sql):
92 import warnings
93 warnings.warn("Your BioSQL PostgreSQL schema includes some "
94 "rules currently required for bioperl-db but "
95 "which may cause problems loading data using "
96 "Biopython (see BioSQL Bug 2839). If you do not "
97 "use BioPerl, please remove these rules. "
98 "Biopython should cope with the rules present, "
99 "but with a performance penalty when loading "
100 "new records.")
101 global _POSTGRES_RULES_PRESENT
102 _POSTGRES_RULES_PRESENT = True
103
104 return server
105
107 """Represents a BioSQL database continaing namespaces (sub-databases).
108
109 This acts like a Python dictionary, giving access to each namespace
110 (defined by a row in the biodatabase table) as a BioSeqDatabase object.
111 """
112 - def __init__(self, conn, module, module_name=None):
113 self.module = module
114 if module_name is None:
115 module_name = module.__name__
116 self.adaptor = Adaptor(conn, DBUtils.get_dbutils(module_name))
117 self.module_name = module_name
118
120 return self.__class__.__name__ + "(%r)" % self.adaptor.conn
121
124
126 """Number of namespaces (sub-databases) in this database."""
127 sql = "SELECT COUNT(name) FROM biodatabase;"
128 return int(self.adaptor.execute_and_fetch_col0(sql)[0])
129
131 """Check if a namespace (sub-database) in this database."""
132 sql = "SELECT COUNT(name) FROM biodatabase WHERE name=%s;"
133 return bool(self.adaptor.execute_and_fetch_col0(sql, (value,))[0])
134
136 """Iterate over namespaces (sub-databases) in the database."""
137
138 return iter(self.adaptor.list_biodatabase_names())
139
140 if hasattr(dict, "iteritems"):
141
143 """List of namespaces (sub-databases) in the database."""
144 return self.adaptor.list_biodatabase_names()
145
147 """List of BioSeqDatabase objects in the database."""
148 return [self[key] for key in self.keys()]
149
151 """List of (namespace, BioSeqDatabase) for entries in the database."""
152 return [(key, self[key]) for key in self.keys()]
153
155 """Iterate over namespaces (sub-databases) in the database."""
156 return iter(self)
157
159 """Iterate over BioSeqDatabase objects in the database."""
160 for key in self:
161 yield self[key]
162
164 """Iterate over (namespace, BioSeqDatabase) in the database."""
165 for key in self:
166 yield key, self[key]
167 else:
168
170 """Iterate over namespaces (sub-databases) in the database."""
171 return iter(self)
172
174 """Iterate over BioSeqDatabase objects in the database."""
175 for key in self:
176 yield self[key]
177
179 """Iterate over (namespace, BioSeqDatabase) in the database."""
180 for key in self:
181 yield key, self[key]
182
188
190 """Remove a namespace and all its entries (OBSOLETE).
191
192 Try to remove all references to items in a database.
193
194 server.remove_database(name)
195
196 In keeping with the dictionary interface, you can now do this:
197
198 del server[name]
199 """
200 import warnings
201 warnings.warn("This method is obsolete. In keeping with the dictionary interface, you can now use 'del server[name]' instead", PendingDeprecationWarning)
202 db_id = self.adaptor.fetch_dbid_by_dbname(db_name)
203 remover = Loader.DatabaseRemover(self.adaptor, db_id)
204 remover.remove()
205
206 - def new_database(self, db_name, authority=None, description=None):
207 """Add a new database to the server and return it.
208 """
209
210 sql = r"INSERT INTO biodatabase (name, authority, description)" \
211 r" VALUES (%s, %s, %s)"
212 self.adaptor.execute(sql, (db_name,authority, description))
213 return BioSeqDatabase(self.adaptor, db_name)
214
216 """Load a database schema into the given database.
217
218 This is used to create tables, etc when a database is first created.
219 sql_file should specify the complete path to a file containing
220 SQL entries for building the tables.
221 """
222
223
224
225
226
227 sql_handle = open(sql_file, "rU")
228 sql = r""
229 for line in sql_handle:
230 if line.find("--") == 0:
231 pass
232 elif line.find("#") == 0:
233 pass
234 elif line.strip():
235 sql += line.strip()
236 sql += ' '
237
238
239
240
241
242 if self.module_name in ["psycopg2", "pgdb"]:
243 self.adaptor.cursor.execute(sql)
244
245
246 elif self.module_name in ["MySQLdb", "sqlite3"]:
247 sql_parts = sql.split(";")
248 for sql_line in sql_parts[:-1]:
249 self.adaptor.cursor.execute(sql_line)
250 else:
251 raise ValueError("Module %s not supported by the loader." %
252 (self.module_name))
253
255 """Commits the current transaction to the database."""
256 return self.adaptor.commit()
257
259 """Rolls backs the current transaction."""
260 return self.adaptor.rollback()
261
263 """Close the connection. No further activity possible."""
264 return self.adaptor.close()
265
268 self.conn = conn
269 self.cursor = conn.cursor()
270 self.dbutils = dbutils
271
274
276 """Set the autocommit mode. True values enable; False value disable."""
277 return self.dbutils.autocommit(self.conn, y)
278
280 """Commits the current transaction."""
281 return self.conn.commit()
282
284 """Rolls backs the current transaction."""
285 return self.conn.rollback()
286
288 """Close the connection. No further activity possible."""
289 return self.conn.close()
290
292 self.execute(
293 r"select biodatabase_id from biodatabase where name = %s",
294 (dbname,))
295 rv = self.cursor.fetchall()
296 if not rv:
297 raise KeyError("Cannot find biodatabase with name %r" % dbname)
298
299
300 return rv[0][0]
301
303 sql = r"select bioentry_id from bioentry where name = %s"
304 fields = [name]
305 if dbid:
306 sql += " and biodatabase_id = %s"
307 fields.append(dbid)
308 self.execute(sql, fields)
309 rv = self.cursor.fetchall()
310 if not rv:
311 raise IndexError("Cannot find display id %r" % name)
312 if len(rv) > 1:
313 raise IndexError("More than one entry with display id %r" % name)
314 return rv[0][0]
315
317 sql = r"select bioentry_id from bioentry where accession = %s"
318 fields = [name]
319 if dbid:
320 sql += " and biodatabase_id = %s"
321 fields.append(dbid)
322 self.execute(sql, fields)
323 rv = self.cursor.fetchall()
324 if not rv:
325 raise IndexError("Cannot find accession %r" % name)
326 if len(rv) > 1:
327 raise IndexError("More than one entry with accession %r" % name)
328 return rv[0][0]
329
331 sql = r"select bioentry_id from bioentry where accession = %s"
332 fields = [name]
333 if dbid:
334 sql += " and biodatabase_id = %s"
335 fields.append(dbid)
336 return self.execute_and_fetch_col0(sql, fields)
337
339 acc_version = name.split(".")
340 if len(acc_version) > 2:
341 raise IndexError("Bad version %r" % name)
342 acc = acc_version[0]
343 if len(acc_version) == 2:
344 version = acc_version[1]
345 else:
346 version = "0"
347 sql = r"SELECT bioentry_id FROM bioentry WHERE accession = %s" \
348 r" AND version = %s"
349 fields = [acc, version]
350 if dbid:
351 sql += " and biodatabase_id = %s"
352 fields.append(dbid)
353 self.execute(sql, fields)
354 rv = self.cursor.fetchall()
355 if not rv:
356 raise IndexError("Cannot find version %r" % name)
357 if len(rv) > 1:
358 raise IndexError("More than one entry with version %r" % name)
359 return rv[0][0]
360
362
363 sql = "SELECT bioentry_id FROM bioentry WHERE identifier = %s"
364 fields = [identifier]
365 if dbid:
366 sql += " and biodatabase_id = %s"
367 fields.append(dbid)
368 self.execute(sql, fields)
369 rv = self.cursor.fetchall()
370 if not rv:
371 raise IndexError("Cannot find display id %r" % identifier)
372 return rv[0][0]
373
377
378 - def list_bioentry_ids(self, dbid):
379 return self.execute_and_fetch_col0(
380 "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s",
381 (dbid,))
382
384 return self.execute_and_fetch_col0(
385 "SELECT name FROM bioentry WHERE biodatabase_id = %s",
386 (dbid,))
387
389 """Return ids given a SQL statement to select for them.
390
391 This assumes that the given SQL does a SELECT statement that
392 returns a list of items. This parses them out of the 2D list
393 they come as and just returns them in a list.
394 """
395 return self.execute_and_fetch_col0(sql, args)
396
398 self.execute(sql, args or ())
399 rv = self.cursor.fetchall()
400 assert len(rv) == 1, "Expected 1 response, got %d" % len(rv)
401 return rv[0]
402
403 - def execute(self, sql, args=None):
404 """Just execute an sql command.
405 """
406 self.dbutils.execute(self.cursor, sql, args)
407
409 length = end - start
410
411
412
413
414
415
416
417
418
419 return str(self.execute_one(
420 """select SUBSTR(seq, %s, %s)
421 from biosequence where bioentry_id = %s""",
422 (start+1, length, seqid))[0])
423
425 self.execute(sql, args or ())
426 return [field[0] for field in self.cursor.fetchall()]
427
429 self.execute(sql, args or ())
430 return self.cursor.fetchall()
431
432 _allowed_lookups = {
433
434 'primary_id': "fetch_seqid_by_identifier",
435 'gi': "fetch_seqid_by_identifier",
436 'display_id': "fetch_seqid_by_display_id",
437 'name': "fetch_seqid_by_display_id",
438 'accession': "fetch_seqid_by_accession",
439 'version': "fetch_seqid_by_version",
440 }
441
443 """Represents a namespace (sub-database) within the BioSQL database.
444
445 i.e. One row in the biodatabase table, and all all rows in the bioentry
446 table associated with it.
447 """
452
454 return "BioSeqDatabase(%r, %r)" % (self.adaptor, self.name)
455
457 """Gets a DBSeqRecord object by its name
458
459 Example: seq_rec = db.get_Seq_by_id('ROA1_HUMAN')
460
461 The name of this method is misleading since it returns a DBSeqRecord
462 rather than a DBSeq ojbect, and presumably was to mirror BioPerl.
463 """
464 seqid = self.adaptor.fetch_seqid_by_display_id(self.dbid, name)
465 return BioSeq.DBSeqRecord(self.adaptor, seqid)
466
468 """Gets a DBSeqRecord object by accession number
469
470 Example: seq_rec = db.get_Seq_by_acc('X77802')
471
472 The name of this method is misleading since it returns a DBSeqRecord
473 rather than a DBSeq ojbect, and presumably was to mirror BioPerl.
474 """
475 seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name)
476 return BioSeq.DBSeqRecord(self.adaptor, seqid)
477
479 """Gets a DBSeqRecord object by version number
480
481 Example: seq_rec = db.get_Seq_by_ver('X77802.1')
482
483 The name of this method is misleading since it returns a DBSeqRecord
484 rather than a DBSeq ojbect, and presumably was to mirror BioPerl.
485 """
486 seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name)
487 return BioSeq.DBSeqRecord(self.adaptor, seqid)
488
490 """Gets a list of DBSeqRecord objects by accession number
491
492 Example: seq_recs = db.get_Seq_by_acc('X77802')
493
494 The name of this method is misleading since it returns a list of
495 DBSeqRecord objects rather than a list of DBSeq ojbects, and presumably
496 was to mirror BioPerl.
497 """
498 seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name)
499 return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids]
500
502 """All the primary_ids of the sequences in the database (OBSOLETE).
503
504 These maybe ids (display style) or accession numbers or
505 something else completely different - they *are not*
506 meaningful outside of this database implementation.
507
508 Please use .keys() instead of .get_all_primary_ids()
509 """
510 import warnings
511 warnings.warn("Use bio_seq_database.keys() instead of "
512 "bio_seq_database.get_all_primary_ids()",
513 PendingDeprecationWarning)
514 return self.keys()
515
518
520 """Remove an entry and all its annotation."""
521 if key not in self:
522 raise KeyError(key)
523
524 sql = "DELETE FROM bioentry " + \
525 "WHERE biodatabase_id=%s AND bioentry_id=%s;"
526 self.adaptor.execute(sql, (self.dbid,key))
527
529 """Number of records in this namespace (sub database)."""
530 sql = "SELECT COUNT(bioentry_id) FROM bioentry " + \
531 "WHERE biodatabase_id=%s;"
532 return int(self.adaptor.execute_and_fetch_col0(sql, (self.dbid,))[0])
533
535 """Check if a primary (internal) id is this namespace (sub database)."""
536 sql = "SELECT COUNT(bioentry_id) FROM bioentry " + \
537 "WHERE biodatabase_id=%s AND bioentry_id=%s;"
538
539
540 try:
541 bioentry_id = int(value)
542 except ValueError:
543 return False
544 return bool(self.adaptor.execute_and_fetch_col0(sql,
545 (self.dbid, bioentry_id))[0])
546
548 """Iterate over ids (which may not be meaningful outside this database)."""
549
550 return iter(self.adaptor.list_bioentry_ids(self.dbid))
551
552 if hasattr(dict, "iteritems"):
553
555 """List of ids which may not be meaningful outside this database."""
556 return self.adaptor.list_bioentry_ids(self.dbid)
557
559 """List of DBSeqRecord objects in the namespace (sub database)."""
560 return [self[key] for key in self.keys()]
561
563 """List of (id, DBSeqRecord) for the namespace (sub database)."""
564 return [(key, self[key]) for key in self.keys()]
565
567 """Iterate over ids (which may not be meaningful outside this database)."""
568 return iter(self)
569
571 """Iterate over DBSeqRecord objects in the namespace (sub database)."""
572 for key in self:
573 yield self[key]
574
576 """Iterate over (id, DBSeqRecord) for the namespace (sub database)."""
577 for key in self:
578 yield key, self[key]
579 else:
580
582 """Iterate over ids (which may not be meaningful outside this database)."""
583 return iter(self)
584
586 """Iterate over DBSeqRecord objects in the namespace (sub database)."""
587 for key in self:
588 yield self[key]
589
591 """Iterate over (id, DBSeqRecord) for the namespace (sub database)."""
592 for key in self:
593 yield key, self[key]
594
596 if len(kwargs) != 1:
597 raise TypeError("single key/value parameter expected")
598 k, v = kwargs.items()[0]
599 if k not in _allowed_lookups:
600 raise TypeError("lookup() expects one of %s, not %r" % \
601 (repr(_allowed_lookups.keys())[1:-1], repr(k)))
602 lookup_name = _allowed_lookups[k]
603 lookup_func = getattr(self.adaptor, lookup_name)
604 seqid = lookup_func(self.dbid, v)
605 return BioSeq.DBSeqRecord(self.adaptor, seqid)
606
608 """Get a DBSeqRecord by the primary (internal) id (OBSOLETE).
609
610 Rather than db.get_Seq_by_primary_id(my_id) use db[my_id]
611
612 The name of this method is misleading since it returns a DBSeqRecord
613 rather than a DBSeq ojbect, and presumably was to mirror BioPerl.
614 """
615 import warnings
616 warnings.warn("Use bio_seq_database[my_id] instead of "
617 "bio_seq_database.get_Seq_by_primary_id(my_id)",
618 PendingDeprecationWarning)
619 return self[seqid]
620
621 - def load(self, record_iterator, fetch_NCBI_taxonomy=False):
622 """Load a set of SeqRecords into the BioSQL database.
623
624 record_iterator is either a list of SeqRecord objects, or an
625 Iterator object that returns SeqRecord objects (such as the
626 output from the Bio.SeqIO.parse() function), which will be
627 used to populate the database.
628
629 fetch_NCBI_taxonomy is boolean flag allowing or preventing
630 connection to the taxonomic database on the NCBI server
631 (via Bio.Entrez) to fetch a detailed taxonomy for each
632 SeqRecord.
633
634 Example:
635 from Bio import SeqIO
636 count = db.load(SeqIO.parse(open(filename), format))
637
638 Returns the number of records loaded.
639 """
640 db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid, \
641 fetch_NCBI_taxonomy)
642 num_records = 0
643 global _POSTGRES_RULES_PRESENT
644 for cur_record in record_iterator:
645 num_records += 1
646
647
648 if _POSTGRES_RULES_PRESENT:
649
650 if cur_record.id.count(".") == 1:
651 accession, version = cur_record.id.split('.')
652 try:
653 version = int(version)
654 except ValueError:
655 accession = cur_record.id
656 version = 0
657 else:
658 accession = cur_record.id
659 version = 0
660 gi = cur_record.annotations.get("gi", None)
661 sql = "SELECT bioentry_id FROM bioentry WHERE (identifier " + \
662 "= '%s' AND biodatabase_id = '%s') OR (accession = " + \
663 "'%s' AND version = '%s' AND biodatabase_id = '%s')"
664 self.adaptor.execute(sql % (gi, self.dbid, accession, version, self.dbid))
665 if self.adaptor.cursor.fetchone():
666 raise self.adaptor.conn.IntegrityError("Duplicate record "
667 "detected: record has not been inserted")
668
669 db_loader.load_seqrecord(cur_record)
670 return num_records
671