Package Bio :: Package SCOP
[hide private]
[frames] | no frames]

Source Code for Package Bio.SCOP

  1  # Copyright 2001 by Gavin E. Crooks.  All rights reserved. 
  2  # Modifications Copyright 2004/2005 James Casbon. All rights Reserved. 
  3  # 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  # 
  8  # Changes made by James Casbon: 
  9  # - New Astral class 
 10  # - SQL functionality for both Scop and Astral classes 
 11  # - All sunids are int not strings 
 12  # 
 13  # Code written by Jeffrey Chang to access SCOP over the internet, which 
 14  # was previously in Bio.WWW.SCOP, has now been merged into this module. 
 15   
 16  """ SCOP: Structural Classification of Proteins. 
 17   
 18  The SCOP database aims to provide a manually constructed classification of 
 19  all know protein structures into a hierarchy, the main levels of which 
 20  are family, superfamily and fold. 
 21   
 22  * "SCOP":http://scop.mrc-lmb.cam.ac.uk/scop/ 
 23   
 24  * "Introduction":http://scop.mrc-lmb.cam.ac.uk/scop/intro.html 
 25   
 26  * "SCOP parsable files":http://scop.mrc-lmb.cam.ac.uk/scop/parse/ 
 27   
 28  The Scop object in this module represents the entire SCOP classification. It 
 29  can be built from the three SCOP parsable files, modified is so desired, and 
 30  converted back to the same file formats. A single SCOP domain (represented 
 31  by the Domain class) can be obtained from Scop using the domain's SCOP 
 32  identifier (sid). 
 33   
 34   
 35  nodeCodeDict  -- A mapping between known 2 letter node codes and a longer 
 36                    description. The known node types are 'cl' (class), 'cf' 
 37                    (fold), 'sf' (superfamily), 'fa' (family), 'dm' (domain),  
 38                    'sp' (species), 'px' (domain). Additional node types may 
 39                    be added in the future. 
 40   
 41  This module also provides code to access SCOP over the WWW. 
 42   
 43  Functions: 
 44  search        -- Access the main CGI script. 
 45  _open         -- Internally used function. 
 46   
 47  """ 
 48   
 49  from types import * 
 50  import os 
 51   
 52  import Des 
 53  import Cla 
 54  import Hie 
 55  from Residues import * 
 56  from Bio import SeqIO 
 57  from Bio.Seq import Seq 
 58   
 59   
 60  nodeCodeDict = { 'cl':'class', 'cf':'fold', 'sf':'superfamily', 
 61                   'fa':'family', 'dm':'protein', 'sp':'species', 'px':'domain'} 
 62   
 63   
 64  _nodetype_to_code= { 'class': 'cl', 'fold': 'cf', 'superfamily': 'sf', 
 65                       'family': 'fa', 'protein': 'dm', 'species': 'sp', 'domain': 'px'} 
 66   
 67  nodeCodeOrder = [ 'ro', 'cl', 'cf', 'sf', 'fa', 'dm', 'sp', 'px' ]  
 68   
 69  astralBibIds = [10,20,25,30,35,40,50,70,90,95,100] 
 70   
 71  astralEvs = [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 1e-4, 1e-5, 1e-10, 1e-15, 
 72               1e-20, 1e-25, 1e-50] 
 73   
 74  astralEv_to_file = { 10: 'e+1', 5: 'e+0,7', 1: 'e+0', 0.5: 'e-0,3', 0.1: 'e-1', 
 75                       0.05: 'e-1,3', 0.01: 'e-2', 0.005: 'e-2,3', 0.001: 'e-3', 
 76                       1e-4: 'e-4',  1e-5: 'e-5', 1e-10: 'e-10', 1e-15: 'e-15', 
 77                       1e-20: 'e-20', 1e-25: 'e-25', 1e-50: 'e-50' } 
 78   
 79  astralEv_to_sql = { 10: 'e1', 5: 'e0_7', 1: 'e0', 0.5: 'e_0_3', 0.1: 'e_1', 
 80                       0.05: 'e_1_3', 0.01: 'e_2', 0.005: 'e_2_3', 0.001: 'e_3', 
 81                       1e-4: 'e_4',  1e-5: 'e_5', 1e-10: 'e_10', 1e-15: 'e_15', 
 82                       1e-20: 'e_20', 1e-25: 'e_25', 1e-50: 'e_50' } 
 83   
 84  try: 
 85      #See if the cmp function exists (will on Python 2) 
 86      _cmp = cmp 
 87  except NameError: 
88 - def _cmp(a,b):
89 """Implementation of cmp(x,y) for Python 3 (PRIVATE). 90 91 Based on Python 3 docs which say if you really need the cmp() 92 functionality, you could use the expression (a > b) - (a < b) 93 as the equivalent for cmp(a, b) 94 """ 95 return (a > b) - (a < b)
96
97 -def cmp_sccs(sccs1, sccs2):
98 """Order SCOP concise classification strings (sccs). 99 100 a.4.5.1 < a.4.5.11 < b.1.1.1 101 102 A sccs (e.g. a.4.5.11) compactly represents a domain's classification. 103 The letter represents the class, and the numbers are the fold, 104 superfamily, and family, respectively. 105 106 """ 107 108 s1 = sccs1.split(".") 109 s2 = sccs2.split(".") 110 111 if s1[0] != s2[0]: return _cmp(s1[0], s2[0]) 112 113 s1 = list(map(int, s1[1:])) 114 s2 = list(map(int, s2[1:])) 115 116 return _cmp(s1,s2)
117 118 119 _domain_re = re.compile(r">?([\w_\.]*)\s+([\w\.]*)\s+\(([^)]*)\) (.*)") 120
121 -def parse_domain(str):
122 """Convert an ASTRAL header string into a Scop domain. 123 124 An ASTRAL (http://astral.stanford.edu/) header contains a concise 125 description of a SCOP domain. A very similar format is used when a 126 Domain object is converted into a string. The Domain returned by this 127 method contains most of the SCOP information, but it will not be located 128 within the SCOP hierarchy (i.e. The parent node will be None). The 129 description is composed of the SCOP protein and species descriptions. 130 131 A typical ASTRAL header looks like -- 132 >d1tpt_1 a.46.2.1 (1-70) Thymidine phosphorylase {Escherichia coli} 133 """ 134 135 m = _domain_re.match(str) 136 if (not m) : raise ValueError("Domain: "+ str) 137 138 dom = Domain() 139 dom.sid = m.group(1) 140 dom.sccs = m.group(2) 141 dom.residues = Residues(m.group(3)) 142 if not dom.residues.pdbid: 143 dom.residues.pdbid= dom.sid[1:5] 144 dom.description = m.group(4).strip() 145 146 return dom
147
148 -def _open_scop_file(scop_dir_path, version, filetype):
149 filename = "dir.%s.scop.txt_%s" % (filetype,version) 150 handle = open(os.path.join( scop_dir_path, filename)) 151 return handle
152 153
154 -class Scop:
155 """The entire SCOP hierarchy. 156 157 root -- The root node of the hierarchy 158 """
159 - def __init__(self, cla_handle=None, des_handle=None, hie_handle=None, 160 dir_path=None, db_handle=None, version=None):
161 """Build the SCOP hierarchy from the SCOP parsable files, or a sql backend. 162 163 If no file handles are given, then a Scop object with a single 164 empty root node is returned. 165 166 If a directory and version are given (with dir_path=.., version=...) or 167 file handles for each file, the whole scop tree will be built in memory. 168 169 If a MySQLdb database handle is given, the tree will be built as needed, 170 minimising construction times. To build the SQL database to the methods 171 write_xxx_sql to create the tables. 172 173 """ 174 self._sidDict = {} 175 self._sunidDict = {} 176 177 if cla_handle==des_handle==hie_handle==dir_path==db_handle==None: return 178 179 if dir_path is None and db_handle is None: 180 if cla_handle == None or des_handle==None or hie_handle==None: 181 raise RuntimeError("Need CLA, DES and HIE files to build SCOP") 182 183 sunidDict = {} 184 185 self.db_handle = db_handle 186 try: 187 188 if db_handle: 189 # do nothing if we have a db handle, we'll do it all on the fly 190 pass 191 192 else: 193 # open SCOP parseable files 194 if dir_path: 195 if not version: 196 raise RuntimeError("Need SCOP version to find parsable files in directory") 197 if cla_handle or des_handle or hie_handle: 198 raise RuntimeError("Cannot specify SCOP directory and specific files") 199 200 cla_handle = _open_scop_file( dir_path, version, 'cla') 201 des_handle = _open_scop_file( dir_path, version, 'des') 202 hie_handle = _open_scop_file( dir_path, version, 'hie') 203 204 root = Node() 205 domains = [] 206 root.sunid=0 207 root.type='ro' 208 sunidDict[root.sunid] = root 209 self.root = root 210 root.description = 'SCOP Root' 211 212 # Build the rest of the nodes using the DES file 213 records = Des.parse(des_handle) 214 for record in records: 215 if record.nodetype =='px': 216 n = Domain() 217 n.sid = record.name 218 domains.append(n) 219 else : 220 n = Node() 221 n.sunid = record.sunid 222 n.type = record.nodetype 223 n.sccs = record.sccs 224 n.description = record.description 225 226 sunidDict[n.sunid] = n 227 228 # Glue all of the Nodes together using the HIE file 229 records = Hie.parse(hie_handle) 230 for record in records: 231 if record.sunid not in sunidDict: 232 print record.sunid 233 234 n = sunidDict[record.sunid] 235 236 if record.parent != '' : # Not root node 237 238 if record.parent not in sunidDict: 239 raise ValueError("Incomplete data?") 240 241 n.parent = sunidDict[record.parent] 242 243 for c in record.children: 244 if c not in sunidDict: 245 raise ValueError("Incomplete data?") 246 n.children.append(sunidDict[c]) 247 248 249 # Fill in the gaps with information from the CLA file 250 sidDict = {} 251 records = Cla.parse(cla_handle) 252 for record in records: 253 n = sunidDict[record.sunid] 254 assert n.sccs == record.sccs 255 assert n.sid == record.sid 256 n.residues = record.residues 257 sidDict[n.sid] = n 258 259 # Clean up 260 self._sunidDict = sunidDict 261 self._sidDict = sidDict 262 self._domains = tuple(domains) 263 264 finally: 265 if dir_path: 266 # If we opened the files, we close the files 267 if cla_handle : cla_handle.close() 268 if des_handle : des_handle.close() 269 if hie_handle : hie_handle.close()
270 271
272 - def getRoot(self):
273 return self.getNodeBySunid(0)
274 275
276 - def getDomainBySid(self, sid):
277 """Return a domain from its sid""" 278 if sid in self._sidDict: 279 return self._sidDict[sid] 280 if self.db_handle: 281 self.getDomainFromSQL(sid=sid) 282 if sid in self._sidDict: 283 return self._sidDict[sid] 284 else: 285 return None
286 287
288 - def getNodeBySunid(self, sunid):
289 """Return a node from its sunid""" 290 if sunid in self._sunidDict: 291 return self._sunidDict[sunid] 292 if self.db_handle: 293 self.getDomainFromSQL(sunid=sunid) 294 if sunid in self._sunidDict: 295 return self._sunidDict[sunid] 296 else: 297 return None
298 299
300 - def getDomains(self):
301 """Returns an ordered tuple of all SCOP Domains""" 302 if self.db_handle: 303 return self.getRoot().getDescendents('px') 304 else: 305 return self._domains
306 307 308
309 - def write_hie(self, handle):
310 """Build an HIE SCOP parsable file from this object""" 311 nodes = self._sunidDict.values() 312 # We order nodes to ease comparison with original file 313 nodes.sort(key = lambda n: n.sunid) 314 for n in nodes: 315 handle.write(str(n.toHieRecord()))
316 317
318 - def write_des(self, handle):
319 """Build a DES SCOP parsable file from this object""" 320 nodes = self._sunidDict.values() 321 # Origional SCOP file is not ordered? 322 nodes.sort(key = lambda n: n.sunid) 323 for n in nodes: 324 if n != self.root: 325 handle.write(str(n.toDesRecord()))
326 327
328 - def write_cla(self, handle):
329 """Build a CLA SCOP parsable file from this object""" 330 nodes = self._sidDict.values() 331 # We order nodes to ease comparison with original file 332 nodes.sort(key = lambda n: n.sunid) 333 for n in nodes: 334 handle.write(str(n.toClaRecord()))
335 336
337 - def getDomainFromSQL(self, sunid=None, sid=None):
338 """Load a node from the SQL backend using sunid or sid""" 339 if sunid==sid==None: return None 340 341 cur = self.db_handle.cursor() 342 343 if sid: 344 cur.execute("SELECT sunid FROM cla WHERE sid=%s", sid) 345 res = cur.fetchone() 346 if res is None: 347 return None 348 sunid = res[0] 349 350 cur.execute("SELECT * FROM des WHERE sunid=%s", sunid) 351 data = cur.fetchone() 352 353 if data is not None: 354 n = None 355 356 #determine if Node or Domain 357 if data[1] != "px": 358 n = Node(scop=self) 359 360 cur.execute("SELECT child FROM hie WHERE parent=%s", sunid) 361 children = [] 362 for c in cur.fetchall(): 363 children.append(c[0]) 364 n.children = children 365 366 367 else: 368 n = Domain(scop=self) 369 cur.execute("select sid, residues, pdbid from cla where sunid=%s", 370 sunid) 371 372 [n.sid,n.residues,pdbid] = cur.fetchone() 373 n.residues = Residues(n.residues) 374 n.residues.pdbid=pdbid 375 self._sidDict[n.sid] = n 376 377 [n.sunid,n.type,n.sccs,n.description] = data 378 379 if data[1] != 'ro': 380 cur.execute("SELECT parent FROM hie WHERE child=%s", sunid) 381 n.parent = cur.fetchone()[0] 382 383 n.sunid = int(n.sunid) 384 385 self._sunidDict[n.sunid] = n
386 387
388 - def getAscendentFromSQL(self, node, type):
389 """Get ascendents using SQL backend""" 390 if nodeCodeOrder.index(type) >= nodeCodeOrder.index(node.type): return None 391 392 cur = self.db_handle.cursor() 393 cur.execute("SELECT "+type+" from cla WHERE "+node.type+"=%s", (node.sunid)) 394 result = cur.fetchone() 395 if result is not None: 396 return self.getNodeBySunid(result[0]) 397 else: 398 return None
399 400
401 - def getDescendentsFromSQL(self, node, type):
402 """Get descendents of a node using the database backend. This avoids 403 repeated iteration of SQL calls and is therefore much quicker than 404 repeatedly calling node.getChildren(). 405 """ 406 if nodeCodeOrder.index(type) <= nodeCodeOrder.index(node.type): return [] 407 408 des_list = [] 409 410 411 # SQL cla table knows nothing about 'ro' 412 if node.type == 'ro': 413 for c in node.getChildren(): 414 for d in self.getDescendentsFromSQL(c,type): 415 des_list.append(d) 416 return des_list 417 418 cur = self.db_handle.cursor() 419 420 421 if type != 'px': 422 cur.execute("SELECT DISTINCT des.sunid,des.type,des.sccs,description FROM \ 423 cla,des WHERE cla."+node.type+"=%s AND cla."+type+"=des.sunid", (node.sunid)) 424 data = cur.fetchall() 425 for d in data: 426 if int(d[0]) not in self._sunidDict: 427 n = Node(scop=self) 428 [n.sunid,n.type,n.sccs,n.description] = d 429 n.sunid=int(n.sunid) 430 self._sunidDict[n.sunid] = n 431 432 cur.execute("SELECT parent FROM hie WHERE child=%s", n.sunid) 433 n.parent = cur.fetchone()[0] 434 435 cur.execute("SELECT child FROM hie WHERE parent=%s", n.sunid) 436 children = [] 437 for c in cur.fetchall(): 438 children.append(c[0]) 439 n.children = children 440 441 442 des_list.append( self._sunidDict[int(d[0])] ) 443 444 else: 445 cur.execute("SELECT cla.sunid,sid,pdbid,residues,cla.sccs,type,description,sp\ 446 FROM cla,des where cla.sunid=des.sunid and cla."+node.type+"=%s", 447 node.sunid) 448 449 data = cur.fetchall() 450 for d in data: 451 if int(d[0]) not in self._sunidDict: 452 n = Domain(scop=self) 453 #[n.sunid, n.sid, n.pdbid, n.residues, n.sccs, n.type, 454 #n.description,n.parent] = data 455 [n.sunid,n.sid, pdbid,n.residues,n.sccs,n.type,n.description, 456 n.parent] = d[0:8] 457 n.residues = Residues(n.residues) 458 n.residues.pdbid = pdbid 459 n.sunid = int(n.sunid) 460 self._sunidDict[n.sunid] = n 461 self._sidDict[n.sid] = n 462 463 464 des_list.append( self._sunidDict[int(d[0])] ) 465 466 return des_list
467 468 469 470
471 - def write_hie_sql(self, handle):
472 """Write HIE data to SQL database""" 473 cur = handle.cursor() 474 475 cur.execute("DROP TABLE IF EXISTS hie") 476 cur.execute("CREATE TABLE hie (parent INT, child INT, PRIMARY KEY (child),\ 477 INDEX (parent) )") 478 479 for p in self._sunidDict.itervalues(): 480 for c in p.children: 481 cur.execute("INSERT INTO hie VALUES (%s,%s)" % (p.sunid, c.sunid))
482 483
484 - def write_cla_sql(self, handle):
485 """Write CLA data to SQL database""" 486 cur = handle.cursor() 487 488 cur.execute("DROP TABLE IF EXISTS cla") 489 cur.execute("CREATE TABLE cla (sunid INT, sid CHAR(8), pdbid CHAR(4),\ 490 residues VARCHAR(50), sccs CHAR(10), cl INT, cf INT, sf INT, fa INT,\ 491 dm INT, sp INT, px INT, PRIMARY KEY (sunid), INDEX (SID) )") 492 493 for n in self._sidDict.itervalues(): 494 c = n.toClaRecord() 495 cur.execute( "INSERT INTO cla VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", 496 (n.sunid, n.sid, c.residues.pdbid, c.residues, n.sccs, 497 n.getAscendent('cl').sunid, n.getAscendent('cf').sunid, 498 n.getAscendent('sf').sunid, n.getAscendent('fa').sunid, 499 n.getAscendent('dm').sunid, n.getAscendent('sp').sunid, 500 n.sunid ))
501 502
503 - def write_des_sql(self, handle):
504 """Write DES data to SQL database""" 505 cur = handle.cursor() 506 507 cur.execute("DROP TABLE IF EXISTS des") 508 cur.execute("CREATE TABLE des (sunid INT, type CHAR(2), sccs CHAR(10),\ 509 description VARCHAR(255),\ 510 PRIMARY KEY (sunid) )") 511 512 for n in self._sunidDict.itervalues(): 513 cur.execute( "INSERT INTO des VALUES (%s,%s,%s,%s)", 514 ( n.sunid, n.type, n.sccs, n.description ) )
515 516 517
518 -class Node:
519 """ A node in the Scop hierarchy 520 521 sunid -- SCOP unique identifiers. e.g. '14986' 522 523 parent -- The parent node 524 525 children -- A list of child nodes 526 527 sccs -- SCOP concise classification string. e.g. 'a.1.1.2' 528 529 type -- A 2 letter node type code. e.g. 'px' for domains 530 531 description -- 532 533 """
534 - def __init__(self, scop=None):
535 """Create a Node in the scop hierarchy. If a Scop instance is provided to the 536 constructor, this will be used to lookup related references using the SQL 537 methods. If no instance is provided, it is assumed the whole tree exists 538 and is connected.""" 539 self.sunid='' 540 self.parent = None 541 self.children=[] 542 self.sccs = '' 543 self.type ='' 544 self.description ='' 545 self.scop=scop
546
547 - def __str__(self):
548 s = [] 549 s.append(str(self.sunid)) 550 s.append(self.sccs) 551 s.append(self.type) 552 s.append(self.description) 553 554 return " ".join(s)
555
556 - def toHieRecord(self):
557 """Return an Hie.Record""" 558 rec = Hie.Record() 559 rec.sunid = str(self.sunid) 560 if self.getParent() : #Not root node 561 rec.parent = str(self.getParent().sunid) 562 else: 563 rec.parent = '-' 564 for c in self.getChildren(): 565 rec.children.append(str(c.sunid)) 566 return rec
567
568 - def toDesRecord(self):
569 """Return a Des.Record""" 570 rec = Des.Record() 571 rec.sunid = str(self.sunid) 572 rec.nodetype = self.type 573 rec.sccs = self.sccs 574 rec.description = self.description 575 return rec
576
577 - def getChildren(self):
578 """Return a list of children of this Node""" 579 if self.scop is None: 580 return self.children 581 else: 582 return map ( self.scop.getNodeBySunid, self.children )
583
584 - def getParent(self):
585 """Return the parent of this Node""" 586 if self.scop is None: 587 return self.parent 588 else: 589 return self.scop.getNodeBySunid( self.parent )
590
591 - def getDescendents( self, node_type):
592 """ Return a list of all decendent nodes of the given type. Node type can a 593 two letter code or longer description. e.g. 'fa' or 'family' 594 """ 595 if node_type in _nodetype_to_code: 596 node_type = _nodetype_to_code[node_type] 597 598 nodes = [self] 599 if self.scop: 600 return self.scop.getDescendentsFromSQL(self,node_type) 601 while nodes[0].type != node_type: 602 if nodes[0].type == 'px' : return [] # Fell of the bottom of the hierarchy 603 child_list = [] 604 for n in nodes: 605 for child in n.getChildren(): 606 child_list.append( child ) 607 nodes = child_list 608 609 return nodes
610 611
612 - def getAscendent( self, node_type):
613 """ Return the ancenstor node of the given type, or None.Node type can a 614 two letter code or longer description. e.g. 'fa' or 'family'""" 615 if node_type in _nodetype_to_code: 616 node_type = _nodetype_to_code[node_type] 617 618 if self.scop: 619 return self.scop.getAscendentFromSQL(self,node_type) 620 else: 621 n = self 622 if n.type == node_type: return None 623 while n.type != node_type: 624 if n.type == 'ro': return None # Fell of the top of the hierarchy 625 n = n.getParent() 626 627 return n
628 629 630
631 -class Domain(Node):
632 """ A SCOP domain. A leaf node in the Scop hierarchy. 633 634 sid -- The SCOP domain identifier. e.g. 'd5hbib_' 635 636 residues -- A Residue object. It defines the collection 637 of PDB atoms that make up this domain. 638 """
639 - def __init__(self,scop=None):
640 Node.__init__(self,scop=scop) 641 self.sid = '' 642 self.residues = None
643
644 - def __str__(self):
645 s = [] 646 s.append(self.sid) 647 s.append(self.sccs) 648 s.append("("+str(self.residues)+")") 649 650 if not self.getParent(): 651 s.append(self.description) 652 else: 653 sp = self.getParent() 654 dm = sp.getParent() 655 s.append(dm.description) 656 s.append("{"+sp.description+"}") 657 658 return " ".join(s)
659
660 - def toDesRecord(self):
661 """Return a Des.Record""" 662 rec = Node.toDesRecord(self) 663 rec.name = self.sid 664 return rec
665
666 - def toClaRecord(self):
667 """Return a Cla.Record""" 668 rec = Cla.Record() 669 rec.sid = self.sid 670 rec.residues = self.residues 671 rec.sccs = self.sccs 672 rec.sunid = self.sunid 673 674 n = self 675 while n.sunid != 0: #Not root node 676 rec.hierarchy.append( (n.type, str(n.sunid)) ) 677 n = n.getParent() 678 679 rec.hierarchy.reverse() 680 681 return rec
682
683 -class Astral:
684 """Abstraction of the ASTRAL database, which has sequences for all the SCOP domains, 685 as well as clusterings by percent id or evalue. 686 """ 687
688 - def __init__( self, dir_path=None, version=None, scop=None, 689 astral_file=None, db_handle=None):
690 """ 691 Initialise the astral database. 692 693 You must provide either a directory of SCOP files: 694 695 dir_path - string, the path to location of the scopseq-x.xx directory 696 (not the directory itself), and 697 version -a version number. 698 699 or, a FASTA file: 700 701 astral_file - string, a path to a fasta file (which will be loaded in memory) 702 703 or, a MYSQL database: 704 705 db_handle - a database handle for a MYSQL database containing a table 706 'astral' with the astral data in it. This can be created 707 using writeToSQL. 708 """ 709 710 if astral_file==dir_path==db_handle==None: 711 raise RuntimeError("Need either file handle, or (dir_path + "\ 712 + "version) or database handle to construct Astral") 713 if not scop: 714 raise RuntimeError("Must provide a Scop instance to construct") 715 716 self.scop = scop 717 self.db_handle = db_handle 718 719 720 if not astral_file and not db_handle: 721 if dir_path == None or version == None: 722 raise RuntimeError("must provide dir_path and version") 723 724 self.version = version 725 self.path = os.path.join( dir_path, "scopseq-%s" % version) 726 astral_file = "astral-scopdom-seqres-all-%s.fa" % self.version 727 astral_file = os.path.join (self.path, astral_file) 728 729 if astral_file: 730 #Build a dictionary of SeqRecord objects in the FASTA file, IN MEMORY 731 self.fasta_dict = SeqIO.to_dict(SeqIO.parse(open(astral_file), "fasta")) 732 733 self.astral_file = astral_file 734 self.EvDatasets = {} 735 self.EvDatahash = {} 736 self.IdDatasets = {} 737 self.IdDatahash = {}
738 739
740 - def domainsClusteredByEv(self,id):
741 """get domains clustered by evalue""" 742 if id not in self.EvDatasets: 743 if self.db_handle: 744 self.EvDatasets[id] = self.getAstralDomainsFromSQL(astralEv_to_sql[id]) 745 746 else: 747 if not self.path: 748 raise RuntimeError("No scopseq directory specified") 749 750 file_prefix = "astral-scopdom-seqres-sel-gs" 751 filename = "%s-e100m-%s-%s.id" % (file_prefix, astralEv_to_file[id] , 752 self.version) 753 filename = os.path.join(self.path,filename) 754 self.EvDatasets[id] = self.getAstralDomainsFromFile(filename) 755 return self.EvDatasets[id]
756 757
758 - def domainsClusteredById(self,id):
759 """get domains clustered by percent id""" 760 if id not in self.IdDatasets: 761 if self.db_handle: 762 self.IdDatasets[id] = self.getAstralDomainsFromSQL("id"+str(id)) 763 764 else: 765 if not self.path: 766 raise RuntimeError("No scopseq directory specified") 767 768 file_prefix = "astral-scopdom-seqres-sel-gs" 769 filename = "%s-bib-%s-%s.id" % (file_prefix, id, self.version) 770 filename = os.path.join(self.path,filename) 771 self.IdDatasets[id] = self.getAstralDomainsFromFile(filename) 772 return self.IdDatasets[id]
773 774
775 - def getAstralDomainsFromFile(self,filename=None,file_handle=None):
776 """Get the scop domains from a file containing a list of sids""" 777 if file_handle == filename == None: 778 raise RuntimeError("You must provide a filename or handle") 779 if not file_handle: 780 file_handle = open(filename) 781 doms = [] 782 while 1: 783 line = file_handle.readline() 784 if not line: 785 break 786 line = line.rstrip() 787 doms.append(line) 788 if filename: 789 file_handle.close() 790 791 doms = filter( lambda a: a[0]=='d', doms ) 792 doms = map( self.scop.getDomainBySid, doms ) 793 return doms
794
795 - def getAstralDomainsFromSQL(self, column):
796 """Load a set of astral domains from a column in the astral table of a MYSQL 797 database (which can be created with writeToSQL(...)""" 798 cur = self.db_handle.cursor() 799 cur.execute("SELECT sid FROM astral WHERE "+column+"=1") 800 data = cur.fetchall() 801 data = map( lambda x: self.scop.getDomainBySid(x[0]), data) 802 803 return data
804 805
806 - def getSeqBySid(self,domain):
807 """get the seq record of a given domain from its sid""" 808 if self.db_handle is None: 809 return self.fasta_dict[domain].seq 810 811 else: 812 cur = self.db_handle.cursor() 813 cur.execute("SELECT seq FROM astral WHERE sid=%s", domain) 814 return Seq(cur.fetchone()[0])
815
816 - def getSeq(self,domain):
817 """Return seq associated with domain""" 818 return self.getSeqBySid(domain.sid)
819 820
821 - def hashedDomainsById(self,id):
822 """Get domains clustered by sequence identity in a dict""" 823 if id not in self.IdDatahash: 824 self.IdDatahash[id] = {} 825 for d in self.domainsClusteredById(id): 826 self.IdDatahash[id][d] = 1 827 return self.IdDatahash[id]
828
829 - def hashedDomainsByEv(self,id):
830 """Get domains clustered by evalue in a dict""" 831 if id not in self.EvDatahash: 832 self.EvDatahash[id] = {} 833 for d in self.domainsClusteredByEv(id): 834 self.EvDatahash[id][d] = 1 835 return self.EvDatahash[id]
836 837
838 - def isDomainInId(self,dom,id):
839 """Returns true if the domain is in the astral clusters for percent ID""" 840 return dom in self.hashedDomainsById(id)
841
842 - def isDomainInEv(self,dom,id):
843 """Returns true if the domain is in the ASTRAL clusters for evalues""" 844 return dom in self.hashedDomainsByEv(id)
845 846
847 - def writeToSQL(self, db_handle):
848 """Write the ASTRAL database to a MYSQL database""" 849 cur = db_handle.cursor() 850 851 cur.execute("DROP TABLE IF EXISTS astral") 852 cur.execute("CREATE TABLE astral (sid CHAR(8), seq TEXT, PRIMARY KEY (sid))") 853 854 for dom in self.fasta_dict: 855 cur.execute("INSERT INTO astral (sid,seq) values (%s,%s)", 856 (dom, self.fasta_dict[dom].seq.data)) 857 858 for i in astralBibIds: 859 cur.execute("ALTER TABLE astral ADD (id"+str(i)+" TINYINT)") 860 861 for d in self.domainsClusteredById(i): 862 cur.execute("UPDATE astral SET id"+str(i)+"=1 WHERE sid=%s", 863 d.sid) 864 865 for ev in astralEvs: 866 cur.execute("ALTER TABLE astral ADD ("+astralEv_to_sql[ev]+" TINYINT)") 867 868 for d in self.domainsClusteredByEv(ev): 869 870 cur.execute("UPDATE astral SET "+astralEv_to_sql[ev]+"=1 WHERE sid=%s", 871 d.sid)
872
873 -def search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None, 874 cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds):
875 """search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None, 876 cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds) 877 878 Access search.cgi and return a handle to the results. See the 879 online help file for an explanation of the parameters: 880 http://scop.mrc-lmb.cam.ac.uk/scop/help.html 881 882 Raises an IOError if there's a network error. 883 884 """ 885 params = {'pdb' : pdb, 'key' : key, 'sid' : sid, 'disp' : disp, 886 'dir' : dir, 'loc' : loc} 887 variables = {} 888 for k, v in params.iteritmes(): 889 if v is not None: 890 variables[k] = v 891 variables.update(keywds) 892 return _open(cgi, variables)
893
894 -def _open(cgi, params={}, get=1):
895 """_open(cgi, params={}, get=1) -> UndoHandle 896 897 Open a handle to SCOP. cgi is the URL for the cgi script to access. 898 params is a dictionary with the options to pass to it. get is a boolean 899 that describes whether a GET should be used. Does some 900 simple error checking, and will raise an IOError if it encounters one. 901 902 """ 903 import urllib 904 from Bio import File 905 # Open a handle to SCOP. 906 options = urllib.urlencode(params) 907 if get: # do a GET 908 fullcgi = cgi 909 if options: 910 fullcgi = "%s?%s" % (cgi, options) 911 handle = urllib.urlopen(fullcgi) 912 else: # do a POST 913 handle = urllib.urlopen(cgi, options) 914 915 # Wrap the handle inside an UndoHandle. 916 uhandle = File.UndoHandle(handle) 917 # Should I check for 404? timeout? etc? 918 return uhandle
919