Package Bio :: Package Blast :: Module NCBIWWW
[hide private]
[frames | no frames]

Source Code for Module Bio.Blast.NCBIWWW

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # Patched by Brad Chapman. 
  7  # Chris Wroe added modifications for work in myGrid 
  8   
  9  """ 
 10  This module provides code to work with the WWW version of BLAST 
 11  provided by the NCBI. 
 12  http://blast.ncbi.nlm.nih.gov/ 
 13   
 14  Functions: 
 15  qblast        Do a BLAST search using the QBLAST API. 
 16  """ 
 17   
 18  import sys 
 19  try: 
 20      from cStringIO import StringIO 
 21  except ImportError: 
 22      from StringIO import StringIO 
 23   
 24  from Bio._py3k import _as_string 
 25   
def qblast(program, database, sequence,
           auto_format=None, composition_based_statistics=None,
           db_genetic_code=None, endpoints=None, entrez_query='(none)',
           expect=10.0, filter=None, gapcosts=None, genetic_code=None,
           hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
           matrix_name=None, nucl_penalty=None, nucl_reward=None,
           other_advanced=None, perc_ident=None, phi_pattern=None,
           query_file=None, query_believe_defline=None, query_from=None,
           query_to=None, searchsp_eff=None, service=None, threshold=None,
           ungapped_alignment=None, word_size=None,
           alignments=500, alignment_view=None, descriptions=500,
           entrez_links_new_window=None, expect_low=None, expect_high=None,
           format_entrez_query=None, format_object=None, format_type='XML',
           ncbi_gi=None, results_file=None, show_overview=None, megablast=None,
           ):
    """Do a BLAST search using the QBLAST server at NCBI.

    Supports all parameters of the qblast API for Put and Get.
    Some useful parameters:
    program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
    database       Which database to search against (e.g. "nr").
    sequence       The sequence to search.
    ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
    descriptions   Number of descriptions to show.  Def 500.
    alignments     Number of alignments to show.  Def 500.
    expect         An expect value cutoff.  Def 10.0.
    matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
    filter         "none" turns off filtering.  Default no filtering
    format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
    entrez_query   Entrez query to limit Blast search
    hitlist_size   Number of hits to return. Default 50
    megablast      TRUE/FALSE whether to use Mega BLAST algorithm (blastn only)

    Apart from the check on the program name, this function does no
    checking of the validity of the parameters and passes the values to
    the server as is.  Any argument left as None is simply not sent.
    More help is available at:
    http://www.ncbi.nlm.nih.gov/BLAST/blast_overview.html

    Returns a StringIO handle wrapping the text of the final results page.

    Raises AssertionError if program is not one of the five names above,
    and ValueError (from _parse_qblast_ref_page) if the server's reply to
    the initial request cannot be interpreted.  Errors raised by
    urllib2.urlopen are not caught here.
    """
    import urllib
    import urllib2
    import time

    # NOTE: this is an assert, so the check is skipped when running with -O.
    assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']

    url = "http://blast.ncbi.nlm.nih.gov/Blast.cgi"
    headers = {"User-Agent": "BiopythonClient"}

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        #('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        #('RESULTS_FILE',...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
        ]
    # Only send the options the caller actually supplied.
    query = [x for x in parameters if x[1] is not None]
    message = urllib.urlencode(query)

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = urllib2.Request(url, message, headers)
    handle = urllib2.urlopen(request)
    try:
        # The RTOE estimate is parsed (and validated) but not used for
        # scheduling the polling below.
        rid, _rtoe = _parse_qblast_ref_page(handle)
    finally:
        # Release the connection even if the reply could not be parsed.
        handle.close()

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = urllib.urlencode(query)

    # Poll NCBI until the results are ready.  Use a 3 second wait
    delay = 3.0
    previous = time.time()
    while True:
        # Sleep only for whatever remains of the delay since the last poll.
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current

        request = urllib2.Request(url, message, headers)
        handle = urllib2.urlopen(request)
        try:
            results = _as_string(handle.read())
        finally:
            # Close each polling connection rather than leaking one per loop.
            handle.close()

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        i = results.find("Status=")
        if i < 0:
            break
        # The status value runs to the end of its line; tolerate a final
        # line that has no trailing newline.
        j = results.find("\n", i)
        if j < 0:
            j = len(results)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
176
def _parse_qblast_ref_page(handle):
    """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE).

    The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is probably
    'Request Time of Execution' and RID would be 'Request Identifier'.

    The handle is read in full.  Returns (rid, rtoe) where rid is the text
    following "RID =" on its line and rtoe is the text following "RTOE ="
    converted to an integer.

    Raises ValueError if either value is missing or empty (including any
    NCBI error message that could be extracted from the page), or if the
    RTOE value is not an integer.
    """
    s = _as_string(handle.read())

    def _field(label):
        """Return the stripped text after label on its line, or None."""
        start = s.find(label)
        if start == -1:
            return None
        end = s.find("\n", start)
        if end == -1:
            # Label is on the last line with no trailing newline; take the
            # rest of the text instead of slicing to -1 (which would drop
            # the final character of the value).
            end = len(s)
        return s[start + len(label):end].strip()

    rid = _field("RID =")
    rtoe = _field("RTOE =")

    if not rid and not rtoe:
        #Can we reliably extract the error message from the HTML page?
        #e.g.  "Message ID#24 Error: Failed to read the Blast query:
        #       Nucleotide FASTA provided for protein sequence"
        #or    "Message ID#32 Error: Query contains no data: Query
        #       contains no sequence data"
        #
        #This used to occur inside a <div class="error msInf"> entry,
        #and in spring 2010 the markup was a <p class="error"> entry.
        #Try each in that order, keeping only the first line of the text.
        for open_tag, close_tag in (('<div class="error msInf">', "</div>"),
                                    ('<p class="error">', "</p>")):
            i = s.find(open_tag)
            if i != -1:
                msg = s[i + len(open_tag):].strip()
                msg = msg.split(close_tag, 1)[0].split("\n", 1)[0].strip()
                if msg:
                    raise ValueError("Error message from NCBI: %s" % msg)
        #Generic search based on the way the error messages start:
        i = s.find('Message ID#')
        if i != -1:
            #Break the message at the first HTML tag
            msg = s[i:].split("<", 1)[0].split("\n", 1)[0].strip()
            raise ValueError("Error message from NCBI: %s" % msg)
        #We didn't recognise the error layout :(
        raise ValueError("No RID and no RTOE found in the 'please wait' page, "
                         "there was probably an error in your request but we "
                         "could not extract a helpful error message.")
    elif not rid:
        #Can this happen?
        raise ValueError("No RID found in the 'please wait' page."
                         " (although RTOE = %s)" % repr(rtoe))
    elif not rtoe:
        #Can this happen?
        raise ValueError("No RTOE found in the 'please wait' page."
                         " (although RID = %s)" % repr(rid))

    try:
        return rid, int(rtoe)
    except ValueError:
        raise ValueError("A non-integer RTOE found in "
                         "the 'please wait' page, %s" % repr(rtoe))
244