1
2
3
4
5
6
7
8
9 """
10 This module provides code to work with the WWW version of BLAST
11 provided by the NCBI.
12 http://blast.ncbi.nlm.nih.gov/
13
14 Functions:
15 qblast Do a BLAST search using the QBLAST API.
16 """
17
18 import sys
19 try:
20 from cStringIO import StringIO
21 except ImportError:
22 from StringIO import StringIO
23
24 from Bio._py3k import _as_string
25
def qblast(program, database, sequence,
           auto_format=None, composition_based_statistics=None,
           db_genetic_code=None, endpoints=None, entrez_query='(none)',
           expect=10.0, filter=None, gapcosts=None, genetic_code=None,
           hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
           matrix_name=None, nucl_penalty=None, nucl_reward=None,
           other_advanced=None, perc_ident=None, phi_pattern=None,
           query_file=None, query_believe_defline=None, query_from=None,
           query_to=None, searchsp_eff=None, service=None, threshold=None,
           ungapped_alignment=None, word_size=None,
           alignments=500, alignment_view=None, descriptions=500,
           entrez_links_new_window=None, expect_low=None, expect_high=None,
           format_entrez_query=None, format_object=None, format_type='XML',
           ncbi_gi=None, results_file=None, show_overview=None,
           megablast=None):
    """Do a BLAST search using the QBLAST server at NCBI.

    Supports all parameters of the qblast API for Put and Get.
    Some useful parameters:
    program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
    database       Which database to search against (e.g. "nr").
    sequence       The sequence to search.
    ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
    descriptions   Number of descriptions to show.  Def 500.
    alignments     Number of alignments to show.  Def 500.
    expect         An expect value cutoff.  Def 10.0.
    matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
    filter         "none" turns off filtering.  Default no filtering
    format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
    entrez_query   Entrez query to limit Blast search
    hitlist_size   Number of hits to return. Default 50
    megablast      TRUE/FALSE whether to use Mega BLAST algorithm (blastn only)

    Apart from checking the program name, this function does no checking
    of the validity of the parameters and passes the values to the server
    as is.  Arguments left as None are not sent at all.  More help is
    available at:
    http://www.ncbi.nlm.nih.gov/BLAST/blast_overview.html

    The search is submitted with a 'Put' request, then the server is polled
    with 'Get' requests (at most one every three seconds) until the reply
    either carries no "Status=" line or reports a status of READY.

    Returns a StringIO handle containing the text of the final reply.

    Raises ValueError (from _parse_qblast_ref_page) if the reply to the
    'Put' request does not contain a usable RID and RTOE.
    """
    import urllib
    import urllib2
    import time

    # NOTE: kept as an assert (rather than raising ValueError) so existing
    # callers see the same exception type; it is skipped under "python -O".
    assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx'], \
        "Unsupported BLAST program: %r" % (program,)

    url = "http://blast.ncbi.nlm.nih.gov/Blast.cgi"
    headers = {"User-Agent": "BiopythonClient"}

    # Parameters for the "Put" command, which submits the search.
    # Entries whose value is None are dropped before encoding.
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = urllib.urlencode(query)

    # Passing a data argument makes urllib2 send this as a POST request.
    request = urllib2.Request(url, message, headers)
    handle = urllib2.urlopen(request)
    try:
        # The reply is a "please wait" page holding the request identifier
        # (RID) and an estimated time (RTOE).  Only the RID is used below;
        # the RTOE is parsed (and validated) but not used for scheduling.
        rid, rtoe = _parse_qblast_ref_page(handle)
    finally:
        # Always release the connection, even if parsing raised.
        handle.close()

    # Parameters for the "Get" command, which retrieves the formatted results.
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = urllib.urlencode(query)

    # Poll the server until the results are available, leaving at least
    # `delay` seconds between the start of consecutive requests.
    delay = 3.0
    previous = time.time()
    while True:
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current

        request = urllib2.Request(url, message, headers)
        handle = urllib2.urlopen(request)
        try:
            results = _as_string(handle.read())
        finally:
            # Close each polling connection; otherwise one handle would be
            # leaked per iteration of this loop.
            handle.close()

        # A reply of just two newlines carries no information; ask again.
        if results == "\n\n":
            continue
        # No status line at all: treat whatever came back as the final
        # reply and hand it to the caller.
        if "Status=" not in results:
            break
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
176
def _parse_qblast_ref_page(handle):
    """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE).

    The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is probably
    'Request Time of Execution' and RID would be 'Request Identifier'.

    Arguments:
     - handle - object whose read() method gives the page returned by the
       server in reply to the 'Put' request.

    Returns a tuple (rid, rtoe) with rid as a string and rtoe as an integer.

    Raises ValueError if either value is missing (quoting the error message
    from the page itself when one can be found) or if RTOE is not an
    integer.
    """
    # NOTE(review): _as_string comes from Bio._py3k; presumably it turns the
    # raw reply into a native string on both Python 2 and 3 - confirm there.
    s = _as_string(handle.read())

    def _value_after(key):
        """Return the stripped text between key and end of line, or None."""
        i = s.find(key)
        if i == -1:
            return None
        start = i + len(key)
        j = s.find("\n", start)
        if j == -1:
            # Key is on the final line with no trailing newline; take the
            # rest of the text instead of silently dropping its last
            # character (which is what a slice ending at -1 would do).
            j = len(s)
        return s[start:j].strip()

    rid = _value_after("RID =")
    rtoe = _value_after("RTOE =")

    if not rid and not rtoe:
        # Neither value was found, so this is probably an error page.
        # Look for the error markup variants handled here and report the
        # first non-empty message (first line of the element only).
        error_markup = (
            ('<div class="error msInf">', "</div>"),
            ('<p class="error">', "</p>"),
            )
        for open_tag, close_tag in error_markup:
            i = s.find(open_tag)
            if i != -1:
                msg = s[i + len(open_tag):].strip()
                msg = msg.split(close_tag, 1)[0].split("\n", 1)[0].strip()
                if msg:
                    raise ValueError("Error message from NCBI: %s" % msg)

        i = s.find('Message ID#')
        if i != -1:
            # Report the text from 'Message ID#' up to the next tag or
            # end of line, whichever comes first.
            msg = s[i:].split("<", 1)[0].split("\n", 1)[0].strip()
            raise ValueError("Error message from NCBI: %s" % msg)

        # Nothing recognisable in the page; give a generic explanation.
        raise ValueError("No RID and no RTOE found in the 'please wait' page, "
                         "there was probably an error in your request but we "
                         "could not extract a helpful error message.")
    elif not rid:
        # Can this happen?
        raise ValueError("No RID found in the 'please wait' page."
                         " (although RTOE = %s)" % repr(rtoe))
    elif not rtoe:
        # Can this happen?
        raise ValueError("No RTOE found in the 'please wait' page."
                         " (although RID = %s)" % repr(rid))

    try:
        return rid, int(rtoe)
    except ValueError:
        raise ValueError("A non-integer RTOE found in "
                         "the 'please wait' page, %s" % repr(rtoe))