1
2
3
4
5
6
7 """Module for working with Prosite files from ExPASy (DEPRECATED).
8
9 Most of the functionality in this module has moved to Bio.ExPASy.Prosite;
10 please see
11
12 Bio.ExPASy.Prosite.read To read a Prosite file containing one entry.
13 Bio.ExPASy.Prosite.parse Iterates over entries in a Prosite file.
14 Bio.ExPASy.Prosite.Record Holds Prosite data.
15
16 For
17 scan_sequence_expasy Scan a sequence for occurrences of Prosite patterns.
18 _extract_pattern_hits Extract Prosite patterns from a web page.
19 PatternHit Holds data from a hit against a Prosite pattern.
20 please see the new module Bio.ExPASy.ScanProsite.
21
22 The other functions and classes in Bio.Prosite (including
23 Bio.Prosite.index_file and Bio.Prosite.Dictionary) are considered deprecated,
24 and were not moved to Bio.ExPASy.Prosite. If you use this functionality,
25 please contact the Biopython developers at biopython-dev@biopython.org to
26 avoid permanent removal of this module from Biopython.
27
28
29 This module provides code to work with the prosite dat file from
30 Prosite.
31 http://www.expasy.ch/prosite/
32
33 Tested with:
34 Release 15.0, July 1998
35 Release 16.0, July 1999
36 Release 17.0, Dec 2001
37 Release 19.0, Mar 2006
38
39
40 Functions:
41 parse Iterates over entries in a Prosite file.
42 scan_sequence_expasy Scan a sequence for occurrences of Prosite patterns.
43 index_file Index a Prosite file for a Dictionary.
44 _extract_record Extract Prosite data from a web page.
45 _extract_pattern_hits Extract Prosite patterns from a web page.
46
47
48 Classes:
49 Record Holds Prosite data.
50 PatternHit Holds data from a hit against a Prosite pattern.
51 Dictionary Accesses a Prosite file using a dictionary interface.
52 RecordParser Parses a Prosite record into a Record object.
53
54 _Scanner Scans Prosite-formatted data.
55 _RecordConsumer Consumes Prosite data to a Record object.
56
57 """
58
59 import warnings
60 import Bio
# Issue the deprecation notice when this module is executed on import,
# pointing users at Bio.ExPASy.Prosite as the replacement.
# NOTE(review): several adjacent fragments below both end and begin with a
# space, so the emitted message contains doubled spaces (for example
# "functionality  is now provided").
warnings.warn("Bio.Prosite is deprecated, and will be removed in a"\
              " future release of Biopython. Most of the functionality "
              " is now provided by Bio.ExPASy.Prosite. If you want to "
              " continue to use Bio.Prosite, please get in contact "
              " via the mailing lists to avoid its permanent removal from"\
              " Biopython.", Bio.BiopythonDeprecationWarning)
67
68 from types import *
69 import re
70 import sgmllib
71 from Bio import File
72 from Bio import Index
73 from Bio.ParserSupport import *
74
75
76
77
91
106
108 """Holds information from a Prosite record.
109
110 Members:
111 name ID of the record. e.g. ADH_ZINC
112 type Type of entry. e.g. PATTERN, MATRIX, or RULE
113 accession e.g. PS00387
114 created Date the entry was created. (MMM-YYYY)
115 data_update Date the 'primary' data was last updated.
116 info_update Date data other than 'primary' data was last updated.
117 pdoc ID of the PROSITE DOCumentation.
118
119 description Free-format description.
120 pattern The PROSITE pattern. See docs.
121 matrix List of strings that describes a matrix entry.
122 rules List of rule definitions (from RU lines). (strings)
123 prorules List of prorules (from PR lines). (strings)
124
125 NUMERICAL RESULTS
126 nr_sp_release SwissProt release.
127 nr_sp_seqs Number of seqs in that release of Swiss-Prot. (int)
128 nr_total Number of hits in Swiss-Prot. tuple of (hits, seqs)
129 nr_positive True positives. tuple of (hits, seqs)
130 nr_unknown Could be positives. tuple of (hits, seqs)
131 nr_false_pos False positives. tuple of (hits, seqs)
132 nr_false_neg False negatives. (int)
133 nr_partial False negatives, because they are fragments. (int)
134
135 COMMENTS
136 cc_taxo_range Taxonomic range. See docs for format
137 cc_max_repeat Maximum number of repetitions in a protein
138 cc_site Interesting site. list of tuples (pattern pos, desc.)
139 cc_skip_flag Can this entry be ignored?
140 cc_matrix_type
141 cc_scaling_db
142 cc_author
143 cc_ft_key
144 cc_ft_desc
145 cc_version version number (introduced in release 19.0)
146
147 DATA BANK REFERENCES - The following are all
148 lists of tuples (swiss-prot accession,
149 swiss-prot name)
150 dr_positive
151 dr_false_neg
152 dr_false_pos
153 dr_potential Potential hits, but fingerprint region not yet available.
154 dr_unknown Could possibly belong
155
156 pdb_structs List of PDB entries.
157
158 """
160 self.name = ''
161 self.type = ''
162 self.accession = ''
163 self.created = ''
164 self.data_update = ''
165 self.info_update = ''
166 self.pdoc = ''
167
168 self.description = ''
169 self.pattern = ''
170 self.matrix = []
171 self.rules = []
172 self.prorules = []
173 self.postprocessing = []
174
175 self.nr_sp_release = ''
176 self.nr_sp_seqs = ''
177 self.nr_total = (None, None)
178 self.nr_positive = (None, None)
179 self.nr_unknown = (None, None)
180 self.nr_false_pos = (None, None)
181 self.nr_false_neg = None
182 self.nr_partial = None
183
184 self.cc_taxo_range = ''
185 self.cc_max_repeat = ''
186 self.cc_site = []
187 self.cc_skip_flag = ''
188
189 self.dr_positive = []
190 self.dr_false_neg = []
191 self.dr_false_pos = []
192 self.dr_potential = []
193 self.dr_unknown = []
194
195 self.pdb_structs = []
196
198 """Holds information from a hit against a Prosite pattern.
199
200 Members:
201 name ID of the record. e.g. ADH_ZINC
202 accession e.g. PS00387
203 pdoc ID of the PROSITE DOCumentation.
204 description Free-format description.
205 matches List of tuples (start, end, sequence) where
206 start and end are indexes of the match, and sequence is
207 the sequence matched.
208
209 """
217 lines = []
218 lines.append("%s %s %s" % (self.accession, self.pdoc, self.name))
219 lines.append(self.description)
220 lines.append('')
221 if len(self.matches) > 1:
222 lines.append("Number of matches: %s" % len(self.matches))
223 for i in range(len(self.matches)):
224 start, end, seq = self.matches[i]
225 range_str = "%d-%d" % (start, end)
226 if len(self.matches) > 1:
227 lines.append("%7d %10s %s" % (i+1, range_str, seq))
228 else:
229 lines.append("%7s %10s %s" % (' ', range_str, seq))
230 return "\n".join(lines)
231
232
234 """Accesses a Prosite file using a dictionary interface.
235
236 """
237 __filename_key = '__filename'
238
def __init__(self, indexname, parser=None):
    """Open a Prosite Dictionary backed by an existing index.

    indexname is the name of the index for the dictionary; the index
    should have been created using the index_file function.  parser is
    an optional Parser object to change the results into another form.
    If it is None, the raw contents of the file will be returned.

    """
    self._index = Index.Index(indexname)
    # The name of the indexed Prosite file is stored in the index itself
    # under a reserved key; look it up and open that file.
    source_path = self._index[Dictionary.__filename_key]
    self._handle = open(source_path)
    self._parser = parser
252
255
263
266
268 """Parses Prosite data into a Record object.
269
270 """
274
def parse(self, handle):
    """Scan handle and return the data collected by the consumer.

    handle is passed to the scanner's feed method together with this
    parser's consumer; once feed returns, the consumer's data attribute
    is handed back to the caller.

    """
    consumer = self._consumer
    self._scanner.feed(handle, consumer)
    return consumer.data
278
280 """Scans Prosite-formatted data.
281
282 Tested with:
283 Release 15.0, July 1998
284
285 """
def feed(self, handle, consumer):
    """Scan Prosite data from handle, sending events to consumer.

    handle is a file-like object that contains prosite data.  consumer
    is a Consumer object that will receive events as the data is
    scanned.  Raises ValueError when a non-blank line starts with
    neither 'ID' nor 'CC'.

    """
    # Lines are peeked at before being consumed, so wrap the handle in an
    # UndoHandle unless it already is one.
    uhandle = handle
    if not isinstance(uhandle, File.UndoHandle):
        uhandle = File.UndoHandle(uhandle)

    consumer.finished = False
    while not consumer.finished:
        line = uhandle.peekline()
        if not line:
            # peekline returned nothing: stop scanning.
            break
        if is_blank_line(line):
            # Consume the blank line and look at the next one.
            uhandle.readline()
            continue
        tag = line[:2]
        if tag == 'ID':
            self._scan_record(uhandle, consumer)
        elif tag == 'CC':
            self._scan_copyrights(uhandle, consumer)
        else:
            raise ValueError("There doesn't appear to be a record")
314
316 consumer.start_copyrights()
317 self._scan_line('CC', uhandle, consumer.copyright, any_number=1)
318 self._scan_terminator(uhandle, consumer)
319 consumer.end_copyrights()
320
333
334 - def _scan_line(self, line_type, uhandle, event_fn,
335 exactly_one=None, one_or_more=None, any_number=None,
336 up_to_one=None):
354
357
360
363
366
369
372
373
374
375
376
377
378
379
380
381
382
383
384
385
389
392
396
399
403
407
411
414
417
418
419
420
421 _scan_fns = [
422 _scan_id,
423 _scan_ac,
424 _scan_dt,
425 _scan_de,
426 _scan_pa,
427 _scan_ma,
428 _scan_pp,
429 _scan_ru,
430 _scan_nr,
431 _scan_cc,
432
433
434
435
436
437 _scan_ma,
438 _scan_nr,
439 _scan_cc,
440
441 _scan_dr,
442 _scan_3d,
443 _scan_pr,
444 _scan_do,
445 _scan_terminator
446 ]
447
449 """Consumer that converts a Prosite record to a Record object.
450
451 Members:
452 data Record with Prosite data.
453
454 """
457
460
462 self._clean_record(self.data)
463
465 cols = line.split()
466 if len(cols) != 3:
467 raise ValueError("I don't understand identification line\n%s" \
468 % line)
469 self.data.name = self._chomp(cols[1])
470 self.data.type = self._chomp(cols[2])
471
473 cols = line.split()
474 if len(cols) != 2:
475 raise ValueError("I don't understand accession line\n%s" % line)
476 self.data.accession = self._chomp(cols[1])
477
def date(self, line):
    """Store the three dates found on a date line in the record.

    The line is upper-cased and split on whitespace; columns 1, 3 and 6
    are stored on self.data as created, data_update and info_update.

    Raises ValueError if the line does not have the expected layout,
    including when it has fewer than nine columns.

    """
    cols = line.upper().split()

    # Check the column count first so that a truncated line is reported
    # as a ValueError, like other malformed lines, instead of surfacing
    # as an IndexError from the lookups below.
    # Only the first four characters of column 7 are compared, so both
    # the '(INF' and '(INFO' spellings are accepted.
    if len(cols) < 9 or \
       cols[2] != '(CREATED);' or \
       cols[4] != '(DATA' or cols[5] != 'UPDATE);' or \
       cols[7][:4] != '(INF' or cols[8] != 'UPDATE).':
        raise ValueError("I don't understand date line\n%s" % line)

    self.data.created = cols[1]
    self.data.data_update = cols[3]
    self.data.info_update = cols[6]
491
494
497
500
501 - def postprocessing(self, line):
504
505 - def rule(self, line):
507
509 cols = self._clean(line).split(";")
510 for col in cols:
511 if not col:
512 continue
513 qual, data = [word.lstrip() for word in col.split("=")]
514 if qual == '/RELEASE':
515 release, seqs = data.split(",")
516 self.data.nr_sp_release = release
517 self.data.nr_sp_seqs = int(seqs)
518 elif qual == '/FALSE_NEG':
519 self.data.nr_false_neg = int(data)
520 elif qual == '/PARTIAL':
521 self.data.nr_partial = int(data)
522 elif qual in ['/TOTAL', '/POSITIVE', '/UNKNOWN', '/FALSE_POS']:
523 m = re.match(r'(\d+)\((\d+)\)', data)
524 if not m:
525 raise Exception("Broken data %s in comment line\n%s" \
526 % (repr(data), line))
527 hits = tuple(map(int, m.groups()))
528 if(qual == "/TOTAL"):
529 self.data.nr_total = hits
530 elif(qual == "/POSITIVE"):
531 self.data.nr_positive = hits
532 elif(qual == "/UNKNOWN"):
533 self.data.nr_unknown = hits
534 elif(qual == "/FALSE_POS"):
535 self.data.nr_false_pos = hits
536 else:
537 raise ValueError("Unknown qual %s in comment line\n%s" \
538 % (repr(qual), line))
539
581
600
605
610
613
616
617 - def _chomp(self, word, to_chomp='.,;'):
618
619 if word[-1] in to_chomp:
620 return word[:-1]
621 return word
622
623 - def _clean(self, line, rstrip=1):
624
625 if rstrip:
626 return line[5:].rstrip()
627 return line[5:]
628
630 """scan_sequence_expasy(seq=None, id=None, exclude_frequent=None) ->
631 list of PatternHit's
632
633 Search a sequence for occurrences of Prosite patterns. You can
634 specify either a sequence in seq or a SwissProt/trEMBL ID or accession
635 in id. Only one of those should be given. If exclude_frequent
636 is true, then the patterns with the high probability of occurring
637 will be excluded.
638
639 """
640 from Bio import ExPASy
641 if (seq and id) or not (seq or id):
642 raise ValueError("Please specify either a sequence or an id")
643 handle = ExPASy.scanprosite1(seq, id, exclude_frequent)
644 return _extract_pattern_hits(handle)
645
647 """_extract_pattern_hits(handle) -> list of PatternHit's
648
649 Extract hits from a web page. Raises a ValueError if there
650 was an error in the query.
651
652 """
653 class parser(sgmllib.SGMLParser):
654 def __init__(self):
655 sgmllib.SGMLParser.__init__(self)
656 self.hits = []
657 self.broken_message = 'Some error occurred'
658 self._in_pre = 0
659 self._current_hit = None
660 self._last_found = None
661 def handle_data(self, data):
662 if data.find('try again') >= 0:
663 self.broken_message = data
664 return
665 elif data == 'illegal':
666 self.broken_message = 'Sequence contains illegal characters'
667 return
668 if not self._in_pre:
669 return
670 elif not data.strip():
671 return
672 if self._last_found is None and data[:4] == 'PDOC':
673 self._current_hit.pdoc = data
674 self._last_found = 'pdoc'
675 elif self._last_found == 'pdoc':
676 if data[:2] != 'PS':
677 raise ValueError("Expected accession but got:\n%s" % data)
678 self._current_hit.accession = data
679 self._last_found = 'accession'
680 elif self._last_found == 'accession':
681 self._current_hit.name = data
682 self._last_found = 'name'
683 elif self._last_found == 'name':
684 self._current_hit.description = data
685 self._last_found = 'description'
686 elif self._last_found == 'description':
687 m = re.findall(r'(\d+)-(\d+) (\w+)', data)
688 for start, end, seq in m:
689 self._current_hit.matches.append(
690 (int(start), int(end), seq))
691
692 def do_hr(self, attrs):
693
694 if self._in_pre:
695 self._current_hit = PatternHit()
696 self.hits.append(self._current_hit)
697 self._last_found = None
698 def start_pre(self, attrs):
699 self._in_pre = 1
700 self.broken_message = None
701 def end_pre(self):
702 self._in_pre = 0
703 p = parser()
704 p.feed(handle.read())
705 if p.broken_message:
706 raise ValueError(p.broken_message)
707 return p.hits
708
709
710
711
def index_file(filename, indexname, rec2key=None):
    """Index a Prosite file.

    filename is the name of the file.  indexname is the name of the
    dictionary.  rec2key is an optional callback that takes a Record
    and generates a unique key (e.g. the accession number) for the
    record.  If not specified, the id name will be used.

    Raises ValueError if filename does not exist, and KeyError if a
    record yields an empty key or a key that is already in the index.

    """
    import os
    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    # Record which file was indexed, under the Dictionary's reserved key.
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    # try/finally guarantees the data file is closed even when parsing
    # fails or a key is rejected.
    try:
        # A plain int is enough here; it is promoted automatically if
        # the file offsets grow large.
        end = 0
        for record in parse(handle):
            # Each record is taken to span from where the previous one
            # ended up to the current position of the file handle.
            start = end
            end = handle.tell()
            length = end - start

            if rec2key is not None:
                key = rec2key(record)
            else:
                key = record.name

            if not key:
                raise KeyError("empty key was produced")
            elif key in index:
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, length
    finally:
        handle.close()
748