1
2
3
4
5
6
7
8 """Alphabets used in Seq objects etc to declare sequence type and letters.
9
10 This is used by sequences which contain a finite number of similar words.
11 """
12
14 size = None
15 letters = None
16
17
18
19
20
22 return self.__class__.__name__ + "()"
23
25 """Does this alphabet 'contain' the other (OBSOLETE?).
26
27 Returns a boolean. This relies on the Alphabet subclassing
28 hierarchy only, and does not check the letters property.
29 This isn't ideal, and doesn't seem to work as intended
30 with the AlphabetEncoder classes."""
31 return isinstance(other, self.__class__)
32
48
50 """Return an upper case variant of the current alphabet (PRIVATE)."""
51 if not self.letters or self.letters==self.letters.upper():
52
53 return self
54 else:
55
56 return self._case_less()
57
59 """Return a lower case variant of the current alphabet (PRIVATE)."""
60 if not self.letters or self.letters==self.letters.lower():
61
62 return self
63 else:
64
65 return self._case_less()
66
67 generic_alphabet = Alphabet()
68
72
73 single_letter_alphabet = SingleLetterAlphabet()
74
75
76
79
80 generic_protein = ProteinAlphabet()
81
82
85
86 generic_nucleotide = NucleotideAlphabet()
87
90
91 generic_dna = DNAAlphabet()
92
93
94
95
98
99 generic_rna = RNAAlphabet()
100
101
102
103
104
107
109 size = 3
110 letters = [
111 "Ala", "Asx", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile",
112 "Lys", "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr",
113 "Sec", "Val", "Trp", "Xaa", "Tyr", "Glx",
114 ]
115
116
117
118
119
121 - def __init__(self, alphabet, new_letters):
129 if key[:2] == "__" and key[-2:] == "__":
130 raise AttributeError(key)
131 return getattr(self.alphabet, key)
132
134 return "%s(%r, %r)" % (self.__class__.__name__, self.alphabet,
135 self.new_letters)
136
138 """Does this alphabet 'contain' the other (OBSOLETE?).
139
140 This isn't implemented for the base AlphabetEncoder,
141 which will always return 0 (False)."""
142 return 0
143
147
151
152
153 -class Gapped(AlphabetEncoder):
154 - def __init__(self, alphabet, gap_char = "-"):
157
159 """Does this alphabet 'contain' the other (OBSOLETE?).
160
161 Returns a boolean. This relies on the Alphabet subclassing
162 hierarchy, and attempts to check the gap character. This fails
163 if the other alphabet does not have a gap character!
164 """
165 return other.gap_char == self.gap_char and \
166 self.alphabet.contains(other.alphabet)
167
169 """Return an upper case variant of the current alphabet (PRIVATE)."""
170 return Gapped(self.alphabet._upper(), self.gap_char.upper())
171
173 """Return a lower case variant of the current alphabet (PRIVATE)."""
174 return Gapped(self.alphabet._lower(), self.gap_char.lower())
175
176
178 - def __init__(self, alphabet, stop_symbol = "*"):
181
187
189 """Does this alphabet 'contain' the other (OBSOLETE?).
190
191 Returns a boolean. This relies on the Alphabet subclassing
192 hierarchy, and attempts to check the stop symbol. This fails
193 if the other alphabet does not have a stop symbol!
194 """
195 return other.stop_symbol == self.stop_symbol and \
196 self.alphabet.contains(other.alphabet)
197
201
205
206
208 """Returns the non-gapped non-stop-codon Alphabet object (PRIVATE)."""
209 a = alphabet
210 while isinstance(a, AlphabetEncoder):
211 a = a.alphabet
212 assert isinstance(a, Alphabet), \
213 "Invalid alphabet found, %s" % repr(a)
214 return a
215
229
231 """Returns a common but often generic base alphabet object (PRIVATE).
232
233 This throws away any AlphabetEncoder information, e.g. Gapped alphabets.
234
235 Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein -> generic single
236 letter. These DO NOT raise an exception!"""
237 common = None
238 for alpha in alphabets:
239 a = _get_base_alphabet(alpha)
240 if common is None:
241 common = a
242 elif common == a:
243 pass
244 elif isinstance(a, common.__class__):
245 pass
246 elif isinstance(common, a.__class__):
247 common = a
248 elif isinstance(a, NucleotideAlphabet) \
249 and isinstance(common, NucleotideAlphabet):
250
251 common = generic_nucleotide
252 elif isinstance(a, SingleLetterAlphabet) \
253 and isinstance(common, SingleLetterAlphabet):
254
255 common = single_letter_alphabet
256 else:
257
258 return generic_alphabet
259 if common is None:
260
261 return generic_alphabet
262 return common
263
265 """Returns a common but often generic alphabet object (PRIVATE).
266
267 >>> from Bio.Alphabet import IUPAC
268 >>> _consensus_alphabet([IUPAC.extended_protein, IUPAC.protein])
269 ExtendedIUPACProtein()
270 >>> _consensus_alphabet([generic_protein, IUPAC.protein])
271 ProteinAlphabet()
272
273 Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein -> generic single
274 letter. These DO NOT raise an exception!
275
276 >>> _consensus_alphabet([generic_dna, generic_nucleotide])
277 NucleotideAlphabet()
278 >>> _consensus_alphabet([generic_dna, generic_rna])
279 NucleotideAlphabet()
280 >>> _consensus_alphabet([generic_dna, generic_protein])
281 SingleLetterAlphabet()
282 >>> _consensus_alphabet([single_letter_alphabet, generic_protein])
283 SingleLetterAlphabet()
284
285 This is aware of Gapped and HasStopCodon and new letters added by
286 other AlphabetEncoders. This WILL raise an exception if more than
287 one gap character or stop symbol is present.
288
289 >>> from Bio.Alphabet import IUPAC
290 >>> _consensus_alphabet([Gapped(IUPAC.extended_protein), HasStopCodon(IUPAC.protein)])
291 HasStopCodon(Gapped(ExtendedIUPACProtein(), '-'), '*')
292 >>> _consensus_alphabet([Gapped(IUPAC.protein, "-"), Gapped(IUPAC.protein, "=")])
293 Traceback (most recent call last):
294 ...
295 ValueError: More than one gap character present
296 >>> _consensus_alphabet([HasStopCodon(IUPAC.protein, "*"), HasStopCodon(IUPAC.protein, "+")])
297 Traceback (most recent call last):
298 ...
299 ValueError: More than one stop symbol present
300 """
301 base = _consensus_base_alphabet(alphabets)
302 gap = None
303 stop = None
304 new_letters = ""
305 for alpha in alphabets:
306
307 if not hasattr(alpha, "gap_char"):
308 pass
309 elif gap is None:
310 gap = alpha.gap_char
311 elif gap == alpha.gap_char:
312 pass
313 else:
314 raise ValueError("More than one gap character present")
315
316 if not hasattr(alpha, "stop_symbol"):
317 pass
318 elif stop is None:
319 stop = alpha.stop_symbol
320 elif stop == alpha.stop_symbol:
321 pass
322 else:
323 raise ValueError("More than one stop symbol present")
324
325 if hasattr(alpha, "new_letters"):
326 for letter in alpha.new_letters:
327 if letter not in new_letters \
328 and letter != gap and letter != stop:
329 new_letters += letter
330
331 alpha = base
332 if new_letters:
333 alpha = AlphabetEncoder(alpha, new_letters)
334 if gap:
335 alpha = Gapped(alpha, gap_char=gap)
336 if stop:
337 alpha = HasStopCodon(alpha, stop_symbol=stop)
338 return alpha
339
341 """Returns True except for DNA+RNA or Nucleotide+Protein (PRIVATE).
342
343 >>> _check_type_compatible([generic_dna, generic_nucleotide])
344 True
345 >>> _check_type_compatible([generic_dna, generic_rna])
346 False
347 >>> _check_type_compatible([generic_dna, generic_protein])
348 False
349 >>> _check_type_compatible([single_letter_alphabet, generic_protein])
350 True
351
352 This relies on the Alphabet subclassing hierarchy. It does not
353 check things like gap characters or stop symbols."""
354 dna, rna, nucl, protein = False, False, False, False
355 for alpha in alphabets:
356 a = _get_base_alphabet(alpha)
357 if isinstance(a, DNAAlphabet):
358 dna = True
359 nucl = True
360 if rna or protein : return False
361 elif isinstance(a, RNAAlphabet):
362 rna = True
363 nucl = True
364 if dna or protein : return False
365 elif isinstance(a, NucleotideAlphabet):
366 nucl = True
367 if protein : return False
368 elif isinstance(a, ProteinAlphabet):
369 protein = True
370 if nucl : return False
371 return True
372
374 """Check all letters in sequence are in the alphabet (PRIVATE).
375
376 >>> from Bio.Seq import Seq
377 >>> from Bio.Alphabet import IUPAC
378 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
379 ... IUPAC.protein)
380 >>> _verify_alphabet(my_seq)
381 True
382
383 This example has an X, which is not in the IUPAC protein alphabet
384 (you should be using the IUPAC extended protein alphabet):
385
386 >>> bad_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVFX",
387 ... IUPAC.protein)
388 >>> _verify_alphabet(bad_seq)
389 False
390
391 This replaces Bio.utils.verify_alphabet() since we are deprecating
392 that. Potentially this could be added to the Alphabet object, and
393 I would like it to be an option when creating a Seq object... but
394 that might slow things down.
395 """
396 letters = sequence.alphabet.letters
397 if not letters:
398 raise ValueError("Alphabet does not define letters.")
399 for letter in sequence:
400 if letter not in letters:
401 return False
402 return True
403