Package Bio :: Package SeqUtils :: Module CheckSum
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqUtils.CheckSum

  1  # Copyright 2002 by Yves Bastide and Brad Chapman. 
  2  # Copyright 2007 by Sebastian Bassi 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Functions to calculate assorted sequence checksums.""" 
  9   
 10  # crc32, crc64, gcg, and seguid 
 11  # crc64 is adapted from BioPerl 
 12   
 13  from binascii import crc32 as _crc32 
 14  from Bio._py3k import _as_bytes 
 15   
def crc32(seq):
    """Return the CRC-32 checksum (int) for a sequence (string or Seq object).

    The argument may be a Seq-like object exposing ``tostring()`` or a
    plain string; it is converted to bytes with ``_as_bytes`` before being
    passed to ``binascii.crc32``.

    The result is masked with ``0xffffffff`` so that the same unsigned
    value is returned on every Python version. Without the mask Python 2
    returns a signed int while Python 3 returns an unsigned one; on
    Python 3 the mask is a no-op.
    """
    try:
        # Assume it is a Seq object. Only the conversion is guarded, so an
        # AttributeError raised further down is not mistaken for "this is
        # a plain string".
        seq = seq.tostring()
    except AttributeError:
        # Assume it is a string/unicode and use it as-is.
        pass
    return _crc32(_as_bytes(seq)) & 0xffffffff
27
def _init_table_h():
    """Build the 256-entry lookup table used by crc64.

    Each entry is the high 32-bit word obtained by pushing one byte value
    through eight shift/XOR rounds with the constant 0xd8000000.

    Plain integer literals are used instead of the Python 2 only ``1L``
    form, which is a syntax error on Python 3; Python integers promote to
    arbitrary precision automatically, so the values are unchanged.
    """
    table = []
    for i in range(256):
        low = i
        part_h = 0
        for _ in range(8):
            # Remember the bit about to be shifted out of the low word.
            rflag = low & 1
            low >>= 1
            # Carry the bit leaving the high word into the top of the
            # low word.
            if part_h & 1:
                low |= (1 << 31)
            part_h >>= 1
            if rflag:
                part_h ^= 0xd8000000
        table.append(part_h)
    return table


# Initialisation: computed once at import time and shared by every call.
_table_h = _init_table_h()


def crc64(s):
    """Return the crc64 checksum for a sequence (string or Seq object).

    The sequence is consumed one character at a time (``ord`` is applied
    to each item), keeping the 64-bit state as two 32-bit halves.

    Returns a string of the form ``"CRC-"`` followed by 16 upper-case
    hexadecimal digits (high word first). An empty sequence gives
    ``"CRC-0000000000000000"``.
    """
    crcl = 0
    crch = 0
    for c in s:
        # The low byte of the high word moves to the top of the low word.
        shr = (crch & 0xFF) << 24
        temp1h = crch >> 8
        temp1l = (crcl >> 8) | shr
        idx = (crcl ^ ord(c)) & 0xFF
        crch = temp1h ^ _table_h[idx]
        crcl = temp1l

    return "CRC-%08X%08X" % (crch, crcl)
58 59
def gcg(seq):
    """Return the GCG checksum (int) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the GCG checksum (int). Checksum used by GCG program.
    seq type = str.
    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.
    All sequences are converted to uppercase.
    """
    try:
        # Seq objects are converted to a plain string first.
        text = seq.tostring()
    except AttributeError:
        # No tostring(): treat the argument as a string already.
        text = seq

    total = 0
    for position, residue in enumerate(text):
        # Weights run 1..57 and then start again at 1.
        weight = position % 57 + 1
        total += weight * ord(residue.upper())
    return total % 10000
81
def seguid(seq):
    """Return the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the SEGUID string (A SEquence Globally Unique IDentifier).
    seq type = str.

    The sequence is upper-cased, hashed with SHA-1, and the digest is
    base64 encoded with newlines and trailing "=" padding removed.

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032
    """
    import base64
    try:
        # Python 2.5+: sha1 is in hashlib
        import hashlib
        m = hashlib.sha1()
    except ImportError:
        # For older versions
        import sha
        m = sha.new()
    try:
        # Assume it is a Seq object
        seq = seq.tostring()
    except AttributeError:
        # Assume it is a string
        pass
    m.update(_as_bytes(seq.upper()))
    digest = m.digest()
    try:
        # For Python 3+ (encodebytes returns bytes, hence the decode)
        return base64.encodebytes(digest).decode().replace("\n", "").rstrip("=")
    except AttributeError:
        # base64.encodebytes does not exist on Python 2
        pass
    try:
        # For Python 2.5+
        return base64.b64encode(digest).rstrip("=")
    except AttributeError:
        # For older versions without base64.b64encode.
        # Note: the encoded string contains "\n" (never "\r\n", even on
        # Windows), so os.linesep must not be used here.
        return base64.encodestring(digest).replace("\n", "").rstrip("=")
if __name__ == "__main__":
    # Quick self test, run only when the module is executed as a script.
    # print is written in call form with a single string argument so the
    # same line is valid on both Python 2 and Python 3.
    print("Quick self test")

    str_light_chain_one = ("QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
                           "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
                           "YCSSYAGSSTLVFGGGTKLTVL")

    str_light_chain_two = ("QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
                           "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
                           "YCCSYAGSSTWVFGGGTKLTVL")

    # The two chains differ, yet share a CRC-64 value; SEGUID tells them
    # apart.
    assert crc64(str_light_chain_one) == crc64(str_light_chain_two)
    assert 'CRC-44CAAD88706CC153' == crc64(str_light_chain_one)

    assert 'BpBeDdcNUYNsdk46JoJdw7Pd3BI' == seguid(str_light_chain_one)
    assert 'X5XEaayob1nZLOc7eVT9qyczarY' == seguid(str_light_chain_two)

    print("Done")