Package Bio :: Package NeuralNetwork :: Package Gene :: Module Signature
[hide private]
[frames] | no frames]

Source Code for Module Bio.NeuralNetwork.Gene.Signature

  1  """Find and deal with signatures in biological sequence data. 
  2   
  3  In addition to representing sequences according to motifs (see Motif.py 
  4  for more information), we can also use Signatures, which are conserved 
  5  regions that are not necessarily consecutive. This may be useful in 
  6  the case of very diverged sequences, where signatures may pick out 
  7  important conservation that can't be found by motifs (hopefully!). 
  8  """ 
  9  # biopython 
 10  from Bio.Alphabet import _verify_alphabet 
 11  from Bio.Seq import Seq 
 12   
 13  # local stuff 
 14  from Pattern import PatternRepository 
 15   
16 -class SignatureFinder:
17 """Find Signatures in a group of sequence records. 18 19 In this simple implementation, signatures are just defined as a 20 two motifs separated by a gap. We need something a lot smarter than 21 this to find more complicated signatures. 22 """
23 - def __init__(self, alphabet_strict = 1):
24 """Initialize a finder to get signatures. 25 26 Arguments: 27 28 o alphabet_strict - Specify whether signatures should be required 29 to have all letters in the signature be consistent with the 30 alphabet of the original sequence. This requires that all Seqs 31 used have a consistent alphabet. This helps protect against getting 32 useless signatures full of ambiguity signals. 33 """ 34 self._alphabet_strict = alphabet_strict
35
36 - def find(self, seq_records, signature_size, max_gap):
37 """Find all signatures in a group of sequences. 38 39 Arguments: 40 41 o seq_records - A list of SeqRecord objects we'll use the sequences 42 from to find signatures. 43 44 o signature_size - The size of each half of a signature (ie. if this 45 is set at 3, then the signature could be AGC-----GAC) 46 47 o max_gap - The maximum gap size between two parts of a signature. 48 """ 49 sig_info = self._get_signature_dict(seq_records, signature_size, 50 max_gap) 51 52 return PatternRepository(sig_info)
53
54 - def _get_signature_dict(self, seq_records, sig_size, max_gap):
55 """Return a dictionary with all signatures and their counts. 56 57 This internal function does all of the hard work for the 58 find_signatures function. 59 """ 60 if self._alphabet_strict: 61 alphabet = seq_records[0].seq.alphabet 62 else: 63 alphabet = None 64 65 # loop through all records to find signatures 66 all_sigs = {} 67 for seq_record in seq_records: 68 # if we are working with alphabets, make sure we are consistent 69 if alphabet is not None: 70 assert seq_record.seq.alphabet == alphabet, \ 71 "Working with alphabet %s and got %s" % \ 72 (alphabet, seq_record.seq.alphabet) 73 74 # now start finding signatures in the sequence 75 largest_sig_size = sig_size * 2 + max_gap 76 for start in range(len(seq_record.seq) - (largest_sig_size - 1)): 77 # find the first part of the signature 78 first_sig = seq_record.seq[start:start + sig_size].tostring() 79 80 # now find all of the second parts of the signature 81 for second in range(start + 1, (start + 1) + max_gap): 82 second_sig = seq_record.seq[second: second + sig_size].tostring() 83 84 # if we are being alphabet strict, make sure both parts 85 # of the sig fall within the specified alphabet 86 if alphabet is not None: 87 first_seq = Seq(first_sig, alphabet) 88 second_seq = Seq(second_sig, alphabet) 89 if _verify_alphabet(first_seq) \ 90 and _verify_alphabet(second_seq): 91 all_sigs = self._add_sig(all_sigs, 92 (first_sig, second_sig)) 93 94 # if we are not being strict, just add the motif 95 else: 96 all_sigs = self._add_sig(all_sigs, 97 (first_sig, second_sig)) 98 99 return all_sigs
100
101 - def _add_sig(self, sig_dict, sig_to_add):
102 """Add a signature to the given dictionary. 103 """ 104 # incrememt the count of the signature if it is already present 105 if sig_to_add in sig_dict: 106 sig_dict[sig_to_add] += 1 107 # otherwise add it to the dictionary 108 else: 109 sig_dict[sig_to_add] = 1 110 111 return sig_dict
112
113 -class SignatureCoder:
114 """Convert a Sequence into its signature representatives. 115 116 This takes a sequence and a set of signatures, and converts the 117 sequence into a list of numbers representing the relative amounts 118 each signature is seen in the sequence. This allows a sequence to 119 serve as input into a neural network. 120 """
121 - def __init__(self, signatures, max_gap):
122 """Initialize with the signatures to look for. 123 124 Arguments: 125 126 o signatures - A complete list of signatures, in order, that 127 are to be searched for in the sequences. The signatures should 128 be represented as a tuple of (first part of the signature, 129 second_part of the signature) -- ('GATC', 'GATC'). 130 131 o max_gap - The maximum gap we can have between the two 132 elements of the signature. 133 """ 134 self._signatures = signatures 135 self._max_gap = max_gap 136 137 # check to be sure the signatures are all the same size 138 # only do this if we actually have signatures 139 if len(self._signatures) > 0: 140 first_sig_size = len(self._signatures[0][0]) 141 second_sig_size = len(self._signatures[0][1]) 142 143 assert first_sig_size == second_sig_size, \ 144 "Ends of the signature do not match: %s" \ 145 % self._signatures[0] 146 147 for sig in self._signatures: 148 assert len(sig[0]) == first_sig_size, \ 149 "Got first part of signature %s, expected size %s" % \ 150 (sig[0], first_sig_size) 151 assert len(sig[1]) == second_sig_size, \ 152 "Got second part of signature %s, expected size %s" % \ 153 (sig[1], second_sig_size)
154
155 - def representation(self, sequence):
156 """Convert a sequence into a representation of its signatures. 157 158 Arguments: 159 160 o sequence - A Seq object we are going to convert into a set of 161 signatures. 162 163 Returns: 164 A list of relative signature representations. Each item in the 165 list corresponds to the signature passed in to the initializer and 166 is the number of times that the signature was found, divided by the 167 total number of signatures found in the sequence. 168 """ 169 # check to be sure we have signatures to deal with, 170 # otherwise just return an empty list 171 if len(self._signatures) == 0: 172 return [] 173 174 # initialize a dictionary to hold the signature counts 175 sequence_sigs = {} 176 for sig in self._signatures: 177 sequence_sigs[sig] = 0 178 179 # get a list of all of the first parts of the signatures 180 all_first_sigs = [] 181 for sig_start, sig_end in self._signatures: 182 all_first_sigs.append(sig_start) 183 184 # count all of the signatures we are looking for in the sequence 185 sig_size = len(self._signatures[0][0]) 186 smallest_sig_size = sig_size * 2 187 188 for start in range(len(sequence) - (smallest_sig_size - 1)): 189 # if the first part matches any of the signatures we are looking 190 # for, then expand out to look for the second part 191 first_sig = sequence[start:start + sig_size].tostring() 192 if first_sig in all_first_sigs: 193 for second in range(start + sig_size, 194 (start + sig_size + 1) + self._max_gap): 195 second_sig = sequence[second:second + sig_size].tostring() 196 197 # if we find the motif, increase the counts for it 198 if (first_sig, second_sig) in sequence_sigs: 199 sequence_sigs[(first_sig, second_sig)] += 1 200 201 # -- normalize the signature info to go between zero and one 202 min_count = min(sequence_sigs.values()) 203 max_count = max(sequence_sigs.values()) 204 205 # as long as we have some signatures present, normalize them 206 # otherwise we'll just return 0 for everything 207 if max_count > 0: 208 for sig in sequence_sigs: 209 sequence_sigs[sig] = (float(sequence_sigs[sig] - min_count) 210 / float(max_count)) 211 212 # return the relative signature info in the specified order 213 sig_amounts = [] 214 for sig in self._signatures: 215 sig_amounts.append(sequence_sigs[sig]) 216 217 return sig_amounts
218