1 """Generic functionality useful for all gene representations.
2
3 This module contains classes which can be used for all the different
4 types of patterns available for representing gene information (ie. motifs,
5 signatures and schemas). These are the general classes which should be
6 able to handle any of the different specific patterns.
7 """
8
9 import random
10
11
12 from Bio.Alphabet import _verify_alphabet
13 from Bio.Seq import Seq, MutableSeq
14
16 """Allow reading and writing of patterns to files.
17
18 This just defines a simple persistence class for patterns, making
19 it easy to write them to a file and read 'em back.
20 """
22 """Initialize the reader and writer class.
23
24 Arguments:
25
26 o alphabet - An optional argument specifying the alphabet
27 which patterns should follow. If an alphabet is set it'll be used
28 to verify that all patterns follow it.
29
30 Attributes:
31 o separator - A character to use in separating items in a signature
32 when it is written to a file and read back. This character should
33 not be in the possible alphabet of the sequences, or there will
34 be trouble.
35 """
36 self._alphabet = alphabet
37
38 self.separator = ";"
39
def write(self, pattern_list, output_handle):
    """Write a list of patterns to the given handle.

    Each pattern is written on its own line. A pattern that is a list
    or a tuple (or an instance of a subclass of either) has its items
    joined with self.separator; any other pattern is written as-is
    through "%s" formatting.

    Arguments:

    o pattern_list - The patterns to write. The items of a list/tuple
    pattern must be strings, since they are combined with str.join.

    o output_handle - An object with a write() method; it receives one
    newline-terminated string per pattern.
    """
    for pattern in pattern_list:
        # Use isinstance rather than an exact type comparison so that
        # list/tuple subclasses (e.g. namedtuples) are joined as well.
        # A tuple subclass reaching the else branch would be consumed as
        # the argument tuple of the % operator below and raise TypeError.
        if isinstance(pattern, (list, tuple)):
            string_pattern = self.separator.join(pattern)
        else:
            string_pattern = pattern

        output_handle.write("%s\n" % string_pattern)
53
def write_seq(self, seq_pattern_list, output_handle):
    """Convenience function to write Seq objects to a file.

    Every item is converted to a plain string and the resulting list is
    handed to self.write() together with output_handle.

    Arguments:

    o seq_pattern_list - An iterable of Seq and/or MutableSeq objects.

    o output_handle - The handle passed through to self.write().

    Raises ValueError for an item that is neither a MutableSeq nor a Seq;
    in that case nothing is written.
    """
    def _as_string(item):
        # MutableSeq is tested before Seq, and is converted to a Seq
        # first so that the string is always produced by Seq.tostring().
        if isinstance(item, MutableSeq):
            return item.toseq().tostring()
        if isinstance(item, Seq):
            return item.tostring()
        raise ValueError("Unexpected pattern type %r" % item)

    string_patterns = [_as_string(item) for item in seq_pattern_list]
    self.write(string_patterns, output_handle)
73
def read(self, input_handle):
    """Read patterns from the specified handle.

    One pattern is read per line, with trailing whitespace removed. A
    line containing self.separator is split on it and returned as a
    tuple of items; any other line is returned as a single string.

    Arguments:

    o input_handle - An object with a readline() method. Reading stops
    at the first falsy value returned by readline().

    Returns a list of the patterns in the order they were read.

    Raises ValueError if an alphabet was set and a pattern (or any item
    of a separated pattern) fails the alphabet verification.
    """
    collected = []

    while True:
        raw_line = input_handle.readline()
        if not raw_line:
            # A falsy line means there is nothing left to read.
            break

        entry = raw_line.rstrip()

        # Lines holding the separator describe multi-item patterns.
        has_separator = entry.find(self.separator) >= 0
        if has_separator:
            entry = tuple(entry.split(self.separator))

        if self._alphabet is not None:
            # Check every item on its own; a plain string pattern is
            # treated as a one-item pattern for this purpose.
            pieces = entry if has_separator else [entry]
            for piece in pieces:
                if not _verify_alphabet(Seq(piece, self._alphabet)):
                    raise ValueError("Pattern %s not matching alphabet %s"
                                     % (entry, self._alphabet))

        collected.append(entry)

    return collected
106
108 """This holds a list of specific patterns found in sequences.
109
110 This is designed to be a general holder for a set of patterns and
111 should be subclassed for specific implementations (ie. holding Motifs
112 or Signatures).
113 """
115 """Initialize a repository with patterns.
116
117 Arguments:
118
119 o pattern_info - A representation of all of the patterns found in
120 a *Finder search. This should be a dictionary, where the keys
121 are patterns, and the values are the number of times a pattern is
122 found.
123
124 The patterns are represented internally as a list of
125 two-tuples, where the first element is the number of times a pattern
126 occurs, and the second is the pattern itself. This makes it easy
127 to sort the list and return the top N patterns.
128 """
129 self._pattern_dict = pattern_info
130
131
132 self._pattern_list = []
133 for pattern_name in self._pattern_dict:
134 self._pattern_list.append((self._pattern_dict[pattern_name],
135 pattern_name))
136
137 self._pattern_list.sort()
138 self._pattern_list.reverse()
139
141 """Retrieve all of the patterns in the repository.
142 """
143 patterns = []
144 for pattern_info in self._pattern_list:
145 patterns.append(pattern_info[1])
146
147 return patterns
148
150 """Retrieve the specified number of patterns randomly.
151
152 Randomly selects patterns from the list and returns them.
153
154 Arguments:
155
156 o num_patterns - The total number of patterns to return.
157 """
158 all_patterns = []
159
160 while len(all_patterns) < num_patterns:
161
162 new_pattern_info = random.choice(self._pattern_list)
163
164 if new_pattern_info[1] not in all_patterns:
165 all_patterns.append(new_pattern_info[1])
166
167 return all_patterns
168
170 """Return a percentage of the patterns.
171
172 This returns the top 'percent' percentage of the patterns in the
173 repository.
174 """
175 all_patterns = self.get_all()
176
177 num_to_return = int(len(all_patterns) * percent)
178
179 return all_patterns[:num_to_return]
180
182 """Return the specified number of most frequently occurring patterns.
183
184 Arguments:
185
186 o num_patterns - The number of patterns to return.
187 """
188 all_patterns = []
189 for pattern_info in self._pattern_list[:num_patterns]:
190 all_patterns.append(pattern_info[1])
191
192 return all_patterns
193
195 """Retrieve patterns that are at the extreme ranges.
196
197 This returns both patterns at the top of the list (ie. the same as
198 returned by get_top) and at the bottom of the list. This
199 is especially useful for patterns that are the differences between
200 two sets of patterns.
201
202 Arguments:
203
204 o top_num - The number of patterns to take from the top of the list.
205
206 o bottom_num - The number of patterns to take from the bottom of
207 the list.
208 """
209 all_patterns = []
210
211 for pattern_info in self._pattern_list[:top_num]:
212 all_patterns.append(pattern_info[1])
213
214
215 for pattern_info in self._pattern_list[-bottom_num:]:
216 all_patterns.append(pattern_info[1])
217
218 return all_patterns
219
221 """Remove patterns which are likely due to polyA tails from the lists.
222
223 This is just a helper function to remove patterns which are likely
224 just due to polyA tails, and thus are not really great motifs.
225 This will also get rid of stuff like ATATAT, which might be a
226 useful motif, so use at your own discretion.
227
228 XXX Could we write a more general function, based on info content
229 or something like that?
230
231 Arguments:
232
233 o at_percentage - The percentage of A and T residues in a pattern
234 that qualifies it for being removed.
235 """
236 remove_list = []
237
238 for pattern_info in self._pattern_list:
239 pattern_at = float(pattern_info[1].count('A') + pattern_info[1].count('T')) / len(pattern_info[1])
240 if pattern_at > at_percentage:
241 remove_list.append(pattern_info)
242
243
244 for to_remove in remove_list:
245 self._pattern_list.remove(to_remove)
246
def count(self, pattern):
    """Return the number of times the specified pattern is found.

    Arguments:

    o pattern - The pattern to look up in the pattern dictionary.

    Returns the value stored for the pattern, or 0 when the lookup
    raises KeyError.
    """
    try:
        occurrences = self._pattern_dict[pattern]
    except KeyError:
        # Patterns without an entry count as never seen.
        occurrences = 0

    return occurrences
254