1
2
3
4
5
6 """
7 This class provides code to parse BIG GenePop files.
8
9 The difference between this class and the standard Bio.PopGen.GenePop.Record
10 class is that this one does not read the whole file to memory.
11 It provides an iterator interface, slower but consuming much less memory.
12 Should be used with big files (Thousands of markers and individuals).
13
14 See http://wbiomed.curtin.edu.au/genepop/ , the format is documented
15 here: http://wbiomed.curtin.edu.au/genepop/help_input.html .
16
17 Classes:
18 FileRecord Holds GenePop data.
19
20 Functions:
21 read Parses a GenePop file, returning a FileRecord.
22
23 """
24 from copy import deepcopy
25 from Bio.PopGen.GenePop import get_indiv
26
def read(fname):
    """Parses a file containing a GenePop file.

    fname is a file name that contains a GenePop record.

    Returns a FileRecord bound to that file. Building the record opens the
    file and reads only its header (comment line and loci names);
    individuals are read on demand through the record's methods.
    """
    return FileRecord(fname)
34
35
class FileRecord:
    """Holds information from a GenePop record.

    The record keeps an open handle on the file and reads individuals on
    demand instead of loading everything into memory.

    Members:
    comment_line    Comment line (first line of the file).

    loci_list       List of loci names.

    fname           Name of the file being parsed.

    current_pop     Position (0 based) of the population being read.

    current_ind     Number of individuals already read in that population.

    Functions:
    get_individual  Returns the next individual of the current population.

    skip_population Skips the current population.

    close           Closes the underlying file handle.

    skip_population skips the individuals of the current population, returns
    True if there are more populations.

    get_individual returns an individual of the current population, True
    when the population ended but another one follows, or False at the end
    of the file.
    Each individual is a pair composed by individual
    name and a list of alleles (2 per marker or 1 for haploid data).
    Examples
    ('Ind1', [(1,2), (3,3), (200,201)])
    ('Ind2', [(2,None), (3,3), (None,None)])
    ('Other1', [(1,1), (4,3), (200,200)])

    Textual output (str() and the remove_* methods) always uses 3 digits
    per allele.
    """

    def __init__(self, fname):
        """Creates the record and parses the header of fname.

        fname - name of a file containing a GenePop record
        """
        self.comment_line = ""
        self.loci_list = []
        self.fname = fname
        self.start_read()

    def __str__(self):
        """Returns (reconstructs) a GenePop textual representation.

        This might take a lot of memory.
        Marker length will be 3.

        The whole file is re-read from the start; afterwards the reading
        position (population and individual) is restored.
        """
        saved_pop = self.current_pop
        saved_ind = self.current_ind
        rep = [self.comment_line + '\n',
               '\n'.join(self.loci_list) + '\n']
        self._handle.seek(0)
        self.skip_header()
        rep.append('Pop\n')
        while True:
            res = self.get_individual()
            if res is False:
                # End of file.
                break
            if res is True:
                # A new population starts.
                rep.append('Pop\n')
            else:
                name, markers = res
                rep.append(self._format_individual(name, markers))
        self.seek_position(saved_pop, saved_ind)
        return "".join(rep)

    @staticmethod
    def _format_individual(name, markers, skip=(), marker_len=3):
        """Formats one individual as a GenePop data line (with newline).

        name - individual name
        markers - list of allele tuples
        skip - marker positions to leave out of the line
        marker_len - digits per allele (zero padded on the left)

        Missing alleles (None) are written as zeros.
        """
        parts = [name, ',']
        for idx, marker in enumerate(markers):
            if idx in skip:
                continue
            parts.append(' ')
            for al in marker:
                if al is None:
                    al = '0'
                parts.append(str(al).rjust(marker_len, '0'))
        parts.append('\n')
        return "".join(parts)

    def close(self):
        """Closes the underlying file handle (safe to call repeatedly)."""
        handle = getattr(self, '_handle', None)
        if handle is not None:
            handle.close()

    def start_read(self):
        """Starts parsing a file containing a GenePop file.

        Opens the file, reads the comment line and the loci names and
        stops right after the first POP line, so the next read returns
        the first individual of the first population.

        Raises ValueError if no POP line is found.
        """
        self._handle = open(self.fname)
        self.comment_line = self._handle.readline().rstrip()

        # The line after the comment may hold a single locus name or
        # several names separated by commas (with or without spaces).
        # Commas become spaces so "a,b" is not merged into "ab".
        first_loci_line = self._handle.readline().rstrip()
        self.loci_list.extend(first_loci_line.replace(',', ' ').split())
        for line in self._handle:
            line = line.rstrip()
            if line.upper() == 'POP':
                break
            self.loci_list.append(line)
        else:
            # Do not leak the handle when the file is rejected.
            self._handle.close()
            raise ValueError('No population data found, file probably not GenePop related')

        self.current_pop = 0
        self.current_ind = 0

    def skip_header(self):
        """Skips the Header. To be done after a re-open."""
        self.current_pop = 0
        self.current_ind = 0
        for line in self._handle:
            if line.rstrip().upper() == "POP":
                return

    def seek_position(self, pop, indiv):
        """Seeks a certain position in the file.

        pop - pop position (0 is first)
        indiv - individual in pop
        """
        self._handle.seek(0)
        self.skip_header()
        for _ in range(pop):
            self.skip_population()
        for _ in range(indiv):
            self.get_individual()

    def skip_population(self):
        """Skips the current population.

        Returns True if there is another pop, False at the end of file.
        """
        for line in self._handle:
            if line.rstrip().upper() == 'POP':
                self.current_pop += 1
                self.current_ind = 0
                return True
        return False

    def get_individual(self):
        """Gets the next individual.

        Returns individual information if there are more individuals
        in the current population.
        Returns True if there are no more individuals in the current
        population, but there are more populations. Next read will
        be of the following pop.
        Returns False if at end of file.
        """
        # The loop body always returns, so at most one line is consumed;
        # iteration is used so that end of file simply falls through.
        for line in self._handle:
            line = line.rstrip()
            if line.upper() == 'POP':
                self.current_pop += 1
                self.current_ind = 0
                return True
            self.current_ind += 1
            indiv_name, allele_list, ignore = get_indiv(line)
            return (indiv_name, allele_list)
        return False

    def remove_population(self, pos, fname):
        """Removes a population (by position).

        pos - position
        fname - file to be created with population removed

        The source file is re-read through a separate record, so the
        reading position of this record is not changed.
        """
        old_rec = FileRecord(self.fname)
        try:
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for locus in old_rec.loci_list:
                    f.write(locus + "\n")
                curr_pop = 0
                start_pop = True
                l_parser = old_rec.get_individual()
                while l_parser:
                    if curr_pop == pos:
                        # If the read hit a POP line the unwanted
                        # population was empty and is already behind us;
                        # skipping here would drop the next population.
                        if l_parser is not True:
                            old_rec.skip_population()
                        curr_pop += 1
                    elif l_parser is True:
                        curr_pop += 1
                        start_pop = True
                    else:
                        # POP is written lazily, only when a population
                        # actually contributes an individual.
                        if start_pop:
                            f.write("POP\n")
                            start_pop = False
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers))
                    l_parser = old_rec.get_individual()
        finally:
            old_rec.close()

    def remove_locus_by_position(self, pos, fname):
        """Removes a locus by position.

        pos - position
        fname - file to be created with locus removed
        """
        self.remove_loci_by_position([pos], fname)

    def remove_loci_by_position(self, positions, fname):
        """Removes a set of loci by position.

        positions - positions
        fname - file to be created with locus removed

        positions is not modified. Duplicated positions are removed once
        and negative positions count from the end of the loci list.
        Raises IndexError if a position is out of range.
        """
        old_rec = FileRecord(self.fname)
        try:
            loci_list = old_rec.loci_list
            n_loci = len(loci_list)
            to_drop = set(p + n_loci if p < 0 else p for p in positions)
            # Delete from the end so earlier indexes stay valid.
            for pos in sorted(to_drop, reverse=True):
                del loci_list[pos]
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for locus in loci_list:
                    f.write(locus + "\n")
                f.write("POP\n")
                l_parser = old_rec.get_individual()
                while l_parser:
                    if l_parser is True:
                        f.write("POP\n")
                    else:
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers,
                                                        to_drop))
                    l_parser = old_rec.get_individual()
        finally:
            old_rec.close()

    def remove_locus_by_name(self, name, fname):
        """Removes a locus by name.

        name - name
        fname - file to be created with locus removed

        Only the first locus with that name is removed. If no locus has
        that name nothing is done and no file is created.
        """
        for i, locus in enumerate(self.loci_list):
            if locus == name:
                self.remove_locus_by_position(i, fname)
                return

    def remove_loci_by_name(self, names, fname):
        """Removes a loci list (by name).

        names - names
        fname - file to be created with loci removed
        """
        positions = [i for i, locus in enumerate(self.loci_list)
                     if locus in names]
        self.remove_loci_by_position(positions, fname)
338
339
340