Package Bio :: Package PopGen :: Package GenePop :: Module FileParser
[hide private]
[frames] | no frames]

Source Code for Module Bio.PopGen.GenePop.FileParser

  1  # Copyright 2010 by Tiago Antao.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This class provides code to parse BIG GenePop files. 
  8   
  9  The difference between this class and the standard Bio.PopGen.GenePop.Record 
 10  class is that this one does not read the whole file to memory. 
 11  It provides an iterator interface, slower but consuming much less memory.
 12  Should be used with big files (Thousands of markers and individuals). 
 13   
 14  See http://wbiomed.curtin.edu.au/genepop/ , the format is documented 
 15  here: http://wbiomed.curtin.edu.au/genepop/help_input.html . 
 16   
 17  Classes: 
 18  FileRecord           Holds GenePop data. 
 19   
 20  Functions: 
 21   
 22   
 23  """ 
 24  from copy import deepcopy 
 25  from Bio.PopGen.GenePop import get_indiv 
 26   
def read(fname):
    """Build a FileRecord for a GenePop file.

    fname is the name of a file that contains a GenePop record.
    The freshly constructed FileRecord is returned to the caller.
    """
    return FileRecord(fname)
34 35
class FileRecord:
    """Holds information from a GenePop record.

    Only the header is read when the record is created; individuals are
    pulled from the underlying file one at a time.

    Members:
    comment_line    Comment line (first line of the file).

    loci_list       List of loci names.

    fname           Name of the file being parsed.

    current_pop     Index of the population currently being read.

    current_ind     Number of individuals already read in that population.

    Functions:
    get_individual  Returns the next individual of the current population.

    skip_population Skips the current population.

    close           Closes the underlying file.

    skip_population skips the individuals of the current population, returns
    True if there are more populations.

    get_individual returns an individual of the current population, True
    when the population ended but another one follows, or False at the end
    of the file.
    Each individual is a pair composed by individual
    name and a list of alleles (2 per marker or 1 for haploid data).
    Examples
    ('Ind1', [(1,2),    (3,3), (200,201)])
    ('Ind2', [(2,None), (3,3), (None,None)])
    ('Other1', [(1,1),  (4,3), (200,200)])

    The record keeps a file open; call close() (or use the record in a
    ``with`` statement) when done with it.
    """

    def __init__(self, fname):
        """Open fname and parse its header (comment line and loci names)."""
        self.comment_line = ""
        self.loci_list = []
        self.fname = fname
        self._handle = None
        self.start_read()

    def __enter__(self):
        """Allow ``with FileRecord(fname) as rec:`` usage."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the underlying file when leaving a ``with`` block."""
        self.close()

    def close(self):
        """Close the underlying file. Safe to call more than once."""
        handle = getattr(self, "_handle", None)
        if handle is not None:
            handle.close()

    @staticmethod
    def _format_individual(name, markers, marker_len=3, skip=()):
        """Return one GenePop data line (newline included) for an individual.

        name       - individual name
        markers    - sequence of allele tuples, one tuple per marker
        marker_len - alleles are left-padded with '0' to this width
        skip       - container of marker positions to leave out
        """
        parts = [name, ","]
        for pos, marker in enumerate(markers):
            if pos in skip:
                continue
            parts.append(" ")
            for al in marker:
                # Missing data (None) is encoded as an all-zero allele.
                allele = "0" if al is None else str(al)
                parts.append(allele.rjust(marker_len, "0"))
        parts.append("\n")
        return "".join(parts)

    def __str__(self):
        """Returns (reconstructs) a GenePop textual representation.

        This might take a lot of memory.
        Marker length will be 3.
        The read position held before the call is restored afterwards.
        """
        marker_len = 3
        rep = [self.comment_line + '\n']
        rep.append('\n'.join(self.loci_list) + '\n')
        # Remember where we were: the whole file is re-read from the top.
        current_pop = self.current_pop
        current_ind = self.current_ind
        self._handle.seek(0)
        self.skip_header()
        rep.append('Pop\n')
        while True:
            res = self.get_individual()
            if res is True:
                rep.append('Pop\n')
            elif res is False:
                break
            else:
                name, markers = res
                rep.append(self._format_individual(name, markers, marker_len))
        self.seek_position(current_pop, current_ind)
        return "".join(rep)

    def start_read(self):
        """Starts parsing a file containing a GenePop file.

        Reads the comment line and the loci names and leaves the file
        positioned just after the first POP line.

        Raises ValueError if no POP line is found.
        """
        # Do not leak a handle left over from a previous start_read().
        self.close()
        self._handle = open(self.fname)
        self.comment_line = self._handle.readline().rstrip()
        # We can now have one loci per line or all loci in a single line
        # separated by either space or comma+space...
        # We will remove all commas on loci... that should not be a problem
        sample_loci_line = self._handle.readline().rstrip().replace(',', '')
        all_loci = sample_loci_line.split(' ')
        self.loci_list.extend(all_loci)
        for line in self._handle:
            line = line.rstrip()
            if line.upper() == 'POP':
                break
            self.loci_list.append(line)
        else:
            # Reached end of file without a POP line: release the handle
            # before reporting the problem.
            self.close()
            raise ValueError('No population data found, file probably not GenePop related')
        self.current_pop = 0
        self.current_ind = 0

    def skip_header(self):
        """Skips the Header. To be done after a re-open (or a seek(0))."""
        self.current_pop = 0
        self.current_ind = 0
        for line in self._handle:
            if line.rstrip().upper() == "POP":
                return

    def seek_position(self, pop, indiv):
        """Seeks a certain position in the file.

        pop   - pop position (0 is first)
        indiv - individual in pop
        """
        self._handle.seek(0)
        self.skip_header()
        while pop > 0:
            self.skip_population()
            pop -= 1
        while indiv > 0:
            self.get_individual()
            indiv -= 1

    def skip_population(self):
        """Skips the current population.

        Returns True if there is another pop, False at the end of the file.
        """
        for line in self._handle:
            if line.rstrip().upper() == 'POP':
                self.current_pop += 1
                self.current_ind = 0
                return True
        # Iteration ended without meeting another POP line: end of file.
        return False

    def get_individual(self):
        """Gets the next individual.

        Returns individual information if there are more individuals
        in the current population.
        Returns True if there are no more individuals in the current
        population, but there are more populations. Next read will
        be of the following pop.
        Returns False if at end of file.
        """
        # At most one line is consumed per call; the loop only serves to
        # detect the end of the file.
        for line in self._handle:
            line = line.rstrip()
            if line.upper() == 'POP':
                self.current_pop += 1
                self.current_ind = 0
                return True
            self.current_ind += 1
            indiv_name, allele_list, ignore = get_indiv(line)
            return (indiv_name, allele_list)
        return False

    def remove_population(self, pos, fname):
        """Removes a population (by position).

        pos   - position
        fname - file to be created with population removed

        The source file is re-read through a separate record, so the read
        position of this record is not disturbed.
        """
        old_rec = FileRecord(self.fname)
        try:
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for locus in old_rec.loci_list:
                    f.write(locus + "\n")
                curr_pop = 0
                l_parser = old_rec.get_individual()
                start_pop = True
                while l_parser:
                    if l_parser is True:
                        # Population boundary. Handled first so that an
                        # empty population at pos does not cause the
                        # following one to be skipped instead.
                        curr_pop += 1
                        start_pop = True
                    elif curr_pop == pos:
                        # First individual of the unwanted population was
                        # just read; discard the rest of it.
                        old_rec.skip_population()
                        curr_pop += 1
                    else:
                        if start_pop:
                            f.write("POP\n")
                            start_pop = False
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers))
                    l_parser = old_rec.get_individual()
        finally:
            old_rec.close()

    def _remove_loci(self, positions, fname):
        """Write a copy of the file to fname without the loci at positions.

        positions - iterable of locus positions (negative values count
                    from the end); it is not modified
        fname     - file to be created

        Raises IndexError, before fname is created, if a position is out
        of range.
        """
        old_rec = FileRecord(self.fname)
        try:
            loci_list = old_rec.loci_list
            n_loci = len(loci_list)
            dropped = set()
            for pos in positions:
                if not -n_loci <= pos < n_loci:
                    raise IndexError("locus position out of range: %r" % (pos,))
                # Normalise so header and marker columns always agree.
                dropped.add(pos % n_loci)
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for index, locus in enumerate(loci_list):
                    if index not in dropped:
                        f.write(locus + "\n")
                f.write("POP\n")
                l_parser = old_rec.get_individual()
                while l_parser:
                    if l_parser is True:
                        f.write("POP\n")
                    else:
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers,
                                                        skip=dropped))
                    l_parser = old_rec.get_individual()
        finally:
            old_rec.close()

    def remove_locus_by_position(self, pos, fname):
        """Removes a locus by position.

        pos   - position
        fname - file to be created with locus removed

        Raises IndexError if pos is out of range.
        """
        self._remove_loci([pos], fname)

    def remove_loci_by_position(self, positions, fname):
        """Removes a set of loci by position.

        positions - positions (left unmodified; duplicates are ignored)
        fname     - file to be created with locus removed

        Raises IndexError if any position is out of range.
        """
        self._remove_loci(positions, fname)

    def remove_locus_by_name(self, name, fname):
        """Removes a locus by name.

        name  - name
        fname - file to be created with locus removed

        Only the first locus called name is removed. If no locus has that
        name nothing is written and no error is raised.
        """
        for i, locus in enumerate(self.loci_list):
            if locus == name:
                self.remove_locus_by_position(i, fname)
                return
        # If here then locus not existent... Maybe raise exception?
        # Although it should be Ok... Just a boolean return, maybe?

    def remove_loci_by_name(self, names, fname):
        """Removes a loci list (by name).

        names - names
        fname - file to be created with loci removed

        Names that match no locus are ignored; the output file is always
        written.
        """
        positions = [i for i, locus in enumerate(self.loci_list)
                     if locus in names]
        self.remove_loci_by_position(positions, fname)