Package Bio :: Package UniGene
[hide private]
[frames | no frames]

Source Code for Package Bio.UniGene

  1  # Copyright 2006 by Sean Davis.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # $Id: __init__.py,v 1.12 2009-04-24 12:03:45 mdehoon Exp $ 
  7  # Sean Davis <sdavis2 at mail dot nih dot gov> 
  8  # National Cancer Institute 
  9  # National Institutes of Health 
 10  # Bethesda, MD, USA 
 11  # 
 12   
 13  """Parse Unigene flat file format files such as the Hs.data file. 
 14   
 15  Here is an overview of the flat file format that this parser deals with: 
 16     Line types/qualifiers: 
 17   
 18         ID           UniGene cluster ID 
 19         TITLE        Title for the cluster 
 20         GENE         Gene symbol 
 21         CYTOBAND     Cytological band 
 22         EXPRESS      Tissues of origin for ESTs in cluster 
 23         RESTR_EXPR   Single tissue or development stage contributes  
 24                      more than half the total EST frequency for this gene. 
 25         GNM_TERMINUS genomic confirmation of presence of a 3' terminus;  
 26                      T if a non-templated polyA tail is found among  
 27                      a cluster's sequences; else 
 28                      I if templated As are found in genomic sequence or 
 29                      S if a canonical polyA signal is found on  
 30                        the genomic sequence 
 31         GENE_ID      Entrez gene identifier associated with at least one 
 32                      sequence in this cluster;  
 33                      to be used instead of LocusLink.   
 34         LOCUSLINK    LocusLink identifier associated with at least one 
 35                      sequence in this cluster;   
 36                      deprecated in favor of GENE_ID 
 37         HOMOL        Homology; 
 38         CHROMOSOME   Chromosome.  For plants, CHROMOSOME refers to mapping 
 39                      on the arabidopsis genome. 
 40         STS          STS 
 41              ACC=         GenBank/EMBL/DDBJ accession number of STS 
 42                           [optional field] 
 43              UNISTS=      identifier in NCBI's UNISTS database 
 44         TXMAP        Transcript map interval 
 45              MARKER=      Marker found on at least one sequence in this 
 46                           cluster 
 47              RHPANEL=     Radiation Hybrid panel used to place marker 
 48         PROTSIM      Protein Similarity data for the sequence with 
 49                      highest-scoring protein similarity in this cluster 
 50              ORG=         Organism 
 51              PROTGI=      Sequence GI of protein 
 52              PROTID=      Sequence ID of protein 
 53              PCT=         Percent alignment 
 54              ALN=         length of aligned region (aa) 
 55         SCOUNT       Number of sequences in the cluster 
 56         SEQUENCE     Sequence 
 57              ACC=         GenBank/EMBL/DDBJ accession number of sequence 
 58              NID=         Unique nucleotide sequence identifier (gi) 
 59              PID=         Unique protein sequence identifier (used for 
 60                           non-ESTs) 
 61              CLONE=       Clone identifier (used for ESTs only) 
 62              END=         End (5'/3') of clone insert read (used for 
 63                           ESTs only)  
 64              LID=         Library ID; see Hs.lib.info for library name 
 65                           and tissue 
 66              MGC=         5' CDS-completeness indicator; if present, the 
 67                           clone associated with this sequence is believed 
 68                           CDS-complete. A value greater than 511 is the gi 
 69                           of the CDS-complete mRNA matched by the EST, 
 70                           otherwise the value is an indicator of the 
 71                           reliability of the test indicating CDS 
 72                           completeness; higher values indicate more 
 73                           reliable CDS-completeness predictions.  
 74             SEQTYPE=      Description of the nucleotide sequence. 
 75                           Possible values are mRNA, EST and HTC. 
 76             TRACE=        The Trace ID of the EST sequence, as provided by 
 77                           NCBI Trace Archive 
 78  """ 
 79   
 80   
class SequenceLine:
    """Store the information for one SEQUENCE line from a Unigene file.

    Initialize with the text part of the SEQUENCE line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ACC=         GenBank/EMBL/DDBJ accession number of sequence
    NID=         Unique nucleotide sequence identifier (gi)
    PID=         Unique protein sequence identifier (used for non-ESTs)
    CLONE=       Clone identifier (used for ESTs only)
    END=         End (5'/3') of clone insert read (used for ESTs only)
    LID=         Library ID; see Hs.lib.info for library name and tissue
    MGC=         5' CDS-completeness indicator; if present,
                 the clone associated with this sequence
                 is believed CDS-complete. A value greater than 511
                 is the gi of the CDS-complete mRNA matched by the EST,
                 otherwise the value is an indicator of the reliability
                 of the test indicating CDS completeness;
                 higher values indicate more reliable CDS-completeness
                 predictions.
    SEQTYPE=     Description of the nucleotide sequence. Possible values
                 are mRNA, EST and HTC.
    TRACE=       The Trace ID of the EST sequence, as provided by NCBI
                 Trace Archive

    In addition, ``text`` holds the raw line text ('' when the object was
    created without text), ``is_image`` is True when the CLONE value starts
    with "IMAGE", and ``image`` then holds the clone value from its seventh
    character onwards.
    """

    def __init__(self, text=None):
        """Set every field to its empty default, then parse text if given."""
        self.acc = ''
        self.nid = ''
        self.lid = ''
        self.pid = ''
        self.clone = ''
        self.image = ''
        self.is_image = False
        self.end = ''
        self.mgc = ''
        self.seqtype = ''
        self.trace = ''
        # Always define text so that repr() works on an empty instance.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Split text on '; ' and store each KEY=value pair as self.key.

        Raises ValueError if a non-empty field contains no '='.
        """
        for part in text.split('; '):
            if not part:
                continue
            # Split on the first '=' only, so values may contain '='.
            key, val = part.split("=", 1)
            if key == 'CLONE' and val[:5] == 'IMAGE':
                self.is_image = True
                # Skip the 5-letter "IMAGE" prefix and its separator.
                self.image = val[6:]
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the raw text this object was initialized with."""
        return self.text
135 136
class ProtsimLine:
    """Store the information for one PROTSIM line from a Unigene file.

    Initialize with the text part of the PROTSIM line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ORG=         Organism
    PROTGI=      Sequence GI of protein
    PROTID=      Sequence ID of protein
    PCT=         Percent alignment
    ALN=         length of aligned region (aa)

    ``text`` holds the raw line text ('' when created without text).
    """

    def __init__(self, text=None):
        """Set every field to its empty default, then parse text if given."""
        self.org = ''
        self.protgi = ''
        self.protid = ''
        self.pct = ''
        self.aln = ''
        # Always define text so that repr() works on an empty instance.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Split text on '; ' and store each KEY=value pair as self.key.

        Raises ValueError if a non-empty field contains no '='.
        """
        for part in text.split('; '):
            if not part:
                continue
            # Split on the first '=' only, so values may contain '='.
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the raw text this object was initialized with."""
        return self.text
169 170
class STSLine:
    """Store the information for one STS line from a Unigene file.

    Initialize with the text part of the STS line, or nothing.

    Attributes and descriptions (access as LOWER CASE)

    ACC=         GenBank/EMBL/DDBJ accession number of STS [optional field]
    UNISTS=      identifier in NCBI's UNISTS database

    ``text`` holds the raw line text ('' when created without text).
    """

    def __init__(self, text=None):
        """Set every field to its empty default, then parse text if given."""
        self.acc = ''
        self.unists = ''
        # Always define text so that repr() works on an empty instance.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Split text on whitespace and store each KEY=value pair as self.key.

        Raises ValueError if a field contains no '='.
        """
        # split() with no argument collapses runs of whitespace, so repeated
        # or trailing spaces do not produce empty fields.
        for part in text.split():
            # Split on the first '=' only, so values may contain '='.
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the raw text this object was initialized with."""
        return self.text
198 199
class Record:
    """Container for the contents of one Unigene record.

    A new instance starts with these attributes:

    ID            ''   ID line
    species       ''   Hs, Bt, etc.
    title         ''   TITLE line
    symbol        ''   GENE line
    cytoband      ''   CYTOBAND line
    express       []   EXPRESS line, as a list of strings
    restr_expr    ''   RESTR_EXPR line
    gnm_terminus  ''   GNM_TERMINUS line
    gene_id       ''   GENE_ID line
    locuslink     ''   LOCUSLINK line
    homol         ''   HOMOL line
    chromosome    ''   CHROMOSOME line
    protsim       []   PROTSIM entries (ProtsimLine objects)
    sequence      []   SEQUENCE entries (SequenceLine objects)
    sts           []   STS entries (STSLine objects)
    txmap         []   TXMAP entries
    """

    def __init__(self):
        """Create an empty record with every field at its default."""
        # (attribute name, factory for its empty default), in the order
        # the attributes are created.
        layout = (
            ("ID", str),
            ("species", str),
            ("title", str),
            ("symbol", str),
            ("cytoband", str),
            ("express", list),
            ("restr_expr", str),
            ("gnm_terminus", str),
            ("gene_id", str),
            ("locuslink", str),
            ("homol", str),
            ("chromosome", str),
            ("protsim", list),
            ("sequence", list),
            ("sts", list),
            ("txmap", list),
        )
        for attribute, make_default in layout:
            setattr(self, attribute, make_default())

    def __repr__(self):
        """Return '<ClassName> ID symbol' followed by the title on a new line."""
        shown = (self.__class__.__name__, self.ID, self.symbol, self.title)
        return "<%s> %s %s\n%s" % shown
248
def parse(handle):
    """Iterate over the Unigene records in handle.

    This is a generator: each record produced by _read(handle) is yielded
    in turn, and iteration stops as soon as _read returns a false value.
    """
    current = _read(handle)
    while current:
        yield current
        current = _read(handle)
255 256
def read(handle):
    """Read exactly one Unigene record from handle and return it.

    Raises ValueError if the handle contains no record, or if anything
    other than whitespace follows the first record.
    """
    record = _read(handle)
    if not record:
        raise ValueError("No UniGene record found")
    # We should have reached the end of the record by now; trailing blank
    # lines do not count as a second record.
    remainder = handle.read()
    if remainder.strip():
        raise ValueError("More than one UniGene record found")
    return record
266 267 268 # Everything below is private 269 270
def _read(handle):
    """Read the next Unigene record from handle and return it as a Record.

    Lines are consumed up to and including the '//' terminator.  The first
    12 characters of each line hold the tag; the rest of the line holds the
    value.  Blank lines are skipped.  Returns None if the handle is
    exhausted before any record starts.

    Raises ValueError on an unknown tag, on a field line that appears
    before the ID line, on a HOMOL value other than YES/NO, on a missing,
    non-integer or mismatched SCOUNT, and when the stream ends in the
    middle of a record.
    """
    UG_INDENT = 12
    record = None
    scount = None
    for line in handle:
        tag, value = line[:UG_INDENT].rstrip(), line[UG_INDENT:].rstrip()
        line = line.rstrip()
        if not line:
            continue
        if record is None and tag != "ID":
            # Every other branch needs a record to store into.
            raise ValueError("Found %s line before the ID line" % tag)
        if tag == "ID":
            record = Record()
            record.ID = value
            record.species = record.ID.split('.')[0]
        elif tag == "TITLE":
            record.title = value
        elif tag == "GENE":
            record.symbol = value
        elif tag == "GENE_ID":
            record.gene_id = value
        elif tag == "LOCUSLINK":
            record.locuslink = value
        elif tag == "HOMOL":
            if value == "YES":
                record.homol = True
            elif value == "NO":
                record.homol = False
            else:
                raise ValueError("Cannot parse HOMOL line %s" % line)
        elif tag == "EXPRESS":
            record.express = [word.strip() for word in value.split("|")]
        elif tag == "RESTR_EXPR":
            record.restr_expr = [word.strip() for word in value.split("|")]
        elif tag == "GNM_TERMINUS":
            record.gnm_terminus = value
        elif tag == "CHROMOSOME":
            record.chromosome = value
        elif tag == "CYTOBAND":
            record.cytoband = value
        elif tag == "PROTSIM":
            record.protsim.append(ProtsimLine(value))
        elif tag == "SCOUNT":
            scount = int(value)
        elif tag == "SEQUENCE":
            record.sequence.append(SequenceLine(value))
        elif tag == "STS":
            record.sts.append(STSLine(value))
        elif tag == "TXMAP":
            # Stored as the raw text of the entry.
            record.txmap.append(value)
        elif tag == '//':
            if scount is None:
                raise ValueError("No SCOUNT line found in record %s"
                                 % record.ID)
            if len(record.sequence) != scount:
                raise ValueError("The number of sequences specified in the record (%d) does not agree with the number of sequences found (%d)" % (scount, len(record.sequence)))
            return record
        else:
            raise ValueError("Unknown tag %s" % tag)
    if record:
        raise ValueError("Unexpected end of stream.")
    return None
323 324 325 # Everything below is deprecated 326 327 328 from Bio.ParserSupport import * 329 import re 330 import Bio 331 332 # 333 # CONSTANTS 334 # 335 UG_INDENT=12 336
class UnigeneSequenceRecord:
    """Store the information for one SEQUENCE line from a Unigene file
    (DEPRECATED).

    Initialize with the text part of the SEQUENCE line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ACC=         GenBank/EMBL/DDBJ accession number of sequence
    NID=         Unique nucleotide sequence identifier (gi)
    PID=         Unique protein sequence identifier (used for non-ESTs)
    CLONE=       Clone identifier (used for ESTs only)
    END=         End (5'/3') of clone insert read (used for ESTs only)
    LID=         Library ID; see Hs.lib.info for library name and tissue
    MGC=         5' CDS-completeness indicator; if present,
                 the clone associated with this sequence
                 is believed CDS-complete. A value greater than 511
                 is the gi of the CDS-complete mRNA matched by the EST,
                 otherwise the value is an indicator of the reliability
                 of the test indicating CDS completeness;
                 higher values indicate more reliable CDS-completeness
                 predictions.
    SEQTYPE=     Description of the nucleotide sequence. Possible values are
                 mRNA, EST and HTC.
    TRACE=       The Trace ID of the EST sequence, as provided by NCBI
                 Trace Archive
    PERIPHERAL=  Indicator that the sequence is a suboptimal
                 representative of the gene represented by this cluster.
                 Peripheral sequences are those that are in a cluster
                 which represents a spliced gene without sharing a
                 splice junction with any other sequence.  In many
                 cases, they are unspliced transcripts originating
                 from the gene.

    This class is DEPRECATED; please use the read() function in this module
    instead.
    """

    def __init__(self, text=None):
        """Warn about the deprecation, set defaults, and parse text if given."""
        import warnings
        warnings.warn("Bio.UniGene.UnigeneSequenceRecord is deprecated; "
                      "please use the read() function in this module instead",
                      Bio.BiopythonDeprecationWarning)
        self.acc = ''
        self.nid = ''
        self.lid = ''
        self.pid = ''
        self.clone = ''
        self.image = ''
        self.is_image = False
        self.end = ''
        self.mgc = ''
        self.seqtype = ''
        # Parsed fields are stored lower-case, so the default must be too.
        self.trace = ''
        # Old capitalized spelling kept so existing attribute access works.
        self.Trace = ''
        self.peripheral = ''
        # Always define text so that repr() works on an empty instance.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Split text on '; ' and store each KEY=value pair as self.key.

        Raises ValueError if a field does not look like KEY=value.
        """
        for part in text.split('; '):
            match = re.match(r'(\w+)=(\S+)', part)
            if match is None:
                raise ValueError("Cannot parse SEQUENCE field %r" % part)
            key, val = match.groups()
            if key == 'CLONE' and val[:5] == 'IMAGE':
                self.is_image = True
                # Skip the 5-letter "IMAGE" prefix and its separator.
                self.image = val[6:]
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the raw text this object was initialized with."""
        return self.text
403 404
class UnigeneProtsimRecord:
    """Store the information for one PROTSIM line from a Unigene file
    (DEPRECATED).

    Initialize with the text part of the PROTSIM line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ORG=         Organism
    PROTGI=      Sequence GI of protein
    PROTID=      Sequence ID of protein
    PCT=         Percent alignment
    ALN=         length of aligned region (aa)

    This class is DEPRECATED; please use the read() function in this module
    instead.
    """

    def __init__(self, text=None):
        """Warn about the deprecation, set defaults, and parse text if given."""
        import warnings
        warnings.warn("Bio.UniGene.UnigeneProtsimRecord is deprecated; "
                      "please use the read() function in this module instead",
                      Bio.BiopythonDeprecationWarning)
        self.org = ''
        self.protgi = ''
        self.protid = ''
        self.pct = ''
        self.aln = ''
        # Always define text so that repr() works on an empty instance.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Split text on '; ' and store each KEY=value pair as self.key.

        Raises ValueError if a field does not look like KEY=value.
        """
        for part in text.split('; '):
            match = re.match(r'(\w+)=(\S+)', part)
            if match is None:
                raise ValueError("Cannot parse PROTSIM field %r" % part)
            key, val = match.groups()
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the raw text this object was initialized with."""
        return self.text
443 444
class UnigeneSTSRecord:
    """Store the information for one STS line from a Unigene file
    (DEPRECATED).

    Initialize with the text part of the STS line, or nothing.

    Attributes and descriptions (access as LOWER CASE)

    NAME=        Name of STS
    ACC=         GenBank/EMBL/DDBJ accession number of STS [optional field]
    DSEG=        GDB Dsegment number [optional field]
    UNISTS=      identifier in NCBI's UNISTS database

    This class is DEPRECATED; please use the read() function in this module
    instead.
    """

    def __init__(self, text=None):
        """Warn about the deprecation, set defaults, and parse text if given."""
        import warnings
        warnings.warn("Bio.UniGene.UnigeneSTSRecord is deprecated; "
                      "please use the read() function in this module instead",
                      Bio.BiopythonDeprecationWarning)
        self.name = ''
        self.acc = ''
        self.dseg = ''
        self.unists = ''
        # Always define text so that repr() works on an empty instance.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Split text on whitespace and store each KEY=value pair as self.key.

        Raises ValueError if a field does not look like KEY=value.
        """
        # split() with no argument collapses runs of whitespace, so repeated
        # or trailing spaces do not produce empty fields.
        for part in text.split():
            match = re.match(r'(\w+)=(\S+)', part)
            if match is None:
                raise ValueError("Cannot parse STS field %r" % part)
            key, val = match.groups()
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the raw text this object was initialized with."""
        return self.text
482 483
class UnigeneRecord:
    """Container for one Unigene record (DEPRECATED).

    A new instance starts with these attributes:

    ID            ''   ID line
    species       ''   Hs, Bt, etc.
    title         ''   TITLE line
    symbol        ''   GENE line
    cytoband      ''   CYTOBAND line
    express       []   EXPRESS line, parsed on ';' into a list of strings
    restr_expr    ''   RESTR_EXPR line
    gnm_terminus  ''   GNM_TERMINUS line
    gene_id       ''   GENE_ID line
    chromosome    ''   CHROMOSOME
    protsim       []   PROTSIM entries (UnigeneProtsimRecord objects)
    sequence      []   SEQUENCE entries (UnigeneSequenceRecord objects)
    sts           []   STS entries (UnigeneSTSRecord objects)
    txmap         []   TXMAP entries

    This class is DEPRECATED; please use the read() function in this module
    instead.
    """

    def __init__(self):
        """Issue the deprecation warning and create an empty record."""
        import warnings
        message = ("Bio.UniGene.UnigeneRecord is deprecated; "
                   "please use the read() function in this module instead")
        warnings.warn(message, Bio.BiopythonDeprecationWarning)
        # (attribute name, factory for its empty default), in the order
        # the attributes are created.
        layout = (
            ("ID", str),
            ("species", str),
            ("title", str),
            ("symbol", str),
            ("cytoband", str),
            ("express", list),
            ("restr_expr", str),
            ("gnm_terminus", str),
            ("gene_id", str),
            ("chromosome", str),
            ("protsim", list),
            ("sequence", list),
            ("sts", list),
            ("txmap", list),
        )
        for attribute, make_default in layout:
            setattr(self, attribute, make_default())

    def __repr__(self):
        """Return '<ClassName> ID symbol' followed by the title on a new line."""
        shown = (self.__class__.__name__, self.ID, self.symbol, self.title)
        return "<%s> %s %s\n%s" % shown
533 534
class _RecordConsumer(AbstractConsumer):
    """Fill a UnigeneRecord from the lines of one record (DEPRECATED).

    Each upper-case method is named after a line tag and receives the whole
    line; it stores the text after the first UG_INDENT characters into
    ``self.unigene_record``.

    This class is DEPRECATED; please use the read() function in this module
    instead."""

    def __init__(self):
        """Issue the deprecation warning and create the record to fill."""
        import warnings
        warnings.warn("Bio.UniGene._RecordConsumer is deprecated; "
                      "please use the read() function in this module instead",
                      Bio.BiopythonDeprecationWarning)
        self.unigene_record = UnigeneRecord()

    def ID(self, line):
        """Store the cluster ID; the species is the part before the first '.'."""
        self.unigene_record.ID = self._get_single_entry(line)
        self.unigene_record.species = self.unigene_record.ID.split('.')[0]

    def TITLE(self, line):
        """Store the TITLE value."""
        self.unigene_record.title = self._get_single_entry(line)

    def GENE(self, line):
        """Store the GENE value as the record's symbol."""
        self.unigene_record.symbol = self._get_single_entry(line)

    def CYTOBAND(self, line):
        """Store the CYTOBAND value."""
        self.unigene_record.cytoband = self._get_single_entry(line)

    def EXPRESS(self, line):
        """Store the EXPRESS value as a list, split on '; '."""
        self.unigene_record.express = self._get_array_entry(line, split_on='; ')

    def RESTR_EXPR(self, line):
        """Store the RESTR_EXPR value."""
        self.unigene_record.restr_expr = self._get_single_entry(line)

    def GNM_TERMINUS(self, line):
        """Store the GNM_TERMINUS value."""
        self.unigene_record.gnm_terminus = self._get_single_entry(line)

    def GENE_ID(self, line):
        """Store the GENE_ID value."""
        self.unigene_record.gene_id = self._get_single_entry(line)

    def CHROMOSOME(self, line):
        """Store the CHROMOSOME value."""
        self.unigene_record.chromosome = self._get_single_entry(line)

    def SEQUENCE(self, line):
        """Append a UnigeneSequenceRecord built from the SEQUENCE value."""
        ug_seqrecord = UnigeneSequenceRecord(self._get_single_entry(line))
        self.unigene_record.sequence.append(ug_seqrecord)

    def PROTSIM(self, line):
        """Append a UnigeneProtsimRecord built from the PROTSIM value."""
        ug_protsimrecord = UnigeneProtsimRecord(self._get_single_entry(line))
        self.unigene_record.protsim.append(ug_protsimrecord)

    def STS(self, line):
        """Append a UnigeneSTSRecord built from the STS value."""
        ug_stsrecord = UnigeneSTSRecord(self._get_single_entry(line))
        self.unigene_record.sts.append(ug_stsrecord)

    def _get_single_entry(self, line):
        """Consume a single-value line
        """
        return line[UG_INDENT:]

    def _get_array_entry(self, line, split_on):
        """Consume a multi-value line by splitting on split_on
        """
        return line[UG_INDENT:].split(split_on)
580 581
class _Scanner:
    """Scans a Unigene Flat File Format file (DEPRECATED).

    This class is DEPRECATED; please use the read() function in this module
    instead.
    """

    def __init__(self):
        """Issue the deprecation warning."""
        import warnings
        warnings.warn("Bio.UniGene._Scanner is deprecated; "
                      "please use the read() function in this module instead",
                      Bio.BiopythonDeprecationWarning)

    def feed(self, handle, consumer):
        """feed(self, handle, consumer)

        Feed events from parsing a Unigene file to a consumer.
        handle is a file-like object, and consumer is a consumer object
        that will receive events as the file is scanned

        consumer.start_record() is called first.  Each line is then passed,
        stripped of trailing whitespace, to the consumer method named after
        the line's first space-delimited word.  A '//' line triggers
        consumer.end_record() and stops the scan.  If the consumer has no
        attribute for a tag, a message is printed and the line is skipped.
        """
        consumer.start_record()
        for line in handle:
            # Strip before extracting the tag, so that a line holding only
            # a tag does not carry its newline into the method lookup.
            line = line.rstrip()
            if line == '//':
                consumer.end_record()
                break
            tag = line.split(' ')[0]
            try:
                f = getattr(consumer, tag)
            except AttributeError:
                print("no method called %s" % tag)
            else:
                if callable(f):
                    f(line)
615 616
class RecordParser(AbstractParser):
    """Parse one Unigene record into a UnigeneRecord (DEPRECATED).

    This class is DEPRECATED; please use the read() function in this module
    instead."""

    def __init__(self):
        """Issue the deprecation warning and create the scanner and consumer."""
        import warnings
        warnings.warn("Bio.UniGene.RecordParser is deprecated; "
                      "please use the read() function in this module instead",
                      Bio.BiopythonDeprecationWarning)
        self._scanner = _Scanner()
        self._consumer = _RecordConsumer()

    def parse(self, handle):
        """Scan one record from handle and return the resulting UnigeneRecord.

        handle is wrapped in a File.UndoHandle unless it already is one.
        """
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)
        # Use a fresh consumer for every call; reusing one would append the
        # new record's entries to the record returned by the previous call.
        self._consumer = _RecordConsumer()
        self._scanner.feed(uhandle, self._consumer)
        return self._consumer.unigene_record
634
class Iterator:
    """Iterate over the records of a Unigene file (DEPRECATED).

    This class is DEPRECATED; please use the parse() function in this module
    instead."""

    def __init__(self, handle, parser=None):
        """Issue the deprecation warning and wrap handle in an UndoHandle.

        The parser argument is accepted for backward compatibility but is
        not used: next() always parses with a new RecordParser.
        """
        import warnings
        warnings.warn("Bio.UniGene.Iterator is deprecated; "
                      "please use the parse() function in this module instead",
                      Bio.BiopythonDeprecationWarning)
        self._uhandle = File.UndoHandle(handle)

    def next(self):
        """Return the next record parsed by RecordParser, or None at the end.

        Lines are collected up to the next line starting with '//' (or the
        end of the handle).  If no lines were collected, None is returned.
        """
        # A new parser is created for every record.
        self._parser = RecordParser()
        lines = []
        while True:
            line = self._uhandle.readline()
            if not line or line[:2] == '//':
                break
            lines.append(line)
        if not lines:
            return None
        # Re-add the terminator that the loop above consumed.
        lines.append('//')
        data = ''.join(lines)
        return self._parser.parse(File.StringHandle(data))

    def __iter__(self):
        """Return an iterator that calls next() until it returns None."""
        return iter(self.next, None)
663