1
2
3
4
5
6
7
8 """Substitution matrices, log odds matrices, and operations on them.
9
10 General:
11 -------
12
13 This module provides a class and a few routines for generating
14 substitution matrices, similar ot BLOSUM or PAM matrices, but based on
15 user-provided data.
16 The class used for these matrices is SeqMat
17
18 Matrices are implemented as a dictionary. Each index contains a 2-tuple,
19 which are the two residue/nucleotide types replaced. The value differs
20 according to the matrix's purpose: e.g in a log-odds frequency matrix, the
21 value would be log(Pij/(Pi*Pj)) where:
22 Pij: frequency of substitution of letter (residue/nucletide) i by j
23 Pi, Pj: expected frequencies of i and j, respectively.
24
25 Usage:
26 -----
27 The following section is layed out in the order by which most people wish
28 to generate a log-odds matrix. Of course, interim matrices can be
29 generated and investigated. Most people just want a log-odds matrix,
30 that's all.
31
32 Generating an Accepted Replacement Matrix:
33 -----------------------------------------
34 Initially, you should generate an accepted replacement matrix (ARM)
35 from your data. The values in ARM are the _counted_ number of
36 replacements according to your data. The data could be a set of pairs
37 or multiple alignments. So for instance if Alanine was replaced by
38 Cysteine 10 times, and Cysteine by Alanine 12 times, the corresponding
39 ARM entries would be:
40 ['A','C']: 10,
41 ['C','A'] 12
42 As order doesn't matter, user can already provide only one entry:
43 ['A','C']: 22
44 A SeqMat instance may be initialized with either a full (first
45 method of counting: 10, 12) or half (the latter method, 22) matrices. A
46 Full protein alphabet matrix would be of the size 20x20 = 400. A Half
47 matrix of that alphabet would be 20x20/2 + 20/2 = 210. That is because
48 same-letter entries don't change. (The matrix diagonal). Given an
49 alphabet size of N:
50 Full matrix size:N*N
51 Half matrix size: N(N+1)/2
52
53 If you provide a full matrix, the constructore will create a half-matrix
54 automatically.
55 If you provide a half-matrix, make sure of a (low, high) sorted order in
56 the keys: there should only be
57 a ('A','C') not a ('C','A').
58
59 Internal functions:
60
61 Generating the observed frequency matrix (OFM):
62 ----------------------------------------------
63 Use: OFM = _build_obs_freq_mat(ARM)
64 The OFM is generated from the ARM, only instead of replacement counts, it
65 contains replacement frequencies.
66
67 Generating an expected frequency matrix (EFM):
68 ---------------------------------------------
69 Use: EFM = _build_exp_freq_mat(OFM,exp_freq_table)
70 exp_freq_table: should be a freqTableC instantiation. See freqTable.py for
71 detailed information. Briefly, the expected frequency table has the
72 frequencies of appearance for each member of the alphabet
73
74 Generating a substitution frequency matrix (SFM):
75 ------------------------------------------------
76 Use: SFM = _build_subs_mat(OFM,EFM)
77 Accepts an OFM, EFM. Provides the division product of the corresponding
78 values.
79
80 Generating a log-odds matrix (LOM):
81 ----------------------------------
82 Use: LOM=_build_log_odds_mat(SFM[,logbase=10,factor=10.0,roundit=1])
83 Accepts an SFM. logbase: base of the logarithm used to generate the
84 log-odds values. factor: factor used to multiply the log-odds values.
85 roundit: default - true. Whether to round the values.
86 Each entry is generated by log(LOM[key])*factor
87 And rounded if required.
88
89 External:
90 ---------
91 In most cases, users will want to generate a log-odds matrix only, without
92 explicitly calling the OFM --> EFM --> SFM stages. The function
93 build_log_odds_matrix does that. User provides an ARM and an expected
94 frequency table. The function returns the log-odds matrix.
95
96 Methods for subtraction, addition and multiplication of matrices:
97 -----------------------------------------------------------------
98 * Generation of an expected frequency table from an observed frequency
99 matrix.
100 * Calculation of linear correlation coefficient between two matrices.
101 * Calculation of relative entropy is now done using the
102 _make_relative_entropy method and is stored in the member
103 self.relative_entropy
104 * Calculation of entropy is now done using the _make_entropy method and
105 is stored in the member self.entropy.
106 * Jensen-Shannon distance between the distributions from which the
107 matrices are derived. This is a distance function based on the
108 distribution's entropies.
109 """
110
111
112 import re
113 import sys
114 import copy
115 import math
116 import warnings
117
118
119 import Bio
120 from Bio import Alphabet
121 from Bio.SubsMat import FreqTable
122
123 log = math.log
124
125 NOTYPE = 0
126 ACCREP = 1
127 OBSFREQ = 2
128 SUBS = 3
129 EXPFREQ = 4
130 LO = 5
131 EPSILON = 0.00000000000001
132
133
135 """A Generic sequence matrix class
136 The key is a 2-tuple containing the letter indices of the matrix. Those
137 should be sorted in the tuple (low, high). Because each matrix is dealt
138 with as a half-matrix."""
139
141 ab_dict = {}
142 s = ''
143 for i in self:
144 ab_dict[i[0]] = 1
145 ab_dict[i[1]] = 1
146 for i in sorted(ab_dict):
147 s += i
148 self.alphabet.letters = s
149
150 - def __init__(self,data=None, alphabet=None,
151 mat_type=None,mat_name='',build_later=0):
208
210 keylist = self.keys()
211 for key in keylist:
212 if key[0] > key[1]:
213 self[(key[1],key[0])] = self[key]
214 del self[key]
215
217 """
218 Convert a full-matrix to a half-matrix
219 """
220
221
222
223
224 N = len(self.alphabet.letters)
225
226 if len(self) == N*(N+1)/2:
227 return
228 for i in self.ab_list:
229 for j in self.ab_list[:self.ab_list.index(i)+1]:
230 if i != j:
231 self[j,i] = self[j,i] + self[i,j]
232 del self[i,j]
233
235 for i in self.ab_list:
236 for j in self.ab_list[:self.ab_list.index(i)+1]:
237 self[j,i] = 0.
238
240 """if this matrix is a log-odds matrix, return its entropy
241 Needs the observed frequency matrix for that"""
242
243 ent = 0.
244 if self.mat_type == LO:
245 for i in self:
246 ent += obs_freq_mat[i]*self[i]/log(2)
247 elif self.mat_type == SUBS:
248 for i in self:
249 if self[i] > EPSILON:
250 ent += obs_freq_mat[i]*log(self[i])/log(2)
251 else:
252 raise TypeError("entropy: substitution or log-odds matrices only")
253 self.relative_entropy = ent
254
256 self.entropy = 0
257 for i in self:
258 if self[i] > EPSILON:
259 self.entropy += self[i]*log(self[i])/log(2)
260 self.entropy = -self.entropy
261
273
279
281 result = {}
282 for letter in self.alphabet.letters:
283 result[letter] = 0.0
284 for pair, value in self.iteritems():
285 i1, i2 = pair
286 if i1==i2:
287 result[i1] += value
288 else:
289 result[i1] += value / 2
290 result[i2] += value / 2
291 return result
292
293 - def print_full_mat(self,f=None,format="%4d",topformat="%4s",
294 alphabet=None,factor=1,non_sym=None):
295 f = f or sys.stdout
296
297
298 assert non_sym == None or type(non_sym) == type(1.) or \
299 type(non_sym) == type(1)
300 full_mat = copy.copy(self)
301 for i in self:
302 if i[0] != i[1]:
303 full_mat[(i[1],i[0])] = full_mat[i]
304 if not alphabet:
305 alphabet = self.ab_list
306 topline = ''
307 for i in alphabet:
308 topline = topline + topformat % i
309 topline = topline + '\n'
310 f.write(topline)
311 for i in alphabet:
312 outline = i
313 for j in alphabet:
314 if alphabet.index(j) > alphabet.index(i) and non_sym is not None:
315 val = non_sym
316 else:
317 val = full_mat[i,j]
318 val *= factor
319 if val <= -999:
320 cur_str = ' ND'
321 else:
322 cur_str = format % val
323
324 outline = outline+cur_str
325 outline = outline+'\n'
326 f.write(outline)
327
328 - def print_mat(self,f=None,format="%4d",bottomformat="%4s",
329 alphabet=None,factor=1):
330 """Print a nice half-matrix. f=sys.stdout to see on the screen
331 User may pass own alphabet, which should contain all letters in the
332 alphabet of the matrix, but may be in a different order. This
333 order will be the order of the letters on the axes"""
334
335 f = f or sys.stdout
336 if not alphabet:
337 alphabet = self.ab_list
338 bottomline = ''
339 for i in alphabet:
340 bottomline = bottomline + bottomformat % i
341 bottomline = bottomline + '\n'
342 for i in alphabet:
343 outline = i
344 for j in alphabet[:alphabet.index(i)+1]:
345 try:
346 val = self[j,i]
347 except KeyError:
348 val = self[i,j]
349 val *= factor
350 if val == -999:
351 cur_str = ' ND'
352 else:
353 cur_str = format % val
354
355 outline = outline+cur_str
356 outline = outline+'\n'
357 f.write(outline)
358 f.write(bottomline)
359
361 """Print a nice half-matrix."""
362 output = ""
363 alphabet = self.ab_list
364 n = len(alphabet)
365 for i in range(n):
366 c1 = alphabet[i]
367 output += c1
368 for j in range(i+1):
369 c2 = alphabet[j]
370 try:
371 val = self[c2,c1]
372 except KeyError:
373 val = self[c1,c2]
374 if val == -999:
375 output += ' ND'
376 else:
377 output += "%4d" % val
378 output += '\n'
379 output += '%4s' * n % tuple(alphabet) + "\n"
380 return output
381
383 """ returns a number which is the subtraction product of the two matrices"""
384 mat_diff = 0
385 for i in self:
386 mat_diff += (self[i] - other[i])
387 return mat_diff
388
390 """ returns a matrix for which each entry is the multiplication product of the
391 two matrices passed"""
392 new_mat = copy.copy(self)
393 for i in self:
394 new_mat[i] *= other[i]
395 return new_mat
396
398 new_mat = copy.copy(self)
399 for i in self:
400 new_mat[i] += other[i]
401 return new_mat
402
404 """Accepted replacements matrix"""
405
407 """Observed frequency matrix"""
408
410 """Expected frequency matrix"""
411
412
414 """Substitution matrix"""
415
417 """Calculate and return the relative entropy with respect to an
418 observed frequency matrix"""
419 relative_entropy = 0.
420 for key, value in self.iteritems():
421 if value > EPSILON:
422 relative_entropy += obs_freq_mat[key]*log(value)
423 relative_entropy /= log(2)
424 return relative_entropy
425
426
428 """Log odds matrix"""
429
431 """Calculate and return the relative entropy with respect to an
432 observed frequency matrix"""
433 relative_entropy = 0.
434 for key, value in self.iteritems():
435 relative_entropy += obs_freq_mat[key]*value/log(2)
436 return relative_entropy
437
438
440 """
441 build_obs_freq_mat(acc_rep_mat):
442 Build the observed frequency matrix, from an accepted replacements matrix
443 The acc_rep_mat matrix should be generated by the user.
444 """
445
446 total = float(sum(acc_rep_mat.values()))
447 obs_freq_mat = ObservedFrequencyMatrix(alphabet=acc_rep_mat.alphabet,
448 build_later=1)
449 for i in acc_rep_mat:
450 obs_freq_mat[i] = acc_rep_mat[i]/total
451 return obs_freq_mat
452
454 exp_freq_table = {}
455 for i in obs_freq_mat.alphabet.letters:
456 exp_freq_table[i] = 0.
457 for i in obs_freq_mat:
458 if i[0] == i[1]:
459 exp_freq_table[i[0]] += obs_freq_mat[i]
460 else:
461 exp_freq_table[i[0]] += obs_freq_mat[i] / 2.
462 exp_freq_table[i[1]] += obs_freq_mat[i] / 2.
463 return FreqTable.FreqTable(exp_freq_table,FreqTable.FREQ)
464
466 """Build an expected frequency matrix
467 exp_freq_table: should be a FreqTable instance
468 """
469 exp_freq_mat = ExpectedFrequencyMatrix(alphabet=exp_freq_table.alphabet,
470 build_later=1)
471 for i in exp_freq_mat:
472 if i[0] == i[1]:
473 exp_freq_mat[i] = exp_freq_table[i[0]]**2
474 else:
475 exp_freq_mat[i] = 2.0*exp_freq_table[i[0]]*exp_freq_table[i[1]]
476 return exp_freq_mat
477
478
479
481 """ Build the substitution matrix """
482 if obs_freq_mat.ab_list != exp_freq_mat.ab_list:
483 raise ValueError("Alphabet mismatch in passed matrices")
484 subs_mat = SubstitutionMatrix(obs_freq_mat)
485 for i in obs_freq_mat:
486 subs_mat[i] = obs_freq_mat[i]/exp_freq_mat[i]
487 return subs_mat
488
489
490
491
493 """_build_log_odds_mat(subs_mat,logbase=10,factor=10.0,round_digit=1):
494 Build a log-odds matrix
495 logbase=2: base of logarithm used to build (default 2)
496 factor=10.: a factor by which each matrix entry is multiplied
497 round_digit: roundoff place after decimal point
498 keep_nd: if true, keeps the -999 value for non-determined values (for which there
499 are no substitutions in the frequency substitutions matrix). If false, plants the
500 minimum log-odds value of the matrix in entries containing -999
501 """
502 lo_mat = LogOddsMatrix(subs_mat)
503 for key, value in subs_mat.iteritems():
504 if value < EPSILON:
505 lo_mat[key] = -999
506 else:
507 lo_mat[key] = round(factor*log(value)/log(logbase),round_digit)
508 mat_min = min(lo_mat.values())
509 if not keep_nd:
510 for i in lo_mat:
511 if lo_mat[i] <= -999:
512 lo_mat[i] = mat_min
513 return lo_mat
514
515
516
517
518
519
520
521 -def make_log_odds_matrix(acc_rep_mat,exp_freq_table=None,logbase=2,
522 factor=1.,round_digit=9,keep_nd=0):
530
536
537 -def read_text_matrix(data_file,mat_type=NOTYPE):
538 matrix = {}
539 tmp = data_file.read().split("\n")
540 table=[]
541 for i in tmp:
542 table.append(i.split())
543
544 for rec in table[:]:
545 if (rec.count('#') > 0):
546 table.remove(rec)
547
548
549 while (table.count([]) > 0):
550 table.remove([])
551
552 alphabet = table[0]
553 j = 0
554 for rec in table[1:]:
555
556 row = alphabet[j]
557
558 if re.compile('[A-z\*]').match(rec[0]):
559 first_col = 1
560 else:
561 first_col = 0
562 i = 0
563 for field in rec[first_col:]:
564 col = alphabet[i]
565 matrix[(row,col)] = float(field)
566 i += 1
567 j += 1
568
569 for i in matrix.keys():
570 if '*' in i: del(matrix[i])
571 ret_mat = SeqMat(matrix,mat_type=mat_type)
572 return ret_mat
573
574 diagNO = 1
575 diagONLY = 2
576 diagALL = 3
577
579 rel_ent = 0.
580 key_list_1 = sorted(mat_1)
581 key_list_2 = sorted(mat_2)
582 key_list = []
583 sum_ent_1 = 0.; sum_ent_2 = 0.
584 for i in key_list_1:
585 if i in key_list_2:
586 key_list.append(i)
587 if len(key_list_1) != len(key_list_2):
588 sys.stderr.write("Warning:first matrix has more entries than the second\n")
589 if key_list_1 != key_list_2:
590 sys.stderr.write("Warning: indices not the same between matrices\n")
591 for key in key_list:
592 if diag == diagNO and key[0] == key[1]:
593 continue
594 if diag == diagONLY and key[0] != key[1]:
595 continue
596 if mat_1[key] > EPSILON and mat_2[key] > EPSILON:
597 sum_ent_1 += mat_1[key]
598 sum_ent_2 += mat_2[key]
599
600 for key in key_list:
601 if diag == diagNO and key[0] == key[1]:
602 continue
603 if diag == diagONLY and key[0] != key[1]:
604 continue
605 if mat_1[key] > EPSILON and mat_2[key] > EPSILON:
606 val_1 = mat_1[key] / sum_ent_1
607 val_2 = mat_2[key] / sum_ent_2
608
609 rel_ent += val_1 * log(val_1/val_2)/log(logbase)
610 return rel_ent
611
612
614 try:
615 import numpy
616 except ImportError:
617 raise ImportError, "Please install Numerical Python (numpy) if you want to use this function"
618 values = []
619 assert mat_1.ab_list == mat_2.ab_list
620 for ab_pair in mat_1:
621 try:
622 values.append((mat_1[ab_pair], mat_2[ab_pair]))
623 except KeyError:
624 raise ValueError, "%s is not a common key" % ab_pair
625 correlation_matrix = numpy.corrcoef(values, rowvar=0)
626 correlation = correlation_matrix[0,1]
627 return correlation
628
629
630
632 assert mat_1.ab_list == mat_2.ab_list
633 assert pi_1 > 0 and pi_2 > 0 and pi_1< 1 and pi_2 <1
634 assert not (pi_1 + pi_2 - 1.0 > EPSILON)
635 sum_mat = SeqMat(build_later=1)
636 sum_mat.ab_list = mat_1.ab_list
637 for i in mat_1:
638 sum_mat[i] = pi_1 * mat_1[i] + pi_2 * mat_2[i]
639 sum_mat.make_entropy()
640 mat_1.make_entropy()
641 mat_2.make_entropy()
642
643 dJS = sum_mat.entropy - pi_1 * mat_1.entropy - pi_2 *mat_2.entropy
644 return dJS
645
646 """
647 This isn't working yet. Boo hoo!
648 def two_mat_print(mat_1, mat_2, f=None,alphabet=None,factor_1=1, factor_2=1,
649 format="%4d",bottomformat="%4s",topformat="%4s",
650 topindent=7*" ", bottomindent=1*" "):
651 f = f or sys.stdout
652 if not alphabet:
653 assert mat_1.ab_list == mat_2.ab_list
654 alphabet = mat_1.ab_list
655 len_alphabet = len(alphabet)
656 print_mat = {}
657 topline = topindent
658 bottomline = bottomindent
659 for i in alphabet:
660 bottomline += bottomformat % i
661 topline += topformat % alphabet[len_alphabet-alphabet.index(i)-1]
662 topline += '\n'
663 bottomline += '\n'
664 f.write(topline)
665 for i in alphabet:
666 for j in alphabet:
667 print_mat[i,j] = -999
668 diag_1 = {}; diag_2 = {}
669 for i in alphabet:
670 for j in alphabet[:alphabet.index(i)+1]:
671 if i == j:
672 diag_1[i] = mat_1[(i,i)]
673 diag_2[i] = mat_2[(alphabet[len_alphabet-alphabet.index(i)-1],
674 alphabet[len_alphabet-alphabet.index(i)-1])]
675 else:
676 if i > j:
677 key = (j,i)
678 else:
679 key = (i,j)
680 mat_2_key = [alphabet[len_alphabet-alphabet.index(key[0])-1],
681 alphabet[len_alphabet-alphabet.index(key[1])-1]]
682 # print mat_2_key
683 mat_2_key.sort(); mat_2_key = tuple(mat_2_key)
684 # print key ,"||", mat_2_key
685 print_mat[key] = mat_2[mat_2_key]
686 print_mat[(key[1],key[0])] = mat_1[key]
687 for i in alphabet:
688 outline = i
689 for j in alphabet:
690 if i == j:
691 if diag_1[i] == -999:
692 val_1 = ' ND'
693 else:
694 val_1 = format % (diag_1[i]*factor_1)
695 if diag_2[i] == -999:
696 val_2 = ' ND'
697 else:
698 val_2 = format % (diag_2[i]*factor_2)
699 cur_str = val_1 + " " + val_2
700 else:
701 if print_mat[(i,j)] == -999:
702 val = ' ND'
703 elif alphabet.index(i) > alphabet.index(j):
704 val = format % (print_mat[(i,j)]*factor_1)
705 else:
706 val = format % (print_mat[(i,j)]*factor_2)
707 cur_str = val
708 outline += cur_str
709 outline += bottomformat % (alphabet[len_alphabet-alphabet.index(i)-1] +
710 '\n')
711 f.write(outline)
712 f.write(bottomline)
713 """
714