1
2
3
4 """Simple protein analysis.
5
6 Example,
7
8 X = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPV")
9 print X.count_amino_acids()
10 print X.get_amino_acids_percent()
11 print X.molecular_weight()
12 print X.aromaticity()
13 print X.instability_index()
14 print X.flexibility()
15 print X.isoelectric_point()
16 print X.secondary_structure_fraction()
17 print X.protein_scale(ProtParamData.kd, 9, 0.4)
18 """
19
20 import sys
21 import ProtParamData, IsoelectricPoint
22 from ProtParamData import kd
23 from Bio.Seq import Seq
24 from Bio.Alphabet import IUPAC
25 from Bio.Data import IUPACData
26
27
29 """Class containing methods for protein analysis.
30
31 The class init method takes only one argument, the protein sequence as a
32 string and builds a sequence object using the Bio.Seq module. This is done
33 just to make sure the sequence is a protein sequence and not anything else.
34
35 """
44
46 """Count standard amino acids, returns a dict.
47
48 Simply counts the number times an amino acid is repeated in the protein
49 sequence. Returns a dictionary {AminoAcid:Number} and also stores the
50 dictionary in self.amino_acids_content.
51 """
52 ProtDic = dict([ (k, 0) for k in IUPACData.protein_letters])
53 for i in ProtDic:
54 ProtDic[i]=self.sequence.count(i)
55 self.amino_acids_content = ProtDic
56 return ProtDic
57
59 """Calculate the amino acid content in percents.
60
61 The same as count_amino_acids only returns the Number in percentage of
62 entire sequence. Returns a dictionary and stores the dictionary in
63 self.amino_acids_content_percent.
64
65 input is the dictionary from CountAA.
66 output is a dictionary with AA as keys.
67 """
68 if not self.amino_acids_content:
69 self.count_amino_acids()
70
71 PercentAA = {}
72 for i in self.amino_acids_content:
73 if self.amino_acids_content[i] > 0:
74 PercentAA[i]=self.amino_acids_content[i]/float(self.length)
75 else:
76 PercentAA[i] = 0
77 self.amino_acids_percent = PercentAA
78 return PercentAA
79
91
93 """Calculate the aromaticity according to Lobry, 1994.
94
95 Calculates the aromaticity value of a protein according to Lobry, 1994.
96 It is simply the relative frequency of Phe+Trp+Tyr.
97 """
98 if not self.amino_acids_percent:
99 self.get_amino_acids_percent()
100
101 Arom= self.amino_acids_percent['Y']+self.amino_acids_percent['W']+self.amino_acids_percent['F']
102 return Arom
103
105 """Calculate the instability index according to Guruprasad et al 1990.
106
107 Implementation of the method of Guruprasad et al. 1990 to test a
108 protein for stability. Any value above 40 means the protein is unstable
109 (has a short half life).
110
111 See: Guruprasad K., Reddy B.V.B., Pandit M.W.
112 Protein Engineering 4:155-161(1990).
113 """
114
115 DIWV=ProtParamData.DIWV.copy()
116 score=0.0
117 for i in range(self.length - 1):
118 DiPeptide=DIWV[self.sequence[i]][self.sequence[i+1]]
119 score += DiPeptide
120 return (10.0/self.length) * score
121
123 """Calculate the flexibility according to Vihinen, 1994.
124
125 No argument to change window size because parameters are specific for a
126 window=9. The parameters used are optimized for determining the flexibility.
127 """
128 Flex = ProtParamData.Flex.copy()
129 Window=9
130 Weights=[0.25,0.4375,0.625,0.8125,1]
131 List=[]
132 for i in range(self.length - Window):
133 SubSeq=self.sequence[i:i+Window]
134 score = 0.0
135 for j in range(Window//2):
136 score += (Flex[SubSeq[j]]+Flex[SubSeq[Window-j-1]]) * Weights[j]
137 score += Flex[SubSeq[Window//2+1]]
138 List.append(score/5.25)
139 return List
140
142 """Calculate the gravy according to Kyte and Doolittle."""
143 ProtGravy=0.0
144 for i in self.sequence:
145 ProtGravy += kd[i]
146
147 return ProtGravy/self.length
148
149
150
151
152
154 unit = ((1.0-edge)/(window-1))*2
155 list = [0.0]*(window//2)
156 for i in range(window//2):
157 list[i] = edge + unit * i
158 return list
159
160
161
162
164 """Compute a profile by any amino acid scale.
165
166 An amino acid scale is defined by a numerical value assigned to each type of
167 amino acid. The most frequently used scales are the hydrophobicity or
168 hydrophilicity scales and the secondary structure conformational parameters
169 scales, but many other scales exist which are based on different chemical and
170 physical properties of the amino acids. You can set several parameters that
171 control the computation of a scale profile, such as the window size and the
172 window edge relative weight value. WindowSize: The window size is the length
173 of the interval to use for the profile computation. For a window size n, we
174 use the i- ( n-1)/2 neighboring residues on each side of residue it compute
175 the score for residue i. The score for residue is the sum of the scale values
176 for these amino acids, optionally weighted according to their position in the
177 window. Edge: The central amino acid of the window always has a weight of 1.
178 By default, the amino acids at the remaining window positions have the same
179 weight, but you can make the residue at the center of the window have a
180 larger weight than the others by setting the edge value for the residues at
181 the beginning and end of the interval to a value between 0 and 1. For
182 instance, for Edge=0.4 and a window size of 5 the weights will be: 0.4, 0.7,
183 1.0, 0.7, 0.4. The method returns a list of values which can be plotted to
184 view the change along a protein sequence. Many scales exist. Just add your
185 favorites to the ProtParamData modules.
186
187 Similar to expasy's ProtScale: http://www.expasy.org/cgi-bin/protscale.pl
188 """
189
190 weight = self._weight_list(Window,Edge)
191 list = []
192
193 sum_of_weights = 0.0
194 for i in weight: sum_of_weights += i
195
196 sum_of_weights = sum_of_weights*2+1
197
198 for i in range(self.length-Window+1):
199 subsequence = self.sequence[i:i+Window]
200 score = 0.0
201 for j in range(Window//2):
202
203
204 try:
205 score += weight[j] * ParamDict[subsequence[j]] + weight[j] * ParamDict[subsequence[Window-j-1]]
206 except KeyError:
207 sys.stderr.write('warning: %s or %s is not a standard amino acid.\n' %
208 (subsequence[j],subsequence[Window-j-1]))
209
210
211 if subsequence[Window//2] in ParamDict:
212 score += ParamDict[subsequence[Window//2]]
213 else:
214 sys.stderr.write('warning: %s is not a standard amino acid.\n' % (subsequence[Window//2]))
215
216 list.append(score/sum_of_weights)
217 return list
218
220 """Calculate the isoelectric point.
221
222 This method uses the module IsoelectricPoint to calculate the pI of a protein.
223 """
224 if not self.amino_acids_content:
225 self.count_amino_acids()
226 X = IsoelectricPoint.IsoelectricPoint(self.sequence, self.amino_acids_content)
227 return X.pi()
228
230 """Calculate fraction of helix, turn and sheet.
231
232 This methods returns a list of the fraction of amino acids which tend
233 to be in Helix, Turn or Sheet.
234
235 Amino acids in helix: V, I, Y, F, W, L.
236 Amino acids in Turn: N, P, G, S.
237 Amino acids in sheet: E, M, A, L.
238
239 Returns a tuple of three integers (Helix, Turn, Sheet).
240 """
241 if not self.amino_acids_percent:
242 self.get_amino_acids_percent()
243 Helix = self.amino_acids_percent['V'] + self.amino_acids_percent['I'] + self.amino_acids_percent['Y'] + self.amino_acids_percent['F'] + self.amino_acids_percent['W'] + self.amino_acids_percent['L']
244 Turn = self.amino_acids_percent['N'] + self.amino_acids_percent['P'] + self.amino_acids_percent['G'] + self.amino_acids_percent['S']
245 Sheet = self.amino_acids_percent['E'] + self.amino_acids_percent['M'] + self.amino_acids_percent['A'] + self.amino_acids_percent['L']
246 return Helix, Turn, Sheet
247