RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
MolStandardize/Tautomer.h
Go to the documentation of this file.
1//
2// Copyright (C) 2018-2021 Susan H. Leung and other RDKit contributors
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef RD_TAUTOMER_H
12#define RD_TAUTOMER_H
13
14#include <boost/function.hpp>
15#include <string>
16#include <utility>
17#include <iterator>
18#include <Catalogs/Catalog.h>
23#include <boost/dynamic_bitset.hpp>
24
25namespace RDKit {
26class ROMol;
27class RWMol;
28
29namespace MolStandardize {
30
31typedef RDCatalog::HierarchCatalog<TautomerCatalogEntry, TautomerCatalogParams,
32 int>
34
36const std::string tautomerScoringVersion = "1.0.0";
37
38//! The SubstructTerm controls how Tautomers are generated
39/// Each Term is defined by a name, smarts pattern and score
40/// For example, the C=O term is defined as
41/// SubstructTerm("C=O", "[#6]=,:[#8]", 2)
42/// This gets a score of +2 for each Carbon doubly or aromatically
43/// Bonded to an Oxygen.
44/// For a list of current definitions, see getDefaultTautomerScoreSubstructs
46 std::string name;
47 std::string smarts;
48 int score;
49 RWMol matcher; // requires assignment
50
51 // Pre-screening support: elements that must be present (empty = no filter)
52 std::vector<int> requiredElements;
53 // Bond-order-agnostic connectivity pattern for pre-screening (may be empty)
54 std::string connectivitySmarts;
56
57 SubstructTerm(std::string aname, std::string asmarts, int ascore,
58 std::vector<int> reqElements = {},
59 std::string connSmarts = "");
60 SubstructTerm(const SubstructTerm &rhs) = default;
61
62 bool operator==(const SubstructTerm &rhs) const {
63 return name == rhs.name && smarts == rhs.smarts && score == rhs.score;
64 }
65};
66
67//! getDefaultTautomerSubstructs returns the SubstructTerms used in scoring
68/// tautomer forms. See SubstructTerm for details.
69RDKIT_MOLSTANDARDIZE_EXPORT const std::vector<SubstructTerm>
71
72//! Score the rings of the current tautomer
73/// Aromatic rings score 100, all carbon aromatic rings score 250
74/*!
75 \param mol Molcule to score
76 \returns integer score for the molecule's rings
77*/
79
80//! scoreSubstructs scores the molecule based on the substructure definitions
81/*!
82 \param mol Molecule to score
83 \param terms Substruct Terms used for scoring this particular tautomer form
84 \returns integer score for the molecule's substructure terms
85*/
87 const ROMol &mol, const std::vector<SubstructTerm> &terms =
89
90//! Determine which SubstructTerms are potentially relevant for a molecule.
91/// This pre-filters terms in two stages:
92/// 1. Element check: skip terms requiring elements not in the molecule
93/// 2. Connectivity check: skip terms whose bond-order-agnostic pattern
94/// doesn't match (since tautomerization doesn't create/destroy bonds)
95/// Returns indices into the terms vector for relevant terms.
97 const ROMol &mol,
98 const std::vector<SubstructTerm> &terms = getDefaultTautomerScoreSubstructs());
99
100//! Score substructures using only the terms at the specified indices.
101/// Uses specialized matchers for simple patterns (C=O, N=O, P=O, methyl, etc.)
102/// and falls back to VF2 for complex patterns. Use with
103/// getRelevantSubstructTermIndices for efficient scoring of many tautomers
104/// from the same parent molecule.
106 const ROMol &mol, const std::vector<SubstructTerm> &terms,
107 const std::vector<size_t> &relevantIndices);
108//! scoreHeteroHs score the molecules hydrogens
109/// This gives a negative penalty to hydrogens attached to S,P, Se and Te
110/*!
111 \param mol Molecule to score
112 \returns integer score for the molecule hetero hydrogens
113*/
115
116inline int scoreTautomer(const ROMol &mol) {
117 return scoreRings(mol) + scoreSubstructs(mol) + scoreHeteroHs(mol);
118}
119
120//! Create an optimized scoring function for tautomers of a specific molecule.
121/// This pre-filters SubstructTerms based on elements and connectivity present
122/// in the input molecule, avoiding unnecessary substructure searches for
123/// patterns that can never match any tautomer of this molecule.
124/// The returned function captures the filtered term indices and can be passed
125/// to pickCanonical or canonicalize.
126inline boost::function<int(const ROMol &)> makeOptimizedScorer(
127 const ROMol &mol) {
128 auto relevantIndices = getRelevantSubstructTermIndices(mol);
129 // Capture by value since the indices are small and we want the lambda to
130 // outlive this function.
131 return [relevantIndices](const ROMol &taut) {
132 const auto &terms = getDefaultTautomerScoreSubstructs();
133 return scoreRings(taut) +
134 scoreSubstructsFiltered(taut, terms, relevantIndices) +
135 scoreHeteroHs(taut);
136 };
137}
138} // namespace TautomerScoringFunctions
139
146
147class Tautomer {
148 friend class TautomerEnumerator;
149
150 public:
151 Tautomer() : d_numModifiedAtoms(0), d_numModifiedBonds(0), d_done(false) {}
152 // Constructor with just the tautomer - kekulized form will be created lazily
153 Tautomer(ROMOL_SPTR t, size_t a = 0, size_t b = 0)
154 : tautomer(std::move(t)),
155 d_numModifiedAtoms(a),
156 d_numModifiedBonds(b),
157 d_done(false) {}
158 // Legacy constructor for compatibility
159 Tautomer(ROMOL_SPTR t, ROMOL_SPTR k, size_t a = 0, size_t b = 0)
160 : tautomer(std::move(t)),
161 kekulized(std::move(k)),
162 d_numModifiedAtoms(a),
163 d_numModifiedBonds(b),
164 d_done(false) {}
166 mutable ROMOL_SPTR kekulized; // Lazily initialized
167
168 // Get the kekulized form, creating it lazily if needed
169 const ROMOL_SPTR &getKekulized() const {
170 if (!kekulized && tautomer) {
171 kekulized.reset(new RWMol(*tautomer));
172 MolOps::Kekulize(static_cast<RWMol &>(*kekulized), false, true);
173 }
174 return kekulized;
175 }
176
177 private:
178 size_t d_numModifiedAtoms;
179 size_t d_numModifiedBonds;
180 bool d_done;
181};
182
183typedef std::map<std::string, Tautomer> SmilesTautomerMap;
184typedef std::pair<std::string, Tautomer> SmilesTautomerPair;
185
186//! Contains results of tautomer enumeration
188 friend class TautomerEnumerator;
189
190 public:
192 public:
194 typedef std::ptrdiff_t difference_type;
195 typedef const ROMol *pointer;
196 typedef const ROMOL_SPTR &reference;
197 typedef std::bidirectional_iterator_tag iterator_category;
198
199 explicit const_iterator(const SmilesTautomerMap::const_iterator &it)
200 : d_it(it) {}
201 reference operator*() const { return d_it->second.tautomer; }
202 pointer operator->() const { return d_it->second.tautomer.get(); }
203 bool operator==(const const_iterator &other) const {
204 return (d_it == other.d_it);
205 }
206 bool operator!=(const const_iterator &other) const {
207 return !(*this == other);
208 }
210 const_iterator copy(d_it);
211 operator++();
212 return copy;
213 }
215 ++d_it;
216 return *this;
217 }
219 const_iterator copy(d_it);
220 operator--();
221 return copy;
222 }
224 --d_it;
225 return *this;
226 }
227
228 private:
229 SmilesTautomerMap::const_iterator d_it;
230 };
233 : d_tautomers(other.d_tautomers),
234 d_status(other.d_status),
235 d_modifiedAtoms(other.d_modifiedAtoms),
236 d_modifiedBonds(other.d_modifiedBonds) {
237 fillTautomersItVec();
238 }
239 const const_iterator begin() const {
240 return const_iterator(d_tautomers.begin());
241 }
242 const const_iterator end() const { return const_iterator(d_tautomers.end()); }
243 size_t size() const { return d_tautomers.size(); }
244 bool empty() const { return d_tautomers.empty(); }
245 const ROMOL_SPTR &at(size_t pos) const {
246 PRECONDITION(pos < d_tautomers.size(), "index out of bounds");
247 return d_tautomersItVec.at(pos)->second.tautomer;
248 }
249 const ROMOL_SPTR &operator[](size_t pos) const { return at(pos); }
250 const boost::dynamic_bitset<> &modifiedAtoms() const {
251 return d_modifiedAtoms;
252 }
253 const boost::dynamic_bitset<> &modifiedBonds() const {
254 return d_modifiedBonds;
255 }
256 TautomerEnumeratorStatus status() const { return d_status; }
257 std::vector<ROMOL_SPTR> tautomers() const {
258 std::vector<ROMOL_SPTR> tautomerVec;
259 tautomerVec.reserve(d_tautomers.size());
260 std::transform(
261 d_tautomers.begin(), d_tautomers.end(), std::back_inserter(tautomerVec),
262 [](const SmilesTautomerPair &t) { return t.second.tautomer; });
263 return tautomerVec;
264 }
265 std::vector<ROMOL_SPTR> operator()() const { return tautomers(); }
266 std::vector<std::string> smiles() const {
267 std::vector<std::string> smilesVec;
268 smilesVec.reserve(d_tautomers.size());
269 std::transform(d_tautomers.begin(), d_tautomers.end(),
270 std::back_inserter(smilesVec),
271 [](const SmilesTautomerPair &t) { return t.first; });
272 return smilesVec;
273 }
274 const SmilesTautomerMap &smilesTautomerMap() const { return d_tautomers; }
275
276 private:
277 void fillTautomersItVec() {
278 for (auto it = d_tautomers.begin(); it != d_tautomers.end(); ++it) {
279 d_tautomersItVec.push_back(it);
280 }
281 }
282 // the enumerated tautomers
283 SmilesTautomerMap d_tautomers;
284 // internal; vector of iterators into map items to enable random
285 // access to map items by index
286 std::vector<SmilesTautomerMap::const_iterator> d_tautomersItVec;
287 // status of the enumeration: did it complete? did it hit a limit?
288 // was it canceled?
289 TautomerEnumeratorStatus d_status;
290 // bit vector: flags atoms modified by the transforms
291 boost::dynamic_bitset<> d_modifiedAtoms;
292 // bit vector: flags bonds modified by the transforms
293 boost::dynamic_bitset<> d_modifiedBonds;
294};
295
302
304 public:
306 : dp_catalog(tautCat),
307 d_maxTautomers(1000),
308 d_maxTransforms(1000),
309 d_removeSp3Stereo(true),
310 d_removeBondStereo(true),
311 d_removeIsotopicHs(true),
312 d_reassignStereo(true) {}
315 : dp_catalog(other.dp_catalog),
316 d_callback(other.d_callback),
317 d_maxTautomers(other.d_maxTautomers),
318 d_maxTransforms(other.d_maxTransforms),
319 d_removeSp3Stereo(other.d_removeSp3Stereo),
320 d_removeBondStereo(other.d_removeBondStereo),
321 d_removeIsotopicHs(other.d_removeIsotopicHs),
322 d_reassignStereo(other.d_reassignStereo) {}
324 if (this == &other) {
325 return *this;
326 }
327 dp_catalog = other.dp_catalog;
328 d_callback = other.d_callback;
329 d_maxTautomers = other.d_maxTautomers;
330 d_maxTransforms = other.d_maxTransforms;
331 d_removeSp3Stereo = other.d_removeSp3Stereo;
332 d_removeBondStereo = other.d_removeBondStereo;
333 d_removeIsotopicHs = other.d_removeIsotopicHs;
334 d_reassignStereo = other.d_reassignStereo;
335 return *this;
336 }
337 //! \param maxTautomers maximum number of tautomers to be generated
338 void setMaxTautomers(unsigned int maxTautomers) {
339 d_maxTautomers = maxTautomers;
340 }
341 //! \return maximum number of tautomers to be generated
342 unsigned int getMaxTautomers() { return d_maxTautomers; }
343 /*! \param maxTransforms maximum number of transformations to be applied
344 this limit is usually hit earlier than the maxTautomers limit
345 and leads to a more linear scaling of CPU time with increasing
346 number of tautomeric centers (see Sitzmann et al.)
347 */
348 void setMaxTransforms(unsigned int maxTransforms) {
349 d_maxTransforms = maxTransforms;
350 }
351 //! \return maximum number of transformations to be applied
352 unsigned int getMaxTransforms() { return d_maxTransforms; }
353 /*! \param removeSp3Stereo; if set to true, stereochemistry information
354 will be removed from sp3 atoms involved in tautomerism.
355 This means that S-aminoacids will lose their stereochemistry after going
356 through tautomer enumeration because of the amido-imidol tautomerism.
357 This defaults to true in RDKit, false in the workflow described
358 by Sitzmann et al.
359 */
360 void setRemoveSp3Stereo(bool removeSp3Stereo) {
361 d_removeSp3Stereo = removeSp3Stereo;
362 }
363 /*! \return whether stereochemistry information will be removed from
364 sp3 atoms involved in tautomerism
365 */
366 bool getRemoveSp3Stereo() { return d_removeSp3Stereo; }
367 /*! \param removeBondStereo; if set to true, stereochemistry information
368 will be removed from double bonds involved in tautomerism.
369 This means that enols will lose their E/Z stereochemistry after going
370 through tautomer enumeration because of the keto-enolic tautomerism.
371 This defaults to true in RDKit and also in the workflow described
372 by Sitzmann et al.
373 */
374 void setRemoveBondStereo(bool removeBondStereo) {
375 d_removeBondStereo = removeBondStereo;
376 }
377 /*! \return whether stereochemistry information will be removed from
378 double bonds involved in tautomerism
379 */
380 bool getRemoveBondStereo() { return d_removeBondStereo; }
381 /*! \param removeIsotopicHs; if set to true, isotopic Hs
382 will be removed from centers involved in tautomerism.
383 */
384 void setRemoveIsotopicHs(bool removeIsotopicHs) {
385 d_removeIsotopicHs = removeIsotopicHs;
386 }
387 /*! \return whether isotpoic Hs will be removed from
388 centers involved in tautomerism
389 */
390 bool getRemoveIsotopicHs() { return d_removeIsotopicHs; }
391 /*! \param reassignStereo; if set to true, assignStereochemistry
392 will be called on each tautomer generated by the enumerate() method.
393 This defaults to true.
394 */
395 void setReassignStereo(bool reassignStereo) {
396 d_reassignStereo = reassignStereo;
397 }
398 /*! \return whether assignStereochemistry will be called on each
399 tautomer generated by the enumerate() method
400 */
401 bool getReassignStereo() { return d_reassignStereo; }
402 /*! set this to an instance of a class derived from
403 TautomerEnumeratorCallback where operator() is overridden.
404 DO NOT delete the instance as ownership of the pointer is transferred
405 to the TautomerEnumerator
406 */
408 d_callback.reset(callback);
409 }
410 /*! \return pointer to an instance of a class derived from
411 TautomerEnumeratorCallback.
412 DO NOT delete the instance as ownership of the pointer is transferred
413 to the TautomerEnumerator
414 */
415 TautomerEnumeratorCallback *getCallback() const { return d_callback.get(); }
416
417 //! returns a \c TautomerEnumeratorResult structure for the input molecule
418 /*!
419 The enumeration rules are inspired by the publication:
420 M. Sitzmann et al., “Tautomerism in Large Databases.”, JCAMD 24:521 (2010)
421 https://doi.org/10.1007/s10822-010-9346-4
422
423 \param mol: the molecule to be enumerated
424
425 Note: the definitions used here are that the atoms modified during
426 tautomerization are the atoms at the beginning and end of each tautomer
427 transform (the H "donor" and H "acceptor" in the transform) and the bonds
428 modified during transformation are any bonds whose order is changed during
429 the tautomer transform (these are the bonds between the "donor" and the
430 "acceptor")
431
432 */
434
435 //! Deprecated, please use the form returning a \c TautomerEnumeratorResult
436 //! instead
437 [[deprecated(
438 "please use the form returning a TautomerEnumeratorResult "
439 "instead")]] std::vector<ROMOL_SPTR>
440 enumerate(const ROMol &mol, boost::dynamic_bitset<> *modifiedAtoms,
441 boost::dynamic_bitset<> *modifiedBonds = nullptr) const;
442
443 //! returns the canonical tautomer from a \c TautomerEnumeratorResult
445 boost::function<int(const ROMol &mol)> scoreFunc =
447
448 //! returns the canonical tautomer from an iterable of possible tautomers
449 /// When Iterable is TautomerEnumeratorResult we use the other non-templated
450 /// overload for efficiency (TautomerEnumeratorResult already has SMILES so no
451 /// need to recompute them)
452 template <class Iterable,
453 typename std::enable_if<
454 !std::is_same<Iterable, TautomerEnumeratorResult>::value,
455 int>::type = 0>
456 ROMol *pickCanonical(const Iterable &tautomers,
457 boost::function<int(const ROMol &mol)> scoreFunc =
459 ROMOL_SPTR bestMol;
460 if (tautomers.size() == 1) {
461 bestMol = *tautomers.begin();
462 } else {
463 // Calculate score for each tautomer
464 int bestScore = std::numeric_limits<int>::min();
465 std::string bestSmiles = "";
466 for (const auto &t : tautomers) {
467 auto score = scoreFunc(*t);
468#ifdef VERBOSE_ENUMERATION
469 std::cerr << " " << MolToSmiles(*t) << " " << score << std::endl;
470#endif
471 if (score > bestScore) {
472 bestScore = score;
473 bestSmiles = MolToSmiles(*t);
474 bestMol = t;
475 } else if (score == bestScore) {
476 auto smiles = MolToSmiles(*t);
477 if (smiles < bestSmiles) {
478 bestSmiles = smiles;
479 bestMol = t;
480 }
481 }
482 }
483 }
484 ROMol *res = new ROMol(*bestMol);
485 static const bool cleanIt = true;
486 static const bool force = true;
487 MolOps::assignStereochemistry(*res, cleanIt, force);
488
489 return res;
490 }
491
492 //! returns the canonical tautomer for a molecule
493 /*!
494 Note that the canonical tautomer is very likely not the most stable tautomer
495 for any given conditions. The default scoring rules are designed to produce
496 "reasonable" tautomers, but the primary concern is that the results are
497 canonical: you always get the same canonical tautomer for a molecule
498 regardless of what the input tautomer or atom ordering were.
499
500 The default scoring scheme is inspired by the publication:
501 M. Sitzmann et al., “Tautomerism in Large Databases.”, JCAMD 24:521 (2010)
502 https://doi.org/10.1007/s10822-010-9346-4
503
504 */
505 /// When \p scoreFunc is empty (default), an optimized scorer is created
506 /// that pre-filters substructure patterns once for the input molecule.
508 const ROMol &mol,
509 boost::function<int(const ROMol &mol)> scoreFunc = {}) const;
511 RWMol &mol, boost::function<int(const ROMol &mol)> scoreFunc = {}) const;
512
513 private:
514 bool setTautomerStereoAndIsoHs(const ROMol &mol, ROMol &taut,
515 const TautomerEnumeratorResult &res) const;
516 std::shared_ptr<TautomerCatalog> dp_catalog;
517 std::shared_ptr<TautomerEnumeratorCallback> d_callback;
518 unsigned int d_maxTautomers;
519 unsigned int d_maxTransforms;
520 bool d_removeSp3Stereo;
521 bool d_removeBondStereo;
522 bool d_removeIsotopicHs;
523 bool d_reassignStereo;
524}; // TautomerEnumerator class
525
526// caller owns the pointer
528 const CleanupParameters &params) {
529 return new TautomerEnumerator(params);
530}
531// caller owns the pointer
537
538} // namespace MolStandardize
539} // namespace RDKit
540
541#endif
#define PRECONDITION(expr, mess)
Definition Invariant.h:108
Defines the CleanupParameters and some convenience functions.
virtual bool operator()(const ROMol &, const TautomerEnumeratorResult &)=0
Contains results of tautomer enumeration.
const boost::dynamic_bitset & modifiedBonds() const
const boost::dynamic_bitset & modifiedAtoms() const
TautomerEnumeratorResult(const TautomerEnumeratorResult &other)
TautomerEnumerator(const CleanupParameters &params=CleanupParameters())
std::vector< ROMOL_SPTR > enumerate(const ROMol &mol, boost::dynamic_bitset<> *modifiedAtoms, boost::dynamic_bitset<> *modifiedBonds=nullptr) const
ROMol * pickCanonical(const TautomerEnumeratorResult &tautRes, boost::function< int(const ROMol &mol)> scoreFunc=TautomerScoringFunctions::scoreTautomer) const
returns the canonical tautomer from a TautomerEnumeratorResult
TautomerEnumeratorCallback * getCallback() const
TautomerEnumeratorResult enumerate(const ROMol &mol) const
returns a TautomerEnumeratorResult structure for the input molecule
TautomerEnumerator & operator=(const TautomerEnumerator &other)
void canonicalizeInPlace(RWMol &mol, boost::function< int(const ROMol &mol)> scoreFunc={}) const
void setCallback(TautomerEnumeratorCallback *callback)
ROMol * canonicalize(const ROMol &mol, boost::function< int(const ROMol &mol)> scoreFunc={}) const
returns the canonical tautomer for a molecule
TautomerEnumerator(const TautomerEnumerator &other)
void setMaxTransforms(unsigned int maxTransforms)
void setMaxTautomers(unsigned int maxTautomers)
ROMol * pickCanonical(const Iterable &tautomers, boost::function< int(const ROMol &mol)> scoreFunc=TautomerScoringFunctions::scoreTautomer) const
Tautomer(ROMOL_SPTR t, ROMOL_SPTR k, size_t a=0, size_t b=0)
Tautomer(ROMOL_SPTR t, size_t a=0, size_t b=0)
const ROMOL_SPTR & getKekulized() const
RWMol is a molecule class that is intended to be edited.
Definition RWMol.h:32
#define RDKIT_MOLSTANDARDIZE_EXPORT
Definition export.h:377
RDKIT_GRAPHMOL_EXPORT void assignStereochemistry(ROMol &mol, bool cleanIt=false, bool force=false, bool flagPossibleStereoCenters=false)
Assign stereochemistry tags to atoms and bonds.
RDKIT_GRAPHMOL_EXPORT void Kekulize(RWMol &mol, bool markAtomsBonds=true, bool canonical=true, unsigned int maxBackTracks=100)
Kekulizes the molecule.
RDKIT_MOLSTANDARDIZE_EXPORT int scoreHeteroHs(const ROMol &mol)
RDKIT_MOLSTANDARDIZE_EXPORT std::vector< size_t > getRelevantSubstructTermIndices(const ROMol &mol, const std::vector< SubstructTerm > &terms=getDefaultTautomerScoreSubstructs())
RDKIT_MOLSTANDARDIZE_EXPORT int scoreSubstructsFiltered(const ROMol &mol, const std::vector< SubstructTerm > &terms, const std::vector< size_t > &relevantIndices)
RDKIT_MOLSTANDARDIZE_EXPORT const std::vector< SubstructTerm > & getDefaultTautomerScoreSubstructs()
RDKIT_MOLSTANDARDIZE_EXPORT int scoreRings(const ROMol &mol)
RDKIT_MOLSTANDARDIZE_EXPORT int scoreSubstructs(const ROMol &mol, const std::vector< SubstructTerm > &terms=getDefaultTautomerScoreSubstructs())
scoreSubstructs scores the molecule based on the substructure definitions
boost::function< int(const ROMol &)> makeOptimizedScorer(const ROMol &mol)
RDKIT_MOLSTANDARDIZE_EXPORT const TautomerTransformDefs defaultTautomerTransformsv1
std::map< std::string, Tautomer > SmilesTautomerMap
TautomerEnumerator * tautomerEnumeratorFromParams(const CleanupParameters &params)
TautomerEnumerator * getV1TautomerEnumerator()
RDCatalog::HierarchCatalog< TautomerCatalogEntry, TautomerCatalogParams, int > TautomerCatalog
std::pair< std::string, Tautomer > SmilesTautomerPair
Std stuff.
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, const SmilesWriteParams &params)
returns canonical SMILES for a molecule
boost::shared_ptr< ROMol > ROMOL_SPTR
SubstructTerm(std::string aname, std::string asmarts, int ascore, std::vector< int > reqElements={}, std::string connSmarts="")