# Source code for ccpnmodel.ccpncore.lib.assignment.ChemicalShift

"""Chemical shift-related library functions at API (data storage) level

"""
#=========================================================================================
# Licence, Reference and Credits
#=========================================================================================
__copyright__ = "Copyright (C) CCPN project (http://www.ccpn.ac.uk) 2014 - 2020"
__credits__ = ("Ed Brooksbank, Luca Mureddu, Timothy J Ragan & Geerten W Vuister")
__licence__ = ("CCPN licence. See http://www.ccpn.ac.uk/v3-software/downloads/license")
__reference__ = ("Skinner, S.P., Fogh, R.H., Boucher, W., Ragan, T.J., Mureddu, L.G., & Vuister, G.W.",
                 "CcpNmr AnalysisAssign: a flexible platform for integrated NMR analysis",
                 "J.Biomol.Nmr (2016), 66, 111-124, http://doi.org/10.1007/s10858-016-0060-y")

#=========================================================================================
# Last code modification
#=========================================================================================
__modifiedBy__ = "$modifiedBy: Ed Brooksbank $"
__dateModified__ = "$dateModified: 2020-10-16 14:38:51 +0100 (Fri, October 16, 2020) $"
__version__ = "$Revision: 3.0.1 $"
#=========================================================================================
# Created
#=========================================================================================

__author__ = "$Author: CCPN $"
__date__ = "$Date: 2017-04-07 10:28:48 +0000 (Fri, April 07, 2017) $"
#=========================================================================================
# Start of code
#=========================================================================================

import os
from math import exp
from collections import OrderedDict
import numpy as np
from numba import jit
from scipy.stats import multivariate_normal, norm


# TBD DNA/RNA residue probs
#
# Sanity checks:
#   Only on amide
#   Gly CA - ignore CB
#

# sqrt(2 * pi), written out to 12 decimal places
ROOT_TWO_PI = 2.506628274631

# molType string; the same value appears as the first element of the REFDB_SD_MEAN keys
PROTEIN_MOLTYPE = 'protein'

# Module-level dicts that start out empty; presumably filled in elsewhere as
# caches - TODO confirm against the functions that use them
CHEM_ATOM_REF_DICT = dict()
REF_STORE_DICT = dict()

# Moved here from deleted ChemicalShiftRef :

# Reference chemical-shift table: {(molType, residue code): {atom name: 4-tuple}}.
# Reading the 4-tuple fields off the data below:
#   [0], [1]  appear to be the mean shift and its standard deviation (the dict name and the
#             "CB from CSI, SD guess" notes on Cys/Cyss point that way) - presumably ppm, TODO confirm
#   [2]       a fraction between 0 and 1; for the entries spot-checked it equals the value listed
#             for the corresponding residue/atom in RESCUE2_STATS_MISSING
#   [3]       the name of another atom of the same residue, or None - presumably the directly
#             bonded partner; verify against the code that consumes this field
# 'Cys' and 'Cyss' are separate entries that differ in the CB mean and in the presence of HG.
# NOTE(review): the partner given for backbone "N" varies between "H", "H2" and "H3" from
# residue to residue - confirm whether that is intended before relying on field [3].
REFDB_SD_MEAN = {
    ('protein', 'Ala') : {
        "C"  : (177.828848, 2.109680, 0.452147, None),
        "CA" : (53.352984, 1.902676, 0.210713, "HA"),
        "CB" : (19.222672, 1.746981, 0.258369, "HB*"),
        "H"  : (8.198200, 0.628909, 0.033675, "N"),
        "HA" : (4.276693, 0.446260, 0.146317, "CA"),
        "HB*": (1.364080, 0.226681, 0.212091, "CB"),
        "N"  : (122.708700, 3.549744, 0.159512, "H2"),
        },
    ('protein', 'Arg') : {
        "C"   : (176.476635, 1.985958, 0.496077, None),
        "CA"  : (56.956118, 2.286571, 0.236270, "HA"),
        "CB"  : (30.902098, 1.776631, 0.317743, "HB2"),
        "CD"  : (43.264272, 0.631000, 0.568196, "HD2"),
        "CG"  : (27.332456, 1.047844, 0.581473, "HG2"),
        "CZ"  : (160.254836, 3.230362, 0.982800, None),
        "H"   : (8.251157, 0.626805, 0.038322, "N"),
        "HA"  : (4.312683, 0.477117, 0.133977, "CA"),
        "HB2" : (1.796660, 0.261361, 0.246530, "CB"),
        "HB3" : (1.770165, 0.267594, 0.301147, "CB"),
        "HD2" : (3.124174, 0.221769, 0.361798, "CD"),
        "HD3" : (3.108990, 0.237850, 0.419433, "CD"),
        "HE"  : (7.358268, 0.581758, 0.718769, "NE"),
        "HG2" : (1.565723, 0.256262, 0.342788, "CG"),
        "HG3" : (1.543494, 0.272259, 0.396198, "CG"),
        "HH11": (6.806578, 0.394752, 0.968618, "NH1"),
        "HH12": (6.804218, 0.413009, 0.973446, "NH1"),
        "HH21": (6.725168, 0.354246, 0.971032, "NH2"),
        "HH22": (6.729098, 0.436935, 0.971636, "NH2"),
        "N"   : (120.208274, 3.814534, 0.208509, "H2"),
        "NE"  : (91.403172, 13.311577, 0.865419, "HE"),
        "NH1" : (76.300233, 12.184632, 0.993060, "HH12"),
        "NH2" : (75.668286, 10.259347, 0.993060, "HH21"),
        },
    ('protein', 'Asn') : {
        "C"   : (175.387335, 1.722342, 0.461435, None),
        "CA"  : (53.715068, 1.780935, 0.233736, "HA"),
        "CB"  : (38.882610, 1.637450, 0.291751, "HB2"),
        "CG"  : (176.826660, 1.209325, 0.911804, None),
        "H"   : (8.349260, 0.632518, 0.049966, "N"),
        "HA"  : (4.681485, 0.371691, 0.153253, "CA"),
        "HB2" : (2.819383, 0.300069, 0.232059, "CB"),
        "HB3" : (2.776859, 0.314775, 0.260899, "CB"),
        "HD21": (7.356185, 0.463488, 0.462441, "ND2"),
        "HD22": (7.141474, 0.456559, 0.466465, "ND2"),
        "N"   : (118.480741, 4.131918, 0.192153, "H2"),
        "ND2" : (112.784090, 2.263417, 0.616030, "HD21"),
        },
    ('protein', 'Asp') : {
        "C"  : (176.552707, 1.676911, 0.428646, None),
        "CA" : (54.879282, 1.972805, 0.195573, "HA"),
        "CB" : (41.088030, 1.475807, 0.253906, "HB2"),
        "CG" : (179.515246, 1.549972, 0.978906, None),
        "H"  : (8.315900, 0.588151, 0.036198, "N"),
        "HA" : (4.603843, 0.309474, 0.133594, "CA"),
        "HB2": (2.726400, 0.280057, 0.220312, "CB"),
        "HB3": (2.683034, 0.255266, 0.255729, "CB"),
        "N"  : (120.230927, 3.966225, 0.146615, "H"),
        },
    ('protein', 'Cys') : {  # CB from CSI, SD guess
        "C"  : (174.775094, 1.963682, 0.690521, None),
        "CA" : (58.071390, 3.292878, 0.495920, "HA"),
        "CB" : (28.6, 1.8, 0.549906, "HB2"),
        "H"  : (8.415828, 0.683783, 0.061519, "N"),
        "HA" : (4.699157, 0.549761, 0.084118, "CA"),
        "HB2": (2.927206, 0.473968, 0.123038, "CB"),
        "HB3": (2.884519, 0.495915, 0.140615, "CB"),
        "HG" : (2.539318, 2.428652, 0.998117, "SG"),
        "N"  : (119.339964, 4.450665, 0.438795, "H"),
        },
    ('protein', 'Cyss'): {  # CB from CSI, SD guess
        "C"  : (174.775094, 1.963682, 0.690521, None),
        "CA" : (58.071390, 3.292878, 0.495920, "HA"),
        "CB" : (41.8, 1.8, 0.549906, "HB2"),
        "H"  : (8.415828, 0.683783, 0.061519, "N"),
        "HA" : (4.699157, 0.549761, 0.084118, "CA"),
        "HB2": (2.927206, 0.473968, 0.123038, "CB"),
        "HB3": (2.884519, 0.495915, 0.140615, "CB"),
        "N"  : (119.339964, 4.450665, 0.438795, "H"),
        },
    ('protein', 'Gln') : {
        "C"   : (176.404030, 1.920946, 0.433155, None),
        "CA"  : (56.761798, 2.082327, 0.192157, "HA"),
        "CB"  : (29.358559, 1.772749, 0.260606, "HB3"),
        "CD"  : (179.827346, 1.026666, 0.926916, None),
        "CG"  : (33.876419, 0.890980, 0.516221, "HG2"),
        "H"   : (8.216737, 0.612440, 0.037790, "N"),
        "HA"  : (4.281435, 0.448260, 0.145811, "CA"),
        "HB2" : (2.056673, 0.235446, 0.253476, "CB"),
        "HB3" : (2.032193, 0.243198, 0.301604, "CB"),
        "HE21": (7.243197, 0.430273, 0.512656, "NE2"),
        "HE22": (7.014516, 0.403005, 0.512656, "NE2"),
        "HG2" : (2.327917, 0.242786, 0.313369, "CG"),
        "HG3" : (2.302919, 0.262972, 0.378610, "CG"),
        "N"   : (119.290354, 3.686866, 0.153298, "H"),
        "NE2" : (111.888680, 1.758582, 0.612478, "HE21"),
        },
    ('protein', 'Glu') : {
        "C"  : (177.019843, 1.918170, 0.413230, None),
        "CA" : (57.531344, 2.064791, 0.179611, "HA"),
        "CB" : (30.197538, 1.660817, 0.241365, "HB2"),
        "CD" : (182.899624, 1.786294, 0.982834, None),
        "CG" : (36.202941, 0.965699, 0.497593, "HG3"),
        "H"  : (8.343540, 0.613833, 0.030563, "N"),
        "HA" : (4.266130, 0.434106, 0.131673, "CA"),
        "HB2": (2.031746, 0.198885, 0.230061, "CB"),
        "HB3": (2.009291, 0.202896, 0.284069, "CB"),
        "HG2": (2.279046, 0.202965, 0.311911, "CG"),
        "HG3": (2.260935, 0.206668, 0.364455, "CG"),
        "N"  : (120.173056, 3.592825, 0.139837, "H2"),
        },
    ('protein', 'Gly') : {
        "C"  : (174.017069, 1.841177, 0.473382, None),
        "CA" : (45.562676, 1.121685, 0.235463, "HA3"),
        "H"  : (8.330610, 0.673335, 0.044840, "N"),
        "HA2": (3.988405, 0.369611, 0.150491, "CA"),
        "HA3": (3.912364, 0.373510, 0.191237, "CA"),
        "N"  : (109.068021, 3.809180, 0.190418, "H"),
        },
    ('protein', 'His') : {
        "C"  : (175.302898, 1.948641, 0.514085, None),
        "CA" : (56.675864, 2.330521, 0.279577, "HA"),
        "CB" : (30.475373, 2.009559, 0.331690, "HB2"),
        "CD2": (119.952558, 2.769141, 0.787324, "HD2"),
        "CE1": (137.238155, 2.346656, 0.840141, "HE1"),
        "CG" : (132.303333, 3.787014, 0.992253, None),
        "H"  : (8.262735, 0.688922, 0.100704, "N"),
        "HA" : (4.627289, 0.467128, 0.186620, "CA"),
        "HB2": (3.110051, 0.349972, 0.261972, "CB"),
        "HB3": (3.057166, 0.355392, 0.291549, "CB"),
        "HD1": (8.265318, 2.399673, 0.943662, "ND1"),
        "HD2": (7.037220, 0.429625, 0.454930, "CD2"),
        "HE1": (8.013054, 0.520085, 0.517606, "CE1"),
        "HE2": (9.920824, 2.588519, 0.981690, "NE2"),
        "N"  : (119.109795, 4.192319, 0.235211, "H"),
        "ND1": (195.537424, 37.008263, 0.947183, "HD1"),
        "NE2": (177.698489, 16.243778, 0.952817, "HE2"),
        },
    ('protein', 'Ile') : {
        "C"   : (175.864340, 1.828210, 0.422507, None),
        "CA"  : (61.705518, 2.600251, 0.188136, "HA"),
        "CB"  : (38.854432, 1.944834, 0.252690, "HB"),
        "CD1" : (13.584336, 1.626539, 0.482408, "HD1*"),
        "CG1" : (27.780248, 1.684249, 0.515848, "HG13"),
        "CG2" : (17.636566, 1.237755, 0.469613, "HG2*"),
        "H"   : (8.308229, 0.703239, 0.032858, "N"),
        "HA"  : (4.224109, 0.580233, 0.132306, "CA"),
        "HB"  : (1.784957, 0.304183, 0.218377, "CB"),
        "HD1*": (0.685842, 0.282454, 0.294272, "CD1"),
        "HG12": (1.270219, 0.392634, 0.329165, "CG1"),
        "HG13": (1.216866, 0.399083, 0.364059, "CG1"),
        "HG2*": (0.789078, 0.258937, 0.281477, "CG2"),
        "N"   : (121.173088, 4.413042, 0.150044, "H"),
        },
    ('protein', 'Leu') : {
        "C"   : (177.056361, 1.919039, 0.431646, None),
        "CA"  : (55.752106, 2.056969, 0.191568, "HA"),
        "CB"  : (42.540153, 1.789209, 0.255424, "HB2"),
        "CD1" : (24.785797, 1.489808, 0.494267, "HD1*"),
        "CD2" : (24.227473, 1.601505, 0.521609, "HD2*"),
        "CG"  : (26.875515, 0.996005, 0.543658, "HG"),
        "H"   : (8.238177, 0.674706, 0.033163, "N"),
        "HA"  : (4.346146, 0.491254, 0.154877, "CA"),
        "HB2" : (1.616793, 0.329304, 0.261069, "CB"),
        "HB3" : (1.534532, 0.346944, 0.302699, "CB"),
        "HD1*": (0.755018, 0.268818, 0.295114, "CD1"),
        "HD2*": (0.728231, 0.285852, 0.323337, "CD2"),
        "HG"  : (1.509274, 0.327438, 0.357559, "CG"),
        "N"   : (121.455301, 4.025737, 0.146587, "H2"),
        },
    ('protein', 'Lys') : {
        "C"  : (176.775235, 1.906749, 0.452691, None),
        "CA" : (57.154802, 2.135196, 0.229246, "HA"),
        "CB" : (32.992345, 1.700470, 0.286857, "HB3"),
        "CD" : (29.036786, 0.920116, 0.588718, "HD2"),
        "CE" : (41.966911, 0.532535, 0.611722, "HE2"),
        "CG" : (25.002996, 0.954247, 0.555511, "HG3"),
        "H"  : (8.190187, 0.626470, 0.036807, "N"),
        "HA" : (4.283917, 0.449233, 0.134827, "CA"),
        "HB2": (1.788608, 0.231602, 0.230646, "CB"),
        "HB3": (1.765164, 0.240192, 0.286457, "CB"),
        "HD2": (1.606115, 0.230201, 0.433887, "CD"),
        "HD3": (1.597734, 0.235158, 0.490898, "CD"),
        "HE2": (2.924529, 0.170173, 0.447289, "CE"),
        "HE3": (2.919496, 0.175103, 0.516303, "CE"),
        "HG2": (1.380507, 0.240617, 0.340068, "CG"),
        "HG3": (1.370381, 0.245205, 0.400480, "CG"),
        "HZ*": (7.483198, 0.367244, 0.962393, "NZ"),
        "N"  : (120.486754, 3.843313, 0.179236, "H"),
        "NZ" : (48.671163, 36.303503, 0.998200, "HZ*"),
        },
    ('protein', 'Met') : {
        "C"  : (176.273626, 2.049059, 0.449133, None),
        "CA" : (56.283714, 2.173536, 0.205727, "HA"),
        "CB" : (33.259592, 2.202185, 0.269028, "HB2"),
        "CE" : (17.113516, 1.142950, 0.691786, "HE*"),
        "CG" : (32.098797, 1.057559, 0.567445, "HG3"),
        "H"  : (8.266266, 0.624249, 0.094951, "N"),
        "HA" : (4.423979, 0.489446, 0.147702, "CA"),
        "HB2": (2.033462, 0.323993, 0.271289, "CB"),
        "HB3": (2.006533, 0.336088, 0.324039, "CB"),
        "HE*": (1.870402, 0.366566, 0.595328, "CE"),
        "HG2": (2.416216, 0.370668, 0.368500, "CG"),
        "HG3": (2.376700, 0.421305, 0.403919, "CG"),
        "N"  : (119.593417, 3.693716, 0.196684, "H2"),
        },
    ('protein', 'Phe') : {
        "C"  : (175.565521, 1.948348, 0.445665, None),
        "CA" : (58.305101, 2.551980, 0.217338, "HA"),
        "CB" : (40.181708, 1.952764, 0.279609, "HB2"),
        "CD1": (131.626637, 1.261599, 0.746439, "HD1"),
        "CD2": (131.620012, 1.257999, 0.828246, "HD2"),
        "CE1": (130.696973, 1.484393, 0.782662, "HE1"),
        "CE2": (130.731140, 1.294712, 0.853480, "HE2"),
        "CD*": (131.620012, 1.257999, 0.828246, "HD*"),
        "CE*": (130.696973, 1.484393, 0.782662, "HE*"),
        "CG" : (137.410725, 3.251523, 0.994302, None),
        "CZ" : (129.222986, 1.622311, 0.835165, "HZ"),
        "H"  : (8.382242, 0.749659, 0.058201, "N"),
        "HA" : (4.648786, 0.588854, 0.160765, "CA"),
        "HB2": (2.998135, 0.362851, 0.245828, "CB"),
        "HB3": (2.952455, 0.375443, 0.268620, "CB"),
        "HD1": (7.058833, 0.306262, 0.365893, "CD1"),
        "HD2": (7.065550, 0.304958, 0.494506, "CD2"),
        "HE1": (7.081438, 0.306502, 0.435083, "CE1"),
        "HE2": (7.084839, 0.303505, 0.541718, "CE2"),
        "HD*": (7.065550, 0.304958, 0.494506, "CD*"),
        "HE*": (7.081438, 0.306502, 0.435083, "CE*"),
        "HZ" : (6.996208, 0.413472, 0.582418, "CZ"),
        "N"  : (120.085399, 4.232138, 0.169312, "H3"),
        },
    ('protein', 'Pro') : {
        "C"  : (176.780325, 1.499687, 0.483755, None),
        "CA" : (63.539193, 1.404973, 0.228159, "HA"),
        "CB" : (31.977618, 1.011604, 0.292058, "HB2"),
        "CD" : (50.448136, 0.757834, 0.560289, "HD2"),
        "CG" : (27.325400, 0.926700, 0.575090, "HG3"),
        "HA" : (4.408505, 0.329217, 0.180505, "CA"),
        "HB2": (2.077811, 0.327969, 0.257040, "CB"),
        "HB3": (2.029493, 0.322236, 0.275812, "CB"),
        "HD2": (3.665299, 0.317100, 0.331408, "CD"),
        "HD3": (3.643425, 0.338100, 0.362816, "CD"),
        "HG2": (1.944234, 0.266628, 0.361372, "CG"),
        "HG3": (1.920546, 0.282568, 0.404693, "CG"),
        "N"  : (131.085662, 9.209093, 0.979783, "H3"),
        },
    ('protein', 'Ser') : {
        "C"  : (174.683288, 1.659248, 0.464592, None),
        "CA" : (58.850778, 2.018005, 0.217349, "HA"),
        "CB" : (63.993613, 1.384288, 0.300172, "HB3"),
        "H"  : (8.290302, 0.607958, 0.058564, "N"),
        "HA" : (4.513436, 0.420120, 0.147023, "CA"),
        "HB2": (3.884335, 0.239598, 0.248714, "CB"),
        "HB3": (3.861902, 0.250070, 0.313159, "CB"),
        "HG" : (5.575657, 1.195882, 0.988238, "OG"),
        "N"  : (115.813074, 3.723294, 0.185739, "H3"),
        },
    ('protein', 'Thr') : {
        "C"   : (174.631688, 1.707400, 0.462333, None),
        "CA"  : (62.366001, 2.580251, 0.224368, "HA"),
        "CB"  : (69.886568, 1.524338, 0.292086, "HB"),
        "CG2" : (21.649765, 0.958760, 0.519445, "HG2*"),
        "H"   : (8.264453, 0.646697, 0.039162, "N"),
        "HA"  : (4.487641, 0.495678, 0.153658, "CA"),
        "HB"  : (4.167755, 0.343464, 0.253196, "CB"),
        "HG1" : (5.132804, 1.738798, 0.975524, "OG1"),
        "HG2*": (1.144699, 0.201156, 0.278216, "CG2"),
        "N"   : (115.081474, 4.991324, 0.158009, "H"),
        },
    ('protein', 'Trp') : {
        "C"  : (176.218157, 1.939642, 0.518771, None),
        "CA" : (57.833198, 2.507920, 0.324232, "HA"),
        "CB" : (30.290294, 1.875868, 0.369738, "HB2"),
        "CD1": (126.600126, 1.837343, 0.715586, "HD1"),
        "CD2": (128.065000, 2.873504, 0.990899, None),
        "CE2": (137.984074, 9.322232, 0.980660, None),
        "CE3": (120.528840, 1.591558, 0.786121, "HE3"),
        "CG" : (111.433182, 0.972309, 0.987486, None),
        "CH2": (123.902050, 1.477779, 0.773606, "HH2"),
        "CZ2": (114.407511, 1.382949, 0.754266, "HZ2"),
        "CZ3": (121.530652, 1.444072, 0.781570, "HZ3"),
        "H"  : (8.287413, 0.814690, 0.110353, "N"),
        "HA" : (4.708245, 0.556373, 0.196815, "CA"),
        "HB2": (3.173023, 0.343723, 0.266212, "CB"),
        "HB3": (3.134429, 0.352943, 0.299204, "CB"),
        "HD1": (7.138713, 0.340667, 0.357224, "CD1"),
        "HE1": (10.116375, 0.537535, 0.356086, "NE1"),
        "HE3": (7.305717, 0.382677, 0.411832, "CE3"),
        "HH2": (6.964333, 0.346475, 0.420933, "CH2"),
        "HZ2": (7.286362, 0.324283, 0.379977, "CZ2"),
        "HZ3": (6.871129, 0.368175, 0.431172, "CZ3"),
        "N"  : (121.285471, 4.389069, 0.271900, "H2"),
        "NE1": (129.398739, 2.021644, 0.547213, "HE1"),
        },
    ('protein', 'Tyr') : {
        "C"  : (175.491657, 1.900925, 0.487770, None),
        "CA" : (58.281804, 2.475766, 0.264346, "HA"),
        "CB" : (39.616805, 2.081812, 0.337723, "HB3"),
        "CD1": (132.791984, 1.567221, 0.745532, "HD1"),
        "CD2": (132.624189, 2.080215, 0.835842, "HD2"),
        "CE1": (118.057560, 1.375306, 0.740828, "HE1"),
        "CE2": (118.004726, 1.152713, 0.833960, "HE2"),
        "CD*": (132.624189, 2.080215, 0.835842, "HD*"),
        "CE*": (118.057560, 1.375306, 0.740828, "HE*"),
        "CG" : (129.985070, 2.882228, 0.988241, None),
        "CZ" : (157.562708, 1.371867, 0.992004, None),
        "H"  : (8.325909, 0.750383, 0.062088, "N"),
        "HA" : (4.645483, 0.583003, 0.161336, "CA"),
        "HB2": (2.903203, 0.371810, 0.248824, "CB"),
        "HB3": (2.849544, 0.377313, 0.265757, "CB"),
        "HD1": (6.937622, 0.279576, 0.331609, "CD1"),
        "HD2": (6.935375, 0.282040, 0.447789, "CD2"),
        "HE1": (6.704741, 0.220055, 0.355597, "CE1"),
        "HE2": (6.705084, 0.218493, 0.468956, "CE2"),
        "HD*": (6.935375, 0.282040, 0.447789, "CD*"),
        "HE*": (6.704741, 0.220055, 0.355597, "CE*"),
        "HH" : (9.149362, 1.563879, 0.984478, "OH"),
        "N"  : (120.200602, 4.345637, 0.224365, "H"),
        },
    ('protein', 'Val') : {
        "C"   : (175.732054, 1.819574, 0.437803, None),
        "CA"  : (62.631900, 2.794401, 0.199169, "HA"),
        "CB"  : (32.935144, 1.675944, 0.260559, "HB"),
        "CG1" : (21.586062, 1.243471, 0.477960, "HG1*"),
        "CG2" : (21.437030, 1.431185, 0.509808, "HG2*"),
        "H"   : (8.307943, 0.704303, 0.039465, "N"),
        "HA"  : (4.204042, 0.596150, 0.143780, "CA"),
        "HB"  : (1.992687, 0.286555, 0.228479, "CB"),
        "HG1*": (0.832713, 0.242229, 0.252943, "CG1"),
        "HG2*": (0.810046, 0.266148, 0.275790, "CG2"),
        "N"   : (120.724161, 4.673536, 0.144242, "H"),
        }}

# Same keys as REFDB_SD_MEAN, but each inner dict is flattened into a list of
# (atom name, stats tuple) pairs in the inner dict's iteration order.
REFDB_SD_MEAN_LIST = {
    residueKey: list(atomTable.items())
    for residueKey, atomTable in REFDB_SD_MEAN.items()
    }

"""
Below values from ftp://ftp.cbs.cnrs.fr/pub/RESCUE2

For reference see:

J Biomol NMR. 2004 Sep;30(1):47-60
From NMR chemical shifts to amino acid types: investigation of the predictive power
carried by nuclei.
Marin A, Malliavin TE, Nicolas P, Delsuc MA.
"""

# Flat list of (residue code, atom name, fraction) triples, grouped by residue.
# Going by the name, the fraction is presumably the share of cases in which no shift is
# available for that atom - TODO confirm against the RESCUE2 data files.
# Atom names carry no wildcard suffix here (e.g. 'HB' for Ala, 'HZ' for Lys), whereas the
# REFDB_SD_MEAN keys for the same atoms do ("HB*", "HZ*").
RESCUE2_STATS_MISSING = [('Ala', 'H', 0.033675),
                         ('Ala', 'HA', 0.146317),
                         ('Ala', 'HB', 0.212091),
                         ('Ala', 'C', 0.452147),
                         ('Ala', 'CA', 0.210713),
                         ('Ala', 'CB', 0.258369),
                         ('Ala', 'N', 0.159512),
                         ('Arg', 'H', 0.038322),
                         ('Arg', 'HA', 0.133977),
                         ('Arg', 'HB2', 0.246530),
                         ('Arg', 'HB3', 0.301147),
                         ('Arg', 'HG2', 0.342788),
                         ('Arg', 'HG3', 0.396198),
                         ('Arg', 'HD2', 0.361798),
                         ('Arg', 'HD3', 0.419433),
                         ('Arg', 'HE', 0.718769),
                         ('Arg', 'HH11', 0.968618),
                         ('Arg', 'HH12', 0.973446),
                         ('Arg', 'HH21', 0.971032),
                         ('Arg', 'HH22', 0.971636),
                         ('Arg', 'C', 0.496077),
                         ('Arg', 'CA', 0.236270),
                         ('Arg', 'CB', 0.317743),
                         ('Arg', 'CG', 0.581473),
                         ('Arg', 'CD', 0.568196),
                         ('Arg', 'CZ', 0.982800),
                         ('Arg', 'N', 0.208509),
                         ('Arg', 'NE', 0.865419),
                         ('Arg', 'NH1', 0.993060),
                         ('Arg', 'NH2', 0.993060),
                         ('Asp', 'H', 0.036198),
                         ('Asp', 'HA', 0.133594),
                         ('Asp', 'HB2', 0.220312),
                         ('Asp', 'HB3', 0.255729),
                         ('Asp', 'C', 0.428646),
                         ('Asp', 'CA', 0.195573),
                         ('Asp', 'CB', 0.253906),
                         ('Asp', 'CG', 0.978906),
                         ('Asp', 'N', 0.146615),
                         ('Asn', 'H', 0.049966),
                         ('Asn', 'HA', 0.153253),
                         ('Asn', 'HB2', 0.232059),
                         ('Asn', 'HB3', 0.260899),
                         ('Asn', 'HD21', 0.462441),
                         ('Asn', 'HD22', 0.466465),
                         ('Asn', 'C', 0.461435),
                         ('Asn', 'CA', 0.233736),
                         ('Asn', 'CB', 0.291751),
                         ('Asn', 'CG', 0.911804),
                         ('Asn', 'N', 0.192153),
                         ('Asn', 'ND2', 0.616030),
                         ('Cys', 'H', 0.061519),
                         ('Cys', 'HA', 0.084118),
                         ('Cys', 'HB2', 0.123038),
                         ('Cys', 'HB3', 0.140615),
                         ('Cys', 'HG', 0.998117),
                         ('Cys', 'C', 0.690521),
                         ('Cys', 'CA', 0.495920),
                         ('Cys', 'CB', 0.549906),
                         ('Cys', 'N', 0.438795),
                         ('Glu', 'H', 0.030563),
                         ('Glu', 'HA', 0.131673),
                         ('Glu', 'HB2', 0.230061),
                         ('Glu', 'HB3', 0.284069),
                         ('Glu', 'HG2', 0.311911),
                         ('Glu', 'HG3', 0.364455),
                         ('Glu', 'C', 0.413230),
                         ('Glu', 'CA', 0.179611),
                         ('Glu', 'CB', 0.241365),
                         ('Glu', 'CG', 0.497593),
                         ('Glu', 'CD', 0.982834),
                         ('Glu', 'N', 0.139837),
                         ('Gln', 'H', 0.037790),
                         ('Gln', 'HA', 0.145811),
                         ('Gln', 'HB2', 0.253476),
                         ('Gln', 'HB3', 0.301604),
                         ('Gln', 'HG2', 0.313369),
                         ('Gln', 'HG3', 0.378610),
                         ('Gln', 'HE21', 0.512656),
                         ('Gln', 'HE22', 0.512656),
                         ('Gln', 'C', 0.433155),
                         ('Gln', 'CA', 0.192157),
                         ('Gln', 'CB', 0.260606),
                         ('Gln', 'CG', 0.516221),
                         ('Gln', 'CD', 0.926916),
                         ('Gln', 'N', 0.153298),
                         ('Gln', 'NE2', 0.612478),
                         ('Gly', 'H', 0.044840),
                         ('Gly', 'HA2', 0.150491),
                         ('Gly', 'HA3', 0.191237),
                         ('Gly', 'C', 0.473382),
                         ('Gly', 'CA', 0.235463),
                         ('Gly', 'N', 0.190418),
                         ('His', 'H', 0.100704),
                         ('His', 'HA', 0.186620),
                         ('His', 'HB2', 0.261972),
                         ('His', 'HB3', 0.291549),
                         ('His', 'HD1', 0.943662),
                         ('His', 'HD2', 0.454930),
                         ('His', 'HE1', 0.517606),
                         ('His', 'HE2', 0.981690),
                         ('His', 'C', 0.514085),
                         ('His', 'CA', 0.279577),
                         ('His', 'CB', 0.331690),
                         ('His', 'CG', 0.992253),
                         ('His', 'CD2', 0.787324),
                         ('His', 'CE1', 0.840141),
                         ('His', 'N', 0.235211),
                         ('His', 'ND1', 0.947183),
                         ('His', 'NE2', 0.952817),
                         ('Ile', 'H', 0.032858),
                         ('Ile', 'HA', 0.132306),
                         ('Ile', 'HB', 0.218377),
                         ('Ile', 'HG12', 0.329165),
                         ('Ile', 'HG13', 0.364059),
                         ('Ile', 'HG2', 0.281477),
                         ('Ile', 'HD1', 0.294272),
                         ('Ile', 'C', 0.422507),
                         ('Ile', 'CA', 0.188136),
                         ('Ile', 'CB', 0.252690),
                         ('Ile', 'CG1', 0.515848),
                         ('Ile', 'CG2', 0.469613),
                         ('Ile', 'CD1', 0.482408),
                         ('Ile', 'N', 0.150044),
                         ('Leu', 'H', 0.033163),
                         ('Leu', 'HA', 0.154877),
                         ('Leu', 'HB2', 0.261069),
                         ('Leu', 'HB3', 0.302699),
                         ('Leu', 'HG', 0.357559),
                         ('Leu', 'HD1', 0.295114),
                         ('Leu', 'HD2', 0.323337),
                         ('Leu', 'C', 0.431646),
                         ('Leu', 'CA', 0.191568),
                         ('Leu', 'CB', 0.255424),
                         ('Leu', 'CG', 0.543658),
                         ('Leu', 'CD1', 0.494267),
                         ('Leu', 'CD2', 0.521609),
                         ('Leu', 'N', 0.146587),
                         ('Lys', 'H', 0.036807),
                         ('Lys', 'HA', 0.134827),
                         ('Lys', 'HB2', 0.230646),
                         ('Lys', 'HB3', 0.286457),
                         ('Lys', 'HG2', 0.340068),
                         ('Lys', 'HG3', 0.400480),
                         ('Lys', 'HD2', 0.433887),
                         ('Lys', 'HD3', 0.490898),
                         ('Lys', 'HE2', 0.447289),
                         ('Lys', 'HE3', 0.516303),
                         ('Lys', 'HZ', 0.962393),
                         ('Lys', 'C', 0.452691),
                         ('Lys', 'CA', 0.229246),
                         ('Lys', 'CB', 0.286857),
                         ('Lys', 'CG', 0.555511),
                         ('Lys', 'CD', 0.588718),
                         ('Lys', 'CE', 0.611722),
                         ('Lys', 'N', 0.179236),
                         ('Lys', 'NZ', 0.998200),
                         ('Met', 'H', 0.094951),
                         ('Met', 'HA', 0.147702),
                         ('Met', 'HB2', 0.271289),
                         ('Met', 'HB3', 0.324039),
                         ('Met', 'HG2', 0.368500),
                         ('Met', 'HG3', 0.403919),
                         ('Met', 'HE', 0.595328),
                         ('Met', 'C', 0.449133),
                         ('Met', 'CA', 0.205727),
                         ('Met', 'CB', 0.269028),
                         ('Met', 'CG', 0.567445),
                         ('Met', 'CE', 0.691786),
                         ('Met', 'N', 0.196684),
                         ('Phe', 'H', 0.058201),
                         ('Phe', 'HA', 0.160765),
                         ('Phe', 'HB2', 0.245828),
                         ('Phe', 'HB3', 0.268620),
                         ('Phe', 'HD1', 0.365893),
                         ('Phe', 'HD2', 0.494506),
                         ('Phe', 'HE1', 0.435083),
                         ('Phe', 'HE2', 0.541718),
                         ('Phe', 'HZ', 0.582418),
                         ('Phe', 'C', 0.445665),
                         ('Phe', 'CA', 0.217338),
                         ('Phe', 'CB', 0.279609),
                         ('Phe', 'CG', 0.994302),
                         ('Phe', 'CD1', 0.746439),
                         ('Phe', 'CD2', 0.828246),
                         ('Phe', 'CE1', 0.782662),
                         ('Phe', 'CE2', 0.853480),
                         ('Phe', 'CZ', 0.835165),
                         ('Phe', 'N', 0.169312),
                         ('Pro', 'HA', 0.180505),
                         ('Pro', 'HB2', 0.257040),
                         ('Pro', 'HB3', 0.275812),
                         ('Pro', 'HG2', 0.361372),
                         ('Pro', 'HG3', 0.404693),
                         ('Pro', 'HD2', 0.331408),
                         ('Pro', 'HD3', 0.362816),
                         ('Pro', 'C', 0.483755),
                         ('Pro', 'CA', 0.228159),
                         ('Pro', 'CB', 0.292058),
                         ('Pro', 'CG', 0.575090),
                         ('Pro', 'CD', 0.560289),
                         ('Pro', 'N', 0.979783),
                         ('Ser', 'H', 0.058564),
                         ('Ser', 'HA', 0.147023),
                         ('Ser', 'HB2', 0.248714),
                         ('Ser', 'HB3', 0.313159),
                         ('Ser', 'HG', 0.988238),
                         ('Ser', 'C', 0.464592),
                         ('Ser', 'CA', 0.217349),
                         ('Ser', 'CB', 0.300172),
                         ('Ser', 'N', 0.185739),
                         ('Thr', 'H', 0.039162),
                         ('Thr', 'HA', 0.153658),
                         ('Thr', 'HB', 0.253196),
                         ('Thr', 'HG1', 0.975524),
                         ('Thr', 'HG2', 0.278216),
                         ('Thr', 'C', 0.462333),
                         ('Thr', 'CA', 0.224368),
                         ('Thr', 'CB', 0.292086),
                         ('Thr', 'CG2', 0.519445),
                         ('Thr', 'N', 0.158009),
                         ('Trp', 'H', 0.110353),
                         ('Trp', 'HA', 0.196815),
                         ('Trp', 'HB2', 0.266212),
                         ('Trp', 'HB3', 0.299204),
                         ('Trp', 'HD1', 0.357224),
                         ('Trp', 'HE1', 0.356086),
                         ('Trp', 'HE3', 0.411832),
                         ('Trp', 'HZ2', 0.379977),
                         ('Trp', 'HZ3', 0.431172),
                         ('Trp', 'HH2', 0.420933),
                         ('Trp', 'C', 0.518771),
                         ('Trp', 'CA', 0.324232),
                         ('Trp', 'CB', 0.369738),
                         ('Trp', 'CG', 0.987486),
                         ('Trp', 'CD1', 0.715586),
                         ('Trp', 'CD2', 0.990899),
                         ('Trp', 'CE2', 0.980660),
                         ('Trp', 'CE3', 0.786121),
                         ('Trp', 'CZ2', 0.754266),
                         ('Trp', 'CZ3', 0.781570),
                         ('Trp', 'CH2', 0.773606),
                         ('Trp', 'N', 0.271900),
                         ('Trp', 'NE1', 0.547213),
                         ('Tyr', 'H', 0.062088),
                         ('Tyr', 'HA', 0.161336),
                         ('Tyr', 'HB2', 0.248824),
                         ('Tyr', 'HB3', 0.265757),
                         ('Tyr', 'HD1', 0.331609),
                         ('Tyr', 'HD2', 0.447789),
                         ('Tyr', 'HE1', 0.355597),
                         ('Tyr', 'HE2', 0.468956),
                         ('Tyr', 'HH', 0.984478),
                         ('Tyr', 'C', 0.487770),
                         ('Tyr', 'CA', 0.264346),
                         ('Tyr', 'CB', 0.337723),
                         ('Tyr', 'CG', 0.988241),
                         ('Tyr', 'CD1', 0.745532),
                         ('Tyr', 'CD2', 0.835842),
                         ('Tyr', 'CE1', 0.740828),
                         ('Tyr', 'CE2', 0.833960),
                         ('Tyr', 'CZ', 0.992004),
                         ('Tyr', 'N', 0.224365),
                         ('Val', 'H', 0.039465),
                         ('Val', 'HA', 0.143780),
                         ('Val', 'HB', 0.228479),
                         ('Val', 'HG1', 0.252943),
                         ('Val', 'HG2', 0.275790),
                         ('Val', 'C', 0.437803),
                         ('Val', 'CA', 0.199169),
                         ('Val', 'CB', 0.260559),
                         ('Val', 'CG1', 0.477960),
                         ('Val', 'CG2', 0.509808),
                         ('Val', 'N', 0.144242)]

# Lookup keyed by upper-case three-letter residue code.
# Most values are identical to their key; four are not: 'CYS', 'GLY', 'LYS' and
# 'TYR' map to values with a lower-case 'y' ('CyS', 'GLy', 'LyS', 'TyR').
# NOTE(review): the lower-case 'y' looks like the residue of a global Y -> y
# search-and-replace rather than intended data - confirm against the consumers
# of this table before relying on (or correcting) these four values.
PROTEIN_ATOM_NAMES_UPPER = {
    'ALA': 'ALA',
    'ARG': 'ARG',
    'ASN': 'ASN',
    'ASP': 'ASP',
    'CYS': 'CyS',
    'GLN': 'GLN',
    'GLU': 'GLU',
    'GLY': 'GLy',
    'HIS': 'HIS',
    'ILE': 'ILE',
    'LEU': 'LEU',
    'LYS': 'LyS',
    'MET': 'MET',
    'PHE': 'PHE',
    'PRO': 'PRO',
    'SER': 'SER',
    'THR': 'THR',
    'TRP': 'TRP',
    'TYR': 'TyR',
    'VAL': 'VAL'
    }

# Atom-name lists per residue type, keyed by upper-case three-letter residue code.
# Each list starts with the backbone names ('H', 'N', 'C', 'CA', ...) followed by
# the side-chain names. Several atoms appear under more than one spelling, e.g.
# 'HBx'/'HBy' next to 'HB2'/'HB3', and names ending in '%' such as 'HB%'.
# NOTE(review): presumably the 'x'/'y' forms are the non-stereospecific
# alternatives and '%' marks a group of equivalent protons - confirm against the
# naming convention used by the callers.
PROTEIN_ATOM_NAMES = {
    'ALA': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HB%'],
    'ARG': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HBx', 'HBy', 'HB2', 'HB3', 'CG', 'HGx', 'HGy',
            'HG2', 'HG3', 'CD', 'HDx', 'HDy', 'HD2', 'HD3', 'NE', 'HE', 'CZ', 'NHx', 'NHy',
            'NH1', 'NH2', 'HH1x', 'HH1y', 'HH11', 'HH12', 'HH2x', 'HH2y', 'HH21', 'HH22'],
    'ASN': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HBx', 'HBy', 'HB2', 'HB3', 'CG', 'ND2',
            'HD2x', 'HD2y', 'HD21', 'HD22'],
    'ASP': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HBx', 'HBy', 'HB2', 'HB3', 'CG'],
    'CYS': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HBx', 'HBy', 'HB2', 'HB3', 'HG'],
    'GLN': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HBx', 'HBy', 'HB2', 'HB3', 'CG', 'HGx', 'HGy',
            'HG2', 'HG3', 'CD', 'NE2', 'HE2x', 'HE2y', 'HE21', 'HE22'],
    'GLU': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HBx', 'HBy', 'HB2', 'HB3', 'CG', 'HGx', 'HGy',
            'HG2', 'HG3', 'CD'],
    'GLY': ['H', 'N', 'C', 'CA', 'HAx', 'HAy', 'HA2', 'HA3'],
    'HIS': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HBx', 'HBy', 'HB2', 'HB3', 'CG', 'ND1', 'HD1',
            'CD2', 'HD2', 'CE1', 'HE1', 'NE2', 'HE2'],
    'ILE': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HB', 'CG1', 'HG1x', 'HG1y',
            'HG12', 'HG13', 'CG2', 'HG2%', 'CD1', 'HD1%'],
    'LEU': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HBx', 'HBy', 'HB2', 'HB3', 'CG', 'HG', 'CDx',
            'CDy', 'CD1', 'CD2', 'HDx%', 'HDy%', 'HD1%', 'HD2%'],
    'LYS': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HBx', 'HBy', 'HB2', 'HB3', 'CG', 'HGx', 'HGy',
            'HG2', 'HG3', 'CD', 'HDx', 'HDy', 'HD2', 'HD3', 'CE', 'HEx', 'HEy', 'HE2', 'HE3',
            'NZ', 'HZ%'],
    'MET': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HBx', 'HBy', 'HB2', 'HB3', 'CG', 'HGx', 'HGy',
            'HG2', 'HG3', 'CE', 'HE%'],
    'PHE': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HBx', 'HBy', 'HB2', 'HB3', 'CG', 'CDx', 'CDy',
            'CD1', 'CD2', 'HDx', 'HDy', 'HD1', 'HD2', 'CEx', 'CEy', 'CE1', 'CE2', 'HEx', 'HEy',
            'HE1', 'HE2', 'CZ', 'HZ'],
    'PRO': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HBx', 'HBy', 'HB2', 'HB3', 'CG', 'HGx', 'HGy',
            'HG2', 'HG3', 'CD', 'HDx', 'HDy', 'HD2', 'HD3'],
    'SER': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HBx', 'HBy', 'HB2', 'HB3', 'HG'],
    'THR': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HB', 'CG2', 'HG1', 'HG2%'],
    'TRP': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HBx', 'HBy', 'HB2', 'HB3', 'CG', 'CD1', 'CD2', 'HD1',
            'NE1', 'HE1', 'CE2', 'CE3', 'HE3', 'CZ2', 'CZ3', 'HZ2', 'HZ3', 'CH2', 'HH2'],
    'TYR': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HBx', 'HBy', 'HB2', 'HB3', 'CG', 'CDx', 'CDy', 'CD1',
            'CD2', 'HDx', 'HDy', 'HD1', 'HD2', 'CEx', 'CEy', 'CE1', 'CE2', 'HEx', 'HEy', 'HE1', 'HE2',
            'CZ', 'HH'],
    'VAL': ['H', 'N', 'C', 'CA', 'HA', 'CB', 'HB', 'CGx', 'CGy', 'CG1', 'CG2',
            'HGx%', 'HGy%', 'HG1%', 'HG2%']
    }

# Side-chain atom names grouped by position label ('alphas', 'betas', 'gammas', ...).
# Each position has a main group and, for most, a 'more...' overflow group holding
# further names for the same position.
# NOTE(review): the split between a group and its 'more...' counterpart looks like
# a display/layout grouping - confirm against the code that consumes this table.
ALL_ATOMS_SORTED = {'alphas'      : ['CA', 'HA', 'HAx', 'HAy', 'HA2', 'HA3'],
                    'betas'       : ['CB', 'HB', 'HBx', 'HBy', 'HB%', 'HB2', 'HB3'],
                    'gammas'      : ['CG', 'CGx', 'CGy', 'CG1', 'CG2', 'HG', 'HGx', 'HGy', 'HG2', 'HG3', 'HGx%', 'HGy%'],
                    'moreGammas'  : ['HG1', 'HG1x', 'HG1y', 'HG12', 'HG13', 'HG1%', 'HG2%'],
                    'deltas'      : ['CD', 'CDx', 'CDy', 'CD1', 'CD2', 'HDx', 'HDy', 'HD1', 'HD2', 'HD3', 'HDx%', 'HDy%'],
                    'moreDeltas'  : ['ND1', 'ND2', 'HD1%', 'HD2%', 'HD2x', 'HD2y', 'HD21', 'HD22'],
                    'epsilons'    : ['CE', 'CEx', 'CEy', 'CE1', 'CE2', 'HE', 'HEx', 'HEy', 'HE1', 'HE2', 'HE3'],
                    'moreEpsilons': ['CE3', 'NE', 'NE1', 'NE2', 'HE2x', 'HE2y', 'HE21', 'HE22', 'HE%'],
                    'zetas'       : ['CZ', 'CZ2', 'CZ3', 'HZ', 'HZ2', 'HZ3', 'HZ%', 'NZ'],
                    'etas'        : ['CH2', 'HH2', 'HH1x', 'HH1y', 'HH2x', 'HH2y', 'NH1', 'NH2', 'NHx', 'NHy', 'HH21', 'HH22'],
                    'moreEtas'    : ['HH', 'HH11', 'HH12']
                    }


# End moved-in data - functions from original ChemicalShift


def _getResidueProbability(ppms, ccpCode, elements, shiftNames=None, ppmsBound=None,
                           prior=0.05, molType=PROTEIN_MOLTYPE, cutoff=1e-10):
    """Probability that data match a given ccpCode and molType

    NBNB unassigned (unnamed) resonances make no differences, but named resonances
    that do not fit a residue type WILL GIVE PROBABILITY ZERO!

    :param ppms: sequence of chemical shift values, one per resonance.
    :param ccpCode: residue type code; looked up together with molType in the
        reference tables.
    :param elements: element symbol for each entry of ppms; only reference atoms
        whose name starts with that symbol are considered for the entry.
    :param shiftNames: optional assigned name for each entry of ppms; a falsy
        entry means "unassigned". If omitted or empty, all entries are unassigned.
    :param ppmsBound: accepted for interface compatibility; currently unused.
    :param prior: accepted for interface compatibility; currently unused.
    :param molType: molecule type used in the reference-table lookup.
    :param cutoff: accepted for interface compatibility; currently unused.
    :return: None if there is no reference data for (molType, ccpCode); 0.0 as soon
        as one entry of ppms matches no reference atom; otherwise the product of
        the normal-distribution densities of every (ppm, matching atom) pair
        (1.0 if ppms is empty).
    """

    # TODO: use refExperiment info
    # TODO: use bound resonances info

    # No reference statistics for this residue type -> cannot score it
    if not REFDB_SD_MEAN.get((molType, ccpCode)):
        return None

    if not shiftNames:
        shiftNames = [None] * len(ppms)

    # (atomName, stats) pairs for this residue type
    atomData = REFDB_SD_MEAN_LIST.get((molType, ccpCode))

    probTot = 1.0
    for i, ppm in enumerate(ppms):
        element = elements[i]
        shiftName = shiftNames[i]
        nMatched = 0

        # NOTE(review): every matching reference atom contributes a factor, so a shift
        # that matches several atoms multiplies all their densities - confirm intended.
        for atomName, stats in atomData:
            if not atomName.startswith(element):
                continue

            # a named resonance may only match atoms its name is compatible with
            if shiftName and not _isAssignmentCompatible(shiftName, atomName):
                continue

            mean, sd, _, _ = stats
            d = ppm - mean

            # unnamed resonances more than 5 standard deviations out are not a match
            if (not shiftName) and (abs(d) > 5 * sd):
                continue

            e = d / sd
            probTot *= exp(-0.5 * e * e) / (sd * ROOT_TWO_PI)
            nMatched += 1

        # this shift fits nothing in the residue type: the type is ruled out
        if nMatched == 0:
            return 0.0

    return probTot


def getSpinSystemChainProbabilities(spinSystem, chain, shiftList, resShifts):
    """Return {ccpCode: probability} for every distinct residue type in chain.

    The probability for each (ccpCode, molType) pair found in the chain comes from
    getSpinSystemResidueProbability, using the residue-type frequency in the chain
    as the prior. Results are keyed by ccpCode only.
    """
    priors = getChainResTypesPriors(chain)
    residueTypes = set(getChainResidueCodes(chain))
    return {
        ccpCode: getSpinSystemResidueProbability(spinSystem, shiftList, ccpCode,
                                                 priors[ccpCode], molType, resShifts)
        for ccpCode, molType in residueTypes
    }
def getChainResidueCodes(chain):
    """Return a list of (ccpCode, molType) tuples, one per residue of chain, in order.

    A 'Cys' residue whose descriptor is 'link:SG' is reported as 'Cyss'.
    """

    def _typeCode(residue):
        # descriptor is only consulted for Cys residues
        code = residue.ccpCode
        isLinkedSG = code == 'Cys' and residue.descriptor == 'link:SG'
        return 'Cyss' if isLinkedSG else code

    return [(_typeCode(residue), residue.molType) for residue in chain.residues]
def getSpinSystemScore(spinSystem, resShifts, chain, shiftList):
    """Return {ccpCode: score} for a spin system, scaled so the scores sum to 100.

    The raw values come from getSpinSystemChainProbabilities. Entries that are None
    (residue types for which no probability could be computed) are left as None and
    do not take part in the normalisation. If the result is empty, or the
    probabilities sum to zero, the dictionary is returned unscaled.
    """
    scores = getSpinSystemChainProbabilities(spinSystem, chain, shiftList, resShifts=resShifts)

    # ejb - error here, this was empty in nef file: 1nk2_docr_extended.ccpn.nef
    if not scores:
        return scores

    # None entries would make a plain sum() raise TypeError, so skip them
    total = sum(prob for prob in scores.values() if prob is not None)
    if total:
        scale = 100.0 / total
        for ccpCode, prob in scores.items():
            if prob is not None:
                scores[ccpCode] = prob * scale

    return scores
def getChainResTypesPriors(chain):
    """Return {ccpCode: fraction of the chain's residues with that code}.

    Codes are those reported by getChainResidueCodes. An empty chain gives {}.
    """
    codes = [code for code, _molType in getChainResidueCodes(chain)]
    nResidues = float(len(codes))
    return {code: codes.count(code) / nResidues for code in set(codes)}
def getCcpCodes(chain):
    """Return the sorted list of distinct ccpCodes of the residues in chain."""
    return sorted({residue.ccpCode for residue in chain.residues})
def getSpinSystemResidueProbability(spinSystem, shiftList, ccpCode, prior=0.05,
                                    molType=PROTEIN_MOLTYPE, resShifts=()):
    """Get probability that Spin system matches molType and ccpCode

    NB to avoid rejection all atom names must be either unassigned (default) or correct!

    :param spinSystem: not read here; the data come from resShifts.
    :param shiftList: not read here; the data come from resShifts.
    :param ccpCode: residue type code to test against.
    :param prior: passed through to _getResidueProbability.
    :param molType: molecule type to test against.
    :param resShifts: iterable of (resonance, shift) pairs. Pairs whose resonance
        has no isotope, or whose shift is None, are skipped.
    :return: the value returned by _getResidueProbability for the collected data.
    """
    ppms = []
    elements = []
    atomNames = []

    for resonance, shift in resShifts:
        isotope = resonance.isotope
        # A missing shift cannot be scored; skip it rather than fail on shift.value
        if not isotope or shift is None:
            continue

        ppms.append(shift.value)
        elements.append(isotope.chemElement.symbol)
        # NB, use implName to avoid default (unassigned) names.
        atomNames.append(resonance.implName)

    return _getResidueProbability(ppms, ccpCode, elements, atomNames, prior=prior, molType=molType)
def _isAssignmentCompatible(assignName: str, atomName: str) -> bool: """Is assignName compatible with assignment to atomName? NB allows for non-standard assignment strings NB does NOT do case conversions (nor should it - names are case-sensitive). NB does NOT accept 'x' and 'y' as wildcards, only 'x' and 'y'""" # convert pseudoAtom names to proton wildcard names if assignName[0] in 'QM': assignName = 'H' + assignName[1:] + '%' if assignName == atomName: return True # lenPrefix = len(os.path.commonprefix((assignName, atomName))) lenPrefix = 0 for c1, c2 in zip(assignName, atomName): if c1 != c2: break lenPrefix += 1 lenAtomName = len(atomName) if lenPrefix == 0: return False elif lenAtomName == lenPrefix: if assignName[lenPrefix:] in ('*', '%'): # E.g. assign HG* v. HG return True elif lenAtomName - lenPrefix == 1: if atomName[-1] in '123*%' and assignName[lenPrefix:] in ('', 'x', 'y', '*', '%'): # assigned wildcard v. wildcard or single digit, e.g. HGx v. HG* or HG1 return True elif lenAtomName - lenPrefix == 2: if atomName[-2] in '123' and atomName[-1] in '123*%' and assignName[lenPrefix:] in ('', 'x', 'y', '*', '%', 'x%', 'y%', 'x*', 'y*'): # E.g. HG, HG%, or HGy* v. HG21 or HG1* return True
def getAtomProbability(ccpCode, atomName, shiftValue, molType=PROTEIN_MOLTYPE):
    """Return the normal-distribution density of shiftValue for a reference atom.

    Mean and standard deviation are read from REFDB_SD_MEAN[(molType, ccpCode)][atomName].
    Returns None when there is no reference data for the residue type or the atom.
    """
    residueRefs = REFDB_SD_MEAN.get((molType, ccpCode))
    atomStats = residueRefs.get(atomName) if residueRefs else None
    if not atomStats:
        return None

    mean, sd, _pMissing, _bound = atomStats
    # deviation from the mean in units of standard deviation
    z = (shiftValue - mean) / sd
    return exp(-0.5 * z * z) / (sd * ROOT_TWO_PI)
def getResidueAtoms(ccpCode, molType=PROTEIN_MOLTYPE):
    """Return the atom names that have reference data for (molType, ccpCode).

    :return: the keys view of the reference entry, or an empty keys view when the
        residue type has no reference data (instead of failing on None).
    """
    shiftRefs = REFDB_SD_MEAN.get((molType, ccpCode))
    if not shiftRefs:
        return {}.keys()
    return shiftRefs.keys()
def getCcpCodeData(nmrProject, ccpCode, molType=None, atomType=None):
    """Return {atomName: chemAtomNmrRef} from the 'RefDB' reference data of a residue type.

    :param nmrProject: object whose root provides findFirstNmrReferenceStore.
    :param ccpCode: residue type code to look up.
    :param molType: molecule type to look up.
    :param atomType: falsy for all atoms, 'Hydrogen' for element 'H' only,
        'Heavy' for every element except 'H'. Any other value selects nothing.
    :return: dictionary of matching references; empty when the reference store,
        the 'RefDB' entry or its linking='any'/descriptor='any' variant is missing.
    """
    dataDict = {}
    sourceName = 'RefDB'

    nmrRefStore = nmrProject.root.findFirstNmrReferenceStore(molType=molType, ccpCode=ccpCode)
    # Guard the first lookup the same way as the two below
    if not nmrRefStore:
        return dataDict

    chemCompNmrRef = nmrRefStore.findFirstChemCompNmrRef(sourceName=sourceName)
    if not chemCompNmrRef:
        return dataDict

    chemCompVarNmrRef = chemCompNmrRef.findFirstChemCompVarNmrRef(linking='any', descriptor='any')
    if not chemCompVarNmrRef:
        return dataDict

    for chemAtomNmrRef in chemCompVarNmrRef.chemAtomNmrRefs:
        element = chemAtomNmrRef.findFirstChemAtom().elementSymbol
        isHydrogen = element == 'H'
        if (not atomType
                or (atomType == 'Hydrogen' and isHydrogen)
                or (atomType == 'Heavy' and not isHydrogen)):
            dataDict[chemAtomNmrRef.name] = chemAtomNmrRef

    return dataDict