Source code for ccpn.core.lib.CcpnStarIo

# Temporary Framework before fixing NEF style Reader


import pandas as pd
import os
import glob
import numpy as np
from collections import OrderedDict as od
from ccpn.util.isotopes import name2IsotopeCode
from ccpn.core.lib.AxisCodeLib import getAxisCodeMatchIndices
from ccpn.framework.PathsAndUrls import macroPath as mp
from ccpn.core.lib.ContextManagers import undoBlock
from ccpn.util.Logging import getLogger
from ccpn.core.lib.ContextManagers import notificationEchoBlocking
from ccpn.core.ChemicalShiftList import ChemicalShiftList

NMRSTARVersion = '3.2.1.32'
NMRSTARV3GROUPS = od([
                    ("citations",
                                (
                                "Citation",
                                "Citation_author",
                                "Citation_editor",
                                "Citation_keyword"
                                )),
                    ("Entry_information", (
                                "deposited_data_files",
                                "entry_information",
                                "entry_interview",
                                "study_list",
                                )),
                    ("Experimental_descriptions",
                                (
                                "applied_software",
                                "applied_software_history",
                                "chromatographic_column",
                                "chromatographic_system",
                                "computer",
                                "EMR_expt",
                                "EMR_instrument",
                                "experiment_list",
                                "fluorescence_instrument",
                                "FRET_expt",
                                "Mass_spec_ref_compd",
                                "Mass_spectrometer",
                                "Mass_spectrometer_list",
                                "method",
                                "molecule_purity",
                                "MS_expt",
                                "NMR_spectral_processing",
                                )),
                    ("Kinetic_data",
                                (
                                "auto_relaxation",
                                "chemical_rates",
                                "dipole_CSA_cross_correlations",
                                "dipole_dipole_cross_correlations",
                                "dipole_dipole_relaxation",
                                "heteronucl_NOEs",
                                "heteronucl_T1_relaxation",
                                "heteronucl_T1rho_relaxation",
                                "heteronucl_T2_relaxation",
                                "H_exch_protection_factors",
                                "H_exch_rates",
                                "homonucl_NOEs",
                                "theoretical_auto_relaxation",
                                "theoretical_dipole_dipole_cross_correlations",
                                "theoretical_heteronucl_NOEs",
                                "theoretical_heteronucl_T1_relaxation",
                                "theoretical_heteronucl_T2_relaxation",
                                )),
                    ("Molecular_assembly",
                                (
                                "assembly",
                                "assembly_annotation",
                                "assembly_subsystems",
                                "chem_comp",
                                "entity",
                                "experimental_source",
                                "natural_source",
                                )),
                    ("NMR_parameters",
                                (
                                "assigned_chemical_shifts",
                                "chem_shift_anisotropy",
                                "chem_shift_isotope_effect",
                                "chem_shift_perturbation",
                                "chem_shift_reference",
                                "chem_shifts_calc_type",
                                "coupling_constants",
                                "dipolar_couplings",
                                "other_data_types",
                                "RDCs",
                                "resonance_linker",
                                "spectral_density_values",
                                "spectral_peak_list",
                                "theoretical_chem_shifts",
                                "theoretical_coupling_constants",
                                )),
                    ("Structure",
                                (
                                "angular_order_parameters",
                                "bond_annotation",
                                "CA_CB_chem_shift_constraints",
                                "conformer_family_coord_set",
                                "conformer_statistics",
                                "constraint_statistics",
                                "deduced_hydrogen_bonds",
                                "deduced_secd_struct_features",
                                "distance_constraints",
                                "floating_chiral_stereo_assign",
                                "force_constants",
                                "general_distance_constraints",
                                "H_chem_shift_constraints",
                                "interatomic_distance",
                                "J_three_bond_constraints",
                                "MS_chromatogram",
                                "MS_MZ_data",
                                "org_constr_file_comment",
                                "other_constraints",
                                "other_struct_features",
                                "peak_constraint_links",
                                "RDC_constraints",
                                "representative_conformer",
                                "SAXS_constraints",
                                "secondary_structs",
                                "structure_annotation",
                                "structure_interactions",
                                "tensor",
                                "tertiary_struct_elements",
                                "torsion_angle_constraints"
                                )),
                    ("Thermodynamic_data",
                        (
                        "binding_data",
                        "binding_param_list",
                        "D_H_fractionation_factors",
                        "order_parameters",
                        "pH_param_list",
                        "pH_titration",
                    ))
                    ])



# Variable definitions from NmrStar 3.1
ID = 'ID'
Assembly_atom_ID = 'Assembly_atom_ID'
Entity_assembly_ID = 'Entity_assembly_ID'
Entity_ID = 'Entity_ID'
Comp_index_ID = 'Comp_index_ID'
Seq_ID = 'Seq_ID'
Comp_ID =  'Comp_ID'
Atom_ID = 'Atom_ID'
Atom_type = 'Atom_type'
Atom_isotope_number = 'Atom_isotope_number'
Val = 'Val'
Val_err = 'Val_err'
Assign_fig_of_merit = 'Assign_fig_of_merit'
Ambiguity_code = 'Ambiguity_code'
Occupancy = 'Occupancy'
Resonance_ID = 'Resonance_ID'
Auth_entity_assembly_ID = 'Auth_entity_assembly_ID'
Auth_asym_ID = 'Auth_asym_ID'
Auth_seq_ID = 'Auth_seq_ID'
Auth_comp_ID = 'Auth_comp_ID'
Auth_atom_ID = 'Auth_atom_ID'
Details = 'Details'
Entry_ID = 'Entry_ID'
Assigned_chem_shift_list_ID = 'Assigned_chem_shift_list_ID'

STOP = "stop_"
_Atom_chem_shift = "_Atom_chem_shift." # used to find the loop of interest in the BMRB and shorten the column header name on the DataFrame
Atom_chem_shift = _Atom_chem_shift+ID


def _getTitles(filePath):
    lines = []
    with open(filePath, "r") as bmrbFileText:
        for line in bmrbFileText:
            if '#' in line:
                lines.append(line)
                break
    return lines

def _getCSListLinesFromBmrbFile(filePath):
    # HardCoded bit to find just the Chemical Shifts, Replace with NEF Style
    lines = []
    with open (filePath, "r") as bmrbFileText:
        for line in bmrbFileText:
            if Atom_chem_shift in line:
                lines.append(line)
                break
        for line in bmrbFileText:
            if STOP in line:
                break
            lines.append(line)
    return lines

[docs]def makeCSLDataFrame(bmrbLines): ## create dataframe of interest using the same names as appear in the BMRB. # NB Columns are not always the same in number (E.G. fileA=10columns, fileB=12columns). table = [] columns = [] for i in [sub.splitlines() for sub in bmrbLines]: vv = [j.strip() for j in i if j] columns += [j for j in [i for i in vv if i.startswith('_')]] row = [j for i in [i.split(' ') for i in vv if not i.startswith('_')] for j in i if j] if len(row)>0: table.append(row) shortColumns = [c.replace(_Atom_chem_shift,'') for c in columns] return pd.DataFrame(table, columns=shortColumns)
def _connectNmrResidues(nmrChain): updatingNmrChain = None nrs = nmrChain.nmrResidues for i in range(len(nrs) - 1): currentItem, nextItem = nrs[i], nrs[i + 1] if currentItem and nextItem: # check that the sequence codes are consecutive if int(nextItem.sequenceCode) == int(currentItem.sequenceCode) + 1: updatingNmrChain = currentItem.connectNext(nextItem, ) return updatingNmrChain def _isfloat(value): try: float(value) return True except ValueError: return False def _isValidValue(row, header): if row[header] != '.': return True else: return False
[docs]def makeCSLfromDF(project, df): """ :param df: Pandas dataFrame from parsing an NmrStar 3 File. :return: ChemicalShiftList Object Creates a V3 ChemicalShiftList and associated V3 objects from a dataFrame created parsing an NmrStar 3 File. The reader assumes the dataframe contains tags as specified in the bmrb dictionary. See reference. TAGs reference: https://bmrb.io/dictionary/tag.php?tagcat=Atom_chem_shift +------------------------------|-----------------------------------------+ + CCPN V3 Object/property | BMRB *.TAG (*prefix: _Atom_chem_shift) + +------------------------------|-----------------------------------------+ chemicalShiftList: name => Entry_ID nmrChain(s): name => Entity_assembly_ID NmrResidue(s): sequenceCode => seq_ID or Auth_seq_ID (if present) residueType => Comp_ID NmrAtom(s): name => Atom_ID isotopeCode => Atom_isotope_number + Atom_type ChemicalShift(s): value => Val valueError => Val_er comment => Details Others not stored. Maybe save in ChemicalShift internal? Assembly_atom_ID Assign_fig_of_merit Assigned_chem_shift_list_ID Ambiguity_code Ambiguity_set_ID Auth_asym_ID Occupancy_ID Resonance_ID # TODO: NmrAtom Name => use Auth_atom_ID and Ambiguity_code to create an NEF-compliant nomenclature """ ## check if the required columns are present. requiredColumns = [Entry_ID, Entity_assembly_ID, Seq_ID, Auth_seq_ID, Comp_ID, Val, Val_err, Details] if not set(requiredColumns).issubset(df.columns): missingColumns = [c for c in requiredColumns if not c in df.columns] getLogger().warn('Creating ChemicalShiftList failed. Missing some of the required Tags (%s) in the NmrStar file.' %', '.join(missingColumns)) return ## check if the Auth_seq_ID is present (not as '.'). Use the author value as preferred option. useAuthSeqID = True if not df[Auth_seq_ID].eq('.').all() else False nmrChains = set() with notificationEchoBlocking(): chemicalShiftListName = df[Entry_ID].astype(str, errors='ignore').unique()[-1] #Mandatory tag. Values always present # chemicalShiftListName = _incrementObjectName(project, ChemicalShiftList._pluralLinkName, chemicalShiftListName) chemicalShiftList = project.newChemicalShiftList(name=chemicalShiftListName) for index, row in df.iterrows(): nmrChain = project.fetchNmrChain(row[Entity_assembly_ID]) sequenceCode = row[Auth_seq_ID] if useAuthSeqID else row[Seq_ID] nmrResidue = nmrChain.fetchNmrResidue(residueType=row[Comp_ID], sequenceCode=sequenceCode) nmrAtom = nmrResidue.fetchNmrAtom(row[Atom_ID]) isotopeCode = row[Atom_isotope_number] + row[Atom_type] try: nmrAtom._setIsotopeCode(isotopeCode) except Exception as icError: getLogger().warn('Error setting the IsotopeCode: %s for the nmrAtom %s. %s.' %(isotopeCode, nmrAtom.pid, icError)) try: if _isfloat(row[Val]): val = float(row[Val]) if _isfloat(row[Val]) else None valueErr = float(row[Val_err]) if _isfloat(row[Val_err]) else None cs = chemicalShiftList.newChemicalShift(value=val, valueError=valueErr, nmrAtom=nmrAtom) cs.comment = ' '.join([cs.comment or '', 'Imported: '+row[Details]]) except Exception as e: getLogger().warn('Error creating a new ChemicalShift: %s' %e) nmrChains.add(nmrChain) # for nmrChain in nmrChains: # nmrChain._connectNmrResidues() return chemicalShiftList
def _simulatedSpectrumFromCSL(project, csl, axesCodesMap): """ key= NmrAtom name as appears in the CSL ; value = AxisCode name to Assign to the Spectrum AxisCode E.G. use NmrAtom Ca from CSL (imported from BMRB) and assign to a peak with axisCode named C :param csl: chemicalShiftList Object :param axesCodesMap: Ordered Dict containing Atoms of interest and parallel Spectrum Axes Codes :return: """ success = False if csl is None: getLogger().warn("Provide a Chemical Shift List") return success if axesCodesMap is None: getLogger().warn("Select NmrAtom to use and axis codes") return success with notificationEchoBlocking(): nmrAtomNames = list(axesCodesMap.keys()) isotopeCodes = [name2IsotopeCode(x) for x in list(axesCodesMap.values())] targetSpectrum = project.newEmptySpectrum(isotopeCodes=isotopeCodes, name=csl.name) targetSpectrum.chemicalShiftList = csl peakList = targetSpectrum.peakLists[-1] # filter by NmrAtom of interest nmrResiduesOD = od() for chemicalShift in csl.chemicalShifts: na = chemicalShift.nmrAtom if na.name in nmrAtomNames: try: nmrResiduesOD[na.nmrResidue].append([na, chemicalShift.value]) except KeyError: nmrResiduesOD[na.nmrResidue] = [(na, chemicalShift.value)] incompleteNmrResiduesAssignments = [] for nmrResidue in nmrResiduesOD: shifts = [] atoms = [] for v in nmrResiduesOD[nmrResidue]: atoms.append(v[0]) shifts.append(v[1]) targetSpectrumAxCodes = targetSpectrum.axisCodes axisCodes = [n.name for n in atoms] i = getAxisCodeMatchIndices(axisCodes,targetSpectrumAxCodes) # get the correct order. if len(i) != len(nmrAtomNames): #if not all the NmrAtoms are found for a specific CSL cannot make the peak, incomplete assignment/shifts incompleteNmrResiduesAssignments.append(nmrResidue.pid) continue i = np.array(i) shifts = np.array(shifts) atoms = np.array(atoms) try: # trap unknown issues peak = peakList.newPeak(list(shifts[i])) for na, sa in zip(atoms[i],targetSpectrumAxCodes) : peak.assignDimension(sa, [na]) except Exception as e: getLogger().warn('Error assigning NmrResidue %s , NmrAtoms: %s, Spectrum Axes: %s, Shifts: %s . Error: %s' %(nmrResidue,atoms,targetSpectrumAxCodes,shifts, e)) if len(incompleteNmrResiduesAssignments)>0: message = 'Could not fully assign peaks for NmrResidue(s): %s. ' \ 'Not enough nmrAtom information.' %', '.join(incompleteNmrResiduesAssignments) getLogger().warn(message) success = True return success def _importAndCreateV3Objs(project, file, acMap, simulateSpectra=False): getLogger().info('NmrStar file import started...') success = False lines = _getCSListLinesFromBmrbFile(file) df = makeCSLDataFrame(lines) with undoBlock(): csl = makeCSLfromDF(project, df) if csl is not None: success = True if simulateSpectra: success = _simulatedSpectrumFromCSL(project, csl, axesCodesMap=acMap) else: getLogger().warn('NmrStar file import aborted.') return success getLogger().info('NmrStar file import completed.') return success