Source code for ccpnmodel.ccpncore.lib.Io.Pdb

"""Pdb IO functions

# Licence, Reference and Credits

__copyright__ = "Copyright (C) CCPN project ( 2014 - 2017"
__credits__ = ("Wayne Boucher, Ed Brooksbank, Rasmus H Fogh, Luca Mureddu, Timothy J Ragan & Geerten W Vuister")
__licence__ = ("CCPN licence. See",
               "or ccpnmodel.ccpncore.memops.Credits.CcpnLicense for licence text")
__reference__ = ("For publications, please use reference from",
               "or ccpnmodel.ccpncore.memops.Credits.CcpNmrReference")

# Last code modification
__modifiedBy__ = "$modifiedBy: CCPN $"
__dateModified__ = "$dateModified: 2017-07-07 16:33:13 +0100 (Fri, July 07, 2017) $"
__version__ = "$Revision: 3.0.0 $"
# Created

__author__ = "$Author: CCPN $"
__date__ = "$Date: 2017-04-07 10:28:48 +0000 (Fri, April 07, 2017) $"
# Start of code

NaN = float('NaN')
from typing import List, Tuple
from ccpn.util import Common as commonUtil
from ccpnmodel.ccpncore.lib.Io import PyMMLibPDB as PdbLib

[docs]class PdbRecordProcessor(PdbLib.RecordProcessor): """Class for custom record processing"""
[docs] def process_ATOM(self, rec): """fix atom record - make only globally acceptable changes, special-case stuff is for later""" name = rec.get('name') # If no name it cannot be fixed. if name[0].isdigit(): # move leading digit to end of atom name rec['name'] = name[1:] + name[0] if not rec.get('chainID', '').strip(): # replace empty chainID with seqID if length is suitable seqId = rec.get('seqID', '').strip() if len(seqId) == 1: rec['chainID'] = seqId else: # May not be necessary, but just in case rec['chainID'] = ' '
[docs]def readPdbRecorsds(fil): """Read file or input stream, and return of PDBRecords, one per model Header records are given in the first list""" pdbFile = PdbLib.PDBFile() pdbFile.load_file(fil) recordProcessor = PdbLib.RecordProcessor() recordProcessor.process_pdb_records(pdbFile) # return pdbFile
[docs]def readModelRecords(fil) -> Tuple[List[PdbLib.PDBRecord], List[List[PdbLib.PDBRecord]]]: """Read file or input stream, and return list-of-lists-of PDBRecords, one per model All records are given in the first list, subsequent lists contain only ATOM records""" pdbFile = readPdbRecorsds(fil) model = [] header = [] data = [] for rec in pdbFile: if rec._name == 'ENDMDL': # put model into result and make a new one data.append(model) model = [] elif rec._name in('ATOM ', 'HETATM'): # Always append ATOM and HETATM records model.append(rec) elif not data: # For the first model only append all records, in case we want them later header.append(rec) # if model: # Special case: ENDMDL record missing # Only arrive here if we have had ATOM records since the last ENDMDL record (or beginning) data.append(model) # return header, data
[docs]def loadStructureEnsemble(molSystem:"MolSystem", fil) -> "StructureEnsemble": """Load PDB file into new structure ensemble matching MolSystem NB MolSystem is a required parameter for the data model, but there is no requirement that the data match """ from ccpn.util.isotopes import name2ElementSymbol # TBD further data extraction, use of header, match chains to existing one, make new chains?? ... header, data = readModelRecords(fil) if data: atomCount = len(data[0]) modelCount = len(data) if any(x for x in data[1:] if len(x) != atomCount): raise ValueError("Multiple models have different atom counts in PDB file %s - loading aborted") # NBNB TBD check that names match in different models memopsRoot = molSystem.root ll = [x.ensembleId for x in molSystem.structureEnsembles] nextId = max(ll) + 1 if ll else 1 apiEnsemble = memopsRoot.newStructureEnsemble(molSystem=molSystem, ensembleId=nextId) for rec in data[0]: chain = (apiEnsemble.findFirstCoordChain(code=rec.get('chainID')) or apiEnsemble.newChain(code=rec.get('chainID'))) residue = (chain.findFirstResidue(seqCode=rec.get('resSeq'), seqInsertCode=rec.get('iCode', ' ')) or chain.newResidue(seqCode=rec.get('resSeq'), seqInsertCode=rec.get('iCode', ' '), code3Letter=rec.get('resName'))) # NBNB Heuristic. We need an elementName elementName = (rec.get('element') or name2ElementSymbol(rec.get('name')) or 'Unknown') # NBNB wil likely break with altLocated atoms. Meanwhile do it right residue.newAtom(name=rec.get('name'), altLocationCode=rec.get('altLoc', ' '), elementName=elementName.title()) # Gather data # NBNB TBD atomNameData need doing for modelData in data: apiModel = apiEnsemble.newModel() coordinates = [] addCoordinate = coordinates.append occupancies = [] addOccupancy = occupancies.append bFactors = [] addBFactor = bFactors.append # NBNB TBD Add atomNames array for rec in modelData: addCoordinate(rec.get('x', NaN)) addCoordinate(rec.get('y', NaN)) addCoordinate(rec.get('z', NaN)) addCoordinate(rec.get('occupancy', NaN)) addCoordinate(rec.get('tempFactor', NaN)) apiModel.setSubmatrixData('coordinates', coordinates) apiModel.setSubmatrixData('occupancies', occupancies) apiModel.setSubmatrixData('bFactors', bFactors) # Set data # apiEnsemble.findFirstDataMatrix(name='coordinates', shape=(modelCount, atomCount,3), data=coordinates) # apiEnsemble.newDataMatrix(name='occupancies', shape=(modelCount, atomCount), data=occupancies) # apiEnsemble.newDataMatrix(name='bFactors', shape=(modelCount, atomCount), data=bFactors) # return apiEnsemble else: raise RuntimeError("no ATOM data in PDB file %s - loading aborted" % fil)