# Source code for ccpn.util.nef.StarTokeniser

# encoding: utf-8

"""

STAR file tokenizer

# Copyright © 2011, 2013 Global Phasing Ltd. All rights reserved.
#
# Author: Peter Keller
#
# This file forms part of the GPhL StarTools library.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#  Redistributions of source code must retain the above copyright
#  notice, this list of conditions and the following disclaimer.
#
#  Redistributions in binary form must reproduce the above copyright
#  notice, this list of conditions and the following disclaimer in the
#  documentation and/or other materials provided with the
#  distribution.
#
#  If the regular expression used to match STAR/CIF data in the
#  redistribution is not identical to that in the original version,
#  this fact must be stated wherever the copyright notice is
#  reproduced.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# OF THE POSSIBILITY OF SUCH DAMAGE.

'''
Created on 25 Nov 2013

@author: pkeller
'''

#
# Modified by Rasmus Fogh, CCPN project, 5/2/2016
#

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals


#=========================================================================================
# Licence, Reference and Credits
#=========================================================================================
__copyright__ = "Copyright (C) CCPN project (http://www.ccpn.ac.uk) 2014 - 2019"
__credits__ = ("Ed Brooksbank, Luca Mureddu, Timothy J Ragan & Geerten W Vuister")
__licence__ = ("CCPN licence. See http://www.ccpn.ac.uk/v3-software/downloads/license")
__reference__ = ("Skinner, S.P., Fogh, R.H., Boucher, W., Ragan, T.J., Mureddu, L.G., & Vuister, G.W.",
                 "CcpNmr AnalysisAssign: a flexible platform for integrated NMR analysis",
                 "J.Biomol.Nmr (2016), 66, 111-124, http://doi.org/10.1007/s10858-016-0060-y")
#=========================================================================================
# Last code modification
#=========================================================================================
__modifiedBy__ = "$modifiedBy: CCPN $"
__dateModified__ = "$dateModified: 2017-07-07 16:33:02 +0100 (Fri, July 07, 2017) $"
__version__ = "$Revision: 3.0.0 $"
#=========================================================================================
# Created
#=========================================================================================
__author__ = "$Author: CCPN $"
__date__ = "$Date: 2017-04-07 10:28:41 +0000 (Fri, April 07, 2017) $"
#=========================================================================================
# Start of code
#=========================================================================================

import re
import collections


# STAR parsing REGEX, following International Tables for Crystallography volume G section 2.1
#
# The pattern is a single alternation written in verbose mode. The inline flags are:
#   x - verbose: unescaped whitespace and '#' comments inside the pattern are ignored
#   m - multiline: '^' and '$' match at the start/end of every line
#   i - case-insensitive matching
# Every alternative contains exactly one capturing group (17 in total), so the index of
# the group that matched identifies the kind of token. The TOKEN_* constants defined
# after the compiled pattern mirror that numbering and must be kept in step with it.
#
# NB: the licence text in the module docstring requires that, if this regular expression
# is not identical to the original version, the fact is stated wherever the copyright
# notice is reproduced. Do not edit the pattern casually.
_REGEX = r"""(?xmi) # $Revision$  # No 'u' flag for perl 5.8.8/RHEL5 compatibility
^;([\S\s]*?)(?:\r\n|\s)^;(?:(?=\s)|$)  # Multi-line string
|(?:^|(?<=\s))(\#.*?)\r?$              # Comment
|(?:^|(?<=\s))(?:
  (global_)                            # STAR global block
  |(save_\S*)                          # STAR save frame header or terminator
  |(\$\S+)                             # STAR save frame reference
  |(stop_)                             # STAR nested loop terminator
  |(data_\S+)                          # Data block header
  |(loop_)                             # Loop header
  |((?:global_\S+)|(?:stop_\S+)|(?:data_)|(?:loop_\S+))  # Invalid privileged construct
  |(_\S+)                              # Data name
  |'(.*?)'                             # Single-quoted string
  |"(.*?)"                             # Double-quoted string
  |(\.)                                # CIF null
  |(\?)                                # CIF unknown/missing
  |([\[\]]\S*)                         # Square bracketed constructs (reserved)
  |((?:[^'";_$\s]|(?<!^);)\S*)         # Non-quoted string
  |(\S+)                               # Catch-all bad token
)
(?:(?=\s)|$)"""

# Compiled form of _REGEX, built once at import time and shared by every tokeniser call.
# re.UNICODE is passed explicitly in addition to the inline (?xmi) flags.
_star_pattern = re.compile(_REGEX, re.UNICODE)

# Token types.
# NB: every value is the 1-based index of the corresponding capture group in _REGEX
# and is used directly as the token type - the order below must stay synced to the regex!
(
    TOKEN_MULTILINE,       # 1  - multi-line (semicolon-delimited) string
    TOKEN_COMMENT,         # 2  - comment
    TOKEN_GLOBAL,          # 3  - STAR global block
    TOKEN_SAVE_FRAME,      # 4  - STAR save frame header or terminator
    TOKEN_SAVE_FRAME_REF,  # 5  - STAR save frame reference
    TOKEN_LOOP_STOP,       # 6  - STAR nested loop terminator
    TOKEN_DATA_BLOCK,      # 7  - data block header
    TOKEN_LOOP,            # 8  - loop header
    TOKEN_BAD_CONSTRUCT,   # 9  - invalid privileged construct
    TOKEN_DATA_NAME,       # 10 - data name
    TOKEN_SQUOTE_STRING,   # 11 - single-quoted string
    TOKEN_DQUOTE_STRING,   # 12 - double-quoted string
    TOKEN_NULL,            # 13 - CIF null
    TOKEN_UNKNOWN,         # 14 - CIF unknown/missing
    TOKEN_SQUARE_BRACKET,  # 15 - square bracketed construct (reserved)
    TOKEN_STRING,          # 16 - non-quoted string
    TOKEN_BAD_TOKEN,       # 17 - catch-all bad token
) = range(1, 18)

# Rasmus Fogh, CCPN project 5/2/2016
# Modified the tokeniser to:
# - use namedtuples instead of custom objects
# - use a string input instead of a memory map (which gave string/byte conflict errors)
# - wrap the regex iterator without a wrapping class.

#
# Lightweight record for one token: 'type' holds the token type number and
# 'value' holds the matched text.
StarToken = collections.namedtuple('StarToken', ['type', 'value'])


# NB: no custom StarToken.__doc__ is assigned here; that assignment
# ("StarToken named tuple (with fields 'type', 'value') returned by the STAR token
# iterator") was removed for Python 2 compatibility.

def getTokenIterator(text):
    """Return an iterator over all STAR tokens in a generic STAR file.

    :param text: contents of the STAR file, as a string.
    :return: generator yielding one StarToken(type, value) per regex match, in the
        order the matches are found. ``type`` is the index of the capture group that
        matched (the TOKEN_* numbering) and ``value`` is the text captured by that
        group.
    """
    # Each alternative of the pattern has exactly one capture group, so lastindex both
    # identifies the token type and selects the captured text.
    # A generator expression (rather than a generator function) is used so that
    # finditer() is called immediately, and an unsuitable argument fails at call time.
    return (StarToken(match.lastindex, match.group(match.lastindex))
            for match in _star_pattern.finditer(text))