# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; encoding: utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
# MDAnalysis --- http://mdanalysis.googlecode.com
# Copyright (c) 2006-2011 Naveen Michaud-Agrawal,
# Elizabeth J. Denning, Oliver Beckstein,
# and contributors (see website for details)
# Released under the GNU Public Licence, v2 or any higher version
#
# Please cite your use of MDAnalysis in published work:
#
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and
# O. Beckstein. MDAnalysis: A Toolkit for the Analysis of
# Molecular Dynamics Simulations. J. Comput. Chem. 32 (2011), 2319--2327,
# doi:10.1002/jcc.21787
#
"""
Helper functions --- :mod:`MDAnalysis.core.util`
====================================================
Small helper functions that don't fit anywhere else.
Files and directories
---------------------
.. autofunction:: filename
.. function:: openany(directory[,mode='r'])
Context manager to open a compressed (bzip2, gzip) or plain file
(uses :func:`anyopen`).
.. autofunction:: anyopen
.. autofunction:: greedy_splitext
.. autofunction:: which
.. autofunction:: realpath
Containers and lists
--------------------
.. autofunction:: iterable
.. autofunction:: asiterable
File parsing
------------
.. autoclass:: FORTRANReader
:members:
.. autodata:: FORTRAN_format_regex
Data manipulation and handling
------------------------------
.. autofunction:: fixedwidth_bins
Strings
-------
.. autofunction:: convert_aa_code
.. autofunction:: parse_residue
.. autofunction:: conv_float
Mathematics and Geometry
------------------------
.. autofunction:: normal
.. autofunction:: norm
.. autofunction:: angle
.. autofunction:: dihedral
.. autofunction:: stp
"""
__docformat__ = "restructuredtext en"
import os.path
from contextlib import contextmanager
import bz2, gzip
import re
import numpy
[docs]def filename(name,ext=None,keep=False):
"""Return a new name that has suffix attached; replaces other extensions.
:Arguments:
*name*
filename; extension is replaced unless keep=True
*ext*
extension
*keep*
``False``: replace existing extension; ``True``: keep if exists
"""
name = str(name)
if ext is None:
return name
if not ext.startswith(os.path.extsep):
ext = os.path.extsep + ext
#if name.find(ext) > 0: # normally >= 0 but if we start with '.' then we keep it
# return name
root, origext = os.path.splitext(name)
if len(origext) == 0 or not keep:
return root + ext
return name
@contextmanager
[docs]def openany(datasource, mode='r'):
"""Open the datasource and close it when the context exits."""
stream, filename = anyopen(datasource, mode=mode)
try:
yield stream
finally:
stream.close()
[docs]def anyopen(datasource, mode='r'):
"""Open datasource (gzipped, bzipped, uncompressed) and return a stream.
:Arguments:
*datasource*
a file or a stream
*mode*
'r' or 'w'
"""
# TODO: - make this act as ContextManager (and not return filename)
# - need to add binary 'b' to mode for compressed files?
handlers = {'bz2': bz2.BZ2File, 'gz': gzip.open, '': file}
if mode.startswith('r'):
if hasattr(datasource,'next') or hasattr(datasource,'readline'):
stream = datasource
filename = '(%s)' % stream.name # maybe that does not always work?
else:
stream = None
filename = datasource
for ext in ('bz2', 'gz', ''): # file == '' should be last
openfunc = handlers[ext]
stream = _get_stream(datasource, openfunc, mode=mode)
if not stream is None:
break
if stream is None:
raise IOError("Cannot open %(filename)r in mode=%(mode)r." % vars())
elif mode.startswith('w'):
if hasattr(datasource, 'write'):
stream = datasource
filename = '(%s)' % stream.name # maybe that does not always work?
else:
stream = None
filename = datasource
name, ext = os.path.splitext(filename)
if ext.startswith('.'):
ext = ext[1:]
if not ext in ('bz2', 'gz'):
ext = '' # anything else but bz2 or gz is just a normal file
openfunc = handlers[ext]
stream = openfunc(datasource, mode=mode)
if stream is None:
raise IOError("Cannot open %(filename)r in mode=%(mode)r with type %(ext)r." % vars())
else:
raise NotImplementedError("Sorry, mode=%(mode)r is not implemented for %(datasource)r" % vars())
return stream, filename
def _get_stream(filename, openfunction=file, mode='r'):
try:
stream = openfunction(filename, mode=mode)
except IOError:
return None
try:
stream.readline()
stream.close()
stream = openfunction(filename,'r')
except IOError:
stream.close()
stream = None
return stream
[docs]def greedy_splitext(p):
"""Split extension in path *p* at the left-most separator."""
path, root = os.path.split(p)
extension = ''
while True:
root, ext = os.path.splitext(root)
extension = ext + extension
if not ext:
break
return root, extension
[docs]def which(program):
"""Determine full path of executable *program* on :envvar:`PATH`.
(Jay at http://stackoverflow.com/questions/377017/test-if-executable-exists-in-python)
"""
def is_exe(fpath):
return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
fpath, fname = os.path.split(program)
if fpath:
real_program = realpath(program)
if is_exe(real_program):
return real_program
else:
for path in os.environ["PATH"].split(os.pathsep):
exe_file = os.path.join(path, program)
if is_exe(exe_file):
return exe_file
return None
[docs]def realpath(*args):
"""Join all args and return the real path, rooted at /.
Expands '~', '~user', and environment variables such as :envvar`$HOME`.
Returns ``None`` if any of the args is ``None``.
"""
if None in args:
return None
return os.path.realpath(os.path.expanduser(os.path.expandvars(os.path.join(*args))))
[docs]def iterable(obj):
"""Returns ``True`` if *obj* can be iterated over and is *not* a string."""
if isinstance(obj, basestring):
return False # avoid iterating over characters of a string
if hasattr(obj, 'next'):
return True # any iterator will do
try:
len(obj) # anything else that might work
except TypeError:
return False
return True
[docs]def asiterable(obj):
"""Returns obj so that it can be iterated over; a string is *not* treated as iterable"""
if not iterable(obj):
obj = [obj]
return obj
#: Regular expresssion (see :mod:`re`) to parse a simple `FORTRAN edit descriptor`_.
#: ``(?P<repeat>\d?)(?P<format>[IFELAX])(?P<numfmt>(?P<length>\d+)(\.(?P<decimals>\d+))?)?``
#:
#: .. _FORTRAN edit descriptor: http://www.cs.mtu.edu/~shene/COURSES/cs201/NOTES/chap05/format.html
FORTRAN_format_regex = "(?P<repeat>\d+?)(?P<format>[IFEAX])(?P<numfmt>(?P<length>\d+)(\.(?P<decimals>\d+))?)?"
_FORTRAN_format_pattern = re.compile(FORTRAN_format_regex)
def strip(s):
"""Convert *s* to a string and return it white-space stripped."""
return str(s).strip()
class FixedcolumnEntry(object):
"""Represent an entry at specific fixed columns.
Reads from line[start:stop] and converts according to
typespecifier.
"""
convertors = {'I': int, 'F': float, 'E': float, 'A': strip}
def __init__(self, start, stop, typespecifier):
"""
:Arguments:
*start*
first column
*stop*
last column + 1
*typespecifier*
'I': int, 'F': float, 'E': float, 'A': stripped string
The start/stop arguments follow standard Python convention in that
they are 0-based and that the *stop* argument is not included.
"""
self.start = start
self.stop = stop
self.typespecifier = typespecifier
self.convertor = self.convertors[typespecifier]
def read(self, line):
"""Read the entry from *line* and convert to appropriate type."""
try:
return self.convertor(line[self.start:self.stop])
except ValueError:
raise ValueError("%r: Failed to read&convert %r" % (self, line[self.start:self.stop]))
def __len__(self):
"""Length of the field in columns (stop - start)"""
return self.stop - self.start
def __repr__(self):
return "FixedcolumnEntry(%d,%d,%r)" % (self.start, self.stop, self.typespecifier)
[docs]class FORTRANReader(object):
"""FORTRANReader provides a method to parse FORTRAN formatted lines in a file.
Usage::
atomformat = FORTRANReader('2I10,2X,A8,2X,A8,3F20.10,2X,A8,2X,A8,F20.10')
for line in open('coordinates.crd'):
serial,TotRes,resName,name,x,y,z,chainID,resSeq,tempFactor = atomformat.read(line)
Fortran format edit descriptors; see `Fortran Formats`_ for the syntax.
Only simple one-character specifiers supported here: *I F E A X* (see
:data:`FORTRAN_format_regex`).
Strings are stripped of leading and trailing white space.
.. _`Fortran Formats`: http://www.webcitation.org/5xbaWMV2x
.. _`Fortran Formats (URL)`:
http://www.cs.mtu.edu/~shene/COURSES/cs201/NOTES/chap05/format.html
"""
def __init__(self, fmt):
"""Set up the reader with the FORTRAN format string.
The string *fmt* should look like '2I10,2X,A8,2X,A8,3F20.10,2X,A8,2X,A8,F20.10'.
"""
self.fmt = fmt.split(',')
descriptors = [self.parse_FORTRAN_format(descriptor) for descriptor in self.fmt]
start = 0
self.entries = []
for d in descriptors:
if d['format'] != 'X':
for x in range(d['repeat']):
stop = start + d['length']
self.entries.append(FixedcolumnEntry(start,stop,d['format']))
start = stop
else:
start += d['totallength']
[docs] def read(self, line):
"""Parse *line* according to the format string and return list of values.
Values are converted to Python types according to the format specifier.
:Returns: list of entries with appropriate types
:Raises: :exc:`ValueError` if any of the conversions cannot be made
(e.g. space for an int)
.. SeeAlso:: :meth:`FORTRANReader.number_of_matches`
"""
return [e.read(line) for e in self.entries]
[docs] def number_of_matches(self, line):
"""Return how many format entries could be populated with legal values."""
# not optimal, I suppose...
matches = 0
for e in self.entries:
try:
e.read(line)
matches += 1
except ValueError:
pass
return matches
def __len__(self):
"""Returns number of entries."""
return len(self.entries)
def __repr__(self):
return self.__class__.__name__+"("+",".join(self.fmt)+")"
[docs]def fixedwidth_bins(delta,xmin,xmax):
"""Return bins of width delta that cover xmin,xmax (or a larger range).
dict = fixedwidth_bins(delta,xmin,xmax)
The dict contains 'Nbins', 'delta', 'min', and 'max'.
"""
if not numpy.all(xmin < xmax):
raise ValueError('Boundaries are not sane: should be xmin < xmax.')
_delta = numpy.asarray(delta,dtype=numpy.float_)
_xmin = numpy.asarray(xmin,dtype=numpy.float_)
_xmax = numpy.asarray(xmax,dtype=numpy.float_)
_length = _xmax - _xmin
N = numpy.ceil(_length/_delta).astype(numpy.int_) # number of bins
dx = 0.5 * (N*_delta - _length) # add half of the excess to each end
return {'Nbins':N, 'delta':_delta,'min':_xmin-dx, 'max':_xmax+dx}
# geometric functions
[docs]def norm(v):
r"""Returns the length of a vector, ``sqrt(v.v)``.
.. math::
v = \sqrt{\mathbf{v}\cdot\mathbf{v}}
Faster than :func:`numpy.linalg.norm` because no frills.
"""
return numpy.sqrt(numpy.dot(v,v))
[docs]def normal(vec1, vec2):
r"""Returns the unit vector normal to two vectors.
.. math::
\hat{\mathbf{n}} = \frac{\mathbf{v}_1 \times \mathbf{v}_2}{|\mathbf{v}_1 \times \mathbf{v}_2|}
If the two vectors are collinear, the vector :math:`\mathbf{0}` is returned.
"""
normal = numpy.cross(vec1, vec2)
n = norm(normal)
if n == 0.0:
return normal # returns [0,0,0] instead of [nan,nan,nan]
return normal/n # ... could also use numpy.nan_to_num(normal/norm(normal))
[docs]def angle(a, b):
"""Returns the angle between two vectors in radians"""
x = numpy.dot(a, b) / (norm(a)*norm(b))
# catch roundoffs that lead to nan otherwise
if x > 1.0:
return 0.0
elif x < -1.0:
return -numpy.pi
return numpy.arccos(x)
[docs]def stp(vec1, vec2, vec3):
r"""Takes the scalar triple product of three vectors.
Returns the volume *V* of the parallel epiped spanned by the three
vectors
.. math::
V = \mathbf{v}_3 \cdot (\mathbf{v}_1 \times \mathbf{v}_2)
"""
return numpy.dot(vec3, numpy.cross(vec1, vec2))
[docs]def dihedral(ab, bc, cd):
r"""Returns the dihedral angle in radians between vectors connecting A,B,C,D.
The dihedral measures the rotation around bc::
ab
A---->B
\ bc
_\'
C---->D
cd
The dihedral angle is restricted to the range -π <= x <= π.
.. versionadded:: 0.8
"""
x = angle(normal(ab, bc), normal(bc, cd))
return (x if stp(ab, bc, cd) <= 0.0 else -x)
# String functions
# ----------------
#: translation table for 3-letter codes --> 1-letter codes
#: .. SeeAlso:: :data:`alternative_inverse_aa_codes`
canonical_inverse_aa_codes = {'ALA':'A', 'CYS':'C', 'ASP':'D', 'GLU':'E',
'PHE':'F', 'GLY':'G', 'HIS':'H', 'ILE':'I',
'LYS':'K', 'LEU':'L', 'MET':'M', 'ASN':'N',
'PRO':'P', 'GLN':'Q', 'ARG':'R', 'SER':'S',
'THR':'T', 'VAL':'V', 'TRP':'W', 'TYR':'Y'}
#: translation table for 1-letter codes --> *canonical* 3-letter codes.
#: The table is used for :func:`convert_aa_code`.
amino_acid_codes = dict([(one, three) for three,one in canonical_inverse_aa_codes.items()])
#: non-default charge state amino acids or special charge state descriptions
#: (Not fully synchronized with :class:`MDAnalysis.core.Selection.ProteinSelection`.)
alternative_inverse_aa_codes = {'HISA':'H', 'HISB':'H', 'HSE':'H', 'HSD':'H', 'HID':'H', 'HIE':'H', 'HIS1':'H', 'HIS2':'H',
'ASPH': 'D', 'ASH': 'D',
'GLUH': 'E', 'GLH': 'E',
'LYSH': 'K', 'LYN': 'K',
'ARGN': 'R',
'CYSH':'C', 'CYS1':'C', 'CYS2':'C'}
#: lookup table from 3/4 letter resnames to 1-letter codes. Note that non-standard residue names
#: for tautomers or different protonation states such as HSE are converted to canonical 1-letter codes ("H").
#: The table is used for :func:`convert_aa_code`.
#: .. SeeAlso:: :data:`canonical_inverse_aa_codes` and :data:`alternative_inverse_aa_codes`
inverse_aa_codes = {}
inverse_aa_codes.update(canonical_inverse_aa_codes)
inverse_aa_codes.update(alternative_inverse_aa_codes)
[docs]def convert_aa_code(x):
"""Converts between 3-letter and 1-letter amino acid codes.
.. SeeAlso:: Data are defined in :data:`amino_acid_codes` and :data:`inverse_aa_codes`.
"""
if len(x) == 1:
return amino_acid_codes[x.upper()]
elif len(x) > 1:
return inverse_aa_codes[x.upper()]
else:
raise ValueError("No conversion for {} found (1 letter -> 3 letter or 3/4 letter -> 1 letter)".format(x))
#: Regular expression to match and parse a residue-atom selection; will match
#: "LYS300:HZ1" or "K300:HZ1" or "K300" or "4GB300:H6O" or "4GB300" or "YaA300".
RESIDUE = re.compile("""
(?P<aa>([ACDEFGHIKLMNPQRSTVWY]) # 1-letter amino acid
| # or
([0-9A-Z][a-zA-Z][A-Z][A-Z]?) # 3-letter or 4-letter residue name
)
\s* # white space allowed
(?P<resid>\d+) # resid
\s*
(: # separator ':'
\s*
(?P<atom>\w+) # atom name
)? # possibly one
""", re.VERBOSE | re.IGNORECASE)
# from GromacsWrapper cbook.IndexBuilder
[docs]def parse_residue(residue):
"""Process residue string.
Examples:
- "LYS300:HZ1" --> ("LYS", 300, "HZ1")
- "K300:HZ1" --> ("LYS", 300, "HZ1")
- "K300" --> ("LYS", 300, None)
- "4GB300:H6O" --> ("4GB", 300, "H6O")
- "4GB300" --> ("4GB", 300, None)
:Argument: The *residue* must contain a 1-letter or 3-letter or
4-letter residue string, a number (the resid) and
optionally an atom identifier, which must be separate
from the residue with a colon (":"). White space is
allowed in between.
:Returns: `(3-letter aa string, resid, atomname)`; known 1-letter
aa codes are converted to 3-letter codes
"""
# XXX: use _translate_residue() ....
m = RESIDUE.match(residue)
if not m:
raise ValueError("Selection %(residue)r is not valid (only 1/3/4 letter resnames, resid required)." % vars())
resid = int(m.group('resid'))
residue = m.group('aa')
if len(residue) == 1:
resname = convert_aa_code(residue) # only works for AA
else:
resname = residue # use 3-letter for any resname
atomname = m.group('atom')
return (resname, resid, atomname)
[docs]def conv_float(s):
"""Convert an object *s* to float if possible.
Function to be passed into :func:`map` or a list comprehension. If
the argument can be interpreted as a float it is converted,
otherwise the original object is passed back.
"""
try:
return float(s)
except ValueError:
return s