diff --git a/package/MDAnalysis/coordinates/MMCIF.py b/package/MDAnalysis/coordinates/MMCIF.py
index 2ebf752fd9..a97b8ad1d3 100644
--- a/package/MDAnalysis/coordinates/MMCIF.py
+++ b/package/MDAnalysis/coordinates/MMCIF.py
@@ -74,6 +74,7 @@
 import numpy as np
 
 from . import base
+from ..lib import util
 
 try:
     import gemmi
@@ -119,7 +120,7 @@ class MMCIFReader(base.SingleFrameReaderBase):
     units = {"time": None, "length": "Angstrom"}
 
     def _read_first_frame(self):
-        structure = gemmi.read_structure(self.filename)
+        structure = self._get_structure()
         cell_dims = np.array(
             [
                 getattr(structure.cell, name)
@@ -145,3 +146,49 @@ def _read_first_frame(self):
         else:
             self.ts.dimensions = cell_dims
         self.ts.frame = 0
+
+    def _get_structure(self):
+        """Return a ``gemmi.Structure`` built from :attr:`filename`.
+
+        The gemmi C++ ``read_structure`` accepts in-memory buffers, strings
+        and file paths alike:
+        https://github.com/project-gemmi/gemmi/blob/4416e298f204b7b57bf5b3051d7efd4fe02957cf/include/gemmi/mmread.hpp#L86
+        The Python API (as of gemmi 0.7.3) has no single equivalent, so all
+        supported inputs (path strings, ``pathlib.Path`` objects and open
+        streams such as file objects or ``StringIO``) are first read into a
+        string via :func:`~MDAnalysis.lib.util.openany`. Reading everything
+        into memory is less efficient than handing gemmi a path, but keeps a
+        single code path for every input type.
+
+        The content is parsed as mmCIF first
+        (https://gemmi.readthedocs.io/en/latest/cif.html#reading) and, if
+        that fails, as PDB. If the gemmi Python API gains a
+        ``read_structure`` that accepts these inputs, this method can be
+        simplified or removed.
+
+        Returns
+        -------
+        gemmi.Structure
+
+        Raises
+        ------
+        ValueError
+            If the content can be parsed neither as mmCIF nor as PDB; the
+            error from the mmCIF attempt is the one raised.
+        """
+        with util.openany(self.filename) as f:
+            content = f.read()
+
+        try:
+            # String -> Document -> Block -> Structure; the Structure is made
+            # from the first Block, as gemmi does internally:
+            # https://github.com/project-gemmi/gemmi/blob/4416e298f204b7b57bf5b3051d7efd4fe02957cf/include/gemmi/mmcif.hpp#L32
+            return gemmi.make_structure_from_block(
+                gemmi.cif.read_string(content)[0]
+            )
+        except ValueError as cif_error:
+            try:
+                return gemmi.read_pdb_string(content)
+            except ValueError:
+                # Both parsers rejected the input; surface the mmCIF error.
+                raise cif_error
diff --git a/package/MDAnalysis/topology/MMCIFParser.py b/package/MDAnalysis/topology/MMCIFParser.py
index ca19027cb2..547d44fe1e 100644
--- a/package/MDAnalysis/topology/MMCIFParser.py
+++ b/package/MDAnalysis/topology/MMCIFParser.py
@@ -70,6 +70,7 @@
     Tempfactors,
 )
 from .base import TopologyReaderBase, change_squash
+from ..lib import util
 
 
 class MMCIFParser(TopologyReaderBase):
@@ -108,7 +109,7 @@ def parse(self, **kwargs) -> Topology:
         -------
         MDAnalysis Topology object
         """
-        structure = gemmi.read_structure(self.filename)
+        structure = self._get_structure()
 
         if len(structure) > 1:
             warnings.warn(
@@ -224,3 +225,49 @@ def parse(self, **kwargs) -> Topology:
             atom_resindex=residx,
             residue_segindex=segidx,
         )
+
+    def _get_structure(self):
+        """Return a ``gemmi.Structure`` built from :attr:`filename`.
+
+        The gemmi C++ ``read_structure`` accepts in-memory buffers, strings
+        and file paths alike:
+        https://github.com/project-gemmi/gemmi/blob/4416e298f204b7b57bf5b3051d7efd4fe02957cf/include/gemmi/mmread.hpp#L86
+        The Python API (as of gemmi 0.7.3) has no single equivalent, so all
+        supported inputs (path strings, ``pathlib.Path`` objects and open
+        streams such as file objects or ``StringIO``) are first read into a
+        string via :func:`~MDAnalysis.lib.util.openany`. Reading everything
+        into memory is less efficient than handing gemmi a path, but keeps a
+        single code path for every input type.
+
+        The content is parsed as mmCIF first
+        (https://gemmi.readthedocs.io/en/latest/cif.html#reading) and, if
+        that fails, as PDB. If the gemmi Python API gains a
+        ``read_structure`` that accepts these inputs, this method can be
+        simplified or removed.
+
+        Returns
+        -------
+        gemmi.Structure
+
+        Raises
+        ------
+        ValueError
+            If the content can be parsed neither as mmCIF nor as PDB; the
+            error from the mmCIF attempt is the one raised.
+        """
+        with util.openany(self.filename) as f:
+            content = f.read()
+
+        try:
+            # String -> Document -> Block -> Structure; the Structure is made
+            # from the first Block, as gemmi does internally:
+            # https://github.com/project-gemmi/gemmi/blob/4416e298f204b7b57bf5b3051d7efd4fe02957cf/include/gemmi/mmcif.hpp#L32
+            return gemmi.make_structure_from_block(
+                gemmi.cif.read_string(content)[0]
+            )
+        except ValueError as cif_error:
+            try:
+                return gemmi.read_pdb_string(content)
+            except ValueError:
+                # Both parsers rejected the input; surface the mmCIF error.
+                raise cif_error
diff --git a/testsuite/MDAnalysisTests/topology/test_mmcif.py b/testsuite/MDAnalysisTests/topology/test_mmcif.py
index 453df4b0ef..72fa24cb98 100644
--- a/testsuite/MDAnalysisTests/topology/test_mmcif.py
+++ b/testsuite/MDAnalysisTests/topology/test_mmcif.py
@@ -1,5 +1,9 @@
 import MDAnalysis as mda
 import pytest
+from pathlib import Path
+from io import StringIO
+import gzip
+from MDAnalysis.lib import util
 
 from MDAnalysis.coordinates.MMCIF import HAS_GEMMI
 from MDAnalysisTests.datafiles import MMCIF as MMCIF_FOLDER
@@ -105,3 +109,42 @@ def test_multimodel_warning_msg():
         mda.topology.MMCIFParser.MMCIFParser(
             f"{MMCIF_FOLDER}/multimodel_warning.cif"
         ).parse()
+
+
+CIF_GZ = f"{MMCIF_FOLDER}/1BD2_short.cif.gz"
+PDB_GZ = f"{MMCIF_FOLDER}/1BD2_short.pdb.gz"
+
+
+def _read_text(path):
+    """Return the text content of *path* and close the file afterwards."""
+    with util.openany(path) as f:
+        return f.read()
+
+
+@pytest.mark.skipif(not HAS_GEMMI, reason="gemmi not installed")
+@pytest.mark.parametrize(
+    "filename,fmt",
+    [
+        # mmCIF content
+        (CIF_GZ, None),
+        (Path(CIF_GZ), None),
+        (StringIO(_read_text(CIF_GZ)), "CIF"),
+        (gzip.open(CIF_GZ), "CIF"),
+        (
+            util.NamedStream(StringIO(_read_text(CIF_GZ)), "some_name.cif"),
+            "CIF",
+        ),
+        # PDB content; the stream inputs are explicitly read as "CIF"
+        (PDB_GZ, None),
+        (Path(PDB_GZ), None),
+        (StringIO(_read_text(PDB_GZ)), "CIF"),
+        (util.anyopen(PDB_GZ), "CIF"),
+        (
+            util.NamedStream(StringIO(_read_text(PDB_GZ)), "some_name.pdb"),
+            "CIF",
+        ),
+    ],
+)
+def test_input_methods(filename, fmt):
+    """Smoke test: every supported input type builds a Universe."""
+    mda.Universe(filename, topology_format=fmt)