Source code for gromacs_extra.ndx2resttop

#!/usr/bin/env python3

"""Module containing the Ndx2resttop class and the command line interface."""
import fnmatch
from typing import Optional
from typing import Any
from pathlib import Path
from biobb_common.generic.biobb_object import BiobbObject
from biobb_common.tools import file_utils as fu
from biobb_common.tools.file_utils import launchlogger



[docs]
class Ndx2resttop(BiobbObject):
    """
    | biobb_gromacs Ndx2resttop
    | Generate a restrained topology from an index NDX file.
    | This module automatizes the process of restrained topology generation starting from an index NDX file.

    Args:
        input_ndx_path (str): Path to the input NDX index file. File type: input. `Sample file <https://github.com/bioexcel/biobb_gromacs/raw/master/biobb_gromacs/test/data/gromacs_extra/ndx2resttop.ndx>`_. Accepted formats: ndx (edam:format_2033).
        input_top_zip_path (str): Path the input TOP topology in zip format. File type: input. `Sample file <https://github.com/bioexcel/biobb_gromacs/raw/master/biobb_gromacs/test/data/gromacs_extra/ndx2resttop.zip>`_. Accepted formats: zip (edam:format_3987).
        output_top_zip_path (str): Path the output TOP topology in zip format. File type: output. `Sample file <https://github.com/bioexcel/biobb_gromacs/raw/master/biobb_gromacs/test/reference/gromacs_extra/ref_ndx2resttop.zip>`_. Accepted formats: zip (edam:format_3987).
        properties (dict - Python dictionary object containing the tool parameters, not input/output files):
            * **posres_names** (*str*) - ("CUSTOM_POSRES") Space-separated preprocessor macro names, one per triplet (e.g. ``"CHAIN_A_POSRES CHAIN_B_POSRES"``). Activated at grompp time via ``-D <name>``. Must match the length of ref_rest_mol_triplet_list.
            * **force_constants** (*str*) - ("500 500 500") Three space-separated force constants Fx Fy Fz in kJ/mol/nm². Written verbatim into each ITP line so the same value is applied to every restrained atom.
            * **ref_rest_mol_triplet_list** (*str*) - (None) Comma-separated triplets ``(reference_group, restrain_group, molecule_name)``, one per molecule to restrain.``reference_group``: NDX group containing *all* atoms of the molecule in global (system-wide) GROMACS numbering. Used only as a coordinate frame to convert global indices to molecule-local 1-based indices. Example: ``r_1-9280``. ``restrain_group``: NDX group with the subset of atoms to actually restrain (must be a strict subset of reference_group). Example: ``P_&_r_1-9280``. ``molecule_name``: the ``[ moleculetype ]`` name in the topology used to locate where to splice the ``#ifdef`` block. Example: ``1TSR``.

    Examples:
        This is a use example of how to use the building block from Python::

            from biobb_gromacs.gromacs_extra.ndx2resttop import ndx2resttop
            prop = { 'ref_rest_mol_triplet_list': '( Chain_A, Chain_A_noMut, Protein_chain_A ), ( Chain_B, Chain_B_noMut, Protein_chain_B ), ( Chain_C, Chain_C_noMut, Protein_chain_C ), ( Chain_D, Chain_D_noMut, Protein_chain_D )' }
            ndx2resttop(input_ndx_path='/path/to/myIndex.ndx',
                        input_top_zip_path='/path/to/myTopology.zip',
                        output_top_zip_path='/path/to/newTopology.zip',
                        properties=prop)

    Info:
        * wrapped_software:
            * name: In house
            * license: Apache-2.0
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl
    """

    def __init__(self, input_ndx_path: str, input_top_zip_path: str, output_top_zip_path: str,
                 properties: Optional[dict] = None, **kwargs) -> None:
        properties = properties or {}

        # Call parent class constructor
        super().__init__(properties)
        self.locals_var_dict = locals().copy()

        # Input/Output files
        self.io_dict = {
            "in": {"input_ndx_path": input_ndx_path, "input_top_zip_path": input_top_zip_path},
            "out": {"output_top_zip_path": output_top_zip_path}
        }

        # Properties specific for BB
        self.posres_names = properties.get('posres_names')
        self.force_constants = properties.get('force_constants', '500 500 500')
        self.ref_rest_mol_triplet_list = properties.get('ref_rest_mol_triplet_list')

        # Check the properties
        self.check_properties(properties)
        self.check_arguments()

    # --- Private helpers ---

    def _parse_ndx_groups(self, ndx_lines: list) -> dict:
        """Parse a GROMACS NDX file into a lookup table of group line ranges.

        NDX format: each named group starts with a ``[ group_name ]`` header line,
        followed by one or more rows of whitespace-separated global atom indices.

        Returns a dict mapping the raw header string (e.g. ``'[ System ]'``) to a
        two-element list ``[start_line, end_line]`` where the range is **half-open**
        (matching Python slice semantics): ``ndx_lines[start_line:end_line]`` yields
        the data rows for that group (the header line itself at ``start_line - 1`` is
        excluded).

        Edge cases handled:

        - **Last group**: the final group has no successor header to trigger the end
          assignment, so it is closed explicitly with ``index + 1`` (not ``index``)
          to include the very last data line.
        - **Single-content-line groups**: when a header is immediately followed by
          another header (no trailing blank line), the default range would be
          ``[H, H+1]`` giving an empty slice. The post-loop fix increments end by 1
          to capture that single data line.
        """
        groups_dic: dict[str, Any] = {}
        current_group = ''
        previous_group = ''

        for index, line in enumerate(ndx_lines):
            if line.startswith('['):
                current_group = line
                groups_dic[current_group] = [index, 0]
                if previous_group:
                    groups_dic[previous_group][1] = index
                previous_group = current_group

            if index == len(ndx_lines) - 1:
                groups_dic[current_group][1] = index + 1

        # Fix single-content-line groups
        for group_name in groups_dic:
            if groups_dic[group_name][0] + 1 == groups_dic[group_name][1]:
                groups_dic[group_name][1] += 1

        return groups_dic

    def _read_ndx_atoms(self, ndx_lines: list, groups_dic: dict, group_name: str) -> list:
        """Return the flat list of global atom indices for a named NDX group.

        Slices the group's data rows from ndx_lines using the half-open range
        stored in groups_dic, then flattens all whitespace-separated tokens into
        a single list of integers.

        Returned integers are **global** (system-wide) GROMACS atom numbers, 1-based.
        """
        key = f'[ {group_name} ]'
        start = groups_dic[key][0] + 1
        stop = groups_dic[key][1]
        fu.log(f'{group_name}: start_closed={start} stop_open={stop}', self.out_log, self.global_log)
        atoms = [int(atom) for line in ndx_lines[start:stop] for atom in line.split()]
        return atoms

    def _write_posre_itp(self, itp_path: str, selected_atoms: list) -> None:
        """Write a GROMACS position restraint ITP file.

        ``selected_atoms`` must contain **molecule-local** 1-based atom indices
        (not global system-wide indices). The atom type column is hardcoded to
        ``1`` (GROMACS position restraint type). ``self.force_constants`` is a
        verbatim string ``"Fx Fy Fz"`` applied uniformly to every restrained atom.
        """
        with open(itp_path, 'w') as f:
            fu.log(f'Creating {itp_path} with selected atoms and force constants', self.out_log, self.global_log)
            f.write('[ position_restraints ]\n')
            f.write('; atom  type      fx      fy      fz\n')
            for atom in selected_atoms:
                f.write(f'{atom}     1  {self.force_constants}\n')

    def _insert_posres_in_top(self, top_file: str, molecule_name: str, posre_name: str, itp_name: str) -> None:
        """Splice the #ifdef posres block into the .top file for single-chain topologies.

        Uses a two-phase scan over the topology lines:

        Phase 1 – locate the target moleculetype:
          For each ``[ moleculetype ]`` directive, read the first non-comment
          content line and compare its first token against ``molecule_name``.
          Set ``found_molecule = True`` when matched.

        Phase 2 – find the insertion point (only after found_molecule):
          Scan forward for the earliest of:
          - another top-level section (``[ moleculetype ]``, ``[ system ]``,
            ``[ molecules ]``)
          - an existing ``#include`` or ``#ifdef POSRES`` line
          Insert immediately before that line. Fall back to EOF if none found.

        Important: the insertion-point check (Phase 2) runs *before* the
        ``[ moleculetype ]`` detector within the same loop iteration. Without this
        ordering, a new ``[ moleculetype ]`` that should trigger insertion would
        instead be consumed by the ``continue`` in the detector, causing the block
        to be placed in the wrong molecule section.
        """
        with open(top_file, 'r') as f:
            lines = f.readlines()

        in_moleculetype = False
        found_molecule = False
        index = None

        for i, line in enumerate(lines):
            stripped = line.strip()

            # Find insertion point: just before the next top-level section or existing restraint block
            if found_molecule:
                is_next_section = stripped.startswith('[') and (
                    'system' in stripped or 'molecules' in stripped or 'moleculetype' in stripped)
                if is_next_section or line.startswith('#include ') or line.startswith('#ifdef POSRES'):
                    index = i
                    break

            # Detect [ moleculetype ] directive
            if stripped.startswith('[') and 'moleculetype' in stripped:
                in_moleculetype = True
                continue

            # Find the molecule name line inside the [ moleculetype ] block
            if in_moleculetype:
                if stripped and not stripped.startswith(';'):
                    in_moleculetype = False
                    if not stripped.startswith('[') and stripped.split()[0] == molecule_name:
                        found_molecule = True

        if not found_molecule:
            raise ValueError(f"Molecule type '{molecule_name}' not found in the topology file")

        if index is None:
            index = len(lines)

        lines.insert(index, '\n')
        lines.insert(index + 1, '; Include Position restraint file\n')
        lines.insert(index + 2, f'#ifdef {posre_name}\n')
        lines.insert(index + 3, f'#include "{itp_name}"\n')
        lines.insert(index + 4, '#endif\n')
        lines.insert(index + 5, '\n')

        with open(top_file, 'w') as f:
            f.writelines(lines)

    # --- Main entry point ---


[docs]
    @launchlogger
    def launch(self) -> int:
        """Execute the :class:`Ndx2resttop <gromacs_extra.ndx2resttop.Ndx2resttop>` object."""
        if self.check_restart():
            return 0

        top_file = fu.unzip_top(zip_file=self.io_dict['in'].get("input_top_zip_path", ""), out_log=self.out_log, unique_dir=self.stage_io_dict.get("unique_dir", ""))

        ndx_lines = open(self.io_dict['in'].get("input_ndx_path", "")).read().splitlines()
        groups_dic = self._parse_ndx_groups(ndx_lines)
        fu.log(f'Parsed NDX groups: {list(groups_dic.keys())}', self.out_log, self.global_log)

        # Parse triplet string: "(ref, restrain, mol), ..." -> list of 3-tuples
        raw_triplets = str(self.ref_rest_mol_triplet_list).split('),')
        self.ref_rest_mol_triplet_list = [
            tuple(elem.strip(' ()').replace(' ', '').split(','))
            for elem in raw_triplets
        ]
        fu.log('ref_rest_mol_triplet_list: ' + str(self.ref_rest_mol_triplet_list), self.out_log, self.global_log)

        if self.posres_names:
            self.posres_names = [elem.strip() for elem in self.posres_names.split()]
        else:
            self.posres_names = ['CUSTOM_POSRES'] * len(self.ref_rest_mol_triplet_list)
        fu.log('posres_names: ' + str(self.posres_names), self.out_log, self.global_log)

        if len(self.posres_names) != len(self.ref_rest_mol_triplet_list):
            raise ValueError("posres_names length must match ref_rest_mol_triplet_list length")

        for triplet, posre_name in zip(self.ref_rest_mol_triplet_list, self.posres_names):
            # Per-triplet workflow:
            #   1. Unpack (reference_group, restrain_group, molecule_name)
            #   2. Read global NDX indices for both groups
            #   3. Remap restrain atoms to molecule-local 1-based indices
            #   4. Write the _posre.itp
            #   5. Inject #ifdef guard — multi-chain (per-chain .itp) or single-chain (.top)

            reference_group, restrain_group, molecule_name = triplet
            fu.log(f'Reference group: {reference_group}', self.out_log, self.global_log)
            fu.log(f'Restrain group: {restrain_group}', self.out_log, self.global_log)
            fu.log(f'Molecule name: {molecule_name}', self.out_log, self.global_log)

            itp_path = fu.create_name(path=str(Path(top_file).parent), prefix=self.prefix,
                                      step=self.step, name=restrain_group + '_posre.itp')
            self.io_dict['out']["output_itp_path"] = itp_path

            # Map global NDX indices -> molecule-local 1-based indices.
            # reference_atoms.index(atom) gives the 0-based position of each global
            # restrain atom within the reference group list. Adding 1 converts to
            # GROMACS 1-based local numbering, which matches the atom order in the
            # [ atoms ] section of each moleculetype block.
            reference_atoms = self._read_ndx_atoms(ndx_lines, groups_dic, reference_group)
            restrain_atoms = self._read_ndx_atoms(ndx_lines, groups_dic, restrain_group)
            selected_atoms = [reference_atoms.index(atom) + 1 for atom in restrain_atoms]

            self._write_posre_itp(itp_path, selected_atoms)
            itp_name = Path(itp_path).name

            # Multi-chain: append ifdef block to the matching per-chain ITP.
            # Detection heuristic: if any .itp file in the topology directory has
            # molecule_name in its filename (and is not already a posre/pr file),
            # this is a multi-chain topology where each chain has its own .itp
            # included by the .top. In that case, append directly to that file.
            # Otherwise fall through to single-chain mode (_insert_posres_in_top).
            multi_chain = False
            for file_dir in Path(top_file).parent.iterdir():
                if "posre" not in file_dir.name and not file_dir.name.endswith("_pr.itp"):
                    match = fnmatch.fnmatch(str(file_dir), "*" + molecule_name + "*.itp")
                    if match:
                        multi_chain = True
                        with open(str(file_dir), 'a') as f:
                            fu.log(f'Opening {file_dir} and adding ifdef include', self.out_log, self.global_log)
                            f.write('\n')
                            f.write('; Include Position restraint file\n')
                            f.write(f'#ifdef {posre_name}\n')
                            f.write(f'#include "{itp_name}"\n')
                            f.write('#endif\n')
                            f.write('\n')

            # Single-chain: splice ifdef block into the .top file
            if not multi_chain:
                self._insert_posres_in_top(top_file, molecule_name, posre_name, itp_name)

        fu.zip_top(zip_file=self.io_dict['out'].get("output_top_zip_path", ""),
                   top_file=top_file, out_log=self.out_log, remove_original_files=self.remove_tmp)
        self.remove_tmp_files()
        self.check_arguments(output_files_created=True, raise_exception=False)
        return 0





[docs]
def ndx2resttop(input_ndx_path: str, input_top_zip_path: str, output_top_zip_path: str,
                properties: Optional[dict] = None, **kwargs) -> int:
    """Create :class:`Ndx2resttop <gromacs_extra.ndx2resttop.Ndx2resttop>` class and
    execute the :meth:`launch() <gromacs_extra.ndx2resttop.Ndx2resttop.launch>` method."""
    return Ndx2resttop(**dict(locals())).launch()



ndx2resttop.__doc__ = Ndx2resttop.__doc__
main = Ndx2resttop.get_main(ndx2resttop, "Generate a restrained topology from an index NDX file.")


if __name__ == '__main__':
    main()