# Source code for chilife.io

from __future__ import annotations

import os
import pickle
import shutil
import textwrap
import urllib
import urllib.request  # `import urllib` alone does not load the `request` submodule
import warnings
import zipfile
from collections import defaultdict
from collections.abc import Sequence
from hashlib import sha256
from io import StringIO
from pathlib import Path
from typing import Tuple, Dict, Union, BinaryIO, TextIO

import numpy as np
from numpy.typing import ArrayLike
from scipy.stats import gaussian_kde
from memoization import cached, suppress_warnings
import MDAnalysis
import MDAnalysis as mda

import chilife.RotamerEnsemble as re
import chilife.SpinLabel as sl
import chilife.dRotamerEnsemble as dre
import chilife.LigandEnsemble as le
import chilife
from .globals import (
    dihedral_defs,
    rotlib_indexes,
    RL_DIR,
    SUPPORTED_BB_LABELS,
    USER_RL_DIR,
    rotlib_defaults,
    alt_prot_states,
)

from .alignment_methods import local_mx
from .IntrinsicLabel import IntrinsicLabel
from .MolSys import MolecularSystemBase, MolSys
from .Topology import BondType
from .MolSysIC import MolSysIC
from .pdb_utils import parse_connect, sort_pdb

#                 ID    name   res  chain resnum      X     Y      Z      q      b              elem
# Template for a PDB ATOM record; used by all PDB writers in this module.
fmt_str = "ATOM  {:5d} {:^4s} {:3s} {:1s}{:4d}    {:8.3f}{:8.3f}{:8.3f}{:6.2f}{:6.2f}          {:>2s}  \n"

# Column names of the `_chem_comp_atom` loop in chemical-component CIF files.
CHEM_COMP_ATOM_KEYS = (
    "comp_id",
    "atom_id",
    "type_symbol",
    "pdbx_aromatic_flag",
    "pdbx_stereo_config",
)

# Column names of the `_chem_comp_bond` loop in chemical-component CIF files.
CHEM_COMP_BOND_KEYS = (
    "comp_id",
    "atom_id_1",
    "atom_id_2",
    "value_order",
    "pdbx_aromatic_flag",
    "pdbx_stereo_config",
)

# Map CIF `value_order` strings to chiLife BondType values.
CIF_BOND_TO_XL = {
    "arom": BondType.AROMATIC,
    "delo": BondType.DELOCALIZED,
    "doub": BondType.DOUBLE,
    "pi": BondType.PI,
    "poly": BondType.POLYMERIC,
    "quad": BondType.QUADRUPLE,
    "sing": BondType.SINGLE,
    "trip": BondType.TRIPLE,
    "metalc": BondType.DATIVE,
    "disulf": BondType.UNSPECIFIED,
}

# Inverse mapping for writing CIF bond orders.
XL_BOND_TO_CIF = {v: k for k, v in CIF_BOND_TO_XL.items()}

# Map SDF (CTfile V2000) numeric bond types to chiLife BondType values.
# Codes 5-8 are query bond types; they are collapsed onto concrete types here.
SDF_BOND_TO_XL = {
    1: BondType.SINGLE,
    2: BondType.DOUBLE,
    3: BondType.TRIPLE,
    4: BondType.AROMATIC,
    5: BondType.UNSPECIFIED,
    6: BondType.SINGLE,
    7: BondType.DOUBLE,
    8: BondType.UNSPECIFIED,
}

# Inverse mapping for writing SDF bonds; query codes (5, 6, 7) are excluded so
# each BondType maps to a single canonical SDF code.
XL_BOND_TO_SDF = {v: k for k, v in SDF_BOND_TO_XL.items() if k not in (5, 6, 7)}

# Map the legacy SDF atom-block charge column code to a signed integer charge.
SDF_ATOM_CHARGE_TO_INT = {
    0: 0,
    1: 3,
    2: 2,
    3: 1,
    4: 0,
    5: -1,
    6: -2,
    7: -3,
}


def read_distance_distribution(file_name: str) -> Tuple[np.ndarray, np.ndarray]:
    """Reads a DEER distance distribution file in the DeerAnalysis or similar format.

    Parameters
    ----------
    file_name : str
        File name of distance distribution

    Returns
    -------
    r : np.ndarray
        Domain of distance distribution in the same units as the file.
    p : np.ndarray
        Probability density over r normalized such that the integral of p over r is 1.
    """
    # Load the two-column DeerAnalysis-style file: distance, probability.
    distribution = np.loadtxt(file_name)
    domain = distribution[:, 0]
    density = distribution[:, 1]

    # Heuristic unit conversion: distance domains expressed in nanometers
    # rarely exceed 15, so rescale such domains to angstroms.
    if max(domain) < 15:
        domain *= 10

    return domain, density
def hash_file(file: Union[Path, BinaryIO]):
    """Helper function for hashing rotamer library files.

    Parameters
    ----------
    file : str, Path, BinaryIO
        The name, Path or IOBytes of the file to hash

    Returns
    -------
    hashstring : str
        file hash
    """
    hash = sha256()
    with open(file, "rb") as f:
        # Read in digest-block-sized chunks so large rotlibs never have to be
        # held in memory all at once.
        while True:
            block = f.read(hash.block_size)
            if not block:
                break
            hash.update(block)
    return hash.hexdigest()


def get_possible_rotlibs(
    rotlib: str,
    suffix: str,
    extension: str,
    return_all: bool = False,
    was_none: bool = False,
) -> Union[Path, None]:
    """Search all known rotlib directories and the current working directory for
    rotlib(s) that match the provided information.

    Parameters
    ----------
    rotlib : str
        Fullname, base name or partial name of the rotamer libraries to search for.
    suffix : str
        possible suffixes the rotamer library may have, e.g. ip2 to indicate an i+2
        rotamer library.
    extension : str
        filetype extension to look for. This will be either `npz` for monofunctional
        rotlibs or zip for bifunctional rotlibs.
    return_all : bool
        By default, only the first found rotlib will be returned unless
        ``return_all = True``, in which case all matches are returned.
    was_none : bool
        For internal use only.

    Returns
    -------
    rotlib : Path, List[Path], None
        The path to the found rotlib or a list of paths to the rotlibs that match
        the search criteria, or ``None`` if no rotlibs are found.
    """
    cwd = Path.cwd()
    sufplusex = "_" + suffix + extension

    # Assemble a list of possible rotlib paths starting in the current directory
    possible_rotlibs = [
        Path(rotlib),
        cwd / rotlib,
        cwd / (rotlib + extension),
        cwd / (rotlib + sufplusex),
    ]
    possible_rotlibs += list(cwd.glob(f"{rotlib}*{sufplusex}"))

    # Then in the user defined rotamer library directory
    for pth in USER_RL_DIR:
        possible_rotlibs += list(pth.glob(f"{rotlib}*{sufplusex}"))

    # Finally the built-in library directory, unless the user explicitly
    # passed None for the rotlib originally.
    if not was_none:
        possible_rotlibs += list((RL_DIR / "user_rotlibs").glob(f"*{rotlib}*"))

    if return_all:
        rotlib = []

    # for/else: the else clause runs only when no candidate triggered `break`,
    # i.e. no single matching file was found.
    for possible_file in possible_rotlibs:
        if possible_file.exists() and return_all:
            rotlib.append(possible_file)
        elif possible_file.exists() and not possible_file.is_dir():
            rotlib = possible_file
            break
    else:
        if isinstance(rotlib, str) and was_none and rotlib in rotlib_defaults:
            # Fall back to the default rotlib registered for this residue name.
            rotlib = RL_DIR / "user_rotlibs" / (rotlib_defaults[rotlib][0] + sufplusex)
        elif not isinstance(rotlib, list) or rotlib == []:
            rotlib = None

    # rotlib lists need to be sorted to prevent position mismatches for results with tests.
    if isinstance(rotlib, list):
        rotlib = list(set(rotlib))
        rotlib = [rot for rot in rotlib if str(rot).endswith(extension)]
        rotlib = sorted(rotlib)
    else:
        rotlib = rotlib if str(rotlib).endswith(extension) else None

    return rotlib


# Silence memoization's warning about the custom (file-hash based) cache key.
suppress_warnings()


@cached(custom_key_maker=hash_file)
def read_rotlib(rotlib: Union[Path, BinaryIO] = None) -> Dict:
    """Reads RotamerEnsemble for stored spin labels.

    Parameters
    ----------
    rotlib : Path, BinaryIO
        Path object pointing to the rotamer library. Or a BytesIO object containing
        the rotamer library file

    Returns
    -------
    lib : dict
        Dictionary of SpinLabel rotamer ensemble attributes including coords,
        weights, dihedrals etc.
    """
    with np.load(rotlib, allow_pickle=True) as files:
        if files["format_version"] <= 1.1:
            raise RuntimeError(
                "The rotlib that was provided is an old version that is not compatible with your "
                "version of chiLife. You can either remake the rotlib, or use the update_rotlib.py "
                "script provided in the chilife scripts directory to update this rotamer library to the "
                "new format."
            )

        # Materialize the lazy NpzFile into a regular dict before the archive closes.
        lib = dict(files)
        if "allow_pickle" in lib:
            del lib["allow_pickle"]

        if "sigmas" not in lib:
            lib["sigmas"] = np.array([])

        # internal_coords is stored as a 0-d object array; unwrap it.
        lib["internal_coords"] = lib["internal_coords"].item()

        # Precompute radian copies used by sampling code.
        lib["_rdihedrals"] = np.deg2rad(lib["dihedrals"])
        lib["_rsigmas"] = np.deg2rad(lib["sigmas"])

        # Coerce 0-d string/scalar arrays back into plain Python types.
        lib["rotlib"] = str(lib["rotlib"])
        lib["type"] = str(lib["type"])
        lib["format_version"] = float(lib["format_version"])

    return lib


@cached(custom_key_maker=hash_file)
def read_drotlib(rotlib: Path) -> Tuple[dict]:
    """Reads RotamerEnsemble for stored spin labels.

    Parameters
    ----------
    rotlib : Path
        The Path to the rotamer library file.

    Returns
    -------
    lib : Tuple[dict]
        Dictionaries of rotamer library attributes including sub_residues.
    """
    # Bifunctional rotlibs are zip archives holding two monofunctional
    # libraries (the "A" and "B" sides) plus a constraint-distance array.
    with zipfile.ZipFile(rotlib, "r") as archive:
        for f in archive.namelist():
            if "csts" in f:
                with archive.open(f) as of:
                    csts = np.load(of)
            # The side letter sits at a fixed offset from the end of the
            # member file name (e.g. "...A_rotlib.npz").
            elif f[-12] == "A":
                with archive.open(f) as of:
                    libA = read_rotlib.__wrapped__(of)
            elif f[-12] == "B":
                with archive.open(f) as of:
                    libB = read_rotlib.__wrapped__(of)

    return libA, libB, csts


@cached
def read_bbdep(res: str, Phi: int, Psi: int) -> Dict:
    """Read the Dunbrack rotamer library for the provided residue and backbone
    conformation.

    Parameters
    ----------
    res : str
        3-letter residue code
    Phi : int
        Backbone Phi dihedral angle for the provided residue
    Psi : int
        Backbone Psi dihedral angle for the provided residue

    Returns
    -------
    lib : dict
        Dictionary of arrays containing rotamer library information in cartesian
        and dihedral space
    """
    ic_res = alt_prot_states.get(res, res)
    lib = {}
    Phi, Psi = str(Phi), str(Psi)

    # Read residue internal coordinate structure
    with open(RL_DIR / f"residue_internal_coords/{res.lower()}_ic.pkl", "rb") as f:
        ICs = pickle.load(f)

    atom_types = ICs.atom_types.copy()
    atom_names = ICs.atom_names.copy()

    maxchi = 5 if res in SUPPORTED_BB_LABELS else 4
    nchi = np.minimum(len(dihedral_defs[res]), maxchi)

    if res not in ("ALA", "GLY"):
        library = "R1C.lib" if res in SUPPORTED_BB_LABELS else "ALL.bbdep.rotamers.lib"

        # rotlib_indexes maps a residue/backbone key to a (seek offset, byte
        # length) pair into the packed library file.
        start, length = rotlib_indexes[f"{ic_res} {Phi:>4}{Psi:>5}"]

        with open(RL_DIR / library, "rb") as f:
            f.seek(start)
            rotlib_string = f.read(length).decode()
            s = StringIO(rotlib_string)
            s.seek(0)
            data = np.genfromtxt(s, usecols=range(maxchi + 4, maxchi + 5 + 2 * maxchi))

        lib["weights"] = data[:, 0]
        lib["dihedrals"] = data[:, 1 : nchi + 1]
        lib["sigmas"] = data[:, maxchi + 1 : maxchi + nchi + 1]
        dihedral_atoms = dihedral_defs[res][:nchi]

        # Calculate cartesian coordinates for each rotamer
        z_matrix = ICs.batch_set_dihedrals(
            np.zeros(len(lib["dihedrals"]), dtype=int),
            np.deg2rad(lib["dihedrals"]),
            1,
            dihedral_atoms,
        )
        ICs._chain_operators = ICs._chain_operators[0]
        ICs.load_new(np.array(z_matrix))
        internal_coords = ICs.copy()
        coords = ICs.protein.trajectory.coordinate_array.copy()
    else:
        # ALA and GLY have no side-chain dihedrals: a single "rotamer".
        lib["weights"] = np.array([1])
        lib["dihedrals"], lib["sigmas"], dihedral_atoms = [], [], []
        coords = ICs.to_cartesian()[None, ...]
        internal_coords = ICs.copy()

    # Get origin and rotation matrix of local frame
    mask = np.isin(atom_names, ["N", "CA", "C"])
    ori, mx = local_mx(*coords[0, mask])

    # Set coords in local frame and prepare output
    coords -= ori
    lib["coords"] = np.einsum("ijk,kl->ijl", coords, mx)
    lib["internal_coords"] = internal_coords
    lib["atom_types"] = np.asarray(atom_types, dtype=str)
    lib["atom_names"] = np.asarray(atom_names, dtype=str)
    lib["dihedral_atoms"] = np.asarray(dihedral_atoms, dtype=str)
    lib["_rdihedrals"] = np.deg2rad(lib["dihedrals"])
    lib["_rsigmas"] = np.deg2rad(lib["sigmas"])
    lib["rotlib"] = res
    lib["backbone_atoms"] = ["H", "N", "CA", "HA", "C", "O"]
    lib["aln_atoms"] = ["N", "CA", "C"]

    # hacky solution to experimental backbone dependent rotlibs
    if res == "R1C":
        lib["spin_atoms"] = np.array(["N1", "O1"])
        lib["spin_weights"] = np.array([0.5, 0.5])

    return lib
def read_library(rotlib: str, Phi: float = None, Psi: float = None) -> Dict:
    """Function to read rotamer libraries and bifunctional rotamer libraries as
    dictionaries.

    Parameters
    ----------
    rotlib : str, Path
        3 letter residue code or path the rotamer library file.
    Phi : float
        Backbone Phi dihedral angle for the provided residue. Only applicable
        for canonical amino acids.
    Psi : float
        Backbone Psi dihedral angle for the provided residue. Only applicable
        for canonical amino acids.

    Returns
    -------
    lib : dict
        Dictionary of arrays containing rotamer library information in cartesian
        and dihedral space.

    Raises
    ------
    ValueError
        If ``rotlib`` is a Path without a recognized rotlib extension, or is
        neither a str nor a Path.
    """
    # BUGFIX: compare against None explicitly. The previous truthiness test
    # (`Phi and Psi`) silently discarded a legitimate backbone angle of 0.0.
    backbone_exists = Phi is not None and Psi is not None

    if backbone_exists:
        # Dunbrack libraries are tabulated on a 10-degree grid; snap down.
        Phi = int((Phi // 10) * 10)
        Psi = int((Psi // 10) * 10)
    else:
        # Use helix backbone if not otherwise specified
        Phi, Psi = -60, -50

    if isinstance(rotlib, Path):
        if rotlib.suffix == ".npz":
            # Monofunctional rotamer library
            return read_rotlib(rotlib)
        elif rotlib.suffix == ".zip":
            # Bifunctional rotamer library
            return read_drotlib(rotlib)
        else:
            raise ValueError(f"{rotlib.name} is not a valid rotamer library file type")
    elif isinstance(rotlib, str):
        # Canonical residue: look up the backbone-dependent library.
        return read_bbdep(rotlib, Phi, Psi)
    else:
        raise ValueError(f"{rotlib} is not a valid rotamer library")
def save(
    file_name: str,
    *molecules: Union[
        re.RotamerEnsemble, MolecularSystemBase, mda.Universe, mda.AtomGroup, str
    ],
    protein_path: Union[str, Path] = None,
    mode: str = "w",
    conect: bool = False,
    frames: Union[int, str, ArrayLike, None] = "all",
    **kwargs,
) -> None:
    """Save a pdb file of the provided labels and proteins.

    Parameters
    ----------
    file_name : str
        Desired file name for output file. Will be automatically made based off of
        protein name and labels if not provided.
    *molecules : RotamerEnsemble, chiLife.MolSys, mda.Universe, mda.AtomGroup, str
        Object(s) to save. Includes RotamerEnsemble, SpinLabels, dRotamerEnsembles,
        proteins, or path to protein pdb. Can add as many as desired, except for
        path, for which only one can be given.
    protein_path : str, Path
        The Path to a pdb file to use as the protein object.
    mode : str
        Which mode to open the file in. Accepts 'w' or 'a' to overwrite or append.
    conect : bool
        Whether to save PDB CONECT records. Defaults to False.
    frames : int, str, ArrayLike[int], None
        Exclusively used for ``MolSys`` and ``AtomGroup`` objects. ``frames`` are
        the indices of the frames to save. Can be a single index, array of indices,
        'all', or ``None``. If ``None`` is passed only the active frame will be
        saved. If 'all' is passed, all frames are saved. Default is 'all'.
    **kwargs :
        Additional arguments to pass to ``write_labels``

        write_spin_centers : bool
            Write spin centers (atoms named NEN) as a separate object with weights
            mapped to q-factor.
    """
    # Check for filename at the beginning of args: the first positional argument
    # may actually be a saveable molecule rather than a name.
    if isinstance(file_name, tuple(molecule_class.keys())):
        molecules = list(molecules)
        molecules.insert(0, file_name)
        file_name = None

    # Bin each molecule by the writer category it belongs to
    # ("rotens", "molcart" or "molic").
    tmolecules = defaultdict(list)
    for mol in molecules:
        mcls = [val for key, val in molecule_class.items() if isinstance(mol, key)]
        if mcls == []:
            raise TypeError(
                f"{type(mol)} is not a supported type for this function. "
                "chiLife can only save objects of the following types:\n"
                + "".join(f"{key.__name__}\n" for key in molecule_class)
                + "Please check that your input is compatible"
            )
        tmolecules[mcls[0]].append(mol)
    molecules = tmolecules

    # Ensure only one protein path was provided (for now)
    protein_path = [] if protein_path is None else [protein_path]
    if len(protein_path) > 1:
        # FIX: previously ended with a stray "C" garbling the message.
        raise ValueError("More than one protein path was provided.")
    elif len(protein_path) == 1:
        protein_path = protein_path[0]
    else:
        protein_path = None

    # Create a file name from protein information
    if file_name is None:
        if protein_path is not None:
            f = Path(protein_path)
            file_name = f.name[:-4]
        else:
            file_name = ""
            for protein in molecules["molcart"]:
                if getattr(protein, "filename", None):
                    file_name += " " + Path(protein.filename).name
                    file_name = file_name[:-4]
            if file_name == "" and len(molecules["molcart"]) > 0:
                file_name = "No_Name_Protein"

        # Add spin label information to file name
        if 0 < len(molecules["rotens"]) < 3:
            naml = [file_name] + [rotens.name for rotens in molecules["rotens"]]
            file_name = "_".join(naml)
            file_name = file_name.strip("_")
        elif len(molecules["rotens"]) >= 3:
            file_name += "_many_labels"

        file_name += ".pdb"
        file_name = file_name.strip()

    if protein_path is not None:
        # FIX: removed leftover debug print of (protein_path, file_name).
        shutil.copy(protein_path, file_name)
        pdb_file = open(file_name, "a+")
    else:
        pdb_file = open(file_name, mode)

    try:
        # Track name collisions so repeated structures get numbered suffixes.
        used_names = {}
        for protein in molecules["molcart"]:
            if isinstance(protein, (mda.AtomGroup, mda.Universe)):
                name = (
                    Path(protein.universe.filename)
                    if protein.universe.filename is not None
                    else Path(pdb_file.name)
                )
                name = name.name
            else:
                name = protein.fname if hasattr(protein, "fname") else None

            if name is None:
                name = Path(pdb_file.name).name

            name = name[:-4] if name.endswith(".pdb") else name
            name_ = name + str(used_names[name]) if name in used_names else name
            used_names[name] = used_names.setdefault(name, 0) + 1

            write_protein(pdb_file, protein, name_, conect=conect, frames=frames)

        for ic in molecules["molic"]:
            if isinstance(ic.protein, (mda.AtomGroup, mda.Universe)):
                name = (
                    Path(ic.protein.universe.filename)
                    if ic.protein.universe.filename is not None
                    else Path(pdb_file.name)
                )
                name = name.name
            else:
                name = ic.protein.fname if hasattr(ic.protein, "fname") else None

            if name is None:
                name = Path(pdb_file.name).name

            name = name[:-4] if name.endswith(".pdb") else name
            name_ = name + str(used_names[name]) if name in used_names else name

            write_ic(pdb_file, ic, name=name_, conect=conect, frames=frames)

        if len(molecules["rotens"]) > 0:
            write_labels(pdb_file, *molecules["rotens"], conect=conect, **kwargs)
    finally:
        # Always release the file handle, even if a writer raises.
        pdb_file.close()
def fetch(
    accession_number: str, save: bool = False, format: str | None = None
) -> MDAnalysis.Universe:
    """Fetch pdb file from the protein data bank or the AlphaFold Database and
    optionally save to disk.

    Parameters
    ----------
    accession_number : str
        4 letter structure PDB ID or alpha fold accession number. Note that
        AlphaFold accession numbers must begin with 'AF-'.
    save : bool
        If true the fetched PDB will be saved to the disk.
    format : str
        Format of the structure file (pdb or cif).

    Returns
    -------
    U : MDAnalysis.Universe
        MDAnalysis Universe object of the protein corresponding to the provided
        PDB ID or AlphaFold accession number
    """
    # Infer/normalize the file format: from the accession suffix if present,
    # defaulting to pdb, and stripping a leading '.' from explicit formats.
    if format is None and accession_number.lower().endswith(".pdb"):
        format = "pdb"
    elif format is None and accession_number.lower().endswith(".cif"):
        format = "cif"
    elif format is None:
        format = "pdb"
    elif format.lower().startswith("."):
        format = format[1:].lower()
    else:
        format = format.lower()

    accession_number = accession_number.split(f".{format}")[0]
    pdb_name = accession_number + "." + format

    if accession_number.startswith("AF-"):
        # FIX: removed leftover debug print of the AlphaFold URL.
        urllib.request.urlretrieve(
            f"https://alphafold.ebi.ac.uk/files/{accession_number}-F1-model_v6.{format}",
            pdb_name,
        )
    else:
        urllib.request.urlretrieve(
            f"http://files.rcsb.org/download/{pdb_name}", pdb_name
        )

    if format.lower() == "pdb":
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            U = mda.Universe(pdb_name, in_memory=True)
    elif format.lower() == "cif":
        U = MolSys.from_cif(pdb_name)

    if not save:
        os.remove(pdb_name)

    return U


def load_protein(
    struct_file: Union[str, Path], *traj_file: Union[str, Path]
) -> MDAnalysis.AtomGroup:
    """Load a protein structure (and optional trajectory) into memory.

    Parameters
    ----------
    struct_file : Union[TextIO, str, Path]
        Name, Path or TextIO object referencing the structure file
        (e.g. pdb, gro, psf)
    *traj_file : Union[TextIO, str, Path]
        (optional) Name, Path or TextIO object(s) referencing the trajectory file
        (e.g. pdb, xtc, dcd)

    Returns
    -------
    protein : MDAnalysis.AtomGroup
        An MDA AtomGroup object containing the protein structure and trajectory.
        The object is always loaded into memory to allow coordinate manipulations.
    """
    # FIX: `traj_file` is a tuple, so the old `traj_file != []` comparison was
    # always True and the else-branch was dead code. Plain truthiness does what
    # was intended.
    if traj_file:
        traj_file = [str(file) for file in traj_file]
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            protein = mda.Universe(str(struct_file), *traj_file, in_memory=True)
    else:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            protein = mda.Universe(struct_file, in_memory=True)

    return protein
def read_pdb(pdb_file: Union[str, Path], sort_atoms=False):
    """Read A PDB file into a dictionary containing all standard pdb atom
    information and bonds if CONECT records are present.

    Parameters
    ----------
    pdb_file : str | Path
        File name or ``Path`` to the PDB file to be read.
    sort_atoms : bool
        Pre-sort the PDB lines with :func:`sort_pdb` before parsing.

    Returns
    -------
    pdb_data : dict
        Dictionary of PDB data.
    """
    # Field names for each column parsed out of an ATOM/HETATM record below.
    keys = [
        "record_types",
        "atomids",
        "names",
        "altlocs",
        "resnames",
        "chains",
        "resnums",
        "icodes",
        "coords",
        "occupancies",
        "bs",
        "segs",
        "atypes",
        "charges",
    ]

    if sort_atoms:
        lines = sort_pdb(pdb_file)
    else:
        with open(pdb_file, "r") as f:
            lines = f.readlines()

    # Split the file into atom records and CONECT records; everything else
    # (REMARK, TER, ...) is ignored.
    connect = []
    atom_lines = []
    for line in lines:
        if line.startswith(("MODEL", "ENDMDL", "ATOM", "HETATM")):
            atom_lines.append(line)
        elif line.startswith("CONECT"):
            connect.append(line)

    # Locate MODEL/ENDMDL pairs so multi-state PDBs can be split into frames.
    start_idxs = []
    end_idxs = []
    for i, line in enumerate(atom_lines):
        if line.startswith("MODEL"):
            start_idxs.append(i + 1)
        elif line.startswith("ENDMDL"):
            end_idxs.append(i)

    if len(start_idxs) > 0:
        lines = [atom_lines[start:end] for start, end in zip(start_idxs, end_idxs)]
    else:
        lines = [atom_lines]

    # Parse the first model with fixed PDB column offsets. Note that the atom
    # id column is replaced by the 0-based enumeration index `i`.
    PDB_data = [
        (
            line[:6].strip(),
            i,
            line[12:16].strip(),
            line[16:17].strip(),
            line[17:20].strip(),
            line[21:22].strip(),
            int(line[22:26]),
            line[26:27].strip(),
            (float(line[30:38]), float(line[38:46]), float(line[46:54])),
            float(line[54:60]),
            float(line[60:66]),
            line[72:73].strip(),
            line[76:78].strip(),
            line[78:80].strip(),
        )
        for i, line in enumerate(lines[0])
    ]

    pdb_dict = {key: np.array(data) for key, data in zip(keys, zip(*PDB_data))}

    # Parse connect data
    if len(connect) > 0:
        # Only covalent bonds are kept; hydrogen/ionic bond records are dropped.
        c_bonds, h_bonds, i_bonds = parse_connect(connect)
        bonds = np.array(list(c_bonds))
        pdb_dict["bonds"] = bonds
        pdb_dict["bond_types"] = np.array(
            [BondType.UNSPECIFIED for _ in range(len(bonds))]
        )

    # First frame comes from the fully parsed model; additional models only
    # need their coordinate columns.
    trajectory = [pdb_dict.pop("coords")]
    if len(lines) > 1:
        for struct in lines[1:]:
            frame_coords = [(line[30:38], line[38:46], line[46:54]) for line in struct]

            if len(frame_coords) != len(PDB_data):
                raise ValueError(
                    "All models in a multistate PDB must have the same atoms"
                )

            trajectory.append(frame_coords)

    pdb_dict["trajectory"] = np.array(trajectory, dtype=float)
    pdb_dict["name"] = Path(pdb_file).name

    return pdb_dict
def read_sdf(file_name):
    """Reads SDF file into a dictionary structure.

    Parameters
    ----------
    file_name : str, Path
        string or Path object directing the function to the sdf file to read.

    Returns
    -------
    mols : List[dict]
        List of dictionaries containing all the sdf data for each molecule in
        the sdf file.
    """
    # Read in mol blocks; molecules in an SDF file are delimited by "$$$$".
    with open(file_name, "r") as f:
        blocks, block = [], []
        for line in f:
            line = line.rstrip("\r\n")
            if not line == "$$$$":
                block.append(line)
            else:
                blocks.append(block)
                block = []

    mols = []
    for block in blocks:
        # First three lines of a molblock: name, software line, comment.
        mol_data = {
            "name": block[0],
            "software": block[1].strip(),
            "comment": block[2].strip(),
        }

        # Counts line (fixed-width fields per the CTfile V2000 spec).
        n_atoms = block[3][:3].strip()
        n_bonds = block[3][3:6].strip()
        n_atom_lists = block[3][6:9].strip()
        chiral = block[3][12:15].strip()
        n_stexts = block[3][15:18].strip()
        properties_lines = block[3][30:33]
        format_version = block[3][33:].strip()

        mol_data["n_atoms"] = int(n_atoms)
        mol_data["n_bonds"] = int(n_bonds)
        mol_data["chiral"] = bool(int(chiral))
        mol_data["format"] = format_version

        atom_end_idx = 4 + mol_data["n_atoms"]
        bond_end_idx = atom_end_idx + mol_data["n_bonds"]

        # Atom block: fixed-width coordinate and attribute columns.
        mol_data["atoms"] = []
        for idx, line in enumerate(block[4:atom_end_idx]):
            atom = {}
            atom["index"] = idx
            atom["xyz"] = [float(line[0:10]), float(line[10:20]), float(line[20:30])]
            atom["element"] = line[31:34].strip()
            atom["mass difference"] = int(float(line[34:36]))
            atom["charge"] = int(line[36:39])
            atom["stereo"] = int(line[39:42])
            # Valence column may be absent on short lines.
            atom["valence"] = int(line[48:51]) if len(line) >= 51 else 0
            mol_data["atoms"].append(atom)

        # Bond block.
        mol_data["bonds"] = []
        for idx, line in enumerate(block[atom_end_idx:bond_end_idx]):
            bond = {}
            bond["idx1"] = int(line[0:3]) - 1  # Coerce to 0 index
            bond["idx2"] = int(line[3:6]) - 1  # Coerce to 0 index
            bond["type"] = int(line[6:9])
            bond["stereo"] = int(line[9:12])
            mol_data["bonds"].append(bond)

        # Property block ("M  CHG", "M  ISO", ...) terminated by the END line.
        # NOTE(review): the spec writes these markers with two spaces
        # ("M  END"); confirm against the original source that the single-space
        # literals here are intentional.
        properties = {}
        for idx, line in enumerate(block[bond_end_idx:]):
            if line == "M END":
                break
            if line.startswith("M "):
                key = line[3:6]
                lst = properties.setdefault(key, [])
                lst.append(line[6:])
            else:
                break

        for key, val in properties.items():
            properties[key] = parse_property_lines(key, val)
        mol_data["properties"] = properties

        # `idx` is left pointing at the END/terminator line of the loop above.
        data_end_idx = bond_end_idx + idx + 1

        # Associated data items: "> <tag>" header lines followed by payload
        # lines, gathered until a blank line ends the item.
        data_iterator = iter(block[data_end_idx:])
        data_block = []
        for line in data_iterator:
            opened = line.startswith(">")
            while opened:
                data_block.append(line)
                line = next(data_iterator, "")
                opened = bool(line)
            if data_block:
                header, data = parse_data_block(data_block)
                mol_data[header] = data
                data_block = []

        mols.append(mol_data)

    return mols
def parse_data_block(data_block):
    """Helper function to parse the data block of an SDF file. Only used
    internally.

    Parameters
    ----------
    data_block : List[str]
        The lines of the sdf file pertaining to the data block

    Returns
    -------
    header : str
        The header of the data block
    data_string : str
        a string containing all data in the data block
    """
    # The tag name sits between angle brackets on the first ("> <tag>") line.
    first_line = data_block[0][1:]
    tag = first_line[first_line.find("<") + 1 : first_line.find(">")]
    payload = "\n".join(data_block[1:])
    return tag, payload


# Map 3-letter SDF property codes to human-readable column names.
standard_property_vnames = {
    "CHG": "charges",
    "RAD": "multiplicities",
    "ISO": "mass",
    "RBC": "ring bond count",
    "SUB": "substitution count",
    "UNS": "unsaturated atom",
}


def parse_property_lines(property, lines):
    """Helper function to parse properties from sdf files.

    Parameters
    ----------
    property : str
        Property being parsed
    lines : List[str]
        The lines of the sdf file pertaining to the property block being parsed.

    Returns
    -------
    dict | List[str]
        dictionary of the atom ids mapping to the properties for each atom if
        `property` is known to chilife otherwise the input lines as provided.
    """
    # Unknown property codes pass through untouched.
    if property not in standard_property_vnames:
        return lines

    column_name = standard_property_vnames[property]
    atom_ids, values = parse_standard_property(lines)
    return {
        "atom_ids": np.array(atom_ids, dtype=int) - 1,
        column_name: np.array(values, dtype=int),
    }


def parse_standard_property(lines):
    """Helper function to parse standard properties from sdf files.

    Parameters
    ----------
    lines : List[str]
        The lines of the sdf file pertaining to the property block being parsed.

    Returns
    -------
    k : List
        Atom ids for which this property is relevant.
    v : List
        Quantitative value of the property.
    """
    ids, values = [], []
    for line in lines:
        # 4-character fields: leading count, then alternating (atom id, value).
        fields = textwrap.wrap(line, 4)
        ids.extend(fields[1::2])
        values.extend(fields[2::2])
    return ids, values


def standard_property_to_string(key, property):
    """Helper function to convert standard properties in a dictionary to a
    string format for writing sdf files.

    Parameters
    ----------
    key : str
        3 letter keycode of the property
    property : dict
        Property dict of a standard property as parsed by
        `parse_standard_property`.

    Returns
    -------
    strings : List[str]
        List of strings containing the property lines of an sdf file.
    """
    strings = []
    data_col = standard_property_vnames[key]
    count = 1
    buffer = " "
    for atom_id, value in zip(property["atom_ids"], property[data_col]):
        # A property line carries at most 8 entries; flush the buffer when full.
        if count >= 8:
            strings.append(f"M {key}{count:>3d}" + buffer + "\n")
            buffer = " "
            count = 1
        buffer += f"{str(atom_id + 1):>3s} {str(value):>3s}"
        count += 1

    # Flush any trailing partial line.
    if buffer != " ":
        strings.append(f"M {key}{count:>3d}" + buffer + "\n")

    return strings
def write_sdf(sdf_data, file_name):
    """Function to write sdf file from sdf format in a dict as read by
    `read_sdf`.

    Parameters
    ----------
    sdf_data : Dict | List[Dict]
        The sdf data in chilife format
    file_name : str
        Name of the file to write the sdf data to.
    """
    # Accept a single molecule dict as well as a list of them.
    if isinstance(sdf_data, dict):
        sdf_data = [sdf_data]

    lines = []
    for mol in sdf_data:
        # Header
        lines.append(f"{mol['name']}\n")
        lines.append(f"\tchiLife {chilife.__version__}\n")
        lines.append(f"{mol['comment']}\n")
        # Counts line; unused counts are written as 0 per the V2000 layout.
        lines.append(
            f"{mol['n_atoms']:>3}{mol['n_bonds']:>3}{0:>3}{0:>3}{mol['chiral']:>3}"
            f"{0:>3}{0:>3}{0:>3}{0:>3}{0:>3}999 V2000\n"
        )

        # Coords
        atoms = mol["atoms"]
        for atom in atoms:
            x, y, z = atom["xyz"]
            lines.append(
                f"{x:10.4f}{y:10.4f}{z:10.4f}{atom['element']:>2} {atom['mass difference']:>3}{atom['charge']:>3}"
                f"{atom['stereo']:>3}{0:>3}{0:>3}{atom['valence']:>3}"
                + f"{0:>3}" * 6
                + "\n"
            )

        # Bonds (1-indexed atom ids on output)
        bonds = mol["bonds"]
        for bond in bonds:
            lines.append(
                f"{bond['idx1'] + 1:>3}{bond['idx2'] + 1:>3}{bond['type']:>3}{bond['stereo']:>3}\n"
            )

        # Write CHG RAD ISO Data
        for key, property in mol["properties"].items():
            if key in standard_property_vnames:
                lines += standard_property_to_string(key, property)
            else:
                # Unknown properties were kept as raw lines by read_sdf.
                lines += property

        # NOTE(review): the CTfile spec writes "M  END" with two spaces —
        # confirm the single-space literal against the original source.
        lines.append("M END\n")

        # Write data blocks: any non-standard key on the molecule dict becomes
        # a "> <tag>" associated-data item.
        standard_keys = {
            "name",
            "software",
            "comment",
            "n_atoms",
            "n_bonds",
            "chiral",
            "format",
            "atoms",
            "bonds",
            "properties",
        }
        data_keys = [key for key in mol if key not in standard_keys]
        for key in data_keys:
            lines.append(f"> <{key}>\n")
            lines.append(f"{mol[key]}")
            lines.append("\n\n")

        lines.append("$$$$\n")

    with open(file_name, "w") as f:
        f.writelines(lines)
def write_protein(
    pdb_file: TextIO,
    protein: Union[mda.Universe, mda.AtomGroup, MolecularSystemBase],
    name: str = None,
    conect: bool = False,
    frames: Union[int, str, ArrayLike, None] = "all",
) -> None:
    """Helper function to write protein PDBs from MDAnalysis and MolSys objects.

    Parameters
    ----------
    pdb_file : TextIO
        File to save the protein to
    protein : MDAnalysis.Universe, MDAnalysis.AtomGroup, MolSys
        MDAnalysis or MolSys object to save
    name : str
        Name of the protein to put in the header
    conect : bool
        Write PDB CONECT records for bonds if available.
    frames : int, str, ArrayLike or None
        Frames of the trajectory/ensemble to save to file
    """
    # Change chain identifier if longer than 1
    available_segids = iter("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    for seg in protein.segments:
        if len(seg.segid) > 1:
            seg.segid = next(available_segids)

    traj = protein.universe.trajectory
    pdb_file.write(f"HEADER {name}\n")

    # Single-frame trajectories are written without MODEL/ENDMDL wrappers.
    frames = None if frames == "all" and len(traj) == 1 else frames
    if frames == "all":
        for mdl, ts in enumerate(traj):
            write_frame(pdb_file, protein.atoms, mdl)
    elif hasattr(frames, "__len__"):
        # A collection of frame indices: seek each frame then write it.
        for mdl, frame in enumerate(frames):
            traj[frame]
            write_frame(pdb_file, protein.atoms, mdl)
    elif frames is not None:
        # Single frame index.
        traj[frames]
        write_frame(pdb_file, protein.atoms)
    else:
        # frames=None: write only the currently active frame.
        write_frame(pdb_file, protein.atoms)

    if conect:
        bonds = (
            protein.bonds.indices
            if hasattr(protein, "bonds")
            else protein.topology.bonds
            if hasattr(protein.topology, "bonds")
            else None
        )
        if bonds is not None:
            # CONECT records are 1-indexed.
            bonds_1 = bonds + 1
            write_bonds(pdb_file, bonds_1)


def write_frame(pdb_file: TextIO, atoms, frame=None, coords=None):
    """Helper function to write multiple conformational states of a protein to
    a PDB file.

    Parameters
    ----------
    pdb_file : TextIO
        IO object to write the frame to.
    atoms : chilife.MolSys.AtomSelection | MDAnalysis.AtomGroup
        Set of atoms to write to the pdb file.
    frame : int | None
        The frame number to write. Should be 0 indexed.
    coords : ArrayLike | None
        Coords to write to the pdb file. If None are provided the coords from
        `atoms` are used.
    """
    if frame is not None:
        pdb_file.write(f"MODEL {frame + 1}\n")

    if coords is None:
        coords = atoms.positions

    # PDB records require occupancy and b-factor columns; create default
    # (all-ones) topology attributes when the universe lacks them.
    if not hasattr(atoms.universe, "occupancies"):
        atoms.universe.add_TopologyAttr(
            "occupancies", np.ones(len(atoms.universe.atoms))
        )
    if not hasattr(atoms.universe, "bfactors"):
        atoms.universe.add_TopologyAttr("bfactors", np.ones(len(atoms.universe.atoms)))

    [
        pdb_file.write(
            fmt_str.format(
                i + 1,
                atom.name,
                atom.resname[:3],
                atom.segid,
                atom.resnum,
                *coord,
                atom.occupancy,
                atom.bfactor,
                atom.type,
            )
        )
        for i, (atom, coord) in enumerate(zip(atoms, coords))
    ]

    pdb_file.write("TER\n")

    if frame is not None:
        pdb_file.write("ENDMDL\n")


def write_ic(
    pdb_file: TextIO,
    ic: MolSysIC,
    name: str = None,
    conect: bool = None,
    frames: Union[int, str, ArrayLike, None] = "all",
) -> None:
    """Write a :class:`~MolSysIC` internal coordinates object to a pdb file.

    Parameters
    ----------
    pdb_file : TextIO
        open file or io object.
    ic : MolSysIC
        chiLife internal coordinates object.
    name : str
        Name of the molecule to be used for the HEADER.
    conect : bool, optional
        Write PDB CONECT information to file.
    frames : int, str, ArrayLike or None
        Frames of the trajectory/ensemble to save to file
    """
    # Single-frame trajectories are written without MODEL/ENDMDL wrappers.
    frames = None if frames == "all" and len(ic.trajectory) == 1 else frames
    pdb_file.write(f"HEADER {name}\n")
    traj = ic.trajectory
    if frames == "all":
        for mdl, ts in enumerate(traj):
            write_frame(pdb_file, ic.atoms, mdl, ic.coords)
    elif hasattr(frames, "__len__"):
        for mdl, frame in enumerate(frames):
            traj[frame]
            write_frame(pdb_file, ic.atoms, mdl, ic.coords)
    elif frames is not None:
        traj[frames]
        write_frame(pdb_file, ic.atoms, coords=ic.coords)
    else:
        write_frame(pdb_file, ic.atoms, coords=ic.coords)

    if conect:
        bonds = ic.topology.bonds
        write_bonds(pdb_file, bonds)


def write_labels(
    pdb_file: TextIO,
    *args: sl.SpinLabel,
    KDE: bool = True,
    sorted: bool = True,
    write_spin_centers: bool = True,
    conect: bool = False,
) -> None:
    """Lower level helper function for saving SpinLabels and RotamerEnsembles.
    Loops over SpinLabel objects and appends atoms and electron coordinates to
    the provided file.

    Parameters
    ----------
    pdb_file : str
        File name to write to.
    *args : SpinLabel, RotamerEnsemble
        The RotamerEnsemble or SpinLabel objects to be saved.
    KDE : bool
        Perform kernel density estimate smoothing on rotamer weights before
        saving. Usefull for uniformly weighted RotamerEnsembles or
        RotamerEnsembles with lots of rotamers
    sorted : bool
        Sort rotamers by weight before saving.
    write_spin_centers : bool
        Write spin centers (atoms named NEN) as a seperate object with weights
        mapped to q-factor.
    conect : bool
        Write PDB CONECT records for the label bonds.

    Returns
    -------
    None
    """
    # Write spin label models
    for k, label in enumerate(args):
        pdb_file.write(f"HEADER {label.name}\n")

        # Save models in order of weight
        sorted_index = (
            np.argsort(label.weights)[::-1] if sorted else np.arange(len(label.weights))
        )
        norm_weights = label.weights / label.weights.max()

        if isinstance(label, dre.dRotamerEnsemble):
            # Bifunctional labels span two sites; assign each half its own
            # residue number.
            sites = np.concatenate(
                [
                    np.ones(len(label.RE1.atoms), dtype=int) * int(label.site1),
                    np.ones(len(label.RE2.atoms), dtype=int) * int(label.site2),
                ]
            )
        else:
            sites = [atom.resi for atom in label.atoms]

        for mdl, (conformer, weight) in enumerate(
            zip(label.coords[sorted_index], norm_weights[sorted_index])
        ):
            pdb_file.write(f"MODEL {mdl}\n")
            # Normalized rotamer weight is stored in the occupancy column.
            [
                pdb_file.write(
                    fmt_str.format(
                        i,
                        label.atom_names[i],
                        label.res[:3],
                        label.chain,
                        sites[i],
                        *conformer[i],
                        weight,
                        1.00,
                        label.atom_types[i],
                    )
                )
                for i in range(len(label.atom_names))
            ]
            pdb_file.write("TER\n")
            pdb_file.write("ENDMDL\n")

        if conect:
            write_bonds(pdb_file, label.bonds)

    # Write spin density at electron coordinates
    for k, label in enumerate(args):
        if not hasattr(label, "spin_centers"):
            continue
        if not np.any(label.spin_centers):
            continue

        spin_centers = np.atleast_2d(label.spin_centers)

        if KDE and len(spin_centers) > 5:
            try:
                # Perform gaussian KDE to determine electron density
                gkde = gaussian_kde(spin_centers.T, weights=label.weights)

                # Map KDE density to pseudoatoms
                vals = gkde.pdf(spin_centers.T)
            # NOTE(review): bare except — KDE can fail (e.g. singular
            # covariance); consider narrowing to np.linalg.LinAlgError.
            except:
                vals = label.weights
        else:
            vals = label.weights

        if write_spin_centers:
            pdb_file.write(f"HEADER {label.name}_density\n")
            norm_weights = vals / vals.max()
            [
                pdb_file.write(
                    fmt_str.format(
                        i,
                        "NEN",
                        label.res[:3],
                        label.chain,
                        int(label.site),
                        *spin_centers[i],
                        norm_weights[i],
                        1.00,
                        "N",
                    )
                )
                for i in range(len(norm_weights))
            ]
            pdb_file.write("TER\n")
def write_atoms(
    file: TextIO,
    atoms: Union[MDAnalysis.Universe, MDAnalysis.AtomGroup, MolecularSystemBase],
    coords: ArrayLike = None,
) -> None:
    """Write a set of atoms to a file in PDB format without any prefix or
    suffix, i.e. Not wrapped in `MODEL` or `ENDMDL`

    Parameters
    ----------
    file : TextIO
        A writable file (TextIO) object. The file that the atoms will be
        written to.
    atoms : MDAnalysis.Universe, MDAnalysis.AtomGroup, MolecularSystemBase
        Object containing atoms to be written. May also work with duck-typed
        set of atoms that are iterable and each atom has properties ``index``,
        ``name``, etc.
    coords : ArrayLike
        Array of atom coordinates corresponding to atoms
    """
    if isinstance(atoms, MDAnalysis.Universe):
        atoms = atoms.atoms

    if coords is None:
        coords = atoms.positions

    # All atoms are written to chain 'A' with 1-indexed serial numbers.
    # NOTE(review): the literal spacing below may have been mangled by
    # documentation scraping — confirm the column widths against `fmt_str`
    # (which uses "ATOM  " with two spaces) and the PDB specification.
    for atom, coord in zip(atoms, coords):
        file.write(
            f"ATOM {atom.index + 1:5d} {atom.name:^4s} {atom.resname:3s} {'A':1s}{atom.resnum:4d} "
            f"{coord[0]:8.3f}{coord[1]:8.3f}{coord[2]:8.3f}{atom.occupancy:6.2f}{atom.bfactor:6.2f} {atom.type:>2s} \n"
        )
def write_bonds(pdb_file, bonds):
    """
    Write PDB CONECT records to ``pdb_file`` for the provided ``bonds``.

    Parameters
    ----------
    pdb_file : TextIO
        File to write bonds to.
    bonds : ArrayLike
        Array of the indices of atom pairs that make the bonds. Pairs are
        expected to be grouped by their first (parent) atom index.

    Returns
    -------
    None
    """
    # Nothing to write for an empty bond list (previously raised IndexError
    # on ``bonds[0][0]``).
    if len(bonds) == 0:
        return

    active_atom = bonds[0][0]
    line = "CONECT" + f"{active_atom:>5d}"
    lines = []
    for b in bonds:
        # Start a new record when the parent atom changes or the current
        # record is full: "CONECT" (6 chars) + parent (5) + 4 partners (20)
        # is the maximum, so a line longer than 30 chars holds 4 partners.
        if b[0] != active_atom or len(line) > 30:
            lines.append(line + "\n")
            active_atom = b[0]
            line = "CONECT" + f"{active_atom:>5d}"

        line += f"{b[1]:>5d}"

    lines.append(line + "\n")
    pdb_file.writelines(lines)


# Tag describing the broad category of each supported molecule container type.
molecule_class = {
    re.RotamerEnsemble: "rotens",
    dre.dRotamerEnsemble: "rotens",
    IntrinsicLabel: "rotens",
    le.LigandEnsemble: "rotens",
    mda.Universe: "molcart",
    mda.AtomGroup: "molcart",
    MolecularSystemBase: "molcart",
    MolSysIC: "molic",
}

# Field lists defining each rotamer-library file format version.
rotlib_formats = {
    1.0: (
        "rotlib",
        # "resname",
        "coords",
        "internal_coords",
        "weights",
        "atom_types",
        "atom_names",
        "dihedrals",
        "dihedral_atoms",
        "type",
        "format_version",
    )
}
rotlib_formats[1.1] = *rotlib_formats[1.0], "description", "comment", "reference"
# NOTE(review): 1.2 reuses the 1.0 field set, not 1.1 (so 1.2+ omit
# description/comment/reference) — confirm this is intentional.
rotlib_formats[1.2] = rotlib_formats[1.0]
rotlib_formats[1.3] = rotlib_formats[1.2]
rotlib_formats[1.4] = *rotlib_formats[1.3], "backbone_atoms", "aln_atoms"
rotlib_formats[1.5] = rotlib_formats[1.4]
def read_cif(file_name):
    """
    Read a CIF file into a dictionary.

    Parameters
    ----------
    file_name : str | Path
        The cif file to read.

    Returns
    -------
    cif_data : Dict
        All data contained in the cif file, keyed by data-block name and
        parsed with :func:`parse_cif_data_block`.
    """
    data_blocks = {}
    # Explicit sentinel instead of the fragile `"current_block" in locals()`
    # scope probe used previously.
    current_block = None
    with open(file_name, "r") as f:
        for line in f:
            if line.startswith("data"):
                # e.g. "data_ALA" -> block key "ALA"
                key = line.split("_")[1].strip()
                current_block = data_blocks.setdefault(key, [])
                current_block.append(line)
            elif current_block is not None:
                current_block.append(line)
            # Lines before the first "data_" record are ignored.

    return {key: parse_cif_data_block(value) for key, value in data_blocks.items()}
def parse_cif_data_block(data_block: list[str]) -> dict:
    """
    Helper function to parse cif data from a cif file.

    Parameters
    ----------
    data_block : list[str]
        List of lines from a cif file corresponding to a singular data block
        as defined by cif specifications.

    Returns
    -------
    parsed_block : dict
        The provided data block parsed into a dictionary mapping topic keys
        (the part before the ``.`` with the leading underscore removed) to
        dicts of subject keys and values. ``loop_`` tables become dicts of
        column tuples.
    """
    parsed_block = {}
    data_block_iterator = iter(data_block)
    in_loop = False
    for line in data_block_iterator:
        if line.startswith("loop_"):
            in_loop = True
            subject_keys = []
            subject_values = []
        elif line.startswith("#") and in_loop:
            # End of a loop table: transpose accumulated rows into columns.
            for k, v in zip(subject_keys, zip(*subject_values)):
                topic_dict[k] = v
            in_loop = False
        elif in_loop:
            if line.startswith("_"):
                topic_key, subject_key = line.strip().split(".")
                topic_dict = parsed_block.setdefault(topic_key[1:], {})
                subject_keys.append(subject_key)
            else:
                subject_values.append(split_with_quotes(line))
        elif line.startswith("_"):
            kv = split_with_quotes(line)
            if len(kv) == 2:
                key, value = kv
            else:
                # Value lives on the next line.
                key = kv.pop(0)
                value = next(data_block_iterator).strip()
                if value[0] == "'":
                    # BUGFIX: str.strip returns a new string; the previous
                    # bare `value.strip("'")` discarded the result, leaving
                    # the quotes in place.
                    value = value.strip("'")
                # Check for a multiline (semicolon-delimited) entry.
                if value[0] == ";":
                    value = value[1:]
                    while True:
                        line = next(data_block_iterator)
                        if line.startswith(";"):
                            break
                        value += line.strip()

            topic_key, subject_key = key.split(".")
            topic_dict = parsed_block.setdefault(topic_key[1:], {})
            topic_dict[subject_key] = value

    return parsed_block


def split_with_quotes(string):
    """
    Helper function to split a string by its whitespace into a list while
    respecting quotes.

    Parameters
    ----------
    string : str
        The string to split.

    Returns
    -------
    vals : list[str]
        List of substrings obtained from splitting the parent string; quoted
        multi-word fields become single entries with the quotes removed.
    """
    vals = string.split()
    myit = enumerate(vals)
    replace = []
    for i, val in myit:
        if val[0] == '"' or val[0] == "'":
            sym = val[0]
            tvals = [val]
            # len > 1 guards the single-character token that opens a quote.
            ends_with_quote = val[-1] == sym and len(val) > 1
            j = i + 1
            while not ends_with_quote:
                _j, val = next(myit)
                j = _j + 1
                tvals.append(val)
                ends_with_quote = val[-1] == sym

            val = " ".join(tvals)
            val = val.strip(sym)
            replace.append((i, j, val))

    if replace:
        # Splice merged fields back in, right to left so indices stay valid.
        for i, j, val in reversed(replace):
            vals[i:j] = [val]

    return vals
def create_ccd_dicts(cif_data):
    """
    Build per-residue chemical component dictionary (CCD) entries from the
    CCD tables of a parsed cif file.

    Parameters
    ----------
    cif_data : dict
        cif file data as parsed from :func:`~read_cif`.

    Returns
    -------
    ccd_data : dict
        The dictionary containing the CCD data from a cif file in a
        chilife-readable format, keyed by residue (comp) id.
    """
    atom_fields = (
        "comp_id",
        "atom_id",
        "type_symbol",
        "pdbx_aromatic_flag",
        "pdbx_stereo_config",
    )
    bond_fields = (
        "comp_id",
        "atom_id_1",
        "atom_id_2",
        "value_order",
        "pdbx_aromatic_flag",
        "pdbx_stereo_config",
    )

    ccd_data = {}

    def _append_rows(table_name, fields, table):
        # Distribute each row of the flat table into the per-residue dicts,
        # keyed by the first field (the comp/residue id).
        for row in zip(*(table[f] for f in fields)):
            entry = ccd_data.setdefault(row[0], {})
            dest = entry.setdefault(table_name, {f: [] for f in fields})
            for field, value in zip(fields, row):
                dest[field].append(value)

    atom_table = cif_data.get("chem_comp_atom")
    if atom_table is not None:
        _append_rows("chem_comp_atom", atom_fields, atom_table)

    bond_table = cif_data.get("chem_comp_bond")
    if bond_table is not None:
        _append_rows("chem_comp_bond", bond_fields, bond_table)

    summary_fields = (
        "id",
        "type",
        "mon_nstd_flag",
        "name",
        "pdbx_synonyms",
        "formula",
        "formula_weight",
    )
    summary = cif_data.get("chem_comp")
    if summary is not None and all(f in summary for f in summary_fields):
        # One scalar summary entry per residue, renamed to chilife keys.
        for res, link_type, nstd_flag, name, synonyms, formula, weight in zip(
            *(summary[f] for f in summary_fields)
        ):
            ccd_data.setdefault(res, {})["chem_comp"] = {
                "link type": link_type,
                "mon nsdt flag": nstd_flag,
                "name": name,
                "pdbx synonyms": synonyms,
                "formula": formula,
                "weight": weight,
            }

    return ccd_data
def write_cif(file_name, cif_data):
    """
    Function to write a CIF file from cif_data as read by :func:`~read_cif`

    Parameters
    ----------
    file_name : str | Path
        File to write the cif_data to.
    cif_data : Dict
        cif data to write to the file.

    Returns
    -------
    None
    """
    lines = []
    for struct in cif_data:
        lines.append(f"data_{struct}\n")
        lines.append("# \n")
        for key1, value1 in cif_data[struct].items():
            # Pad all "_topic.subject" keys of this table to a common width.
            max_subkey_len = max(len(k) for k in value1.keys())
            key_length = len(key1) + 1 + max_subkey_len + 3

            loop_keys = []
            loop_values = []
            max_val_lens = []
            for key2, value2 in value1.items():
                line = f"_{key1}.{key2}".ljust(key_length)
                if isinstance(value2, str):
                    if " " in value2 and value2[0] != "'":
                        value2 = "'" + value2 + "'"

                    if len(value2) < 80:
                        line += f" {value2} \n"
                        lines.append(line)
                    elif " " in value2:
                        # Long quoted value goes on its own line.
                        lines.append(line + "\n")
                        lines.append(value2 + "\n")
                    else:
                        # Long unquoted value becomes a ;-delimited multiline.
                        lines.append(line + "\n")
                        prelude = ";"
                        for string in textwrap.wrap(value2, 80):
                            lines.append(prelude + string + "\n")
                            prelude = ""
                        lines.append("; \n")

                elif isinstance(value2, Sequence):
                    # Sequence values are emitted as columns of a loop_ table.
                    loop_keys.append(line + "\n")
                    value2 = [v if "'" not in v else '"' + v + '"' for v in value2]
                    # Quote values containing spaces, leading underscores, or
                    # nothing at all. BUGFIX: an empty string previously raised
                    # IndexError on v[0] and dropped into a breakpoint().
                    value2 = [
                        v if v and (" " not in v) and (v[0] != "_") else "'" + v + "'"
                        for v in value2
                    ]
                    loop_values.append(value2)
                    # BUGFIX: default=0 keeps max_val_lens aligned with
                    # loop_values for empty columns (previously the ValueError
                    # was printed and the entry skipped, desyncing the zip
                    # below).
                    max_val_lens.append(max((len(x) for x in value2), default=0))

            if loop_values:
                lines.append("loop_\n")
                for key in loop_keys:
                    lines.append(key)
                for row in zip(*loop_values):
                    line = " ".join(x.ljust(n) for x, n in zip(row, max_val_lens))
                    lines.append(line + "\n")

            lines.append("# \n")

    with open(file_name, "w") as f:
        f.writelines(lines)
def join_ccd_info(ccd_list, ccd_data=None):
    """
    Join CCD info from a list of CCD entries into a dictionary that can be
    used to write ccd info to a cif file.

    Parameters
    ----------
    ccd_list : List[str]
        List of CCD codes to extract from the ccd_data dictionary to be
        joined in a single ccd_info dict.
    ccd_data : Dict
        Superset dictionary of CCD entries that contain all the CCD info for
        the CCD codes specified in the ``ccd_list``. If ``ccd_data`` is not
        provided the default chilife ``bio_ccd`` will be used, which contains
        all biomolecular CCD entries from the PDB (as of mid 2025). If a CCD
        code in ``ccd_list`` is not in ``ccd_data`` it will be skipped.

    Returns
    -------
    return_ccd : Dict
        A dictionary containing all ccd data for the set of codes provided
        and ready to be written to a cif file.
    """
    if ccd_data is None:
        # Use the bundled biomolecular CCD by default.
        ccd_data = chilife.bio_ccd
    else:
        # Layer the user-provided entries on top of the bundled CCD so
        # missing codes can still be resolved.
        merged = dict(chilife.bio_ccd)
        merged.update(ccd_data)
        ccd_data = merged

    selected = [ccd_data[code] for code in ccd_list if code in ccd_data]

    joined = {
        "chem_comp": {},
        "chem_comp_atom": {key: [] for key in CHEM_COMP_ATOM_KEYS},
        "chem_comp_bond": {key: [] for key in CHEM_COMP_BOND_KEYS},
    }

    # Gather the per-residue chem_comp summary columns, skipping entries that
    # carry no chem_comp record.
    for field in ("id", "type", "name", "pdbx_synonyms", "formula", "formula_weight"):
        joined["chem_comp"][field] = tuple(
            ccd["chem_comp"][field] for ccd in selected if "chem_comp" in ccd
        )

    for ccd in selected:
        # Atom information is present for every entry.
        for key in CHEM_COMP_ATOM_KEYS:
            joined["chem_comp_atom"][key].extend(ccd["chem_comp_atom"][key])

        if "chem_comp_bond" not in ccd:
            continue

        # Normalize scalar bond fields to 1-tuples so extend() behaves
        # uniformly for single-bond entries.
        ccd["chem_comp_bond"] = {
            k: v if isinstance(v, (list, tuple)) else (v,)
            for k, v in ccd["chem_comp_bond"].items()
        }
        for key in CHEM_COMP_BOND_KEYS:
            joined["chem_comp_bond"][key].extend(ccd["chem_comp_bond"][key])

    # Freeze the accumulated columns.
    joined["chem_comp_atom"] = {k: tuple(v) for k, v in joined["chem_comp_atom"].items()}
    joined["chem_comp_bond"] = {k: tuple(v) for k, v in joined["chem_comp_bond"].items()}

    # Append 1-based, stringified row numbers as the pdbx_ordinal columns.
    joined["chem_comp_atom"]["pdbx_ordinal"] = tuple(
        str(i) for i in range(1, len(joined["chem_comp_atom"]["comp_id"]) + 1)
    )
    joined["chem_comp_bond"]["pdbx_ordinal"] = tuple(
        str(i) for i in range(1, len(joined["chem_comp_bond"]["comp_id"]) + 1)
    )

    return joined