from __future__ import annotations
from typing import Tuple, Dict, Union, BinaryIO, TextIO
import textwrap
import warnings
import os
import urllib
import MDAnalysis
from numpy.typing import ArrayLike
from collections import defaultdict
from collections.abc import Sequence
from hashlib import sha256
from pathlib import Path
import pickle
import shutil
from io import StringIO
import zipfile
import numpy as np
from scipy.stats import gaussian_kde
from memoization import cached, suppress_warnings
import MDAnalysis as mda
import chilife.RotamerEnsemble as re
import chilife.SpinLabel as sl
import chilife.dRotamerEnsemble as dre
import chilife.LigandEnsemble as le
import chilife
from .globals import (
dihedral_defs,
rotlib_indexes,
RL_DIR,
SUPPORTED_BB_LABELS,
USER_RL_DIR,
rotlib_defaults,
alt_prot_states,
)
from .alignment_methods import local_mx
from .IntrinsicLabel import IntrinsicLabel
from .MolSys import MolecularSystemBase, MolSys
from .Topology import BondType
from .MolSysIC import MolSysIC
from .pdb_utils import parse_connect, sort_pdb
# Template for PDB ATOM records using wwPDB fixed columns. The column layout must
# agree with the slices used by `read_pdb` in this module (name 13-16, resName
# 18-20, chain 22, resSeq 23-26, x/y/z 31-54, occ 55-60, b 61-66, element 77-78):
#             ID    name  res  chain+resnum    X       Y       Z       q     b            elem
fmt_str = "ATOM  {:5d} {:^4s} {:3s} {:1s}{:4d}    {:8.3f}{:8.3f}{:8.3f}{:6.2f}{:6.2f}          {:>2s}  \n"
# Column names read from the _chem_comp_atom loop of CIF chemical-component files.
CHEM_COMP_ATOM_KEYS = (
    "comp_id",
    "atom_id",
    "type_symbol",
    "pdbx_aromatic_flag",
    "pdbx_stereo_config",
)

# Column names read from the _chem_comp_bond loop of CIF chemical-component files.
CHEM_COMP_BOND_KEYS = (
    "comp_id",
    "atom_id_1",
    "atom_id_2",
    "value_order",
    "pdbx_aromatic_flag",
    "pdbx_stereo_config",
)

# Map CIF bond-order strings to chiLife BondType values.
CIF_BOND_TO_XL = {
    "arom": BondType.AROMATIC,
    "delo": BondType.DELOCALIZED,
    "doub": BondType.DOUBLE,
    "pi": BondType.PI,
    "poly": BondType.POLYMERIC,
    "quad": BondType.QUADRUPLE,
    "sing": BondType.SINGLE,
    "trip": BondType.TRIPLE,
    "metalc": BondType.DATIVE,
    "disulf": BondType.UNSPECIFIED,
}

# Inverse mapping for writing CIF files.
XL_BOND_TO_CIF = {v: k for k, v in CIF_BOND_TO_XL.items()}

# Map SDF (V2000) integer bond orders to chiLife BondType values.
SDF_BOND_TO_XL = {
    1: BondType.SINGLE,
    2: BondType.DOUBLE,
    3: BondType.TRIPLE,
    4: BondType.AROMATIC,
    5: BondType.UNSPECIFIED,
    6: BondType.SINGLE,
    7: BondType.DOUBLE,
    8: BondType.UNSPECIFIED,
}

# Inverse mapping for writing SDF files; query-only bond codes (5-7) are excluded
# so each BondType maps to a single canonical SDF code.
XL_BOND_TO_SDF = {v: k for k, v in SDF_BOND_TO_XL.items() if k not in (5, 6, 7)}

# Map SDF atom-block charge codes to integer formal charges.
SDF_ATOM_CHARGE_TO_INT = {
    0: 0,
    1: 3,
    2: 2,
    3: 1,
    4: 0,
    5: -1,
    6: -2,
    7: -3,
}
def read_distance_distribution(file_name: str) -> Tuple[np.ndarray, np.ndarray]:
    """Read a DEER distance distribution file in the DeerAnalysis (or similar) format.

    Parameters
    ----------
    file_name : str
        File name of the distance distribution file.

    Returns
    -------
    r : np.ndarray
        Domain of the distance distribution, in angstroms.
    p : np.ndarray
        Probability density over r.
    """
    distribution = np.loadtxt(file_name)
    domain = distribution[:, 0]
    density = distribution[:, 1]

    # Heuristic unit conversion: a maximum distance below 15 is assumed to be
    # nanometers and is converted to angstroms.
    if max(domain) < 15:
        domain *= 10

    return domain, density
def hash_file(file: Union[str, Path, BinaryIO]):
    """
    Helper function for hashing rotamer library files with SHA-256.

    Parameters
    ----------
    file : str, Path, BinaryIO
        The name, Path or binary file-like object of the file to hash. The
        original implementation only accepted paths even though the docstring
        (and `read_rotlib`, which uses this as its cache key maker) allows
        BinaryIO objects.

    Returns
    -------
    hashstring : str
        Hex digest of the file contents.
    """
    hasher = sha256()

    def _consume(stream):
        # Read in block_size chunks so large files never load into memory at once.
        while block := stream.read(hasher.block_size):
            hasher.update(block)

    if hasattr(file, "read"):
        # File-like input: hash from the current position, then restore it so
        # the caller can still read the stream afterwards (e.g. np.load).
        pos = file.tell()
        _consume(file)
        file.seek(pos)
    else:
        with open(file, "rb") as f:
            _consume(f)

    return hasher.hexdigest()
def get_possible_rotlibs(
    rotlib: str,
    suffix: str,
    extension: str,
    return_all: bool = False,
    was_none: bool = False,
) -> Union[Path, None]:
    """
    Search all known rotlib directories and the current working directory for rotlib(s) that match the provided
    information.

    Parameters
    ----------
    rotlib: str
        Fullname, base name or partial name of the rotamer libraries to search for.
    suffix: str
        possible suffixes the rotamer library may have, e.g. ip2 to indicate an i+2 rotamer library.
    extension: str
        filetype extension to look for. This will be either `npz` for monofunctional rotlibs or zip for bifunctional
        rotlibs.
    return_all: bool
        By default, only the first found rotlib will be returned unless ``return_all = True``, in which case a list
        of all matching rotlibs is returned.
    was_none: bool
        For internal use only. Indicates that the caller's rotlib argument was originally ``None``.

    Returns
    -------
    rotlib: Path, List[Path], None
        The path to the found rotlib or a list of paths to the rotlibs that match the search criteria, or ``None`` if
        no rotlibs are found.
    """
    cwd = Path.cwd()
    sufplusex = "_" + suffix + extension

    # Assemble a list of possible rotlib paths starting in the current directory
    possible_rotlibs = [
        Path(rotlib),
        cwd / rotlib,
        cwd / (rotlib + extension),
        cwd / (rotlib + sufplusex),
    ]
    possible_rotlibs += list(cwd.glob(f"{rotlib}*{sufplusex}"))
    # Then in the user defined rotamer library directory
    for pth in USER_RL_DIR:
        possible_rotlibs += list(pth.glob(f"{rotlib}*{sufplusex}"))
    # Finally the rotlibs shipped with chiLife, unless the caller passed no name at all.
    if not was_none:
        possible_rotlibs += list((RL_DIR / "user_rotlibs").glob(f"*{rotlib}*"))

    # NOTE: `rotlib` is rebound below — it may end up a list (return_all),
    # a single Path, or None.
    if return_all:
        rotlib = []
    for possible_file in possible_rotlibs:
        if possible_file.exists() and return_all:
            rotlib.append(possible_file)
        elif possible_file.exists() and not possible_file.is_dir():
            # First hit wins when return_all is False.
            rotlib = possible_file
            break
    else:
        # Loop finished without break: either return_all was requested or no
        # single file was found; fall back to the packaged default library.
        if isinstance(rotlib, str) and was_none and rotlib in rotlib_defaults:
            rotlib = RL_DIR / "user_rotlibs" / (rotlib_defaults[rotlib][0] + sufplusex)
        elif not isinstance(rotlib, list) or rotlib == []:
            rotlib = None

    # rotlib lists need to be sorted to prevent position mismatches for results with tests.
    if isinstance(rotlib, list):
        rotlib = list(set(rotlib))
        rotlib = [rot for rot in rotlib if str(rot).endswith(extension)]
        rotlib = sorted(rotlib)
    else:
        # None survives here: str(None) does not end with the extension.
        rotlib = rotlib if str(rotlib).endswith(extension) else None

    return rotlib
# Silence warnings emitted by the third-party `memoization` package, which
# provides the @cached decorator used below.
suppress_warnings()
@cached(custom_key_maker=hash_file)
def read_rotlib(rotlib: Union[Path, BinaryIO] = None) -> Dict:
    """Read a stored rotamer library (.npz) into a dictionary of attributes.

    Results are cached keyed on the file's hash (see ``hash_file``), so repeated
    reads of the same library are served from memory.

    Parameters
    ----------
    rotlib : Path, BinaryIO
        Path object pointing to the rotamer library, or a BytesIO object containing the rotamer library file.

    Returns
    -------
    lib : dict
        Dictionary of SpinLabel rotamer ensemble attributes including coords, weights, dihedrals etc.
    """
    with np.load(rotlib, allow_pickle=True) as files:
        # Libraries with format_version <= 1.1 use an incompatible layout.
        if files["format_version"] <= 1.1:
            raise RuntimeError(
                "The rotlib that was provided is an old version that is not compatible with your "
                "version of chiLife. You can either remake the rotlib, or use the update_rotlib.py "
                "script provided in the chilife scripts directory to update this rotamer library to the "
                "new format."
            )
        # Materialize all arrays while the archive is still open.
        lib = {key: value for key, value in files.items()}

    # Drop bookkeeping entries and guarantee a sigmas array exists.
    lib.pop("allow_pickle", None)
    lib.setdefault("sigmas", np.array([]))

    # The internal-coordinates object is stored as a 0-d object array; unwrap it.
    lib["internal_coords"] = lib["internal_coords"].item()

    # Pre-compute radian copies of the angle arrays.
    for angle_key in ("dihedrals", "sigmas"):
        lib[f"_r{angle_key}"] = np.deg2rad(lib[angle_key])

    # Scalars come out of the npz as 0-d arrays; coerce to plain Python types.
    lib["rotlib"] = str(lib["rotlib"])
    lib["type"] = str(lib["type"])
    lib["format_version"] = float(lib["format_version"])
    return lib
@cached(custom_key_maker=hash_file)
def read_drotlib(rotlib: Path) -> Tuple[dict]:
    """Read a stored bifunctional rotamer library (.zip) for spin labels.

    Parameters
    ----------
    rotlib : Path
        The Path to the rotamer library file.

    Returns
    -------
    lib: Tuple[dict]
        Dictionaries of rotamer library attributes including sub_residues: the
        "A" sub-residue library, the "B" sub-residue library, and the csts array.
    """
    # A bifunctional rotlib zip holds two monofunctional rotlib files (one per
    # sub-residue, "A" and "B") plus a constraints ("csts") array.
    with zipfile.ZipFile(rotlib, "r") as archive:
        for f in archive.namelist():
            if "csts" in f:
                with archive.open(f) as of:
                    csts = np.load(of)
            elif f[-12] == "A":
                # assumes the sub-residue letter sits 12 characters from the end
                # of the member name (e.g. "...A_rotlib.npz") — TODO confirm
                with archive.open(f) as of:
                    # __wrapped__ bypasses the hash-keyed cache for file objects.
                    libA = read_rotlib.__wrapped__(of)
            elif f[-12] == "B":
                with archive.open(f) as of:
                    libB = read_rotlib.__wrapped__(of)

    # NOTE(review): raises NameError if the archive is missing any of the three
    # expected members — verify archives are always written complete.
    return libA, libB, csts
@cached
def read_bbdep(res: str, Phi: int, Psi: int) -> Dict:
    """Read the Dunbrack rotamer library for the provided residue and backbone conformation.

    Parameters
    ----------
    res : str
        3-letter residue code
    Phi : int
        Backbone Phi dihedral angle for the provided residue
    Psi : int
        Backbone Psi dihedral angle for the provided residue

    Returns
    -------
    lib: dict
        Dictionary of arrays containing rotamer library information in cartesian and dihedral space
    """
    # Map alternate protonation states to their canonical residue for indexing.
    ic_res = alt_prot_states.get(res, res)
    lib = {}
    Phi, Psi = str(Phi), str(Psi)

    # Read residue internal coordinate structure
    with open(RL_DIR / f"residue_internal_coords/{res.lower()}_ic.pkl", "rb") as f:
        ICs = pickle.load(f)

    atom_types = ICs.atom_types.copy()
    atom_names = ICs.atom_names.copy()

    maxchi = 5 if res in SUPPORTED_BB_LABELS else 4
    nchi = np.minimum(len(dihedral_defs[res]), maxchi)

    # ALA and GLY have no sidechain dihedrals and get a single unit-weight rotamer.
    if res not in ("ALA", "GLY"):
        library = "R1C.lib" if res in SUPPORTED_BB_LABELS else "ALL.bbdep.rotamers.lib"
        # rotlib_indexes stores (byte offset, length) of each Phi/Psi bin in the
        # library file, so only the relevant slice is read.
        start, length = rotlib_indexes[f"{ic_res} {Phi:>4}{Psi:>5}"]

        with open(RL_DIR / library, "rb") as f:
            f.seek(start)
            rotlib_string = f.read(length).decode()

        s = StringIO(rotlib_string)
        s.seek(0)
        data = np.genfromtxt(s, usecols=range(maxchi + 4, maxchi + 5 + 2 * maxchi))

        lib["weights"] = data[:, 0]
        lib["dihedrals"] = data[:, 1 : nchi + 1]
        lib["sigmas"] = data[:, maxchi + 1 : maxchi + nchi + 1]
        dihedral_atoms = dihedral_defs[res][:nchi]

        # Calculate cartesian coordinates for each rotamer
        z_matrix = ICs.batch_set_dihedrals(
            np.zeros(len(lib["dihedrals"]), dtype=int),
            np.deg2rad(lib["dihedrals"]),
            1,
            dihedral_atoms,
        )
        ICs._chain_operators = ICs._chain_operators[0]
        ICs.load_new(np.array(z_matrix))
        internal_coords = ICs.copy()
        coords = ICs.protein.trajectory.coordinate_array.copy()
    else:
        lib["weights"] = np.array([1])
        lib["dihedrals"], lib["sigmas"], dihedral_atoms = [], [], []
        coords = ICs.to_cartesian()[None, ...]
        internal_coords = ICs.copy()

    # Get origin and rotation matrix of local frame (defined by N, CA, C)
    mask = np.isin(atom_names, ["N", "CA", "C"])
    ori, mx = local_mx(*coords[0, mask])

    # Set coords in local frame and prepare output
    coords -= ori
    lib["coords"] = np.einsum("ijk,kl->ijl", coords, mx)
    lib["internal_coords"] = internal_coords
    lib["atom_types"] = np.asarray(atom_types, dtype=str)
    lib["atom_names"] = np.asarray(atom_names, dtype=str)
    lib["dihedral_atoms"] = np.asarray(dihedral_atoms, dtype=str)
    lib["_rdihedrals"] = np.deg2rad(lib["dihedrals"])
    lib["_rsigmas"] = np.deg2rad(lib["sigmas"])
    lib["rotlib"] = res
    lib["backbone_atoms"] = ["H", "N", "CA", "HA", "C", "O"]
    lib["aln_atoms"] = ["N", "CA", "C"]

    # hacky solution to experimental backbone dependent rotlibs
    if res == "R1C":
        lib["spin_atoms"] = np.array(["N1", "O1"])
        lib["spin_weights"] = np.array([0.5, 0.5])

    return lib
def read_library(rotlib: str, Phi: float = None, Psi: float = None) -> Dict:
    """Function to read rotamer libraries and bifunctional rotamer libraries as dictionaries.

    Parameters
    ----------
    rotlib : str, Path
        3 letter residue code or path the rotamer library file.
    Phi : float
        Backbone Phi dihedral angle for the provided residue. Only applicable for canonical amino acids.
    Psi : float
        Backbone Psi dihedral angle for the provided residue. Only applicable for canonical amino acids.

    Returns
    -------
    lib: dict
        Dictionary of arrays containing rotamer library information in cartesian and dihedral space.

    Raises
    ------
    ValueError
        If ``rotlib`` is a Path with an unsupported extension, or neither a Path nor a str.
    """
    # Test against None explicitly: `Phi and Psi` would treat 0.0 — a perfectly
    # valid dihedral angle — as "not provided" and silently fall back to the
    # helix default.
    if Phi is not None and Psi is not None:
        # The backbone-dependent libraries are binned in 10 degree increments.
        Phi = int((Phi // 10) * 10)
        Psi = int((Psi // 10) * 10)
    # Use helix backbone if not otherwise specified
    else:
        Phi, Psi = -60, -50

    if isinstance(rotlib, Path):
        if rotlib.suffix == ".npz":
            # Monofunctional rotamer library.
            return read_rotlib(rotlib)
        elif rotlib.suffix == ".zip":
            # Bifunctional rotamer library.
            return read_drotlib(rotlib)
        else:
            raise ValueError(f"{rotlib.name} is not a valid rotamer library file type")
    elif isinstance(rotlib, str):
        # Canonical residue: use the backbone-dependent library.
        return read_bbdep(rotlib, Phi, Psi)
    else:
        raise ValueError(f"{rotlib} is not a valid rotamer library")
def save(
    file_name: str,
    *molecules: Union[
        re.RotamerEnsemble, MolecularSystemBase, mda.Universe, mda.AtomGroup, str
    ],
    protein_path: Union[str, Path] = None,
    mode: str = "w",
    conect: bool = False,
    frames: Union[int, str, ArrayLike, None] = "all",
    **kwargs,
) -> None:
    """Save a pdb file of the provided labels and proteins.

    Parameters
    ----------
    file_name : str
        Desired file name for output file. Will be automatically made based off of protein name and labels if not
        provided.
    *molecules : RotamerEnsemble, chiLife.MolSys, mda.Universe, mda.AtomGroup, str
        Object(s) to save. Includes RotamerEnsemble, SpinLabels, dRotamerEnsembles, proteins, or path to protein pdb.
        Can add as many as desired, except for path, for which only one can be given.
    protein_path : str, Path
        The Path to a pdb file to use as the protein object.
    mode : str
        Which mode to open the file in. Accepts 'w' or 'a' to overwrite or append.
    conect : bool
        Whether to save PDB CONECT records. Defaults to False.
    frames : int, str, ArrayLike[int], None
        Exclusively used for ``MolSys`` and ``AtomGroup`` objects. ``frames`` are the indices of the frames to save.
        Can be a single index, array of indices, 'all', or ``None``. If ``None`` is passed only the
        active frame will be saved. If 'all' is passed, all frames are saved. Default is 'all'.
    **kwargs :
        Additional arguments to pass to ``write_labels``

        write_spin_centers : bool
            Write spin centers (atoms named NEN) as a separate object with weights mapped to q-factor.
    """
    # Check for filename at the beginning of args: a molecule may have been
    # passed positionally as the first argument instead of a file name.
    # NOTE(review): `molecule_class` is presumably a module-level dict mapping
    # supported types to the category names 'rotens'/'molcart'/'molic' — it is
    # defined elsewhere in this module.
    if isinstance(file_name, tuple(molecule_class.keys())):
        molecules = list(molecules)
        molecules.insert(0, file_name)
        file_name = None

    # Bucket each molecule by its chiLife category.
    tmolecules = defaultdict(list)
    for mol in molecules:
        mcls = [val for key, val in molecule_class.items() if isinstance(mol, key)]
        if mcls == []:
            raise TypeError(
                f"{type(mol)} is not a supported type for this function. "
                "chiLife can only save objects of the following types:\n"
                + "".join(f"{key.__name__}\n" for key in molecule_class)
                + "Please check that your input is compatible"
            )
        tmolecules[mcls[0]].append(mol)
    molecules = tmolecules

    # Ensure only one protein path was provided (for now)
    protein_path = [] if protein_path is None else [protein_path]
    if len(protein_path) > 1:
        raise ValueError("More than one protein path was provided. C")
    elif len(protein_path) == 1:
        protein_path = protein_path[0]
    else:
        protein_path = None

    # Create a file name from protein information
    if file_name is None:
        if protein_path is not None:
            f = Path(protein_path)
            file_name = f.name[:-4]
        else:
            file_name = ""
            for protein in molecules["molcart"]:
                if getattr(protein, "filename", None):
                    file_name += " " + Path(protein.filename).name
                    # Strip the file extension (assumed 4 characters, e.g. ".pdb").
                    file_name = file_name[:-4]
            if file_name == "" and len(molecules["molcart"]) > 0:
                file_name = "No_Name_Protein"

        # Add spin label information to file name
        if 0 < len(molecules["rotens"]) < 3:
            naml = [file_name] + [rotens.name for rotens in molecules["rotens"]]
            file_name = "_".join(naml)
            file_name = file_name.strip("_")
        elif len(molecules["rotens"]) >= 3:
            file_name += "_many_labels"

        file_name += ".pdb"
        file_name = file_name.strip()

    if protein_path is not None:
        # NOTE(review): leftover debug print — consider removing.
        print(protein_path, file_name)
        # Copy the original pdb and append the remaining molecules to it.
        shutil.copy(protein_path, file_name)
        pdb_file = open(file_name, "a+")
    else:
        pdb_file = open(file_name, mode)

    # Track header names already used so duplicates get a numeric suffix.
    used_names = {}
    for protein in molecules["molcart"]:
        if isinstance(protein, (mda.AtomGroup, mda.Universe)):
            name = (
                Path(protein.universe.filename)
                if protein.universe.filename is not None
                else Path(pdb_file.name)
            )
            name = name.name
        else:
            name = protein.fname if hasattr(protein, "fname") else None

        if name is None:
            name = Path(pdb_file.name).name
        name = name[:-4] if name.endswith(".pdb") else name
        name_ = name + str(used_names[name]) if name in used_names else name
        used_names[name] = used_names.setdefault(name, 0) + 1

        write_protein(pdb_file, protein, name_, conect=conect, frames=frames)

    for ic in molecules["molic"]:
        if isinstance(ic.protein, (mda.AtomGroup, mda.Universe)):
            name = (
                Path(ic.protein.universe.filename)
                if ic.protein.universe.filename is not None
                else Path(pdb_file.name)
            )
            name = name.name
        else:
            name = ic.protein.fname if hasattr(ic.protein, "fname") else None

        if name is None:
            name = Path(pdb_file.name).name
        name = name[:-4] if name.endswith(".pdb") else name
        name_ = name + str(used_names[name]) if name in used_names else name
        # NOTE(review): unlike the loop above, used_names is not updated here —
        # confirm whether repeated IC names should also be suffixed.
        write_ic(pdb_file, ic, name=name_, conect=conect, frames=frames)

    if len(molecules["rotens"]) > 0:
        write_labels(pdb_file, *molecules["rotens"], conect=conect, **kwargs)

    pdb_file.close()
def fetch(
    accession_number: str, save: bool = False, format: str | None = None
) -> MDAnalysis.Universe:
    """Fetch pdb file from the protein data bank or the AlphaFold Database and optionally save to disk.

    Parameters
    ----------
    accession_number : str
        4 letter structure PDB ID or alpha fold accession number. Note that AlphaFold accession numbers must begin
        with 'AF-'.
    save : bool
        If true the fetched PDB will be saved to the disk.
    format : str
        Format of the structure file ('pdb' or 'cif'). If None, the format is inferred from the accession number's
        extension and defaults to 'pdb'.

    Returns
    -------
    U : MDAnalysis.Universe
        MDAnalysis Universe object of the protein corresponding to the provided PDB ID or AlphaFold accession number

    Raises
    ------
    ValueError
        If ``format`` is not 'pdb' or 'cif'.
    """
    # Infer the file format from the accession number when not given explicitly.
    if format is None:
        format = "cif" if accession_number.lower().endswith(".cif") else "pdb"
    else:
        # Accept ".pdb" / "PDB" style inputs.
        format = format.lower().lstrip(".")

    # Fail early on unsupported formats; previously an unknown format downloaded
    # the file and then crashed with an UnboundLocalError.
    if format not in ("pdb", "cif"):
        raise ValueError(f"'{format}' is not a supported format; use 'pdb' or 'cif'.")

    # Strip any extension from the accession number and build the local file name.
    accession_number = accession_number.split(f".{format}")[0]
    pdb_name = accession_number + "." + format

    if accession_number.startswith("AF-"):
        urllib.request.urlretrieve(
            f"https://alphafold.ebi.ac.uk/files/{accession_number}-F1-model_v6.{format}",
            pdb_name,
        )
    else:
        urllib.request.urlretrieve(
            f"http://files.rcsb.org/download/{pdb_name}", pdb_name
        )

    if format == "pdb":
        # Silence MDAnalysis's benign PDB parser warnings.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            U = mda.Universe(pdb_name, in_memory=True)
    else:
        U = MolSys.from_cif(pdb_name)

    # Remove the downloaded file unless the caller asked to keep it.
    if not save:
        os.remove(pdb_name)

    return U
def load_protein(
    struct_file: Union[str, Path], *traj_file: Union[str, Path]
) -> MDAnalysis.AtomGroup:
    """Load a structure (and optional trajectory files) into an in-memory MDAnalysis Universe.

    Parameters
    ----------
    struct_file : Union[TextIO, str, Path]
        Name, Path or TextIO object referencing the structure file (e.g. pdb, gro, psf)
    *traj_file : Union[TextIO, str, Path] (optional)
        Name, Path or TextIO object(s) referencing the trajectory file (e.g. pdb, xtc, dcd)

    Returns
    -------
    protein : MDAnalysis.AtomGroup
        An MDA AtomGroup object containing the protein structure and trajectory. The object is always loaded into
        memory to allow coordinate manipulations.
    """
    # Bug fix: the original condition was ``traj_file != []``, which is always
    # True because *args is a tuple — the no-trajectory branch was unreachable.
    with warnings.catch_warnings():
        # Silence MDAnalysis's benign parser warnings.
        warnings.simplefilter("ignore")
        if traj_file:
            trajectories = [str(file) for file in traj_file]
            protein = mda.Universe(str(struct_file), *trajectories, in_memory=True)
        else:
            # Pass struct_file through unchanged so file-like objects also work.
            protein = mda.Universe(struct_file, in_memory=True)

    return protein
def read_pdb(pdb_file: Union[str, Path], sort_atoms: bool = False):
    """
    Read A PDB file into a dictionary containing all standard pdb atom information and bonds if CONECT records are
    present.

    Parameters
    ----------
    pdb_file : str | Path
        File name or ``Path`` to the PDB file to be read.
    sort_atoms : bool
        If True, the PDB lines are reordered with :func:`sort_pdb` before parsing.

    Returns
    -------
    pdb_data : dict
        Dictionary of PDB data.
    """
    # Field names matching, in order, the tuple elements parsed from each
    # ATOM/HETATM line below.
    keys = [
        "record_types",
        "atomids",
        "names",
        "altlocs",
        "resnames",
        "chains",
        "resnums",
        "icodes",
        "coords",
        "occupancies",
        "bs",
        "segs",
        "atypes",
        "charges",
    ]

    if sort_atoms:
        lines = sort_pdb(pdb_file)
    else:
        with open(pdb_file, "r") as f:
            lines = f.readlines()

    # Separate coordinate/model records from CONECT records.
    connect = []
    atom_lines = []
    for line in lines:
        if line.startswith(("MODEL", "ENDMDL", "ATOM", "HETATM")):
            atom_lines.append(line)
        elif line.startswith("CONECT"):
            connect.append(line)

    # Split multi-model (trajectory) PDBs into per-model chunks via MODEL/ENDMDL.
    start_idxs = []
    end_idxs = []
    for i, line in enumerate(atom_lines):
        if line.startswith("MODEL"):
            start_idxs.append(i + 1)
        elif line.startswith("ENDMDL"):
            end_idxs.append(i)

    if len(start_idxs) > 0:
        lines = [atom_lines[start:end] for start, end in zip(start_idxs, end_idxs)]
    else:
        lines = [atom_lines]

    # Parse the fixed-column PDB fields of the first model; atom ids are
    # re-indexed from 0 rather than taken from the serial column.
    PDB_data = [
        (
            line[:6].strip(),
            i,
            line[12:16].strip(),
            line[16:17].strip(),
            line[17:20].strip(),
            line[21:22].strip(),
            int(line[22:26]),
            line[26:27].strip(),
            (float(line[30:38]), float(line[38:46]), float(line[46:54])),
            float(line[54:60]),
            float(line[60:66]),
            line[72:73].strip(),
            line[76:78].strip(),
            line[78:80].strip(),
        )
        for i, line in enumerate(lines[0])
    ]

    pdb_dict = {key: np.array(data) for key, data in zip(keys, zip(*PDB_data))}

    # Parse connect data
    if len(connect) > 0:
        c_bonds, h_bonds, i_bonds = parse_connect(connect)
        bonds = np.array(list(c_bonds))
        pdb_dict["bonds"] = bonds
        # CONECT records carry no bond-order information.
        pdb_dict["bond_types"] = np.array(
            [BondType.UNSPECIFIED for _ in range(len(bonds))]
        )

    # First frame comes from the parsed fields; any additional models only need
    # their coordinate columns re-read.
    trajectory = [pdb_dict.pop("coords")]
    if len(lines) > 1:
        for struct in lines[1:]:
            frame_coords = [(line[30:38], line[38:46], line[46:54]) for line in struct]
            if len(frame_coords) != len(PDB_data):
                raise ValueError(
                    "All models in a multistate PDB must have the same atoms"
                )
            trajectory.append(frame_coords)

    pdb_dict["trajectory"] = np.array(trajectory, dtype=float)
    pdb_dict["name"] = Path(pdb_file).name

    return pdb_dict
def read_sdf(file_name):
    """
    Reads SDF file into a dictionary structure.

    Parameters
    ----------
    file_name : str, Path
        string or Path object directing the function to the sdf file to read.

    Returns
    -------
    mols : List[dict]
        List of dictionaries containing all the sdf data for each molecule in the sdf file.
    """
    # Read in mol blocks: molecules are separated by "$$$$" lines.
    with open(file_name, "r") as f:
        blocks, block = [], []
        for line in f:
            line = line.rstrip("\r\n")
            if not line == "$$$$":
                block.append(line)
            else:
                blocks.append(block)
                block = []

    mols = []
    for block in blocks:
        # First three lines: title, program/software, comment.
        mol_data = {
            "name": block[0],
            "software": block[1].strip(),
            "comment": block[2].strip(),
        }

        # Line 4 is the V2000 counts line with fixed 3-character columns.
        n_atoms = block[3][:3].strip()
        n_bonds = block[3][3:6].strip()
        n_atom_lists = block[3][6:9].strip()
        chiral = block[3][12:15].strip()
        n_stexts = block[3][15:18].strip()
        properties_lines = block[3][30:33]
        format_version = block[3][33:].strip()

        mol_data["n_atoms"] = int(n_atoms)
        mol_data["n_bonds"] = int(n_bonds)
        mol_data["chiral"] = bool(int(chiral))
        mol_data["format"] = format_version

        atom_end_idx = 4 + mol_data["n_atoms"]
        bond_end_idx = atom_end_idx + mol_data["n_bonds"]

        # Atom block: fixed-column coordinates and per-atom fields.
        mol_data["atoms"] = []
        for idx, line in enumerate(block[4:atom_end_idx]):
            atom = {}
            atom["index"] = idx
            atom["xyz"] = [float(line[0:10]), float(line[10:20]), float(line[20:30])]
            atom["element"] = line[31:34].strip()
            atom["mass difference"] = int(float(line[34:36]))
            atom["charge"] = int(line[36:39])
            atom["stereo"] = int(line[39:42])
            atom["valence"] = int(line[48:51]) if len(line) >= 51 else 0
            mol_data["atoms"].append(atom)

        # Bond block: 1-indexed atom ids are converted to 0-indexed.
        mol_data["bonds"] = []
        for idx, line in enumerate(block[atom_end_idx:bond_end_idx]):
            bond = {}
            bond["idx1"] = int(line[0:3]) - 1  # Coerce to 0 index
            bond["idx2"] = int(line[3:6]) - 1  # Coerce to 0 index
            bond["type"] = int(line[6:9])
            bond["stereo"] = int(line[9:12])
            mol_data["bonds"].append(bond)

        # Properties block: "M  XXX" lines grouped by 3-letter keycode.
        properties = {}
        for idx, line in enumerate(block[bond_end_idx:]):
            if line == "M END":
                break
            if line.startswith("M "):
                key = line[3:6]
                lst = properties.setdefault(key, [])
                lst.append(line[6:])
            else:
                break

        for key, val in properties.items():
            properties[key] = parse_property_lines(key, val)
        mol_data["properties"] = properties

        # NOTE(review): `idx` is the leaked loop variable from the properties
        # loop above (or the bond loop if there were no property lines) —
        # confirm this offset is intended for all block layouts.
        data_end_idx = bond_end_idx + idx + 1

        # Data items: each starts with "> <header>" and ends at a blank line.
        data_iterator = iter(block[data_end_idx:])
        data_block = []
        for line in data_iterator:
            opened = line.startswith(">")
            while opened:
                data_block.append(line)
                line = next(data_iterator, "")
                # An empty line (or end of block) closes the current data item.
                opened = bool(line)
            if data_block:
                header, data = parse_data_block(data_block)
                mol_data[header] = data
                data_block = []

        mols.append(mol_data)

    return mols
def parse_data_block(data_block):
    """
    Helper function to parse one data item of an SDF file. Only used internally.

    Parameters
    ----------
    data_block : List[str]
        The lines of the sdf file making up a single data item, beginning with
        the ``> <header>`` line.

    Returns
    -------
    header : str
        The name found between the angle brackets of the header line.
    data_string : str
        All remaining lines of the item joined with newlines.
    """
    first_line, *value_lines = data_block
    title = first_line[1:]
    open_idx, close_idx = title.find("<"), title.find(">")
    return title[open_idx + 1 : close_idx], "\n".join(value_lines)
# Map SDF "M  XXX" property keycodes to the dictionary field names used by
# `parse_property_lines` and `standard_property_to_string`.
standard_property_vnames = {
    "CHG": "charges",
    "RAD": "multiplicities",
    "ISO": "mass",
    "RBC": "ring bond count",
    "SUB": "substitution count",
    "UNS": "unsaturated atom",
}
def parse_property_lines(property, lines):
    """
    Helper function to parse properties from sdf files.

    Parameters
    ----------
    property : str
        Three-letter keycode of the property being parsed.
    lines : List[str]
        The lines of the sdf file pertaining to the property block being parsed.

    Returns
    -------
    dict | List[str]
        For keycodes known to chiLife, a dict with 0-indexed ``atom_ids`` and
        the per-atom integer values; otherwise the input lines unchanged.
    """
    if property not in standard_property_vnames:
        # Unknown keycode: pass the raw lines through untouched.
        return lines

    atom_ids, values = parse_standard_property(lines)
    return {
        # Convert 1-indexed SDF atom ids to 0-indexed.
        "atom_ids": np.array(atom_ids, dtype=int) - 1,
        standard_property_vnames[property]: np.array(values, dtype=int),
    }
def parse_standard_property(lines):
    """
    Helper function to parse standard properties from sdf files.

    Parameters
    ----------
    lines : List[str]
        The lines of the sdf file pertaining to the property block being parsed.

    Returns
    -------
    k : List
        Atom ids for which this property is relevant.
    v : List
        Quantitative value of the property.
    """
    atom_ids, values = [], []
    for line in lines:
        # Wrap the line at width 4 (whitespace-delimited); the first chunk is
        # skipped — presumably the entry count — then id/value pairs alternate.
        chunks = textwrap.wrap(line, 4)
        atom_ids += chunks[1::2]
        values += chunks[2::2]
    return atom_ids, values
def standard_property_to_string(key, property):
    """
    Helper function to convert standard properties in a dictionary to a string format for writing sdf files.

    Parameters
    ----------
    key : str
        3 letter keycode of the property
    property : dict
        Property dict of a standard property as parsed by `parse_standard_property`.

    Returns
    -------
    strings : List[str]
        List of strings containing the property lines of an sdf file.

    Notes
    -----
    The V2000 CTfile spec allows at most 8 (atom id, value) pairs per
    ``M  <key>`` line, with the count field equal to the number of pairs on the
    line. The previous implementation wrote count = pairs + 1 and flushed after
    only 7 pairs. The ``M  <key>`` prefix uses two spaces so that `read_sdf`'s
    ``line[3:6]`` key slice matches.
    """
    data_col = standard_property_vnames[key]
    pairs = list(zip(property["atom_ids"], property[data_col]))

    strings = []
    # Emit at most 8 pairs per line, as permitted by the V2000 format.
    for start in range(0, len(pairs), 8):
        chunk = pairs[start : start + 8]
        # Atom ids are stored 0-indexed; SDF files are 1-indexed.
        entries = "".join(f"{idx + 1:>4}{val:>4}" for idx, val in chunk)
        strings.append(f"M  {key}{len(chunk):>3d}{entries}\n")
    return strings
def write_sdf(sdf_data, file_name):
    """
    Function to write sdf file from sdf format in a dict as read by `read_sdf`.

    Parameters
    ----------
    sdf_data : Dict | List[Dict]
        The sdf data in chilife format. A single dict is treated as a one-molecule list.
    file_name : str
        Name of the file to write the sdf data to.
    """
    if isinstance(sdf_data, dict):
        sdf_data = [sdf_data]

    lines = []
    for mol in sdf_data:
        # Header: title, program line, comment, then the V2000 counts line.
        lines.append(f"{mol['name']}\n")
        lines.append(f"\tchiLife {chilife.__version__}\n")
        lines.append(f"{mol['comment']}\n")
        lines.append(
            f"{mol['n_atoms']:>3}{mol['n_bonds']:>3}{0:>3}{0:>3}{mol['chiral']:>3}"
            f"{0:>3}{0:>3}{0:>3}{0:>3}{0:>3}999 V2000\n"
        )

        # Coords
        atoms = mol["atoms"]
        for atom in atoms:
            x, y, z = atom["xyz"]
            lines.append(
                f"{x:10.4f}{y:10.4f}{z:10.4f}{atom['element']:>2} {atom['mass difference']:>3}{atom['charge']:>3}"
                f"{atom['stereo']:>3}{0:>3}{0:>3}{atom['valence']:>3}"
                + f"{0:>3}" * 6
                + "\n"
            )

        # Bonds (atom ids converted back to 1-indexed)
        bonds = mol["bonds"]
        for bond in bonds:
            lines.append(
                f"{bond['idx1'] + 1:>3}{bond['idx2'] + 1:>3}{bond['type']:>3}{bond['stereo']:>3}\n"
            )

        # Write CHG RAD ISO Data
        for key, property in mol["properties"].items():
            if key in standard_property_vnames:
                lines += standard_property_to_string(key, property)
            else:
                # Unknown properties were stored as raw lines; write them back as-is.
                lines += property

        # NOTE(review): the V2000 terminator is normally "M  END" (two spaces) —
        # verify this literal against files accepted by other readers.
        lines.append("M END\n")

        # Write data blocks: any dict entry that is not a standard field.
        standard_keys = {
            "name",
            "software",
            "comment",
            "n_atoms",
            "n_bonds",
            "chiral",
            "format",
            "atoms",
            "bonds",
            "properties",
        }
        data_keys = [key for key in mol if key not in standard_keys]
        for key in data_keys:
            lines.append(f"> <{key}>\n")
            lines.append(f"{mol[key]}")
            lines.append("\n\n")

        # Molecule delimiter.
        lines.append("$$$$\n")

    with open(file_name, "w") as f:
        f.writelines(lines)
def write_protein(
    pdb_file: TextIO,
    protein: Union[mda.Universe, mda.AtomGroup, MolecularSystemBase],
    name: str = None,
    conect: bool = False,
    frames: Union[int, str, ArrayLike, None] = "all",
) -> None:
    """
    Helper function to write protein PDBs from MDAnalysis and MolSys objects.

    Parameters
    ----------
    pdb_file : TextIO
        File to save the protein to
    protein : MDAnalysis.Universe, MDAnalysis.AtomGroup, MolSys
        MDAnalysis or MolSys object to save
    name : str
        Name of the protein to put in the header
    conect : bool
        If True, write CONECT (bond) records after the coordinates.
    frames : int, str, ArrayLike or None
        Frames of the trajectory/ensemble to save to file
    """
    # PDB chain identifiers are a single character; reassign longer segids.
    available_segids = iter("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    for seg in protein.segments:
        if len(seg.segid) > 1:
            seg.segid = next(available_segids)

    traj = protein.universe.trajectory
    pdb_file.write(f"HEADER {name}\n")

    # Single-frame trajectories are written without MODEL/ENDMDL records.
    frames = None if frames == "all" and len(traj) == 1 else frames

    if frames == "all":
        for mdl, ts in enumerate(traj):
            write_frame(pdb_file, protein.atoms, mdl)
    elif hasattr(frames, "__len__"):
        for mdl, frame in enumerate(frames):
            # Indexing the trajectory selects `frame` as the current frame.
            traj[frame]
            write_frame(pdb_file, protein.atoms, mdl)
    elif frames is not None:
        traj[frames]
        write_frame(pdb_file, protein.atoms)
    else:
        # frames=None: write only the currently active frame.
        write_frame(pdb_file, protein.atoms)

    if conect:
        bonds = (
            protein.bonds.indices
            if hasattr(protein, "bonds")
            else protein.topology.bonds
            if hasattr(protein.topology, "bonds")
            else None
        )
        if bonds is not None:
            # CONECT records use 1-based atom serial numbers.
            bonds_1 = bonds + 1
            write_bonds(pdb_file, bonds_1)
def write_frame(pdb_file: TextIO, atoms, frame: int = None, coords: ArrayLike = None):
    """
    Helper function to write one conformational state (model) of a protein to a PDB file.

    Parameters
    ----------
    pdb_file : TextIO
        IO object to write the frame to.
    atoms : chilife.MolSys.AtomSelection | MDAnalysis.AtomGroup
        Set of atoms to write to the pdb file.
    frame : int | None
        The frame number to write. Should be 0 indexed. When provided, the atoms
        are wrapped in MODEL/ENDMDL records (serial is frame + 1).
    coords : ArrayLike | None
        Coords to write to the pdb file. If None are provided the coords from `atoms` are used.
    """
    if frame is not None:
        pdb_file.write(f"MODEL {frame + 1}\n")
    if coords is None:
        coords = atoms.positions

    # Guarantee the occupancy and b-factor columns exist; default both to 1.0.
    if not hasattr(atoms.universe, "occupancies"):
        atoms.universe.add_TopologyAttr(
            "occupancies", np.ones(len(atoms.universe.atoms))
        )
    if not hasattr(atoms.universe, "bfactors"):
        atoms.universe.add_TopologyAttr("bfactors", np.ones(len(atoms.universe.atoms)))

    # Plain loop rather than a side-effect list comprehension; atom serials
    # are 1-indexed per the PDB format.
    for i, (atom, coord) in enumerate(zip(atoms, coords)):
        pdb_file.write(
            fmt_str.format(
                i + 1,
                atom.name,
                atom.resname[:3],
                atom.segid,
                atom.resnum,
                *coord,
                atom.occupancy,
                atom.bfactor,
                atom.type,
            )
        )

    pdb_file.write("TER\n")
    if frame is not None:
        pdb_file.write("ENDMDL\n")
def write_ic(
    pdb_file: TextIO,
    ic: MolSysIC,
    name: str = None,
    conect: bool = None,
    frames: Union[int, str, ArrayLike, None] = "all",
) -> None:
    """
    Write a :class:`~MolSysIC` internal coordinates object to a pdb file.

    Parameters
    ----------
    pdb_file : TextIO
        open file or io object.
    ic : MolSysIC
        chiLife internal coordinates object.
    name : str
        Name of the molecule to be used for the HEADER.
    conect : bool, optional
        Write PDB CONECT information to file.
    frames : int, str, ArrayLike or None
        Frames of the trajectory/ensemble to save to file
    """
    # Single-frame ensembles are written without MODEL/ENDMDL records.
    frames = None if frames == "all" and len(ic.trajectory) == 1 else frames
    pdb_file.write(f"HEADER {name}\n")
    traj = ic.trajectory
    if frames == "all":
        for mdl, ts in enumerate(traj):
            write_frame(pdb_file, ic.atoms, mdl, ic.coords)
    elif hasattr(frames, "__len__"):
        for mdl, frame in enumerate(frames):
            # Select `frame` as the current frame; ic.coords presumably
            # reflects the active frame — TODO confirm.
            traj[frame]
            write_frame(pdb_file, ic.atoms, mdl, ic.coords)
    elif frames is not None:
        traj[frames]
        write_frame(pdb_file, ic.atoms, coords=ic.coords)
    else:
        # frames=None: write only the currently active frame.
        write_frame(pdb_file, ic.atoms, coords=ic.coords)

    if conect:
        bonds = ic.topology.bonds
        write_bonds(pdb_file, bonds)
def write_labels(
    pdb_file: TextIO,
    *args: sl.SpinLabel,
    KDE: bool = True,
    sorted: bool = True,
    write_spin_centers: bool = True,
    conect: bool = False,
) -> None:
    """Lower level helper function for saving SpinLabels and RotamerEnsembles. Loops over SpinLabel objects and appends
    atoms and electron coordinates to the provided file.

    Parameters
    ----------
    pdb_file : str
        File name to write to.
    *args: SpinLabel, RotamerEnsemble
        The RotamerEnsemble or SpinLabel objects to be saved.
    KDE: bool
        Perform kernel density estimate smoothing on rotamer weights before saving. Useful for uniformly weighted
        RotamerEnsembles or RotamerEnsembles with lots of rotamers.
    sorted : bool
        Sort rotamers by weight before saving.
    write_spin_centers : bool
        Write spin centers (atoms named NEN) as a separate object with weights mapped to q-factor.
    conect : bool
        Write PDB CONECT records for each label's bonds.

    Returns
    -------
    None
    """
    # Write spin label models
    for k, label in enumerate(args):
        pdb_file.write(f"HEADER {label.name}\n")

        # Save models in order of weight
        sorted_index = (
            np.argsort(label.weights)[::-1] if sorted else np.arange(len(label.weights))
        )
        norm_weights = label.weights / label.weights.max()

        if isinstance(label, dre.dRotamerEnsemble):
            # Bifunctional labels span two residues: tag each half's atoms with the
            # residue number of the site that half is attached to.
            sites = np.concatenate(
                [
                    np.ones(len(label.RE1.atoms), dtype=int) * int(label.site1),
                    np.ones(len(label.RE2.atoms), dtype=int) * int(label.site2),
                ]
            )
        else:
            sites = [atom.resi for atom in label.atoms]

        for mdl, (conformer, weight) in enumerate(
            zip(label.coords[sorted_index], norm_weights[sorted_index])
        ):
            pdb_file.write(f"MODEL {mdl}\n")
            # Plain loop instead of a side-effect list comprehension.
            for i in range(len(label.atom_names)):
                pdb_file.write(
                    fmt_str.format(
                        i,
                        label.atom_names[i],
                        label.res[:3],
                        label.chain,
                        sites[i],
                        *conformer[i],
                        weight,
                        1.00,
                        label.atom_types[i],
                    )
                )
            pdb_file.write("TER\n")
            pdb_file.write("ENDMDL\n")

        if conect:
            write_bonds(pdb_file, label.bonds)

    # Write spin density at electron coordinates
    for k, label in enumerate(args):
        if not hasattr(label, "spin_centers"):
            continue
        if not np.any(label.spin_centers):
            continue

        spin_centers = np.atleast_2d(label.spin_centers)

        if KDE and len(spin_centers) > 5:
            try:
                # Perform gaussian KDE to determine electron density
                gkde = gaussian_kde(spin_centers.T, weights=label.weights)

                # Map KDE density to pseudoatoms
                vals = gkde.pdf(spin_centers.T)
            except Exception:
                # KDE can fail (e.g. singular covariance for degenerate coordinates);
                # fall back to raw rotamer weights. Previously a bare ``except:`` which
                # also swallowed KeyboardInterrupt/SystemExit.
                vals = label.weights
        else:
            vals = label.weights

        if write_spin_centers:
            pdb_file.write(f"HEADER {label.name}_density\n")
            norm_weights = vals / vals.max()
            for i in range(len(norm_weights)):
                pdb_file.write(
                    fmt_str.format(
                        i,
                        "NEN",
                        label.res[:3],
                        label.chain,
                        int(label.site),
                        *spin_centers[i],
                        norm_weights[i],
                        1.00,
                        "N",
                    )
                )
            pdb_file.write("TER\n")
[docs]
def write_atoms(
    file: TextIO,
    atoms: Union[MDAnalysis.Universe, MDAnalysis.AtomGroup, MolecularSystemBase],
    coords: ArrayLike = None,
) -> None:
    """
    Write a set of atoms to a file in PDB format without any prefix or suffix, i.e. Not wrapped in `MODEL` or `ENDMDL`

    Parameters
    ----------
    file : TextIO
        A writable file (TextIO) object. The file that the atoms will be written to.
    atoms : MDAnalysis.Universe, MDAnalysis.AtomGroup, MolecularSystemBase
        Object containing atoms to be written. May also work with duck-typed set of atoms that are iterable and each
        atom has properties ``index``, ``name``, ``resname``, ``resnum``, ``occupancy``, ``bfactor`` and ``type``.
    coords : ArrayLike
        Array of atom coordinates corresponding to atoms. If None, the positions stored on ``atoms`` are used.
    """
    if isinstance(atoms, MDAnalysis.Universe):
        atoms = atoms.atoms
    if coords is None:
        coords = atoms.positions
    for atom, coord in zip(atoms, coords):
        # Truncate resname to 3 characters so that longer residue names cannot
        # shift the fixed-width PDB columns (consistent with write_frame/write_labels).
        # NOTE(review): the chain ID is hard-coded to 'A' here — confirm intended.
        file.write(
            f"ATOM {atom.index + 1:5d} {atom.name:^4s} {atom.resname[:3]:3s} {'A':1s}{atom.resnum:4d} "
            f"{coord[0]:8.3f}{coord[1]:8.3f}{coord[2]:8.3f}{atom.occupancy:6.2f}{atom.bfactor:6.2f} {atom.type:>2s} \n"
        )
def write_bonds(pdb_file, bonds):
    """
    Write PDB CONECT information to ``pdb_file`` for the provided ``bonds``.

    Parameters
    ----------
    pdb_file: TextIO
        File to write bonds to.
    bonds: ArrayLike
        Array of the indices of atom pairs that make the bonds. Pairs sharing the same
        first index should be contiguous for them to be merged into one CONECT record.
    """
    # Nothing to write for an empty bond list (previously raised IndexError on bonds[0][0]).
    if len(bonds) == 0:
        return

    active_atom = bonds[0][0]
    line = "CONECT" + f"{active_atom:>5d}"
    lines = []
    for b in bonds:
        # Start a new CONECT record when the origin atom changes or the current record
        # already holds four bonded atoms (6 + 5 + 4*5 = 31 characters).
        if b[0] != active_atom or len(line) > 30:
            lines.append(line + "\n")
            active_atom = b[0]
            line = "CONECT" + f"{active_atom:>5d}"

        line += f"{b[1]:>5d}"

    lines.append(line + "\n")
    pdb_file.writelines(lines)
# Maps supported object classes to a coarse category tag: rotamer ensembles
# ("rotens"), cartesian molecular systems ("molcart"), and internal-coordinate
# systems ("molic"). Presumably consumed by the save/dispatch machinery to pick
# the correct writer — confirm at call sites.
molecule_class = {
    re.RotamerEnsemble: "rotens",
    dre.dRotamerEnsemble: "rotens",
    IntrinsicLabel: "rotens",
    le.LigandEnsemble: "rotens",
    mda.Universe: "molcart",
    mda.AtomGroup: "molcart",
    MolecularSystemBase: "molcart",
    MolSysIC: "molic",
}
# Field names stored in a rotamer-library file, keyed by format version number.
# Versions that map to the same tuple store the same fields.
rotlib_formats = {
    1.0: (
        "rotlib",  #
        "resname",
        "coords",
        "internal_coords",
        "weights",
        "atom_types",
        "atom_names",
        "dihedrals",
        "dihedral_atoms",
        "type",
        "format_version",
    )
}
# v1.1 added free-text metadata fields on top of v1.0.
rotlib_formats[1.1] = *rotlib_formats[1.0], "description", "comment", "reference"
# v1.2/v1.3 revert to the v1.0 field set (the v1.1 metadata fields are not required).
rotlib_formats[1.2] = rotlib_formats[1.0]
rotlib_formats[1.3] = rotlib_formats[1.2]
# v1.4 added explicit backbone and alignment atom specifications.
rotlib_formats[1.4] = *rotlib_formats[1.3], "backbone_atoms", "aln_atoms"
rotlib_formats[1.5] = rotlib_formats[1.4]
[docs]
def read_cif(file_name):
    """
    Read a CIF file into a dictionary.

    Parameters
    ----------
    file_name: str | Path
        The cif file to read.

    Returns
    -------
    cif_data: Dict
        All data contained in the cif file, keyed by data-block name.
    """
    data_blocks = {}
    # Explicit sentinel instead of the previous `"current_block" in locals()` check.
    current_block = None
    with open(file_name, "r") as f:
        for line in f:
            if line.startswith("data"):
                # Split only on the first underscore so block names that themselves
                # contain underscores (e.g. ``data_MY_RES``) are preserved intact.
                # The previous ``line.split("_")[1]`` truncated such names.
                key = line.split("_", 1)[1].strip()
                current_block = data_blocks.setdefault(key, [])
                current_block.append(line)
            elif current_block is not None:
                current_block.append(line)
            # Lines before the first data block are ignored.

    cif_data = {}
    for key, value in data_blocks.items():
        cif_data[key] = parse_cif_data_block(value)

    return cif_data
def parse_cif_data_block(data_block: list[str]) -> dict:
    """
    Helper function to parse cif data from a cif file.

    Parameters
    ----------
    data_block: list[str]
        List of lines from a cif file corresponding to a singular data block as defined by cif specifications.

    Returns
    -------
    parsed_block: dict
        The provided data block parsed into a dictionary mapping topic names (the part
        before the '.') to dicts of subject name -> value (or tuple of values for loops).
    """
    parsed_block = {}
    data_block_iterator = iter(data_block)
    in_loop = False
    for line in data_block_iterator:
        if line.startswith("loop_"):
            in_loop = True
            subject_keys = []
            subject_values = []
        elif line.startswith("#") and in_loop:
            # End of a loop: transpose the collected rows into per-key columns.
            for k, v in zip(subject_keys, zip(*subject_values)):
                topic_dict[k] = v
            in_loop = False
        elif in_loop:
            if line.startswith("_"):
                topic_key, subject_key = line.strip().split(".")
                topic_dict = parsed_block.setdefault(topic_key[1:], {})
                subject_keys.append(subject_key)
            else:
                subject_values.append(split_with_quotes(line))
        elif line.startswith("_"):
            kv = split_with_quotes(line)
            if len(kv) == 2:
                key, value = kv
            else:
                # Value lives on the following line(s).
                key = kv.pop(0)
                value = next(data_block_iterator).strip()
                if value.startswith("'"):
                    # BUG FIX: str.strip returns a new string; the original code
                    # discarded the result, leaving the quotes in place.
                    value = value.strip("'")
                # check for multiline (semicolon-delimited) entry
                if value.startswith(";"):
                    value = value[1:]
                    while True:
                        line = next(data_block_iterator)
                        if line.startswith(";"):
                            break
                        value += line.strip()

            topic_key, subject_key = key.split(".")
            topic_dict = parsed_block.setdefault(topic_key[1:], {})
            topic_dict[subject_key] = value

    return parsed_block
def split_with_quotes(string):
    """
    Split a string on whitespace, keeping quoted substrings together.

    Parameters
    ----------
    string:
        The string to split.

    Returns
    -------
    vals: list[str]
        The whitespace-separated tokens, with single- or double-quoted runs of
        tokens merged into one entry (quotes removed).
    """
    tokens = string.split()
    out = []
    idx = 0
    total = len(tokens)
    while idx < total:
        tok = tokens[idx]
        if tok[0] in "\"'":
            quote = tok[0]
            pieces = [tok]
            end = idx
            # Consume tokens until one closes the quote. The first token only
            # closes it if it is longer than the lone quote character.
            while not (
                pieces[-1][-1] == quote and (len(pieces) > 1 or len(pieces[0]) > 1)
            ):
                end += 1
                pieces.append(tokens[end])
            out.append(" ".join(pieces).strip(quote))
            idx = end + 1
        else:
            out.append(tok)
            idx += 1
    return out
[docs]
def create_ccd_dicts(cif_data):
    """
    Creates chemical composition dictionary (CCD) dictionaries from CCD entries in a cif file.

    Parameters
    ----------
    cif_data: dict
        cif file data as parsed from :func:`~read_cif`.

    Returns
    -------
    ccd_data: dict
        The dictionary containing the CCD data from a cif file in a chilife-readable format,
        keyed by residue (comp_id).
    """
    ccd_data = {}

    atom_fields = (
        "comp_id",
        "atom_id",
        "type_symbol",
        "pdbx_aromatic_flag",
        "pdbx_stereo_config",
    )
    atom_block = cif_data.get("chem_comp_atom")
    if atom_block is not None:
        # Regroup the file-wide flat columns into per-residue columns.
        for row in zip(*(atom_block[field] for field in atom_fields)):
            res = row[0]
            res_dict = ccd_data.setdefault(res, {})
            table = res_dict.setdefault(
                "chem_comp_atom", {field: [] for field in atom_fields}
            )
            for field, value in zip(atom_fields, row):
                table[field].append(value)

    bond_fields = (
        "comp_id",
        "atom_id_1",
        "atom_id_2",
        "value_order",
        "pdbx_aromatic_flag",
        "pdbx_stereo_config",
    )
    bond_block = cif_data.get("chem_comp_bond")
    if bond_block is not None:
        for row in zip(*(bond_block[field] for field in bond_fields)):
            res = row[0]
            res_dict = ccd_data.setdefault(res, {})
            table = res_dict.setdefault(
                "chem_comp_bond", {field: [] for field in bond_fields}
            )
            for field, value in zip(bond_fields, row):
                table[field].append(value)

    comp_fields = (
        "id",
        "type",
        "mon_nstd_flag",
        "name",
        "pdbx_synonyms",
        "formula",
        "formula_weight",
    )
    comp_block = cif_data.get("chem_comp")
    # Summary info is only attached when every expected column is present.
    if comp_block is not None and all(field in comp_block for field in comp_fields):
        for res, link_type, mon_nsdt_flag, name, synonyms, formula, weight in zip(
            *(comp_block[field] for field in comp_fields)
        ):
            res_dict = ccd_data.setdefault(res, {})
            # Key spellings ("mon nsdt flag", etc.) preserved from the existing format.
            res_dict["chem_comp"] = {
                "link type": link_type,
                "mon nsdt flag": mon_nsdt_flag,
                "name": name,
                "pdbx synonyms": synonyms,
                "formula": formula,
                "weight": weight,
            }

    return ccd_data
[docs]
def write_cif(file_name, cif_data):
    """
    Function to write a CIF file from cif_data as read by :func:`~read_cif`

    Parameters
    ----------
    file_name: str | Path
        File to write the cif_data to.
    cif_data: Dict
        cif data to write to the file.
    """
    lines = []
    for struct in cif_data:
        lines.append(f"data_{struct}\n")
        lines.append("# \n")
        for key1, value1 in cif_data[struct].items():
            max_subkey_len = max(len(k) for k in value1.keys())
            # Width of the `_topic.subject` column so values line up.
            key_length = len(key1) + 1 + max_subkey_len + 3

            loop_keys = []
            loop_values = []
            max_val_lens = []
            for key2, value2 in value1.items():
                line = f"_{key1}.{key2}".ljust(key_length)
                if isinstance(value2, str):
                    if " " in value2 and value2[0] != "'":
                        value2 = "'" + value2 + "'"

                    if len(value2) < 80:
                        line += f" {value2} \n"
                        lines.append(line)
                    elif " " in value2:
                        lines.append(line + "\n")
                        lines.append(value2 + "\n")
                    else:
                        # Long single-token value: write as a semicolon-delimited
                        # multiline entry.
                        lines.append(line + "\n")
                        prelude = ";"
                        for string in textwrap.wrap(value2, 80):
                            lines.append(prelude + string + "\n")
                            prelude = ""
                        lines.append("; \n")

                elif isinstance(value2, Sequence):
                    loop_keys.append(line + "\n")
                    value2 = [v if "'" not in v else '"' + v + '"' for v in value2]
                    # Quote values containing whitespace, starting with '_', or empty
                    # (an empty token would otherwise corrupt the loop table).
                    # BUG FIX: an empty value previously hit a stray ``breakpoint()``
                    # left over from debugging.
                    value2 = [
                        v if v and (" " not in v) and (v[0] != "_") else "'" + v + "'"
                        for v in value2
                    ]
                    loop_values.append(value2)
                    # ``default=0`` keeps max_val_lens aligned with loop_values even for
                    # an empty column (previously printed debug output and skipped the
                    # entry, desynchronizing the two lists).
                    max_val_lens.append(max((len(x) for x in value2), default=0))

            if loop_values:
                lines.append("loop_\n")
                lines.extend(loop_keys)
                for row in zip(*loop_values):
                    line = " ".join(x.ljust(n) for x, n in zip(row, max_val_lens))
                    lines.append(line + "\n")

            lines.append("# \n")

    with open(file_name, "w") as f:
        f.writelines(lines)
[docs]
def join_ccd_info(ccd_list, ccd_data=None):
    """
    Join CCD info from a list of CCD entries into a dictionary that can be used to write ccd info to a cif file.

    Parameters
    ----------
    ccd_list: List[str]
        List of CCD codes to extract from the ccd_data dictionary to be joined in a single ccd_info dict.
    ccd_data: Dict
        Superset dictionary of CCD entries that contain all the CCD info for the CCD codes specified in the ``ccd_list``
        If ``ccd_data`` is not provided the default chilife ``bio_ccd`` will be used which contains all biomolecular CCD
        entries from the PDB (as of mid 2025). If a CCD code in ``ccd_list`` is not in ``ccd_data`` it will be skipped.

    Returns
    -------
    return_ccd: Dict
        A dictionary containing all ccd data for the set of codes provided and ready to be written to a cif file.
    """
    # use default CCD if none is provided
    if ccd_data is None:
        ccd_data = chilife.bio_ccd
    # Otherwise overwrite default CCD data with what was provided and add missing CCD data
    else:
        ccd_data = {**chilife.bio_ccd, **ccd_data}

    individual_CCDs = [ccd_data[ccd] for ccd in ccd_list if ccd in ccd_data]

    return_ccd = {
        "chem_comp": {},
        "chem_comp_atom": {key: [] for key in CHEM_COMP_ATOM_KEYS},
        "chem_comp_bond": {key: [] for key in CHEM_COMP_BOND_KEYS},
    }

    # Collect the summary `chem_comp` columns across all entries that have one
    # (replaces six near-identical tuple-comprehension blocks).
    comps_present = [ccd["chem_comp"] for ccd in individual_CCDs if "chem_comp" in ccd]
    for field in ("id", "type", "name", "pdbx_synonyms", "formula", "formula_weight"):
        return_ccd["chem_comp"][field] = tuple(comp[field] for comp in comps_present)

    for ccd in individual_CCDs:
        # Add Atom information
        for key in CHEM_COMP_ATOM_KEYS:
            return_ccd["chem_comp_atom"][key].extend(ccd["chem_comp_atom"][key])

        bond_table = ccd.get("chem_comp_bond")
        if bond_table is None:
            continue

        # Add bond information, coercing scalar fields (single-bond entries) to
        # 1-tuples. Done locally so the caller's ccd entry is NOT mutated (the
        # original wrote the normalized table back into ``ccd``).
        for key in CHEM_COMP_BOND_KEYS:
            value = bond_table[key]
            if not isinstance(value, (list, tuple)):
                value = (value,)
            return_ccd["chem_comp_bond"][key].extend(value)

    # Convert to tuples
    return_ccd["chem_comp_atom"] = {
        k: tuple(v) for k, v in return_ccd["chem_comp_atom"].items()
    }
    return_ccd["chem_comp_bond"] = {
        k: tuple(v) for k, v in return_ccd["chem_comp_bond"].items()
    }

    # Add PDBx Ordinal
    return_ccd["chem_comp_atom"]["pdbx_ordinal"] = tuple(
        str(x) for x in range(1, len(return_ccd["chem_comp_atom"]["comp_id"]) + 1)
    )
    return_ccd["chem_comp_bond"]["pdbx_ordinal"] = tuple(
        str(x) for x in range(1, len(return_ccd["chem_comp_bond"]["comp_id"]) + 1)
    )
    return return_ccd