Source code for chembl_downloader.api

"""API for :mod:`chembl_downloader`."""

from __future__ import annotations

import gzip
import logging
import os
import pickle
import sqlite3
import tarfile
from collections.abc import Generator, Iterable, Sequence
from contextlib import closing, contextmanager
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal, NamedTuple, TypeAlias, cast, overload
from xml.etree import ElementTree

import pystow
import requests
from tqdm import tqdm

from . import queries

if TYPE_CHECKING:
    import chemfp.arena
    import numpy
    import pandas
    import rdkit.Chem
    import rdkit.Chem.rdSubstructLibrary
    from numpy.typing import NDArray

__all__ = [
    "VersionHint",
    "VersionPathPair",
    "chemfp_load_fps",
    "connect",
    "cursor",
    # Chemreps
    "download_chemreps",
    "download_extract_sqlite",
    # Fingerprints
    "download_fps",
    # Monomers
    "download_monomer_library",
    "download_readme",
    # SDF
    "download_sdf",
    # Database
    "download_sqlite",
    # UniProt mappings
    "download_uniprot_mapping",
    "get_chemreps_df",
    "get_date",
    "get_monomer_library_root",
    "get_substructure_library",
    "get_uniprot_mapping_df",
    "iterate_fps",
    "iterate_smiles",
    "latest",
    "query",
    "query_scalar",
    "summarize",
    "supplier",
    "versions",
]

logger = logging.getLogger(__name__)

#: The default path inside the :mod:`pystow` directory
PYSTOW_PARTS = ["chembl"]
RELEASE_PREFIX = "* Release:"
DATE_PREFIX = "* Date:"


class VersionInfo(NamedTuple):
    """A pair of format version and regular version."""

    fmt_version: str
    version: str
    module: pystow.Module

    def _pre_molecule_dictionary(self) -> bool:
        return float(self.version) < 9


#: A hint for a version, which can either be an integer, string, or float (for minor versions)
VersionHint: TypeAlias = str | int | float | VersionInfo



[docs]
class VersionPathPair(NamedTuple):
    """A pair of a version and path."""

    version: str
    path: Path



LATEST_README_URL = "https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/README"


# docstr-coverage:excused `overload`
@overload
def latest(*, full: Literal[True] = True, prefix: Sequence[str] | None = ...) -> VersionInfo: ...


# docstr-coverage:excused `overload`
@overload
def latest(*, full: Literal[False] = False, prefix: Sequence[str] | None = ...) -> str: ...



[docs]
def latest(*, full: bool = False, prefix: Sequence[str] | None = None) -> str | VersionInfo:
    """Get the latest version of ChEMBL as a string.

    :returns: The latest version string of ChEMBL

    :raises ValueError: If the latest README can not be parsed
    """
    res = requests.get(LATEST_README_URL, timeout=5)
    res.raise_for_status()
    for line_binary in res.iter_lines(decode_unicode=True):
        line: str = line_binary.decode("utf8")
        if line.startswith(RELEASE_PREFIX):
            line = line.removeprefix(RELEASE_PREFIX)
            line = line.strip()
            line = line.removeprefix("chembl_")
            if not full:
                return line
            return _get_version_info(line, prefix=prefix)

    raise ValueError("could not find latest ChEMBL version")



# docstr-coverage:excused `overload`
@overload
def versions(
    *, full: Literal[True] = ..., prefix: Sequence[str] | None = ...
) -> list[VersionInfo]: ...


# docstr-coverage:excused `overload`
@overload
def versions(*, full: Literal[False] = ..., prefix: Sequence[str] | None = ...) -> list[str]: ...



[docs]
def versions(
    *, full: bool = False, prefix: Sequence[str] | None = None
) -> list[str] | list[VersionInfo]:
    """Get all versions of ChEMBL."""
    latest_version_info = latest(full=True, prefix=prefix)
    rv = [str(i).zfill(2) for i in range(1, int(latest_version_info.version) + 1)]
    # Side version in ChEMBL
    rv.extend(["22_1", "24_1"])
    rv = sorted(rv, reverse=True)
    if not full:
        return rv
    return [_get_version_info(version, prefix) for version in rv]



_CHEMBL_HOST = "ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases"


def _download_helper(
    suffix: str,
    version: VersionHint | None = None,
    prefix: Sequence[str] | None = None,
    *,
    return_version: bool,
    filename_repeats_version: bool = True,
) -> Path | VersionPathPair:
    """Ensure the latest ChEMBL file with the given suffix is downloaded.

    :param suffix: The suffix of the file
    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true if you're
        looking up the latest version and want to reduce redundant code.
    :param filename_repeats_version: True if filename contains ``chembl_<version>`` in
        the beginning. Set to false to allow downloading arbitrarily named files.

    :returns: If ``return_version`` is true, return a pair of the version and the local
        file path to the downloaded file. Otherwise, just return the path.

    :raises ValueError: If file could not be downloaded
    """
    version_info = _get_version_info(version, prefix=prefix)

    if filename_repeats_version:
        filename = f"chembl_{version_info.fmt_version}{suffix}"
    else:
        filename = suffix

    base = f"https://{_CHEMBL_HOST}/chembl_{version_info.fmt_version}"
    urls = [
        f"{base}/{filename}",
        f"{base}/archived/{filename}",
    ]
    for url in urls:
        try:
            path = version_info.module.ensure(url=url)
        except OSError:
            continue
        if return_version:
            return VersionPathPair(version_info.version, path)
        else:
            return path

    urls_fmt = "\n".join(f"   - {url}" for url in urls)
    raise ValueError(f"""\

[ChEMBL v{version_info.fmt_version}] could not ensure {filename}

1. It wasn't already cached in the PyStow directory:
   {version_info.module.base}

2. It couldn't be downloaded from any of the following URLs:
{urls_fmt}
    """)


def _get_version_info(
    version: VersionHint | None, prefix: Sequence[str] | None = None
) -> VersionInfo:
    if isinstance(version, VersionInfo):
        return version

    flavor = _ensure_version_helper(version)
    if prefix is None:
        # it's important that this is a None check so it's possible
        # to pass an empty list
        prefix = PYSTOW_PARTS
    module = pystow.module(*prefix, flavor.version)
    return VersionInfo(flavor.fmt_version, flavor.version, module)


class _VersionFlavorsHelper(NamedTuple):
    """A pair of format version and regular version."""

    fmt_version: str
    version: str


def _ensure_version_helper(version: VersionHint | None) -> _VersionFlavorsHelper:
    if version is None:
        version = latest()
    if isinstance(version, int):
        # versions 1-9 are left padded with a zero
        fmt_version = f"{version:02}"
        version = str(version)
    elif isinstance(version, str):
        # remove all leading zeros
        version = version.lstrip("0")

        # for versions 22.1 and 24.1, it's important to canonicalize the version number
        # for versions < 10 it's important to left pad with a zero
        fmt_version = version.replace(".", "_").zfill(2)
    elif isinstance(version, float):
        version = str(version)
        fmt_version = version.replace(".", "_")
    else:
        raise TypeError(f"invalid type for version: {version}")

    return _VersionFlavorsHelper(fmt_version, version)


# docstr-coverage:excused `overload`
@overload
def download_sqlite(
    version: VersionHint | None = ...,
    *,
    prefix: Sequence[str] | None = ...,
    return_version: Literal[True] = ...,
) -> VersionPathPair: ...


# docstr-coverage:excused `overload`
@overload
def download_sqlite(
    version: VersionHint | None = ...,
    *,
    prefix: Sequence[str] | None = ...,
    return_version: Literal[False] = ...,
) -> Path: ...



[docs]
def download_sqlite(
    version: VersionHint | None = None,
    *,
    prefix: Sequence[str] | None = None,
    return_version: bool = False,
) -> Path | VersionPathPair:
    """Ensure the latest ChEMBL SQLite dump is downloaded.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true if you're
        looking up the latest version and want to reduce redundant code.

    :returns: If ``return_version`` is true, return a pair of the version and the local
        file path to the downloaded ``*.tar.gz`` file. Otherwise, just return the path.
    """
    return _download_helper(
        suffix="_sqlite.tar.gz",
        version=version,
        prefix=prefix,
        return_version=return_version,
    )



# docstr-coverage:excused `overload`
@overload
def download_extract_sqlite(
    version: VersionHint | None = ...,
    *,
    prefix: Sequence[str] | None = ...,
    return_version: Literal[True] = ...,
    retain: bool = ...,
) -> VersionPathPair: ...


# docstr-coverage:excused `overload`
@overload
def download_extract_sqlite(
    version: VersionHint | None = ...,
    *,
    prefix: Sequence[str] | None = ...,
    return_version: Literal[False] = ...,
    retain: bool = ...,
) -> Path: ...



[docs]
def download_extract_sqlite(
    version: VersionHint | None = None,
    *,
    prefix: Sequence[str] | None = None,
    return_version: bool = False,
    retain: bool = False,
) -> Path | VersionPathPair:
    """Ensure the latest ChEMBL SQLite dump is downloaded and extracted.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true if you're
        looking up the latest version and want to reduce redundant code.
    :param retain: If true, keeps the original archive.

    :returns: If ``return_version`` is true, return a pair of the version and the local
        file path to the downloaded ChEMBLSQLite database file. Otherwise, just return
        the path.

    :raises FileNotFoundError: If no database file could be found in the extracted
        directories
    """
    version_info = _get_version_info(version, prefix)

    name = f"chembl_{version_info.fmt_version}.db"
    rv = version_info.module.join(name=name)

    if not rv.is_file():
        tar_path = download_sqlite(version=version_info, prefix=prefix, return_version=False)
        with tarfile.open(tar_path, mode="r", encoding="utf-8") as tar_file:
            tar_info = _get_tar_info(tar_file)
            if tar_info is None:
                raise FileNotFoundError("could not find a .db file in the ChEMBL archive")
            logger.info("unarchiving %s to %s", tar_path, rv)
            tar_file._extract_member(tar_info, rv.as_posix())

        if not retain:
            logger.info("deleting original archive %s", tar_path)
            tar_path.unlink()

    if return_version:
        return VersionPathPair(version_info.version, rv)
    else:
        return rv



def _get_tar_info(tar_file: tarfile.TarFile) -> tarfile.TarInfo | None:
    """Walk an archive and find a file with the ``.db`` extension."""
    for tar_info in tar_file:
        if tar_info.name.endswith(".db"):
            return tar_info
    return None


def _find_sqlite_file(directory: str | Path) -> Path | None:
    # Since the structure of the zip changes from version to version,
    # it's better to just walk through the unarchived folders recursively
    # and find the DB file
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith(".db"):
                continue
            rv = Path(root).joinpath(file)
            return rv
    return None



[docs]
@contextmanager
def connect(
    version: VersionHint | None = None, *, prefix: Sequence[str] | None = None
) -> Generator[sqlite3.Connection, None, None]:
    """Ensure and connect to the database.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use

    :yields: The SQLite connection object.

    Example: .. code-block:: python

        import chembl_downloader

        with chembl_downloader.connect() as conn:
            with closing(conn.cursor()) as cursor:
                cursor.execute(...)
    """
    path = download_extract_sqlite(version=version, prefix=prefix, return_version=False)
    with closing(sqlite3.connect(path.as_posix())) as conn:
        yield conn




[docs]
@contextmanager
def cursor(
    version: VersionHint | None = None, *, prefix: Sequence[str] | None = None
) -> Generator[sqlite3.Cursor]:
    """Ensure, connect, and get a cursor from the database to the database.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use

    :yields: The SQLite cursor object.

    Example: .. code-block:: python

        import chembl_downloader

        with chembl_downloader.cursor() as cursor:
            cursor.execute(...)
    """
    with connect(version=version, prefix=prefix) as conn:
        with closing(conn.cursor()) as yv:
            yield yv




[docs]
def query(
    sql: str,
    version: VersionHint | None = None,
    *,
    prefix: Sequence[str] | None = None,
    **kwargs: Any,
) -> pandas.DataFrame:
    """Ensure the data is available, run the query, then put the results in a dataframe.

    :param sql: A SQL query string or table name
    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param kwargs: keyword arguments to pass through to :func:`pandas.read_sql`, such as
        ``index_col``.

    :returns: A dataframe

    Example: .. code-block:: python

        import chembl_downloader
        from chembl_downloader.queries import ID_NAME_QUERY_EXAMPLE

        df = chembl_downloader.query(ID_NAME_QUERY_EXAMPLE)
    """
    import pandas as pd

    with connect(version=version, prefix=prefix) as con:
        return pd.read_sql(sql, con=con, **kwargs)




[docs]
def query_scalar(
    sql: str,
    version: VersionHint | None = None,
    *,
    prefix: Sequence[str] | None = None,
    **kwargs: Any,
) -> Any:
    """Ensure the data is available, run the query, then extract the result.

    Similar to :func:`query`, but automatically unpacks the value, assuming that only
    one record is returned with just a single column.

    :param sql: A SQL query string or table name
    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param kwargs: keyword arguments to pass through to :func:`pandas.read_sql`, such as
        ``index_col``.

    :returns: A value (int, str, etc.) from the database.

    Example: .. code-block:: python

        import chembl_downloader

        sql = "SELECT COUNT(activity_id) FROM activities"
        count: int = chembl_downloader.query_one(sql)
    """
    df = query(sql, version=version, prefix=prefix, **kwargs)
    return df[df.columns[0]][0].item()



# docstr-coverage:excused `overload`
@overload
def download_fps(
    version: VersionHint | None = ...,
    *,
    prefix: Sequence[str] | None = ...,
    return_version: Literal[True] = ...,
) -> VersionPathPair: ...


# docstr-coverage:excused `overload`
@overload
def download_fps(
    version: VersionHint | None = ...,
    *,
    prefix: Sequence[str] | None = ...,
    return_version: Literal[False] = ...,
) -> Path: ...



[docs]
def download_fps(
    version: VersionHint | None = None,
    *,
    prefix: Sequence[str] | None = None,
    return_version: bool = False,
) -> Path | VersionPathPair:
    """Ensure the latest ChEMBL fingerprints file is downloaded.

    This file contains 2048 bit radius 2 morgan fingerprints.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true if you're
        looking up the latest version and want to reduce redundant code.

    :returns: If ``return_version`` is true, return a pair of the version and the local
        file path to the downloaded ``*.fps.gz`` file. Otherwise, just return the path.
    """
    return _download_helper(
        suffix=".fps.gz", version=version, prefix=prefix, return_version=return_version
    )




[docs]
def chemfp_load_fps(
    version: VersionHint | None = None, *, prefix: Sequence[str] | None = None, **kwargs: Any
) -> chemfp.arena.FingerprintArena:
    """Download and open the ChEMBL fingerprints via :func:`chemfp.load_fingerprints`.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param kwargs: Remaining keyword arguments are passed into
        :func:`chemfp.load_fingerprints`.

    :returns: A fingerprint arena object
    """
    import chemfp

    path = download_fps(version=version, prefix=prefix, return_version=False)
    return chemfp.load_fingerprints(path, **kwargs)




[docs]
def iterate_fps(
    version: VersionHint | None = None,
    *,
    prefix: Sequence[str] | None = None,
    identifier_format: Literal["local", "curie"] = "local",
) -> Iterable[tuple[str, NDArray[numpy.uint8]]]:
    """Download and open the ChEMBL fingerprints via RDKit/Numpy.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param identifier_format: Should identifiers get returned as local unique
        identifiers, or compact URIs (CURIEs)?

    :returns: A pair of identifiers and numpy arrays
    """
    import numpy as np
    from rdkit import DataStructs
    from rdkit.DataStructs import ConvertToNumpyArray

    use_curie = identifier_format == "curie"

    path = download_fps(version=version, prefix=prefix, return_version=False)
    with gzip.open(path, mode="rt") as file:
        for _ in range(6):  # throw away headers
            next(file)
        for line in tqdm(
            file, unit_scale=True, desc="Getting chemical features", unit="fingerprint"
        ):
            hex_fp, chembl_id = line.strip().split("\t")
            binary_fp = bytes.fromhex(hex_fp)
            bitvect = DataStructs.cDataStructs.CreateFromBinaryText(binary_fp)
            arr = np.zeros((bitvect.GetNumBits(),), dtype=np.uint8)
            ConvertToNumpyArray(bitvect, arr)
            if use_curie:
                chembl_id = f"chembl.compound:{chembl_id}"
            yield chembl_id, arr



# docstr-coverage:excused `overload`
@overload
def download_chemreps(
    version: VersionHint | None = ...,
    *,
    prefix: Sequence[str] | None = ...,
    return_version: Literal[True] = True,
) -> VersionPathPair: ...


# docstr-coverage:excused `overload`
@overload
def download_chemreps(
    version: VersionHint | None = ...,
    *,
    prefix: Sequence[str] | None = ...,
    return_version: Literal[False] = False,
) -> Path: ...



[docs]
def download_chemreps(
    version: VersionHint | None = None,
    *,
    prefix: Sequence[str] | None = None,
    return_version: bool = False,
) -> Path | VersionPathPair:
    """Ensure the latest ChEMBL chemical representations file is downloaded.

    This file is tab-separated and has four columns:

    1. ``chembl_id``
    2. ``canonical_smiles``
    3. ``standard_inchi``
    4. ``standard_inchi_key``

    If you want to directly parse it with :mod:`pandas`, use :func:`get_chemreps_df`.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true if you're
        looking up the latest version and want to reduce redundant code.

    :returns: If ``return_version`` is true, return a pair of the version and the local
        file path to the downloaded ``*_chemreps.txt.gz`` file. Otherwise, just return
        the path.
    """
    return _download_helper(
        suffix="_chemreps.txt.gz ",
        version=version,
        prefix=prefix,
        return_version=return_version,
    )




[docs]
def get_chemreps_df(
    version: VersionHint | None = None, *, prefix: Sequence[str] | None = None
) -> pandas.DataFrame:
    """Download and parse the latest ChEMBL chemical representations file.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use

    :returns: A dataframe with four columns: 1. ``chembl_id`` 2. ``canonical_smiles`` 3.
        ``standard_inchi`` 4. ``standard_inchi_key``
    """
    import pandas

    path = download_chemreps(version=version, prefix=prefix, return_version=False)
    df = pandas.read_csv(path, sep="\t", compression="gzip")
    return df



# docstr-coverage:excused `overload`
@overload
def download_sdf(
    version: VersionHint | None = ...,
    *,
    prefix: Sequence[str] | None = ...,
    return_version: Literal[True] = ...,
) -> VersionPathPair: ...


# docstr-coverage:excused `overload`
@overload
def download_sdf(
    version: VersionHint | None = ...,
    *,
    prefix: Sequence[str] | None = ...,
    return_version: Literal[False] = ...,
) -> Path: ...



[docs]
def download_sdf(
    version: VersionHint | None = None,
    *,
    prefix: Sequence[str] | None = None,
    return_version: bool = False,
) -> Path | VersionPathPair:
    """Ensure the latest ChEMBL SDF dump is downloaded.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true if you're
        looking up the latest version and want to reduce redundant code.

    :returns: If ``return_version`` is true, return a pair of the version and the local
        file path to the downloaded ``*.sdf.gz`` file. Otherwise, just return the path.
    """
    return _download_helper(
        suffix=".sdf.gz", version=version, prefix=prefix, return_version=return_version
    )



# docstr-coverage:excused `overload`
@overload
def download_monomer_library(
    version: VersionHint | None = ...,
    *,
    prefix: Sequence[str] | None = ...,
    return_version: Literal[True] = ...,
) -> VersionPathPair: ...


# docstr-coverage:excused `overload`
@overload
def download_monomer_library(
    version: VersionHint | None = ...,
    *,
    prefix: Sequence[str] | None = ...,
    return_version: Literal[False] = ...,
) -> Path: ...



[docs]
def download_monomer_library(
    version: VersionHint | None = None,
    *,
    prefix: Sequence[str] | None = None,
    return_version: bool = False,
) -> Path | VersionPathPair:
    """Ensure the latest ChEMBL monomer library is downloaded.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true if you're
        looking up the latest version and want to reduce redundant code.

    :returns: If ``return_version`` is true, return a pair of the version and the local
        file path to the downloaded ``*_monomer_library.xml`` file. Otherwise, just
        return the path.
    """
    return _download_helper(
        suffix="_monomer_library.xml",
        version=version,
        prefix=prefix,
        return_version=return_version,
    )




[docs]
def get_monomer_library_root(
    version: VersionHint | None = None,
    *,
    prefix: Sequence[str] | None = None,
) -> ElementTree.Element:
    """Ensure the latest ChEMBL monomer library is downloaded and parse its root with :mod:`xml`.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use

    :returns: Return the root of the monomers XML tree, parsed
    """
    monomers_path = download_monomer_library(version=version, prefix=prefix, return_version=False)
    tree = ElementTree.parse(monomers_path)  # noqa:S314
    return tree.getroot()




[docs]
@contextmanager
def supplier(
    version: VersionHint | None = None,
    *,
    prefix: Sequence[str] | None = None,
    **kwargs: Any,
) -> Generator[rdkit.Chem.ForwardSDMolSupplier]:
    """Get a :class:`rdkit.Chem.ForwardSDMolSupplier` for the given version of ChEMBL.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param kwargs: keyword arguments to pass through to
        :class:`rdkit.Chem.ForwardSDMolSupplier`, such as ``sanitize`` and ``removeHs``.

    :yields: A supplier to be used in a context manager

    In the following example, a supplier is used to get fingerprints and SMILES.

    .. code-block:: python

        from rdkit import Chem

        import chembl_downloader

        data = []
        with chembl_downloader.supplier() as suppl:
            for i, mol in enumerate(suppl):
                if mol is None or mol.GetNumAtoms() > 50:
                    continue
                fp = Chem.PatternFingerprint(mol, fpSize=1024, tautomerFingerprints=True)
                smi = Chem.MolToSmiles(mol)
                data.append((smi, fp))
    """
    from rdkit import Chem

    path = download_sdf(version=version, prefix=prefix, return_version=False)
    with gzip.open(path) as file:
        yield Chem.ForwardSDMolSupplier(file, **kwargs)




[docs]
def iterate_smiles(
    version: VersionHint | None = None,
    *,
    prefix: Sequence[str] | None = None,
    **kwargs: Any,
) -> Iterable[str]:
    """Iterate over SMILES via RDKit."""
    from rdkit import Chem

    with supplier(version=version, prefix=prefix, **kwargs) as suppl:
        for mol in suppl:
            if mol is None:
                continue
            smiles = Chem.MolToSmiles(mol)
            if smiles:
                yield smiles




[docs]
def get_substructure_library(
    version: VersionHint | None = None,
    *,
    max_heavy: int = 75,
    prefix: Sequence[str] | None = None,
    **kwargs: Any,
) -> rdkit.Chem.rdSubstructLibrary.SubstructLibrary:
    """Get the ChEMBL substructure library.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param max_heavy: The largest number of heavy atoms that are considered before
        skipping the molecule.
    :param prefix: The directory inside :mod:`pystow` to use
    :param kwargs: keyword arguments to pass through to
        :class:`rdkit.Chem.ForwardSDMolSupplier`, such as ``sanitize`` and ``removeHs``
        via :func:`supplier`.

    :returns: A substructure library object

    .. seealso::

        https://greglandrum.github.io/rdkit-blog/tutorial/substructure/2021/12/20/substructlibrary-search-order.html
    """
    # Requires minimum version of v2021.09
    from rdkit.Chem.rdSubstructLibrary import (
        CachedTrustedSmilesMolHolder,
        KeyFromPropHolder,
        SubstructLibrary,
        TautomerPatternHolder,
    )

    version_info = _get_version_info(version, prefix)

    path = version_info.module.join(name="ssslib.pkl")
    if path.is_file():
        logger.info("loading substructure library from pickle: %s", path)
        with path.open("rb") as file:
            return pickle.load(file)  # noqa:S301

    molecule_holder = CachedTrustedSmilesMolHolder()
    tautomer_pattern_holder = TautomerPatternHolder()
    key_from_prop_holder = KeyFromPropHolder()
    library = SubstructLibrary(molecule_holder, tautomer_pattern_holder, key_from_prop_holder)
    with supplier(version=version_info, prefix=prefix, **kwargs) as suppl:
        for mol in tqdm(
            suppl,
            unit="molecule",
            unit_scale=True,
            desc="Building substructure library",
        ):
            if mol is None:
                continue
            if mol.GetNumHeavyAtoms() > max_heavy:  # skip huge molecules
                continue
            library.AddMol(mol)
    with path.open("wb") as file:
        pickle.dump(library, file, protocol=pickle.HIGHEST_PROTOCOL)
    return library



# docstr-coverage:excused `overload`
@overload
def download_readme(
    version: VersionHint | None = ...,
    *,
    prefix: Sequence[str] | None = ...,
    return_version: Literal[True] = ...,
) -> VersionPathPair: ...


# docstr-coverage:excused `overload`
@overload
def download_readme(
    version: VersionHint | None = ...,
    *,
    prefix: Sequence[str] | None = ...,
    return_version: Literal[False] = ...,
) -> Path: ...



[docs]
def download_readme(
    version: VersionHint | None = None,
    *,
    prefix: Sequence[str] | None = None,
    return_version: bool = False,
) -> Path | VersionPathPair:
    """Ensure the latest ChEMBL README.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true if you're
        looking up the latest version and want to reduce redundant code.

    :returns: If ``return_version`` is true, return a pair of the version and the local
        file path to the downloaded ``*.sdf.gz`` file. Otherwise, just return the path.
    """
    return _download_helper(
        "README",
        version=version,
        prefix=prefix,
        return_version=return_version,
        filename_repeats_version=False,
    )



# manually encoded versions that can't be directly looked up
DATE_FIX = {"22": "2016-09-28", "24": "2018-05-01"}



[docs]
def get_date(
    version: VersionHint | None = None, *, prefix: Sequence[str] | None = None, **kwargs: Any
) -> str:
    """Get the date of a given version."""
    version_info = _get_version_info(version, prefix=prefix)
    if version_info.version in DATE_FIX:
        return DATE_FIX[version_info.version]

    path = download_readme(version=version_info, return_version=False, **kwargs)
    try:
        date_p = next(
            line for line in path.read_text().splitlines() if line.startswith(DATE_PREFIX)
        )
        date_p = date_p.removeprefix(DATE_PREFIX)
        date_p = date_p.lstrip()
        date_p = date_p.replace(" ", "")  # for v14, which has a typo
    except StopIteration:
        return ""  # happens on 22.1 and 24.1
    else:
        day, month, year = date_p.split("/")
        return f"{year}-{month}-{day}"



# docstr-coverage:excused `overload`
@overload
def download_uniprot_mapping(
    version: VersionHint | None = ...,
    *,
    prefix: Sequence[str] | None = ...,
    return_version: Literal[False] = ...,
) -> Path: ...


# docstr-coverage:excused `overload`
@overload
def download_uniprot_mapping(
    version: VersionHint | None = ...,
    *,
    prefix: Sequence[str] | None = ...,
    return_version: Literal[True] = ...,
) -> VersionPathPair: ...



[docs]
def download_uniprot_mapping(
    version: VersionHint | None = None,
    *,
    prefix: Sequence[str] | None = None,
    return_version: bool = False,
) -> Path | VersionPathPair:
    """Ensure the latest ChEMBL-UniProt target mapping TSV file.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true if you're
        looking up the latest version and want to reduce redundant code.

    :returns: If ``return_version`` is true, return a pair of the version and the local
        file path to the downloaded ``*.txt`` file. Otherwise, just return the path.
    """
    return _download_helper(
        "chembl_uniprot_mapping.txt",
        version=version,
        prefix=prefix,
        return_version=return_version,
        filename_repeats_version=False,
    )




[docs]
def get_uniprot_mapping_df(
    version: VersionHint | None = None,
    *,
    prefix: Sequence[str] | None = None,
) -> pandas.DataFrame:
    """Download and parse the latest ChEMBL-UniProt target mapping TSV file.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use

    :returns: A dataframe with four columns:

        1. ``uniprot_id``
        2. ``chembl_target_id``
        3. ``name``, the name from ChEMBL
        4. ``type``, which can have one of the following values:

           - ``CHIMERIC PROTEIN``
           - ``NUCLEIC-ACID``
           - ``PROTEIN COMPLEX``
           - ``PROTEIN COMPLEX GROUP``
           - ``PROTEIN FAMILY``
           - ``PROTEIN NUCLEIC-ACID COMPLEX``
           - ``PROTEIN-PROTEIN INTERACTION``
           - ``SELECTIVITY GROUP``
           - ``SINGLE PROTEIN``
    """
    import pandas as pd

    path = download_uniprot_mapping(version=version, prefix=prefix, return_version=False)
    df = pd.read_csv(
        path,
        sep="\t",
        skiprows=1,
        header=None,
        names=["uniprot_id", "chembl_target_id", "name", "type"],
    )
    return df



class SummaryTuple(NamedTuple):
    """A summary tuple."""

    version: str
    date: str
    compounds: int
    named_compounds: int
    assays: int
    activities: int
    documents: int
    targets: int
    cells: int
    tissues: int
    drug_warnings: int
    drug_indications: int
    drug_mechanisms: int
    # TODO count drugs



[docs]
def summarize(
    version: VersionHint | None = None, *, prefix: Sequence[str] | None = None
) -> SummaryTuple:
    """Get a summary for a given version of ChEMBL."""
    version_info = _get_version_info(version, prefix)

    if version_info._pre_molecule_dictionary():
        compound_sql = "SELECT COUNT(*) from compounds"
    else:
        compound_sql = queries.COUNT_COMPOUNDS_SQL

    return SummaryTuple(
        version=version_info.version,
        date=get_date(version=version_info),
        compounds=_count(compound_sql, version_info=version_info),
        named_compounds=_count(queries.COUNT_NAMED_COMPOUNDS_SQL, version_info=version_info),
        assays=_count(queries.COUNT_ASSAYS_SQL, version_info=version_info),
        activities=_count(queries.COUNT_ACTIVITIES_SQL, version_info=version_info),
        documents=_count(queries.COUNT_DOCUMENTS_SQL, version_info=version_info),
        targets=_count(queries.COUNT_TARGETS_SQL, version_info=version_info),
        cells=_count(queries.COUNT_CELLS_SQL, version_info=version_info),
        tissues=_count(queries.COUNT_TISSUES_SQL, version_info=version_info),
        drug_warnings=_count(queries.COUNT_DRUG_WARNINGS_SQL, version_info=version_info),
        drug_indications=_count(queries.COUNT_DRUG_INDICATIONS_SQL, version_info=version_info),
        drug_mechanisms=_count(queries.COUNT_DRUG_MECHANISMS_SQL, version_info=version_info),
    )



def _count(sql: str, version_info: VersionInfo) -> int:
    try:
        rv = query_scalar(sql, version=version_info)
    except OSError:  # pandas.errors.DatabaseError
        return 0
    else:
        return cast(int, rv)