Source code for chembl_downloader.api

# -*- coding: utf-8 -*-

"""API for :mod:`chembl_downloader`."""

import ftplib
import gzip
import io
import logging
import os
import pickle
import sqlite3
import tarfile
from contextlib import closing, contextmanager
from pathlib import Path
from typing import TYPE_CHECKING, Iterable, List, Optional, Sequence, Tuple, Union, cast
from xml.etree import ElementTree

import pystow
from tqdm import tqdm

if TYPE_CHECKING:
    import pandas

# Public names of this module, grouped by the kind of ChEMBL resource they handle.
__all__ = [
    "latest",
    "versions",
    "download_readme",
    "get_date",
    # Database
    "download_sqlite",
    "download_extract_sqlite",
    "connect",
    "cursor",
    "query",
    # SDF
    "download_sdf",
    "supplier",
    "iterate_smiles",
    "get_substructure_library",
    # Chemreps
    "download_chemreps",
    "get_chemreps_df",
    # Fingerprints
    "download_fps",
    "chemfp_load_fps",
    # Monomers
    "download_monomer_library",
    "get_monomer_library_root",
    # UniProt mappings
    "download_uniprot_mapping",
    "get_uniprot_mapping_df",
]

# Module-level logger, named after this module
logger = logging.getLogger(__name__)

#: The default path inside the :mod:`pystow` directory
PYSTOW_PARTS = ["chembl"]
#: Prefix of the README line that carries the release name, matched by :func:`latest`
RELEASE_PREFIX = "* Release:"
#: Prefix of the README line that carries the release date, stripped by :func:`get_date`
DATE_PREFIX = "* Date:"


def _removeprefix(s: str, prefix: str) -> str:
    if s.startswith(prefix):
        return s[len(prefix) :]
    return s


def latest() -> str:
    """Get the latest version of ChEMBL as a string.

    The README of the ``latest`` release is fetched over anonymous FTP and
    scanned for the line that starts with ``RELEASE_PREFIX``.

    :returns: The latest version string of ChEMBL
    :raises ValueError: If the latest README can not be parsed
    """
    buffer = io.BytesIO()
    with ftplib.FTP("ftp.ebi.ac.uk") as ftp:
        ftp.login()
        ftp.retrbinary("RETR pub/databases/chembl/ChEMBLdb/latest/README", buffer.write)

    readme = buffer.getvalue().decode("utf-8")
    for line in readme.split("\n"):
        if not line.startswith(RELEASE_PREFIX):
            continue
        # The remainder of the line looks like "chembl_<version>"; keep only the version
        release = _removeprefix(line, RELEASE_PREFIX).strip()
        return _removeprefix(release, "chembl_")
    raise ValueError("could not find latest ChEMBL version")
def versions() -> List[str]:
    """Get all versions of ChEMBL.

    :returns: Every version from ``01`` up to the latest one (zero-padded to two
        digits) plus the side versions ``22_1`` and ``24_1``, sorted in descending
        string order.
    """
    newest = int(latest())
    version_list = [f"{number:02d}" for number in range(1, newest + 1)]
    # Side version in ChEMBL
    version_list += ["22_1", "24_1"]
    version_list.sort(reverse=True)
    return version_list
def _download_helper(
    suffix: str,
    version: Optional[str] = None,
    prefix: Optional[Sequence[str]] = None,
    *,
    return_version: bool,
    filename_repeats_version: bool = True,
) -> Union[Path, Tuple[str, Path]]:
    """Ensure the latest ChEMBL file with the given suffix is downloaded.

    :param suffix: The suffix of the file
    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true if you're
        looking up the latest version and want to reduce redundant code.
    :param filename_repeats_version: True if filename contains ``chembl_<version>``
        in the beginning. Set to false to allow downloading arbitrarily named files.
    :return: If ``return_version`` is true, return a pair of the version and the local
        file path to the downloaded file. Otherwise, just return the path.
    :raises ValueError: If file could not be downloaded
    """
    if version is None:
        version = latest()

    # for versions 22.1 and 24.1, it's important to canonicalize the version number
    # for versions < 10 it's important to left pad with a zero
    fmt_version = version.replace(".", "_").zfill(2)

    base = f"ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_{fmt_version}"
    filename = f"chembl_{fmt_version}{suffix}" if filename_repeats_version else suffix
    parts = prefix or PYSTOW_PARTS

    # Try the release directory first, then its "archived" sub-directory
    candidates = (f"{base}/{filename}", f"{base}/archived/{filename}")
    for url in candidates:
        try:
            path = pystow.ensure(*parts, fmt_version, url=url)
        except IOError:
            continue
        return (version, path) if return_version else path

    raise ValueError(f"could not find {filename} in data for ChEMBL {fmt_version} in {base}")
def download_sqlite(
    version: Optional[str] = None,
    prefix: Optional[Sequence[str]] = None,
    return_version: bool = False,
) -> Union[Path, Tuple[str, Path]]:
    """Ensure the latest ChEMBL SQLite dump is downloaded.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true if you're
        looking up the latest version and want to reduce redundant code.
    :return: If ``return_version`` is true, return a pair of the version and the local
        file path to the downloaded ``*.tar.gz`` file. Otherwise, just return the path.
    """
    return _download_helper(
        "_sqlite.tar.gz",
        version,
        prefix,
        return_version=return_version,
    )
def download_extract_sqlite(
    version: Optional[str] = None,
    *,
    prefix: Optional[Sequence[str]] = None,
    return_version: bool = False,
) -> Union[Path, Tuple[str, Path]]:
    """Ensure the latest ChEMBL SQLite dump is downloaded and extracted.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true if you're
        looking up the latest version and want to reduce redundant code.
    :return: If ``return_version`` is true, return a pair of the version and the local
        file path to the downloaded ChEMBL SQLite database file. Otherwise, just
        return the path.
    :raises FileNotFoundError: If no database file could be found in the extracted
        directories
    """
    if version is not None:
        # Fast path: skip the download step if a database was already extracted.
        # Downloads are cached under the canonical version name (dots replaced by
        # underscores, left-padded with a zero), so look there first and fall back
        # to the version exactly as given. ``dict.fromkeys`` removes the duplicate
        # when both spellings are identical while keeping the order.
        fmt_version = version.replace(".", "_").zfill(2)
        for name in dict.fromkeys((fmt_version, version)):
            _directory = pystow.join(*(prefix or PYSTOW_PARTS), name)
            if not _directory.is_dir():
                continue
            rv = _find_sqlite_file(_directory)
            if rv:
                if return_version:
                    return version, rv
                return rv

    version, path = cast(
        Tuple[str, Path], download_sqlite(version=version, prefix=prefix, return_version=True)
    )

    # Extraction will be done in the same directory as the download.
    # All ChEMBL SQLite dumps have the same internal folder structure,
    # so assume there's going to be a directory here
    directory = path.parent.joinpath("data")
    if not directory.is_dir():
        logger.info("unarchiving %s to %s", path, directory)
        with tarfile.open(path, mode="r", encoding="utf-8") as tar_file:
            tar_file.extractall(directory)  # noqa:S202
    else:
        logger.debug("did not re-unarchive %s to %s", path, directory)

    rv = _find_sqlite_file(directory)
    if rv is None:
        raise FileNotFoundError("could not find a .db file in the ChEMBL archive")
    elif return_version:
        return version, rv
    else:
        return rv
def _find_sqlite_file(directory: Union[str, Path]) -> Optional[Path]: # Since the structure of the zip changes from version to version, # it's better to just walk through the unarchived folders recursively # and find the DB file for root, _dirs, files in os.walk(directory): for file in files: if not file.endswith(".db"): continue rv = Path(root).joinpath(file) return rv return None
@contextmanager
def connect(version: Optional[str] = None, *, prefix: Optional[Sequence[str]] = None):
    """Ensure and connect to the database.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :yields: The SQLite connection object.

    Example:

    .. code-block:: python

        from contextlib import closing

        import chembl_downloader

        with chembl_downloader.connect() as conn:
            with closing(conn.cursor()) as cursor:
                cursor.execute(...)
    """
    db_path = cast(
        Path, download_extract_sqlite(version=version, prefix=prefix, return_version=False)
    )
    connection = sqlite3.connect(db_path.as_posix())
    try:
        yield connection
    finally:
        # Always release the connection, even if the caller's block raised
        connection.close()
@contextmanager
def cursor(version: Optional[str] = None, *, prefix: Optional[Sequence[str]] = None):
    """Ensure, connect, and get a cursor from the database.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :yields: The SQLite cursor object.

    Example:

    .. code-block:: python

        import chembl_downloader

        with chembl_downloader.cursor() as cursor:
            cursor.execute(...)
    """
    # The cursor is closed first, then the connection it belongs to
    with connect(version=version, prefix=prefix) as conn, closing(conn.cursor()) as db_cursor:
        yield db_cursor
def query(
    sql: str, version: Optional[str] = None, *, prefix: Optional[Sequence[str]] = None, **kwargs
) -> "pandas.DataFrame":
    """Ensure the data is available, run the query, then put the results in a dataframe.

    :param sql: A SQL query string or table name
    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param kwargs: keyword arguments to pass through to :func:`pandas.read_sql`, such as
        ``index_col``.
    :return: A dataframe

    Example:

    .. code-block:: python

        import chembl_downloader
        from chembl_downloader.queries import ID_NAME_QUERY_EXAMPLE

        df = chembl_downloader.query(ID_NAME_QUERY_EXAMPLE)
    """
    # Imported lazily so pandas is only needed when this function is called
    import pandas as pd

    with connect(version=version, prefix=prefix) as con:
        result = pd.read_sql(sql, con=con, **kwargs)
    return result
def download_fps(
    version: Optional[str] = None,
    *,
    prefix: Optional[Sequence[str]] = None,
    return_version: bool = False,
) -> Union[Path, Tuple[str, Path]]:
    """Ensure the latest ChEMBL fingerprints file is downloaded.

    This file contains 2048 bit radius 2 morgan fingerprints.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true if you're
        looking up the latest version and want to reduce redundant code.
    :return: If ``return_version`` is true, return a pair of the version and the local
        file path to the downloaded ``*.fps.gz`` file. Otherwise, just return the path.
    """
    return _download_helper(
        ".fps.gz",
        version,
        prefix,
        return_version=return_version,
    )
def chemfp_load_fps(
    version: Optional[str] = None, *, prefix: Optional[Sequence[str]] = None, **kwargs
):
    """Ensure the ChEMBL fingerprints file is downloaded and open with :func:`chemfp.load_fingerprints`.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param kwargs: Remaining keyword arguments are passed into :func:`chemfp.load_fingerprints`.
    :return: A fingerprint arena object
    :rtype: chemfp.arena.FingerprintArena
    """
    # Imported lazily so chemfp is only needed when this function is called
    import chemfp

    fps_path = download_fps(version=version, prefix=prefix, return_version=False)
    arena = chemfp.load_fingerprints(fps_path, **kwargs)
    return arena
def download_chemreps(
    version: Optional[str] = None,
    *,
    prefix: Optional[Sequence[str]] = None,
    return_version: bool = False,
) -> Union[Path, Tuple[str, Path]]:
    """Ensure the latest ChEMBL chemical representations file is downloaded.

    This file is tab-separated and has four columns:

    1. ``chembl_id``
    2. ``canonical_smiles``
    3. ``standard_inchi``
    4. ``standard_inchi_key``

    If you want to directly parse it with :mod:`pandas`, use :func:`get_chemreps_df`.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true if you're
        looking up the latest version and want to reduce redundant code.
    :return: If ``return_version`` is true, return a pair of the version and the local
        file path to the downloaded ``*_chemreps.txt.gz`` file. Otherwise, just return
        the path.
    """
    # The suffix becomes part of the remote file name, so it must not carry any
    # surrounding whitespace.
    return _download_helper(
        suffix="_chemreps.txt.gz", version=version, prefix=prefix, return_version=return_version
    )
def get_chemreps_df(
    version: Optional[str] = None, *, prefix: Optional[Sequence[str]] = None
) -> "pandas.DataFrame":
    """Download and parse the latest ChEMBL chemical representations file.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :return: A dataframe with four columns:

        1. ``chembl_id``
        2. ``canonical_smiles``
        3. ``standard_inchi``
        4. ``standard_inchi_key``
    """
    # Imported lazily so pandas is only needed when this function is called
    import pandas

    chemreps_path = cast(
        Path, download_chemreps(version=version, prefix=prefix, return_version=False)
    )
    return pandas.read_csv(chemreps_path, sep="\t", compression="gzip")
def download_sdf(
    version: Optional[str] = None,
    *,
    prefix: Optional[Sequence[str]] = None,
    return_version: bool = False,
) -> Union[Path, Tuple[str, Path]]:
    """Ensure the latest ChEMBL SDF dump is downloaded.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true if you're
        looking up the latest version and want to reduce redundant code.
    :return: If ``return_version`` is true, return a pair of the version and the local
        file path to the downloaded ``*.sdf.gz`` file. Otherwise, just return the path.
    """
    return _download_helper(
        ".sdf.gz",
        version,
        prefix,
        return_version=return_version,
    )
def download_monomer_library(
    version: Optional[str] = None,
    *,
    prefix: Optional[Sequence[str]] = None,
    return_version: bool = False,
) -> Union[Path, Tuple[str, Path]]:
    """Ensure the latest ChEMBL monomer library is downloaded.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true if you're
        looking up the latest version and want to reduce redundant code.
    :return: If ``return_version`` is true, return a pair of the version and the local
        file path to the downloaded ``*_monomer_library.xml`` file. Otherwise, just
        return the path.
    """
    return _download_helper(
        "_monomer_library.xml",
        version,
        prefix,
        return_version=return_version,
    )
def get_monomer_library_root(
    version: Optional[str] = None,
    *,
    prefix: Optional[Sequence[str]] = None,
) -> ElementTree.Element:
    """Ensure the latest ChEMBL monomer library is downloaded and parse its root with :mod:`xml`.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :return: Return the root of the monomers XML tree, parsed
    """
    xml_path = cast(
        Path, download_monomer_library(version=version, prefix=prefix, return_version=False)
    )
    return ElementTree.parse(xml_path).getroot()
@contextmanager
def supplier(
    version: Optional[str] = None,
    *,
    prefix: Optional[Sequence[str]] = None,
    **kwargs,
):
    """Get a :class:`rdkit.Chem.ForwardSDMolSupplier` for the given version of ChEMBL.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param kwargs: keyword arguments to pass through to :class:`rdkit.Chem.ForwardSDMolSupplier`,
        such as ``sanitize`` and ``removeHs``.
    :yields: A supplier to be used in a context manager

    In the following example, a supplier is used to get fingerprints and SMILES.

    .. code-block:: python

        from rdkit import Chem

        import chembl_downloader

        data = []
        with chembl_downloader.supplier() as suppl:
            for i, mol in enumerate(suppl):
                if mol is None or mol.GetNumAtoms() > 50:
                    continue
                fp = Chem.PatternFingerprint(mol, fpSize=1024, tautomerFingerprints=True)
                smi = Chem.MolToSmiles(mol)
                data.append((smi, fp))
    """
    # Imported lazily so RDKit is only needed when this function is called
    from rdkit import Chem

    sdf_path = cast(Path, download_sdf(version=version, prefix=prefix, return_version=False))
    handle = gzip.open(sdf_path)
    try:
        yield Chem.ForwardSDMolSupplier(handle, **kwargs)
    finally:
        # The supplier reads from the open handle, so close it only once the caller is done
        handle.close()
def iterate_smiles(
    version: Optional[str] = None,
    *,
    prefix: Optional[Sequence[str]] = None,
    **kwargs,
) -> Iterable[str]:
    """Iterate over SMILES via RDKit.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param kwargs: keyword arguments to pass through to :func:`supplier`
    :yields: The non-empty SMILES string of each molecule the supplier could parse
    """
    # Imported lazily so RDKit is only needed when this function is called
    from rdkit import Chem

    with supplier(version=version, prefix=prefix, **kwargs) as suppl:
        # Entries the supplier yields as ``None`` are skipped
        molecules = (mol for mol in suppl if mol is not None)
        for smiles in map(Chem.MolToSmiles, molecules):
            if smiles:
                yield smiles
def get_substructure_library(
    version: Optional[str] = None,
    *,
    max_heavy: int = 75,
    prefix: Optional[Sequence[str]] = None,
    **kwargs,
):
    """Get the ChEMBL substructure library.

    The library is built once from the SDF dump and then cached as a pickle
    (``ssslib.pkl``) inside the :mod:`pystow` directory of the given version;
    later calls load that pickle instead of rebuilding.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param max_heavy: The largest number of heavy atoms that are considered before
        skipping the molecule.
    :param prefix: The directory inside :mod:`pystow` to use
    :param kwargs: keyword arguments to pass through to :class:`rdkit.Chem.ForwardSDMolSupplier`,
        such as ``sanitize`` and ``removeHs`` via :func:`supplier`.
    :returns: A substructure library object
    :rtype: rdkit.Chem.rdSubstructLibrary.SubstructLibrary

    .. seealso::

        https://greglandrum.github.io/rdkit-blog/tutorial/substructure/2021/12/20/substructlibrary-search-order.html
    """
    # Requires minimum version of v2021.09
    from rdkit.Chem.rdSubstructLibrary import (
        CachedTrustedSmilesMolHolder,
        KeyFromPropHolder,
        SubstructLibrary,
        TautomerPatternHolder,
    )

    if version is None:
        version = latest()

    cache_path = pystow.join(*(prefix or PYSTOW_PARTS), version, name="ssslib.pkl")
    if cache_path.is_file():
        logger.info("loading substructure library from pickle: %s", cache_path)
        # NOTE: unpickling executes code from the file; the cache directory must be trusted
        with cache_path.open("rb") as file:
            return pickle.load(file)

    library = SubstructLibrary(
        CachedTrustedSmilesMolHolder(),
        TautomerPatternHolder(),
        KeyFromPropHolder(),
    )
    with supplier(version=version, prefix=prefix, **kwargs) as suppl:
        progress = tqdm(
            suppl, unit="molecule", unit_scale=True, desc="Building substructure library"
        )
        for mol in progress:
            # skip unparsable entries and huge molecules
            if mol is None or mol.GetNumHeavyAtoms() > max_heavy:
                continue
            library.AddMol(mol)

    with cache_path.open("wb") as file:
        pickle.dump(library, file, protocol=pickle.HIGHEST_PROTOCOL)
    return library
def download_readme(
    version: Optional[str] = None,
    *,
    prefix: Optional[Sequence[str]] = None,
    return_version: bool = False,
) -> Union[Path, Tuple[str, Path]]:
    """Ensure the latest ChEMBL README is downloaded.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true if you're
        looking up the latest version and want to reduce redundant code.
    :return: If ``return_version`` is true, return a pair of the version and the local
        file path to the downloaded ``README`` file. Otherwise, just return the path.
    """
    # The README is not named after the release, so the file name is used verbatim
    return _download_helper(
        suffix="README",
        version=version,
        prefix=prefix,
        filename_repeats_version=False,
        return_version=return_version,
    )
def get_date(version: str, **kwargs) -> str:
    """Get the date of a given version.

    The README of the release is downloaded and searched for the first line that
    starts with ``DATE_PREFIX``. The remainder of that line is split on ``/`` into
    day, month, and year and re-ordered.

    :param version: The version number of ChEMBL whose README is read
    :param kwargs: keyword arguments to pass through to :func:`download_readme`
    :return: The date as ``<year>-<month>-<day>``, or an empty string if the README
        has no date line
    :raises ValueError: If the date line does not consist of exactly three
        ``/``-separated parts
    """
    path = cast(Path, download_readme(version=version, **kwargs))
    for line in path.read_text().splitlines():
        if line.startswith(DATE_PREFIX):
            # Strip both sides so trailing whitespace does not end up in the year
            date_p = _removeprefix(line, DATE_PREFIX).strip()
            break
    else:
        return ""  # happens on 22.1 and 24.1
    day, month, year = date_p.split("/")
    return f"{year}-{month}-{day}"
def download_uniprot_mapping(
    version: Optional[str] = None,
    *,
    prefix: Optional[Sequence[str]] = None,
    return_version: bool = False,
):
    """Ensure the latest ChEMBL-UniProt target mapping TSV file is downloaded.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true if you're
        looking up the latest version and want to reduce redundant code.
    :return: If ``return_version`` is true, return a pair of the version and the local
        file path to the downloaded ``*.txt`` file. Otherwise, just return the path.
    """
    # The mapping file is not named after the release, so the file name is used verbatim
    return _download_helper(
        suffix="chembl_uniprot_mapping.txt",
        version=version,
        prefix=prefix,
        filename_repeats_version=False,
        return_version=return_version,
    )
def get_uniprot_mapping_df(
    version: Optional[str] = None,
    *,
    prefix: Optional[Sequence[str]] = None,
) -> "pandas.DataFrame":
    """Download and parse the latest ChEMBL-UniProt target mapping TSV file.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :return: A dataframe with four columns:

        1. ``uniprot_id``
        2. ``chembl_target_id``
        3. ``name``, the name from ChEMBL
        4. ``type``, which can have one of the following values:

           - ``CHIMERIC PROTEIN``
           - ``NUCLEIC-ACID``
           - ``PROTEIN COMPLEX``
           - ``PROTEIN COMPLEX GROUP``
           - ``PROTEIN FAMILY``
           - ``PROTEIN NUCLEIC-ACID COMPLEX``
           - ``PROTEIN-PROTEIN INTERACTION``
           - ``SELECTIVITY GROUP``
           - ``SINGLE PROTEIN``
    """
    # Imported lazily so pandas is only needed when this function is called
    import pandas as pd

    mapping_path = cast(
        Path, download_uniprot_mapping(version=version, prefix=prefix, return_version=False)
    )
    # The first row of the file is skipped and replaced by these column names
    columns = ["uniprot_id", "chembl_target_id", "name", "type"]
    return pd.read_csv(mapping_path, sep="\t", skiprows=1, header=None, names=columns)