# Source code for chembl_downloader.contrib

"""Extended functionality not in main scope of chembl-downloader."""

from __future__ import annotations

from collections.abc import Sequence
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal

import click

from chembl_downloader.api import VersionHint, _get_version_info, query
from chembl_downloader.queries import (
    get_assay_sql,
    get_document_molecule_sql,
    get_target_sql,
)

if TYPE_CHECKING:
    import pandas

__all__ = [
    "get_assay_smi_df",
    "get_document_smi_df",
    "get_target_smi_df",
    "write_assay_smi_file",
    "write_document_smi_file",
    "write_target_smi_file",
]



[docs]
def get_target_smi_df(
    target_id: str,
    *,
    version: VersionHint | None = None,
    prefix: Sequence[str] | None = None,
    refresh: bool = False,
    standard_relation: str | None = None,
    standard_type: str | None = None,
    aggregate: Literal["mean", "gmean"] | None = "mean",
    **kwargs: Any,
) -> pandas.DataFrame:
    """Get bioactivities for compounds against the given target (from :func:`get_target_sql`).

    :param target_id: ChEMBL identifier for the target. For example, use CHEMBL1867 for
        the human A2A receptor.
    :param version: The version of ChEMBL to use. If not given, uses the latest version.
    :param prefix: Forwarded unchanged to the version lookup, whose result decides where
        the cache file lives.
    :param refresh: If true, rebuild the cached file.
    :param standard_relation: Relation type filter, applied in the SQL query (i.e.,
        before aggregation). For example, can be "="
    :param standard_type: Assay type filter, applied in the SQL query (i.e., before
        aggregation). For example, can be "IC50"
    :param aggregate: The aggregation to use (either "mean" or "gmean" for geometric
        mean). If none, do not do aggregation.
    :param kwargs: Remaining keyword arguments to pass through to :func:`get_target_sql`

    :returns: A dataframe. When aggregating, it has the columns ``canonical_smiles``,
        ``molecule_chembl_id``, and ``pchembl_value``; otherwise it is the query result.

    :raises ValueError: If an unknown ``aggregate`` value is given. This is checked
        before any query is run or any file is written.

    Note, this caches the unaggregated query result as a SMI file for later reuse. The
    filters are part of the SQL query, so the cache file name includes a digest of every
    query option that is not ``None``. Calls with different filters therefore never read
    each other's cached rows. A call without any query options keeps using
    ``<target_id>.smi``.
    """
    import hashlib

    import pandas as pd

    # Fail fast: reject a bad ``aggregate`` before paying for a database query.
    if aggregate not in (None, "mean", "gmean"):
        raise ValueError(f"unknown aggregate: {aggregate}")

    version_info = _get_version_info(version=version, prefix=prefix)

    # Everything that shapes the SQL must also shape the cache key, otherwise a cache
    # written with one set of filters would be served for another.
    query_options = {
        "standard_relation": standard_relation,
        "standard_type": standard_type,
        **kwargs,
    }
    active_options = sorted(
        (key, value) for key, value in query_options.items() if value is not None
    )
    if active_options:
        # A digest keeps the file name short and free of characters such as "<" or "="
        # that are awkward or invalid in file names on some platforms.
        digest = hashlib.sha256(repr(active_options).encode("utf-8")).hexdigest()[:12]
        name = f"{target_id}-{digest}.smi"
    else:
        name = f"{target_id}.smi"

    path = version_info.module.join("targets", name=name)
    if path.is_file() and not refresh:
        df = pd.read_csv(path)
    else:
        sql = get_target_sql(
            target_id,
            standard_relation=standard_relation,
            standard_type=standard_type,
            **kwargs,
        )
        df = query(sql=sql, version=version_info)
        df.to_csv(path, index=False)

    if aggregate is None:
        return df

    # Collapse repeated measurements of the same compound into a single value.
    group_object = df[["canonical_smiles", "molecule_chembl_id", "pchembl_value"]].groupby(
        ["canonical_smiles", "molecule_chembl_id"]
    )
    if aggregate == "gmean":
        # scipy is only needed for the geometric mean, so import it lazily.
        from scipy import stats

        return group_object.agg(stats.gmean)["pchembl_value"].reset_index()
    return group_object.mean(numeric_only=True)["pchembl_value"].reset_index()




[docs]
def write_target_smi_file(
    target_id: str,
    path: Path,
    *,
    version: VersionHint | None = None,
    prefix: Sequence[str] | None = None,
    sep: str = ",",
    **kwargs: Any,
) -> None:
    """Write the bioactivities for a target to a headerless SMI file.

    :param target_id: ChEMBL identifier for the target.
    :param path: Where the file is written.
    :param version: The version of ChEMBL to use, forwarded to :func:`get_target_smi_df`.
    :param prefix: Forwarded to :func:`get_target_smi_df`.
    :param sep: Column separator used in the output file.
    :param kwargs: Remaining keyword arguments to pass through to
        :func:`get_target_smi_df`
    """
    frame = get_target_smi_df(
        target_id=target_id,
        version=version,
        prefix=prefix,
        **kwargs,
    )
    # Neither the index nor the column names are written to the file.
    frame.to_csv(path, sep=sep, header=False, index=False)




[docs]
def get_assay_smi_df(
    assay_chembl_id: str,
    *,
    version: VersionHint | None = None,
    prefix: Sequence[str] | None = None,
    refresh: bool = False,
    **kwargs: Any,
) -> pandas.DataFrame:
    """Get a dataframe for bioactivities in a given assay (from :func:`get_assay_sql`).

    :param assay_chembl_id: ChEMBL identifier for the assay.
    :param version: The version of ChEMBL to use, forwarded to the version lookup.
    :param prefix: Forwarded to the version lookup.
    :param refresh: If true, run the query again even if a cached file exists.
    :param kwargs: Remaining keyword arguments to pass through to :func:`query`

    :returns: The query result, read back from the cache file when one exists and
        ``refresh`` is false.

    The result is cached as ``<assay_chembl_id>.smi`` in the ``assays`` folder of the
    resolved version's module.
    """
    import pandas as pd

    info = _get_version_info(version=version, prefix=prefix)
    cache_path = info.module.join("assays", name=f"{assay_chembl_id}.smi")

    if not cache_path.is_file() or refresh:
        # Cache miss (or forced rebuild): query the database and store the result.
        frame = query(sql=get_assay_sql(assay_chembl_id), version=info, **kwargs)
        frame.to_csv(cache_path, index=False)
        return frame

    return pd.read_csv(cache_path)




[docs]
def write_assay_smi_file(
    assay_chembl_id: str,
    path: Path,
    *,
    version: VersionHint | None = None,
    prefix: Sequence[str] | None = None,
    sep: str = ",",
    **kwargs: Any,
) -> None:
    """Write the bioactivities for an assay to a headerless SMI file.

    :param assay_chembl_id: ChEMBL identifier for the assay.
    :param path: Where the file is written.
    :param version: The version of ChEMBL to use, forwarded to :func:`get_assay_smi_df`.
    :param prefix: Forwarded to :func:`get_assay_smi_df`.
    :param sep: Column separator used in the output file.
    :param kwargs: Remaining keyword arguments to pass through to
        :func:`get_assay_smi_df`
    """
    frame = get_assay_smi_df(
        assay_chembl_id=assay_chembl_id,
        version=version,
        prefix=prefix,
        **kwargs,
    )
    # Neither the index nor the column names are written to the file.
    frame.to_csv(path, sep=sep, header=False, index=False)




[docs]
def get_document_smi_df(
    document_chembl_id: str,
    *,
    version: VersionHint | None = None,
    prefix: Sequence[str] | None = None,
    refresh: bool = False,
    **kwargs: Any,
) -> pandas.DataFrame:
    """Get bioactivities in a given document (from :func:`get_document_molecule_sql`).

    :param document_chembl_id: ChEMBL identifier for the document.
    :param version: The version of ChEMBL to use, forwarded to the version lookup.
    :param prefix: Forwarded to the version lookup.
    :param refresh: If true, run the query again even if a cached file exists.
    :param kwargs: Remaining keyword arguments to pass through to :func:`query`

    :returns: The query result, read back from the cache file when one exists and
        ``refresh`` is false.

    The result is cached as ``<document_chembl_id>.smi`` in the ``documents`` folder of
    the resolved version's module.
    """
    import pandas as pd

    info = _get_version_info(version=version, prefix=prefix)
    cache_path = info.module.join("documents", name=f"{document_chembl_id}.smi")

    if not cache_path.is_file() or refresh:
        # Cache miss (or forced rebuild): query the database and store the result.
        frame = query(
            sql=get_document_molecule_sql(document_chembl_id),
            version=info,
            **kwargs,
        )
        frame.to_csv(cache_path, index=False)
        return frame

    return pd.read_csv(cache_path)




[docs]
def write_document_smi_file(
    document_chembl_id: str,
    path: Path,
    *,
    version: VersionHint | None = None,
    prefix: Sequence[str] | None = None,
    sep: str = ",",
    **kwargs: Any,
) -> None:
    """Write the bioactivities in a document to a headerless SMI file.

    :param document_chembl_id: ChEMBL identifier for the document.
    :param path: Where the file is written.
    :param version: The version of ChEMBL to use, forwarded to
        :func:`get_document_smi_df`.
    :param prefix: Forwarded to :func:`get_document_smi_df`.
    :param sep: Column separator used in the output file.
    :param kwargs: Remaining keyword arguments to pass through to
        :func:`get_document_smi_df`
    """
    frame = get_document_smi_df(
        document_chembl_id=document_chembl_id,
        version=version,
        prefix=prefix,
        **kwargs,
    )
    # Neither the index nor the column names are written to the file.
    frame.to_csv(path, sep=sep, header=False, index=False)



@click.command()
def _main() -> None:
    # No docstring on purpose: click would surface it as the command's help text.
    # The demo file is written next to this module.
    here = Path(__file__).parent.resolve()
    path = here.joinpath("CHEMBL3098111.smi")
    # refresh=True rebuilds the cached file instead of reusing it.
    write_document_smi_file("CHEMBL3098111", path=path, version="31", refresh=True)
    click.echo(f"output to {path}")


# Only run the demo command when this module is executed as a script.
if __name__ == "__main__":
    _main()