Source code for chembl_downloader.queries

# -*- coding: utf-8 -*-

"""A collection of query strings for ChEMBL."""

from textwrap import dedent
from typing import Optional

__all__ = [
    "ID_NAME_QUERY",
    "ACTIVITIES_QUERY",
    "DRUG_INDICATIONS_SQL",
    "CHEBI_UNMAPPED_SQL",
    # Functions
    "get_assay_sql",
    "get_target_sql",
    "get_document_molecule_sql",
]


def markdown(s: str):
    """Get a markdown object for pretty display in Jupyter."""
    from IPython.display import Markdown

    return Markdown(f"```sql\n{s.lstrip()}```")


#: This query yields tuples of ChEMBL identifiers and their preferred names, omitting
#: pairs where there is no name or if there's no structure
ID_NAME_QUERY = """\
SELECT
    MOLECULE_DICTIONARY.chembl_id,
    MOLECULE_DICTIONARY.pref_name
FROM MOLECULE_DICTIONARY
JOIN COMPOUND_STRUCTURES ON MOLECULE_DICTIONARY.molregno == COMPOUND_STRUCTURES.molregno
WHERE molecule_dictionary.pref_name IS NOT NULL
"""

#: This query returns five examples from the molecule dictionary table
ID_NAME_QUERY_EXAMPLE = ID_NAME_QUERY + "\nLIMIT 5"

ACTIVITIES_QUERY = """\
SELECT
    COMPOUND_STRUCTURES.canonical_smiles,
    MOLECULE_DICTIONARY.chembl_id,
    ACTIVITIES.BAO_ENDPOINT,
    ACTIVITIES.STANDARD_RELATION,
    ACTIVITIES.STANDARD_VALUE,
    ACTIVITIES.STANDARD_UNITS,
    ASSAYS.ASSAY_TYPE,
    TARGET_DICTIONARY.organism,
    ASSAYS.TID,
    TARGET_DICTIONARY.chembl_id as target_chembl_id,
    TARGET_DICTIONARY.pref_name
FROM MOLECULE_DICTIONARY
JOIN COMPOUND_STRUCTURES ON MOLECULE_DICTIONARY.molregno == COMPOUND_STRUCTURES.molregno
JOIN ACTIVITIES ON MOLECULE_DICTIONARY.molregno == ACTIVITIES.molregno
JOIN ASSAYS ON ACTIVITIES.ASSAY_ID == ASSAYS.ASSAY_ID
JOIN TARGET_DICTIONARY ON ASSAYS.TID == TARGET_DICTIONARY.TID
WHERE
    assay_type in ('B','F')
    and ACTIVITIES.standard_value is not null
    and ACTIVITIES.standard_units = 'nM'
    and ACTIVITIES.standard_relation is not null
    and ACTIVITIES.standard_type = 'IC50'
    and ACTIVITIES.standard_relation = '='
"""


[docs] def get_assay_sql(assay_chembl_id: str) -> str: """Get the SQL for the given assay.""" return dedent( f"""\ SELECT COMPOUND_STRUCTURES.canonical_smiles, MOLECULE_DICTIONARY.chembl_id, ACTIVITIES.STANDARD_TYPE, ACTIVITIES.STANDARD_RELATION, ACTIVITIES.STANDARD_VALUE, ACTIVITIES.STANDARD_UNITS FROM MOLECULE_DICTIONARY JOIN COMPOUND_STRUCTURES ON MOLECULE_DICTIONARY.molregno == COMPOUND_STRUCTURES.molregno JOIN ACTIVITIES ON MOLECULE_DICTIONARY.molregno == ACTIVITIES.molregno JOIN ASSAYS ON ACTIVITIES.ASSAY_ID == ASSAYS.ASSAY_ID WHERE ASSAYS.chembl_id = '{assay_chembl_id}' and ACTIVITIES.standard_value is not null and ACTIVITIES.standard_relation is not null and ACTIVITIES.standard_relation = '=' """ # noqa: S608 )
[docs] def get_target_sql( target_id: str, target_type: Optional[str] = None, standard_relation: Optional[str] = None, standard_type: Optional[str] = None, tax_id: Optional[str] = None, max_phase: bool = False, ) -> str: """Get the SQL for all chemicals inhibiting the target.""" ar = ( "" if standard_relation is None else f"AND ACTIVITIES.standard_relation = '{standard_relation}'" ) st = "" if standard_relation is None else f"AND ACTIVITIES.standard_type = '{standard_type}'" tt = "" if target_type is None else f"AND TARGET_DICTIONARY.target_type = '{target_type}'" tax = "" if tax_id is None else f"AND TARGET_DICTIONARY.tax_id = '{tax_id}'" mp = "\n MOLECULE_DICTIONARY.max_phase," if max_phase else "" return dedent( f"""\ SELECT ASSAYS.chembl_id AS assay_chembl_id, TARGET_DICTIONARY.target_type, TARGET_DICTIONARY.tax_id, COMPOUND_STRUCTURES.canonical_smiles, MOLECULE_DICTIONARY.chembl_id AS molecule_chembl_id,{mp} ACTIVITIES.standard_type, ACTIVITIES.pchembl_value FROM TARGET_DICTIONARY JOIN ASSAYS ON TARGET_DICTIONARY.tid == ASSAYS.tid JOIN ACTIVITIES ON ASSAYS.assay_id == ACTIVITIES.assay_id JOIN MOLECULE_DICTIONARY ON MOLECULE_DICTIONARY.molregno == ACTIVITIES.molregno JOIN COMPOUND_STRUCTURES ON MOLECULE_DICTIONARY.molregno == COMPOUND_STRUCTURES.molregno WHERE TARGET_DICTIONARY.chembl_id = '{target_id}' AND ACTIVITIES.pchembl_value IS NOT NULL {tt} {ar} {st} {tax} """ # noqa: S608 ).strip()
DRUG_INDICATIONS_SQL = """\ SELECT MOLECULE_DICTIONARY.chembl_id, MOLECULE_DICTIONARY.pref_name, MOLECULE_DICTIONARY.chebi_par_id, DRUG_INDICATION.mesh_id, DRUG_INDICATION.mesh_heading, DRUG_INDICATION.efo_id AS indication_curie, DRUG_INDICATION.efo_term AS indication_label, DRUG_INDICATION.max_phase_for_ind FROM MOLECULE_DICTIONARY JOIN DRUG_INDICATION ON MOLECULE_DICTIONARY.molregno == DRUG_INDICATION.molregno """ #: A query for ChEMBL molecules that are unmapped to ChEBI CHEBI_UNMAPPED_SQL = """\ SELECT chembl_id, pref_name FROM MOLECULE_DICTIONARY WHERE chebi_par_id IS NULL AND pref_name IS NOT NULL """ #: Return the count of molecules COUNT_QUERY_SQL = """\ SELECT COUNT(MOLECULE_DICTIONARY.chembl_id) as count FROM MOLECULE_DICTIONARY JOIN COMPOUND_STRUCTURES ON MOLECULE_DICTIONARY.molregno == COMPOUND_STRUCTURES.molregno WHERE molecule_dictionary.pref_name IS NOT NULL """
[docs] def get_document_molecule_sql(document_chembl_id: str) -> str: """Get all molecules mentioned in a document.""" return dedent( f"""\ SELECT DISTINCT MOLECULE_DICTIONARY.chembl_id, COMPOUND_RECORDS.compound_name, COMPOUND_STRUCTURES.canonical_smiles FROM DOCS JOIN COMPOUND_RECORDS ON COMPOUND_RECORDS.doc_id == DOCS.doc_id JOIN MOLECULE_DICTIONARY ON MOLECULE_DICTIONARY.molregno == COMPOUND_RECORDS.molregno JOIN COMPOUND_STRUCTURES ON COMPOUND_RECORDS.molregno == COMPOUND_STRUCTURES.molregno WHERE DOCS.chembl_id = '{document_chembl_id}' """ # noqa: S608 )