library(biomaRt)
library(data.table)
library(rlog)
library(stringr)

#' Species IDs of known replicatively aging species.
species_ids_replicative <- c(
    "bihybrid",
    "btaurus",
    "bthybrid",
    "cfamiliaris",
    "chircus",
    "cjacchus",
    "clfamiliaris",
    "csabaeus",
    "ecaballus",
    "fcatus",
    "ggorilla",
    "hsapiens",
    "lafricana",
    "mfascicularis",
    "mmulatta",
    "mmurinus",
    "mnemestrina",
    "nleucogenys",
    "oaries",
    "pabelii",
    "panubis",
    "ppaniscus",
    "ptroglodytes",
    "sscrofa",
    "tgelada"
)

#' Gene names of genes for verified TPE-OLD genes.
genes_verified_tpe_old <- c(
    "C1S",
    "DSP",
    "ISG15",
    "SORBS2",
    "TERT"
)

#' Gene names of genes with a suggested TPE-OLD.
genes_suggested_tpe_old <- c(
    "AKAP3",
    "ANO2",
    "CCND2",
    "CD163L1",
    "CD9",
    "FOXM1",
    "GALNT8",
    "NDUFA9",
    "TEAD4",
    "TIGAR",
    "TSPAN9"
)

ensembl <- useEnsembl(
    biomart = "ensembl",
    version = 104
)

#' Retrieve information on species.
#'
#' The result will be a `data.table` with the following columns:
#'
#'  - `id` Species ID as presented by Ensembl.
#'  - `name` Human readable species name.
#'  - `replicative` Whether the species is likely to be aging replicatively.
retrieve_species <- function() {
    # Ensembl datasets correspond to distinct species.
    ensembl_datasets <- data.table(listDatasets(ensembl))

    # Filter out species ID and name from the result.
    species <- ensembl_datasets[, .(
        id = str_match(dataset, "(.*)_gene_ensembl")[, 2],
        name = str_match(description, "(.*) genes \\(.*\\)")[, 2]
    )]

    species[, replicative := id %chin% species_ids_replicative]
}

#' Retrieve information on human genes.
#'
#' The result will be a `data.table` with the following columns:
#'
#'  - `id` Ensembl gene ID.
#'  - `ǹame` HGNC name of the gene.
#'  - `chromosome` Human chromosome on which the gene is located.
retrieve_genes <- function() {
    genes <- data.table(getBM(
        attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name"),
        mart = useDataset("hsapiens_gene_ensembl", mart = ensembl)
    ))

    genes[, .(
        id = ensembl_gene_id,
        name = hgnc_symbol,
        chromosome = chromosome_name,
        verified = hgnc_symbol %chin% genes_verified_tpe_old,
        suggested = hgnc_symbol %chin% genes_suggested_tpe_old
    )]
}

#' Retrieve gene distance data.
#'
#' The data will include all available values for the given species and genes.
#' Specific values on naturally or artificially (e.g. due to incomplete
#' sequencing) short chromosomes will be excluded.
#'
#' The result will be a `data.table` with the following columns:
#'
#'  - `species` Species ID.
#'  - `gene` Ensembl gene ID.
#'  - `distance` Distance to nearest telomere in base pairs.
retrieve_distances <- function(species_ids, gene_ids) {
    # Special case the human species and retrieve all available distance
    # information.

    ensembl <- useDataset("hsapiens_gene_ensembl", mart = ensembl)

    human_distances <- data.table(getBM(
        attributes = c(
            "ensembl_gene_id",
            "chromosome_name",
            "start_position",
            "end_position"
        ),
        mart = ensembl
    ))

    human_distances[,
        chromosome_length := max(end_position),
        by = chromosome_name
    ]

    # Filter out relevant information (see below).
    distances <- human_distances[
        chromosome_length > 15000000,
        .(
            species = "hsapiens",
            gene = ensembl_gene_id,
            distance = pmin(
                start_position,
                chromosome_length - end_position
            )
        )
    ]

    # Exclude the human from the species, in case it is present there.
    species_ids <- species_ids[species_ids != "hsapiens"]

    species_count <- length(species_ids)

    for (i in 1:species_count) {
        species_id <- species_ids[i]

        log_info(sprintf(
            "[%3i%%] Loading species \"%s\"",
            round(i / species_count * 100),
            species_id
        ))

        ensembl <- useDataset(
            sprintf("%s_gene_ensembl", species_id),
            mart = ensembl
        )

        # Besides the attributes that are always present, we need to check for
        # human orthologs. Some species don't have that information and will be
        # skipped.
        if (!"hsapiens_homolog_ensembl_gene" %chin%
            listAttributes(ensembl, what = "name")) {
            next
        }

        # Retrieve information on all genes of the current species, that have
        # human orthologs. This is called "homolog" in the Ensembl schema.
        ensembl_distances <- data.table(getBM(
            filters = c("with_hsapiens_homolog"),
            values = c(TRUE),
            attributes = c(
                "hsapiens_homolog_ensembl_gene",
                "chromosome_name",
                "start_position",
                "end_position"
            ),
            mart = ensembl
        ))

        ensembl_distances[,
            chromosome_length := max(end_position),
            by = chromosome_name
        ]

        # Filter out relevant information and precompute the genes' distance to
        # the nearest telomere. Exclude genes on naturally or artificially
        # short chromosomes.
        species_distances <- ensembl_distances[
            chromosome_length > 15000000,
            .(
                species = species_id,
                gene = hsapiens_homolog_ensembl_gene,
                distance = pmin(
                    start_position,
                    chromosome_length - end_position
                )
            )
        ]

        distances <- rbindlist(list(distances, species_distances))
    }

    # Arbitrarily exclude duplicated genes.
    # TODO: Consider a refined approach or work out how to include all
    # duplicates.
    unique(distances, by = c("species", "gene"))
}