geposanui/input.R

library(biomaRt)
library(data.table)
library(rlog)
library(stringr)

#' Species IDs of known replicatively aging species.
species_ids_replicative <- c(
    "bihybrid",
    "btaurus",
    "bthybrid",
    "cfamiliaris",
    "chircus",
    "cjacchus",
    "clfamiliaris",
    "csabaeus",
    "ecaballus",
    "fcatus",
    "ggorilla",
    "hsapiens",
    "lafricana",
    "mfascicularis",
    "mmulatta",
    "mmurinus",
    "mnemestrina",
    "nleucogenys",
    "oaries",
    "pabelii",
    "panubis",
    "ppaniscus",
    "ptroglodytes",
    "sscrofa",
    "tgelada"
)

#' Gene names of genes for verified TPE-OLD genes.
genes_verified_tpe_old <- c(
    "C1S",
    "DSP",
    "ISG15",
    "SORBS2",
    "TERT"
)

#' Gene names of genes with a suggested TPE-OLD.
genes_suggested_tpe_old <- c(
    "AKAP3",
    "ANO2",
    "CCND2",
    "CD163L1",
    "CD9",
    "FOXM1",
    "GALNT8",
    "NDUFA9",
    "TEAD4",
    "TIGAR",
    "TSPAN9"
)

ensembl <- useEnsembl(
    biomart = "ensembl",
    version = 104
)

#' Retrieve information on species.
#'
#' The result will be a `data.table` with the following columns:
#'
#'  - `id` Species ID as presented by Ensembl.
#'  - `name` Human readable species name.
#'  - `replicative` Whether the species is likely to be aging replicatively.
retrieve_species <- function() {
    # Ensembl datasets correspond to distinct species.
    ensembl_datasets <- data.table(listDatasets(ensembl))

    # Filter out species ID and name from the result.
    species <- ensembl_datasets[, .(
        id = str_match(dataset, "(.*)_gene_ensembl")[, 2],
        name = str_match(description, "(.*) genes \\(.*\\)")[, 2]
    )]

    species[, replicative := id %chin% species_ids_replicative]
}

#' Retrieve information on human genes.
#'
#' The result will be a `data.table` with the following columns:
#'
#'  - `id` Ensembl gene ID.
#'  - `ǹame` HGNC name of the gene.
#'  - `chromosome` Human chromosome on which the gene is located.
retrieve_genes <- function() {
    genes <- data.table(getBM(
        attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name"),
        mart = useDataset("hsapiens_gene_ensembl", mart = ensembl)
    ))

    genes[, .(
        id = ensembl_gene_id,
        name = hgnc_symbol,
        chromosome = chromosome_name,
        verified = hgnc_symbol %chin% genes_verified_tpe_old,
        suggested = hgnc_symbol %chin% genes_suggested_tpe_old
    )]
}

#' Retrieve gene distance data.
#'
#' The data will include all available values for the given species and genes.
#' Specific values on naturally or artificially (e.g. due to incomplete
#' sequencing) short chromosomes will be excluded.
#'
#' The result will be a `data.table` with the following columns:
#'
#'  - `species` Species ID.
#'  - `gene` Ensembl gene ID.
#'  - `distance` Distance to nearest telomere in base pairs.
retrieve_distances <- function(species_ids, gene_ids) {
    # Special case the human species and retrieve all available distance
    # information.

    ensembl <- useDataset("hsapiens_gene_ensembl", mart = ensembl)

    human_distances <- data.table(getBM(
        attributes = c(
            "ensembl_gene_id",
            "chromosome_name",
            "start_position",
            "end_position"
        ),
        mart = ensembl
    ))

    human_distances[,
        chromosome_length := max(end_position),
        by = chromosome_name
    ]

    # Filter out relevant information (see below).
    distances <- human_distances[
        chromosome_length > 15000000,
        .(
            species = "hsapiens",
            gene = ensembl_gene_id,
            distance = pmin(
                start_position,
                chromosome_length - end_position
            )
        )
    ]

    # Exclude the human from the species, in case it is present there.
    species_ids <- species_ids[species_ids != "hsapiens"]

    species_count <- length(species_ids)

    for (i in 1:species_count) {
        species_id <- species_ids[i]

        log_info(sprintf(
            "[%3i%%] Loading species \"%s\"",
            round(i / species_count * 100),
            species_id
        ))

        ensembl <- useDataset(
            sprintf("%s_gene_ensembl", species_id),
            mart = ensembl
        )

        # Besides the attributes that are always present, we need to check for
        # human orthologs. Some species don't have that information and will be
        # skipped.
        if (!"hsapiens_homolog_ensembl_gene" %chin%
            listAttributes(ensembl, what = "name")) {
            next
        }

        # Retrieve information on all genes of the current species, that have
        # human orthologs. This is called "homolog" in the Ensembl schema.
        ensembl_distances <- data.table(getBM(
            filters = c("with_hsapiens_homolog"),
            values = c(TRUE),
            attributes = c(
                "hsapiens_homolog_ensembl_gene",
                "chromosome_name",
                "start_position",
                "end_position"
            ),
            mart = ensembl
        ))

        ensembl_distances[,
            chromosome_length := max(end_position),
            by = chromosome_name
        ]

        # Filter out relevant information and precompute the genes' distance to
        # the nearest telomere. Exclude genes on naturally or artificially
        # short chromosomes.
        species_distances <- ensembl_distances[
            chromosome_length > 15000000,
            .(
                species = species_id,
                gene = hsapiens_homolog_ensembl_gene,
                distance = pmin(
                    start_position,
                    chromosome_length - end_position
                )
            )
        ]

        distances <- rbindlist(list(distances, species_distances))
    }

    # Arbitrarily exclude duplicated genes.
    # TODO: Consider a refined approach or work out how to include all
    # duplicates.
    unique(distances, by = c("species", "gene"))
}
Retrieve input data using biomaRt 2021-09-16 00:06:54 +02:00			`library(biomaRt)`
data: Revert to using data.table 2021-06-24 22:36:02 +02:00			`library(data.table)`
Include all species 2021-08-25 12:00:22 +02:00			`library(rlog)`
Retrieve input data using biomaRt 2021-09-16 00:06:54 +02:00			`library(stringr)`

			`#' Species IDs of known replicatively aging species.`
			`species_ids_replicative <- c(`
			`"bihybrid",`
			`"btaurus",`
			`"bthybrid",`
			`"cfamiliaris",`
			`"chircus",`
			`"cjacchus",`
			`"clfamiliaris",`
			`"csabaeus",`
			`"ecaballus",`
			`"fcatus",`
			`"ggorilla",`
			`"hsapiens",`
			`"lafricana",`
			`"mfascicularis",`
			`"mmulatta",`
			`"mmurinus",`
			`"mnemestrina",`
			`"nleucogenys",`
			`"oaries",`
			`"pabelii",`
			`"panubis",`
			`"ppaniscus",`
			`"ptroglodytes",`
			`"sscrofa",`
			`"tgelada"`
			`)`
Initial commit 2021-06-16 22:01:09 +02:00
Include data on TPE-OLD status of genes 2021-08-29 14:02:37 +02:00			`#' Gene names of genes for verified TPE-OLD genes.`
			`genes_verified_tpe_old <- c(`
			`"C1S",`
			`"DSP",`
			`"ISG15",`
			`"SORBS2",`
			`"TERT"`
			`)`

			`#' Gene names of genes with a suggested TPE-OLD.`
			`genes_suggested_tpe_old <- c(`
			`"AKAP3",`
			`"ANO2",`
			`"CCND2",`
			`"CD163L1",`
			`"CD9",`
			`"FOXM1",`
			`"GALNT8",`
			`"NDUFA9",`
			`"TEAD4",`
			`"TIGAR",`
			`"TSPAN9"`
			`)`

Retrieve input data using biomaRt 2021-09-16 00:06:54 +02:00			`ensembl <- useEnsembl(`
			`biomart = "ensembl",`
			`version = 104`
			`)`

			`#' Retrieve information on species.`
Initial commit 2021-06-16 22:01:09 +02:00			`#'`
Retrieve input data using biomaRt 2021-09-16 00:06:54 +02:00			#' The result will be a `data.table` with the following columns:
			`#'`
			#' - `id` Species ID as presented by Ensembl.
			#' - `name` Human readable species name.
			#' - `replicative` Whether the species is likely to be aging replicatively.
			`retrieve_species <- function() {`
			`# Ensembl datasets correspond to distinct species.`
			`ensembl_datasets <- data.table(listDatasets(ensembl))`

			`# Filter out species ID and name from the result.`
			`species <- ensembl_datasets[, .(`
			`id = str_match(dataset, "(.*)_gene_ensembl")[, 2],`
			`name = str_match(description, "(.) genes \\(.\\)")[, 2]`
			`)]`

			`species[, replicative := id %chin% species_ids_replicative]`
			`}`

			`#' Retrieve information on human genes.`
			`#'`
			#' The result will be a `data.table` with the following columns:
			`#'`
			#' - `id` Ensembl gene ID.
			#' - `ǹame` HGNC name of the gene.
			#' - `chromosome` Human chromosome on which the gene is located.
			`retrieve_genes <- function() {`
			`genes <- data.table(getBM(`
			`attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name"),`
			`mart = useDataset("hsapiens_gene_ensembl", mart = ensembl)`
			`))`
data: Migrate to tidyverse 2021-06-21 13:03:26 +02:00
Retrieve input data using biomaRt 2021-09-16 00:06:54 +02:00			`genes[, .(`
			`id = ensembl_gene_id,`
			`name = hgnc_symbol,`
			`chromosome = chromosome_name,`
			`verified = hgnc_symbol %chin% genes_verified_tpe_old,`
			`suggested = hgnc_symbol %chin% genes_suggested_tpe_old`
			`)]`
			`}`

			`#' Retrieve gene distance data.`
			`#'`
			`#' The data will include all available values for the given species and genes.`
			`#' Specific values on naturally or artificially (e.g. due to incomplete`
			`#' sequencing) short chromosomes will be excluded.`
			`#'`
			#' The result will be a `data.table` with the following columns:
			`#'`
			#' - `species` Species ID.
			#' - `gene` Ensembl gene ID.
			#' - `distance` Distance to nearest telomere in base pairs.
			`retrieve_distances <- function(species_ids, gene_ids) {`
Reinclude human into distance data 2021-09-19 12:07:59 +02:00			`# Special case the human species and retrieve all available distance`
			`# information.`

			`ensembl <- useDataset("hsapiens_gene_ensembl", mart = ensembl)`

			`human_distances <- data.table(getBM(`
			`attributes = c(`
			`"ensembl_gene_id",`
			`"chromosome_name",`
			`"start_position",`
			`"end_position"`
			`),`
			`mart = ensembl`
			`))`

			`human_distances[,`
			`chromosome_length := max(end_position),`
			`by = chromosome_name`
			`]`

			`# Filter out relevant information (see below).`
			`distances <- human_distances[`
			`chromosome_length > 15000000,`
			`.(`
			`species = "hsapiens",`
			`gene = ensembl_gene_id,`
			`distance = pmin(`
			`start_position,`
			`chromosome_length - end_position`
			`)`
			`)`
			`]`

			`# Exclude the human from the species, in case it is present there.`
			`species_ids <- species_ids[species_ids != "hsapiens"]`
data: Revert to using data.table 2021-06-24 22:36:02 +02:00
Retrieve input data using biomaRt 2021-09-16 00:06:54 +02:00			`species_count <- length(species_ids)`
data: Revert to using data.table 2021-06-24 22:36:02 +02:00
Retrieve input data using biomaRt 2021-09-16 00:06:54 +02:00			`for (i in 1:species_count) {`
			`species_id <- species_ids[i]`
Initial commit 2021-06-16 22:01:09 +02:00
Include all species 2021-08-25 12:00:22 +02:00			`log_info(sprintf(`
Retrieve input data using biomaRt 2021-09-16 00:06:54 +02:00			`"[%3i%%] Loading species \"%s\"",`
			`round(i / species_count * 100),`
			`species_id`
Include all species 2021-08-25 12:00:22 +02:00			`))`
Initial commit 2021-06-16 22:01:09 +02:00
Retrieve input data using biomaRt 2021-09-16 00:06:54 +02:00			`ensembl <- useDataset(`
			`sprintf("%s_gene_ensembl", species_id),`
			`mart = ensembl`
Include all species 2021-08-25 12:00:22 +02:00			`)`

Retrieve input data using biomaRt 2021-09-16 00:06:54 +02:00			`# Besides the attributes that are always present, we need to check for`
			`# human orthologs. Some species don't have that information and will be`
			`# skipped.`
			`if (!"hsapiens_homolog_ensembl_gene" %chin%`
			`listAttributes(ensembl, what = "name")) {`
			`next`
			`}`

			`# Retrieve information on all genes of the current species, that have`
			`# human orthologs. This is called "homolog" in the Ensembl schema.`
			`ensembl_distances <- data.table(getBM(`
			`filters = c("with_hsapiens_homolog"),`
			`values = c(TRUE),`
			`attributes = c(`
			`"hsapiens_homolog_ensembl_gene",`
			`"chromosome_name",`
			`"start_position",`
			`"end_position"`
			`),`
Reinclude human into distance data 2021-09-19 12:07:59 +02:00			`mart = ensembl`
Retrieve input data using biomaRt 2021-09-16 00:06:54 +02:00			`))`

			`ensembl_distances[,`
			`chromosome_length := max(end_position),`
			`by = chromosome_name`
			`]`

			`# Filter out relevant information and precompute the genes' distance to`
			`# the nearest telomere. Exclude genes on naturally or artificially`
			`# short chromosomes.`
			`species_distances <- ensembl_distances[`
			`chromosome_length > 15000000,`
			`.(`
			`species = species_id,`
			`gene = hsapiens_homolog_ensembl_gene,`
			`distance = pmin(`
			`start_position,`
			`chromosome_length - end_position`
			`)`
			`)`
			`]`

Include all species 2021-08-25 12:00:22 +02:00			`distances <- rbindlist(list(distances, species_distances))`
Initial commit 2021-06-16 22:01:09 +02:00			`}`

Retrieve input data using biomaRt 2021-09-16 00:06:54 +02:00			`# Arbitrarily exclude duplicated genes.`
			`# TODO: Consider a refined approach or work out how to include all`
			`# duplicates.`
			`unique(distances, by = c("species", "gene"))`
data: Migrate to tidyverse 2021-06-21 13:03:26 +02:00			`}`