Initial commit

2025-10-26 10:47:25 +01:00 · 2021-10-19 13:39:55 +02:00 · 2021-10-19 13:39:55 +02:00 · c52d42c2b6
commit c52d42c2b6
24 changed files with 1350 additions and 0 deletions
--- a/R/analyze.R
+++ b/R/analyze.R
@ -0,0 +1,72 @@
+#' Create a new preset.
+#'
+#' A preset is used to specify which methods and inputs should be used for an
+#' analysis. Note that the genes to process should normally include the
+#' reference genes to be able to assess the results later.
+#'
+#' Available methods are:
+#'
+#'  - `clusteriness` How much the gene distances cluster across species.
+#'  - `correlation` The mean correlation with the reference genes.
+#'  - `proximity` Mean proximity to telomeres.
+#'  - `neural` Assessment by neural network.
+#'
+#' @param methods IDs of methods to apply.
+#' @param species IDs of species to include.
+#' @param genes IDs of genes to screen.
+#' @param reference_genes IDs of reference genes to compare to.
+#'
+#' @return The preset to use with [analyze()].
+#'
+#' @export
+preset <- function(methods, species, genes, reference_genes) {
+    list(
+        method_ids = methods,
+        species_ids = species,
+        gene_ids = genes,
+        reference_gene_ids = reference_genes
+    )
+}
+
+#' Analyze by applying the specified preset.
+#'
+#' @param preset The preset to use which can be created using [preset()].
+#'
+#' @return A [data.table] with one row for each gene identified by it's ID
+#'   (`gene` column). The additional columns contain the resulting scores per
+#'   method and are named after the method IDs.
+#'
+#' @export
+analyze <- function(preset) {
+    # Available methods by ID.
+    #
+    # A method describes a way to perform a computation on gene distance data
+    # that results in a single score per gene. The function should accept the
+    # preset to apply as a single parameter (see [preset()]).
+    #
+    # The function should return a [data.table] with the following columns:
+    #
+    #  - `gene` Gene ID of the processed gene.
+    #  - `score` Score for the gene between 0.0 and 1.0.
+    methods <- list(
+        "clusteriness" = clusteriness,
+        "correlation" = correlation,
+        "proximity" = proximity,
+        "neural" = neural
+    )
+
+    results <- data.table(gene = genes$id)
+
+    for (method_id in preset$method_ids) {
+        method_results <- methods[[method_id]](distances, preset)
+        setnames(method_results, "score", method_id)
+
+        results <- merge(
+            results,
+            method_results,
+            by = "gene"
+        )
+    }
+
+    results
+}
--- a/R/clusteriness.R
+++ b/R/clusteriness.R
@ -0,0 +1,54 @@
+# Perform a cluster analysis.
+#
+# This function will cluster the data using `hclust` and `cutree` (with the
+# specified height). Every cluster with at least two members qualifies for
+# further analysis. Clusters are then ranked based on their size in relation
+# to the number of values. The return value is a final score between zero and
+# one. Lower ranking clusters contribute less to this score.
+clusteriness_priv <- function(data, height = 1000000) {
+    n <- length(data)
+
+    # Return a score of 0.0 if there is just one or no value at all.
+    if (n < 2) {
+        return(0.0)
+    }
+
+    # Cluster the data and compute the cluster sizes.
+
+    tree <- stats::hclust(stats::dist(data))
+    clusters <- stats::cutree(tree, h = height)
+    cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
+
+    # Compute the "clusteriness" score.
+
+    score <- 0.0
+
+    for (i in seq_along(cluster_sizes)) {
+        cluster_size <- cluster_sizes[i]
+
+        if (cluster_size >= 2) {
+            cluster_score <- cluster_size / n
+            score <- score + cluster_score / i
+        }
+    }
+
+    score
+}
+
+# Process genes clustering their distance to telomeres.
+clusteriness <- function(distances, preset) {
+    results <- data.table(gene = preset$gene_ids)
+
+    # Prefilter the input data by species.
+    distances <- distances[species %chin% preset$species_ids]
+
+    # Add an index for quickly accessing data per gene.
+    setkey(distances, gene)
+
+    # Perform the cluster analysis for one gene.
+    compute <- function(gene_id) {
+        clusteriness_priv(distances[gene_id, distance])
+    }
+
+    results[, score := compute(gene), by = 1:nrow(results)]
+}
--- a/R/correlation.R
+++ b/R/correlation.R
@ -0,0 +1,61 @@
+# Compute the mean correlation coefficient comparing gene distances with a set
+# of reference genes.
+correlation <- function(distances, preset) {
+    results <- data.table(gene = preset$gene_ids)
+    reference_gene_ids <- preset$reference_gene_ids
+    reference_count <- length(reference_gene_ids)
+
+    # Prefilter distances by species.
+    distances <- distances[species %chin% preset$species_ids]
+
+    # Add an index for quickly accessing data per gene.
+    setkey(distances, gene)
+
+    # Prepare the reference genes' data.
+    reference_distances <- distances[gene %chin% reference_gene_ids]
+
+    # Perform the correlation for one gene.
+    compute <- function(gene_id) {
+        gene_distances <- distances[gene_id]
+        gene_species_count <- nrow(gene_distances)
+
+        # Return a score of 0.0 if there is just one or no value at all.
+        if (gene_species_count <= 1) {
+            return(0.0)
+        }
+
+        # Buffer for the sum of correlation coefficients.
+        correlation_sum <- 0
+
+        # Correlate with all reference genes but not with the gene itself.
+        for (reference_gene_id in
+             reference_gene_ids[reference_gene_ids != gene_id]) {
+            data <- merge(
+                gene_distances,
+                reference_distances[reference_gene_id],
+                by = "species"
+            )
+
+            # Skip this reference gene, if there are not enough value pairs.
+            # This will lessen the final score, because it effectively
+            # represents a correlation coefficient of 0.0.
+            if (nrow(data) <= 1) {
+                next
+            }
+
+            # Order data by the reference gene's distance to get a monotonic
+            # relation.
+            setorder(data, distance.y)
+
+            correlation_sum <- correlation_sum + abs(stats::cor(
+                data[, distance.x], data[, distance.y],
+                method = "spearman"
+            ))
+        }
+
+        # Compute the score as the mean correlation coefficient.
+        score <- correlation_sum / reference_count
+    }
+
+    results[, score := compute(gene), by = 1:nrow(results)]
+}
--- a/R/data.R
+++ b/R/data.R
@ -0,0 +1,35 @@
+#' Information on included species from the Ensembl database.
+#'
+#' @format A [data.table] with 91 rows and 2 variables:
+#' \describe{
+#'   \item{id}{Unique species ID}
+#'   \item{name}{Human readable species name}
+#' }
+"species"
+
+#' Information on human genes within the Ensembl database.
+#'
+#' This includes only genes on the primary suggested assembly of the human
+#' nuclear DNA.
+#'
+#' @format A [data.table] with 60568 rows and 3 variables:
+#' \describe{
+#'   \item{id}{Ensembl gene ID}
+#'   \item{name}{The gene's HGNC name}
+#'   \item{chrosome}{The human chromosome the gene is located on}
+#'   \item{n_species}{Number of known species with the gene.}
+#' }
+"genes"
+
+#' Information on gene positions across species.
+#'
+#' This dataset contains each known value for a gene's distance to the telomeres
+#' per species. The data is sourced from Ensembl.
+#'
+#' @format A [data.table] with 1390730 rows and 3 variables:
+#' \describe{
+#'   \item{species}{Species ID}
+#'   \item{gene}{Gene ID}
+#'   \item{distance}{Distance to nearest telomere}
+#' }
+"distances"
--- a/R/neural.R
+++ b/R/neural.R
@ -0,0 +1,96 @@
+# Find genes by training a neural network on reference position data.
+#
+# @param seed A seed to get reproducible results.
+neural <- function(distances, preset, seed = 448077) {
+    species_ids <- preset$species_ids
+    reference_gene_ids <- preset$reference_gene_ids
+
+    set.seed(seed)
+    gene_count <- length(preset$gene_ids)
+
+    # Prefilter distances by species.
+    distances <- distances[species %chin% species_ids]
+
+    # Input data for the network. This contains the gene ID as an identifier
+    # as well as the per-species gene distances as input variables.
+    data <- data.table(gene = preset$gene_ids)
+
+    # Buffer to keep track of species included in the computation. Species
+    # from `species_ids` may be excluded if they don't have enough data.
+    species_ids_included <- NULL
+
+    # Make a column containing distance data for each species.
+    for (species_id in species_ids) {
+        species_distances <- distances[species == species_id, .(gene, distance)]
+
+        # Only include species with at least 25% known values.
+
+        species_distances <- stats::na.omit(species_distances)
+
+        if (nrow(species_distances) >= 0.25 * gene_count) {
+            species_ids_included <- c(species_ids_included, species_id)
+            data <- merge(data, species_distances, all.x = TRUE)
+
+            # Replace missing data with mean values. The neural network can't
+            # handle NAs in a meaningful way. Choosing extreme values here
+            # would result in heavily biased results. Therefore, the mean value
+            # is chosen as a compromise. However, this will of course lessen the
+            # significance of the results.
+
+            mean_distance <- round(species_distances[, mean(distance)])
+            data[is.na(distance), distance := mean_distance]
+
+            # Name the new column after the species.
+            setnames(data, "distance", species_id)
+        }
+    }
+
+    # Extract the reference genes.
+
+    reference_data <- data[gene %chin% reference_gene_ids]
+    reference_data[, neural := 1.0]
+
+    # Take out random samples from the remaining genes. This is another
+    # compromise with a negative impact on significance. Because there is no
+    # information on genes with are explicitely *not* TPE-OLD genes, we have to
+    # assume that a random sample of genes has a low probability of including
+    # TPE-OLD genes.
+
+    without_reference_data <- data[!gene %chin% reference_gene_ids]
+
+    reference_samples <- without_reference_data[
+        sample(
+            nrow(without_reference_data),
+            nrow(reference_data)
+        )
+    ]
+
+    reference_samples[, neural := 0.0]
+
+    # Merge training data. The training data includes all reference genes as
+    # well as an equal number of random sample genes.
+    training_data <- rbindlist(list(reference_data, reference_samples))
+
+    # Construct and train the neural network.
+
+    nn_formula <- stats::as.formula(sprintf(
+        "neural~%s",
+        paste(species_ids_included, collapse = "+")
+    ))
+
+    layer1 <- length(species_ids) * 0.66
+    layer2 <- layer1 * 0.66
+    layer3 <- layer2 * 0.66
+
+    nn <- neuralnet::neuralnet(
+        nn_formula,
+        training_data,
+        hidden = c(layer1, layer2, layer3),
+        linear.output = FALSE
+    )
+
+    # Return the resulting scores given by applying the neural network.
+
+    data[, score := neuralnet::compute(nn, data)$net.result]
+    data[, .(gene, score)]
+}
--- a/R/proximity.R
+++ b/R/proximity.R
@ -0,0 +1,18 @@
+# Score the mean distance of genes to the telomeres across species.
+#
+# A score will be given to each gene such that 0.0 corresponds to the maximal
+# mean distance across all genes and 1.0 corresponds to a distance of 0.
+proximity <- function(distances, preset) {
+    # Prefilter distances by species and gene.
+    distances <- distances[
+        species %chin% preset$species_ids & gene %chin% preset$gene_ids
+    ]
+
+    # Compute the score as described above.
+
+    distances <- distances[, .(mean_distance = mean(distance)), by = "gene"]
+    max_distance <- distances[, max(mean_distance)]
+    distances[, score := 1 - mean_distance / max_distance]
+
+    distances[, .(gene, score)]
+}
--- a/R/ranking.R
+++ b/R/ranking.R
@ -0,0 +1,63 @@
+#' Rank the results by computing a score.
+#'
+#' This function takes the result from [analyze()] and creates a score by
+#' computing a weighted mean across the different methods' results.
+#'
+#' @param results Results from [analyze()].
+#' @param weights Named list pairing method names with weighting factors.
+#'
+#' @result The input data with an additional column containing the score and
+#'   another column containing the rank.
+#'
+#' @export
+ranking <- function(results, weights) {
+    results <- copy(results)
+    results[, score := 0.0]
+
+    for (method in names(weights)) {
+        weighted <- weights[[method]] * results[, ..method]
+        results[, score := score + weighted]
+    }
+
+    # Normalize scores to be between 0.0 and 1.0.
+    results[, score := score / sum(unlist(weights))]
+
+    setorder(results, -score)
+    results[, rank := .I]
+}
+
+#' Find the best weights to rank the results.
+#'
+#' This function finds the optimal parameters to [ranking()] that result in the
+#' reference genes ranking particulary high.
+#'
+#' @param results Results from [analyze()] or [ranking()].
+#' @param methods Methods to include in the score.
+#' @param reference_gene_ids IDs of the reference genes.
+#'
+#' @returns Named list pairing method names with their optimal weights.
+#'
+#' @export
+optimize_weights <- function(results, methods, reference_gene_ids) {
+    # Create the named list from the factors vector.
+    weights <- function(factors) {
+        result <- NULL
+
+        mapply(function(method, factor) {
+            result[[method]] <<- factor
+        }, methods, factors)
+
+        result
+    }
+
+    # Compute the mean rank of the reference genes when applying the weights.
+    mean_rank <- function(factors) {
+        data <- ranking(results, weights(factors))
+        data[gene %chin% reference_gene_ids, mean(rank)]
+    }
+
+    factors <- stats::optim(rep(1.0, length(methods)), mean_rank)$par
+    total_weight <- sum(factors)
+
+    weights(factors / total_weight)
+}
--- a/R/utils.R
+++ b/R/utils.R
@ -0,0 +1,3 @@
+# This is needed to make data.table's symbols available within the package.
+#' @import data.table
+NULL