Initial commit

This commit is contained in:
Elias Projahn 2021-10-19 13:39:55 +02:00
commit c52d42c2b6
24 changed files with 1350 additions and 0 deletions

72
R/analyze.R Normal file
View file

@ -0,0 +1,72 @@
#' Create a new preset.
#'
#' A preset is used to specify which methods and inputs should be used for an
#' analysis. Note that the genes to process should normally include the
#' reference genes to be able to assess the results later.
#'
#' Available methods are:
#'
#' - `clusteriness` How much the gene distances cluster across species.
#' - `correlation` The mean correlation with the reference genes.
#' - `proximity` Mean proximity to telomeres.
#' - `neural` Assessment by neural network.
#'
#' @param methods IDs of methods to apply.
#' @param species IDs of species to include.
#' @param genes IDs of genes to screen.
#' @param reference_genes IDs of reference genes to compare to.
#'
#' @return The preset to use with [analyze()].
#'
#' @export
preset <- function(methods, species, genes, reference_genes) {
list(
method_ids = methods,
species_ids = species,
gene_ids = genes,
reference_gene_ids = reference_genes
)
}
#' Analyze by applying the specified preset.
#'
#' @param preset The preset to use which can be created using [preset()].
#'
#' @return A [data.table] with one row for each gene identified by it's ID
#' (`gene` column). The additional columns contain the resulting scores per
#' method and are named after the method IDs.
#'
#' @export
analyze <- function(preset) {
# Available methods by ID.
#
# A method describes a way to perform a computation on gene distance data
# that results in a single score per gene. The function should accept the
# preset to apply as a single parameter (see [preset()]).
#
# The function should return a [data.table] with the following columns:
#
# - `gene` Gene ID of the processed gene.
# - `score` Score for the gene between 0.0 and 1.0.
methods <- list(
"clusteriness" = clusteriness,
"correlation" = correlation,
"proximity" = proximity,
"neural" = neural
)
results <- data.table(gene = genes$id)
for (method_id in preset$method_ids) {
method_results <- methods[[method_id]](distances, preset)
setnames(method_results, "score", method_id)
results <- merge(
results,
method_results,
by = "gene"
)
}
results
}

54
R/clusteriness.R Normal file
View file

@ -0,0 +1,54 @@
# Perform a cluster analysis.
#
# This function will cluster the data using `hclust` and `cutree` (with the
# specified height). Every cluster with at least two members qualifies for
# further analysis. Clusters are then ranked based on their size in relation
# to the number of values. The return value is a final score between zero and
# one. Lower ranking clusters contribute less to this score.
clusteriness_priv <- function(data, height = 1000000) {
n <- length(data)
# Return a score of 0.0 if there is just one or no value at all.
if (n < 2) {
return(0.0)
}
# Cluster the data and compute the cluster sizes.
tree <- stats::hclust(stats::dist(data))
clusters <- stats::cutree(tree, h = height)
cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
# Compute the "clusteriness" score.
score <- 0.0
for (i in seq_along(cluster_sizes)) {
cluster_size <- cluster_sizes[i]
if (cluster_size >= 2) {
cluster_score <- cluster_size / n
score <- score + cluster_score / i
}
}
score
}
# Process genes clustering their distance to telomeres.
clusteriness <- function(distances, preset) {
results <- data.table(gene = preset$gene_ids)
# Prefilter the input data by species.
distances <- distances[species %chin% preset$species_ids]
# Add an index for quickly accessing data per gene.
setkey(distances, gene)
# Perform the cluster analysis for one gene.
compute <- function(gene_id) {
clusteriness_priv(distances[gene_id, distance])
}
results[, score := compute(gene), by = 1:nrow(results)]
}

61
R/correlation.R Normal file
View file

@ -0,0 +1,61 @@
# Compute the mean correlation coefficient comparing gene distances with a set
# of reference genes.
correlation <- function(distances, preset) {
results <- data.table(gene = preset$gene_ids)
reference_gene_ids <- preset$reference_gene_ids
reference_count <- length(reference_gene_ids)
# Prefilter distances by species.
distances <- distances[species %chin% preset$species_ids]
# Add an index for quickly accessing data per gene.
setkey(distances, gene)
# Prepare the reference genes' data.
reference_distances <- distances[gene %chin% reference_gene_ids]
# Perform the correlation for one gene.
compute <- function(gene_id) {
gene_distances <- distances[gene_id]
gene_species_count <- nrow(gene_distances)
# Return a score of 0.0 if there is just one or no value at all.
if (gene_species_count <= 1) {
return(0.0)
}
# Buffer for the sum of correlation coefficients.
correlation_sum <- 0
# Correlate with all reference genes but not with the gene itself.
for (reference_gene_id in
reference_gene_ids[reference_gene_ids != gene_id]) {
data <- merge(
gene_distances,
reference_distances[reference_gene_id],
by = "species"
)
# Skip this reference gene, if there are not enough value pairs.
# This will lessen the final score, because it effectively
# represents a correlation coefficient of 0.0.
if (nrow(data) <= 1) {
next
}
# Order data by the reference gene's distance to get a monotonic
# relation.
setorder(data, distance.y)
correlation_sum <- correlation_sum + abs(stats::cor(
data[, distance.x], data[, distance.y],
method = "spearman"
))
}
# Compute the score as the mean correlation coefficient.
score <- correlation_sum / reference_count
}
results[, score := compute(gene), by = 1:nrow(results)]
}

35
R/data.R Normal file
View file

@ -0,0 +1,35 @@
#' Information on included species from the Ensembl database.
#'
#' @format A [data.table] with 91 rows and 2 variables:
#' \describe{
#' \item{id}{Unique species ID}
#' \item{name}{Human readable species name}
#' }
"species"
#' Information on human genes within the Ensembl database.
#'
#' This includes only genes on the primary suggested assembly of the human
#' nuclear DNA.
#'
#' @format A [data.table] with 60568 rows and 3 variables:
#' \describe{
#' \item{id}{Ensembl gene ID}
#' \item{name}{The gene's HGNC name}
#' \item{chrosome}{The human chromosome the gene is located on}
#' \item{n_species}{Number of known species with the gene.}
#' }
"genes"
#' Information on gene positions across species.
#'
#' This dataset contains each known value for a gene's distance to the telomeres
#' per species. The data is sourced from Ensembl.
#'
#' @format A [data.table] with 1390730 rows and 3 variables:
#' \describe{
#' \item{species}{Species ID}
#' \item{gene}{Gene ID}
#' \item{distance}{Distance to nearest telomere}
#' }
"distances"

96
R/neural.R Normal file
View file

@ -0,0 +1,96 @@
# Find genes by training a neural network on reference position data.
#
# @param seed A seed to get reproducible results.
neural <- function(distances, preset, seed = 448077) {
species_ids <- preset$species_ids
reference_gene_ids <- preset$reference_gene_ids
set.seed(seed)
gene_count <- length(preset$gene_ids)
# Prefilter distances by species.
distances <- distances[species %chin% species_ids]
# Input data for the network. This contains the gene ID as an identifier
# as well as the per-species gene distances as input variables.
data <- data.table(gene = preset$gene_ids)
# Buffer to keep track of species included in the computation. Species
# from `species_ids` may be excluded if they don't have enough data.
species_ids_included <- NULL
# Make a column containing distance data for each species.
for (species_id in species_ids) {
species_distances <- distances[species == species_id, .(gene, distance)]
# Only include species with at least 25% known values.
species_distances <- stats::na.omit(species_distances)
if (nrow(species_distances) >= 0.25 * gene_count) {
species_ids_included <- c(species_ids_included, species_id)
data <- merge(data, species_distances, all.x = TRUE)
# Replace missing data with mean values. The neural network can't
# handle NAs in a meaningful way. Choosing extreme values here
# would result in heavily biased results. Therefore, the mean value
# is chosen as a compromise. However, this will of course lessen the
# significance of the results.
mean_distance <- round(species_distances[, mean(distance)])
data[is.na(distance), distance := mean_distance]
# Name the new column after the species.
setnames(data, "distance", species_id)
}
}
# Extract the reference genes.
reference_data <- data[gene %chin% reference_gene_ids]
reference_data[, neural := 1.0]
# Take out random samples from the remaining genes. This is another
# compromise with a negative impact on significance. Because there is no
# information on genes with are explicitely *not* TPE-OLD genes, we have to
# assume that a random sample of genes has a low probability of including
# TPE-OLD genes.
without_reference_data <- data[!gene %chin% reference_gene_ids]
reference_samples <- without_reference_data[
sample(
nrow(without_reference_data),
nrow(reference_data)
)
]
reference_samples[, neural := 0.0]
# Merge training data. The training data includes all reference genes as
# well as an equal number of random sample genes.
training_data <- rbindlist(list(reference_data, reference_samples))
# Construct and train the neural network.
nn_formula <- stats::as.formula(sprintf(
"neural~%s",
paste(species_ids_included, collapse = "+")
))
layer1 <- length(species_ids) * 0.66
layer2 <- layer1 * 0.66
layer3 <- layer2 * 0.66
nn <- neuralnet::neuralnet(
nn_formula,
training_data,
hidden = c(layer1, layer2, layer3),
linear.output = FALSE
)
# Return the resulting scores given by applying the neural network.
data[, score := neuralnet::compute(nn, data)$net.result]
data[, .(gene, score)]
}

18
R/proximity.R Normal file
View file

@ -0,0 +1,18 @@
# Score the mean distance of genes to the telomeres across species.
#
# A score will be given to each gene such that 0.0 corresponds to the maximal
# mean distance across all genes and 1.0 corresponds to a distance of 0.
proximity <- function(distances, preset) {
# Prefilter distances by species and gene.
distances <- distances[
species %chin% preset$species_ids & gene %chin% preset$gene_ids
]
# Compute the score as described above.
distances <- distances[, .(mean_distance = mean(distance)), by = "gene"]
max_distance <- distances[, max(mean_distance)]
distances[, score := 1 - mean_distance / max_distance]
distances[, .(gene, score)]
}

63
R/ranking.R Normal file
View file

@ -0,0 +1,63 @@
#' Rank the results by computing a score.
#'
#' This function takes the result from [analyze()] and creates a score by
#' computing a weighted mean across the different methods' results.
#'
#' @param results Results from [analyze()].
#' @param weights Named list pairing method names with weighting factors.
#'
#' @result The input data with an additional column containing the score and
#' another column containing the rank.
#'
#' @export
ranking <- function(results, weights) {
results <- copy(results)
results[, score := 0.0]
for (method in names(weights)) {
weighted <- weights[[method]] * results[, ..method]
results[, score := score + weighted]
}
# Normalize scores to be between 0.0 and 1.0.
results[, score := score / sum(unlist(weights))]
setorder(results, -score)
results[, rank := .I]
}
#' Find the best weights to rank the results.
#'
#' This function finds the optimal parameters to [ranking()] that result in the
#' reference genes ranking particulary high.
#'
#' @param results Results from [analyze()] or [ranking()].
#' @param methods Methods to include in the score.
#' @param reference_gene_ids IDs of the reference genes.
#'
#' @returns Named list pairing method names with their optimal weights.
#'
#' @export
optimize_weights <- function(results, methods, reference_gene_ids) {
# Create the named list from the factors vector.
weights <- function(factors) {
result <- NULL
mapply(function(method, factor) {
result[[method]] <<- factor
}, methods, factors)
result
}
# Compute the mean rank of the reference genes when applying the weights.
mean_rank <- function(factors) {
data <- ranking(results, weights(factors))
data[gene %chin% reference_gene_ids, mean(rank)]
}
factors <- stats::optim(rep(1.0, length(methods)), mean_rank)$par
total_weight <- sum(factors)
weights(factors / total_weight)
}

3
R/utils.R Normal file
View file

@ -0,0 +1,3 @@
# This is needed to make data.table's symbols available within the package.
#' @import data.table
NULL