mirror of
https://github.com/johrpan/geposan.git
synced 2025-10-26 10:47:25 +01:00
Initial commit
This commit is contained in:
commit
c52d42c2b6
24 changed files with 1350 additions and 0 deletions
72
R/analyze.R
Normal file
72
R/analyze.R
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
#' Create a new preset.
|
||||
#'
|
||||
#' A preset is used to specify which methods and inputs should be used for an
|
||||
#' analysis. Note that the genes to process should normally include the
|
||||
#' reference genes to be able to assess the results later.
|
||||
#'
|
||||
#' Available methods are:
|
||||
#'
|
||||
#' - `clusteriness` How much the gene distances cluster across species.
|
||||
#' - `correlation` The mean correlation with the reference genes.
|
||||
#' - `proximity` Mean proximity to telomeres.
|
||||
#' - `neural` Assessment by neural network.
|
||||
#'
|
||||
#' @param methods IDs of methods to apply.
|
||||
#' @param species IDs of species to include.
|
||||
#' @param genes IDs of genes to screen.
|
||||
#' @param reference_genes IDs of reference genes to compare to.
|
||||
#'
|
||||
#' @return The preset to use with [analyze()].
|
||||
#'
|
||||
#' @export
|
||||
preset <- function(methods, species, genes, reference_genes) {
|
||||
list(
|
||||
method_ids = methods,
|
||||
species_ids = species,
|
||||
gene_ids = genes,
|
||||
reference_gene_ids = reference_genes
|
||||
)
|
||||
}
|
||||
|
||||
#' Analyze by applying the specified preset.
|
||||
#'
|
||||
#' @param preset The preset to use which can be created using [preset()].
|
||||
#'
|
||||
#' @return A [data.table] with one row for each gene identified by it's ID
|
||||
#' (`gene` column). The additional columns contain the resulting scores per
|
||||
#' method and are named after the method IDs.
|
||||
#'
|
||||
#' @export
|
||||
analyze <- function(preset) {
|
||||
# Available methods by ID.
|
||||
#
|
||||
# A method describes a way to perform a computation on gene distance data
|
||||
# that results in a single score per gene. The function should accept the
|
||||
# preset to apply as a single parameter (see [preset()]).
|
||||
#
|
||||
# The function should return a [data.table] with the following columns:
|
||||
#
|
||||
# - `gene` Gene ID of the processed gene.
|
||||
# - `score` Score for the gene between 0.0 and 1.0.
|
||||
methods <- list(
|
||||
"clusteriness" = clusteriness,
|
||||
"correlation" = correlation,
|
||||
"proximity" = proximity,
|
||||
"neural" = neural
|
||||
)
|
||||
|
||||
results <- data.table(gene = genes$id)
|
||||
|
||||
for (method_id in preset$method_ids) {
|
||||
method_results <- methods[[method_id]](distances, preset)
|
||||
setnames(method_results, "score", method_id)
|
||||
|
||||
results <- merge(
|
||||
results,
|
||||
method_results,
|
||||
by = "gene"
|
||||
)
|
||||
}
|
||||
|
||||
results
|
||||
}
|
||||
54
R/clusteriness.R
Normal file
54
R/clusteriness.R
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
# Perform a cluster analysis.
|
||||
#
|
||||
# This function will cluster the data using `hclust` and `cutree` (with the
|
||||
# specified height). Every cluster with at least two members qualifies for
|
||||
# further analysis. Clusters are then ranked based on their size in relation
|
||||
# to the number of values. The return value is a final score between zero and
|
||||
# one. Lower ranking clusters contribute less to this score.
|
||||
clusteriness_priv <- function(data, height = 1000000) {
|
||||
n <- length(data)
|
||||
|
||||
# Return a score of 0.0 if there is just one or no value at all.
|
||||
if (n < 2) {
|
||||
return(0.0)
|
||||
}
|
||||
|
||||
# Cluster the data and compute the cluster sizes.
|
||||
|
||||
tree <- stats::hclust(stats::dist(data))
|
||||
clusters <- stats::cutree(tree, h = height)
|
||||
cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
|
||||
|
||||
# Compute the "clusteriness" score.
|
||||
|
||||
score <- 0.0
|
||||
|
||||
for (i in seq_along(cluster_sizes)) {
|
||||
cluster_size <- cluster_sizes[i]
|
||||
|
||||
if (cluster_size >= 2) {
|
||||
cluster_score <- cluster_size / n
|
||||
score <- score + cluster_score / i
|
||||
}
|
||||
}
|
||||
|
||||
score
|
||||
}
|
||||
|
||||
# Process genes clustering their distance to telomeres.
|
||||
clusteriness <- function(distances, preset) {
|
||||
results <- data.table(gene = preset$gene_ids)
|
||||
|
||||
# Prefilter the input data by species.
|
||||
distances <- distances[species %chin% preset$species_ids]
|
||||
|
||||
# Add an index for quickly accessing data per gene.
|
||||
setkey(distances, gene)
|
||||
|
||||
# Perform the cluster analysis for one gene.
|
||||
compute <- function(gene_id) {
|
||||
clusteriness_priv(distances[gene_id, distance])
|
||||
}
|
||||
|
||||
results[, score := compute(gene), by = 1:nrow(results)]
|
||||
}
|
||||
61
R/correlation.R
Normal file
61
R/correlation.R
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
# Compute the mean correlation coefficient comparing gene distances with a set
|
||||
# of reference genes.
|
||||
correlation <- function(distances, preset) {
|
||||
results <- data.table(gene = preset$gene_ids)
|
||||
reference_gene_ids <- preset$reference_gene_ids
|
||||
reference_count <- length(reference_gene_ids)
|
||||
|
||||
# Prefilter distances by species.
|
||||
distances <- distances[species %chin% preset$species_ids]
|
||||
|
||||
# Add an index for quickly accessing data per gene.
|
||||
setkey(distances, gene)
|
||||
|
||||
# Prepare the reference genes' data.
|
||||
reference_distances <- distances[gene %chin% reference_gene_ids]
|
||||
|
||||
# Perform the correlation for one gene.
|
||||
compute <- function(gene_id) {
|
||||
gene_distances <- distances[gene_id]
|
||||
gene_species_count <- nrow(gene_distances)
|
||||
|
||||
# Return a score of 0.0 if there is just one or no value at all.
|
||||
if (gene_species_count <= 1) {
|
||||
return(0.0)
|
||||
}
|
||||
|
||||
# Buffer for the sum of correlation coefficients.
|
||||
correlation_sum <- 0
|
||||
|
||||
# Correlate with all reference genes but not with the gene itself.
|
||||
for (reference_gene_id in
|
||||
reference_gene_ids[reference_gene_ids != gene_id]) {
|
||||
data <- merge(
|
||||
gene_distances,
|
||||
reference_distances[reference_gene_id],
|
||||
by = "species"
|
||||
)
|
||||
|
||||
# Skip this reference gene, if there are not enough value pairs.
|
||||
# This will lessen the final score, because it effectively
|
||||
# represents a correlation coefficient of 0.0.
|
||||
if (nrow(data) <= 1) {
|
||||
next
|
||||
}
|
||||
|
||||
# Order data by the reference gene's distance to get a monotonic
|
||||
# relation.
|
||||
setorder(data, distance.y)
|
||||
|
||||
correlation_sum <- correlation_sum + abs(stats::cor(
|
||||
data[, distance.x], data[, distance.y],
|
||||
method = "spearman"
|
||||
))
|
||||
}
|
||||
|
||||
# Compute the score as the mean correlation coefficient.
|
||||
score <- correlation_sum / reference_count
|
||||
}
|
||||
|
||||
results[, score := compute(gene), by = 1:nrow(results)]
|
||||
}
|
||||
35
R/data.R
Normal file
35
R/data.R
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
#' Information on included species from the Ensembl database.
|
||||
#'
|
||||
#' @format A [data.table] with 91 rows and 2 variables:
|
||||
#' \describe{
|
||||
#' \item{id}{Unique species ID}
|
||||
#' \item{name}{Human readable species name}
|
||||
#' }
|
||||
"species"
|
||||
|
||||
#' Information on human genes within the Ensembl database.
|
||||
#'
|
||||
#' This includes only genes on the primary suggested assembly of the human
|
||||
#' nuclear DNA.
|
||||
#'
|
||||
#' @format A [data.table] with 60568 rows and 3 variables:
|
||||
#' \describe{
|
||||
#' \item{id}{Ensembl gene ID}
|
||||
#' \item{name}{The gene's HGNC name}
|
||||
#' \item{chrosome}{The human chromosome the gene is located on}
|
||||
#' \item{n_species}{Number of known species with the gene.}
|
||||
#' }
|
||||
"genes"
|
||||
|
||||
#' Information on gene positions across species.
|
||||
#'
|
||||
#' This dataset contains each known value for a gene's distance to the telomeres
|
||||
#' per species. The data is sourced from Ensembl.
|
||||
#'
|
||||
#' @format A [data.table] with 1390730 rows and 3 variables:
|
||||
#' \describe{
|
||||
#' \item{species}{Species ID}
|
||||
#' \item{gene}{Gene ID}
|
||||
#' \item{distance}{Distance to nearest telomere}
|
||||
#' }
|
||||
"distances"
|
||||
96
R/neural.R
Normal file
96
R/neural.R
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
# Find genes by training a neural network on reference position data.
|
||||
#
|
||||
# @param seed A seed to get reproducible results.
|
||||
neural <- function(distances, preset, seed = 448077) {
|
||||
species_ids <- preset$species_ids
|
||||
reference_gene_ids <- preset$reference_gene_ids
|
||||
|
||||
set.seed(seed)
|
||||
gene_count <- length(preset$gene_ids)
|
||||
|
||||
# Prefilter distances by species.
|
||||
distances <- distances[species %chin% species_ids]
|
||||
|
||||
# Input data for the network. This contains the gene ID as an identifier
|
||||
# as well as the per-species gene distances as input variables.
|
||||
data <- data.table(gene = preset$gene_ids)
|
||||
|
||||
# Buffer to keep track of species included in the computation. Species
|
||||
# from `species_ids` may be excluded if they don't have enough data.
|
||||
species_ids_included <- NULL
|
||||
|
||||
# Make a column containing distance data for each species.
|
||||
for (species_id in species_ids) {
|
||||
species_distances <- distances[species == species_id, .(gene, distance)]
|
||||
|
||||
# Only include species with at least 25% known values.
|
||||
|
||||
species_distances <- stats::na.omit(species_distances)
|
||||
|
||||
if (nrow(species_distances) >= 0.25 * gene_count) {
|
||||
species_ids_included <- c(species_ids_included, species_id)
|
||||
data <- merge(data, species_distances, all.x = TRUE)
|
||||
|
||||
# Replace missing data with mean values. The neural network can't
|
||||
# handle NAs in a meaningful way. Choosing extreme values here
|
||||
# would result in heavily biased results. Therefore, the mean value
|
||||
# is chosen as a compromise. However, this will of course lessen the
|
||||
# significance of the results.
|
||||
|
||||
mean_distance <- round(species_distances[, mean(distance)])
|
||||
data[is.na(distance), distance := mean_distance]
|
||||
|
||||
# Name the new column after the species.
|
||||
setnames(data, "distance", species_id)
|
||||
}
|
||||
}
|
||||
|
||||
# Extract the reference genes.
|
||||
|
||||
reference_data <- data[gene %chin% reference_gene_ids]
|
||||
reference_data[, neural := 1.0]
|
||||
|
||||
# Take out random samples from the remaining genes. This is another
|
||||
# compromise with a negative impact on significance. Because there is no
|
||||
# information on genes with are explicitely *not* TPE-OLD genes, we have to
|
||||
# assume that a random sample of genes has a low probability of including
|
||||
# TPE-OLD genes.
|
||||
|
||||
without_reference_data <- data[!gene %chin% reference_gene_ids]
|
||||
|
||||
reference_samples <- without_reference_data[
|
||||
sample(
|
||||
nrow(without_reference_data),
|
||||
nrow(reference_data)
|
||||
)
|
||||
]
|
||||
|
||||
reference_samples[, neural := 0.0]
|
||||
|
||||
# Merge training data. The training data includes all reference genes as
|
||||
# well as an equal number of random sample genes.
|
||||
training_data <- rbindlist(list(reference_data, reference_samples))
|
||||
|
||||
# Construct and train the neural network.
|
||||
|
||||
nn_formula <- stats::as.formula(sprintf(
|
||||
"neural~%s",
|
||||
paste(species_ids_included, collapse = "+")
|
||||
))
|
||||
|
||||
layer1 <- length(species_ids) * 0.66
|
||||
layer2 <- layer1 * 0.66
|
||||
layer3 <- layer2 * 0.66
|
||||
|
||||
nn <- neuralnet::neuralnet(
|
||||
nn_formula,
|
||||
training_data,
|
||||
hidden = c(layer1, layer2, layer3),
|
||||
linear.output = FALSE
|
||||
)
|
||||
|
||||
# Return the resulting scores given by applying the neural network.
|
||||
|
||||
data[, score := neuralnet::compute(nn, data)$net.result]
|
||||
data[, .(gene, score)]
|
||||
}
|
||||
18
R/proximity.R
Normal file
18
R/proximity.R
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
# Score the mean distance of genes to the telomeres across species.
|
||||
#
|
||||
# A score will be given to each gene such that 0.0 corresponds to the maximal
|
||||
# mean distance across all genes and 1.0 corresponds to a distance of 0.
|
||||
proximity <- function(distances, preset) {
|
||||
# Prefilter distances by species and gene.
|
||||
distances <- distances[
|
||||
species %chin% preset$species_ids & gene %chin% preset$gene_ids
|
||||
]
|
||||
|
||||
# Compute the score as described above.
|
||||
|
||||
distances <- distances[, .(mean_distance = mean(distance)), by = "gene"]
|
||||
max_distance <- distances[, max(mean_distance)]
|
||||
distances[, score := 1 - mean_distance / max_distance]
|
||||
|
||||
distances[, .(gene, score)]
|
||||
}
|
||||
63
R/ranking.R
Normal file
63
R/ranking.R
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
#' Rank the results by computing a score.
|
||||
#'
|
||||
#' This function takes the result from [analyze()] and creates a score by
|
||||
#' computing a weighted mean across the different methods' results.
|
||||
#'
|
||||
#' @param results Results from [analyze()].
|
||||
#' @param weights Named list pairing method names with weighting factors.
|
||||
#'
|
||||
#' @result The input data with an additional column containing the score and
|
||||
#' another column containing the rank.
|
||||
#'
|
||||
#' @export
|
||||
ranking <- function(results, weights) {
|
||||
results <- copy(results)
|
||||
results[, score := 0.0]
|
||||
|
||||
for (method in names(weights)) {
|
||||
weighted <- weights[[method]] * results[, ..method]
|
||||
results[, score := score + weighted]
|
||||
}
|
||||
|
||||
# Normalize scores to be between 0.0 and 1.0.
|
||||
results[, score := score / sum(unlist(weights))]
|
||||
|
||||
setorder(results, -score)
|
||||
results[, rank := .I]
|
||||
}
|
||||
|
||||
#' Find the best weights to rank the results.
|
||||
#'
|
||||
#' This function finds the optimal parameters to [ranking()] that result in the
|
||||
#' reference genes ranking particulary high.
|
||||
#'
|
||||
#' @param results Results from [analyze()] or [ranking()].
|
||||
#' @param methods Methods to include in the score.
|
||||
#' @param reference_gene_ids IDs of the reference genes.
|
||||
#'
|
||||
#' @returns Named list pairing method names with their optimal weights.
|
||||
#'
|
||||
#' @export
|
||||
optimize_weights <- function(results, methods, reference_gene_ids) {
|
||||
# Create the named list from the factors vector.
|
||||
weights <- function(factors) {
|
||||
result <- NULL
|
||||
|
||||
mapply(function(method, factor) {
|
||||
result[[method]] <<- factor
|
||||
}, methods, factors)
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
# Compute the mean rank of the reference genes when applying the weights.
|
||||
mean_rank <- function(factors) {
|
||||
data <- ranking(results, weights(factors))
|
||||
data[gene %chin% reference_gene_ids, mean(rank)]
|
||||
}
|
||||
|
||||
factors <- stats::optim(rep(1.0, length(methods)), mean_rank)$par
|
||||
total_weight <- sum(factors)
|
||||
|
||||
weights(factors / total_weight)
|
||||
}
|
||||
3
R/utils.R
Normal file
3
R/utils.R
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
# This is needed to make data.table's symbols available within the package.
|
||||
#' @import data.table
|
||||
NULL
|
||||
Loading…
Add table
Add a link
Reference in a new issue