Reindent code to use just two spaces

This commit is contained in:
Elias Projahn 2022-05-26 12:42:19 +02:00
parent a1e6147466
commit c04b6337e9
17 changed files with 1583 additions and 1582 deletions

View file

@ -18,54 +18,54 @@
#'
#' @export
analyze <- function(preset, progress = NULL, include_results = TRUE) {
if (!inherits(preset, "geposan_preset")) {
stop("Preset is invalid. Use geposan::preset() to create one.")
}
if (!inherits(preset, "geposan_preset")) {
stop("Preset is invalid. Use geposan::preset() to create one.")
}
if (is.null(progress)) {
progress_bar <- progress::progress_bar$new()
progress_bar$update(0.0)
if (is.null(progress)) {
progress_bar <- progress::progress_bar$new()
progress_bar$update(0.0)
progress <- function(progress_value) {
if (!progress_bar$finished) {
progress_bar$update(progress_value)
if (progress_value >= 1.0) {
progress_bar$terminate()
}
}
progress <- function(progress_value) {
if (!progress_bar$finished) {
progress_bar$update(progress_value)
if (progress_value >= 1.0) {
progress_bar$terminate()
}
}
}
}
progress_buffer <- 0.0
method_count <- length(preset$methods)
progress_buffer <- 0.0
method_count <- length(preset$methods)
method_progress <- function(progress_value) {
progress(progress_buffer + progress_value / method_count)
}
method_progress <- function(progress_value) {
progress(progress_buffer + progress_value / method_count)
}
scores <- data.table(gene = preset$gene_id)
results <- list()
scores <- data.table(gene = preset$gene_id)
results <- list()
for (method in preset$methods) {
method_results <- method$func(preset, method_progress)
for (method in preset$methods) {
method_results <- method$func(preset, method_progress)
scores <- merge(scores, method_results$scores, by = "gene")
setnames(scores, "score", method$id)
scores <- merge(scores, method_results$scores, by = "gene")
setnames(scores, "score", method$id)
results <- c(results, list(method_results))
results <- c(results, list(method_results))
progress_buffer <- progress_buffer + 1 / method_count
progress(progress_buffer)
}
progress_buffer <- progress_buffer + 1 / method_count
progress(progress_buffer)
}
structure(
list(
preset = preset,
scores = scores,
results = if (include_results) results else NULL
),
class = "geposan_analysis"
)
structure(
list(
preset = preset,
scores = scores,
results = if (include_results) results else NULL
),
class = "geposan_analysis"
)
}
#' Print an analysis object.
@ -77,14 +77,14 @@ analyze <- function(preset, progress = NULL, include_results = TRUE) {
#'
#' @export
print.geposan_analysis <- function(x, ...) {
cat("geposan analysis:\n\n")
print(x$preset)
cat("geposan analysis:\n\n")
print(x$preset)
cat("\n")
for (result in x$results) {
print(result)
cat("\n")
}
for (result in x$results) {
print(result)
cat("\n")
}
invisible(x)
invisible(x)
}

View file

@ -18,38 +18,38 @@
#'
#' @export
compare <- function(ranking, comparison_gene_ids) {
if (!inherits(ranking, "geposan_ranking")) {
stop("Invalid ranking. Use geposan::ranking().")
}
if (!inherits(ranking, "geposan_ranking")) {
stop("Invalid ranking. Use geposan::ranking().")
}
comparison_ranking <- ranking[gene %chin% comparison_gene_ids]
comparison_ranking <- ranking[gene %chin% comparison_gene_ids]
quantiles <- data.table(
quantile = c("0%", "25%", "50%", "75%", "100%"),
score = stats::quantile(comparison_ranking[, score]),
rank = stats::quantile(
comparison_ranking[, rank],
probs = seq(1, 0, -0.25)
),
percentile = stats::quantile(comparison_ranking[, percentile])
)
quantiles <- data.table(
quantile = c("0%", "25%", "50%", "75%", "100%"),
score = stats::quantile(comparison_ranking[, score]),
rank = stats::quantile(
comparison_ranking[, rank],
probs = seq(1, 0, -0.25)
),
percentile = stats::quantile(comparison_ranking[, percentile])
)
p_value <- stats::wilcox.test(
x = comparison_ranking[, score],
y = ranking[!gene %chin% comparison_gene_ids, score],
alternative = "greater"
)$p.value
p_value <- stats::wilcox.test(
x = comparison_ranking[, score],
y = ranking[!gene %chin% comparison_gene_ids, score],
alternative = "greater"
)$p.value
structure(
list(
quantiles = quantiles,
mean_score = comparison_ranking[, mean(score)],
mean_rank = comparison_ranking[, mean(rank)],
mean_percentile = comparison_ranking[, mean(percentile)],
p_value = p_value
),
class = "geposan_comparison"
)
structure(
list(
quantiles = quantiles,
mean_score = comparison_ranking[, mean(score)],
mean_rank = comparison_ranking[, mean(rank)],
mean_percentile = comparison_ranking[, mean(percentile)],
p_value = p_value
),
class = "geposan_comparison"
)
}
#' S3 method to print a comparison object.
@ -61,32 +61,32 @@ compare <- function(ranking, comparison_gene_ids) {
#'
#' @export
print.geposan_comparison <- function(x, ...) {
cat("geposan comparison:\n\n")
cat("geposan comparison:\n\n")
quantiles_formatted <- x$quantiles[, .(
"Quantile" = quantile,
"Score" = round(score, 3),
"Rank" = rank,
"Percentile" = paste0(
format(round(percentile * 100, 1), nsmall = 1),
"%"
)
)]
quantiles_formatted <- x$quantiles[, .(
"Quantile" = quantile,
"Score" = round(score, 3),
"Rank" = rank,
"Percentile" = paste0(
format(round(percentile * 100, 1), nsmall = 1),
"%"
)
)]
print(quantiles_formatted, row.names = FALSE)
print(quantiles_formatted, row.names = FALSE)
cat(sprintf(
paste0(
"\n Mean score: %.3f",
"\n Mean rank: %.1f",
"\n Mean percentile: %.1f%%",
"\n p-value for better scores: %.4f\n"
),
x$mean_score,
x$mean_rank,
x$mean_percentile * 100,
x$p_value
))
cat(sprintf(
paste0(
"\n Mean score: %.3f",
"\n Mean rank: %.1f",
"\n Mean percentile: %.1f%%",
"\n p-value for better scores: %.4f\n"
),
x$mean_score,
x$mean_rank,
x$mean_percentile * 100,
x$p_value
))
invisible(x)
invisible(x)
}

View file

@ -12,34 +12,34 @@
#'
#' @export
method <- function(id, name, description, func) {
stopifnot(is.character(id) & length(id) == 1)
stopifnot(is.character(name) & length(name) == 1)
stopifnot(is.character(description) & length(description) == 1)
stopifnot(is.function(func))
stopifnot(is.character(id) & length(id) == 1)
stopifnot(is.character(name) & length(name) == 1)
stopifnot(is.character(description) & length(description) == 1)
stopifnot(is.function(func))
structure(
list(
id = id,
name = name,
description = description,
func = func
),
class = "geposan_method"
)
structure(
list(
id = id,
name = name,
description = description,
func = func
),
class = "geposan_method"
)
}
#' Get a list of all available methods.
#'
#' @export
all_methods <- function() {
list(
clustering(),
correlation(),
neural(),
adjacency(),
species_adjacency(),
proximity()
)
list(
clustering(),
correlation(),
neural(),
adjacency(),
species_adjacency(),
proximity()
)
}
#' Print a method object.
@ -51,18 +51,18 @@ all_methods <- function() {
#'
#' @export
print.geposan_method <- function(x, ...) {
cat(sprintf(
paste0(
"geposan method:",
"\n Method ID: %s",
"\n Name: %s",
"\n Description: %s",
"\n"
),
x$id,
x$name,
x$description
))
cat(sprintf(
paste0(
"geposan method:",
"\n Method ID: %s",
"\n Name: %s",
"\n Description: %s",
"\n"
),
x$id,
x$name,
x$description
))
invisible(x)
invisible(x)
}

View file

@ -12,14 +12,14 @@
#'
#' @export
densest <- function(data) {
as.numeric(if (length(data) <= 0) {
NULL
} else if (length(data) == 1) {
data
} else {
density <- stats::density(data)
mean(density$x[density$y == max(density$y)])
})
as.numeric(if (length(data) <= 0) {
NULL
} else if (length(data) == 1) {
data
} else {
density <- stats::density(data)
mean(density$x[density$y == max(density$y)])
})
}
#' Score genes based on their proximity to the reference genes.
@ -40,80 +40,80 @@ densest <- function(data) {
#'
#' @export
adjacency <- function(distance_estimate = densest, summarize = stats::median) {
method(
id = "adjacency",
name = "Adjacency",
description = "Adjacency to reference genes",
function(preset, progress) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids
method(
id = "adjacency",
name = "Adjacency",
description = "Adjacency to reference genes",
function(preset, progress) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids
cached(
"adjacency",
c(
species_ids,
gene_ids,
reference_gene_ids,
distance_estimate,
summarize
),
{ # nolint
# Filter distances by species and gene and summarize each
# gene's distance values using the estimation function.
data <- geposan::distances[
species %chin% species_ids & gene %chin% gene_ids,
.(distance = as.numeric(distance_estimate(distance))),
by = gene
]
cached(
"adjacency",
c(
species_ids,
gene_ids,
reference_gene_ids,
distance_estimate,
summarize
),
{ # nolint
# Filter distances by species and gene and summarize each
# gene's distance values using the estimation function.
data <- geposan::distances[
species %chin% species_ids & gene %chin% gene_ids,
.(distance = as.numeric(distance_estimate(distance))),
by = gene
]
# Compute the absolute value of the difference between the
# estimated distances of each gene to the reference genes.
compute_difference <- function(distance_value,
comparison_ids) {
differences <- data[
gene %chin% comparison_ids,
.(difference = abs(distance_value - distance))
]
# Compute the absolute value of the difference between the
# estimated distances of each gene to the reference genes.
compute_difference <- function(distance_value,
comparison_ids) {
differences <- data[
gene %chin% comparison_ids,
.(difference = abs(distance_value - distance))
]
summarize(differences$difference)
}
summarize(differences$difference)
}
# Compute the differences to the reference genes.
data[
!gene %chin% reference_gene_ids,
difference := compute_difference(
distance,
reference_gene_ids
),
by = gene
]
# Compute the differences to the reference genes.
data[
!gene %chin% reference_gene_ids,
difference := compute_difference(
distance,
reference_gene_ids
),
by = gene
]
progress(0.5)
progress(0.5)
# Exclude the reference gene itself when computing its
# difference.
data[
gene %chin% reference_gene_ids,
difference := compute_difference(
distance,
reference_gene_ids[reference_gene_ids != gene]
),
by = gene
]
# Exclude the reference gene itself when computing its
# difference.
data[
gene %chin% reference_gene_ids,
difference := compute_difference(
distance,
reference_gene_ids[reference_gene_ids != gene]
),
by = gene
]
# Compute the final score by normalizing the difference.
data[, score := 1 - difference / max(difference)]
# Compute the final score by normalizing the difference.
data[, score := 1 - difference / max(difference)]
progress(1.0)
progress(1.0)
result(
method = "adjacency",
scores = data[, .(gene, score)],
details = list(data = data)
)
}
)
result(
method = "adjacency",
scores = data[, .(gene, score)],
details = list(data = data)
)
}
)
)
}
)
}

View file

@ -13,33 +13,33 @@
#' default), the first cluster will weigh 1.0, the second 0.7, the third 0.49
#' etc.
clusteriness <- function(data, span = 100000, weight = 0.7) {
n <- length(data)
n <- length(data)
# Return a score of 0.0 if there is just one or no value at all.
if (n < 2) {
return(0.0)
# Return a score of 0.0 if there is just one or no value at all.
if (n < 2) {
return(0.0)
}
# Cluster the data and compute the cluster sizes.
tree <- stats::hclust(stats::dist(data))
clusters <- stats::cutree(tree, h = span)
cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
# Compute the "clusteriness" score.
score <- 0.0
for (i in seq_along(cluster_sizes)) {
cluster_size <- cluster_sizes[i]
if (cluster_size >= 2) {
cluster_score <- cluster_size / n
score <- score + weight^(i - 1) * cluster_score
}
}
# Cluster the data and compute the cluster sizes.
tree <- stats::hclust(stats::dist(data))
clusters <- stats::cutree(tree, h = span)
cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
# Compute the "clusteriness" score.
score <- 0.0
for (i in seq_along(cluster_sizes)) {
cluster_size <- cluster_sizes[i]
if (cluster_size >= 2) {
cluster_score <- cluster_size / n
score <- score + weight^(i - 1) * cluster_score
}
}
score
score
}
#' Process genes clustering their distance to telomeres.
@ -53,41 +53,41 @@ clusteriness <- function(data, span = 100000, weight = 0.7) {
#'
#' @export
clustering <- function() {
method(
id = "clustering",
name = "Clustering",
description = "Clustering of genes",
function(preset, progress) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
method(
id = "clustering",
name = "Clustering",
description = "Clustering of genes",
function(preset, progress) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
cached("clustering", c(species_ids, gene_ids), {
scores <- data.table(gene = gene_ids)
cached("clustering", c(species_ids, gene_ids), {
scores <- data.table(gene = gene_ids)
# Prefilter the input data by species.
distances <- geposan::distances[species %chin% species_ids]
# Prefilter the input data by species.
distances <- geposan::distances[species %chin% species_ids]
genes_done <- 0
genes_total <- length(gene_ids)
genes_done <- 0
genes_total <- length(gene_ids)
# Perform the cluster analysis for one gene.
compute <- function(gene_id) {
data <- distances[gene == gene_id, distance]
score <- clusteriness(data)
# Perform the cluster analysis for one gene.
compute <- function(gene_id) {
data <- distances[gene == gene_id, distance]
score <- clusteriness(data)
genes_done <<- genes_done + 1
progress(genes_done / genes_total)
genes_done <<- genes_done + 1
progress(genes_done / genes_total)
score
}
scores[, score := compute(gene), by = gene]
result(
method = "clustering",
scores = scores
)
})
score
}
)
scores[, score := compute(gene), by = gene]
result(
method = "clustering",
scores = scores
)
})
}
)
}

View file

@ -8,96 +8,97 @@
#'
#' @export
correlation <- function(summarize = stats::median) {
method(
id = "correlation",
name = "Correlation",
description = "Correlation with reference genes",
function(preset, progress) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids
method(
id = "correlation",
name = "Correlation",
description = "Correlation with reference genes",
function(preset, progress) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids
cached(
"correlation",
c(species_ids, gene_ids, reference_gene_ids, summarize),
{ # nolint
# Prefilter distances by species.
distances <- geposan::distances[species %chin% species_ids]
cached(
"correlation",
c(species_ids, gene_ids, reference_gene_ids, summarize),
{ # nolint
# Prefilter distances by species.
distances <- geposan::distances[species %chin% species_ids]
# Tranform data to get species as rows and genes as columns.
# We construct columns per species, because it requires
# fewer iterations, and transpose the table afterwards.
# Tranform data to get species as rows and genes as columns.
# We construct columns per species, because it requires
# fewer iterations, and transpose the table afterwards.
data <- data.table(gene = gene_ids)
data <- data.table(gene = gene_ids)
# Make a column containing distance data for each species.
for (species_id in species_ids) {
species_data <- distances[
species == species_id,
.(gene, distance)
]
# Make a column containing distance data for each species.
for (species_id in species_ids) {
species_data <- distances[
species == species_id,
.(gene, distance)
]
data <- merge(data, species_data, all.x = TRUE)
setnames(data, "distance", species_id)
}
data <- merge(data, species_data, all.x = TRUE)
setnames(data, "distance", species_id)
}
# Transpose to the desired format.
data <- transpose(data, make.names = "gene")
# Transpose to the desired format.
data <- transpose(data, make.names = "gene")
progress(0.33)
progress(0.33)
# Take the reference data.
reference_data <- data[, ..reference_gene_ids]
# Take the reference data.
reference_data <- data[, ..reference_gene_ids]
# Perform the correlation between all possible pairs.
results <- stats::cor(
data[, ..gene_ids],
reference_data,
use = "pairwise.complete.obs",
method = "spearman"
)
# Perform the correlation between all possible pairs.
results <- stats::cor(
data[, ..gene_ids],
reference_data,
use = "pairwise.complete.obs",
method = "spearman"
)
results <- data.table(results, keep.rownames = TRUE)
setnames(results, "rn", "gene")
results <- data.table(results, keep.rownames = TRUE)
setnames(results, "rn", "gene")
# Remove correlations between the reference genes
# themselves.
for (reference_gene_id in reference_gene_ids) {
column <- quote(reference_gene_id)
results[gene == reference_gene_id, eval(column) := NA]
}
# Remove correlations between the reference genes
# themselves.
for (reference_gene_id in reference_gene_ids) {
column <- quote(reference_gene_id)
results[gene == reference_gene_id, eval(column) := NA]
}
progress(0.66)
progress(0.66)
# Combine the correlation coefficients.
results[,
max_correlation := as.double(summarize(stats::na.omit(
# Convert the data.table subset into a
# vector to get the correct na.omit
# behavior.
as.matrix(.SD)[1, ]
))),
.SDcols = reference_gene_ids,
by = gene
]
# Combine the correlation coefficients.
results[,
max_correlation := as.double(summarize(stats::na.omit(
# Convert the data.table subset into a
# vector to get the correct na.omit
# behavior.
as.matrix(.SD)[1, ]
))),
.SDcols = reference_gene_ids,
by = gene
]
# Normalize scores.
results[,
score := (max_correlation - min(max_correlation)) /
(max(max_correlation) - min(max_correlation))
]
# Normalize scores.
results[
,
score := (max_correlation - min(max_correlation)) /
(max(max_correlation) - min(max_correlation))
]
# Normalize scores.
# Normalize scores.
results[, .(gene, score)]
results[, .(gene, score)]
result(
method = "correlation",
scores = results[, .(gene, score)],
details = list(all_correlations = results)
)
}
)
result(
method = "correlation",
scores = results[, .(gene, score)],
details = list(all_correlations = results)
)
}
)
)
}
)
}

View file

@ -12,244 +12,244 @@
#'
#' @export
neural <- function(seed = 180199, n_models = 5) {
method(
id = "neural",
name = "Neural",
description = "Assessment by neural network",
function(preset, progress) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids
method(
id = "neural",
name = "Neural",
description = "Assessment by neural network",
function(preset, progress) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids
cached(
"neural",
c(species_ids, gene_ids, reference_gene_ids, seed, n_models),
{ # nolint
reference_count <- length(reference_gene_ids)
stopifnot(n_models %in% 2:reference_count)
cached(
"neural",
c(species_ids, gene_ids, reference_gene_ids, seed, n_models),
{ # nolint
reference_count <- length(reference_gene_ids)
stopifnot(n_models %in% 2:reference_count)
# Make results reproducible.
tensorflow::set_random_seed(seed)
# Make results reproducible.
tensorflow::set_random_seed(seed)
# Step 1: Prepare input data.
# ---------------------------
# Step 1: Prepare input data.
# ---------------------------
# Prefilter distances by species.
distances <- geposan::distances[species %chin% species_ids]
# Prefilter distances by species.
distances <- geposan::distances[species %chin% species_ids]
# Input data for the network. This contains the gene ID as
# an identifier as well as the per-species gene distances as
# input variables.
data <- data.table(gene = gene_ids)
# Input data for the network. This contains the gene ID as
# an identifier as well as the per-species gene distances as
# input variables.
data <- data.table(gene = gene_ids)
# Buffer to keep track of the names of the input variables.
input_vars <- NULL
# Buffer to keep track of the names of the input variables.
input_vars <- NULL
# Make a columns containing positions and distances for each
# species.
for (species_id in species_ids) {
species_data <- distances[
species == species_id,
.(gene, distance)
]
# Make a columns containing positions and distances for each
# species.
for (species_id in species_ids) {
species_data <- distances[
species == species_id,
.(gene, distance)
]
# Only include species with at least 25% known values.
# As positions and distances always coexist, we don't
# loose any data here.
# Only include species with at least 25% known values.
# As positions and distances always coexist, we don't
# loose any data here.
species_data <- stats::na.omit(species_data)
species_data <- stats::na.omit(species_data)
if (nrow(species_data) >= 0.25 * length(gene_ids)) {
data <- merge(data, species_data, all.x = TRUE)
if (nrow(species_data) >= 0.25 * length(gene_ids)) {
data <- merge(data, species_data, all.x = TRUE)
# Replace missing data with mean values. The neural
# network can't handle NAs in a meaningful way.
# Choosing extreme values here would result in
# heavily biased results. Therefore, the mean value
# is chosen as a compromise. However, this will of
# course lessen the significance of the results.
# Replace missing data with mean values. The neural
# network can't handle NAs in a meaningful way.
# Choosing extreme values here would result in
# heavily biased results. Therefore, the mean value
# is chosen as a compromise. However, this will of
# course lessen the significance of the results.
mean_distance <- round(
species_data[, mean(distance)]
)
mean_distance <- round(
species_data[, mean(distance)]
)
data[is.na(distance), distance := mean_distance]
data[is.na(distance), distance := mean_distance]
# Name the new column after the species.
setnames(data, "distance", species_id)
# Name the new column after the species.
setnames(data, "distance", species_id)
# Add the input variable to the buffer.
input_vars <- c(input_vars, species_id)
}
}
# Add the input variable to the buffer.
input_vars <- c(input_vars, species_id)
}
}
progress(0.1)
progress(0.1)
# Step 2: Prepare training data.
# ------------------------------
# Step 2: Prepare training data.
# ------------------------------
# Take out the reference data.
# Take out the reference data.
reference_data <- data[gene %chin% reference_gene_ids]
reference_data[, score := 1.0]
reference_data <- data[gene %chin% reference_gene_ids]
reference_data[, score := 1.0]
# Take out random samples from the remaining genes. This is
# another compromise with a negative impact on
# significance. We assume that a random gene is not likely
# to match the reference genes.
# Take out random samples from the remaining genes. This is
# another compromise with a negative impact on
# significance. We assume that a random gene is not likely
# to match the reference genes.
without_reference_data <- data[
!gene %chin% reference_gene_ids
]
without_reference_data <- data[
!gene %chin% reference_gene_ids
]
control_data <- without_reference_data[
sample(
nrow(without_reference_data),
reference_count
)
]
control_data[, score := 0.0]
# Split the training data into random sets to have
# validation data for each model.
# Scramble the source tables.
reference_data <- reference_data[sample(reference_count)]
control_data <- control_data[sample(reference_count)]
networks <- list()
indices <- seq_len(reference_count)
indices_split <- split(indices, indices %% n_models)
for (i in seq_len(n_models)) {
training_data <- rbindlist(list(
reference_data[!indices_split[[i]]],
control_data[!indices_split[[i]]]
))
validation_data <- rbindlist(list(
reference_data[indices_split[[i]]],
control_data[indices_split[[i]]]
))
networks[[i]] <- list(
training_data = training_data,
validation_data = validation_data
)
}
# Step 3: Create, train and apply neural network.
# -----------------------------------------------
# Layers for the neural network.
input_layer <- length(input_vars)
layer1 <- input_layer
layer2 <- 0.5 * input_layer
layer3 <- 0.5 * layer2
# Convert data to matrix and normalize it.
to_matrix <- function(data) {
data_matrix <- as.matrix(data[, ..input_vars])
colnames(data_matrix) <- NULL
keras::normalize(data_matrix)
}
data_matrix <- to_matrix(data)
output_vars <- NULL
for (i in seq_along(networks)) {
# Create a new model for each training session, because
# the model would keep its state across training
# sessions otherwise.
model <- keras::keras_model_sequential() |>
keras::layer_dense(
units = layer1,
activation = "relu",
input_shape = input_layer,
) |>
keras::layer_dense(
units = layer2,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = layer3,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = 1,
activation = "sigmoid"
) |>
keras::compile(
loss = keras::loss_mean_absolute_error(),
optimizer = keras::optimizer_adam()
)
# Train the model.
network <- networks[[i]]
training_data <- network$training_data
training_matrix <- to_matrix(training_data)
validation_data <- network$validation_data
validation_matrix <- to_matrix(validation_data)
fit <- keras::fit(
model,
x = training_matrix,
y = training_data$score,
validation_data = list(
x_val = validation_matrix,
y_val = validation_data$score
),
epochs = 500,
verbose = FALSE
)
# Apply the model.
data[, new_score := stats::predict(model, data_matrix)]
# Remove the values of the training data itself.
data[gene %chin% training_data$gene, new_score := NA]
output_var <- sprintf("score%i", i)
setnames(data, "new_score", output_var)
output_vars <- c(output_vars, output_var)
# Store the details.
networks[[i]]$model <- keras::serialize_model(model)
networks[[i]]$fit <- fit
progress(0.1 + i * (0.9 / n_models))
}
# Compute the final score as the mean score.
data[,
score := mean(as.numeric(.SD), na.rm = TRUE),
.SDcols = output_vars,
by = gene
]
progress(1.0)
result(
method = "neural",
scores = data[, .(gene, score)],
details = list(
seed = seed,
n_models = n_models,
all_results = data[, !..input_vars],
networks = networks
)
)
}
control_data <- without_reference_data[
sample(
nrow(without_reference_data),
reference_count
)
]
control_data[, score := 0.0]
# Split the training data into random sets to have
# validation data for each model.
# Scramble the source tables.
reference_data <- reference_data[sample(reference_count)]
control_data <- control_data[sample(reference_count)]
networks <- list()
indices <- seq_len(reference_count)
indices_split <- split(indices, indices %% n_models)
for (i in seq_len(n_models)) {
training_data <- rbindlist(list(
reference_data[!indices_split[[i]]],
control_data[!indices_split[[i]]]
))
validation_data <- rbindlist(list(
reference_data[indices_split[[i]]],
control_data[indices_split[[i]]]
))
networks[[i]] <- list(
training_data = training_data,
validation_data = validation_data
)
}
# Step 3: Create, train and apply neural network.
# -----------------------------------------------
# Layers for the neural network.
input_layer <- length(input_vars)
layer1 <- input_layer
layer2 <- 0.5 * input_layer
layer3 <- 0.5 * layer2
# Convert data to matrix and normalize it.
to_matrix <- function(data) {
data_matrix <- as.matrix(data[, ..input_vars])
colnames(data_matrix) <- NULL
keras::normalize(data_matrix)
}
data_matrix <- to_matrix(data)
output_vars <- NULL
for (i in seq_along(networks)) {
# Create a new model for each training session, because
# the model would keep its state across training
# sessions otherwise.
model <- keras::keras_model_sequential() |>
keras::layer_dense(
units = layer1,
activation = "relu",
input_shape = input_layer,
) |>
keras::layer_dense(
units = layer2,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = layer3,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = 1,
activation = "sigmoid"
) |>
keras::compile(
loss = keras::loss_mean_absolute_error(),
optimizer = keras::optimizer_adam()
)
# Train the model.
network <- networks[[i]]
training_data <- network$training_data
training_matrix <- to_matrix(training_data)
validation_data <- network$validation_data
validation_matrix <- to_matrix(validation_data)
fit <- keras::fit(
model,
x = training_matrix,
y = training_data$score,
validation_data = list(
x_val = validation_matrix,
y_val = validation_data$score
),
epochs = 500,
verbose = FALSE
)
# Apply the model.
data[, new_score := stats::predict(model, data_matrix)]
# Remove the values of the training data itself.
data[gene %chin% training_data$gene, new_score := NA]
output_var <- sprintf("score%i", i)
setnames(data, "new_score", output_var)
output_vars <- c(output_vars, output_var)
# Store the details.
networks[[i]]$model <- keras::serialize_model(model)
networks[[i]]$fit <- fit
progress(0.1 + i * (0.9 / n_models))
}
# Compute the final score as the mean score.
data[,
score := mean(as.numeric(.SD), na.rm = TRUE),
.SDcols = output_vars,
by = gene
]
progress(1.0)
result(
method = "neural",
scores = data[, .(gene, score)],
details = list(
seed = seed,
n_models = n_models,
all_results = data[, !..input_vars],
networks = networks
)
)
}
)
)
}
)
}

View file

@ -11,38 +11,38 @@
#'
#' @export
proximity <- function(summarize = stats::median) {
method(
id = "proximity",
name = "Proximity",
description = "Proximity to telomeres",
function(preset, progress) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
method(
id = "proximity",
name = "Proximity",
description = "Proximity to telomeres",
function(preset, progress) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
cached("proximity", c(species_ids, gene_ids), {
# Prefilter distances by species and gene.
data <- geposan::distances[
species %chin% preset$species_ids &
gene %chin% preset$gene_ids
]
cached("proximity", c(species_ids, gene_ids), {
# Prefilter distances by species and gene.
data <- geposan::distances[
species %chin% preset$species_ids &
gene %chin% preset$gene_ids
]
# Compute the score as described above.
data <- data[,
.(combined_distance = as.double(summarize(distance))),
by = "gene"
]
# Compute the score as described above.
data <- data[,
.(combined_distance = as.double(summarize(distance))),
by = "gene"
]
# Normalize scores.
data[, score := 1 - combined_distance / max(combined_distance)]
# Normalize scores.
data[, score := 1 - combined_distance / max(combined_distance)]
progress(1.0)
progress(1.0)
result(
method = "proximity",
scores = data[, .(gene, score)],
details = list(data = data)
)
})
}
)
result(
method = "proximity",
scores = data[, .(gene, score)],
details = list(data = data)
)
})
}
)
}

View file

@ -15,136 +15,136 @@
#' @export
species_adjacency <- function(distance_estimate = stats::median,
summarize = stats::median) {
method(
id = "species_adjacency",
name = "Species adj.",
description = "Species adjacency",
function(preset, progress) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids
method(
id = "species_adjacency",
name = "Species adj.",
description = "Species adjacency",
function(preset, progress) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids
cached(
"species_adjacency",
c(
species_ids,
gene_ids,
reference_gene_ids,
distance_estimate,
summarize
),
{ # nolint
# Prefilter distances.
data <- geposan::distances[
species %chin% species_ids & gene %chin% gene_ids
]
cached(
"species_adjacency",
c(
species_ids,
gene_ids,
reference_gene_ids,
distance_estimate,
summarize
),
{ # nolint
# Prefilter distances.
data <- geposan::distances[
species %chin% species_ids & gene %chin% gene_ids
]
progress_state <- 0.0
progress_step <- 0.9 / length(species_ids)
progress_state <- 0.0
progress_step <- 0.9 / length(species_ids)
# Iterate through all species and find the distance
# estimates within that species.
for (species_id in species_ids) {
# For all genes, compute the distance to one reference
# gene at a time in one go.
for (reference_gene_id in reference_gene_ids) {
comparison_distance <- data[
species == species_id &
gene == reference_gene_id,
distance
]
# Iterate through all species and find the distance
# estimates within that species.
for (species_id in species_ids) {
# For all genes, compute the distance to one reference
# gene at a time in one go.
for (reference_gene_id in reference_gene_ids) {
comparison_distance <- data[
species == species_id &
gene == reference_gene_id,
distance
]
column <- quote(reference_gene_id)
column <- quote(reference_gene_id)
if (length(comparison_distance) != 1) {
# If we don't have a comparison distance, we
# can't compute a difference. This happens, if
# the species doesn't have the reference gene.
data[
species == species_id &
gene %chin% gene_ids,
eval(column) := NA_integer_
]
} else {
data[
species == species_id &
gene %chin% gene_ids,
eval(column) :=
abs(distance - comparison_distance)
]
}
}
if (length(comparison_distance) != 1) {
# If we don't have a comparison distance, we
# can't compute a difference. This happens, if
# the species doesn't have the reference gene.
data[
species == species_id &
gene %chin% gene_ids,
eval(column) := NA_integer_
]
} else {
data[
species == species_id &
gene %chin% gene_ids,
eval(column) :=
abs(distance - comparison_distance)
]
}
}
# Combine the distances to the different reference genes
# into one value using the provided function.
data[
species == species_id &
gene %chin% gene_ids,
combined_distance := as.numeric(
distance_estimate(stats::na.omit(
# Convert the data.table subset into a
# vector to get the correct na.omit
# behavior.
as.matrix(.SD)[1, ]
))
),
.SDcols = reference_gene_ids,
by = gene
]
# Combine the distances to the different reference genes
# into one value using the provided function.
data[
species == species_id &
gene %chin% gene_ids,
combined_distance := as.numeric(
distance_estimate(stats::na.omit(
# Convert the data.table subset into a
# vector to get the correct na.omit
# behavior.
as.matrix(.SD)[1, ]
))
),
.SDcols = reference_gene_ids,
by = gene
]
progress_state <- progress_state + progress_step
progress(progress_state)
}
progress_state <- progress_state + progress_step
progress(progress_state)
}
progress(0.9)
progress(0.9)
# Remove the distances between the reference genes.
for (reference_gene_id in reference_gene_ids) {
column <- quote(reference_gene_id)
data[gene == reference_gene_id, eval(column) := NA]
}
# Remove the distances between the reference genes.
for (reference_gene_id in reference_gene_ids) {
column <- quote(reference_gene_id)
data[gene == reference_gene_id, eval(column) := NA]
}
# Recompute the combined distance for the reference genes.
data[
gene %chin% reference_gene_ids,
combined_distance := as.numeric(
distance_estimate(stats::na.omit(
as.matrix(.SD)[1, ])
)
),
.SDcols = reference_gene_ids,
by = list(species, gene)
]
# Recompute the combined distance for the reference genes.
data[
gene %chin% reference_gene_ids,
combined_distance := as.numeric(
distance_estimate(stats::na.omit(
as.matrix(.SD)[1, ]
))
),
.SDcols = reference_gene_ids,
by = list(species, gene)
]
# Combine the distances into one value.
results <- data[,
.(
summarized_distances = as.numeric(
summarize(stats::na.omit(combined_distance))
)
),
by = gene
]
# Combine the distances into one value.
results <- data[,
.(
summarized_distances = as.numeric(
summarize(stats::na.omit(combined_distance))
)
),
by = gene
]
# Compute the final score by normalizing the difference.
results[
,
score := 1 - summarized_distances /
max(summarized_distances)
]
# Compute the final score by normalizing the difference.
results[
,
score := 1 - summarized_distances /
max(summarized_distances)
]
progress(1.0)
progress(1.0)
result(
method = "species_adjacency",
scores = results[, .(gene, score)],
details = list(
data = data,
results = results
)
)
}
result(
method = "species_adjacency",
scores = results[, .(gene, score)],
details = list(
data = data,
results = results
)
)
}
)
)
}
)
}

674
R/plots.R
View file

@ -9,7 +9,7 @@ base_color_transparent <- function() "#1964bf80"
#' Color palette for gene sets.
#' @noRd
gene_set_color <- function(index) {
c("#FF7F00", "#4DAF4A", "#984EA3")[index]
c("#FF7F00", "#4DAF4A", "#984EA3")[index]
}
#' Plot gene positions.
@ -22,74 +22,74 @@ gene_set_color <- function(index) {
#'
#' @export
plot_positions <- function(species_ids, gene_sets) {
if (!requireNamespace("plotly", quietly = TRUE)) {
stop("Please install \"plotly\" to use this function.")
}
if (!requireNamespace("plotly", quietly = TRUE)) {
stop("Please install \"plotly\" to use this function.")
}
# Prefilter data by species.
data <- geposan::distances[species %chin% species_ids]
# Prefilter data by species.
data <- geposan::distances[species %chin% species_ids]
species_max_distance <- data[,
.(max_distance = max(distance)),
by = species
]
species_max_distance <- data[,
.(max_distance = max(distance)),
by = species
]
# Prefilter species.
species <- geposan::species[id %chin% species_ids]
# Prefilter species.
species <- geposan::species[id %chin% species_ids]
plot <- plotly::plot_ly() |>
plotly::layout(
xaxis = list(
title = "Species",
tickvals = species$id,
ticktext = species$name
),
yaxis = list(title = "Distance to telomeres [Bp]"),
bargap = 0.9
) |>
plotly::add_bars(
data = species_max_distance,
x = ~species,
y = ~max_distance,
name = "All genes",
marker = list(color = base_color())
plot <- plotly::plot_ly() |>
plotly::layout(
xaxis = list(
title = "Species",
tickvals = species$id,
ticktext = species$name
),
yaxis = list(title = "Distance to telomeres [Bp]"),
bargap = 0.9
) |>
plotly::add_bars(
data = species_max_distance,
x = ~species,
y = ~max_distance,
name = "All genes",
marker = list(color = base_color())
)
if (length(gene_sets) > 0) {
# Include gene information which will be used for labeling
gene_set_data <- merge(
data[gene %chin% unlist(gene_sets)],
geposan::genes,
by.x = "gene",
by.y = "id"
)
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set <- gene_sets[[gene_set_name]]
plot <- plot |> plotly::add_markers(
data = gene_set_data[gene %chin% gene_set],
x = ~species,
y = ~distance,
name = gene_set_name,
text = ~ glue::glue(
"<b>{name}</b><br>",
"{round(distance / 1000000, digits = 2)} MBp"
),
hoverinfo = "text",
marker = list(
size = 10,
color = gene_set_color(index)
)
)
if (length(gene_sets) > 0) {
# Include gene information which will be used for labeling
gene_set_data <- merge(
data[gene %chin% unlist(gene_sets)],
geposan::genes,
by.x = "gene",
by.y = "id"
)
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set <- gene_sets[[gene_set_name]]
plot <- plot |> plotly::add_markers(
data = gene_set_data[gene %chin% gene_set],
x = ~species,
y = ~distance,
name = gene_set_name,
text = ~ glue::glue(
"<b>{name}</b><br>",
"{round(distance / 1000000, digits = 2)} MBp"
),
hoverinfo = "text",
marker = list(
size = 10,
color = gene_set_color(index)
)
)
index <- index + 1
}
index <- index + 1
}
}
plot
plot
}
@ -108,75 +108,75 @@ plot_positions <- function(species_ids, gene_sets) {
#'
#' @export
plot_rankings <- function(rankings, gene_sets) {
if (!requireNamespace("plotly", quietly = TRUE)) {
stop("Please install \"plotly\" to use this function.")
}
if (!requireNamespace("plotly", quietly = TRUE)) {
stop("Please install \"plotly\" to use this function.")
}
plot <- plotly::plot_ly() |>
plotly::layout(
xaxis = list(tickvals = names(rankings)),
yaxis = list(title = "Score")
plot <- plotly::plot_ly() |>
plotly::layout(
xaxis = list(tickvals = names(rankings)),
yaxis = list(title = "Score")
)
is_first <- TRUE
for (ranking_name in names(rankings)) {
ranking <- rankings[[ranking_name]]
plot <- plot |> plotly::add_trace(
data = ranking,
x = ranking_name,
y = ~score,
name = "All genes",
type = "violin",
spanmode = "hard",
points = FALSE,
showlegend = is_first,
hoverinfo = "skip",
line = list(color = base_color()),
fillcolor = base_color_transparent()
)
if (length(gene_sets) > 0) {
gene_set_data <- merge(
ranking[gene %chin% unlist(gene_sets)],
geposan::genes,
by.x = "gene",
by.y = "id"
)
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set <- gene_sets[[gene_set_name]]
plot <- plot |> plotly::add_markers(
data = gene_set_data[gene %chin% gene_set],
x = ranking_name,
y = ~score,
name = gene_set_name,
text = ~ glue::glue(
"<b>{name}</b><br>",
"Score: {round(score, digits = 2)}<br>",
"Rank: {rank}<br>",
"Percentile: {round(percentile * 100, digits = 2)}%"
),
hoverinfo = "text",
showlegend = is_first,
marker = list(
size = 10,
color = gene_set_color(index)
)
)
is_first <- TRUE
for (ranking_name in names(rankings)) {
ranking <- rankings[[ranking_name]]
plot <- plot |> plotly::add_trace(
data = ranking,
x = ranking_name,
y = ~score,
name = "All genes",
type = "violin",
spanmode = "hard",
points = FALSE,
showlegend = is_first,
hoverinfo = "skip",
line = list(color = base_color()),
fillcolor = base_color_transparent()
)
if (length(gene_sets) > 0) {
gene_set_data <- merge(
ranking[gene %chin% unlist(gene_sets)],
geposan::genes,
by.x = "gene",
by.y = "id"
)
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set <- gene_sets[[gene_set_name]]
plot <- plot |> plotly::add_markers(
data = gene_set_data[gene %chin% gene_set],
x = ranking_name,
y = ~score,
name = gene_set_name,
text = ~ glue::glue(
"<b>{name}</b><br>",
"Score: {round(score, digits = 2)}<br>",
"Rank: {rank}<br>",
"Percentile: {round(percentile * 100, digits = 2)}%"
),
hoverinfo = "text",
showlegend = is_first,
marker = list(
size = 10,
color = gene_set_color(index)
)
)
index <- index + 1
}
}
is_first <- FALSE
index <- index + 1
}
}
plot
is_first <- FALSE
}
plot
}
@ -194,88 +194,88 @@ plot_rankings <- function(rankings, gene_sets) {
#'
#' @export
plot_scores <- function(ranking, gene_sets = NULL, max_rank = NULL) {
if (!requireNamespace("plotly", quietly = TRUE)) {
stop("Please install \"plotly\" to use this function.")
}
if (!requireNamespace("plotly", quietly = TRUE)) {
stop("Please install \"plotly\" to use this function.")
}
# To speed up rendering, don't show every single gene.
n_ranks <- nrow(ranking)
sample_ranking <- ranking[seq(1, n_ranks, 5)]
# To speed up rendering, don't show every single gene.
n_ranks <- nrow(ranking)
sample_ranking <- ranking[seq(1, n_ranks, 5)]
plot <- plotly::plot_ly() |>
plotly::add_lines(
data = sample_ranking,
x = ~percentile,
y = ~score,
name = "All genes",
hoverinfo = "skip",
line = list(width = 4, color = base_color())
) |>
plotly::layout(
xaxis = list(
title = "Percentile",
tickformat = ".0%"
),
yaxis = list(title = "Score")
plot <- plotly::plot_ly() |>
plotly::add_lines(
data = sample_ranking,
x = ~percentile,
y = ~score,
name = "All genes",
hoverinfo = "skip",
line = list(width = 4, color = base_color())
) |>
plotly::layout(
xaxis = list(
title = "Percentile",
tickformat = ".0%"
),
yaxis = list(title = "Score")
)
if (length(gene_sets) > 0) {
# Include gene information which will be used for labeling
gene_set_data <- merge(
ranking[gene %chin% unlist(gene_sets)],
geposan::genes,
by.x = "gene",
by.y = "id"
)
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set <- gene_sets[[gene_set_name]]
plot <- plot |> plotly::add_markers(
data = gene_set_data[gene %chin% gene_set],
x = ~percentile,
y = ~score,
name = gene_set_name,
text = ~ glue::glue(
"<b>{name}</b><br>",
"Score: {round(score, digits = 2)}<br>",
"Rank: {rank}<br>",
"Percentile: {round(percentile * 100, digits = 2)}%"
),
hoverinfo = "text",
marker = list(
size = 10,
color = gene_set_color(index)
)
)
if (length(gene_sets) > 0) {
# Include gene information which will be used for labeling
gene_set_data <- merge(
ranking[gene %chin% unlist(gene_sets)],
geposan::genes,
by.x = "gene",
by.y = "id"
index <- index + 1
}
}
if (!is.null(max_rank)) {
first_not_included_rank <- max_rank + 1
last_rank <- ranking[, .N]
if (first_not_included_rank <= last_rank) {
plot <- plot |> plotly::layout(
shapes = list(
type = "rect",
fillcolor = "black",
opacity = 0.1,
x0 = 1 - first_not_included_rank / n_ranks,
x1 = 1 - last_rank / n_ranks,
y0 = 0.0,
y1 = 1.0
)
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set <- gene_sets[[gene_set_name]]
plot <- plot |> plotly::add_markers(
data = gene_set_data[gene %chin% gene_set],
x = ~percentile,
y = ~score,
name = gene_set_name,
text = ~ glue::glue(
"<b>{name}</b><br>",
"Score: {round(score, digits = 2)}<br>",
"Rank: {rank}<br>",
"Percentile: {round(percentile * 100, digits = 2)}%"
),
hoverinfo = "text",
marker = list(
size = 10,
color = gene_set_color(index)
)
)
index <- index + 1
}
)
}
}
if (!is.null(max_rank)) {
first_not_included_rank <- max_rank + 1
last_rank <- ranking[, .N]
if (first_not_included_rank <= last_rank) {
plot <- plot |> plotly::layout(
shapes = list(
type = "rect",
fillcolor = "black",
opacity = 0.1,
x0 = 1 - first_not_included_rank / n_ranks,
x1 = 1 - last_rank / n_ranks,
y0 = 0.0,
y1 = 1.0
)
)
}
}
plot
plot
}
#' Visualize a ranking by comparing gene sets in a boxplot.
@ -290,44 +290,44 @@ plot_scores <- function(ranking, gene_sets = NULL, max_rank = NULL) {
#'
#' @export
plot_boxplot <- function(ranking, gene_sets = NULL) {
if (!requireNamespace("plotly", quietly = TRUE)) {
stop("Please install \"plotly\" to use this function.")
if (!requireNamespace("plotly", quietly = TRUE)) {
stop("Please install \"plotly\" to use this function.")
}
plot <- plotly::plot_ly() |>
plotly::add_boxplot(
data = ranking,
x = "All genes",
y = ~score,
name = "All genes",
showlegend = FALSE,
line = list(color = base_color())
) |>
plotly::layout(
xaxis = list(tickvals = c("All genes", names(gene_sets))),
yaxis = list(title = "Score")
)
if (length(gene_sets) > 0) {
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set <- gene_sets[[gene_set_name]]
plot <- plot |> plotly::add_boxplot(
data = ranking[gene %chin% gene_set],
x = gene_set_name,
y = ~score,
name = gene_set_name,
showlegend = FALSE,
line = list(color = gene_set_color(index))
)
index <- index + 1
}
}
plot <- plotly::plot_ly() |>
plotly::add_boxplot(
data = ranking,
x = "All genes",
y = ~score,
name = "All genes",
showlegend = FALSE,
line = list(color = base_color())
) |>
plotly::layout(
xaxis = list(tickvals = c("All genes", names(gene_sets))),
yaxis = list(title = "Score")
)
if (length(gene_sets) > 0) {
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set <- gene_sets[[gene_set_name]]
plot <- plot |> plotly::add_boxplot(
data = ranking[gene %chin% gene_set],
x = gene_set_name,
y = ~score,
name = gene_set_name,
showlegend = FALSE,
line = list(color = gene_set_color(index))
)
index <- index + 1
}
}
plot
plot
}
#' Show the distribution of scores across chromosomes.
@ -340,46 +340,46 @@ plot_boxplot <- function(ranking, gene_sets = NULL) {
#'
#' @export
plot_chromosomes <- function(ranking) {
if (!requireNamespace("plotly", quietly = TRUE)) {
stop("Please install \"plotly\" to use this function.")
}
if (!requireNamespace("plotly", quietly = TRUE)) {
stop("Please install \"plotly\" to use this function.")
}
data <- merge(ranking, geposan::genes, by.x = "gene", by.y = "id")
data <- data[, .(score = mean(score)), by = "chromosome"]
data <- merge(ranking, geposan::genes, by.x = "gene", by.y = "id")
data <- data[, .(score = mean(score)), by = "chromosome"]
# Get an orderable integer from a chromosome name.
chromosome_index <- function(chromosome) {
index <- suppressWarnings(as.integer(chromosome))
# Get an orderable integer from a chromosome name.
chromosome_index <- function(chromosome) {
index <- suppressWarnings(as.integer(chromosome))
ifelse(
!is.na(index),
index,
ifelse(
chromosome == "X",
998,
999
)
)
}
ifelse(
!is.na(index),
index,
ifelse(
chromosome == "X",
998,
999
)
)
}
data[, index := chromosome_index(chromosome)]
setorder(data, "index")
data[, index := chromosome_index(chromosome)]
setorder(data, "index")
plotly::plot_ly(
data = data,
x = ~chromosome,
y = ~score,
type = "bar",
marker = list(color = base_color())
) |>
plotly::layout(
xaxis = list(
title = "Chromosome",
categoryorder = "array",
categoryarray = ~chromosome
),
yaxis = list(title = "Mean score")
)
plotly::plot_ly(
data = data,
x = ~chromosome,
y = ~score,
type = "bar",
marker = list(color = base_color())
) |>
plotly::layout(
xaxis = list(
title = "Chromosome",
categoryorder = "array",
categoryarray = ~chromosome
),
yaxis = list(title = "Mean score")
)
}
#' Plot scores in relation to chromosomal position of genes.
@ -398,74 +398,74 @@ plot_chromosomes <- function(ranking) {
plot_scores_by_position <- function(ranking,
chromosome_name = NULL,
gene_sets = NULL) {
if (!requireNamespace("plotly", quietly = TRUE)) {
stop("Please install \"plotly\" to use this function.")
}
if (!requireNamespace("plotly", quietly = TRUE)) {
stop("Please install \"plotly\" to use this function.")
}
distance_data <- if (!is.null(chromosome_name)) {
chromosome_name_ <- chromosome_name
geposan::distances[
species == "hsapiens" &
chromosome_name == chromosome_name_
]
} else {
geposan::distances[species == "hsapiens"]
}
distance_data <- if (!is.null(chromosome_name)) {
chromosome_name_ <- chromosome_name
geposan::distances[
species == "hsapiens" &
chromosome_name == chromosome_name_
]
} else {
geposan::distances[species == "hsapiens"]
}
data <- merge(ranking, distance_data, by = "gene")
data <- merge(ranking, distance_data, by = "gene")
data <- merge(
data,
geposan::genes,
by.x = "gene",
by.y = "id"
data <- merge(
data,
geposan::genes,
by.x = "gene",
by.y = "id"
)
data[, `:=`(gene_set = "All genes", color = base_color())]
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set_genes <- gene_sets[[gene_set_name]]
data[
gene %chin% gene_set_genes,
`:=`(
gene_set = gene_set_name,
color = gene_set_color(index)
)
]
index <- index + 1
}
# Use distances instead of positions in case all chromosomes are included.
if (is.null(chromosome_name)) {
data[, x := distance]
} else {
data[, x := start_position]
}
plotly::plot_ly() |>
plotly::add_markers(
data = data,
x = ~x,
y = ~score,
name = ~gene_set,
text = ~ glue::glue(
"<b>{name}</b><br>",
if (is.null(chromosome_name)) "Distance: " else "Position: ",
"{round(x / 1000000, digits = 2)} MBp<br>",
"Score: {round(score, digits = 2)}<br>",
"Rank: {rank}<br>",
"Percentile: {round(percentile * 100, digits = 2)}%"
),
hoverinfo = "text",
) |>
plotly::layout(
xaxis = list(title = if (is.null(chromosome_name)) {
"Distance (Bp)"
} else {
"Position (Bp)"
}),
yaxis = list(title = "Score")
)
data[, `:=`(gene_set = "All genes", color = base_color())]
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set_genes <- gene_sets[[gene_set_name]]
data[
gene %chin% gene_set_genes,
`:=`(
gene_set = gene_set_name,
color = gene_set_color(index)
)
]
index <- index + 1
}
# Use distances instead of positions in case all chromosomes are included.
if (is.null(chromosome_name)) {
data[, x := distance]
} else {
data[, x := start_position]
}
plotly::plot_ly() |>
plotly::add_markers(
data = data,
x = ~x,
y = ~score,
name = ~gene_set,
text = ~ glue::glue(
"<b>{name}</b><br>",
if (is.null(chromosome_name)) "Distance: " else "Position: ",
"{round(x / 1000000, digits = 2)} MBp<br>",
"Score: {round(score, digits = 2)}<br>",
"Rank: {rank}<br>",
"Percentile: {round(percentile * 100, digits = 2)}%"
),
hoverinfo = "text",
) |>
plotly::layout(
xaxis = list(title = if (is.null(chromosome_name)) {
"Distance (Bp)"
} else {
"Position (Bp)"
}),
yaxis = list(title = "Score")
)
}

View file

@ -21,55 +21,55 @@ preset <- function(reference_gene_ids,
methods = all_methods(),
species_ids = geposan::species$id,
gene_ids = geposan::genes$id) {
# Count included species per gene.
genes_n_species <- geposan::distances[
species %chin% species_ids,
.(n_species = .N),
by = "gene"
]
# Count included species per gene.
genes_n_species <- geposan::distances[
species %chin% species_ids,
.(n_species = .N),
by = "gene"
]
# Filter out genes with less than 25% existing orthologs.
gene_ids_filtered <- genes_n_species[
gene %chin% gene_ids &
n_species >= 0.25 * length(species_ids),
gene
]
# Filter out genes with less than 25% existing orthologs.
gene_ids_filtered <- genes_n_species[
gene %chin% gene_ids &
n_species >= 0.25 * length(species_ids),
gene
]
reference_gene_ids_excluded <- reference_gene_ids[
!reference_gene_ids %chin% gene_ids_filtered
]
reference_gene_ids_excluded <- reference_gene_ids[
!reference_gene_ids %chin% gene_ids_filtered
]
if (length(reference_gene_ids_excluded > 0)) {
warning(paste0(
"The following reference gene IDs are excluded from the preset ",
"because they don't have enough data: ",
paste(reference_gene_ids_excluded, collapse = ", ")
))
}
if (length(reference_gene_ids_excluded > 0)) {
warning(paste0(
"The following reference gene IDs are excluded from the preset ",
"because they don't have enough data: ",
paste(reference_gene_ids_excluded, collapse = ", ")
))
}
reference_gene_ids_included <- reference_gene_ids[
reference_gene_ids %chin% gene_ids_filtered
]
reference_gene_ids_included <- reference_gene_ids[
reference_gene_ids %chin% gene_ids_filtered
]
if (length(reference_gene_ids_included) < 1) {
stop(paste0(
"There has to be at least one reference gene for the preset to be ",
"valid. Please note that some methods may require more reference ",
"genes."
))
}
if (length(reference_gene_ids_included) < 1) {
stop(paste0(
"There has to be at least one reference gene for the preset to be ",
"valid. Please note that some methods may require more reference ",
"genes."
))
}
# The included data gets sorted to be able to produce predictable hashes
# for the object later.
structure(
list(
reference_gene_ids = sort(reference_gene_ids_included),
methods = methods,
species_ids = sort(species_ids),
gene_ids = sort(gene_ids_filtered)
),
class = "geposan_preset"
)
# The included data gets sorted to be able to produce predictable hashes
# for the object later.
structure(
list(
reference_gene_ids = sort(reference_gene_ids_included),
methods = methods,
species_ids = sort(species_ids),
gene_ids = sort(gene_ids_filtered)
),
class = "geposan_preset"
)
}
#' S3 method to print a preset object.
@ -81,20 +81,20 @@ preset <- function(reference_gene_ids,
#'
#' @export
print.geposan_preset <- function(x, ...) {
cat(sprintf(
paste0(
"geposan preset:",
"\n Reference genes: %i",
"\n Included methods: %s",
"\n Number of species: %i",
"\n Number of genes: %i",
"\n"
),
length(x$reference_gene_ids),
paste(sapply(x$methods, function(m) m$id), collapse = ", "),
length(x$species_ids),
length(x$gene_ids)
))
cat(sprintf(
paste0(
"geposan preset:",
"\n Reference genes: %i",
"\n Included methods: %s",
"\n Number of species: %i",
"\n Number of genes: %i",
"\n"
),
length(x$reference_gene_ids),
paste(sapply(x$methods, function(m) m$id), collapse = ", "),
length(x$species_ids),
length(x$gene_ids)
))
invisible(x)
invisible(x)
}

View file

@ -13,35 +13,35 @@
#'
#' @export
ranking <- function(analysis, weights) {
ranking <- if (inherits(analysis, "geposan_analysis")) {
copy(analysis$scores)
} else if (inherits(analysis, "geposan_ranking")) {
copy(analysis)
} else {
stop("Invalid analyis. Use geposan::analyze().")
}
ranking <- if (inherits(analysis, "geposan_analysis")) {
copy(analysis$scores)
} else if (inherits(analysis, "geposan_ranking")) {
copy(analysis)
} else {
stop("Invalid analyis. Use geposan::analyze().")
}
ranking[, score := 0.0]
ranking[, score := 0.0]
for (method in names(weights)) {
weighted <- weights[[method]] * ranking[, ..method]
ranking[, score := score + weighted]
}
for (method in names(weights)) {
weighted <- weights[[method]] * ranking[, ..method]
ranking[, score := score + weighted]
}
# Normalize scores to be between 0.0 and 1.0.
min_score <- ranking[, min(score)]
max_score <- ranking[, max(score)]
score_range <- max_score - min_score
ranking[, score := (score - min_score) / score_range]
# Normalize scores to be between 0.0 and 1.0.
min_score <- ranking[, min(score)]
max_score <- ranking[, max(score)]
score_range <- max_score - min_score
ranking[, score := (score - min_score) / score_range]
setorder(ranking, -score)
ranking[, rank := .I]
ranking[, percentile := 1 - rank / nrow(ranking)]
setorder(ranking, -score)
ranking[, rank := .I]
ranking[, percentile := 1 - rank / nrow(ranking)]
structure(
ranking,
class = c("geposan_ranking", class(ranking))
)
structure(
ranking,
class = c("geposan_ranking", class(ranking))
)
}
#' Find the best weights to rank the results.
@ -61,40 +61,40 @@ ranking <- function(analysis, weights) {
#' @export
optimal_weights <- function(analysis, methods, reference_gene_ids,
target = "mean") {
if (!inherits(analysis, c("geposan_analysis", "geposan_ranking"))) {
stop("Invalid analyis. Use geposan::analyze().")
if (!inherits(analysis, c("geposan_analysis", "geposan_ranking"))) {
stop("Invalid analyis. Use geposan::analyze().")
}
# Compute the target rank of the reference genes when applying the
# weights.
target_rank <- function(factors) {
data <- ranking(analysis, as.list(factors))
result <- data[
gene %chin% reference_gene_ids,
if (target == "min") {
min(rank)
} else if (target == "max") {
max(rank)
} else if (target == "mean") {
mean(rank)
} else {
stats::median(rank)
}
]
if (result > 0) {
result
} else {
Inf
}
}
initial_factors <- rep(1.0, length(methods))
names(initial_factors) <- methods
# Compute the target rank of the reference genes when applying the
# weights.
target_rank <- function(factors) {
data <- ranking(analysis, as.list(factors))
optimal_factors <- stats::optim(initial_factors, target_rank)$par
result <- data[
gene %chin% reference_gene_ids,
if (target == "min") {
min(rank)
} else if (target == "max") {
max(rank)
} else if (target == "mean") {
mean(rank)
} else {
stats::median(rank)
}
]
if (result > 0) {
result
} else {
Inf
}
}
initial_factors <- rep(1.0, length(methods))
names(initial_factors) <- methods
optimal_factors <- stats::optim(initial_factors, target_rank)$par
as.list(optimal_factors / max(abs(optimal_factors)))
as.list(optimal_factors / max(abs(optimal_factors)))
}

View file

@ -10,18 +10,18 @@
#'
#' @export
result <- function(method_id, scores, details = list()) {
stopifnot(is.data.frame(scores) &
c("gene", "score") %chin% colnames(scores))
stopifnot(is.list(details))
stopifnot(is.data.frame(scores) &
c("gene", "score") %chin% colnames(scores))
stopifnot(is.list(details))
structure(
list(
method_id = method_id,
scores = scores,
details = details
),
class = "geposan_result"
)
structure(
list(
method_id = method_id,
scores = scores,
details = details
),
class = "geposan_result"
)
}
#' Print a result object.
@ -33,18 +33,18 @@ result <- function(method_id, scores, details = list()) {
#'
#' @export
print.geposan_result <- function(x, ...) {
cat(sprintf(
paste0(
"geposan result:",
"\n Method: %s",
"\n Number of genes: %i",
"\n Available details: %s",
"\n"
),
x$method_id,
nrow(x$scores),
paste(names(x$details), collapse = ", ")
))
cat(sprintf(
paste0(
"geposan result:",
"\n Method: %s",
"\n Number of genes: %i",
"\n Available details: %s",
"\n"
),
x$method_id,
nrow(x$scores),
paste(names(x$details), collapse = ", ")
))
invisible(x)
invisible(x)
}

View file

@ -8,25 +8,25 @@
# @param objects A vector of objects that this expression depends on. The hash
# of those objects will be used for identifying the cache file.
cached <- function(name, objects, expr) {
if (!dir.exists("cache")) {
dir.create("cache")
}
if (!dir.exists("cache")) {
dir.create("cache")
}
id <- rlang::hash(objects)
cache_file <- sprintf("cache/%s_%s.rda", name, id)
id <- rlang::hash(objects)
cache_file <- sprintf("cache/%s_%s.rda", name, id)
if (file.exists(cache_file)) {
# If the cache file exists, we restore the data from it.
load(cache_file)
} else {
# If the cache file doesn't exist, we have to do the computation.
data <- expr
if (file.exists(cache_file)) {
# If the cache file exists, we restore the data from it.
load(cache_file)
} else {
# If the cache file doesn't exist, we have to do the computation.
data <- expr
# The results are cached for the next run.
save(data, file = cache_file, compress = "xz")
}
# The results are cached for the next run.
save(data, file = cache_file, compress = "xz")
}
data
data
}
# This is needed to make data.table's symbols available within the package.

View file

@ -27,68 +27,68 @@
#'
#' @export
validate <- function(ranking, reference_gene_ids, method_ids, progress = NULL) {
if (!inherits(ranking, "geposan_ranking")) {
stop("Ranking is invalid. Use geposan::ranking().")
}
if (!inherits(ranking, "geposan_ranking")) {
stop("Ranking is invalid. Use geposan::ranking().")
}
if (is.null(progress)) {
progress_bar <- progress::progress_bar$new()
progress_bar$update(0.0)
if (is.null(progress)) {
progress_bar <- progress::progress_bar$new()
progress_bar$update(0.0)
progress <- function(progress_value) {
if (!progress_bar$finished) {
progress_bar$update(progress_value)
if (progress_value >= 1.0) {
progress_bar$terminate()
}
}
progress <- function(progress_value) {
if (!progress_bar$finished) {
progress_bar$update(progress_value)
if (progress_value >= 1.0) {
progress_bar$terminate()
}
}
}
}
progress_state <- 0.0
progress_step <- 1.0 / length(reference_gene_ids)
progress_state <- 0.0
progress_step <- 1.0 / length(reference_gene_ids)
results <- ranking[gene %chin% reference_gene_ids, .(gene, percentile)]
results <- ranking[gene %chin% reference_gene_ids, .(gene, percentile)]
for (gene_id in reference_gene_ids) {
included_gene_ids <- reference_gene_ids[
reference_gene_ids != gene_id
]
for (gene_id in reference_gene_ids) {
included_gene_ids <- reference_gene_ids[
reference_gene_ids != gene_id
]
weights <- optimal_weights(
ranking,
method_ids,
included_gene_ids
)
ranking_validation <- ranking(ranking, weights)
results[
gene == gene_id,
percentile_validation := ranking_validation[
gene == gene_id,
percentile
]
]
if (!is.null(progress)) {
progress_state <- progress_state + progress_step
progress(progress_state)
}
}
results[, error := percentile - percentile_validation]
setorder(results, error)
structure(
list(
validation = results,
mean_percentile_original = results[, mean(percentile)],
mean_percentile_validation = results[, mean(percentile_validation)],
mean_error = results[, mean(error)]
),
class = "geposan_validation"
weights <- optimal_weights(
ranking,
method_ids,
included_gene_ids
)
ranking_validation <- ranking(ranking, weights)
results[
gene == gene_id,
percentile_validation := ranking_validation[
gene == gene_id,
percentile
]
]
if (!is.null(progress)) {
progress_state <- progress_state + progress_step
progress(progress_state)
}
}
results[, error := percentile - percentile_validation]
setorder(results, error)
structure(
list(
validation = results,
mean_percentile_original = results[, mean(percentile)],
mean_percentile_validation = results[, mean(percentile_validation)],
mean_error = results[, mean(error)]
),
class = "geposan_validation"
)
}
#' S3 method to print a validation object.
@ -100,18 +100,18 @@ validate <- function(ranking, reference_gene_ids, method_ids, progress = NULL) {
#'
#' @export
print.geposan_validation <- function(x, ...) {
cat(sprintf(
paste0(
"geposan validation:",
"\n Mean percentile original: %.1f%%",
"\n Mean percentile validation: %.1f%%",
"\n Mean error: %.1f percent points",
"\n"
),
x$mean_percentile_original * 100,
x$mean_percentile_validation * 100,
x$mean_error * 100
))
cat(sprintf(
paste0(
"geposan validation:",
"\n Mean percentile original: %.1f%%",
"\n Mean percentile validation: %.1f%%",
"\n Mean error: %.1f percent points",
"\n"
),
x$mean_percentile_original * 100,
x$mean_percentile_validation * 100,
x$mean_error * 100
))
invisible(x)
invisible(x)
}

View file

@ -5,23 +5,23 @@ ensembl_api_url <- "https://rest.ensembl.org"
#' Perform a request to the Ensembl REST API.
ensembl_request <- function(api_path) {
content(stop_for_status(GET(
paste0(ensembl_api_url, api_path),
content_type_json()
)))
content(stop_for_status(GET(
paste0(ensembl_api_url, api_path),
content_type_json()
)))
}
#' Get IDs of all available vertebrates.
get_species_ids <- function() {
species <- ensembl_request("/info/species")$species
sapply(species, function(species) species$name)
species <- ensembl_request("/info/species")$species
sapply(species, function(species) species$name)
}
#' Get all chromosomes names for a species.
get_species_chromosomes <- function(species_id) {
chromosomes <- unlist(ensembl_request(
paste0("/info/assembly/", species_id)
)$karyotype)
chromosomes <- unlist(ensembl_request(
paste0("/info/assembly/", species_id)
)$karyotype)
}
#' Get a vector of all available unqiue chromosome names.
@ -29,6 +29,6 @@ get_species_chromosomes <- function(species_id) {
#' There are multiple names for mitochondrial DNA which have to be removed
#' manually, unfortunately.
get_all_chromosomes <- function() {
chromosomes <- sapply(get_species_ids(), get_species_chromosomes)
unique(unlist(chromosomes))
chromosomes <- sapply(get_species_ids(), get_species_chromosomes)
unique(unlist(chromosomes))
}

View file

@ -12,8 +12,8 @@ ensembl_datasets <- data.table(biomaRt::listDatasets(ensembl))
# Filter out species ID and name from the result.
species <- ensembl_datasets[, .(
id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2],
name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2]
id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2],
name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2]
)]
# List of assemblies that the Ensembl Rest API advertises as chromosomes.
@ -23,232 +23,232 @@ species <- ensembl_datasets[, .(
#
# See get_all_chromosomes()
valid_chromosome_names <- c(
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
"X",
"Y",
"21",
"4A",
"1A",
"22",
"23",
"24",
"25LG1",
"25LG2",
"26",
"27",
"28",
"LGE22",
"Z",
"25",
"29",
"A1",
"A2",
"A3",
"B1",
"B2",
"B3",
"B4",
"C1",
"C2",
"D1",
"D2",
"D3",
"D4",
"E1",
"E2",
"E3",
"F1",
"F2",
"2A",
"2B",
"LG01",
"LG02",
"LG03",
"LG04",
"LG05",
"LG06",
"LG07",
"LG08",
"LG09",
"LG10",
"LG11",
"LG12",
"LG13",
"LG14",
"LG15",
"LG16",
"LG17",
"LG18",
"LG19",
"LG20",
"LG21",
"LG22",
"LG23",
"LG24",
"LG25",
"I",
"II",
"III",
"IV",
"V",
"VI",
"VII",
"VIII",
"IX",
"XI",
"XII",
"XIII",
"XIV",
"XV",
"XVI",
"XVII",
"XVIII",
"XIX",
"XX",
"XXI",
"XXII",
"XXIII",
"XXIV",
"W",
"30",
"31",
"32",
"33",
"34",
"35",
"36",
"37",
"38",
"39",
"40",
"2L",
"2R",
"3L",
"3R",
"LG1",
"LG2",
"LG3",
"LG4",
"LG5",
"LG6",
"LG7",
"LG8",
"LG9",
"LG26",
"LG27",
"LG28",
"LG29",
"LG30",
"1a",
"7b",
"22a",
"LGE22C19W28_E50C23",
"LGE64",
"7a",
"MIC_1",
"MIC_10",
"MIC_11",
"MIC_2",
"MIC_3",
"MIC_4",
"MIC_5",
"MIC_6",
"MIC_7",
"MIC_8",
"MIC_9",
"sgr01",
"sgr02",
"sgr03",
"sgr04",
"sgr05",
"sgr06",
"sgr07",
"sgr08",
"sgr09",
"sgr10",
"sgr11",
"sgr12",
"sgr13",
"sgr14",
"sgr15",
"sgr16",
"sgr17",
"sgr18",
"sgr19",
"X1",
"X2",
"X3",
"X4",
"X5",
"a",
"b",
"c",
"d",
"f",
"g",
"h",
"41",
"42",
"43",
"44",
"45",
"46",
"47",
"48",
"49",
"50",
"LG28B",
"LG30F",
"LG36F",
"LG37M",
"LG42F",
"LG44F",
"LG45M",
"LG48F",
"LG49B",
"LG34",
"LG35",
"LG7_11",
"groupI",
"groupII",
"groupIII",
"groupIV",
"groupV",
"groupVI",
"groupVII",
"groupVIII",
"groupIX",
"groupX",
"groupXI",
"groupXII",
"groupXIII",
"groupXIV",
"groupXV",
"groupXVI",
"groupXVII",
"groupXVIII",
"groupXIX",
"groupXX",
"groupXXI"
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
"X",
"Y",
"21",
"4A",
"1A",
"22",
"23",
"24",
"25LG1",
"25LG2",
"26",
"27",
"28",
"LGE22",
"Z",
"25",
"29",
"A1",
"A2",
"A3",
"B1",
"B2",
"B3",
"B4",
"C1",
"C2",
"D1",
"D2",
"D3",
"D4",
"E1",
"E2",
"E3",
"F1",
"F2",
"2A",
"2B",
"LG01",
"LG02",
"LG03",
"LG04",
"LG05",
"LG06",
"LG07",
"LG08",
"LG09",
"LG10",
"LG11",
"LG12",
"LG13",
"LG14",
"LG15",
"LG16",
"LG17",
"LG18",
"LG19",
"LG20",
"LG21",
"LG22",
"LG23",
"LG24",
"LG25",
"I",
"II",
"III",
"IV",
"V",
"VI",
"VII",
"VIII",
"IX",
"XI",
"XII",
"XIII",
"XIV",
"XV",
"XVI",
"XVII",
"XVIII",
"XIX",
"XX",
"XXI",
"XXII",
"XXIII",
"XXIV",
"W",
"30",
"31",
"32",
"33",
"34",
"35",
"36",
"37",
"38",
"39",
"40",
"2L",
"2R",
"3L",
"3R",
"LG1",
"LG2",
"LG3",
"LG4",
"LG5",
"LG6",
"LG7",
"LG8",
"LG9",
"LG26",
"LG27",
"LG28",
"LG29",
"LG30",
"1a",
"7b",
"22a",
"LGE22C19W28_E50C23",
"LGE64",
"7a",
"MIC_1",
"MIC_10",
"MIC_11",
"MIC_2",
"MIC_3",
"MIC_4",
"MIC_5",
"MIC_6",
"MIC_7",
"MIC_8",
"MIC_9",
"sgr01",
"sgr02",
"sgr03",
"sgr04",
"sgr05",
"sgr06",
"sgr07",
"sgr08",
"sgr09",
"sgr10",
"sgr11",
"sgr12",
"sgr13",
"sgr14",
"sgr15",
"sgr16",
"sgr17",
"sgr18",
"sgr19",
"X1",
"X2",
"X3",
"X4",
"X5",
"a",
"b",
"c",
"d",
"f",
"g",
"h",
"41",
"42",
"43",
"44",
"45",
"46",
"47",
"48",
"49",
"50",
"LG28B",
"LG30F",
"LG36F",
"LG37M",
"LG42F",
"LG44F",
"LG45M",
"LG48F",
"LG49B",
"LG34",
"LG35",
"LG7_11",
"groupI",
"groupII",
"groupIII",
"groupIV",
"groupV",
"groupVI",
"groupVII",
"groupVIII",
"groupIX",
"groupX",
"groupXI",
"groupXII",
"groupXIII",
"groupXIV",
"groupXV",
"groupXVI",
"groupXVII",
"groupXVIII",
"groupXIX",
"groupXX",
"groupXXI"
)
#' Get all chromosome names for an Ensembl dataset.
@ -256,8 +256,8 @@ valid_chromosome_names <- c(
#' The function tries to filter out valid chromosome names from the available
#' assemblies in the dataset.
get_chromosome_names <- function(dataset) {
chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name")
chromosome_names[chromosome_names %chin% valid_chromosome_names]
chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name")
chromosome_names[chromosome_names %chin% valid_chromosome_names]
}
# Retrieve information on human genes. This will only include genes on
@ -267,16 +267,16 @@ rlog::log_info("Retrieving information on human genes")
dataset <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl)
human_data <- data.table(biomaRt::getBM(
attributes = c(
"ensembl_gene_id",
"hgnc_symbol",
"chromosome_name",
"start_position",
"end_position"
),
filters = "chromosome_name",
values = get_chromosome_names(dataset),
mart = dataset
attributes = c(
"ensembl_gene_id",
"hgnc_symbol",
"chromosome_name",
"start_position",
"end_position"
),
filters = "chromosome_name",
values = get_chromosome_names(dataset),
mart = dataset
))
# Remove duplicated gene IDs (at the time of writing, there are a handful).
@ -284,9 +284,9 @@ human_data <- unique(human_data, by = "ensembl_gene_id")
# Only keep relevant information on genes.
genes <- human_data[, .(
id = ensembl_gene_id,
name = hgnc_symbol,
chromosome = chromosome_name
id = ensembl_gene_id,
name = hgnc_symbol,
chromosome = chromosome_name
)]
# Retrieve gene distance data across species.
@ -296,39 +296,39 @@ distances <- data.table()
#' Handle data for one species.
handle_species <- function(species_id, species_data) {
chromosomes <- species_data[,
.(chromosome_length = max(end_position)),
by = chromosome_name
]
chromosomes <- species_data[,
.(chromosome_length = max(end_position)),
by = chromosome_name
]
# Store the number of chromosomes in the species table.
species[id == species_id, n_chromosomes := nrow(chromosomes)]
# Store the number of chromosomes in the species table.
species[id == species_id, n_chromosomes := nrow(chromosomes)]
# Store the median chromosome length in the species table.
species[
id == species_id,
median_chromosome_length := chromosomes[, median(chromosome_length)]
]
# Store the median chromosome length in the species table.
species[
id == species_id,
median_chromosome_length := chromosomes[, median(chromosome_length)]
]
# Precompute the genes' distance to the nearest telomere.
species_distances <- species_data[
chromosomes,
.(
species = species_id,
gene = ensembl_gene_id,
chromosome_name = chromosome_name,
start_position = start_position,
end_position = end_position,
distance = pmin(
start_position,
chromosome_length - end_position
)
),
on = "chromosome_name"
]
# Precompute the genes' distance to the nearest telomere.
species_distances <- species_data[
chromosomes,
.(
species = species_id,
gene = ensembl_gene_id,
chromosome_name = chromosome_name,
start_position = start_position,
end_position = end_position,
distance = pmin(
start_position,
chromosome_length - end_position
)
),
on = "chromosome_name"
]
# Add species distances to the distances table.
distances <<- rbindlist(list(distances, species_distances))
# Add species distances to the distances table.
distances <<- rbindlist(list(distances, species_distances))
}
# Handle the human first, as we already retrieved the data and don't need to
@ -337,67 +337,67 @@ handle_species("hsapiens", human_data)
# Iterate through all other species and retrieve their distance data.
for (species_id in species[id != "hsapiens", id]) {
rlog::log_info(sprintf("Loading species \"%s\"", species_id))
rlog::log_info(sprintf("Loading species \"%s\"", species_id))
dataset <- biomaRt::useDataset(
sprintf("%s_gene_ensembl", species_id),
mart = ensembl
)
dataset <- biomaRt::useDataset(
sprintf("%s_gene_ensembl", species_id),
mart = ensembl
)
# Besides the attributes that are always present, we need to check for
# human orthologs. Some species don't have that information and will be
# skipped.
if (!"hsapiens_homolog_ensembl_gene" %chin%
biomaRt::listAttributes(dataset, what = "name")) {
rlog::log_info("No data on human orthologs")
species <- species[id != species_id]
# Besides the attributes that are always present, we need to check for
# human orthologs. Some species don't have that information and will be
# skipped.
if (!"hsapiens_homolog_ensembl_gene" %chin%
biomaRt::listAttributes(dataset, what = "name")) {
rlog::log_info("No data on human orthologs")
species <- species[id != species_id]
next
}
next
}
chromosome_names <- get_chromosome_names(dataset)
chromosome_names <- get_chromosome_names(dataset)
# Skip the species, if there are no assembled chromosomes.
if (length(chromosome_names) <= 0) {
rlog::log_info("No matching chromosome assemblies")
species <- species[id != species_id]
# Skip the species, if there are no assembled chromosomes.
if (length(chromosome_names) <= 0) {
rlog::log_info("No matching chromosome assemblies")
species <- species[id != species_id]
next
}
next
}
# Retrieve information on all genes of the current species, that have
# human orthologs. This is called "homolog" in the Ensembl schema.
species_distances <- data.table(biomaRt::getBM(
attributes = c(
"hsapiens_homolog_ensembl_gene",
"chromosome_name",
"start_position",
"end_position"
),
filters = c("with_hsapiens_homolog", "chromosome_name"),
values = list(TRUE, chromosome_names),
mart = dataset
))
# Retrieve information on all genes of the current species, that have
# human orthologs. This is called "homolog" in the Ensembl schema.
species_distances <- data.table(biomaRt::getBM(
attributes = c(
"hsapiens_homolog_ensembl_gene",
"chromosome_name",
"start_position",
"end_position"
),
filters = c("with_hsapiens_homolog", "chromosome_name"),
values = list(TRUE, chromosome_names),
mart = dataset
))
# Only include human genes that we have information on.
species_distances <- species_distances[
hsapiens_homolog_ensembl_gene %chin% genes$id
]
# Only include human genes that we have information on.
species_distances <- species_distances[
hsapiens_homolog_ensembl_gene %chin% genes$id
]
# Only include one ortholog per human gene.
species_distances <- unique(
species_distances,
by = "hsapiens_homolog_ensembl_gene"
)
# Only include one ortholog per human gene.
species_distances <- unique(
species_distances,
by = "hsapiens_homolog_ensembl_gene"
)
# Rename gene ID column to match the human data.
setnames(
species_distances,
"hsapiens_homolog_ensembl_gene",
"ensembl_gene_id"
)
# Rename gene ID column to match the human data.
setnames(
species_distances,
"hsapiens_homolog_ensembl_gene",
"ensembl_gene_id"
)
handle_species(species_id, species_distances)
handle_species(species_id, species_distances)
}
# Save data in the appropriate place.