Reindent code to use just two spaces

This commit is contained in:
Elias Projahn 2022-05-26 12:42:19 +02:00
parent a1e6147466
commit c04b6337e9
17 changed files with 1583 additions and 1582 deletions

View file

@ -18,54 +18,54 @@
#' #'
#' @export #' @export
analyze <- function(preset, progress = NULL, include_results = TRUE) { analyze <- function(preset, progress = NULL, include_results = TRUE) {
if (!inherits(preset, "geposan_preset")) { if (!inherits(preset, "geposan_preset")) {
stop("Preset is invalid. Use geposan::preset() to create one.") stop("Preset is invalid. Use geposan::preset() to create one.")
} }
if (is.null(progress)) { if (is.null(progress)) {
progress_bar <- progress::progress_bar$new() progress_bar <- progress::progress_bar$new()
progress_bar$update(0.0) progress_bar$update(0.0)
progress <- function(progress_value) { progress <- function(progress_value) {
if (!progress_bar$finished) { if (!progress_bar$finished) {
progress_bar$update(progress_value) progress_bar$update(progress_value)
if (progress_value >= 1.0) { if (progress_value >= 1.0) {
progress_bar$terminate() progress_bar$terminate()
}
}
} }
}
} }
}
progress_buffer <- 0.0 progress_buffer <- 0.0
method_count <- length(preset$methods) method_count <- length(preset$methods)
method_progress <- function(progress_value) { method_progress <- function(progress_value) {
progress(progress_buffer + progress_value / method_count) progress(progress_buffer + progress_value / method_count)
} }
scores <- data.table(gene = preset$gene_id) scores <- data.table(gene = preset$gene_id)
results <- list() results <- list()
for (method in preset$methods) { for (method in preset$methods) {
method_results <- method$func(preset, method_progress) method_results <- method$func(preset, method_progress)
scores <- merge(scores, method_results$scores, by = "gene") scores <- merge(scores, method_results$scores, by = "gene")
setnames(scores, "score", method$id) setnames(scores, "score", method$id)
results <- c(results, list(method_results)) results <- c(results, list(method_results))
progress_buffer <- progress_buffer + 1 / method_count progress_buffer <- progress_buffer + 1 / method_count
progress(progress_buffer) progress(progress_buffer)
} }
structure( structure(
list( list(
preset = preset, preset = preset,
scores = scores, scores = scores,
results = if (include_results) results else NULL results = if (include_results) results else NULL
), ),
class = "geposan_analysis" class = "geposan_analysis"
) )
} }
#' Print an analysis object. #' Print an analysis object.
@ -77,14 +77,14 @@ analyze <- function(preset, progress = NULL, include_results = TRUE) {
#' #'
#' @export #' @export
print.geposan_analysis <- function(x, ...) { print.geposan_analysis <- function(x, ...) {
cat("geposan analysis:\n\n") cat("geposan analysis:\n\n")
print(x$preset) print(x$preset)
cat("\n")
for (result in x$results) {
print(result)
cat("\n") cat("\n")
}
for (result in x$results) { invisible(x)
print(result)
cat("\n")
}
invisible(x)
} }

View file

@ -18,38 +18,38 @@
#' #'
#' @export #' @export
compare <- function(ranking, comparison_gene_ids) { compare <- function(ranking, comparison_gene_ids) {
if (!inherits(ranking, "geposan_ranking")) { if (!inherits(ranking, "geposan_ranking")) {
stop("Invalid ranking. Use geposan::ranking().") stop("Invalid ranking. Use geposan::ranking().")
} }
comparison_ranking <- ranking[gene %chin% comparison_gene_ids] comparison_ranking <- ranking[gene %chin% comparison_gene_ids]
quantiles <- data.table( quantiles <- data.table(
quantile = c("0%", "25%", "50%", "75%", "100%"), quantile = c("0%", "25%", "50%", "75%", "100%"),
score = stats::quantile(comparison_ranking[, score]), score = stats::quantile(comparison_ranking[, score]),
rank = stats::quantile( rank = stats::quantile(
comparison_ranking[, rank], comparison_ranking[, rank],
probs = seq(1, 0, -0.25) probs = seq(1, 0, -0.25)
), ),
percentile = stats::quantile(comparison_ranking[, percentile]) percentile = stats::quantile(comparison_ranking[, percentile])
) )
p_value <- stats::wilcox.test( p_value <- stats::wilcox.test(
x = comparison_ranking[, score], x = comparison_ranking[, score],
y = ranking[!gene %chin% comparison_gene_ids, score], y = ranking[!gene %chin% comparison_gene_ids, score],
alternative = "greater" alternative = "greater"
)$p.value )$p.value
structure( structure(
list( list(
quantiles = quantiles, quantiles = quantiles,
mean_score = comparison_ranking[, mean(score)], mean_score = comparison_ranking[, mean(score)],
mean_rank = comparison_ranking[, mean(rank)], mean_rank = comparison_ranking[, mean(rank)],
mean_percentile = comparison_ranking[, mean(percentile)], mean_percentile = comparison_ranking[, mean(percentile)],
p_value = p_value p_value = p_value
), ),
class = "geposan_comparison" class = "geposan_comparison"
) )
} }
#' S3 method to print a comparison object. #' S3 method to print a comparison object.
@ -61,32 +61,32 @@ compare <- function(ranking, comparison_gene_ids) {
#' #'
#' @export #' @export
print.geposan_comparison <- function(x, ...) { print.geposan_comparison <- function(x, ...) {
cat("geposan comparison:\n\n") cat("geposan comparison:\n\n")
quantiles_formatted <- x$quantiles[, .( quantiles_formatted <- x$quantiles[, .(
"Quantile" = quantile, "Quantile" = quantile,
"Score" = round(score, 3), "Score" = round(score, 3),
"Rank" = rank, "Rank" = rank,
"Percentile" = paste0( "Percentile" = paste0(
format(round(percentile * 100, 1), nsmall = 1), format(round(percentile * 100, 1), nsmall = 1),
"%" "%"
) )
)] )]
print(quantiles_formatted, row.names = FALSE) print(quantiles_formatted, row.names = FALSE)
cat(sprintf( cat(sprintf(
paste0( paste0(
"\n Mean score: %.3f", "\n Mean score: %.3f",
"\n Mean rank: %.1f", "\n Mean rank: %.1f",
"\n Mean percentile: %.1f%%", "\n Mean percentile: %.1f%%",
"\n p-value for better scores: %.4f\n" "\n p-value for better scores: %.4f\n"
), ),
x$mean_score, x$mean_score,
x$mean_rank, x$mean_rank,
x$mean_percentile * 100, x$mean_percentile * 100,
x$p_value x$p_value
)) ))
invisible(x) invisible(x)
} }

View file

@ -12,34 +12,34 @@
#' #'
#' @export #' @export
method <- function(id, name, description, func) { method <- function(id, name, description, func) {
stopifnot(is.character(id) & length(id) == 1) stopifnot(is.character(id) & length(id) == 1)
stopifnot(is.character(name) & length(name) == 1) stopifnot(is.character(name) & length(name) == 1)
stopifnot(is.character(description) & length(description) == 1) stopifnot(is.character(description) & length(description) == 1)
stopifnot(is.function(func)) stopifnot(is.function(func))
structure( structure(
list( list(
id = id, id = id,
name = name, name = name,
description = description, description = description,
func = func func = func
), ),
class = "geposan_method" class = "geposan_method"
) )
} }
#' Get a list of all available methods. #' Get a list of all available methods.
#' #'
#' @export #' @export
all_methods <- function() { all_methods <- function() {
list( list(
clustering(), clustering(),
correlation(), correlation(),
neural(), neural(),
adjacency(), adjacency(),
species_adjacency(), species_adjacency(),
proximity() proximity()
) )
} }
#' Print a method object. #' Print a method object.
@ -51,18 +51,18 @@ all_methods <- function() {
#' #'
#' @export #' @export
print.geposan_method <- function(x, ...) { print.geposan_method <- function(x, ...) {
cat(sprintf( cat(sprintf(
paste0( paste0(
"geposan method:", "geposan method:",
"\n Method ID: %s", "\n Method ID: %s",
"\n Name: %s", "\n Name: %s",
"\n Description: %s", "\n Description: %s",
"\n" "\n"
), ),
x$id, x$id,
x$name, x$name,
x$description x$description
)) ))
invisible(x) invisible(x)
} }

View file

@ -12,14 +12,14 @@
#' #'
#' @export #' @export
densest <- function(data) { densest <- function(data) {
as.numeric(if (length(data) <= 0) { as.numeric(if (length(data) <= 0) {
NULL NULL
} else if (length(data) == 1) { } else if (length(data) == 1) {
data data
} else { } else {
density <- stats::density(data) density <- stats::density(data)
mean(density$x[density$y == max(density$y)]) mean(density$x[density$y == max(density$y)])
}) })
} }
#' Score genes based on their proximity to the reference genes. #' Score genes based on their proximity to the reference genes.
@ -40,80 +40,80 @@ densest <- function(data) {
#' #'
#' @export #' @export
adjacency <- function(distance_estimate = densest, summarize = stats::median) { adjacency <- function(distance_estimate = densest, summarize = stats::median) {
method( method(
id = "adjacency", id = "adjacency",
name = "Adjacency", name = "Adjacency",
description = "Adjacency to reference genes", description = "Adjacency to reference genes",
function(preset, progress) { function(preset, progress) {
species_ids <- preset$species_ids species_ids <- preset$species_ids
gene_ids <- preset$gene_ids gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids reference_gene_ids <- preset$reference_gene_ids
cached( cached(
"adjacency", "adjacency",
c( c(
species_ids, species_ids,
gene_ids, gene_ids,
reference_gene_ids, reference_gene_ids,
distance_estimate, distance_estimate,
summarize summarize
), ),
{ # nolint { # nolint
# Filter distances by species and gene and summarize each # Filter distances by species and gene and summarize each
# gene's distance values using the estimation function. # gene's distance values using the estimation function.
data <- geposan::distances[ data <- geposan::distances[
species %chin% species_ids & gene %chin% gene_ids, species %chin% species_ids & gene %chin% gene_ids,
.(distance = as.numeric(distance_estimate(distance))), .(distance = as.numeric(distance_estimate(distance))),
by = gene by = gene
] ]
# Compute the absolute value of the difference between the # Compute the absolute value of the difference between the
# estimated distances of each gene to the reference genes. # estimated distances of each gene to the reference genes.
compute_difference <- function(distance_value, compute_difference <- function(distance_value,
comparison_ids) { comparison_ids) {
differences <- data[ differences <- data[
gene %chin% comparison_ids, gene %chin% comparison_ids,
.(difference = abs(distance_value - distance)) .(difference = abs(distance_value - distance))
] ]
summarize(differences$difference) summarize(differences$difference)
} }
# Compute the differences to the reference genes. # Compute the differences to the reference genes.
data[ data[
!gene %chin% reference_gene_ids, !gene %chin% reference_gene_ids,
difference := compute_difference( difference := compute_difference(
distance, distance,
reference_gene_ids reference_gene_ids
), ),
by = gene by = gene
] ]
progress(0.5) progress(0.5)
# Exclude the reference gene itself when computing its # Exclude the reference gene itself when computing its
# difference. # difference.
data[ data[
gene %chin% reference_gene_ids, gene %chin% reference_gene_ids,
difference := compute_difference( difference := compute_difference(
distance, distance,
reference_gene_ids[reference_gene_ids != gene] reference_gene_ids[reference_gene_ids != gene]
), ),
by = gene by = gene
] ]
# Compute the final score by normalizing the difference. # Compute the final score by normalizing the difference.
data[, score := 1 - difference / max(difference)] data[, score := 1 - difference / max(difference)]
progress(1.0) progress(1.0)
result( result(
method = "adjacency", method = "adjacency",
scores = data[, .(gene, score)], scores = data[, .(gene, score)],
details = list(data = data) details = list(data = data)
) )
}
)
} }
) )
}
)
} }

View file

@ -13,33 +13,33 @@
#' default), the first cluster will weigh 1.0, the second 0.7, the third 0.49 #' default), the first cluster will weigh 1.0, the second 0.7, the third 0.49
#' etc. #' etc.
clusteriness <- function(data, span = 100000, weight = 0.7) { clusteriness <- function(data, span = 100000, weight = 0.7) {
n <- length(data) n <- length(data)
# Return a score of 0.0 if there is just one or no value at all. # Return a score of 0.0 if there is just one or no value at all.
if (n < 2) { if (n < 2) {
return(0.0) return(0.0)
}
# Cluster the data and compute the cluster sizes.
tree <- stats::hclust(stats::dist(data))
clusters <- stats::cutree(tree, h = span)
cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
# Compute the "clusteriness" score.
score <- 0.0
for (i in seq_along(cluster_sizes)) {
cluster_size <- cluster_sizes[i]
if (cluster_size >= 2) {
cluster_score <- cluster_size / n
score <- score + weight^(i - 1) * cluster_score
} }
}
# Cluster the data and compute the cluster sizes. score
tree <- stats::hclust(stats::dist(data))
clusters <- stats::cutree(tree, h = span)
cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
# Compute the "clusteriness" score.
score <- 0.0
for (i in seq_along(cluster_sizes)) {
cluster_size <- cluster_sizes[i]
if (cluster_size >= 2) {
cluster_score <- cluster_size / n
score <- score + weight^(i - 1) * cluster_score
}
}
score
} }
#' Process genes clustering their distance to telomeres. #' Process genes clustering their distance to telomeres.
@ -53,41 +53,41 @@ clusteriness <- function(data, span = 100000, weight = 0.7) {
#' #'
#' @export #' @export
clustering <- function() { clustering <- function() {
method( method(
id = "clustering", id = "clustering",
name = "Clustering", name = "Clustering",
description = "Clustering of genes", description = "Clustering of genes",
function(preset, progress) { function(preset, progress) {
species_ids <- preset$species_ids species_ids <- preset$species_ids
gene_ids <- preset$gene_ids gene_ids <- preset$gene_ids
cached("clustering", c(species_ids, gene_ids), { cached("clustering", c(species_ids, gene_ids), {
scores <- data.table(gene = gene_ids) scores <- data.table(gene = gene_ids)
# Prefilter the input data by species. # Prefilter the input data by species.
distances <- geposan::distances[species %chin% species_ids] distances <- geposan::distances[species %chin% species_ids]
genes_done <- 0 genes_done <- 0
genes_total <- length(gene_ids) genes_total <- length(gene_ids)
# Perform the cluster analysis for one gene. # Perform the cluster analysis for one gene.
compute <- function(gene_id) { compute <- function(gene_id) {
data <- distances[gene == gene_id, distance] data <- distances[gene == gene_id, distance]
score <- clusteriness(data) score <- clusteriness(data)
genes_done <<- genes_done + 1 genes_done <<- genes_done + 1
progress(genes_done / genes_total) progress(genes_done / genes_total)
score score
}
scores[, score := compute(gene), by = gene]
result(
method = "clustering",
scores = scores
)
})
} }
)
scores[, score := compute(gene), by = gene]
result(
method = "clustering",
scores = scores
)
})
}
)
} }

View file

@ -8,96 +8,97 @@
#' #'
#' @export #' @export
correlation <- function(summarize = stats::median) { correlation <- function(summarize = stats::median) {
method( method(
id = "correlation", id = "correlation",
name = "Correlation", name = "Correlation",
description = "Correlation with reference genes", description = "Correlation with reference genes",
function(preset, progress) { function(preset, progress) {
species_ids <- preset$species_ids species_ids <- preset$species_ids
gene_ids <- preset$gene_ids gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids reference_gene_ids <- preset$reference_gene_ids
cached( cached(
"correlation", "correlation",
c(species_ids, gene_ids, reference_gene_ids, summarize), c(species_ids, gene_ids, reference_gene_ids, summarize),
{ # nolint { # nolint
# Prefilter distances by species. # Prefilter distances by species.
distances <- geposan::distances[species %chin% species_ids] distances <- geposan::distances[species %chin% species_ids]
# Tranform data to get species as rows and genes as columns. # Tranform data to get species as rows and genes as columns.
# We construct columns per species, because it requires # We construct columns per species, because it requires
# fewer iterations, and transpose the table afterwards. # fewer iterations, and transpose the table afterwards.
data <- data.table(gene = gene_ids) data <- data.table(gene = gene_ids)
# Make a column containing distance data for each species. # Make a column containing distance data for each species.
for (species_id in species_ids) { for (species_id in species_ids) {
species_data <- distances[ species_data <- distances[
species == species_id, species == species_id,
.(gene, distance) .(gene, distance)
] ]
data <- merge(data, species_data, all.x = TRUE) data <- merge(data, species_data, all.x = TRUE)
setnames(data, "distance", species_id) setnames(data, "distance", species_id)
} }
# Transpose to the desired format. # Transpose to the desired format.
data <- transpose(data, make.names = "gene") data <- transpose(data, make.names = "gene")
progress(0.33) progress(0.33)
# Take the reference data. # Take the reference data.
reference_data <- data[, ..reference_gene_ids] reference_data <- data[, ..reference_gene_ids]
# Perform the correlation between all possible pairs. # Perform the correlation between all possible pairs.
results <- stats::cor( results <- stats::cor(
data[, ..gene_ids], data[, ..gene_ids],
reference_data, reference_data,
use = "pairwise.complete.obs", use = "pairwise.complete.obs",
method = "spearman" method = "spearman"
) )
results <- data.table(results, keep.rownames = TRUE) results <- data.table(results, keep.rownames = TRUE)
setnames(results, "rn", "gene") setnames(results, "rn", "gene")
# Remove correlations between the reference genes # Remove correlations between the reference genes
# themselves. # themselves.
for (reference_gene_id in reference_gene_ids) { for (reference_gene_id in reference_gene_ids) {
column <- quote(reference_gene_id) column <- quote(reference_gene_id)
results[gene == reference_gene_id, eval(column) := NA] results[gene == reference_gene_id, eval(column) := NA]
} }
progress(0.66) progress(0.66)
# Combine the correlation coefficients. # Combine the correlation coefficients.
results[, results[,
max_correlation := as.double(summarize(stats::na.omit( max_correlation := as.double(summarize(stats::na.omit(
# Convert the data.table subset into a # Convert the data.table subset into a
# vector to get the correct na.omit # vector to get the correct na.omit
# behavior. # behavior.
as.matrix(.SD)[1, ] as.matrix(.SD)[1, ]
))), ))),
.SDcols = reference_gene_ids, .SDcols = reference_gene_ids,
by = gene by = gene
] ]
# Normalize scores. # Normalize scores.
results[, results[
score := (max_correlation - min(max_correlation)) / ,
(max(max_correlation) - min(max_correlation)) score := (max_correlation - min(max_correlation)) /
] (max(max_correlation) - min(max_correlation))
]
# Normalize scores. # Normalize scores.
results[, .(gene, score)] results[, .(gene, score)]
result( result(
method = "correlation", method = "correlation",
scores = results[, .(gene, score)], scores = results[, .(gene, score)],
details = list(all_correlations = results) details = list(all_correlations = results)
) )
}
)
} }
) )
}
)
} }

View file

@ -12,244 +12,244 @@
#' #'
#' @export #' @export
neural <- function(seed = 180199, n_models = 5) { neural <- function(seed = 180199, n_models = 5) {
method( method(
id = "neural", id = "neural",
name = "Neural", name = "Neural",
description = "Assessment by neural network", description = "Assessment by neural network",
function(preset, progress) { function(preset, progress) {
species_ids <- preset$species_ids species_ids <- preset$species_ids
gene_ids <- preset$gene_ids gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids reference_gene_ids <- preset$reference_gene_ids
cached( cached(
"neural", "neural",
c(species_ids, gene_ids, reference_gene_ids, seed, n_models), c(species_ids, gene_ids, reference_gene_ids, seed, n_models),
{ # nolint { # nolint
reference_count <- length(reference_gene_ids) reference_count <- length(reference_gene_ids)
stopifnot(n_models %in% 2:reference_count) stopifnot(n_models %in% 2:reference_count)
# Make results reproducible. # Make results reproducible.
tensorflow::set_random_seed(seed) tensorflow::set_random_seed(seed)
# Step 1: Prepare input data. # Step 1: Prepare input data.
# --------------------------- # ---------------------------
# Prefilter distances by species. # Prefilter distances by species.
distances <- geposan::distances[species %chin% species_ids] distances <- geposan::distances[species %chin% species_ids]
# Input data for the network. This contains the gene ID as # Input data for the network. This contains the gene ID as
# an identifier as well as the per-species gene distances as # an identifier as well as the per-species gene distances as
# input variables. # input variables.
data <- data.table(gene = gene_ids) data <- data.table(gene = gene_ids)
# Buffer to keep track of the names of the input variables. # Buffer to keep track of the names of the input variables.
input_vars <- NULL input_vars <- NULL
# Make a columns containing positions and distances for each # Make a columns containing positions and distances for each
# species. # species.
for (species_id in species_ids) { for (species_id in species_ids) {
species_data <- distances[ species_data <- distances[
species == species_id, species == species_id,
.(gene, distance) .(gene, distance)
] ]
# Only include species with at least 25% known values. # Only include species with at least 25% known values.
# As positions and distances always coexist, we don't # As positions and distances always coexist, we don't
# loose any data here. # loose any data here.
species_data <- stats::na.omit(species_data) species_data <- stats::na.omit(species_data)
if (nrow(species_data) >= 0.25 * length(gene_ids)) { if (nrow(species_data) >= 0.25 * length(gene_ids)) {
data <- merge(data, species_data, all.x = TRUE) data <- merge(data, species_data, all.x = TRUE)
# Replace missing data with mean values. The neural # Replace missing data with mean values. The neural
# network can't handle NAs in a meaningful way. # network can't handle NAs in a meaningful way.
# Choosing extreme values here would result in # Choosing extreme values here would result in
# heavily biased results. Therefore, the mean value # heavily biased results. Therefore, the mean value
# is chosen as a compromise. However, this will of # is chosen as a compromise. However, this will of
# course lessen the significance of the results. # course lessen the significance of the results.
mean_distance <- round( mean_distance <- round(
species_data[, mean(distance)] species_data[, mean(distance)]
) )
data[is.na(distance), distance := mean_distance] data[is.na(distance), distance := mean_distance]
# Name the new column after the species. # Name the new column after the species.
setnames(data, "distance", species_id) setnames(data, "distance", species_id)
# Add the input variable to the buffer. # Add the input variable to the buffer.
input_vars <- c(input_vars, species_id) input_vars <- c(input_vars, species_id)
} }
} }
progress(0.1) progress(0.1)
# Step 2: Prepare training data. # Step 2: Prepare training data.
# ------------------------------ # ------------------------------
# Take out the reference data. # Take out the reference data.
reference_data <- data[gene %chin% reference_gene_ids] reference_data <- data[gene %chin% reference_gene_ids]
reference_data[, score := 1.0] reference_data[, score := 1.0]
# Take out random samples from the remaining genes. This is # Take out random samples from the remaining genes. This is
# another compromise with a negative impact on # another compromise with a negative impact on
# significance. We assume that a random gene is not likely # significance. We assume that a random gene is not likely
# to match the reference genes. # to match the reference genes.
without_reference_data <- data[ without_reference_data <- data[
!gene %chin% reference_gene_ids !gene %chin% reference_gene_ids
] ]
control_data <- without_reference_data[ control_data <- without_reference_data[
sample( sample(
nrow(without_reference_data), nrow(without_reference_data),
reference_count reference_count
)
]
control_data[, score := 0.0]
# Split the training data into random sets to have
# validation data for each model.
# Scramble the source tables.
reference_data <- reference_data[sample(reference_count)]
control_data <- control_data[sample(reference_count)]
networks <- list()
indices <- seq_len(reference_count)
indices_split <- split(indices, indices %% n_models)
for (i in seq_len(n_models)) {
training_data <- rbindlist(list(
reference_data[!indices_split[[i]]],
control_data[!indices_split[[i]]]
))
validation_data <- rbindlist(list(
reference_data[indices_split[[i]]],
control_data[indices_split[[i]]]
))
networks[[i]] <- list(
training_data = training_data,
validation_data = validation_data
)
}
# Step 3: Create, train and apply neural network.
# -----------------------------------------------
# Layers for the neural network.
input_layer <- length(input_vars)
layer1 <- input_layer
layer2 <- 0.5 * input_layer
layer3 <- 0.5 * layer2
# Convert data to matrix and normalize it.
to_matrix <- function(data) {
data_matrix <- as.matrix(data[, ..input_vars])
colnames(data_matrix) <- NULL
keras::normalize(data_matrix)
}
data_matrix <- to_matrix(data)
output_vars <- NULL
for (i in seq_along(networks)) {
# Create a new model for each training session, because
# the model would keep its state across training
# sessions otherwise.
model <- keras::keras_model_sequential() |>
keras::layer_dense(
units = layer1,
activation = "relu",
input_shape = input_layer,
) |>
keras::layer_dense(
units = layer2,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = layer3,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = 1,
activation = "sigmoid"
) |>
keras::compile(
loss = keras::loss_mean_absolute_error(),
optimizer = keras::optimizer_adam()
)
# Train the model.
network <- networks[[i]]
training_data <- network$training_data
training_matrix <- to_matrix(training_data)
validation_data <- network$validation_data
validation_matrix <- to_matrix(validation_data)
fit <- keras::fit(
model,
x = training_matrix,
y = training_data$score,
validation_data = list(
x_val = validation_matrix,
y_val = validation_data$score
),
epochs = 500,
verbose = FALSE
)
# Apply the model.
data[, new_score := stats::predict(model, data_matrix)]
# Remove the values of the training data itself.
data[gene %chin% training_data$gene, new_score := NA]
output_var <- sprintf("score%i", i)
setnames(data, "new_score", output_var)
output_vars <- c(output_vars, output_var)
# Store the details.
networks[[i]]$model <- keras::serialize_model(model)
networks[[i]]$fit <- fit
progress(0.1 + i * (0.9 / n_models))
}
# Compute the final score as the mean score.
data[,
score := mean(as.numeric(.SD), na.rm = TRUE),
.SDcols = output_vars,
by = gene
]
progress(1.0)
result(
method = "neural",
scores = data[, .(gene, score)],
details = list(
seed = seed,
n_models = n_models,
all_results = data[, !..input_vars],
networks = networks
)
)
}
) )
]
control_data[, score := 0.0]
# Split the training data into random sets to have
# validation data for each model.
# Scramble the source tables.
reference_data <- reference_data[sample(reference_count)]
control_data <- control_data[sample(reference_count)]
networks <- list()
indices <- seq_len(reference_count)
indices_split <- split(indices, indices %% n_models)
for (i in seq_len(n_models)) {
training_data <- rbindlist(list(
reference_data[!indices_split[[i]]],
control_data[!indices_split[[i]]]
))
validation_data <- rbindlist(list(
reference_data[indices_split[[i]]],
control_data[indices_split[[i]]]
))
networks[[i]] <- list(
training_data = training_data,
validation_data = validation_data
)
}
# Step 3: Create, train and apply neural network.
# -----------------------------------------------
# Layers for the neural network.
input_layer <- length(input_vars)
layer1 <- input_layer
layer2 <- 0.5 * input_layer
layer3 <- 0.5 * layer2
# Convert data to matrix and normalize it.
to_matrix <- function(data) {
data_matrix <- as.matrix(data[, ..input_vars])
colnames(data_matrix) <- NULL
keras::normalize(data_matrix)
}
data_matrix <- to_matrix(data)
output_vars <- NULL
for (i in seq_along(networks)) {
# Create a new model for each training session, because
# the model would keep its state across training
# sessions otherwise.
model <- keras::keras_model_sequential() |>
keras::layer_dense(
units = layer1,
activation = "relu",
input_shape = input_layer,
) |>
keras::layer_dense(
units = layer2,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = layer3,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = 1,
activation = "sigmoid"
) |>
keras::compile(
loss = keras::loss_mean_absolute_error(),
optimizer = keras::optimizer_adam()
)
# Train the model.
network <- networks[[i]]
training_data <- network$training_data
training_matrix <- to_matrix(training_data)
validation_data <- network$validation_data
validation_matrix <- to_matrix(validation_data)
fit <- keras::fit(
model,
x = training_matrix,
y = training_data$score,
validation_data = list(
x_val = validation_matrix,
y_val = validation_data$score
),
epochs = 500,
verbose = FALSE
)
# Apply the model.
data[, new_score := stats::predict(model, data_matrix)]
# Remove the values of the training data itself.
data[gene %chin% training_data$gene, new_score := NA]
output_var <- sprintf("score%i", i)
setnames(data, "new_score", output_var)
output_vars <- c(output_vars, output_var)
# Store the details.
networks[[i]]$model <- keras::serialize_model(model)
networks[[i]]$fit <- fit
progress(0.1 + i * (0.9 / n_models))
}
# Compute the final score as the mean score.
data[,
score := mean(as.numeric(.SD), na.rm = TRUE),
.SDcols = output_vars,
by = gene
]
progress(1.0)
result(
method = "neural",
scores = data[, .(gene, score)],
details = list(
seed = seed,
n_models = n_models,
all_results = data[, !..input_vars],
networks = networks
)
)
} }
) )
}
)
} }

View file

@ -11,38 +11,38 @@
#' #'
#' @export #' @export
proximity <- function(summarize = stats::median) { proximity <- function(summarize = stats::median) {
method( method(
id = "proximity", id = "proximity",
name = "Proximity", name = "Proximity",
description = "Proximity to telomeres", description = "Proximity to telomeres",
function(preset, progress) { function(preset, progress) {
species_ids <- preset$species_ids species_ids <- preset$species_ids
gene_ids <- preset$gene_ids gene_ids <- preset$gene_ids
cached("proximity", c(species_ids, gene_ids), { cached("proximity", c(species_ids, gene_ids), {
# Prefilter distances by species and gene. # Prefilter distances by species and gene.
data <- geposan::distances[ data <- geposan::distances[
species %chin% preset$species_ids & species %chin% preset$species_ids &
gene %chin% preset$gene_ids gene %chin% preset$gene_ids
] ]
# Compute the score as described above. # Compute the score as described above.
data <- data[, data <- data[,
.(combined_distance = as.double(summarize(distance))), .(combined_distance = as.double(summarize(distance))),
by = "gene" by = "gene"
] ]
# Normalize scores. # Normalize scores.
data[, score := 1 - combined_distance / max(combined_distance)] data[, score := 1 - combined_distance / max(combined_distance)]
progress(1.0) progress(1.0)
result( result(
method = "proximity", method = "proximity",
scores = data[, .(gene, score)], scores = data[, .(gene, score)],
details = list(data = data) details = list(data = data)
) )
}) })
} }
) )
} }

View file

@ -15,136 +15,136 @@
#' @export #' @export
species_adjacency <- function(distance_estimate = stats::median, species_adjacency <- function(distance_estimate = stats::median,
summarize = stats::median) { summarize = stats::median) {
method( method(
id = "species_adjacency", id = "species_adjacency",
name = "Species adj.", name = "Species adj.",
description = "Species adjacency", description = "Species adjacency",
function(preset, progress) { function(preset, progress) {
species_ids <- preset$species_ids species_ids <- preset$species_ids
gene_ids <- preset$gene_ids gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids reference_gene_ids <- preset$reference_gene_ids
cached( cached(
"species_adjacency", "species_adjacency",
c( c(
species_ids, species_ids,
gene_ids, gene_ids,
reference_gene_ids, reference_gene_ids,
distance_estimate, distance_estimate,
summarize summarize
), ),
{ # nolint { # nolint
# Prefilter distances. # Prefilter distances.
data <- geposan::distances[ data <- geposan::distances[
species %chin% species_ids & gene %chin% gene_ids species %chin% species_ids & gene %chin% gene_ids
] ]
progress_state <- 0.0 progress_state <- 0.0
progress_step <- 0.9 / length(species_ids) progress_step <- 0.9 / length(species_ids)
# Iterate through all species and find the distance # Iterate through all species and find the distance
# estimates within that species. # estimates within that species.
for (species_id in species_ids) { for (species_id in species_ids) {
# For all genes, compute the distance to one reference # For all genes, compute the distance to one reference
# gene at a time in one go. # gene at a time in one go.
for (reference_gene_id in reference_gene_ids) { for (reference_gene_id in reference_gene_ids) {
comparison_distance <- data[ comparison_distance <- data[
species == species_id & species == species_id &
gene == reference_gene_id, gene == reference_gene_id,
distance distance
] ]
column <- quote(reference_gene_id) column <- quote(reference_gene_id)
if (length(comparison_distance) != 1) { if (length(comparison_distance) != 1) {
# If we don't have a comparison distance, we # If we don't have a comparison distance, we
# can't compute a difference. This happens, if # can't compute a difference. This happens, if
# the species doesn't have the reference gene. # the species doesn't have the reference gene.
data[ data[
species == species_id & species == species_id &
gene %chin% gene_ids, gene %chin% gene_ids,
eval(column) := NA_integer_ eval(column) := NA_integer_
] ]
} else { } else {
data[ data[
species == species_id & species == species_id &
gene %chin% gene_ids, gene %chin% gene_ids,
eval(column) := eval(column) :=
abs(distance - comparison_distance) abs(distance - comparison_distance)
] ]
} }
} }
# Combine the distances to the different reference genes # Combine the distances to the different reference genes
# into one value using the provided function. # into one value using the provided function.
data[ data[
species == species_id & species == species_id &
gene %chin% gene_ids, gene %chin% gene_ids,
combined_distance := as.numeric( combined_distance := as.numeric(
distance_estimate(stats::na.omit( distance_estimate(stats::na.omit(
# Convert the data.table subset into a # Convert the data.table subset into a
# vector to get the correct na.omit # vector to get the correct na.omit
# behavior. # behavior.
as.matrix(.SD)[1, ] as.matrix(.SD)[1, ]
)) ))
), ),
.SDcols = reference_gene_ids, .SDcols = reference_gene_ids,
by = gene by = gene
] ]
progress_state <- progress_state + progress_step progress_state <- progress_state + progress_step
progress(progress_state) progress(progress_state)
} }
progress(0.9) progress(0.9)
# Remove the distances between the reference genes. # Remove the distances between the reference genes.
for (reference_gene_id in reference_gene_ids) { for (reference_gene_id in reference_gene_ids) {
column <- quote(reference_gene_id) column <- quote(reference_gene_id)
data[gene == reference_gene_id, eval(column) := NA] data[gene == reference_gene_id, eval(column) := NA]
} }
# Recompute the combined distance for the reference genes. # Recompute the combined distance for the reference genes.
data[ data[
gene %chin% reference_gene_ids, gene %chin% reference_gene_ids,
combined_distance := as.numeric( combined_distance := as.numeric(
distance_estimate(stats::na.omit( distance_estimate(stats::na.omit(
as.matrix(.SD)[1, ]) as.matrix(.SD)[1, ]
) ))
), ),
.SDcols = reference_gene_ids, .SDcols = reference_gene_ids,
by = list(species, gene) by = list(species, gene)
] ]
# Combine the distances into one value. # Combine the distances into one value.
results <- data[, results <- data[,
.( .(
summarized_distances = as.numeric( summarized_distances = as.numeric(
summarize(stats::na.omit(combined_distance)) summarize(stats::na.omit(combined_distance))
) )
), ),
by = gene by = gene
] ]
# Compute the final score by normalizing the difference. # Compute the final score by normalizing the difference.
results[ results[
, ,
score := 1 - summarized_distances / score := 1 - summarized_distances /
max(summarized_distances) max(summarized_distances)
] ]
progress(1.0) progress(1.0)
result( result(
method = "species_adjacency", method = "species_adjacency",
scores = results[, .(gene, score)], scores = results[, .(gene, score)],
details = list( details = list(
data = data, data = data,
results = results results = results
)
)
}
) )
)
} }
) )
}
)
} }

674
R/plots.R
View file

@ -9,7 +9,7 @@ base_color_transparent <- function() "#1964bf80"
#' Color palette for gene sets. #' Color palette for gene sets.
#' @noRd #' @noRd
gene_set_color <- function(index) { gene_set_color <- function(index) {
c("#FF7F00", "#4DAF4A", "#984EA3")[index] c("#FF7F00", "#4DAF4A", "#984EA3")[index]
} }
#' Plot gene positions. #' Plot gene positions.
@ -22,74 +22,74 @@ gene_set_color <- function(index) {
#' #'
#' @export #' @export
plot_positions <- function(species_ids, gene_sets) { plot_positions <- function(species_ids, gene_sets) {
if (!requireNamespace("plotly", quietly = TRUE)) { if (!requireNamespace("plotly", quietly = TRUE)) {
stop("Please install \"plotly\" to use this function.") stop("Please install \"plotly\" to use this function.")
} }
# Prefilter data by species. # Prefilter data by species.
data <- geposan::distances[species %chin% species_ids] data <- geposan::distances[species %chin% species_ids]
species_max_distance <- data[, species_max_distance <- data[,
.(max_distance = max(distance)), .(max_distance = max(distance)),
by = species by = species
] ]
# Prefilter species. # Prefilter species.
species <- geposan::species[id %chin% species_ids] species <- geposan::species[id %chin% species_ids]
plot <- plotly::plot_ly() |> plot <- plotly::plot_ly() |>
plotly::layout( plotly::layout(
xaxis = list( xaxis = list(
title = "Species", title = "Species",
tickvals = species$id, tickvals = species$id,
ticktext = species$name ticktext = species$name
), ),
yaxis = list(title = "Distance to telomeres [Bp]"), yaxis = list(title = "Distance to telomeres [Bp]"),
bargap = 0.9 bargap = 0.9
) |> ) |>
plotly::add_bars( plotly::add_bars(
data = species_max_distance, data = species_max_distance,
x = ~species, x = ~species,
y = ~max_distance, y = ~max_distance,
name = "All genes", name = "All genes",
marker = list(color = base_color()) marker = list(color = base_color())
)
if (length(gene_sets) > 0) {
# Include gene information which will be used for labeling
gene_set_data <- merge(
data[gene %chin% unlist(gene_sets)],
geposan::genes,
by.x = "gene",
by.y = "id"
)
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set <- gene_sets[[gene_set_name]]
plot <- plot |> plotly::add_markers(
data = gene_set_data[gene %chin% gene_set],
x = ~species,
y = ~distance,
name = gene_set_name,
text = ~ glue::glue(
"<b>{name}</b><br>",
"{round(distance / 1000000, digits = 2)} MBp"
),
hoverinfo = "text",
marker = list(
size = 10,
color = gene_set_color(index)
) )
)
if (length(gene_sets) > 0) { index <- index + 1
# Include gene information which will be used for labeling
gene_set_data <- merge(
data[gene %chin% unlist(gene_sets)],
geposan::genes,
by.x = "gene",
by.y = "id"
)
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set <- gene_sets[[gene_set_name]]
plot <- plot |> plotly::add_markers(
data = gene_set_data[gene %chin% gene_set],
x = ~species,
y = ~distance,
name = gene_set_name,
text = ~ glue::glue(
"<b>{name}</b><br>",
"{round(distance / 1000000, digits = 2)} MBp"
),
hoverinfo = "text",
marker = list(
size = 10,
color = gene_set_color(index)
)
)
index <- index + 1
}
} }
}
plot plot
} }
@ -108,75 +108,75 @@ plot_positions <- function(species_ids, gene_sets) {
#' #'
#' @export #' @export
plot_rankings <- function(rankings, gene_sets) { plot_rankings <- function(rankings, gene_sets) {
if (!requireNamespace("plotly", quietly = TRUE)) { if (!requireNamespace("plotly", quietly = TRUE)) {
stop("Please install \"plotly\" to use this function.") stop("Please install \"plotly\" to use this function.")
} }
plot <- plotly::plot_ly() |> plot <- plotly::plot_ly() |>
plotly::layout( plotly::layout(
xaxis = list(tickvals = names(rankings)), xaxis = list(tickvals = names(rankings)),
yaxis = list(title = "Score") yaxis = list(title = "Score")
)
is_first <- TRUE
for (ranking_name in names(rankings)) {
ranking <- rankings[[ranking_name]]
plot <- plot |> plotly::add_trace(
data = ranking,
x = ranking_name,
y = ~score,
name = "All genes",
type = "violin",
spanmode = "hard",
points = FALSE,
showlegend = is_first,
hoverinfo = "skip",
line = list(color = base_color()),
fillcolor = base_color_transparent()
)
if (length(gene_sets) > 0) {
gene_set_data <- merge(
ranking[gene %chin% unlist(gene_sets)],
geposan::genes,
by.x = "gene",
by.y = "id"
)
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set <- gene_sets[[gene_set_name]]
plot <- plot |> plotly::add_markers(
data = gene_set_data[gene %chin% gene_set],
x = ranking_name,
y = ~score,
name = gene_set_name,
text = ~ glue::glue(
"<b>{name}</b><br>",
"Score: {round(score, digits = 2)}<br>",
"Rank: {rank}<br>",
"Percentile: {round(percentile * 100, digits = 2)}%"
),
hoverinfo = "text",
showlegend = is_first,
marker = list(
size = 10,
color = gene_set_color(index)
)
) )
is_first <- TRUE index <- index + 1
}
for (ranking_name in names(rankings)) {
ranking <- rankings[[ranking_name]]
plot <- plot |> plotly::add_trace(
data = ranking,
x = ranking_name,
y = ~score,
name = "All genes",
type = "violin",
spanmode = "hard",
points = FALSE,
showlegend = is_first,
hoverinfo = "skip",
line = list(color = base_color()),
fillcolor = base_color_transparent()
)
if (length(gene_sets) > 0) {
gene_set_data <- merge(
ranking[gene %chin% unlist(gene_sets)],
geposan::genes,
by.x = "gene",
by.y = "id"
)
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set <- gene_sets[[gene_set_name]]
plot <- plot |> plotly::add_markers(
data = gene_set_data[gene %chin% gene_set],
x = ranking_name,
y = ~score,
name = gene_set_name,
text = ~ glue::glue(
"<b>{name}</b><br>",
"Score: {round(score, digits = 2)}<br>",
"Rank: {rank}<br>",
"Percentile: {round(percentile * 100, digits = 2)}%"
),
hoverinfo = "text",
showlegend = is_first,
marker = list(
size = 10,
color = gene_set_color(index)
)
)
index <- index + 1
}
}
is_first <- FALSE
} }
plot is_first <- FALSE
}
plot
} }
@ -194,88 +194,88 @@ plot_rankings <- function(rankings, gene_sets) {
#' #'
#' @export #' @export
plot_scores <- function(ranking, gene_sets = NULL, max_rank = NULL) { plot_scores <- function(ranking, gene_sets = NULL, max_rank = NULL) {
if (!requireNamespace("plotly", quietly = TRUE)) { if (!requireNamespace("plotly", quietly = TRUE)) {
stop("Please install \"plotly\" to use this function.") stop("Please install \"plotly\" to use this function.")
} }
# To speed up rendering, don't show every single gene. # To speed up rendering, don't show every single gene.
n_ranks <- nrow(ranking) n_ranks <- nrow(ranking)
sample_ranking <- ranking[seq(1, n_ranks, 5)] sample_ranking <- ranking[seq(1, n_ranks, 5)]
plot <- plotly::plot_ly() |> plot <- plotly::plot_ly() |>
plotly::add_lines( plotly::add_lines(
data = sample_ranking, data = sample_ranking,
x = ~percentile, x = ~percentile,
y = ~score, y = ~score,
name = "All genes", name = "All genes",
hoverinfo = "skip", hoverinfo = "skip",
line = list(width = 4, color = base_color()) line = list(width = 4, color = base_color())
) |> ) |>
plotly::layout( plotly::layout(
xaxis = list( xaxis = list(
title = "Percentile", title = "Percentile",
tickformat = ".0%" tickformat = ".0%"
), ),
yaxis = list(title = "Score") yaxis = list(title = "Score")
)
if (length(gene_sets) > 0) {
# Include gene information which will be used for labeling
gene_set_data <- merge(
ranking[gene %chin% unlist(gene_sets)],
geposan::genes,
by.x = "gene",
by.y = "id"
)
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set <- gene_sets[[gene_set_name]]
plot <- plot |> plotly::add_markers(
data = gene_set_data[gene %chin% gene_set],
x = ~percentile,
y = ~score,
name = gene_set_name,
text = ~ glue::glue(
"<b>{name}</b><br>",
"Score: {round(score, digits = 2)}<br>",
"Rank: {rank}<br>",
"Percentile: {round(percentile * 100, digits = 2)}%"
),
hoverinfo = "text",
marker = list(
size = 10,
color = gene_set_color(index)
) )
)
if (length(gene_sets) > 0) { index <- index + 1
# Include gene information which will be used for labeling }
gene_set_data <- merge( }
ranking[gene %chin% unlist(gene_sets)],
geposan::genes,
by.x = "gene", if (!is.null(max_rank)) {
by.y = "id" first_not_included_rank <- max_rank + 1
last_rank <- ranking[, .N]
if (first_not_included_rank <= last_rank) {
plot <- plot |> plotly::layout(
shapes = list(
type = "rect",
fillcolor = "black",
opacity = 0.1,
x0 = 1 - first_not_included_rank / n_ranks,
x1 = 1 - last_rank / n_ranks,
y0 = 0.0,
y1 = 1.0
) )
)
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set <- gene_sets[[gene_set_name]]
plot <- plot |> plotly::add_markers(
data = gene_set_data[gene %chin% gene_set],
x = ~percentile,
y = ~score,
name = gene_set_name,
text = ~ glue::glue(
"<b>{name}</b><br>",
"Score: {round(score, digits = 2)}<br>",
"Rank: {rank}<br>",
"Percentile: {round(percentile * 100, digits = 2)}%"
),
hoverinfo = "text",
marker = list(
size = 10,
color = gene_set_color(index)
)
)
index <- index + 1
}
} }
}
plot
if (!is.null(max_rank)) {
first_not_included_rank <- max_rank + 1
last_rank <- ranking[, .N]
if (first_not_included_rank <= last_rank) {
plot <- plot |> plotly::layout(
shapes = list(
type = "rect",
fillcolor = "black",
opacity = 0.1,
x0 = 1 - first_not_included_rank / n_ranks,
x1 = 1 - last_rank / n_ranks,
y0 = 0.0,
y1 = 1.0
)
)
}
}
plot
} }
#' Visualize a ranking by comparing gene sets in a boxplot. #' Visualize a ranking by comparing gene sets in a boxplot.
@ -290,44 +290,44 @@ plot_scores <- function(ranking, gene_sets = NULL, max_rank = NULL) {
#' #'
#' @export #' @export
plot_boxplot <- function(ranking, gene_sets = NULL) { plot_boxplot <- function(ranking, gene_sets = NULL) {
if (!requireNamespace("plotly", quietly = TRUE)) { if (!requireNamespace("plotly", quietly = TRUE)) {
stop("Please install \"plotly\" to use this function.") stop("Please install \"plotly\" to use this function.")
}
plot <- plotly::plot_ly() |>
plotly::add_boxplot(
data = ranking,
x = "All genes",
y = ~score,
name = "All genes",
showlegend = FALSE,
line = list(color = base_color())
) |>
plotly::layout(
xaxis = list(tickvals = c("All genes", names(gene_sets))),
yaxis = list(title = "Score")
)
if (length(gene_sets) > 0) {
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set <- gene_sets[[gene_set_name]]
plot <- plot |> plotly::add_boxplot(
data = ranking[gene %chin% gene_set],
x = gene_set_name,
y = ~score,
name = gene_set_name,
showlegend = FALSE,
line = list(color = gene_set_color(index))
)
index <- index + 1
} }
}
plot <- plotly::plot_ly() |> plot
plotly::add_boxplot(
data = ranking,
x = "All genes",
y = ~score,
name = "All genes",
showlegend = FALSE,
line = list(color = base_color())
) |>
plotly::layout(
xaxis = list(tickvals = c("All genes", names(gene_sets))),
yaxis = list(title = "Score")
)
if (length(gene_sets) > 0) {
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set <- gene_sets[[gene_set_name]]
plot <- plot |> plotly::add_boxplot(
data = ranking[gene %chin% gene_set],
x = gene_set_name,
y = ~score,
name = gene_set_name,
showlegend = FALSE,
line = list(color = gene_set_color(index))
)
index <- index + 1
}
}
plot
} }
#' Show the distribution of scores across chromosomes. #' Show the distribution of scores across chromosomes.
@ -340,46 +340,46 @@ plot_boxplot <- function(ranking, gene_sets = NULL) {
#' #'
#' @export #' @export
plot_chromosomes <- function(ranking) { plot_chromosomes <- function(ranking) {
if (!requireNamespace("plotly", quietly = TRUE)) { if (!requireNamespace("plotly", quietly = TRUE)) {
stop("Please install \"plotly\" to use this function.") stop("Please install \"plotly\" to use this function.")
} }
data <- merge(ranking, geposan::genes, by.x = "gene", by.y = "id") data <- merge(ranking, geposan::genes, by.x = "gene", by.y = "id")
data <- data[, .(score = mean(score)), by = "chromosome"] data <- data[, .(score = mean(score)), by = "chromosome"]
# Get an orderable integer from a chromosome name. # Get an orderable integer from a chromosome name.
chromosome_index <- function(chromosome) { chromosome_index <- function(chromosome) {
index <- suppressWarnings(as.integer(chromosome)) index <- suppressWarnings(as.integer(chromosome))
ifelse( ifelse(
!is.na(index), !is.na(index),
index, index,
ifelse( ifelse(
chromosome == "X", chromosome == "X",
998, 998,
999 999
) )
) )
} }
data[, index := chromosome_index(chromosome)] data[, index := chromosome_index(chromosome)]
setorder(data, "index") setorder(data, "index")
plotly::plot_ly( plotly::plot_ly(
data = data, data = data,
x = ~chromosome, x = ~chromosome,
y = ~score, y = ~score,
type = "bar", type = "bar",
marker = list(color = base_color()) marker = list(color = base_color())
) |> ) |>
plotly::layout( plotly::layout(
xaxis = list( xaxis = list(
title = "Chromosome", title = "Chromosome",
categoryorder = "array", categoryorder = "array",
categoryarray = ~chromosome categoryarray = ~chromosome
), ),
yaxis = list(title = "Mean score") yaxis = list(title = "Mean score")
) )
} }
#' Plot scores in relation to chromosomal position of genes. #' Plot scores in relation to chromosomal position of genes.
@ -398,74 +398,74 @@ plot_chromosomes <- function(ranking) {
plot_scores_by_position <- function(ranking, plot_scores_by_position <- function(ranking,
chromosome_name = NULL, chromosome_name = NULL,
gene_sets = NULL) { gene_sets = NULL) {
if (!requireNamespace("plotly", quietly = TRUE)) { if (!requireNamespace("plotly", quietly = TRUE)) {
stop("Please install \"plotly\" to use this function.") stop("Please install \"plotly\" to use this function.")
} }
distance_data <- if (!is.null(chromosome_name)) { distance_data <- if (!is.null(chromosome_name)) {
chromosome_name_ <- chromosome_name chromosome_name_ <- chromosome_name
geposan::distances[ geposan::distances[
species == "hsapiens" & species == "hsapiens" &
chromosome_name == chromosome_name_ chromosome_name == chromosome_name_
] ]
} else { } else {
geposan::distances[species == "hsapiens"] geposan::distances[species == "hsapiens"]
} }
data <- merge(ranking, distance_data, by = "gene") data <- merge(ranking, distance_data, by = "gene")
data <- merge( data <- merge(
data, data,
geposan::genes, geposan::genes,
by.x = "gene", by.x = "gene",
by.y = "id" by.y = "id"
)
data[, `:=`(gene_set = "All genes", color = base_color())]
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set_genes <- gene_sets[[gene_set_name]]
data[
gene %chin% gene_set_genes,
`:=`(
gene_set = gene_set_name,
color = gene_set_color(index)
)
]
index <- index + 1
}
# Use distances instead of positions in case all chromosomes are included.
if (is.null(chromosome_name)) {
data[, x := distance]
} else {
data[, x := start_position]
}
plotly::plot_ly() |>
plotly::add_markers(
data = data,
x = ~x,
y = ~score,
name = ~gene_set,
text = ~ glue::glue(
"<b>{name}</b><br>",
if (is.null(chromosome_name)) "Distance: " else "Position: ",
"{round(x / 1000000, digits = 2)} MBp<br>",
"Score: {round(score, digits = 2)}<br>",
"Rank: {rank}<br>",
"Percentile: {round(percentile * 100, digits = 2)}%"
),
hoverinfo = "text",
) |>
plotly::layout(
xaxis = list(title = if (is.null(chromosome_name)) {
"Distance (Bp)"
} else {
"Position (Bp)"
}),
yaxis = list(title = "Score")
) )
data[, `:=`(gene_set = "All genes", color = base_color())]
index <- 1
for (gene_set_name in names(gene_sets)) {
gene_set_genes <- gene_sets[[gene_set_name]]
data[
gene %chin% gene_set_genes,
`:=`(
gene_set = gene_set_name,
color = gene_set_color(index)
)
]
index <- index + 1
}
# Use distances instead of positions in case all chromosomes are included.
if (is.null(chromosome_name)) {
data[, x := distance]
} else {
data[, x := start_position]
}
plotly::plot_ly() |>
plotly::add_markers(
data = data,
x = ~x,
y = ~score,
name = ~gene_set,
text = ~ glue::glue(
"<b>{name}</b><br>",
if (is.null(chromosome_name)) "Distance: " else "Position: ",
"{round(x / 1000000, digits = 2)} MBp<br>",
"Score: {round(score, digits = 2)}<br>",
"Rank: {rank}<br>",
"Percentile: {round(percentile * 100, digits = 2)}%"
),
hoverinfo = "text",
) |>
plotly::layout(
xaxis = list(title = if (is.null(chromosome_name)) {
"Distance (Bp)"
} else {
"Position (Bp)"
}),
yaxis = list(title = "Score")
)
} }

View file

@ -21,55 +21,55 @@ preset <- function(reference_gene_ids,
methods = all_methods(), methods = all_methods(),
species_ids = geposan::species$id, species_ids = geposan::species$id,
gene_ids = geposan::genes$id) { gene_ids = geposan::genes$id) {
# Count included species per gene. # Count included species per gene.
genes_n_species <- geposan::distances[ genes_n_species <- geposan::distances[
species %chin% species_ids, species %chin% species_ids,
.(n_species = .N), .(n_species = .N),
by = "gene" by = "gene"
] ]
# Filter out genes with less than 25% existing orthologs. # Filter out genes with less than 25% existing orthologs.
gene_ids_filtered <- genes_n_species[ gene_ids_filtered <- genes_n_species[
gene %chin% gene_ids & gene %chin% gene_ids &
n_species >= 0.25 * length(species_ids), n_species >= 0.25 * length(species_ids),
gene gene
] ]
reference_gene_ids_excluded <- reference_gene_ids[ reference_gene_ids_excluded <- reference_gene_ids[
!reference_gene_ids %chin% gene_ids_filtered !reference_gene_ids %chin% gene_ids_filtered
] ]
if (length(reference_gene_ids_excluded > 0)) { if (length(reference_gene_ids_excluded > 0)) {
warning(paste0( warning(paste0(
"The following reference gene IDs are excluded from the preset ", "The following reference gene IDs are excluded from the preset ",
"because they don't have enough data: ", "because they don't have enough data: ",
paste(reference_gene_ids_excluded, collapse = ", ") paste(reference_gene_ids_excluded, collapse = ", ")
)) ))
} }
reference_gene_ids_included <- reference_gene_ids[ reference_gene_ids_included <- reference_gene_ids[
reference_gene_ids %chin% gene_ids_filtered reference_gene_ids %chin% gene_ids_filtered
] ]
if (length(reference_gene_ids_included) < 1) { if (length(reference_gene_ids_included) < 1) {
stop(paste0( stop(paste0(
"There has to be at least one reference gene for the preset to be ", "There has to be at least one reference gene for the preset to be ",
"valid. Please note that some methods may require more reference ", "valid. Please note that some methods may require more reference ",
"genes." "genes."
)) ))
} }
# The included data gets sorted to be able to produce predictable hashes # The included data gets sorted to be able to produce predictable hashes
# for the object later. # for the object later.
structure( structure(
list( list(
reference_gene_ids = sort(reference_gene_ids_included), reference_gene_ids = sort(reference_gene_ids_included),
methods = methods, methods = methods,
species_ids = sort(species_ids), species_ids = sort(species_ids),
gene_ids = sort(gene_ids_filtered) gene_ids = sort(gene_ids_filtered)
), ),
class = "geposan_preset" class = "geposan_preset"
) )
} }
#' S3 method to print a preset object. #' S3 method to print a preset object.
@ -81,20 +81,20 @@ preset <- function(reference_gene_ids,
#' #'
#' @export #' @export
print.geposan_preset <- function(x, ...) { print.geposan_preset <- function(x, ...) {
cat(sprintf( cat(sprintf(
paste0( paste0(
"geposan preset:", "geposan preset:",
"\n Reference genes: %i", "\n Reference genes: %i",
"\n Included methods: %s", "\n Included methods: %s",
"\n Number of species: %i", "\n Number of species: %i",
"\n Number of genes: %i", "\n Number of genes: %i",
"\n" "\n"
), ),
length(x$reference_gene_ids), length(x$reference_gene_ids),
paste(sapply(x$methods, function(m) m$id), collapse = ", "), paste(sapply(x$methods, function(m) m$id), collapse = ", "),
length(x$species_ids), length(x$species_ids),
length(x$gene_ids) length(x$gene_ids)
)) ))
invisible(x) invisible(x)
} }

View file

@ -13,35 +13,35 @@
#' #'
#' @export #' @export
ranking <- function(analysis, weights) { ranking <- function(analysis, weights) {
ranking <- if (inherits(analysis, "geposan_analysis")) { ranking <- if (inherits(analysis, "geposan_analysis")) {
copy(analysis$scores) copy(analysis$scores)
} else if (inherits(analysis, "geposan_ranking")) { } else if (inherits(analysis, "geposan_ranking")) {
copy(analysis) copy(analysis)
} else { } else {
stop("Invalid analyis. Use geposan::analyze().") stop("Invalid analyis. Use geposan::analyze().")
} }
ranking[, score := 0.0] ranking[, score := 0.0]
for (method in names(weights)) { for (method in names(weights)) {
weighted <- weights[[method]] * ranking[, ..method] weighted <- weights[[method]] * ranking[, ..method]
ranking[, score := score + weighted] ranking[, score := score + weighted]
} }
# Normalize scores to be between 0.0 and 1.0. # Normalize scores to be between 0.0 and 1.0.
min_score <- ranking[, min(score)] min_score <- ranking[, min(score)]
max_score <- ranking[, max(score)] max_score <- ranking[, max(score)]
score_range <- max_score - min_score score_range <- max_score - min_score
ranking[, score := (score - min_score) / score_range] ranking[, score := (score - min_score) / score_range]
setorder(ranking, -score) setorder(ranking, -score)
ranking[, rank := .I] ranking[, rank := .I]
ranking[, percentile := 1 - rank / nrow(ranking)] ranking[, percentile := 1 - rank / nrow(ranking)]
structure( structure(
ranking, ranking,
class = c("geposan_ranking", class(ranking)) class = c("geposan_ranking", class(ranking))
) )
} }
#' Find the best weights to rank the results. #' Find the best weights to rank the results.
@ -61,40 +61,40 @@ ranking <- function(analysis, weights) {
#' @export #' @export
optimal_weights <- function(analysis, methods, reference_gene_ids, optimal_weights <- function(analysis, methods, reference_gene_ids,
target = "mean") { target = "mean") {
if (!inherits(analysis, c("geposan_analysis", "geposan_ranking"))) { if (!inherits(analysis, c("geposan_analysis", "geposan_ranking"))) {
stop("Invalid analyis. Use geposan::analyze().") stop("Invalid analyis. Use geposan::analyze().")
}
# Compute the target rank of the reference genes when applying the
# weights.
target_rank <- function(factors) {
data <- ranking(analysis, as.list(factors))
result <- data[
gene %chin% reference_gene_ids,
if (target == "min") {
min(rank)
} else if (target == "max") {
max(rank)
} else if (target == "mean") {
mean(rank)
} else {
stats::median(rank)
}
]
if (result > 0) {
result
} else {
Inf
} }
}
initial_factors <- rep(1.0, length(methods))
names(initial_factors) <- methods
# Compute the target rank of the reference genes when applying the optimal_factors <- stats::optim(initial_factors, target_rank)$par
# weights.
target_rank <- function(factors) {
data <- ranking(analysis, as.list(factors))
result <- data[ as.list(optimal_factors / max(abs(optimal_factors)))
gene %chin% reference_gene_ids,
if (target == "min") {
min(rank)
} else if (target == "max") {
max(rank)
} else if (target == "mean") {
mean(rank)
} else {
stats::median(rank)
}
]
if (result > 0) {
result
} else {
Inf
}
}
initial_factors <- rep(1.0, length(methods))
names(initial_factors) <- methods
optimal_factors <- stats::optim(initial_factors, target_rank)$par
as.list(optimal_factors / max(abs(optimal_factors)))
} }

View file

@ -10,18 +10,18 @@
#' #'
#' @export #' @export
result <- function(method_id, scores, details = list()) { result <- function(method_id, scores, details = list()) {
stopifnot(is.data.frame(scores) & stopifnot(is.data.frame(scores) &
c("gene", "score") %chin% colnames(scores)) c("gene", "score") %chin% colnames(scores))
stopifnot(is.list(details)) stopifnot(is.list(details))
structure( structure(
list( list(
method_id = method_id, method_id = method_id,
scores = scores, scores = scores,
details = details details = details
), ),
class = "geposan_result" class = "geposan_result"
) )
} }
#' Print a result object. #' Print a result object.
@ -33,18 +33,18 @@ result <- function(method_id, scores, details = list()) {
#' #'
#' @export #' @export
print.geposan_result <- function(x, ...) { print.geposan_result <- function(x, ...) {
cat(sprintf( cat(sprintf(
paste0( paste0(
"geposan result:", "geposan result:",
"\n Method: %s", "\n Method: %s",
"\n Number of genes: %i", "\n Number of genes: %i",
"\n Available details: %s", "\n Available details: %s",
"\n" "\n"
), ),
x$method_id, x$method_id,
nrow(x$scores), nrow(x$scores),
paste(names(x$details), collapse = ", ") paste(names(x$details), collapse = ", ")
)) ))
invisible(x) invisible(x)
} }

View file

@ -8,25 +8,25 @@
# @param objects A vector of objects that this expression depends on. The hash # @param objects A vector of objects that this expression depends on. The hash
# of those objects will be used for identifying the cache file. # of those objects will be used for identifying the cache file.
cached <- function(name, objects, expr) { cached <- function(name, objects, expr) {
if (!dir.exists("cache")) { if (!dir.exists("cache")) {
dir.create("cache") dir.create("cache")
} }
id <- rlang::hash(objects) id <- rlang::hash(objects)
cache_file <- sprintf("cache/%s_%s.rda", name, id) cache_file <- sprintf("cache/%s_%s.rda", name, id)
if (file.exists(cache_file)) { if (file.exists(cache_file)) {
# If the cache file exists, we restore the data from it. # If the cache file exists, we restore the data from it.
load(cache_file) load(cache_file)
} else { } else {
# If the cache file doesn't exist, we have to do the computation. # If the cache file doesn't exist, we have to do the computation.
data <- expr data <- expr
# The results are cached for the next run. # The results are cached for the next run.
save(data, file = cache_file, compress = "xz") save(data, file = cache_file, compress = "xz")
} }
data data
} }
# This is needed to make data.table's symbols available within the package. # This is needed to make data.table's symbols available within the package.

View file

@ -27,68 +27,68 @@
#' #'
#' @export #' @export
validate <- function(ranking, reference_gene_ids, method_ids, progress = NULL) { validate <- function(ranking, reference_gene_ids, method_ids, progress = NULL) {
if (!inherits(ranking, "geposan_ranking")) { if (!inherits(ranking, "geposan_ranking")) {
stop("Ranking is invalid. Use geposan::ranking().") stop("Ranking is invalid. Use geposan::ranking().")
} }
if (is.null(progress)) { if (is.null(progress)) {
progress_bar <- progress::progress_bar$new() progress_bar <- progress::progress_bar$new()
progress_bar$update(0.0) progress_bar$update(0.0)
progress <- function(progress_value) { progress <- function(progress_value) {
if (!progress_bar$finished) { if (!progress_bar$finished) {
progress_bar$update(progress_value) progress_bar$update(progress_value)
if (progress_value >= 1.0) { if (progress_value >= 1.0) {
progress_bar$terminate() progress_bar$terminate()
}
}
} }
}
} }
}
progress_state <- 0.0 progress_state <- 0.0
progress_step <- 1.0 / length(reference_gene_ids) progress_step <- 1.0 / length(reference_gene_ids)
results <- ranking[gene %chin% reference_gene_ids, .(gene, percentile)] results <- ranking[gene %chin% reference_gene_ids, .(gene, percentile)]
for (gene_id in reference_gene_ids) { for (gene_id in reference_gene_ids) {
included_gene_ids <- reference_gene_ids[ included_gene_ids <- reference_gene_ids[
reference_gene_ids != gene_id reference_gene_ids != gene_id
] ]
weights <- optimal_weights( weights <- optimal_weights(
ranking, ranking,
method_ids, method_ids,
included_gene_ids included_gene_ids
)
ranking_validation <- ranking(ranking, weights)
results[
gene == gene_id,
percentile_validation := ranking_validation[
gene == gene_id,
percentile
]
]
if (!is.null(progress)) {
progress_state <- progress_state + progress_step
progress(progress_state)
}
}
results[, error := percentile - percentile_validation]
setorder(results, error)
structure(
list(
validation = results,
mean_percentile_original = results[, mean(percentile)],
mean_percentile_validation = results[, mean(percentile_validation)],
mean_error = results[, mean(error)]
),
class = "geposan_validation"
) )
ranking_validation <- ranking(ranking, weights)
results[
gene == gene_id,
percentile_validation := ranking_validation[
gene == gene_id,
percentile
]
]
if (!is.null(progress)) {
progress_state <- progress_state + progress_step
progress(progress_state)
}
}
results[, error := percentile - percentile_validation]
setorder(results, error)
structure(
list(
validation = results,
mean_percentile_original = results[, mean(percentile)],
mean_percentile_validation = results[, mean(percentile_validation)],
mean_error = results[, mean(error)]
),
class = "geposan_validation"
)
} }
#' S3 method to print a validation object. #' S3 method to print a validation object.
@ -100,18 +100,18 @@ validate <- function(ranking, reference_gene_ids, method_ids, progress = NULL) {
#' #'
#' @export #' @export
print.geposan_validation <- function(x, ...) { print.geposan_validation <- function(x, ...) {
cat(sprintf( cat(sprintf(
paste0( paste0(
"geposan validation:", "geposan validation:",
"\n Mean percentile original: %.1f%%", "\n Mean percentile original: %.1f%%",
"\n Mean percentile validation: %.1f%%", "\n Mean percentile validation: %.1f%%",
"\n Mean error: %.1f percent points", "\n Mean error: %.1f percent points",
"\n" "\n"
), ),
x$mean_percentile_original * 100, x$mean_percentile_original * 100,
x$mean_percentile_validation * 100, x$mean_percentile_validation * 100,
x$mean_error * 100 x$mean_error * 100
)) ))
invisible(x) invisible(x)
} }

View file

@ -5,23 +5,23 @@ ensembl_api_url <- "https://rest.ensembl.org"
#' Perform a request to the Ensembl REST API. #' Perform a request to the Ensembl REST API.
ensembl_request <- function(api_path) { ensembl_request <- function(api_path) {
content(stop_for_status(GET( content(stop_for_status(GET(
paste0(ensembl_api_url, api_path), paste0(ensembl_api_url, api_path),
content_type_json() content_type_json()
))) )))
} }
#' Get IDs of all available vertebrates. #' Get IDs of all available vertebrates.
get_species_ids <- function() { get_species_ids <- function() {
species <- ensembl_request("/info/species")$species species <- ensembl_request("/info/species")$species
sapply(species, function(species) species$name) sapply(species, function(species) species$name)
} }
#' Get all chromosomes names for a species. #' Get all chromosomes names for a species.
get_species_chromosomes <- function(species_id) { get_species_chromosomes <- function(species_id) {
chromosomes <- unlist(ensembl_request( chromosomes <- unlist(ensembl_request(
paste0("/info/assembly/", species_id) paste0("/info/assembly/", species_id)
)$karyotype) )$karyotype)
} }
#' Get a vector of all available unqiue chromosome names. #' Get a vector of all available unqiue chromosome names.
@ -29,6 +29,6 @@ get_species_chromosomes <- function(species_id) {
#' There are multiple names for mitochondrial DNA which have to be removed #' There are multiple names for mitochondrial DNA which have to be removed
#' manually, unfortunately. #' manually, unfortunately.
get_all_chromosomes <- function() { get_all_chromosomes <- function() {
chromosomes <- sapply(get_species_ids(), get_species_chromosomes) chromosomes <- sapply(get_species_ids(), get_species_chromosomes)
unique(unlist(chromosomes)) unique(unlist(chromosomes))
} }

View file

@ -12,8 +12,8 @@ ensembl_datasets <- data.table(biomaRt::listDatasets(ensembl))
# Filter out species ID and name from the result. # Filter out species ID and name from the result.
species <- ensembl_datasets[, .( species <- ensembl_datasets[, .(
id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2], id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2],
name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2] name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2]
)] )]
# List of assemblies that the Ensembl Rest API advertises as chromosomes. # List of assemblies that the Ensembl Rest API advertises as chromosomes.
@ -23,232 +23,232 @@ species <- ensembl_datasets[, .(
# #
# See get_all_chromosomes() # See get_all_chromosomes()
valid_chromosome_names <- c( valid_chromosome_names <- c(
"1", "1",
"2", "2",
"3", "3",
"4", "4",
"5", "5",
"6", "6",
"7", "7",
"8", "8",
"9", "9",
"10", "10",
"11", "11",
"12", "12",
"13", "13",
"14", "14",
"15", "15",
"16", "16",
"17", "17",
"18", "18",
"19", "19",
"20", "20",
"X", "X",
"Y", "Y",
"21", "21",
"4A", "4A",
"1A", "1A",
"22", "22",
"23", "23",
"24", "24",
"25LG1", "25LG1",
"25LG2", "25LG2",
"26", "26",
"27", "27",
"28", "28",
"LGE22", "LGE22",
"Z", "Z",
"25", "25",
"29", "29",
"A1", "A1",
"A2", "A2",
"A3", "A3",
"B1", "B1",
"B2", "B2",
"B3", "B3",
"B4", "B4",
"C1", "C1",
"C2", "C2",
"D1", "D1",
"D2", "D2",
"D3", "D3",
"D4", "D4",
"E1", "E1",
"E2", "E2",
"E3", "E3",
"F1", "F1",
"F2", "F2",
"2A", "2A",
"2B", "2B",
"LG01", "LG01",
"LG02", "LG02",
"LG03", "LG03",
"LG04", "LG04",
"LG05", "LG05",
"LG06", "LG06",
"LG07", "LG07",
"LG08", "LG08",
"LG09", "LG09",
"LG10", "LG10",
"LG11", "LG11",
"LG12", "LG12",
"LG13", "LG13",
"LG14", "LG14",
"LG15", "LG15",
"LG16", "LG16",
"LG17", "LG17",
"LG18", "LG18",
"LG19", "LG19",
"LG20", "LG20",
"LG21", "LG21",
"LG22", "LG22",
"LG23", "LG23",
"LG24", "LG24",
"LG25", "LG25",
"I", "I",
"II", "II",
"III", "III",
"IV", "IV",
"V", "V",
"VI", "VI",
"VII", "VII",
"VIII", "VIII",
"IX", "IX",
"XI", "XI",
"XII", "XII",
"XIII", "XIII",
"XIV", "XIV",
"XV", "XV",
"XVI", "XVI",
"XVII", "XVII",
"XVIII", "XVIII",
"XIX", "XIX",
"XX", "XX",
"XXI", "XXI",
"XXII", "XXII",
"XXIII", "XXIII",
"XXIV", "XXIV",
"W", "W",
"30", "30",
"31", "31",
"32", "32",
"33", "33",
"34", "34",
"35", "35",
"36", "36",
"37", "37",
"38", "38",
"39", "39",
"40", "40",
"2L", "2L",
"2R", "2R",
"3L", "3L",
"3R", "3R",
"LG1", "LG1",
"LG2", "LG2",
"LG3", "LG3",
"LG4", "LG4",
"LG5", "LG5",
"LG6", "LG6",
"LG7", "LG7",
"LG8", "LG8",
"LG9", "LG9",
"LG26", "LG26",
"LG27", "LG27",
"LG28", "LG28",
"LG29", "LG29",
"LG30", "LG30",
"1a", "1a",
"7b", "7b",
"22a", "22a",
"LGE22C19W28_E50C23", "LGE22C19W28_E50C23",
"LGE64", "LGE64",
"7a", "7a",
"MIC_1", "MIC_1",
"MIC_10", "MIC_10",
"MIC_11", "MIC_11",
"MIC_2", "MIC_2",
"MIC_3", "MIC_3",
"MIC_4", "MIC_4",
"MIC_5", "MIC_5",
"MIC_6", "MIC_6",
"MIC_7", "MIC_7",
"MIC_8", "MIC_8",
"MIC_9", "MIC_9",
"sgr01", "sgr01",
"sgr02", "sgr02",
"sgr03", "sgr03",
"sgr04", "sgr04",
"sgr05", "sgr05",
"sgr06", "sgr06",
"sgr07", "sgr07",
"sgr08", "sgr08",
"sgr09", "sgr09",
"sgr10", "sgr10",
"sgr11", "sgr11",
"sgr12", "sgr12",
"sgr13", "sgr13",
"sgr14", "sgr14",
"sgr15", "sgr15",
"sgr16", "sgr16",
"sgr17", "sgr17",
"sgr18", "sgr18",
"sgr19", "sgr19",
"X1", "X1",
"X2", "X2",
"X3", "X3",
"X4", "X4",
"X5", "X5",
"a", "a",
"b", "b",
"c", "c",
"d", "d",
"f", "f",
"g", "g",
"h", "h",
"41", "41",
"42", "42",
"43", "43",
"44", "44",
"45", "45",
"46", "46",
"47", "47",
"48", "48",
"49", "49",
"50", "50",
"LG28B", "LG28B",
"LG30F", "LG30F",
"LG36F", "LG36F",
"LG37M", "LG37M",
"LG42F", "LG42F",
"LG44F", "LG44F",
"LG45M", "LG45M",
"LG48F", "LG48F",
"LG49B", "LG49B",
"LG34", "LG34",
"LG35", "LG35",
"LG7_11", "LG7_11",
"groupI", "groupI",
"groupII", "groupII",
"groupIII", "groupIII",
"groupIV", "groupIV",
"groupV", "groupV",
"groupVI", "groupVI",
"groupVII", "groupVII",
"groupVIII", "groupVIII",
"groupIX", "groupIX",
"groupX", "groupX",
"groupXI", "groupXI",
"groupXII", "groupXII",
"groupXIII", "groupXIII",
"groupXIV", "groupXIV",
"groupXV", "groupXV",
"groupXVI", "groupXVI",
"groupXVII", "groupXVII",
"groupXVIII", "groupXVIII",
"groupXIX", "groupXIX",
"groupXX", "groupXX",
"groupXXI" "groupXXI"
) )
#' Get all chromosome names for an Ensembl dataset. #' Get all chromosome names for an Ensembl dataset.
@ -256,8 +256,8 @@ valid_chromosome_names <- c(
#' The function tries to filter out valid chromosome names from the available #' The function tries to filter out valid chromosome names from the available
#' assemblies in the dataset. #' assemblies in the dataset.
get_chromosome_names <- function(dataset) { get_chromosome_names <- function(dataset) {
chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name") chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name")
chromosome_names[chromosome_names %chin% valid_chromosome_names] chromosome_names[chromosome_names %chin% valid_chromosome_names]
} }
# Retrieve information on human genes. This will only include genes on # Retrieve information on human genes. This will only include genes on
@ -267,16 +267,16 @@ rlog::log_info("Retrieving information on human genes")
dataset <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl) dataset <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl)
human_data <- data.table(biomaRt::getBM( human_data <- data.table(biomaRt::getBM(
attributes = c( attributes = c(
"ensembl_gene_id", "ensembl_gene_id",
"hgnc_symbol", "hgnc_symbol",
"chromosome_name", "chromosome_name",
"start_position", "start_position",
"end_position" "end_position"
), ),
filters = "chromosome_name", filters = "chromosome_name",
values = get_chromosome_names(dataset), values = get_chromosome_names(dataset),
mart = dataset mart = dataset
)) ))
# Remove duplicated gene IDs (at the time of writing, there are a handful). # Remove duplicated gene IDs (at the time of writing, there are a handful).
@ -284,9 +284,9 @@ human_data <- unique(human_data, by = "ensembl_gene_id")
# Only keep relevant information on genes. # Only keep relevant information on genes.
genes <- human_data[, .( genes <- human_data[, .(
id = ensembl_gene_id, id = ensembl_gene_id,
name = hgnc_symbol, name = hgnc_symbol,
chromosome = chromosome_name chromosome = chromosome_name
)] )]
# Retrieve gene distance data across species. # Retrieve gene distance data across species.
@ -296,39 +296,39 @@ distances <- data.table()
#' Handle data for one species. #' Handle data for one species.
handle_species <- function(species_id, species_data) { handle_species <- function(species_id, species_data) {
chromosomes <- species_data[, chromosomes <- species_data[,
.(chromosome_length = max(end_position)), .(chromosome_length = max(end_position)),
by = chromosome_name by = chromosome_name
] ]
# Store the number of chromosomes in the species table. # Store the number of chromosomes in the species table.
species[id == species_id, n_chromosomes := nrow(chromosomes)] species[id == species_id, n_chromosomes := nrow(chromosomes)]
# Store the median chromosome length in the species table. # Store the median chromosome length in the species table.
species[ species[
id == species_id, id == species_id,
median_chromosome_length := chromosomes[, median(chromosome_length)] median_chromosome_length := chromosomes[, median(chromosome_length)]
] ]
# Precompute the genes' distance to the nearest telomere. # Precompute the genes' distance to the nearest telomere.
species_distances <- species_data[ species_distances <- species_data[
chromosomes, chromosomes,
.( .(
species = species_id, species = species_id,
gene = ensembl_gene_id, gene = ensembl_gene_id,
chromosome_name = chromosome_name, chromosome_name = chromosome_name,
start_position = start_position, start_position = start_position,
end_position = end_position, end_position = end_position,
distance = pmin( distance = pmin(
start_position, start_position,
chromosome_length - end_position chromosome_length - end_position
) )
), ),
on = "chromosome_name" on = "chromosome_name"
] ]
# Add species distances to the distances table. # Add species distances to the distances table.
distances <<- rbindlist(list(distances, species_distances)) distances <<- rbindlist(list(distances, species_distances))
} }
# Handle the human first, as we already retrieved the data and don't need to # Handle the human first, as we already retrieved the data and don't need to
@ -337,67 +337,67 @@ handle_species("hsapiens", human_data)
# Iterate through all other species and retrieve their distance data. # Iterate through all other species and retrieve their distance data.
for (species_id in species[id != "hsapiens", id]) { for (species_id in species[id != "hsapiens", id]) {
rlog::log_info(sprintf("Loading species \"%s\"", species_id)) rlog::log_info(sprintf("Loading species \"%s\"", species_id))
dataset <- biomaRt::useDataset( dataset <- biomaRt::useDataset(
sprintf("%s_gene_ensembl", species_id), sprintf("%s_gene_ensembl", species_id),
mart = ensembl mart = ensembl
) )
# Besides the attributes that are always present, we need to check for # Besides the attributes that are always present, we need to check for
# human orthologs. Some species don't have that information and will be # human orthologs. Some species don't have that information and will be
# skipped. # skipped.
if (!"hsapiens_homolog_ensembl_gene" %chin% if (!"hsapiens_homolog_ensembl_gene" %chin%
biomaRt::listAttributes(dataset, what = "name")) { biomaRt::listAttributes(dataset, what = "name")) {
rlog::log_info("No data on human orthologs") rlog::log_info("No data on human orthologs")
species <- species[id != species_id] species <- species[id != species_id]
next next
} }
chromosome_names <- get_chromosome_names(dataset) chromosome_names <- get_chromosome_names(dataset)
# Skip the species, if there are no assembled chromosomes. # Skip the species, if there are no assembled chromosomes.
if (length(chromosome_names) <= 0) { if (length(chromosome_names) <= 0) {
rlog::log_info("No matching chromosome assemblies") rlog::log_info("No matching chromosome assemblies")
species <- species[id != species_id] species <- species[id != species_id]
next next
} }
# Retrieve information on all genes of the current species, that have # Retrieve information on all genes of the current species, that have
# human orthologs. This is called "homolog" in the Ensembl schema. # human orthologs. This is called "homolog" in the Ensembl schema.
species_distances <- data.table(biomaRt::getBM( species_distances <- data.table(biomaRt::getBM(
attributes = c( attributes = c(
"hsapiens_homolog_ensembl_gene", "hsapiens_homolog_ensembl_gene",
"chromosome_name", "chromosome_name",
"start_position", "start_position",
"end_position" "end_position"
), ),
filters = c("with_hsapiens_homolog", "chromosome_name"), filters = c("with_hsapiens_homolog", "chromosome_name"),
values = list(TRUE, chromosome_names), values = list(TRUE, chromosome_names),
mart = dataset mart = dataset
)) ))
# Only include human genes that we have information on. # Only include human genes that we have information on.
species_distances <- species_distances[ species_distances <- species_distances[
hsapiens_homolog_ensembl_gene %chin% genes$id hsapiens_homolog_ensembl_gene %chin% genes$id
] ]
# Only include one ortholog per human gene. # Only include one ortholog per human gene.
species_distances <- unique( species_distances <- unique(
species_distances, species_distances,
by = "hsapiens_homolog_ensembl_gene" by = "hsapiens_homolog_ensembl_gene"
) )
# Rename gene ID column to match the human data. # Rename gene ID column to match the human data.
setnames( setnames(
species_distances, species_distances,
"hsapiens_homolog_ensembl_gene", "hsapiens_homolog_ensembl_gene",
"ensembl_gene_id" "ensembl_gene_id"
) )
handle_species(species_id, species_distances) handle_species(species_id, species_distances)
} }
# Save data in the appropriate place. # Save data in the appropriate place.