mirror of
https://github.com/johrpan/ubigen.git
synced 2025-10-26 19:57:24 +01:00
Use package's ranking function for included data
This commit is contained in:
parent
698ea5086a
commit
a6f0a64c2c
3 changed files with 18 additions and 16 deletions
10
R/ranking.R
10
R/ranking.R
|
|
@ -3,22 +3,26 @@
|
||||||
#' This function will compute a weighted average across multiple metrics that
|
#' This function will compute a weighted average across multiple metrics that
|
||||||
#' define how ubiquitous a gene is based on its expression across samples.
|
#' define how ubiquitous a gene is based on its expression across samples.
|
||||||
#'
|
#'
|
||||||
|
#' @param data The input data to use. This should either be the result of a
|
||||||
|
#' previous call to this function or the return value of [analyze()].
|
||||||
|
#'
|
||||||
#' @return A `data.table` with gene data as well as the scores, ranks and
|
#' @return A `data.table` with gene data as well as the scores, ranks and
|
||||||
#' percentiles for each gene.
|
#' percentiles for each gene.
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
rank_genes <- function(cross_sample_metric = "above_95",
|
rank_genes <- function(data = ubigen::genes,
|
||||||
|
cross_sample_metric = "above_95",
|
||||||
cross_sample_weight = 0.5,
|
cross_sample_weight = 0.5,
|
||||||
level_metric = "median_expression_normalized",
|
level_metric = "median_expression_normalized",
|
||||||
level_weight = 0.25,
|
level_weight = 0.25,
|
||||||
variation_metric = "qcv_expression_normalized",
|
variation_metric = "qcv_expression_normalized",
|
||||||
variation_weight = -0.25) {
|
variation_weight = -0.25) {
|
||||||
|
data <- copy(data)
|
||||||
|
|
||||||
total_weight <- abs(cross_sample_weight) +
|
total_weight <- abs(cross_sample_weight) +
|
||||||
abs(level_weight) +
|
abs(level_weight) +
|
||||||
abs(variation_weight)
|
abs(variation_weight)
|
||||||
|
|
||||||
data <- copy(ubigen::genes)
|
|
||||||
|
|
||||||
data[, score :=
|
data[, score :=
|
||||||
(cross_sample_weight * get(cross_sample_metric) +
|
(cross_sample_weight * get(cross_sample_metric) +
|
||||||
level_weight * get(level_metric) +
|
level_weight * get(level_metric) +
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@
|
||||||
\title{Rank genes based on how ubiquitous they are.}
|
\title{Rank genes based on how ubiquitous they are.}
|
||||||
\usage{
|
\usage{
|
||||||
rank_genes(
|
rank_genes(
|
||||||
|
data = ubigen::genes,
|
||||||
cross_sample_metric = "above_95",
|
cross_sample_metric = "above_95",
|
||||||
cross_sample_weight = 0.5,
|
cross_sample_weight = 0.5,
|
||||||
level_metric = "median_expression_normalized",
|
level_metric = "median_expression_normalized",
|
||||||
|
|
@ -13,6 +14,10 @@ rank_genes(
|
||||||
variation_weight = -0.25
|
variation_weight = -0.25
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
\arguments{
|
||||||
|
\item{data}{The input data to use. This should either be the result of a
|
||||||
|
previous call to this function or the return value of \code{\link[=analyze]{analyze()}}.}
|
||||||
|
}
|
||||||
\value{
|
\value{
|
||||||
A \code{data.table} with gene data as well as the scores, ranks and
|
A \code{data.table} with gene data as well as the scores, ranks and
|
||||||
percentiles for each gene.
|
percentiles for each gene.
|
||||||
|
|
|
||||||
|
|
@ -6,22 +6,14 @@ library(here)
|
||||||
|
|
||||||
i_am("scripts/input.R")
|
i_am("scripts/input.R")
|
||||||
|
|
||||||
|
# To save memory, the data includes fake IDs for genes. The actual Ensembl IDs
|
||||||
|
# are part of the separate genes table.
|
||||||
|
|
||||||
genes <- fread(here("scripts", "input", "genes.csv"))
|
genes <- fread(here("scripts", "input", "genes.csv"))
|
||||||
data <- fread(here("scripts", "output", "results.csv"))
|
data <- fread(here("scripts", "output", "results.csv"))
|
||||||
|
|
||||||
data[, score := 0.5 * above_95 +
|
# Rank the data using default parameters.
|
||||||
0.25 * median_expression_normalized +
|
data <- ubigen::rank_genes(data = data)
|
||||||
-0.25 * qcv_expression_normalized]
|
|
||||||
|
|
||||||
# Normalize scores to be between 0.0 and 1.0.
|
|
||||||
data[, score := (score - min(score, na.rm = TRUE)) /
|
|
||||||
(max(score, na.rm = TRUE) - min(score, na.rm = TRUE))]
|
|
||||||
|
|
||||||
# These are genes that are not expressed at all or expressed just once, in case
|
|
||||||
# the standard deviation is used in the score.
|
|
||||||
data[is.na(score), score := 0.0]
|
|
||||||
|
|
||||||
setorder(data, -score)
|
|
||||||
|
|
||||||
# Reintroduce gene IDs and HGNC symbols.
|
# Reintroduce gene IDs and HGNC symbols.
|
||||||
|
|
||||||
|
|
@ -41,6 +33,7 @@ data[, id := NULL]
|
||||||
# Remove duplicates. This will keep the best row for each duplicated gene.
|
# Remove duplicates. This will keep the best row for each duplicated gene.
|
||||||
data <- unique(data, by = "gene")
|
data <- unique(data, by = "gene")
|
||||||
|
|
||||||
|
# Reassign ranks, because duplicates may have been removed.
|
||||||
data[, rank := .I]
|
data[, rank := .I]
|
||||||
|
|
||||||
fwrite(data, file = here("scripts", "output", "genes.csv"))
|
fwrite(data, file = here("scripts", "output", "genes.csv"))
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue