mirror of
https://github.com/johrpan/ubigen.git
synced 2025-10-27 04:07:25 +01:00
Use logarithmic normalization
This commit also changes the way the standard deviation and mean expression are computed in general. Now, samples where the gene is not expressed at all are excluded before the computation.
This commit is contained in:
parent
af6b5561e4
commit
cc17c13888
5 changed files with 43 additions and 12 deletions
|
|
@ -1,43 +0,0 @@
|
|||
# This script uses the results (See results.csv) and computes a score for each
|
||||
# gene. This is the data that will be used in the package.
|
||||
|
||||
library(data.table)
|
||||
library(here)
|
||||
|
||||
i_am("scripts/input.R")
|
||||
|
||||
data <- fread(here("scripts", "output", "results.csv"))
|
||||
|
||||
data[, `:=`(
|
||||
gene = stringr::str_split(gene, "\\.") |> purrr::map_chr(1),
|
||||
mean_expression_normalized = mean_expression / max(mean_expression),
|
||||
sd_expression_normalized = sd_expression / max(sd_expression)
|
||||
)]
|
||||
|
||||
data[, score := 0.5 * above_95 +
|
||||
0.25 * mean_expression_normalized +
|
||||
-0.25 * sd_expression_normalized]
|
||||
|
||||
# Normalize scores to be between 0.0 and 1.0.
|
||||
data[, score := (score - min(score, na.rm = TRUE)) /
|
||||
(max(score, na.rm = TRUE) - min(score, na.rm = TRUE))]
|
||||
|
||||
# These are genes that are not expressed at all.
|
||||
data[is.na(score), score := 0.0]
|
||||
|
||||
setorder(data, -score)
|
||||
|
||||
# Remove duplicates. This will keep the best row for each duplicated gene.
|
||||
data <- unique(data, by = "gene")
|
||||
|
||||
data[, `:=`(
|
||||
hgnc_name = gprofiler2::gconvert(
|
||||
gene,
|
||||
target = "HGNC",
|
||||
mthreshold = 1,
|
||||
filter_na = FALSE
|
||||
)$target,
|
||||
rank = .I
|
||||
)]
|
||||
|
||||
fwrite(data, file = here("scripts", "output", "genes.csv"))
|
||||
Loading…
Add table
Add a link
Reference in a new issue