scripts: Add genes script

2025-10-27 04:07:25 +01:00 · 2022-06-22 19:10:28 +02:00 · 2022-06-22 19:10:28 +02:00 · 07fea8d6a2
commit 07fea8d6a2
parent f341031753
1 changed files with 43 additions and 0 deletions
--- a/scripts/genes.R
+++ b/scripts/genes.R
@ -0,0 +1,43 @@
 # This script uses the results (See results.csv) and computes a score for each
 # gene. This is the data that will be used in the package.
 library(data.table)
 library(here)
 i_am("scripts/input.R")
 data <- fread(here("scripts", "output", "results.csv"))
 data[, `:=`(
  gene = stringr::str_split(gene, "\\.") |> purrr::map_chr(1),
  mean_expression_normalized = mean_expression / max(mean_expression),
  sd_expression_normalized = sd_expression / max(sd_expression)
 )]
 data[, score := 0.5 * above_95 +
  0.25 * mean_expression_normalized +
  -0.25 * sd_expression_normalized]
 # Normalize scores to be between 0.0 and 1.0.
 data[, score := (score - min(score, na.rm = TRUE)) /
  (max(score, na.rm = TRUE) - min(score, na.rm = TRUE))]
 # These are genes that are not expressed at all.
 data[is.na(score), score := 0.0]
 setorder(data, -score)
 # Remove duplicates. This will keep the best row for each duplicated gene.
 data <- unique(data, by = "gene")
 data[, `:=`(
  hgnc_name = gprofiler2::gconvert(
    gene,
    target = "HGNC",
    mthreshold = 1,
    filter_na = FALSE
  )$target,
  rank = .I
 )]
 fwrite(data, file = here("scripts", "output", "genes.csv"))