From 07fea8d6a2a15f83436caa9365feb9d4704a191e Mon Sep 17 00:00:00 2001
From: Elias Projahn <elias@johrpan.de>
Date: Wed, 22 Jun 2022 19:10:28 +0200
Subject: [PATCH] scripts: Add genes script

---
 scripts/genes.R | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 scripts/genes.R

diff --git a/scripts/genes.R b/scripts/genes.R
new file mode 100644
index 0000000..ec41d7f
--- /dev/null
+++ b/scripts/genes.R
@@ -0,0 +1,43 @@
+# This script uses the results (See results.csv) and computes a score for each
+# gene. This is the data that will be used in the package.
+
+library(data.table)
+library(here)
+
+i_am("scripts/input.R")
+
+data <- fread(here("scripts", "output", "results.csv"))
+
+data[, `:=`(
+  gene = stringr::str_split(gene, "\\.") |> purrr::map_chr(1),
+  mean_expression_normalized = mean_expression / max(mean_expression),
+  sd_expression_normalized = sd_expression / max(sd_expression)
+)]
+
+data[, score := 0.5 * above_95 +
+  0.25 * mean_expression_normalized +
+  -0.25 * sd_expression_normalized]
+
+# Normalize scores to be between 0.0 and 1.0.
+data[, score := (score - min(score, na.rm = TRUE)) /
+  (max(score, na.rm = TRUE) - min(score, na.rm = TRUE))]
+
+# These are genes that are not expressed at all.
+data[is.na(score), score := 0.0]
+
+setorder(data, -score)
+
+# Remove duplicates. This will keep the best row for each duplicated gene.
+data <- unique(data, by = "gene")
+
+data[, `:=`(
+  hgnc_name = gprofiler2::gconvert(
+    gene,
+    target = "HGNC",
+    mthreshold = 1,
+    filter_na = FALSE
+  )$target,
+  rank = .I
+)]
+
+fwrite(data, file = here("scripts", "output", "genes.csv"))