From 07fea8d6a2a15f83436caa9365feb9d4704a191e Mon Sep 17 00:00:00 2001 From: Elias Projahn Date: Wed, 22 Jun 2022 19:10:28 +0200 Subject: [PATCH] scripts: Add genes script --- scripts/genes.R | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 scripts/genes.R diff --git a/scripts/genes.R b/scripts/genes.R new file mode 100644 index 0000000..ec41d7f --- /dev/null +++ b/scripts/genes.R @@ -0,0 +1,43 @@ +# This script uses the results (See results.csv) and computes a score for each +# gene. This is the data that will be used in the package. + +library(data.table) +library(here) + +i_am("scripts/input.R") + +data <- fread(here("scripts", "output", "results.csv")) + +data[, `:=`( + gene = stringr::str_split(gene, "\\.") |> purrr::map_chr(1), + mean_expression_normalized = mean_expression / max(mean_expression), + sd_expression_normalized = sd_expression / max(sd_expression) +)] + +data[, score := 0.5 * above_95 + + 0.25 * mean_expression_normalized + + -0.25 * sd_expression_normalized] + +# Normalize scores to be between 0.0 and 1.0. +data[, score := (score - min(score, na.rm = TRUE)) / + (max(score, na.rm = TRUE) - min(score, na.rm = TRUE))] + +# These are genes that are not expressed at all. +data[is.na(score), score := 0.0] + +setorder(data, -score) + +# Remove duplicates. This will keep the best row for each duplicated gene. +data <- unique(data, by = "gene") + +data[, `:=`( + hgnc_name = gprofiler2::gconvert( + gene, + target = "HGNC", + mthreshold = 1, + filter_na = FALSE + )$target, + rank = .I +)] + +fwrite(data, file = here("scripts", "output", "genes.csv"))