ubigen/scripts/ranking.R

# This script uses the results (See results.csv) and computes a score for each
# gene. This is the data that will be used in the package.

library(data.table)
library(here)

i_am("scripts/input.R")

data <- fread(here("scripts", "output", "results.csv"))

# Keep only the actual Ensembl ID for each gene.
data[, gene := stringr::str_split(gene, "\\.") |> purrr::map_chr(1)]

data[, score := 0.5 * above_95 +
  0.25 * mean_expression_normalized +
  -0.25 * sd_expression_normalized]

# Normalize scores to be between 0.0 and 1.0.
data[, score := (score - min(score, na.rm = TRUE)) /
  (max(score, na.rm = TRUE) - min(score, na.rm = TRUE))]

# These are genes that are not expressed at all or expressed just once, in case
# the standard deviation is used in the score.
data[is.na(score), score := 0.0]

setorder(data, -score)

# Remove duplicates. This will keep the best row for each duplicated gene.
data <- unique(data, by = "gene")

data[, `:=`(
  hgnc_name = gprofiler2::gconvert(
    gene,
    target = "HGNC",
    mthreshold = 1,
    filter_na = FALSE
  )$target,
  rank = .I
)]

fwrite(data, file = here("scripts", "output", "genes.csv"))
scripts: Add genes script 2022-06-22 19:10:28 +02:00			`# This script uses the results (See results.csv) and computes a score for each`
			`# gene. This is the data that will be used in the package.`

			`library(data.table)`
			`library(here)`

			`i_am("scripts/input.R")`

			`data <- fread(here("scripts", "output", "results.csv"))`

Use logarithmic normalization This commit also changes the way the standard deviation and mean expression are computed in general. Now, samples where the gene is not expressed at all are excluded before the computation. 2022-07-02 17:52:05 +02:00			`# Keep only the actual Ensembl ID for each gene.`
			`data[, gene := stringr::str_split(gene, "\\.") \|> purrr::map_chr(1)]`
scripts: Add genes script 2022-06-22 19:10:28 +02:00
			`data[, score := 0.5 * above_95 +`
			`0.25 * mean_expression_normalized +`
			`-0.25 * sd_expression_normalized]`

			`# Normalize scores to be between 0.0 and 1.0.`
			`data[, score := (score - min(score, na.rm = TRUE)) /`
			`(max(score, na.rm = TRUE) - min(score, na.rm = TRUE))]`

Use logarithmic normalization This commit also changes the way the standard deviation and mean expression are computed in general. Now, samples where the gene is not expressed at all are excluded before the computation. 2022-07-02 17:52:05 +02:00			`# These are genes that are not expressed at all or expressed just once, in case`
			`# the standard deviation is used in the score.`
scripts: Add genes script 2022-06-22 19:10:28 +02:00			`data[is.na(score), score := 0.0]`

			`setorder(data, -score)`

			`# Remove duplicates. This will keep the best row for each duplicated gene.`
			`data <- unique(data, by = "gene")`

			data[, `:=`(
			`hgnc_name = gprofiler2::gconvert(`
			`gene,`
			`target = "HGNC",`
			`mthreshold = 1,`
			`filter_na = FALSE`
			`)$target,`
			`rank = .I`
			`)]`

			`fwrite(data, file = here("scripts", "output", "genes.csv"))`