2022-06-22 19:10:28 +02:00
|
|
|
# This script uses the results (see results.csv) and computes a score for each
# gene. This is the data that will be used in the package.
|
|
|
|
|
|
|
|
|
|
library(data.table)
library(here)

i_am("scripts/input.R")

# Load inputs ----

# The results refer to genes by compact surrogate IDs to save memory. The
# separate genes table maps those IDs back to the actual Ensembl IDs.
genes <- fread(here("scripts", "input", "genes.csv"))
data <- fread(here("scripts", "output", "results.csv"))

# Rank genes ----

# Compute the ranking with the default parameters of `ubigen::rank_genes()`.
data <- ubigen::rank_genes(data = data)

# Restore gene identifiers ----

# The results keep the surrogate ID in their `gene` column. Rename it to `id`
# so that it can act as the join key. The genes table is expected to provide
# the `gene` and `hgnc_symbol` columns that are used below.
setnames(data, old = "gene", new = "id")

# Left join without sorting by the key, so the join itself does not reorder
# the rows of the ranked results.
data <- merge(x = data, y = genes, by = "id", all.x = TRUE, sort = FALSE)

# Use the column name expected downstream and drop the surrogate ID, which has
# served its purpose as the join key.
setnames(data, old = "hgnc_symbol", new = "hgnc_name")
set(data, j = "id", value = NULL)

# Drop duplicated genes ----

# Only the first occurrence of each gene is kept. Presumably the table is still
# in rank order at this point, which makes the first occurrence the best row.
# NOTE(review): because of `all.x = TRUE`, IDs without a match in the genes
# table have a missing `gene`; those rows are treated as duplicates of each
# other here, so at most one of them survives. Confirm that this is intended.
data <- data[!duplicated(gene)]

# Renumber ranks ----

# Removing rows may have left gaps, so ranks are reassigned from the current
# row order.
# NOTE(review): if `rank_genes()` also emits columns derived from the rank,
# they are not recomputed here. Verify against the package.
data[, rank := seq_len(.N)]

# Write output ----

fwrite(x = data, file = here("scripts", "output", "genes.csv"))
|