diff --git a/DESCRIPTION b/DESCRIPTION index 5073b9f..21a9a1a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -32,5 +32,6 @@ Imports: Suggests: biomaRt, edgeR, + here, purrr, stringr diff --git a/scripts/analyze.R b/scripts/analyze.R index ab99ebf..4216ca6 100644 --- a/scripts/analyze.R +++ b/scripts/analyze.R @@ -7,7 +7,7 @@ library(here) i_am("scripts/input.R") -data <- fread(here("scripts", "input", "data_long.csv.gz")) +data <- fread(here("scripts", "input", "data_long.csv")) data[, `:=`( expression_median = median(expression), diff --git a/scripts/input.R b/scripts/input.R index 40f6de6..94181d6 100644 --- a/scripts/input.R +++ b/scripts/input.R @@ -70,21 +70,24 @@ getpm <- DGEList(counts = read_counts) |> data_wide_samples <- data.table(getpm, keep.rownames = "gene") data_wide_samples[, hgnc_symbol := hgnc_symbols] +# Create lookup tables for genes and samples. + +genes <- data_wide_samples[, .(id = .I, gene, hgnc_symbol)] +fwrite(genes, file = here("scripts", "input", "genes.csv")) + +sample_names <- colnames(data_wide_samples[, !c("gene", "hgnc_symbol")]) +samples <- data.table(id = seq_along(sample_names), sample = sample_names) +fwrite(samples, file = here("scripts", "input", "samples.csv")) + +data_wide_samples[, `:=`(gene = .I, hgnc_symbol = NULL)] +colnames(data_wide_samples) <- c("gene", seq_along(sample_names)) + data_long <- melt( data_wide_samples, - id.vars = c("gene", "hgnc_symbol"), + id.vars = "gene", variable.name = "sample", value.name = "expression", variable.factor = FALSE ) -fwrite( - data_wide_samples, - file = here( - "scripts", - "input", - "data_wide_samples.csv.gz" - ) -) - -fwrite(data_long, file = here("scripts", "input", "data_long.csv.gz")) +fwrite(data_long, file = here("scripts", "input", "data_long.csv")) diff --git a/scripts/ranking.R b/scripts/ranking.R index bf8620d..e005d44 100644 --- a/scripts/ranking.R +++ b/scripts/ranking.R @@ -6,6 +6,7 @@ library(here) i_am("scripts/input.R") +genes <- fread(here("scripts", "input", "genes.csv")) data <- fread(here("scripts", "output", "results.csv")) data[, score := 0.5 * above_95 + @@ -22,17 +23,24 @@ data[is.na(score), score := 0.0] setorder(data, -score) +# Reintroduce gene IDs and HGNC symbols. + +setnames(data, "gene", "id") + +data <- merge( + data, + genes, + by = "id", + all.x = TRUE, + sort = FALSE +) + +setnames(data, "hgnc_symbol", "hgnc_name") +data[, id := NULL] + # Remove duplicates. This will keep the best row for each duplicated gene. data <- unique(data, by = "gene") -data[, `:=`( - hgnc_name = gprofiler2::gconvert( - gene, - target = "HGNC", - mthreshold = 1, - filter_na = FALSE - )$target, - rank = .I -)] +data[, rank := .I] fwrite(data, file = here("scripts", "output", "genes.csv"))