Reduce memory footprint during analysis

This commit is contained in:
Elias Projahn 2022-09-25 19:01:59 +02:00
parent f59a71b16c
commit 2eec3285f9
4 changed files with 33 additions and 21 deletions

View file

@ -7,7 +7,7 @@ library(here)
i_am("scripts/input.R")
data <- fread(here("scripts", "input", "data_long.csv.gz"))
data <- fread(here("scripts", "input", "data_long.csv"))
data[, `:=`(
expression_median = median(expression),

View file

@ -70,21 +70,24 @@ getpm <- DGEList(counts = read_counts) |>
data_wide_samples <- data.table(getpm, keep.rownames = "gene")
data_wide_samples[, hgnc_symbol := hgnc_symbols]
# Create lookup tables for genes and samples.
genes <- data_wide_samples[, .(id = .I, gene, hgnc_symbol)]
fwrite(genes, file = here("scripts", "input", "genes.csv"))
sample_names <- colnames(data_wide_samples[, !c("gene", "hgnc_symbol")])
samples <- data.table(id = seq_along(sample_names), sample = sample_names)
fwrite(samples, file = here("scripts", "input", "samples.csv"))
data_wide_samples[, `:=`(gene = .I, hgnc_symbol = NULL)]
colnames(data_wide_samples) <- c("gene", seq_along(sample_names))
data_long <- melt(
data_wide_samples,
id.vars = c("gene", "hgnc_symbol"),
id.vars = "gene",
variable.name = "sample",
value.name = "expression",
variable.factor = FALSE
)
fwrite(
data_wide_samples,
file = here(
"scripts",
"input",
"data_wide_samples.csv.gz"
)
)
fwrite(data_long, file = here("scripts", "input", "data_long.csv.gz"))
fwrite(data_long, file = here("scripts", "input", "data_long.csv"))

View file

@ -6,6 +6,7 @@ library(here)
i_am("scripts/input.R")
genes <- fread(here("scripts", "input", "genes.csv"))
data <- fread(here("scripts", "output", "results.csv"))
data[, score := 0.5 * above_95 +
@ -22,17 +23,24 @@ data[is.na(score), score := 0.0]
setorder(data, -score)
# Reintroduce gene IDs and HGNC symbols.
setnames(data, "gene", "id")
data <- merge(
data,
genes,
by = "id",
all.x = TRUE,
sort = FALSE
)
setnames(data, "hgnc_symbol", "hgnc_name")
data[, id := NULL]
# Remove duplicates. This will keep the best row for each duplicated gene.
data <- unique(data, by = "gene")
data[, `:=`(
hgnc_name = gprofiler2::gconvert(
gene,
target = "HGNC",
mthreshold = 1,
filter_na = FALSE
)$target,
rank = .I
)]
data[, rank := .I]
fwrite(data, file = here("scripts", "output", "genes.csv"))