Allow selecting the expression dataset

2025-10-26 19:57:24 +01:00 · 2022-12-01 21:23:46 +01:00 · 2022-12-01 21:23:46 +01:00 · 510fafeb6e
commit 510fafeb6e
parent 2f24812c90
15 changed files with 110 additions and 57 deletions
--- a/scripts/analyze.R
+++ b/scripts/analyze.R
@ -1,12 +0,0 @@
-# This script reads the input data (See input.R) and performs various
-# computations on it in order to later use the results for computing scores
-# for ubiquitously expressed genes.
-
-library(data.table)
-library(here)
-
-i_am("scripts/input.R")
-
-data <- fread(here("scripts", "input", "data_long.csv"))
-results <- ubigen::analyze(data)
-fwrite(results, file = here("scripts", "output", "results.csv"))
--- a/scripts/gtex_all.R
+++ b/scripts/gtex_all.R
@ -0,0 +1,32 @@
+# This script analyzes the input data (see input.R) and stores the results as
+# the `gtex_all` dataset, together with the `genes` table, in the package.
+
+library(data.table)
+library(here)
+
+i_am("scripts/gtex_all.R")
+
+data <- fread(here("scripts", "input", "data_long.csv"))
+gtex_all <- ubigen::analyze(data)
+
+# To save memory, the data includes fake IDs for genes. The actual Ensembl IDs
+# are part of the separate genes table.
+
+genes <- fread(here("scripts", "input", "genes.csv"))
+
+setnames(gtex_all, "gene", "id")
+# Merge back into `gtex_all` itself: this is the object that gets saved below.
+gtex_all <- merge(
+  gtex_all,
+  genes[, .(id, gene)],
+  by = "id",
+  all.x = TRUE,
+  sort = FALSE
+)
+
+gtex_all[, id := NULL]
+
+usethis::use_data(gtex_all, overwrite = TRUE)
+
+genes[, id := NULL]
+usethis::use_data(genes, overwrite = TRUE)
--- a/scripts/hpa.R
+++ b/scripts/hpa.R
@ -0,0 +1,12 @@
+# Builds the packaged `hpa_tissues` dataset from HPA tissue RNA expression.
+library(data.table)
+library(here)
+i_am("scripts/hpa.R")
+
+# Source: https://www.proteinatlas.org/download/rna_tissue_hpa.tsv.zip
+hpa <- fread(here("scripts", "input", "rna_tissue_hpa.tsv"))
+# Drop the unused columns, then rename the rest to gene/sample/expression.
+hpa[, c("Gene name", "TPM", "pTPM") := NULL]
+setnames(hpa, c("Gene", "Tissue", "nTPM"), c("gene", "sample", "expression"))
+hpa_tissues <- ubigen::analyze(hpa)
+usethis::use_data(hpa_tissues, overwrite = TRUE)
--- a/scripts/ranking.R
+++ b/scripts/ranking.R
@ -1,39 +0,0 @@
-# This script uses the results (See results.csv) and computes a score for each
-# gene. This is the data that will be used in the package.
-
-library(data.table)
-library(here)
-
-i_am("scripts/input.R")
-
-# To save memory, the data includes fake IDs for genes. The actual Ensembl IDs
-# are part of the separate genes table.
-
-genes <- fread(here("scripts", "input", "genes.csv"))
-data <- fread(here("scripts", "output", "results.csv"))
-
-# Rank the data using default parameters.
-data <- ubigen::rank_genes(data = data)
-
-# Reintroduce gene IDs and HGNC symbols.
-
-setnames(data, "gene", "id")
-
-data <- merge(
-  data,
-  genes,
-  by = "id",
-  all.x = TRUE,
-  sort = FALSE
-)
-
-setnames(data, "hgnc_symbol", "hgnc_name")
-data[, id := NULL]
-
-# Remove duplicates. This will keep the best row for each duplicated gene.
-data <- unique(data, by = "gene")
-
-# Reassign ranks, because duplicates may have been removed.
-data[, rank := .I]
-
-fwrite(data, file = here("scripts", "output", "genes.csv"))