Allow selecting the expression dataset

2025-10-26 19:57:24 +01:00 · 2022-12-01 21:23:46 +01:00 · 2022-12-01 21:23:46 +01:00 · 510fafeb6e
commit 510fafeb6e
parent 2f24812c90
15 changed files with 110 additions and 57 deletions
--- a/R/data.R
+++ b/R/data.R
@ -1,2 +1,9 @@
-#' A `data.table` containig data on genes and their expression behavior.
+#' A `data.table` containing mappings of Ensembl gene ID to HGNC symbol.
 "genes"
 #' The results from the analysis across all GTEx samples.
 "gtex_all"
 #' The results from the analysis across Human Protein Atlas' tissue aggregated
 #' data.
 "hpa_tissues"
--- a/R/ranking.R
+++ b/R/ranking.R
@ -10,7 +10,7 @@
 #'   percentiles for each gene.
 #'
 #' @export
-rank_genes <- function(data = ubigen::genes,
+rank_genes <- function(data = ubigen::gtex_all,
                       cross_sample_metric = "above_95",
                       cross_sample_weight = 0.5,
                       level_metric = "median_expression_normalized",
--- a/R/server.R
+++ b/R/server.R
@ -1,8 +1,19 @@
 #' Server implementing the main user interface.
 #' @noRd
 server <- function(input, output, session) {
  dataset <- reactive({
    analysis <- if (input$dataset == "hpa_tissues") {
      ubigen::hpa_tissues
    } else {
      ubigen::gtex_all
    }
    merge(analysis, ubigen::genes, by = "gene")
  })
  ranked_data <- reactive({
    rank_genes(
      data = dataset(),
      cross_sample_metric = input$cross_sample_metric,
      cross_sample_weight = input$cross_sample_weight,
      level_metric = input$level_metric,
--- a/R/ui.R
+++ b/R/ui.R
@ -19,6 +19,14 @@ ui <- function() {
            h3("Your genes"),
            gene_selector_ui("custom_genes"),
            h3("Method"),
            selectInput(
              "dataset",
              label = strong("Expression dataset"),
              list(
                "GTEx (all samples)" = "gtex_all",
                "Human Protein Atlas (tissues)" = "hpa_tissues"
              )
            ),
            selectInput(
              "cross_sample_metric",
              verticalLayout(
--- a/data/genes.rda
+++ b/data/genes.rda
--- a/data/gtex_all.rda
+++ b/data/gtex_all.rda
--- a/data/hpa_tissues.rda
+++ b/data/hpa_tissues.rda
--- a/man/genes.Rd
+++ b/man/genes.Rd
@ -3,14 +3,14 @@
 \docType{data}
 \name{genes}
 \alias{genes}
-\title{A \code{data.table} containig data on genes and their expression behavior.}
+\title{A \code{data.table} containing mappings of Ensembl gene ID to HGNC symbol.}
 \format{
-An object of class \code{data.table} (inherits from \code{data.frame}) with 55507 rows and 20 columns.
+An object of class \code{data.table} (inherits from \code{data.frame}) with 55507 rows and 2 columns.
 }
 \usage{
 genes
 }
 \description{
-A \code{data.table} containig data on genes and their expression behavior.
+A \code{data.table} containing mappings of Ensembl gene ID to HGNC symbol.
 }
 \keyword{datasets}
--- a/man/gtex_all.Rd
+++ b/man/gtex_all.Rd
@ -0,0 +1,16 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/data.R
 \docType{data}
 \name{gtex_all}
 \alias{gtex_all}
 \title{The results from the analysis across all GTEx samples.}
 \format{
 An object of class \code{data.table} (inherits from \code{data.frame}) with 55507 rows and 17 columns.
 }
 \usage{
 gtex_all
 }
 \description{
 The results from the analysis across all GTEx samples.
 }
 \keyword{datasets}
--- a/man/hpa_tissues.Rd
+++ b/man/hpa_tissues.Rd
@ -0,0 +1,18 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/data.R
 \docType{data}
 \name{hpa_tissues}
 \alias{hpa_tissues}
 \title{The results from the analysis across Human Protein Atlas' tissue aggregated
 data.}
 \format{
 An object of class \code{data.table} (inherits from \code{data.frame}) with 20090 rows and 17 columns.
 }
 \usage{
 hpa_tissues
 }
 \description{
 The results from the analysis across Human Protein Atlas' tissue aggregated
 data.
 }
 \keyword{datasets}
--- a/man/rank_genes.Rd
+++ b/man/rank_genes.Rd
@ -5,7 +5,7 @@
 \title{Rank genes based on how ubiquitous they are.}
 \usage{
 rank_genes(
-  data = ubigen::genes,
+  data = ubigen::gtex_all,
  cross_sample_metric = "above_95",
  cross_sample_weight = 0.5,
  level_metric = "median_expression_normalized",
--- a/scripts/analyze.R
+++ b/scripts/analyze.R
@ -1,12 +0,0 @@
 # This scripts reads the input data (See input.R) and performs various
 # computations on it in order to later use the results for computating scores
 # for ubuiquitously expressed genes.
 library(data.table)
 library(here)
 i_am("scripts/input.R")
 data <- fread(here("scripts", "input", "data_long.csv"))
 results <- ubigen::analyze(data)
 fwrite(results, file = here("scripts", "output", "results.csv"))
--- a/scripts/gtex_all.R
+++ b/scripts/gtex_all.R
@ -0,0 +1,32 @@
 # This script runs the analysis on the long-format GTEx input data
 # (scripts/input/data_long.csv) and stores the result as the package dataset
 # `gtex_all`. It also stores the accompanying `genes` table as package data.
 library(data.table)
 library(here)
 # Declare the location of this script relative to the project root.
 i_am("scripts/gtex_all.R")
 data <- fread(here("scripts", "input", "data_long.csv"))
 gtex_all <- ubigen::analyze(data)
 # To save memory, the data includes fake IDs for genes. The actual Ensembl IDs
 # are part of the separate genes table.
 genes <- fread(here("scripts", "input", "genes.csv"))
 # Replace the fake IDs with the actual gene IDs. The merged table has to be
 # assigned back to `gtex_all`, because that is the object that gets saved
 # below. Saving the unmerged table would ship the fake `id` column instead of
 # the `gene` column.
 setnames(gtex_all, "gene", "id")
 gtex_all <- merge(
   gtex_all,
   genes[, .(id, gene)],
   by = "id",
   all.x = TRUE,
   sort = FALSE
 )
 gtex_all[, id := NULL]
 usethis::use_data(gtex_all, overwrite = TRUE)
 # The fake IDs were only needed for the merge above; drop them before saving.
 genes[, id := NULL]
 usethis::use_data(genes, overwrite = TRUE)
--- a/scripts/hpa.R
+++ b/scripts/hpa.R
@ -0,0 +1,12 @@
 # This script runs the analysis on the Human Protein Atlas tissue expression
 # data and stores the result as the package dataset `hpa_tissues`.
 library(data.table)
 library(here)
 i_am("scripts/hpa.R")
 # Source: https://www.proteinatlas.org/download/rna_tissue_hpa.tsv.zip
 hpa_input <- fread(here("scripts", "input", "rna_tissue_hpa.tsv"))
 # Remove the columns `Gene name`, `TPM` and `pTPM` by reference.
 hpa_input[, c("Gene name", "TPM", "pTPM") := NULL]
 # Rename the gene, tissue and nTPM columns before running the analysis.
 setnames(
   hpa_input,
   old = c("Gene", "Tissue", "nTPM"),
   new = c("gene", "sample", "expression")
 )
 hpa_tissues <- ubigen::analyze(hpa_input)
 usethis::use_data(hpa_tissues, overwrite = TRUE)
--- a/scripts/ranking.R
+++ b/scripts/ranking.R
@ -1,39 +0,0 @@
 # This script uses the results (See results.csv) and computes a score for each
 # gene. This is the data that will be used in the package.
 library(data.table)
 library(here)
 i_am("scripts/input.R")
 # To save memory, the data includes fake IDs for genes. The actual Ensembl IDs
 # are part of the separate genes table.
 genes <- fread(here("scripts", "input", "genes.csv"))
 data <- fread(here("scripts", "output", "results.csv"))
 # Rank the data using default parameters.
 data <- ubigen::rank_genes(data = data)
 # Reintroduce gene IDs and HGNC symbols.
 setnames(data, "gene", "id")
 data <- merge(
  data,
  genes,
  by = "id",
  all.x = TRUE,
  sort = FALSE
 )
 setnames(data, "hgnc_symbol", "hgnc_name")
 data[, id := NULL]
 # Remove duplicates. This will keep the best row for each duplicated gene.
 data <- unique(data, by = "gene")
 # Reassign ranks, because duplicates may have been removed.
 data[, rank := .I]
 fwrite(data, file = here("scripts", "output", "genes.csv"))