diff --git a/R/data.R b/R/data.R
index 88364ce..807fe75 100644
--- a/R/data.R
+++ b/R/data.R
@@ -1,2 +1,9 @@
-#' A `data.table` containig data on genes and their expression behavior.
+#' A `data.table` containing mappings of Ensembl gene ID to HGNC symbol.
 "genes"
+
+#' The results from the analysis across all GTEx samples.
+"gtex_all"
+
+#' The results from the analysis across Human Protein Atlas' tissue aggregated
+#' data.
+"hpa_tissues"
diff --git a/R/ranking.R b/R/ranking.R
index 2f71468..fd82c2d 100644
--- a/R/ranking.R
+++ b/R/ranking.R
@@ -10,7 +10,7 @@
 #' percentiles for each gene.
 #'
 #' @export
-rank_genes <- function(data = ubigen::genes,
+rank_genes <- function(data = ubigen::gtex_all,
                        cross_sample_metric = "above_95",
                        cross_sample_weight = 0.5,
                        level_metric = "median_expression_normalized",
diff --git a/R/server.R b/R/server.R
index 2ff93fb..04c703b 100644
--- a/R/server.R
+++ b/R/server.R
@@ -1,8 +1,19 @@
 #' Server implementing the main user interface.
 #' @noRd
 server <- function(input, output, session) {
+  dataset <- reactive({
+    analysis <- if (input$dataset == "hpa_tissues") {
+      ubigen::hpa_tissues
+    } else {
+      ubigen::gtex_all
+    }
+
+    merge(analysis, ubigen::genes, by = "gene")
+  })
+
   ranked_data <- reactive({
     rank_genes(
+      data = dataset(),
       cross_sample_metric = input$cross_sample_metric,
       cross_sample_weight = input$cross_sample_weight,
       level_metric = input$level_metric,
diff --git a/R/ui.R b/R/ui.R
index f352e88..ba781d7 100644
--- a/R/ui.R
+++ b/R/ui.R
@@ -19,6 +19,14 @@ ui <- function() {
         h3("Your genes"),
         gene_selector_ui("custom_genes"),
         h3("Method"),
+        selectInput(
+          "dataset",
+          label = strong("Expression dataset"),
+          list(
+            "GTEx (all samples)" = "gtex_all",
+            "Human Protein Atlas (tissues)" = "hpa_tissues"
+          )
+        ),
         selectInput(
           "cross_sample_metric",
           verticalLayout(
diff --git a/data/genes.rda b/data/genes.rda
index 990367b..9358734 100644
Binary files a/data/genes.rda and b/data/genes.rda differ
diff --git a/data/gtex_all.rda b/data/gtex_all.rda
new file mode 100644
index 0000000..058068b
Binary files /dev/null and b/data/gtex_all.rda differ
diff --git a/data/hpa_tissues.rda b/data/hpa_tissues.rda
new file mode 100644
index 0000000..70efb15
Binary files /dev/null and b/data/hpa_tissues.rda differ
diff --git a/man/genes.Rd b/man/genes.Rd
index 0c06c32..abc9e61 100644
--- a/man/genes.Rd
+++ b/man/genes.Rd
@@ -3,14 +3,14 @@
 \docType{data}
 \name{genes}
 \alias{genes}
-\title{A \code{data.table} containig data on genes and their expression behavior.}
+\title{A \code{data.table} containing mappings of Ensembl gene ID to HGNC symbol.}
 \format{
-An object of class \code{data.table} (inherits from \code{data.frame}) with 55507 rows and 20 columns.
+An object of class \code{data.table} (inherits from \code{data.frame}) with 55507 rows and 2 columns.
 }
 \usage{
 genes
 }
 \description{
-A \code{data.table} containig data on genes and their expression behavior.
+A \code{data.table} containing mappings of Ensembl gene ID to HGNC symbol.
 }
 \keyword{datasets}
diff --git a/man/gtex_all.Rd b/man/gtex_all.Rd
new file mode 100644
index 0000000..6dc1035
--- /dev/null
+++ b/man/gtex_all.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{gtex_all}
+\alias{gtex_all}
+\title{The results from the analysis across all GTEx samples.}
+\format{
+An object of class \code{data.table} (inherits from \code{data.frame}) with 55507 rows and 17 columns.
+}
+\usage{
+gtex_all
+}
+\description{
+The results from the analysis across all GTEx samples.
+}
+\keyword{datasets}
diff --git a/man/hpa_tissues.Rd b/man/hpa_tissues.Rd
new file mode 100644
index 0000000..3a7b336
--- /dev/null
+++ b/man/hpa_tissues.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{hpa_tissues}
+\alias{hpa_tissues}
+\title{The results from the analysis across Human Protein Atlas' tissue aggregated
+data.}
+\format{
+An object of class \code{data.table} (inherits from \code{data.frame}) with 20090 rows and 17 columns.
+}
+\usage{
+hpa_tissues
+}
+\description{
+The results from the analysis across Human Protein Atlas' tissue aggregated
+data.
+}
+\keyword{datasets}
diff --git a/man/rank_genes.Rd b/man/rank_genes.Rd
index e7f078f..4ee8191 100644
--- a/man/rank_genes.Rd
+++ b/man/rank_genes.Rd
@@ -5,7 +5,7 @@
 \title{Rank genes based on how ubiquitous they are.}
 \usage{
 rank_genes(
-  data = ubigen::genes,
+  data = ubigen::gtex_all,
   cross_sample_metric = "above_95",
   cross_sample_weight = 0.5,
   level_metric = "median_expression_normalized",
diff --git a/scripts/analyze.R b/scripts/analyze.R
deleted file mode 100644
index 1429225..0000000
--- a/scripts/analyze.R
+++ /dev/null
@@ -1,12 +0,0 @@
-# This scripts reads the input data (See input.R) and performs various
-# computations on it in order to later use the results for computating scores
-# for ubuiquitously expressed genes.
-
-library(data.table)
-library(here)
-
-i_am("scripts/input.R")
-
-data <- fread(here("scripts", "input", "data_long.csv"))
-results <- ubigen::analyze(data)
-fwrite(results, file = here("scripts", "output", "results.csv"))
diff --git a/scripts/gtex_all.R b/scripts/gtex_all.R
new file mode 100644
index 0000000..895fb24
--- /dev/null
+++ b/scripts/gtex_all.R
@@ -0,0 +1,32 @@
+# This script runs the analysis on the GTEx input data and stores the results
+# and the gene table as package data.
+
+library(data.table)
+library(here)
+
+i_am("scripts/gtex_all.R")
+
+data <- fread(here("scripts", "input", "data_long.csv"))
+gtex_all <- ubigen::analyze(data)
+
+# To save memory, the data includes fake IDs for genes. The actual Ensembl IDs
+# are part of the separate genes table.
+
+genes <- fread(here("scripts", "input", "genes.csv"))
+
+setnames(gtex_all, "gene", "id")
+
+gtex_all <- merge(
+  gtex_all,
+  genes[, .(id, gene)],
+  by = "id",
+  all.x = TRUE,
+  sort = FALSE
+)
+
+gtex_all[, id := NULL]
+
+usethis::use_data(gtex_all, overwrite = TRUE)
+
+genes[, id := NULL]
+usethis::use_data(genes, overwrite = TRUE)
diff --git a/scripts/hpa.R b/scripts/hpa.R
new file mode 100644
index 0000000..18f99fc
--- /dev/null
+++ b/scripts/hpa.R
@@ -0,0 +1,12 @@
+library(data.table)
+library(here)
+
+i_am("scripts/hpa.R")
+
+# Source: https://www.proteinatlas.org/download/rna_tissue_hpa.tsv.zip
+data <- fread(here("scripts", "input", "rna_tissue_hpa.tsv"))
+setnames(data, c("Gene", "Tissue", "nTPM"), c("gene", "sample", "expression"))
+data[, `:=`("Gene name" = NULL, TPM = NULL, pTPM = NULL)]
+
+hpa_tissues <- ubigen::analyze(data)
+usethis::use_data(hpa_tissues, overwrite = TRUE)
\ No newline at end of file
diff --git a/scripts/ranking.R b/scripts/ranking.R
deleted file mode 100644
index a96c1cb..0000000
--- a/scripts/ranking.R
+++ /dev/null
@@ -1,39 +0,0 @@
-# This script uses the results (See results.csv) and computes a score for each
-# gene. This is the data that will be used in the package.
-
-library(data.table)
-library(here)
-
-i_am("scripts/input.R")
-
-# To save memory, the data includes fake IDs for genes. The actual Ensembl IDs
-# are part of the separate genes table.
-
-genes <- fread(here("scripts", "input", "genes.csv"))
-data <- fread(here("scripts", "output", "results.csv"))
-
-# Rank the data using default parameters.
-data <- ubigen::rank_genes(data = data)
-
-# Reintroduce gene IDs and HGNC symbols.
-
-setnames(data, "gene", "id")
-
-data <- merge(
-  data,
-  genes,
-  by = "id",
-  all.x = TRUE,
-  sort = FALSE
-)
-
-setnames(data, "hgnc_symbol", "hgnc_name")
-data[, id := NULL]
-
-# Remove duplicates. This will keep the best row for each duplicated gene.
-data <- unique(data, by = "gene")
-
-# Reassign ranks, because duplicates may have been removed.
-data[, rank := .I]
-
-fwrite(data, file = here("scripts", "output", "genes.csv"))