diff --git a/DESCRIPTION b/DESCRIPTION index 6793809..5073b9f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,3 +29,8 @@ Imports: rclipboard, shiny, shinyWidgets +Suggests: + biomaRt, + edgeR, + purrr, + stringr diff --git a/scripts/input.R b/scripts/input.R index d1ff95d..40f6de6 100644 --- a/scripts/input.R +++ b/scripts/input.R @@ -2,22 +2,74 @@ # further analysis. Note that this requires very good computational resources # and especially a lot of RAM because of the size of the data. +library(biomaRt) +library(edgeR) library(data.table) library(here) i_am("scripts/input.R") # Source: https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/ -# GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz -# The file has been edited removing the lines above the column headers. -data_wide_samples <- fread(here("scripts", "input", "gtex.tsv.gz")) +# GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz +read_counts <- fread(here("scripts", "input", "gtex_read_counts.tsv.gz")) setnames( - data_wide_samples, + read_counts, c("Name", "Description"), c("gene", "hgnc_symbol") ) +# Remove Ensembl gene version numbers. +read_counts[, gene := stringr::str_split(gene, "\\.") |> purrr::map_chr(1)] + +# Get gene lengths from Ensembl. + +ensembl <- useEnsembl("ensembl", version = 107) +dataset <- useDataset("hsapiens_gene_ensembl", mart = ensembl) +gene_lengths <- data.table(getBM( + attributes = c( + "ensembl_gene_id", + "start_position", + "end_position" + ), + mart = dataset +)) + +setnames(gene_lengths, "ensembl_gene_id", "gene") + +gene_lengths[, length := end_position - start_position] + +read_counts <- merge( + read_counts, + gene_lengths, + by = "gene", + all.x = TRUE +) + +read_counts <- read_counts[!is.na(length)] +hgnc_symbols <- read_counts$hgnc_symbol +gene_lengths <- read_counts$length + +read_counts <- as.matrix( + read_counts[, !c( + "hgnc_symbol", + "start_position", + "end_position", + "length" + )], + rownames = "gene" +) + +# Normalize read counts for gene lengths +read_counts <- read_counts / (gene_lengths / 1000) + +getpm <- DGEList(counts = read_counts) |> + calcNormFactors() |> + cpm() + +data_wide_samples <- data.table(getpm, keep.rownames = "gene") +data_wide_samples[, hgnc_symbol := hgnc_symbols] + data_long <- melt( data_wide_samples, id.vars = c("gene", "hgnc_symbol"), diff --git a/scripts/ranking.R b/scripts/ranking.R index 5312786..bf8620d 100644 --- a/scripts/ranking.R +++ b/scripts/ranking.R @@ -8,9 +8,6 @@ i_am("scripts/input.R") data <- fread(here("scripts", "output", "results.csv")) -# Keep only the actual Ensembl ID for each gene. -data[, gene := stringr::str_split(gene, "\\.") |> purrr::map_chr(1)] - data[, score := 0.5 * above_95 + 0.25 * mean_expression_normalized + -0.25 * sd_expression_normalized]