Use GeTMM normalization

This commit is contained in:
Elias Projahn 2022-09-21 18:47:58 +02:00
parent c97ee1ca30
commit f59a71b16c
3 changed files with 61 additions and 7 deletions

View file

@ -29,3 +29,8 @@ Imports:
rclipboard,
shiny,
shinyWidgets
Suggests:
biomaRt,
edgeR,
purrr,
stringr

View file

@ -2,22 +2,74 @@
# further analysis. Note that this requires very good computational resources
# and especially a lot of RAM because of the size of the data.
library(biomaRt)
library(edgeR)
library(data.table)
library(here)
i_am("scripts/input.R")
# Source: https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/
# GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
# The file has been edited removing the lines above the column headers.
data_wide_samples <- fread(here("scripts", "input", "gtex.tsv.gz"))
# GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz
read_counts <- fread(here("scripts", "input", "gtex_read_counts.tsv.gz"))
setnames(
data_wide_samples,
read_counts,
c("Name", "Description"),
c("gene", "hgnc_symbol")
)
# Remove Ensembl gene version numbers.
read_counts[, gene := stringr::str_split(gene, "\\.") |> purrr::map_chr(1)]
# Get gene lengths from Ensembl.
ensembl <- useEnsembl("ensembl", version = 107)
dataset <- useDataset("hsapiens_gene_ensembl", mart = ensembl)
gene_lengths <- data.table(getBM(
attributes = c(
"ensembl_gene_id",
"start_position",
"end_position"
),
mart = dataset
))
setnames(gene_lengths, "ensembl_gene_id", "gene")
gene_lengths[, length := end_position - start_position]
read_counts <- merge(
read_counts,
gene_lengths,
by = "gene",
all.x = TRUE
)
read_counts <- read_counts[!is.na(length)]
hgnc_symbols <- read_counts$hgnc_symbol
gene_lengths <- read_counts$length
read_counts <- as.matrix(
read_counts[, !c(
"hgnc_symbol",
"start_position",
"end_position",
"length"
)],
rownames = "gene"
)
# Normalize read counts for gene lengths
read_counts <- read_counts / (gene_lengths / 1000)
getpm <- DGEList(counts = read_counts) |>
calcNormFactors() |>
cpm()
data_wide_samples <- data.table(getpm, keep.rownames = "gene")
data_wide_samples[, hgnc_symbol := hgnc_symbols]
data_long <- melt(
data_wide_samples,
id.vars = c("gene", "hgnc_symbol"),

View file

@ -8,9 +8,6 @@ i_am("scripts/input.R")
data <- fread(here("scripts", "output", "results.csv"))
# Keep only the actual Ensembl ID for each gene.
data[, gene := stringr::str_split(gene, "\\.") |> purrr::map_chr(1)]
data[, score := 0.5 * above_95 +
0.25 * mean_expression_normalized +
-0.25 * sd_expression_normalized]