mirror of
https://github.com/johrpan/ubigen.git
synced 2025-10-26 19:57:24 +01:00
Use GeTMM normalization
This commit is contained in:
parent
c97ee1ca30
commit
f59a71b16c
3 changed files with 61 additions and 7 deletions
|
|
@ -29,3 +29,8 @@ Imports:
|
||||||
rclipboard,
|
rclipboard,
|
||||||
shiny,
|
shiny,
|
||||||
shinyWidgets
|
shinyWidgets
|
||||||
|
Suggests:
|
||||||
|
biomaRt,
|
||||||
|
edgeR,
|
||||||
|
purrr,
|
||||||
|
stringr
|
||||||
|
|
|
||||||
|
|
@ -2,22 +2,74 @@
|
||||||
# further analysis. Note that this requires very good computational resources
|
# further analysis. Note that this requires very good computational resources
|
||||||
# and especially a lot of RAM because of the size of the data.
|
# and especially a lot of RAM because of the size of the data.
|
||||||
|
|
||||||
|
library(biomaRt)
|
||||||
|
library(edgeR)
|
||||||
library(data.table)
|
library(data.table)
|
||||||
library(here)
|
library(here)
|
||||||
|
|
||||||
i_am("scripts/input.R")
|
i_am("scripts/input.R")
|
||||||
|
|
||||||
# Source: https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/
|
# Source: https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/
|
||||||
# GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
|
# GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz
|
||||||
# The file has been edited removing the lines above the column headers.
|
read_counts <- fread(here("scripts", "input", "gtex_read_counts.tsv.gz"))
|
||||||
data_wide_samples <- fread(here("scripts", "input", "gtex.tsv.gz"))
|
|
||||||
|
|
||||||
setnames(
|
setnames(
|
||||||
data_wide_samples,
|
read_counts,
|
||||||
c("Name", "Description"),
|
c("Name", "Description"),
|
||||||
c("gene", "hgnc_symbol")
|
c("gene", "hgnc_symbol")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Remove Ensembl gene version numbers.
|
||||||
|
read_counts[, gene := stringr::str_split(gene, "\\.") |> purrr::map_chr(1)]
|
||||||
|
|
||||||
|
# Get gene lengths from Ensembl.
|
||||||
|
|
||||||
|
ensembl <- useEnsembl("ensembl", version = 107)
|
||||||
|
dataset <- useDataset("hsapiens_gene_ensembl", mart = ensembl)
|
||||||
|
gene_lengths <- data.table(getBM(
|
||||||
|
attributes = c(
|
||||||
|
"ensembl_gene_id",
|
||||||
|
"start_position",
|
||||||
|
"end_position"
|
||||||
|
),
|
||||||
|
mart = dataset
|
||||||
|
))
|
||||||
|
|
||||||
|
setnames(gene_lengths, "ensembl_gene_id", "gene")
|
||||||
|
|
||||||
|
gene_lengths[, length := end_position - start_position]
|
||||||
|
|
||||||
|
read_counts <- merge(
|
||||||
|
read_counts,
|
||||||
|
gene_lengths,
|
||||||
|
by = "gene",
|
||||||
|
all.x = TRUE
|
||||||
|
)
|
||||||
|
|
||||||
|
read_counts <- read_counts[!is.na(length)]
|
||||||
|
hgnc_symbols <- read_counts$hgnc_symbol
|
||||||
|
gene_lengths <- read_counts$length
|
||||||
|
|
||||||
|
read_counts <- as.matrix(
|
||||||
|
read_counts[, !c(
|
||||||
|
"hgnc_symbol",
|
||||||
|
"start_position",
|
||||||
|
"end_position",
|
||||||
|
"length"
|
||||||
|
)],
|
||||||
|
rownames = "gene"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Normalize read counts for gene lengths
|
||||||
|
read_counts <- read_counts / (gene_lengths / 1000)
|
||||||
|
|
||||||
|
getpm <- DGEList(counts = read_counts) |>
|
||||||
|
calcNormFactors() |>
|
||||||
|
cpm()
|
||||||
|
|
||||||
|
data_wide_samples <- data.table(getpm, keep.rownames = "gene")
|
||||||
|
data_wide_samples[, hgnc_symbol := hgnc_symbols]
|
||||||
|
|
||||||
data_long <- melt(
|
data_long <- melt(
|
||||||
data_wide_samples,
|
data_wide_samples,
|
||||||
id.vars = c("gene", "hgnc_symbol"),
|
id.vars = c("gene", "hgnc_symbol"),
|
||||||
|
|
|
||||||
|
|
@ -8,9 +8,6 @@ i_am("scripts/input.R")
|
||||||
|
|
||||||
data <- fread(here("scripts", "output", "results.csv"))
|
data <- fread(here("scripts", "output", "results.csv"))
|
||||||
|
|
||||||
# Keep only the actual Ensembl ID for each gene.
|
|
||||||
data[, gene := stringr::str_split(gene, "\\.") |> purrr::map_chr(1)]
|
|
||||||
|
|
||||||
data[, score := 0.5 * above_95 +
|
data[, score := 0.5 * above_95 +
|
||||||
0.25 * mean_expression_normalized +
|
0.25 * mean_expression_normalized +
|
||||||
-0.25 * sd_expression_normalized]
|
-0.25 * sd_expression_normalized]
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue