ubigen/scripts/input.R

94 lines
2.4 KiB
R
Raw Permalink Normal View History

2022-06-22 19:06:56 +02:00
# This script reads data from GTEx and transforms it into various formats for
# further analysis. Note that this requires very good computational resources
# and especially a lot of RAM because of the size of the data.
2022-09-21 18:47:58 +02:00
library(biomaRt)
library(edgeR)
2022-06-22 19:06:56 +02:00
library(data.table)
library(here)
i_am("scripts/input.R")
# Source: https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/
2022-09-21 18:47:58 +02:00
# GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz
read_counts <- fread(here("scripts", "input", "gtex_read_counts.tsv.gz"))
2022-06-22 19:06:56 +02:00
setnames(
2022-09-21 18:47:58 +02:00
read_counts,
2022-06-22 19:06:56 +02:00
c("Name", "Description"),
c("gene", "hgnc_symbol")
)
2022-09-21 18:47:58 +02:00
# Remove Ensembl gene version numbers.
read_counts[, gene := stringr::str_split(gene, "\\.") |> purrr::map_chr(1)]
# Get gene lengths from Ensembl.
ensembl <- useEnsembl("ensembl", version = 107)
dataset <- useDataset("hsapiens_gene_ensembl", mart = ensembl)
gene_lengths <- data.table(getBM(
attributes = c(
"ensembl_gene_id",
"start_position",
"end_position"
),
mart = dataset
))
setnames(gene_lengths, "ensembl_gene_id", "gene")
gene_lengths[, length := end_position - start_position]
read_counts <- merge(
read_counts,
gene_lengths,
by = "gene",
all.x = TRUE
)
read_counts <- read_counts[!is.na(length)]
hgnc_symbols <- read_counts$hgnc_symbol
gene_lengths <- read_counts$length
read_counts <- as.matrix(
read_counts[, !c(
"hgnc_symbol",
"start_position",
"end_position",
"length"
)],
rownames = "gene"
)
# Normalize read counts for gene lengths
read_counts <- read_counts / (gene_lengths / 1000)
getpm <- DGEList(counts = read_counts) |>
calcNormFactors() |>
cpm()
data_wide_samples <- data.table(getpm, keep.rownames = "gene")
data_wide_samples[, hgnc_symbol := hgnc_symbols]
# Create lookup tables for genes and samples.
genes <- data_wide_samples[, .(id = .I, gene, hgnc_symbol)]
fwrite(genes, file = here("scripts", "input", "genes.csv"))
sample_names <- colnames(data_wide_samples[, !c("gene", "hgnc_symbol")])
samples <- data.table(id = seq_along(sample_names), sample = sample_names)
fwrite(samples, file = here("scripts", "input", "samples.csv"))
data_wide_samples[, `:=`(gene = .I, hgnc_symbol = NULL)]
colnames(data_wide_samples) <- c("gene", seq_along(sample_names))
2022-06-22 19:06:56 +02:00
data_long <- melt(
data_wide_samples,
id.vars = "gene",
2022-06-22 19:06:56 +02:00
variable.name = "sample",
value.name = "expression",
variable.factor = FALSE
)
fwrite(data_long, file = here("scripts", "input", "data_long.csv"))