geposan/scripts/ensembl.R

library(data.table)

rlog::log_info("Connecting to Ensembl API")

# Object to access the Ensembl API.
ensembl <- biomaRt::useEnsembl("ensembl", version = 110)

# Retrieve species information.

rlog::log_info("Retrieving species information")
ensembl_datasets <- data.table(biomaRt::listDatasets(ensembl))

# Filter out species ID and name from the result.
species <- ensembl_datasets[, .(
  id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2],
  name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2]
)]

# List of assemblies that the Ensembl Rest API advertises as chromosomes.
# Mitochondrial DNA has been manually removed. Unfortunately, species IDs from
# the Ensembl REST API don't map to dataset names in the BioMart interface.
# Because of that, we can't programatically filter chromosome names.
#
# See get_all_chromosomes()
valid_chromosome_names <- c(
  "1",
  "2",
  "3",
  "4",
  "5",
  "6",
  "7",
  "8",
  "9",
  "10",
  "11",
  "12",
  "13",
  "14",
  "15",
  "16",
  "17",
  "18",
  "19",
  "X",
  "groupI",
  "groupII",
  "groupIII",
  "groupIV",
  "groupV",
  "groupVI",
  "groupVII",
  "groupVIII",
  "groupIX",
  "groupX",
  "groupXI",
  "groupXII",
  "groupXIII",
  "groupXIV",
  "groupXV",
  "groupXVI",
  "groupXVII",
  "groupXVIII",
  "groupXIX",
  "groupXX",
  "groupXXI",
  "20",
  "Y",
  "21",
  "22",
  "23",
  "24",
  "25",
  "26",
  "27",
  "28",
  "29",
  "30",
  "31",
  "32",
  "33",
  "34",
  "35",
  "36",
  "37",
  "38",
  "I",
  "II",
  "III",
  "IV",
  "V",
  "VI",
  "VII",
  "VIII",
  "IX",
  "XI",
  "XII",
  "XIII",
  "XIV",
  "XV",
  "XVI",
  "XVII",
  "XVIII",
  "XIX",
  "XX",
  "XXI",
  "XXII",
  "XXIII",
  "XXIV",
  "7a",
  "7b",
  "Z",
  "W",
  "a",
  "b",
  "c",
  "d",
  "f",
  "g",
  "h",
  "39",
  "40",
  "1a",
  "22a",
  "sgr01",
  "sgr02",
  "sgr03",
  "sgr04",
  "sgr05",
  "sgr06",
  "sgr07",
  "sgr08",
  "sgr09",
  "sgr10",
  "sgr11",
  "sgr12",
  "sgr13",
  "sgr14",
  "sgr15",
  "sgr16",
  "sgr17",
  "sgr18",
  "sgr19",
  "LGE64",
  "2A",
  "2B",
  "X1",
  "X2",
  "X3",
  "X4",
  "X5",
  "LG1",
  "LG2",
  "LG3",
  "LG4",
  "LG5",
  "LG6",
  "LG7",
  "LG8",
  "LG9",
  "LG10",
  "LG11",
  "LG12",
  "LG13",
  "LG14",
  "LG15",
  "LG16",
  "LG17",
  "LG18",
  "LG19",
  "LG20",
  "LG22",
  "LG23",
  "4A",
  "1A",
  "25LG1",
  "25LG2",
  "LGE22",
  "LG21",
  "A1",
  "A2",
  "A3",
  "B1",
  "B2",
  "B3",
  "B4",
  "C1",
  "C2",
  "D1",
  "D2",
  "D3",
  "D4",
  "E1",
  "E2",
  "E3",
  "F1",
  "F2",
  "LG34",
  "LG35",
  "LG24",
  "LG25",
  "LG26",
  "LG27",
  "LG28",
  "LG29",
  "LG30",
  "MIC_1",
  "MIC_10",
  "MIC_11",
  "MIC_2",
  "MIC_3",
  "MIC_4",
  "MIC_5",
  "MIC_6",
  "MIC_7",
  "MIC_8",
  "MIC_9",
  "2L",
  "2R",
  "3L",
  "3R",
  "LGE22C19W28_E50C23",
  "LG01",
  "LG02",
  "LG03",
  "LG04",
  "LG05",
  "LG06",
  "LG07",
  "LG08",
  "LG09",
  "LG7_11",
  "41",
  "42",
  "43",
  "44",
  "45",
  "46",
  "47",
  "48",
  "49",
  "50",
  "LG28B",
  "LG30F",
  "LG36F",
  "LG37M",
  "LG42F",
  "LG44F",
  "LG45M",
  "LG48F",
  "LG49B"
)

#' Get all chromosome names for an Ensembl dataset.
#'
#' The function tries to filter out valid chromosome names from the available
#' assemblies in the dataset.
get_chromosome_names <- function(dataset) {
  chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name")
  chromosome_names[chromosome_names %chin% valid_chromosome_names]
}

# Retrieve information on human genes. This will only include genes on
# assembled chromosomes. Chromosomes are filtered using get_chromosome_names().

rlog::log_info("Retrieving information on human genes")
dataset <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl)

human_data <- data.table(biomaRt::getBM(
  attributes = c(
    "ensembl_gene_id",
    "hgnc_symbol",
    "chromosome_name",
    "start_position",
    "end_position"
  ),
  filters = "chromosome_name",
  values = get_chromosome_names(dataset),
  mart = dataset
))

# Remove duplicated gene IDs (at the time of writing, there are a handful).
human_data <- unique(human_data, by = "ensembl_gene_id")

# Only keep relevant information on genes.
genes <- human_data[, .(
  id = ensembl_gene_id,
  name = hgnc_symbol,
  chromosome = chromosome_name
)]

# Retrieve gene distance data across species.

rlog::log_info("Retrieving distance data")
distances <- data.table()

#' Handle data for one species.
handle_species <- function(species_id, species_data) {
  chromosomes <- species_data[,
    .(chromosome_length = max(end_position)),
    by = chromosome_name
  ]

  # Store the number of chromosomes in the species table.
  species[id == species_id, n_chromosomes := nrow(chromosomes)]

  # Store the median chromosome length in the species table.
  species[
    id == species_id,
    median_chromosome_length := chromosomes[, median(chromosome_length)]
  ]

  # Precompute the genes' distance to the nearest telomere.
  species_distances <- species_data[
    chromosomes,
    .(
      species = species_id,
      gene = ensembl_gene_id,
      chromosome_name = chromosome_name,
      start_position = start_position,
      end_position = end_position,
      distance = pmin(
        start_position,
        chromosome_length - end_position
      )
    ),
    on = "chromosome_name"
  ]

  # Add species distances to the distances table.
  distances <<- rbindlist(list(distances, species_distances))
}

# Handle the human first, as we already retrieved the data and don't need to
# filter based on orthologies.
handle_species("hsapiens", human_data)

# Iterate through all other species and retrieve their distance data.
for (species_id in species[id != "hsapiens", id]) {
  rlog::log_info(sprintf("Loading species \"%s\"", species_id))

  dataset <- biomaRt::useDataset(
    sprintf("%s_gene_ensembl", species_id),
    mart = ensembl
  )

  # Besides the attributes that are always present, we need to check for
  # human orthologs. Some species don't have that information and will be
  # skipped.
  if (!"hsapiens_homolog_ensembl_gene" %chin%
    biomaRt::listAttributes(dataset, what = "name")) {
    rlog::log_info("No data on human orthologs")
    species <- species[id != species_id]

    next
  }

  chromosome_names <- get_chromosome_names(dataset)

  # Skip the species, if there are no assembled chromosomes.
  if (length(chromosome_names) <= 0) {
    rlog::log_info("No matching chromosome assemblies")
    species <- species[id != species_id]

    next
  }

  # Retrieve information on all genes of the current species, that have
  # human orthologs. This is called "homolog" in the Ensembl schema.
  species_distances <- data.table(biomaRt::getBM(
    attributes = c(
      "hsapiens_homolog_ensembl_gene",
      "chromosome_name",
      "start_position",
      "end_position"
    ),
    filters = c("with_hsapiens_homolog", "chromosome_name"),
    values = list(TRUE, chromosome_names),
    mart = dataset
  ))

  # Only include human genes that we have information on.
  species_distances <- species_distances[
    hsapiens_homolog_ensembl_gene %chin% genes$id
  ]

  # Only include one ortholog per human gene.
  species_distances <- unique(
    species_distances,
    by = "hsapiens_homolog_ensembl_gene"
  )

  # Rename gene ID column to match the human data.
  setnames(
    species_distances,
    "hsapiens_homolog_ensembl_gene",
    "ensembl_gene_id"
  )

  handle_species(species_id, species_distances)
}

# Save data in the appropriate place.

usethis::use_data(species, overwrite = TRUE)
usethis::use_data(genes, overwrite = TRUE)
usethis::use_data(distances, overwrite = TRUE)
Initial commit 2021-10-19 13:39:55 +02:00			`library(data.table)`

			`rlog::log_info("Connecting to Ensembl API")`

Update data to Ensembl version 105 2022-01-03 17:49:45 +01:00			`# Object to access the Ensembl API.`
Update data to Ensembl 110 2023-09-27 13:11:08 +02:00			`ensembl <- biomaRt::useEnsembl("ensembl", version = 110)`
Initial commit 2021-10-19 13:39:55 +02:00
			`# Retrieve species information.`

			`rlog::log_info("Retrieving species information")`
			`ensembl_datasets <- data.table(biomaRt::listDatasets(ensembl))`

			`# Filter out species ID and name from the result.`
			`species <- ensembl_datasets[, .(`
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2],`
			`name = stringr::str_match(description, "(.) genes \\(.\\)")[, 2]`
Initial commit 2021-10-19 13:39:55 +02:00			`)]`

data: Add more chromosomes 2021-12-05 13:23:00 +01:00			`# List of assemblies that the Ensembl Rest API advertises as chromosomes.`
			`# Mitochondrial DNA has been manually removed. Unfortunately, species IDs from`
			`# the Ensembl REST API don't map to dataset names in the BioMart interface.`
			`# Because of that, we can't programatically filter chromosome names.`
			`#`
			`# See get_all_chromosomes()`
			`valid_chromosome_names <- c(`
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`"1",`
			`"2",`
			`"3",`
			`"4",`
			`"5",`
			`"6",`
			`"7",`
			`"8",`
			`"9",`
			`"10",`
			`"11",`
			`"12",`
			`"13",`
			`"14",`
			`"15",`
			`"16",`
			`"17",`
			`"18",`
			`"19",`
			`"X",`
Update data to Ensembl 110 2023-09-27 13:11:08 +02:00			`"groupI",`
			`"groupII",`
			`"groupIII",`
			`"groupIV",`
			`"groupV",`
			`"groupVI",`
			`"groupVII",`
			`"groupVIII",`
			`"groupIX",`
			`"groupX",`
			`"groupXI",`
			`"groupXII",`
			`"groupXIII",`
			`"groupXIV",`
			`"groupXV",`
			`"groupXVI",`
			`"groupXVII",`
			`"groupXVIII",`
			`"groupXIX",`
			`"groupXX",`
			`"groupXXI",`
			`"20",`
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`"Y",`
			`"21",`
			`"22",`
			`"23",`
			`"24",`
Update data to Ensembl 110 2023-09-27 13:11:08 +02:00			`"25",`
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`"26",`
			`"27",`
			`"28",`
			`"29",`
Update data to Ensembl 110 2023-09-27 13:11:08 +02:00			`"30",`
			`"31",`
			`"32",`
			`"33",`
			`"34",`
			`"35",`
			`"36",`
			`"37",`
			`"38",`
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`"I",`
			`"II",`
			`"III",`
			`"IV",`
			`"V",`
			`"VI",`
			`"VII",`
			`"VIII",`
			`"IX",`
			`"XI",`
			`"XII",`
			`"XIII",`
			`"XIV",`
			`"XV",`
			`"XVI",`
			`"XVII",`
			`"XVIII",`
			`"XIX",`
			`"XX",`
			`"XXI",`
			`"XXII",`
			`"XXIII",`
			`"XXIV",`
Update data to Ensembl 110 2023-09-27 13:11:08 +02:00			`"7a",`
			`"7b",`
			`"Z",`
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`"W",`
Update data to Ensembl 110 2023-09-27 13:11:08 +02:00			`"a",`
			`"b",`
			`"c",`
			`"d",`
			`"f",`
			`"g",`
			`"h",`
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`"39",`
			`"40",`
			`"1a",`
			`"22a",`
			`"sgr01",`
			`"sgr02",`
			`"sgr03",`
			`"sgr04",`
			`"sgr05",`
			`"sgr06",`
			`"sgr07",`
			`"sgr08",`
			`"sgr09",`
			`"sgr10",`
			`"sgr11",`
			`"sgr12",`
			`"sgr13",`
			`"sgr14",`
			`"sgr15",`
			`"sgr16",`
			`"sgr17",`
			`"sgr18",`
			`"sgr19",`
Update data to Ensembl 110 2023-09-27 13:11:08 +02:00			`"LGE64",`
			`"2A",`
			`"2B",`
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`"X1",`
			`"X2",`
			`"X3",`
			`"X4",`
			`"X5",`
Update data to Ensembl 110 2023-09-27 13:11:08 +02:00			`"LG1",`
			`"LG2",`
			`"LG3",`
			`"LG4",`
			`"LG5",`
			`"LG6",`
			`"LG7",`
			`"LG8",`
			`"LG9",`
			`"LG10",`
			`"LG11",`
			`"LG12",`
			`"LG13",`
			`"LG14",`
			`"LG15",`
			`"LG16",`
			`"LG17",`
			`"LG18",`
			`"LG19",`
			`"LG20",`
			`"LG22",`
			`"LG23",`
			`"4A",`
			`"1A",`
			`"25LG1",`
			`"25LG2",`
			`"LGE22",`
			`"LG21",`
			`"A1",`
			`"A2",`
			`"A3",`
			`"B1",`
			`"B2",`
			`"B3",`
			`"B4",`
			`"C1",`
			`"C2",`
			`"D1",`
			`"D2",`
			`"D3",`
			`"D4",`
			`"E1",`
			`"E2",`
			`"E3",`
			`"F1",`
			`"F2",`
			`"LG34",`
			`"LG35",`
			`"LG24",`
			`"LG25",`
			`"LG26",`
			`"LG27",`
			`"LG28",`
			`"LG29",`
			`"LG30",`
			`"MIC_1",`
			`"MIC_10",`
			`"MIC_11",`
			`"MIC_2",`
			`"MIC_3",`
			`"MIC_4",`
			`"MIC_5",`
			`"MIC_6",`
			`"MIC_7",`
			`"MIC_8",`
			`"MIC_9",`
			`"2L",`
			`"2R",`
			`"3L",`
			`"3R",`
			`"LGE22C19W28_E50C23",`
			`"LG01",`
			`"LG02",`
			`"LG03",`
			`"LG04",`
			`"LG05",`
			`"LG06",`
			`"LG07",`
			`"LG08",`
			`"LG09",`
			`"LG7_11",`
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`"41",`
			`"42",`
			`"43",`
			`"44",`
			`"45",`
			`"46",`
			`"47",`
			`"48",`
			`"49",`
			`"50",`
			`"LG28B",`
			`"LG30F",`
			`"LG36F",`
			`"LG37M",`
			`"LG42F",`
			`"LG44F",`
			`"LG45M",`
			`"LG48F",`
Update data to Ensembl 110 2023-09-27 13:11:08 +02:00			`"LG49B"`
data: Add more chromosomes 2021-12-05 13:23:00 +01:00			`)`

Initial commit 2021-10-19 13:39:55 +02:00			`#' Get all chromosome names for an Ensembl dataset.`
			`#'`
data: Add more chromosomes 2021-12-05 13:23:00 +01:00			`#' The function tries to filter out valid chromosome names from the available`
data: Include more chromosomes 2021-11-02 11:00:37 +01:00			`#' assemblies in the dataset.`
Initial commit 2021-10-19 13:39:55 +02:00			`get_chromosome_names <- function(dataset) {`
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name")`
			`chromosome_names[chromosome_names %chin% valid_chromosome_names]`
Initial commit 2021-10-19 13:39:55 +02:00			`}`

			`# Retrieve information on human genes. This will only include genes on`
			`# assembled chromosomes. Chromosomes are filtered using get_chromosome_names().`

			`rlog::log_info("Retrieving information on human genes")`
			`dataset <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl)`

			`human_data <- data.table(biomaRt::getBM(`
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`attributes = c(`
			`"ensembl_gene_id",`
			`"hgnc_symbol",`
			`"chromosome_name",`
			`"start_position",`
			`"end_position"`
			`),`
			`filters = "chromosome_name",`
			`values = get_chromosome_names(dataset),`
			`mart = dataset`
Initial commit 2021-10-19 13:39:55 +02:00			`))`

			`# Remove duplicated gene IDs (at the time of writing, there are a handful).`
			`human_data <- unique(human_data, by = "ensembl_gene_id")`

			`# Only keep relevant information on genes.`
			`genes <- human_data[, .(`
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`id = ensembl_gene_id,`
			`name = hgnc_symbol,`
			`chromosome = chromosome_name`
Initial commit 2021-10-19 13:39:55 +02:00			`)]`

			`# Retrieve gene distance data across species.`

			`rlog::log_info("Retrieving distance data")`
ensembl: Save information on species chromosomes 2022-01-08 17:54:54 +01:00			`distances <- data.table()`
Initial commit 2021-10-19 13:39:55 +02:00
ensembl: Save information on species chromosomes 2022-01-08 17:54:54 +01:00			`#' Handle data for one species.`
			`handle_species <- function(species_id, species_data) {`
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`chromosomes <- species_data[,`
			`.(chromosome_length = max(end_position)),`
			`by = chromosome_name`
			`]`

			`# Store the number of chromosomes in the species table.`
			`species[id == species_id, n_chromosomes := nrow(chromosomes)]`

			`# Store the median chromosome length in the species table.`
			`species[`
			`id == species_id,`
			`median_chromosome_length := chromosomes[, median(chromosome_length)]`
			`]`

			`# Precompute the genes' distance to the nearest telomere.`
			`species_distances <- species_data[`
			`chromosomes,`
			`.(`
			`species = species_id,`
			`gene = ensembl_gene_id,`
			`chromosome_name = chromosome_name,`
			`start_position = start_position,`
			`end_position = end_position,`
			`distance = pmin(`
			`start_position,`
			`chromosome_length - end_position`
			`)`
			`),`
			`on = "chromosome_name"`
			`]`

			`# Add species distances to the distances table.`
			`distances <<- rbindlist(list(distances, species_distances))`
ensembl: Save information on species chromosomes 2022-01-08 17:54:54 +01:00			`}`

			`# Handle the human first, as we already retrieved the data and don't need to`
			`# filter based on orthologies.`
			`handle_species("hsapiens", human_data)`
Initial commit 2021-10-19 13:39:55 +02:00
			`# Iterate through all other species and retrieve their distance data.`
Update to Ensembl 106 This also introduces some additional columns to the stored data tables. 2022-04-22 13:21:54 +02:00			`for (species_id in species[id != "hsapiens", id]) {`
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`rlog::log_info(sprintf("Loading species \"%s\"", species_id))`

			`dataset <- biomaRt::useDataset(`
			`sprintf("%s_gene_ensembl", species_id),`
			`mart = ensembl`
			`)`

			`# Besides the attributes that are always present, we need to check for`
			`# human orthologs. Some species don't have that information and will be`
			`# skipped.`
			`if (!"hsapiens_homolog_ensembl_gene" %chin%`
			`biomaRt::listAttributes(dataset, what = "name")) {`
			`rlog::log_info("No data on human orthologs")`
			`species <- species[id != species_id]`

			`next`
			`}`

			`chromosome_names <- get_chromosome_names(dataset)`

			`# Skip the species, if there are no assembled chromosomes.`
			`if (length(chromosome_names) <= 0) {`
			`rlog::log_info("No matching chromosome assemblies")`
			`species <- species[id != species_id]`

			`next`
			`}`

			`# Retrieve information on all genes of the current species, that have`
			`# human orthologs. This is called "homolog" in the Ensembl schema.`
			`species_distances <- data.table(biomaRt::getBM(`
			`attributes = c(`
			`"hsapiens_homolog_ensembl_gene",`
			`"chromosome_name",`
			`"start_position",`
			`"end_position"`
			`),`
			`filters = c("with_hsapiens_homolog", "chromosome_name"),`
			`values = list(TRUE, chromosome_names),`
			`mart = dataset`
			`))`

			`# Only include human genes that we have information on.`
			`species_distances <- species_distances[`
			`hsapiens_homolog_ensembl_gene %chin% genes$id`
			`]`

			`# Only include one ortholog per human gene.`
			`species_distances <- unique(`
			`species_distances,`
			`by = "hsapiens_homolog_ensembl_gene"`
			`)`

			`# Rename gene ID column to match the human data.`
			`setnames(`
			`species_distances,`
			`"hsapiens_homolog_ensembl_gene",`
			`"ensembl_gene_id"`
			`)`

			`handle_species(species_id, species_distances)`
Initial commit 2021-10-19 13:39:55 +02:00			`}`

			`# Save data in the appropriate place.`

			`usethis::use_data(species, overwrite = TRUE)`
			`usethis::use_data(genes, overwrite = TRUE)`
			`usethis::use_data(distances, overwrite = TRUE)`