Simplify data format and correct scale labels

This commit is contained in:
Elias Projahn 2021-08-16 17:21:01 +02:00
parent 495524a0ac
commit 400ca776e0
2 changed files with 22 additions and 28 deletions

18
data.R
View file

@ -41,7 +41,11 @@ load_data <- function(path) {
median_distance = numeric() median_distance = numeric()
) )
distances <- data.table(geneid = integer()) distances <- data.table(
species = character(),
gene = integer(),
distance = integer()
)
# Each file will contain data on one species. # Each file will contain data on one species.
file_names <- list.files(paste(path, "genomes", sep = "/")) file_names <- list.files(paste(path, "genomes", sep = "/"))
@ -63,13 +67,19 @@ load_data <- function(path) {
median_distance = median(species_distances[, dist]) median_distance = median(species_distances[, dist])
))) )))
# Column names have to be unique for each species. species_distances <- data.table(
setnames(species_distances, "dist", species_id) species = species_id,
gene = species_distances[, geneid],
distance = species_distances[, dist]
)
distances <- merge(distances, species_distances, all = TRUE) distances <- rbindlist(list(distances, species_distances))
} }
} }
# Order species by their median distance.
setorder(species, median_distance)
list( list(
genes = genes, genes = genes,
species = species, species = species,

View file

@ -3,44 +3,28 @@ library(ggplot2)
#' Draw a scatter plot containing gene positions. #' Draw a scatter plot containing gene positions.
scatter_plot <- function(gene_ids, data) { scatter_plot <- function(gene_ids, data) {
species <- data$species
setorder(species, median_distance)
distances <- data$distances[geneid %in% gene_ids]
plot <- ggplot() + plot <- ggplot() +
scale_x_continuous( scale_x_discrete(
name = "Species", name = "Species",
breaks = seq_len(nrow(species)), breaks = data$species$id,
labels = species$label labels = data$species$label
) + ) +
scale_y_continuous(name = "Distance to telomeres [Mbp]") + scale_y_continuous(name = "Distance to telomeres [Mbp]")
geom_line(
species,
mapping = aes(
x = as.numeric(rownames(species)),
y = median_distance / 1000000
)
)
colors <- rainbow(length(gene_ids)) colors <- rainbow(length(gene_ids))
for (i in seq_len(length(gene_ids))) { for (i in seq_len(length(gene_ids))) {
gene_id <- gene_ids[i] gene_id <- gene_ids[i]
gene_distances <- data.table(
index = as.numeric(rownames(species)),
distance = unlist(distances[geneid == gene_id, -1])
)
plot <- plot + plot <- plot +
geom_point( geom_point(
gene_distances, data$distances[gene == gene_id],
mapping = aes( mapping = aes(
x = index, x = species,
y = distance / 1000000, y = distance / 1000000,
), ),
color = colors[i] color = colors[i],
size = 4
) )
} }