From 400ca776e0b0081b9ef7bf6392b1265743f556ff Mon Sep 17 00:00:00 2001 From: Elias Projahn Date: Mon, 16 Aug 2021 17:21:01 +0200 Subject: [PATCH] Simplify data format and correct scale labels --- data.R | 18 ++++++++++++++---- scatter_plot.R | 32 ++++++++------------------------ 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/data.R b/data.R index b75801c..a38bf04 100644 --- a/data.R +++ b/data.R @@ -41,7 +41,11 @@ load_data <- function(path) { median_distance = numeric() ) - distances <- data.table(geneid = integer()) + distances <- data.table( + species = character(), + gene = integer(), + distance = integer() + ) # Each file will contain data on one species. file_names <- list.files(paste(path, "genomes", sep = "/")) @@ -63,13 +67,19 @@ load_data <- function(path) { median_distance = median(species_distances[, dist]) ))) - # Column names have to be unique for each species. - setnames(species_distances, "dist", species_id) + species_distances <- data.table( + species = species_id, + gene = species_distances[, geneid], + distance = species_distances[, dist] + ) - distances <- merge(distances, species_distances, all = TRUE) + distances <- rbindlist(list(distances, species_distances)) } } + # Order species by their median distance. + setorder(species, median_distance) + list( genes = genes, species = species, diff --git a/scatter_plot.R b/scatter_plot.R index 6cf8f10..1ed1e25 100644 --- a/scatter_plot.R +++ b/scatter_plot.R @@ -3,44 +3,28 @@ library(ggplot2) #' Draw a scatter plot containing gene positions. 
scatter_plot <- function(gene_ids, data) { - species <- data$species - setorder(species, median_distance) - - distances <- data$distances[geneid %in% gene_ids] - plot <- ggplot() + - scale_x_continuous( + scale_x_discrete( name = "Species", - breaks = seq_len(nrow(species)), - labels = species$label + breaks = data$species$id, + labels = data$species$label ) + - scale_y_continuous(name = "Distance to telomeres [Mbp]") + - geom_line( - species, - mapping = aes( - x = as.numeric(rownames(species)), - y = median_distance / 1000000 - ) - ) + scale_y_continuous(name = "Distance to telomeres [Mbp]") colors <- rainbow(length(gene_ids)) for (i in seq_len(length(gene_ids))) { gene_id <- gene_ids[i] - gene_distances <- data.table( - index = as.numeric(rownames(species)), - distance = unlist(distances[geneid == gene_id, -1]) - ) - plot <- plot + geom_point( - gene_distances, + data$distances[gene == gene_id], mapping = aes( - x = index, + x = species, y = distance / 1000000, ), - color = colors[i] + color = colors[i], + size = 4 ) }