mirror of
				https://github.com/johrpan/geposanui.git
				synced 2025-10-26 11:17:24 +01:00 
			
		
		
		
	Separate out geposan package
This commit is contained in:
		
							parent
							
								
									68354bf808
								
							
						
					
					
						commit
						dc0265a13f
					
				
					 16 changed files with 154 additions and 731 deletions
				
			
		
							
								
								
									
										115
									
								
								data.R
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										115
									
								
								data.R
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,115 @@ | |||
| library(data.table) | ||||
| 
 | ||||
| #' Species IDs of known replicatively aging species. | ||||
| species_ids_replicative <- c( | ||||
|     "bihybrid", | ||||
|     "btaurus", | ||||
|     "bthybrid", | ||||
|     "cfamiliaris", | ||||
|     "chircus", | ||||
|     "cjacchus", | ||||
|     "clfamiliaris", | ||||
|     "csabaeus", | ||||
|     "ecaballus", | ||||
|     "fcatus", | ||||
|     "ggorilla", | ||||
|     "hsapiens", | ||||
|     "lafricana", | ||||
|     "mfascicularis", | ||||
|     "mmulatta", | ||||
|     "mmurinus", | ||||
|     "mnemestrina", | ||||
|     "nleucogenys", | ||||
|     "oaries", | ||||
|     "pabelii", | ||||
|     "panubis", | ||||
|     "ppaniscus", | ||||
|     "ptroglodytes", | ||||
|     "sscrofa", | ||||
|     "tgelada" | ||||
| ) | ||||
| 
 | ||||
| #' Species from [geposan] and their aging status. | ||||
| species <- geposan::species[, .( | ||||
|     id, | ||||
|     name, | ||||
|     replicative = id %chin% species_ids_replicative | ||||
| )] | ||||
| 
 | ||||
| #' Gene names of genes for verified TPE-OLD genes. | ||||
| genes_verified_tpe_old <- c( | ||||
|     "C1S", | ||||
|     "DSP", | ||||
|     "ISG15", | ||||
|     "SORBS2", | ||||
|     "TERT" | ||||
| ) | ||||
| 
 | ||||
| #' Gene names of genes with a suggested TPE-OLD. | ||||
| genes_suggested_tpe_old <- c( | ||||
|     "AKAP3", | ||||
|     "ANO2", | ||||
|     "CCND2", | ||||
|     "CD163L1", | ||||
|     "CD9", | ||||
|     "FOXM1", | ||||
|     "GALNT8", | ||||
|     "NDUFA9", | ||||
|     "TEAD4", | ||||
|     "TIGAR", | ||||
|     "TSPAN9" | ||||
| ) | ||||
| 
 | ||||
| #' Genes from [geposan] and their TPE-OLD status. | ||||
| genes <- geposan::genes[, .( | ||||
|     id, | ||||
|     name, | ||||
|     chromosome, | ||||
|     n_species, | ||||
|     suggested = name %chin% genes_suggested_tpe_old, | ||||
|     verified = name %chin% genes_verified_tpe_old | ||||
| )] | ||||
| 
 | ||||
| #' All available methods from [geposan] and additional information on them. | ||||
| methods <- list( | ||||
|     list( | ||||
|         id = "clusteriness", | ||||
|         name = "Clustering", | ||||
|         description = "Clustering of genes" | ||||
|     ), | ||||
|     list( | ||||
|         id = "correlation", | ||||
|         name = "Correlation", | ||||
|         description = "Correlation with known genes" | ||||
|     ), | ||||
|     list( | ||||
|         id = "proximity", | ||||
|         name = "Proximity", | ||||
|         description = "Proximity to telomeres" | ||||
|     ), | ||||
|     list( | ||||
|         id = "neural", | ||||
|         name = "Neural", | ||||
|         description = "Assessment by neural network" | ||||
|     ) | ||||
| ) | ||||
| 
 | ||||
| #' Gene IDs of known or suggested TPE-OLD genes. | ||||
| genes_tpe_old <- genes[suggested | verified == TRUE, id] | ||||
| 
 | ||||
| #' Preset for [geposan] including all species and TPE-OLD genes for reference. | ||||
| preset_all_species <- geposan::preset( | ||||
|     methods = c("clusteriness", "correlation", "proximity", "neural"), | ||||
|     species = species$id, | ||||
|     genes = genes$id, | ||||
|     reference_genes = genes_tpe_old | ||||
| ) | ||||
| 
 | ||||
| #' Preset for [geposan] including only replicatively aging species as well as | ||||
| #' TPE-OLD genes for reference. | ||||
| preset_replicative_species <- geposan::preset( | ||||
|     methods = c("clusteriness", "correlation", "proximity", "neural"), | ||||
|     species = species_ids_replicative, | ||||
|     genes = genes$id, | ||||
|     reference_genes = genes_tpe_old | ||||
| ) | ||||
							
								
								
									
										9
									
								
								main.R
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								main.R
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,9 @@ | |||
| library(shiny) | ||||
| 
 | ||||
| # Initialize data first. | ||||
| source("data.R") | ||||
| 
 | ||||
| source("server.R") | ||||
| source("ui.R") | ||||
| 
 | ||||
| runApp(shinyApp(ui, server)) | ||||
							
								
								
									
										34
									
								
								optimize.R
									
										
									
									
									
								
							
							
						
						
									
										34
									
								
								optimize.R
									
										
									
									
									
								
							|  | @ -1,34 +0,0 @@ | |||
| #' Find the best weights to rank the data. | ||||
| #' | ||||
| #' This function ranks the provided data table based on a weighted score | ||||
| #' computed from the specified `columns`. It tries to find the optimal weights | ||||
| #' that result in a ranking, where the mean rank of the given reference genes | ||||
| #' is as high as possible. | ||||
| #' | ||||
| #' @param data Input data including the columns. | ||||
| #' @param colums Columns containing the separate scores between 0.0 and 1.0. | ||||
| #' @param reference_gene_ids IDs of the reference genes within the input data. | ||||
| #' | ||||
| #' @returns Vector of optimal column weights adding up to 1.0. | ||||
| optimize_weights <- function(data, columns, reference_gene_ids) { | ||||
|     #' Compute the mean rank of the reference genes when applying the weights. | ||||
|     mean_rank <- function(weights) { | ||||
|         data <- copy(data) | ||||
|         data[, score := 0.0] | ||||
| 
 | ||||
|         for (i in seq_along(columns)) { | ||||
|             column <- columns[i] | ||||
|             weighted <- weights[i] * data[, ..column] | ||||
|             data[, score := score + weighted] | ||||
|         } | ||||
| 
 | ||||
|         setorder(data, -score) | ||||
|         data[, rank := .I] | ||||
| 
 | ||||
|         data[gene %chin% reference_gene_ids, mean(rank)] | ||||
|     } | ||||
| 
 | ||||
|     weights <- optim(rep(1.0, length(columns)), mean_rank)$par | ||||
|     total_weight <- sum(weights) | ||||
|     weights / total_weight | ||||
| } | ||||
|  | @ -1,56 +0,0 @@ | |||
| library(data.table) | ||||
| 
 | ||||
| #' Perform a cluster analysis. | ||||
| #' | ||||
| #' This function will cluster the data using `hclust` and `cutree` (with the | ||||
| #' specified height). Every cluster with at least two members qualifies for | ||||
| #' further analysis. Clusters are then ranked based on their size in relation | ||||
| #' to the number of values. The return value is a final score between zero and | ||||
| #' one. Lower ranking clusters contribute less to this score. | ||||
| clusteriness <- function(data, height = 1000000) { | ||||
|     n <- length(data) | ||||
| 
 | ||||
|     # Return a score of 0.0 if there is just one or no value at all. | ||||
|     if (n < 2) { | ||||
|         return(0.0) | ||||
|     } | ||||
| 
 | ||||
|     # Cluster the data and compute the cluster sizes. | ||||
| 
 | ||||
|     tree <- hclust(dist(data)) | ||||
|     clusters <- cutree(tree, h = height) | ||||
|     cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE) | ||||
| 
 | ||||
|     # Compute the "clusteriness" score. | ||||
| 
 | ||||
|     score <- 0.0 | ||||
| 
 | ||||
|     for (i in seq_along(cluster_sizes)) { | ||||
|         cluster_size <- cluster_sizes[i] | ||||
| 
 | ||||
|         if (cluster_size >= 2) { | ||||
|             cluster_score <- cluster_size / n | ||||
|             score <- score + cluster_score / i | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     score | ||||
| } | ||||
| 
 | ||||
| #' Process genes clustering their distance to telomeres. | ||||
| process_clusteriness <- function(distances, gene_ids, preset) { | ||||
|     results <- data.table(gene = gene_ids) | ||||
| 
 | ||||
|     # Prefilter the input data by species. | ||||
|     distances <- distances[species %chin% preset$species_ids] | ||||
| 
 | ||||
|     # Add an index for quickly accessing data per gene. | ||||
|     setkey(distances, gene) | ||||
| 
 | ||||
|     #' Perform the cluster analysis for one gene. | ||||
|     compute <- function(gene_id) { | ||||
|         clusteriness(distances[gene_id, distance]) | ||||
|     } | ||||
| 
 | ||||
|     results[, score := compute(gene), by = 1:nrow(results)] | ||||
| } | ||||
|  | @ -1,63 +0,0 @@ | |||
| library(data.table) | ||||
| 
 | ||||
| #' Compute the mean correlation coefficient comparing gene distances with a set | ||||
| #' of reference genes. | ||||
| process_correlation <- function(distances, gene_ids, preset) { | ||||
|     results <- data.table(gene = gene_ids) | ||||
|     reference_gene_ids <- preset$reference_gene_ids | ||||
|     reference_count <- length(reference_gene_ids) | ||||
| 
 | ||||
|     # Prefilter distances by species. | ||||
|     distances <- distances[species %chin% preset$species_ids] | ||||
| 
 | ||||
|     # Add an index for quickly accessing data per gene. | ||||
|     setkey(distances, gene) | ||||
| 
 | ||||
|     # Prepare the reference genes' data. | ||||
|     reference_distances <- distances[gene %chin% reference_gene_ids] | ||||
| 
 | ||||
|     #' Perform the correlation for one gene. | ||||
|     compute <- function(gene_id) { | ||||
|         gene_distances <- distances[gene_id] | ||||
|         gene_species_count <- nrow(gene_distances) | ||||
| 
 | ||||
|         # Return a score of 0.0 if there is just one or no value at all. | ||||
|         if (gene_species_count <= 1) { | ||||
|             return(0.0) | ||||
|         } | ||||
| 
 | ||||
|         #' Buffer for the sum of correlation coefficients. | ||||
|         correlation_sum <- 0 | ||||
| 
 | ||||
|         # Correlate with all reference genes but not with the gene itself. | ||||
|         for (reference_gene_id in | ||||
|             reference_gene_ids[reference_gene_ids != gene_id]) { | ||||
|             data <- merge( | ||||
|                 gene_distances, | ||||
|                 reference_distances[reference_gene_id], | ||||
|                 by = "species" | ||||
|             ) | ||||
| 
 | ||||
|             # Skip this reference gene, if there are not enough value pairs. | ||||
|             # This will lessen the final score, because it effectively | ||||
|             # represents a correlation coefficient of 0.0. | ||||
|             if (nrow(data) <= 1) { | ||||
|                 next | ||||
|             } | ||||
| 
 | ||||
|             # Order data by the reference gene's distance to get a monotonic | ||||
|             # relation. | ||||
|             setorder(data, distance.y) | ||||
| 
 | ||||
|             correlation_sum <- correlation_sum + abs(cor( | ||||
|                 data[, distance.x], data[, distance.y], | ||||
|                 method = "spearman" | ||||
|             )) | ||||
|         } | ||||
| 
 | ||||
|         # Compute the score as the mean correlation coefficient. | ||||
|         score <- correlation_sum / reference_count | ||||
|     } | ||||
| 
 | ||||
|     results[, score := compute(gene), by = 1:nrow(results)] | ||||
| } | ||||
							
								
								
									
										254
									
								
								process/input.R
									
										
									
									
									
								
							
							
						
						
									
										254
									
								
								process/input.R
									
										
									
									
									
								
							|  | @ -1,254 +0,0 @@ | |||
| library(biomaRt) | ||||
| library(data.table) | ||||
| library(progress) | ||||
| library(rlog) | ||||
| library(stringr) | ||||
| 
 | ||||
| #' Species IDs of known replicatively aging species. | ||||
| species_ids_replicative <- c( | ||||
|     "bihybrid", | ||||
|     "btaurus", | ||||
|     "bthybrid", | ||||
|     "cfamiliaris", | ||||
|     "chircus", | ||||
|     "cjacchus", | ||||
|     "clfamiliaris", | ||||
|     "csabaeus", | ||||
|     "ecaballus", | ||||
|     "fcatus", | ||||
|     "ggorilla", | ||||
|     "hsapiens", | ||||
|     "lafricana", | ||||
|     "mfascicularis", | ||||
|     "mmulatta", | ||||
|     "mmurinus", | ||||
|     "mnemestrina", | ||||
|     "nleucogenys", | ||||
|     "oaries", | ||||
|     "pabelii", | ||||
|     "panubis", | ||||
|     "ppaniscus", | ||||
|     "ptroglodytes", | ||||
|     "sscrofa", | ||||
|     "tgelada" | ||||
| ) | ||||
| 
 | ||||
| #' Gene names of genes for verified TPE-OLD genes. | ||||
| genes_verified_tpe_old <- c( | ||||
|     "C1S", | ||||
|     "DSP", | ||||
|     "ISG15", | ||||
|     "SORBS2", | ||||
|     "TERT" | ||||
| ) | ||||
| 
 | ||||
| #' Gene names of genes with a suggested TPE-OLD. | ||||
| genes_suggested_tpe_old <- c( | ||||
|     "AKAP3", | ||||
|     "ANO2", | ||||
|     "CCND2", | ||||
|     "CD163L1", | ||||
|     "CD9", | ||||
|     "FOXM1", | ||||
|     "GALNT8", | ||||
|     "NDUFA9", | ||||
|     "TEAD4", | ||||
|     "TIGAR", | ||||
|     "TSPAN9" | ||||
| ) | ||||
| 
 | ||||
| #' Shared accessor for the Ensembl API. | ||||
| ensembl <- NULL | ||||
| 
 | ||||
| #' Get the ensembl accessor and initialize it if necessary. | ||||
| get_ensembl <- function() { | ||||
|     if (is.null(ensembl)) { | ||||
|         ensembl <<- useEnsembl("ensembl", version = 104) | ||||
|     } | ||||
| 
 | ||||
|     ensembl | ||||
| } | ||||
| 
 | ||||
| #' Get all chromosome names for a Ensembl dataset. | ||||
| #' | ||||
| #' Valid chromosome names include decimal numbers as well as 'X' and 'Y'. | ||||
| get_chromosome_names <- function(dataset) { | ||||
|     chromosome_names <- listFilterOptions(dataset, "chromosome_name") | ||||
|     chromosome_names[str_which(chromosome_names, "^[0-9]+|[XY]$")] | ||||
| } | ||||
| 
 | ||||
| #' Retrieve information on species. | ||||
| #' | ||||
| #' The result will be a `data.table` with the following columns: | ||||
| #' | ||||
| #'  - `id` Species ID as presented by Ensembl. | ||||
| #'  - `name` Human readable species name. | ||||
| #'  - `replicative` Whether the species is likely to be aging replicatively. | ||||
| retrieve_species <- function() { | ||||
|     # Ensembl datasets correspond to distinct species. | ||||
|     ensembl_datasets <- data.table(listDatasets(get_ensembl())) | ||||
| 
 | ||||
|     # Filter out species ID and name from the result. | ||||
|     species <- ensembl_datasets[, .( | ||||
|         id = str_match(dataset, "(.*)_gene_ensembl")[, 2], | ||||
|         name = str_match(description, "(.*) genes \\(.*\\)")[, 2] | ||||
|     )] | ||||
| 
 | ||||
|     species[, replicative := id %chin% species_ids_replicative] | ||||
| } | ||||
| 
 | ||||
| #' Retrieve information on human genes. | ||||
| #' | ||||
| #' This will only include genes on assembled chromosomes. Chromosomes are | ||||
| #' filtered based on their name being either a decimal number, 'X' or 'Y'. | ||||
| #' | ||||
| #' The result will be a `data.table` with the following columns: | ||||
| #' | ||||
| #'  - `id` Ensembl gene ID. | ||||
| #'  - `ǹame` HGNC name of the gene. | ||||
| #'  - `chromosome` Human chromosome on which the gene is located. | ||||
| retrieve_genes <- function() { | ||||
|     dataset <- useDataset("hsapiens_gene_ensembl", mart = get_ensembl()) | ||||
| 
 | ||||
|     genes <- data.table(getBM( | ||||
|         attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name"), | ||||
|         filters = "chromosome_name", | ||||
|         values = get_chromosome_names(dataset), | ||||
|         mart = useDataset("hsapiens_gene_ensembl", mart = get_ensembl()) | ||||
|     )) | ||||
| 
 | ||||
|     genes[, .( | ||||
|         id = ensembl_gene_id, | ||||
|         name = hgnc_symbol, | ||||
|         chromosome = chromosome_name, | ||||
|         verified = hgnc_symbol %chin% genes_verified_tpe_old, | ||||
|         suggested = hgnc_symbol %chin% genes_suggested_tpe_old | ||||
|     )] | ||||
| } | ||||
| 
 | ||||
| #' Retrieve gene distance data. | ||||
| #' | ||||
| #' The data will include all available values for the given species and genes | ||||
| #' that are located on assembled chromosomes. | ||||
| #' | ||||
| #' The result will be a `data.table` with the following columns: | ||||
| #' | ||||
| #'  - `species` Species ID. | ||||
| #'  - `gene` Ensembl gene ID. | ||||
| #'  - `distance` Distance to nearest telomere in base pairs. | ||||
| retrieve_distances <- function(species_ids, gene_ids) { | ||||
|     ensembl <- get_ensembl() | ||||
| 
 | ||||
|     # Exclude the human from the species, in case it is present there. | ||||
|     species_ids <- species_ids[species_ids != "hsapiens"] | ||||
| 
 | ||||
|     species_count <- length(species_ids) | ||||
|     gene_count <- length(gene_ids) | ||||
| 
 | ||||
|     log_info(sprintf( | ||||
|         "Retrieving distance data for %i genes from %i species", | ||||
|         gene_count, | ||||
|         species_count | ||||
|     )) | ||||
| 
 | ||||
|     progress <- progress_bar$new( | ||||
|         total = gene_count, | ||||
|         format = "Retrieving distance data [:bar] :percent (ETA :eta)" | ||||
|     ) | ||||
| 
 | ||||
|     # Special case the human species and retrieve all available distance | ||||
|     # information. | ||||
| 
 | ||||
|     dataset <- useDataset("hsapiens_gene_ensembl", mart = ensembl) | ||||
| 
 | ||||
|     human_distances <- data.table(getBM( | ||||
|         attributes = c( | ||||
|             "ensembl_gene_id", | ||||
|             "chromosome_name", | ||||
|             "start_position", | ||||
|             "end_position" | ||||
|         ), | ||||
|         filters = "chromosome_name", | ||||
|         values = get_chromosome_names(dataset), | ||||
|         mart = dataset | ||||
|     )) | ||||
| 
 | ||||
|     # Compute the nearest distance to telomeres. | ||||
| 
 | ||||
|     human_distances[, | ||||
|         chromosome_length := max(end_position), | ||||
|         by = chromosome_name | ||||
|     ] | ||||
| 
 | ||||
|     distances <- human_distances[, .( | ||||
|         species = "hsapiens", | ||||
|         gene = ensembl_gene_id, | ||||
|         distance = pmin( | ||||
|             start_position, | ||||
|             chromosome_length - end_position | ||||
|         ) | ||||
|     )] | ||||
| 
 | ||||
|     for (i in 1:species_count) { | ||||
|         species_id <- species_ids[i] | ||||
| 
 | ||||
|         progress$tick() | ||||
| 
 | ||||
|         dataset <- useDataset( | ||||
|             sprintf("%s_gene_ensembl", species_id), | ||||
|             mart = ensembl | ||||
|         ) | ||||
| 
 | ||||
|         # Besides the attributes that are always present, we need to check for | ||||
|         # human orthologs. Some species don't have that information and will be | ||||
|         # skipped. | ||||
|         if (!"hsapiens_homolog_ensembl_gene" %chin% | ||||
|             listAttributes(dataset, what = "name")) { | ||||
|             next | ||||
|         } | ||||
| 
 | ||||
|         chromosome_names <- get_chromosome_names(dataset) | ||||
| 
 | ||||
|         # Skip the species, if there are no assembled chromosomes. | ||||
|         if (length(chromosome_names) <= 0) { | ||||
|             next | ||||
|         } | ||||
| 
 | ||||
|         # Retrieve information on all genes of the current species, that have | ||||
|         # human orthologs. This is called "homolog" in the Ensembl schema. | ||||
|         ensembl_distances <- data.table(getBM( | ||||
|             attributes = c( | ||||
|                 "hsapiens_homolog_ensembl_gene", | ||||
|                 "chromosome_name", | ||||
|                 "start_position", | ||||
|                 "end_position" | ||||
|             ), | ||||
|             filters = c("with_hsapiens_homolog", "chromosome_name"), | ||||
|             values = list(TRUE, chromosome_names), | ||||
|             mart = dataset | ||||
|         )) | ||||
| 
 | ||||
|         # Precompute the genes' distance to the nearest telomere. | ||||
| 
 | ||||
|         ensembl_distances[, | ||||
|             chromosome_length := max(end_position), | ||||
|             by = chromosome_name | ||||
|         ] | ||||
| 
 | ||||
|         species_distances <- ensembl_distances[, .( | ||||
|             species = species_id, | ||||
|             gene = hsapiens_homolog_ensembl_gene, | ||||
|             distance = pmin( | ||||
|                 start_position, | ||||
|                 chromosome_length - end_position | ||||
|             ) | ||||
|         )] | ||||
| 
 | ||||
|         distances <- rbindlist(list(distances, species_distances)) | ||||
|     } | ||||
| 
 | ||||
|     # Arbitrarily exclude duplicated genes. | ||||
|     # TODO: Consider a refined approach or work out how to include all | ||||
|     # duplicates. | ||||
|     unique(distances, by = c("species", "gene")) | ||||
| } | ||||
|  | @ -1,62 +0,0 @@ | |||
| source("process/clusteriness.R") | ||||
| source("process/correlation.R") | ||||
| source("process/neural.R") | ||||
| source("process/proximity.R") | ||||
| 
 | ||||
| #' Construct a new method. | ||||
| #' | ||||
| #' A method describes a way to perform a computation on gene distance data that | ||||
| #' results in a single score per gene. The function should accept the following | ||||
| #' parameters in this order: | ||||
| #' | ||||
| #'  - `distances` Distance data to use. | ||||
| #'  - `gene_ids` Genes to process. | ||||
| #'  - `preset` Preset to apply. | ||||
| #' | ||||
| #' The function should return a `data.table` with the following columns: | ||||
| #' | ||||
| #'  - `gene` Gene ID of the processed gene. | ||||
| #'  - `score` Score for the gene between 0.0 and 1.0. | ||||
| #' | ||||
| #' @param id Internal identifier for the method. | ||||
| #' @param name Human readable name for the method. | ||||
| #' @param description Short human readable description. | ||||
| #' @param fn Function to perform the computation. | ||||
| #' | ||||
| #' @return A named list containing the arguments. | ||||
| method <- function(id, name, description, fn) { | ||||
|     list( | ||||
|         id = id, | ||||
|         name = name, | ||||
|         description = description, | ||||
|         fn = fn | ||||
|     ) | ||||
| } | ||||
| 
 | ||||
| #' All methods to be included in the analysis. | ||||
| methods <- list( | ||||
|     method( | ||||
|         "clusteriness", | ||||
|         "Clustering", | ||||
|         "Clustering of genes", | ||||
|         process_clusteriness | ||||
|     ), | ||||
|     method( | ||||
|         "correlation", | ||||
|         "Correlation", | ||||
|         "Correlation with known genes", | ||||
|         process_correlation | ||||
|     ), | ||||
|     method( | ||||
|         "proximity", | ||||
|         "Proximity", | ||||
|         "Proximity to telomeres", | ||||
|         process_proximity | ||||
|     ), | ||||
|     method( | ||||
|         "neural", | ||||
|         "Neural", | ||||
|         "Assessment by neural network", | ||||
|         process_neural | ||||
|     ) | ||||
| ) | ||||
							
								
								
									
										101
									
								
								process/neural.R
									
										
									
									
									
								
							
							
						
						
									
										101
									
								
								process/neural.R
									
										
									
									
									
								
							|  | @ -1,101 +0,0 @@ | |||
| library(data.table) | ||||
| library(neuralnet) | ||||
| 
 | ||||
| #' Find genes by training a neural network on reference position data. | ||||
| #' @param seed A seed to get reproducible results. | ||||
| process_neural <- function(distances, gene_ids, preset, seed = 726839) { | ||||
|     species_ids <- preset$species_ids | ||||
|     reference_gene_ids <- preset$reference_gene_ids | ||||
|     set.seed(seed) | ||||
|     gene_count <- length(gene_ids) | ||||
| 
 | ||||
|     # Prefilter distances by species. | ||||
|     distances <- distances[species %chin% species_ids] | ||||
| 
 | ||||
|     #' Input data for the network. This contains the gene ID as an identifier | ||||
|     #' as well as the per-species gene distances as input variables. | ||||
|     data <- data.table(gene = gene_ids) | ||||
| 
 | ||||
|     #' Buffer to keep track of species included in the computation. Species | ||||
|     #' from `species_ids` may be excluded if they don't have enough data. | ||||
|     species_ids_included <- NULL | ||||
| 
 | ||||
|     for (species_id in species_ids) { | ||||
|         # Make a column specific to this species. | ||||
| 
 | ||||
|         species_distances <- distances[species == species_id, .(gene, distance)] | ||||
|         setnames(species_distances, "distance", species_id) | ||||
| 
 | ||||
|         # Only include species with at least 25% known values. | ||||
| 
 | ||||
|         species_distances <- na.omit(species_distances) | ||||
| 
 | ||||
|         if (nrow(species_distances) >= 0.25 * gene_count) { | ||||
|             species_ids_included <- append(species_ids_included, species_id) | ||||
|             data <- merge(data, species_distances, all = TRUE) | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     # Replace missing data with mean values. The neural network can't handle | ||||
|     # NAs in a meaningful way. Choosing extreme values here would result in | ||||
|     # heavily biased results. Therefore, the mean value is chosen as a | ||||
|     # compromise. However, this will of course lessen the significance of the | ||||
|     # results. | ||||
|     for (species_id in species_ids_included) { | ||||
|         mean_value <- data[, mean(get(species_id), na.rm = TRUE)] | ||||
| 
 | ||||
|         data[ | ||||
|             is.na(get(species_id)), | ||||
|             eval(quote(species_id)) := round(mean_value) | ||||
|         ] | ||||
|     } | ||||
| 
 | ||||
|     # Extract the reference genes. | ||||
| 
 | ||||
|     reference_data <- data[gene %chin% reference_gene_ids] | ||||
|     reference_data[, neural := 1.0] | ||||
| 
 | ||||
|     # Take out random samples from the remaining genes. This is another | ||||
|     # compromise with a negative impact on significance. Because there is no | ||||
|     # information on genes with are explicitely *not* TPE-OLD genes, we have to | ||||
|     # assume that a random sample of genes has a low probability of including | ||||
|     # TPE-OLD genes. | ||||
| 
 | ||||
|     without_reference_data <- data[!gene %chin% reference_gene_ids] | ||||
| 
 | ||||
|     reference_samples <- without_reference_data[ | ||||
|         sample( | ||||
|             nrow(without_reference_data), | ||||
|             nrow(reference_data) | ||||
|         ) | ||||
|     ] | ||||
| 
 | ||||
|     reference_samples[, neural := 0.0] | ||||
| 
 | ||||
|     # Merge training data. The training data includes all reference genes as | ||||
|     # well as an equal number of random sample genes. | ||||
|     training_data <- rbindlist(list(reference_data, reference_samples)) | ||||
| 
 | ||||
|     # Construct and train the neural network. | ||||
| 
 | ||||
|     nn_formula <- as.formula(sprintf( | ||||
|         "neural~%s", | ||||
|         paste(species_ids_included, collapse = "+") | ||||
|     )) | ||||
| 
 | ||||
|     layer1 <- length(species_ids_included) * 0.66 | ||||
|     layer2 <- layer1 * 0.66 | ||||
|     layer3 <- layer2 * 0.66 | ||||
| 
 | ||||
|     nn <- neuralnet( | ||||
|         nn_formula, | ||||
|         training_data, | ||||
|         hidden = c(layer1, layer2, layer3), | ||||
|         linear.output = FALSE | ||||
|     ) | ||||
| 
 | ||||
|     # Return the resulting scores given by applying the neural network. | ||||
| 
 | ||||
|     data[, score := compute(nn, data)$net.result] | ||||
|     data[, .(gene, score)] | ||||
| } | ||||
|  | @ -1,29 +0,0 @@ | |||
| library(data.table) | ||||
| 
 | ||||
| #' Create a new preset. | ||||
| #' | ||||
| #' A preset is a combination of input values to all processing methods. The | ||||
| #' preset's hash will be used to cache the results of applying those. | ||||
| #' | ||||
| #' @param species_ids IDs of species to include. | ||||
| #' @param reference_gene_ids Reference genes to use. | ||||
| #' | ||||
| #' @return A named list containing the arguments. | ||||
| preset <- function(species_ids, reference_gene_ids) { | ||||
|     list( | ||||
|         species_ids = species_ids, | ||||
|         reference_gene_ids = reference_gene_ids | ||||
|     ) | ||||
| } | ||||
| 
 | ||||
| #' A default preset including only replicatively aging species. | ||||
| preset_replicative_species <- preset( | ||||
|     species[replicative == TRUE, id], | ||||
|     genes[suggested | verified == TRUE, id] | ||||
| ) | ||||
| 
 | ||||
| #' A default preset including all species. | ||||
| preset_all_species <-  preset( | ||||
|     species[, id], | ||||
|     genes[suggested | verified == TRUE, id] | ||||
| ) | ||||
|  | @ -1,58 +0,0 @@ | |||
| library(data.table) | ||||
| 
 | ||||
| source("process/util.R") | ||||
| 
 | ||||
| # Load input data | ||||
| 
 | ||||
| source("process/input.R") | ||||
| 
 | ||||
| species <- run_cached("inputs/species", retrieve_species) | ||||
| genes <- run_cached("inputs/genes", retrieve_genes) | ||||
| 
 | ||||
| distances <- run_cached( | ||||
|     "inputs/distances", | ||||
|     retrieve_distances, | ||||
|     species[, id], | ||||
|     genes[, id] | ||||
| ) | ||||
| 
 | ||||
| genes <- merge( | ||||
|     genes, | ||||
|     distances[, .(n_species = .N), by = "gene"], | ||||
|     by.x = "id", | ||||
|     by.y = "gene" | ||||
| ) | ||||
| 
 | ||||
| source("process/methods.R") | ||||
| source("process/presets.R") | ||||
| 
 | ||||
| #' Apply all methods with the specified preset without caching. | ||||
| process_priv <- function(preset) { | ||||
|     results <- data.table(gene = genes[, id]) | ||||
| 
 | ||||
|     for (method in methods) { | ||||
|         method_results <- method$fn(distances, genes[, id], preset) | ||||
|         setnames(method_results, "score", method$id) | ||||
| 
 | ||||
|         results <- merge( | ||||
|             results, | ||||
|             method_results | ||||
|         ) | ||||
|     } | ||||
| 
 | ||||
|     results | ||||
| } | ||||
| 
 | ||||
| #' Apply all methods with the specified preset. | ||||
| #' | ||||
| #' The result will be cached by the preset's hash and restored from cache, if | ||||
| #' possible. The return value is a `data.table` with one row for each gene | ||||
| #' identified by it's ID (`gene` column). The additional columns contain the | ||||
| #' resulting per method and are named after the method IDs. | ||||
| process <- function(preset) { | ||||
|     run_cached( | ||||
|         sprintf("results/%s", rlang::hash(preset)), | ||||
|         process_priv, | ||||
|         preset | ||||
|     ) | ||||
| } | ||||
|  | @ -1,20 +0,0 @@ | |||
| library(data.table) | ||||
| 
 | ||||
| #' Score the mean distance of genes to the telomeres across species. | ||||
| #' | ||||
| #' A score will be given to each gene such that 0.0 corresponds to the maximal | ||||
| #' mean distance across all genes and 1.0 corresponds to a distance of 0. | ||||
| process_proximity <- function(distances, gene_ids, preset) { | ||||
|     species_count <- length(preset$species_ids) | ||||
| 
 | ||||
|     # Prefilter distances by species. | ||||
|     distances <- distances[species %chin% preset$species_ids] | ||||
| 
 | ||||
|     # Compute the score as described above. | ||||
| 
 | ||||
|     distances <- distances[, .(mean_distance = mean(distance)), by = "gene"] | ||||
|     max_distance <- distances[, max(mean_distance)] | ||||
|     distances[, score := 1 - mean_distance / max_distance] | ||||
| 
 | ||||
|     distances[, .(gene, score)] | ||||
| } | ||||
|  | @ -1,13 +1,14 @@ | |||
| library(data.table) | ||||
| library(DT) | ||||
| library(geposan) | ||||
| library(gprofiler2) | ||||
| library(plotly) | ||||
| library(rclipboard) | ||||
| library(shiny) | ||||
| 
 | ||||
| source("optimize.R") | ||||
| source("rank_plot.R") | ||||
| source("scatter_plot.R") | ||||
| source("utils.R") | ||||
| 
 | ||||
| #' Java script function to replace gene IDs with Ensembl gene links. | ||||
| js_link <- JS("function(row, data) { | ||||
|  | @ -45,16 +46,19 @@ server <- function(input, output, session) { | |||
|             } | ||||
|         } | ||||
| 
 | ||||
|         reference_gene_ids <- genes[suggested | verified == TRUE, id] | ||||
|         weights <- optimize_weights(results, method_ids, reference_gene_ids) | ||||
|         weights <- geposan::optimize_weights( | ||||
|             results, | ||||
|             method_ids, | ||||
|             genes_tpe_old | ||||
|         ) | ||||
| 
 | ||||
|         mapply(function(method_id, weight) { | ||||
|         for (method_id in method_ids) { | ||||
|             updateSliderInput( | ||||
|                 session, | ||||
|                 sprintf("%s_weight", method_id), | ||||
|                 value = weight * 100 | ||||
|                 value = weights[[method_id]] * 100 | ||||
|             ) | ||||
|         }, method_ids, weights) | ||||
|         } | ||||
|     }) | ||||
| 
 | ||||
|     # Observe each method's enable button. | ||||
|  | @ -67,14 +71,17 @@ server <- function(input, output, session) { | |||
|     #' Rank the results based on the specified weights. Filter out genes with | ||||
|     #' too few species but don't apply the cut-off score. | ||||
|     results <- reactive({ | ||||
|         # Select the species preset. | ||||
| 
 | ||||
|         results <- if (input$species == "all") { | ||||
|             process(preset_all_species) | ||||
|         # Select the preset. | ||||
|         preset <- if (input$species == "all") { | ||||
|             preset_all_species | ||||
|         } else { | ||||
|             process(preset_replicative_species) | ||||
|             preset_replicative_species | ||||
|         } | ||||
| 
 | ||||
|         # Perform the analysis cached based on the preset's hash. | ||||
|         results <- run_cached(rlang::hash(preset), geposan::analyze, preset) | ||||
| 
 | ||||
|         # Add all gene information to the results. | ||||
|         results <- merge( | ||||
|             results, | ||||
|             genes, | ||||
|  | @ -82,41 +89,21 @@ server <- function(input, output, session) { | |||
|             by.y = "id" | ||||
|         ) | ||||
| 
 | ||||
|         # Compute scoring factors and the weighted score. | ||||
|         # Exclude genes with too few species. | ||||
|         results <- results[n_species >= input$n_species] | ||||
| 
 | ||||
|         total_weight <- 0.0 | ||||
|         results[, score := 0.0] | ||||
|         # Rank the results based on the weights. | ||||
| 
 | ||||
|         weights <- NULL | ||||
| 
 | ||||
|         for (method in methods) { | ||||
|             if (input[[method$id]]) { | ||||
|                 weight <- input[[sprintf("%s_weight", method$id)]] | ||||
|                 total_weight <- total_weight + weight | ||||
|                 column <- method$id | ||||
|                 weighted <- weight * results[, ..column] | ||||
|                 results[, score := score + weighted] | ||||
|                 weights[[method$id]] <- weight | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         results[, score := score / total_weight] | ||||
| 
 | ||||
|         # Exclude genes with too few species. | ||||
|         results <- results[n_species >= input$n_species] | ||||
| 
 | ||||
|         # Penalize missing species. | ||||
|         if (input$penalize) { | ||||
|             species_count <- if (input$species == "all") { | ||||
|                 nrow(species) | ||||
|             } else { | ||||
|                 length(species_ids_replicative) | ||||
|             } | ||||
| 
 | ||||
|             results <- results[, score := score * n_species / species_count] | ||||
|         } | ||||
| 
 | ||||
|         # Order the results based on their score. | ||||
| 
 | ||||
|         setorder(results, -score, na.last = TRUE) | ||||
|         results[, rank := .I] | ||||
|         geposan::ranking(results, weights) | ||||
|     }) | ||||
| 
 | ||||
|     #' Apply the cut-off score to the ranked results. | ||||
|  | @ -1,7 +0,0 @@ | |||
| library(shiny) | ||||
| 
 | ||||
| source("process/process.R") | ||||
| source("shiny/server.R") | ||||
| source("shiny/ui.R") | ||||
| 
 | ||||
| runApp(shinyApp(ui, server)) | ||||
|  | @ -56,11 +56,7 @@ ui <- fluidPage( | |||
|                         value = 100 | ||||
|                     ) | ||||
|                 ) | ||||
|             }), | ||||
|             checkboxInput( | ||||
|                 "penalize", | ||||
|                 "Penalize missing values" | ||||
|             ), | ||||
|             }) | ||||
|         ), | ||||
|         mainPanel( | ||||
|             tabsetPanel( | ||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue