mirror of
				https://github.com/johrpan/ubigen.git
				synced 2025-10-26 19:57:24 +01:00 
			
		
		
		
	
		
			
	
	
		
			82 lines
		
	
	
	
		
			3.6 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
		
		
			
		
	
	
			82 lines
		
	
	
	
		
			3.6 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
#' Analyze the provided expression data for ubiquitously expressed genes.
#'
#' For every gene, summary statistics of its expression across all rows of
#' that gene are computed: median, interquartile range, mean and standard
#' deviation of the positive expression values, the same statistics for the
#' log2-transformed positive values (the `*_normalized` columns), the derived
#' coefficients of variation (`cv_*`, `qcv_*`), and the proportion of rows in
#' which the expression exceeds zero, a fixed threshold of 50, the median of
#' the respective sample and the 95th percentile of the respective sample.
#' Finally, all `*_normalized` columns are rescaled linearly to the range of
#' 0.0 to 1.0 across genes.
#'
#' Note that `data` is modified by reference: the columns `expression_median`,
#' `expression_95` and `expression_log` are added to the table that was passed
#' in. Pass a copy if the input has to stay untouched.
#'
#' @param data A `data.table` in normalized, long format. There should be a
#'   `gene` column containing Ensembl gene IDs, a `sample` column containing
#'   arbitrary sample identifiers that are unique per sample and an
#'   `expression` column containing the actual expression value for each given
#'   combination of gene and sample.
#'
#' @return A `data.table` containing all computed values per gene.
#'
#' @export
analyze <- function(data) {
    # Rescale a numeric vector linearly to the range of 0.0 to 1.0. Missing
    # values are ignored for finding the range and stay missing in the result.
    # If all non-missing values are equal, the range is zero and the division
    # yields NaN.
    rescale <- function(x) {
        lower <- min(x, na.rm = TRUE)
        upper <- max(x, na.rm = TRUE)
        (x - lower) / (upper - lower)
    }

    # Per-sample reference values that each expression value is compared
    # against further below. These are added to `data` by reference.
    data[, `:=`(
        expression_median = median(expression),
        expression_95 = quantile(expression, probs = 0.95)
    ), by = "sample"]

    # Transform the expression logarithmically. The samples that don't express a
    # gene at all will be left out intentionally (their value stays NA).
    data[expression > 0, expression_log := log2(expression)]

    results <- data[, .(
        # Statistics on the raw scale only consider positive values.
        median_expression = median(expression[expression > 0]),
        iqr_expression = IQR(expression[expression > 0]),
        mean_expression = mean(expression[expression > 0]),
        sd_expression = sd(expression[expression > 0]),
        # Statistics on the log2 scale; the NAs are the rows without
        # positive expression.
        median_expression_normalized = median(expression_log, na.rm = TRUE),
        iqr_expression_normalized = IQR(expression_log, na.rm = TRUE),
        mean_expression_normalized = mean(expression_log, na.rm = TRUE),
        sd_expression_normalized = sd(expression_log, na.rm = TRUE),
        # The mean of a logical vector is the proportion of TRUE values.
        above_zero = mean(expression > 0.0),
        above_threshold = mean(expression > 50.0),
        above_median = mean(expression > expression_median),
        above_95 = mean(expression > expression_95)
    ), by = "gene"]

    # Coefficients of variation. These have to be derived before the
    # rescaling below, because they use the unscaled statistics.
    results[, `:=`(
        qcv_expression = iqr_expression / median_expression,
        qcv_expression_normalized =
            iqr_expression_normalized / median_expression_normalized,
        cv_expression = sd_expression / mean_expression,
        cv_expression_normalized =
            sd_expression_normalized / mean_expression_normalized
    )]

    # Normalize values to the range of 0.0 to 1.0.
    results[, `:=`(
        median_expression_normalized = rescale(median_expression_normalized),
        iqr_expression_normalized = rescale(iqr_expression_normalized),
        qcv_expression_normalized = rescale(qcv_expression_normalized),
        mean_expression_normalized = rescale(mean_expression_normalized),
        sd_expression_normalized = rescale(sd_expression_normalized),
        cv_expression_normalized = rescale(cv_expression_normalized)
    )]

    results
}