mirror of
				https://github.com/johrpan/ubigen.git
				synced 2025-10-26 19:57:24 +01:00 
			
		
		
		
	Reduce memory footprint during analysis
This commit is contained in:
		
							parent
							
								
									f59a71b16c
								
							
						
					
					
						commit
						2eec3285f9
					
				
					 4 changed files with 33 additions and 21 deletions
				
			
		|  | @ -32,5 +32,6 @@ Imports: | |||
| Suggests: | ||||
|     biomaRt, | ||||
|     edgeR, | ||||
|     here, | ||||
|     purrr, | ||||
|     stringr | ||||
|  |  | |||
|  | @ -7,7 +7,7 @@ library(here) | |||
| 
 | ||||
| i_am("scripts/input.R") | ||||
| 
 | ||||
| data <- fread(here("scripts", "input", "data_long.csv.gz")) | ||||
| data <- fread(here("scripts", "input", "data_long.csv")) | ||||
| 
 | ||||
| data[, `:=`( | ||||
|     expression_median = median(expression), | ||||
|  |  | |||
|  | @ -70,21 +70,24 @@ getpm <- DGEList(counts = read_counts) |> | |||
| data_wide_samples <- data.table(getpm, keep.rownames = "gene") | ||||
| data_wide_samples[, hgnc_symbol := hgnc_symbols] | ||||
| 
 | ||||
| # Create lookup tables for genes and samples. | ||||
| 
 | ||||
| genes <- data_wide_samples[, .(id = .I, gene, hgnc_symbol)] | ||||
| fwrite(genes, file = here("scripts", "input", "genes.csv")) | ||||
| 
 | ||||
| sample_names <- colnames(data_wide_samples[, !c("gene", "hgnc_symbol")]) | ||||
| samples <- data.table(id = seq_along(sample_names), sample = sample_names) | ||||
| fwrite(samples, file = here("scripts", "input", "samples.csv")) | ||||
| 
 | ||||
| data_wide_samples[, `:=`(gene = .I, hgnc_symbol = NULL)] | ||||
| colnames(data_wide_samples) <- c("gene", seq_along(sample_names)) | ||||
| 
 | ||||
| data_long <- melt( | ||||
|   data_wide_samples, | ||||
|   id.vars = c("gene", "hgnc_symbol"), | ||||
|   id.vars = "gene", | ||||
|   variable.name = "sample", | ||||
|   value.name = "expression", | ||||
|   variable.factor = FALSE | ||||
| ) | ||||
| 
 | ||||
| fwrite( | ||||
|   data_wide_samples, | ||||
|   file = here( | ||||
|     "scripts", | ||||
|     "input", | ||||
|     "data_wide_samples.csv.gz" | ||||
|   ) | ||||
| ) | ||||
| 
 | ||||
| fwrite(data_long, file = here("scripts", "input", "data_long.csv.gz")) | ||||
| fwrite(data_long, file = here("scripts", "input", "data_long.csv")) | ||||
|  |  | |||
|  | @ -6,6 +6,7 @@ library(here) | |||
| 
 | ||||
| i_am("scripts/input.R") | ||||
| 
 | ||||
| genes <- fread(here("scripts", "input", "genes.csv")) | ||||
| data <- fread(here("scripts", "output", "results.csv")) | ||||
| 
 | ||||
| data[, score := 0.5 * above_95 + | ||||
|  | @ -22,17 +23,24 @@ data[is.na(score), score := 0.0] | |||
| 
 | ||||
| setorder(data, -score) | ||||
| 
 | ||||
| # Reintroduce gene IDs and HGNC symbols. | ||||
| 
 | ||||
| setnames(data, "gene", "id") | ||||
| 
 | ||||
| data <- merge( | ||||
|   data, | ||||
|   genes, | ||||
|   by = "id", | ||||
|   all.x = TRUE, | ||||
|   sort = FALSE | ||||
| ) | ||||
| 
 | ||||
| setnames(data, "hgnc_symbol", "hgnc_name") | ||||
| data[, id := NULL] | ||||
| 
 | ||||
| # Remove duplicates. Rows are still ordered by descending score, so this | ||||
| # keeps the highest-scoring row for each duplicated gene. | ||||
| data <- unique(data, by = "gene") | ||||
| 
 | ||||
| data[, `:=`( | ||||
|   hgnc_name = gprofiler2::gconvert( | ||||
|     gene, | ||||
|     target = "HGNC", | ||||
|     mthreshold = 1, | ||||
|     filter_na = FALSE | ||||
|   )$target, | ||||
|   rank = .I | ||||
| )] | ||||
| data[, rank := .I] | ||||
| 
 | ||||
| fwrite(data, file = here("scripts", "output", "genes.csv")) | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue