mirror of
				https://github.com/johrpan/ubigen.git
				synced 2025-10-26 19:57:24 +01:00 
			
		
		
		
	Use GeTMM normalization
This commit is contained in:
		
							parent
							
								
									c97ee1ca30
								
							
						
					
					
						commit
						f59a71b16c
					
				
					 3 changed files with 61 additions and 7 deletions
				
			
		|  | @ -29,3 +29,8 @@ Imports: | |||
|     rclipboard, | ||||
|     shiny, | ||||
|     shinyWidgets | ||||
| Suggests: | ||||
|     biomaRt, | ||||
|     edgeR, | ||||
|     purrr, | ||||
|     stringr | ||||
|  |  | |||
|  | @ -2,22 +2,74 @@ | |||
| # further analysis. Note that this requires very good computational resources | ||||
| # and especially a lot of RAM because of the size of the data. | ||||
| 
 | ||||
| library(biomaRt) | ||||
| library(edgeR) | ||||
| library(data.table) | ||||
| library(here) | ||||
| 
 | ||||
| i_am("scripts/input.R") | ||||
| 
 | ||||
| # Source: https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/ | ||||
| # GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz | ||||
| # The file has been edited removing the lines above the column headers. | ||||
| data_wide_samples <- fread(here("scripts", "input", "gtex.tsv.gz")) | ||||
| # GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz | ||||
| read_counts <- fread(here("scripts", "input", "gtex_read_counts.tsv.gz")) | ||||
| 
 | ||||
| setnames( | ||||
|   data_wide_samples, | ||||
|   read_counts, | ||||
|   c("Name", "Description"), | ||||
|   c("gene", "hgnc_symbol") | ||||
| ) | ||||
| 
 | ||||
| # Remove Ensembl gene version numbers. | ||||
| read_counts[, gene := stringr::str_split(gene, "\\.") |> purrr::map_chr(1)] | ||||
| 
 | ||||
| # Get gene lengths from Ensembl. | ||||
| 
 | ||||
| ensembl <- useEnsembl("ensembl", version = 107) | ||||
| dataset <- useDataset("hsapiens_gene_ensembl", mart = ensembl) | ||||
| gene_lengths <- data.table(getBM( | ||||
|   attributes = c( | ||||
|     "ensembl_gene_id", | ||||
|     "start_position", | ||||
|     "end_position" | ||||
|   ), | ||||
|   mart = dataset | ||||
| )) | ||||
| 
 | ||||
| setnames(gene_lengths, "ensembl_gene_id", "gene") | ||||
| 
 | ||||
| gene_lengths[, length := end_position - start_position] | ||||
| 
 | ||||
| read_counts <- merge( | ||||
|   read_counts, | ||||
|   gene_lengths, | ||||
|   by = "gene", | ||||
|   all.x = TRUE | ||||
| ) | ||||
| 
 | ||||
| read_counts <- read_counts[!is.na(length)] | ||||
| hgnc_symbols <- read_counts$hgnc_symbol | ||||
| gene_lengths <- read_counts$length | ||||
| 
 | ||||
| read_counts <- as.matrix( | ||||
|   read_counts[, !c( | ||||
|     "hgnc_symbol", | ||||
|     "start_position", | ||||
|     "end_position", | ||||
|     "length" | ||||
|   )], | ||||
|   rownames = "gene" | ||||
| ) | ||||
| 
 | ||||
| # Normalize read counts for gene lengths | ||||
| read_counts <- read_counts / (gene_lengths / 1000) | ||||
| 
 | ||||
| getpm <- DGEList(counts = read_counts) |> | ||||
|   calcNormFactors() |> | ||||
|   cpm() | ||||
| 
 | ||||
| data_wide_samples <- data.table(getpm, keep.rownames = "gene") | ||||
| data_wide_samples[, hgnc_symbol := hgnc_symbols] | ||||
| 
 | ||||
| data_long <- melt( | ||||
|   data_wide_samples, | ||||
|   id.vars = c("gene", "hgnc_symbol"), | ||||
|  |  | |||
|  | @ -8,9 +8,6 @@ i_am("scripts/input.R") | |||
| 
 | ||||
| data <- fread(here("scripts", "output", "results.csv")) | ||||
| 
 | ||||
| # Keep only the actual Ensembl ID for each gene. | ||||
| data[, gene := stringr::str_split(gene, "\\.") |> purrr::map_chr(1)] | ||||
| 
 | ||||
| data[, score := 0.5 * above_95 + | ||||
|   0.25 * mean_expression_normalized + | ||||
|   -0.25 * sd_expression_normalized] | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue