| Title: | Genome-wide Detection of Differential ORF Usage |
|---|---|
| Description: | Differential open reading frame (ORF) translation analysis framework for ribosome profiling (Ribo-seq) with matched RNA-seq. Implements (i) Differential ORF Usage (DOU), a beta-binomial generalized linear model that models the expected proportion of Ribo-seq versus RNA-seq reads mapping to each ORF within a gene, and (ii) ORF-level Differential Translation Efficiency (DTE), a negative binomial GLM that capture changes in translation efficiency of individual ORFs across experimental conditions. Supports ORF-level read summarization for bulk and single-cell Ribo-seq. |
| Authors: | Chun Shen Lim [aut, cre] (ORCID: <https://orcid.org/0000-0001-7015-0125>), Gabrielle Chieng [aut, ctb] (ORCID: <https://orcid.org/0009-0008-8710-9979>), Marsden [fnd] |
| Maintainer: | Chun Shen Lim <[email protected]> |
| License: | MIT + file LICENSE |
| Version: | 1.1.0 |
| Built: | 2026-05-16 09:19:04 UTC |
| Source: | https://github.com/bioc/DOTSeq |
This function extracts post hoc results for a given gene ID and computes condition-specific ORF usage using emmeans contrasts.
calculateUsage(data, gene_id)calculateUsage(data, gene_id)
data |
A |
gene_id |
A character string specifying the gene ID of interest |
Usage is computed as the inverse logit of the difference between
ribo and RNA strategy emmeans per condition. Only valid emmGrid
objects are used. Strategy levels are assigned using a helper
function assign_strategy_levels.
A data.frame with ORF IDs, conditions, and usage estimates
# Load a required library library(SummarizedExperiment) # Load test data dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) flat <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = flat, flattened_bed = bed ) dou <- getDOU(d) dou <- dou[rowData(dou)$is_kept == TRUE, ] # Subset only one gene dou <- dou[rowData(dou)$gene_id == "ENSG00000119402.18", ] getDOU(d) <- dou d <- DOTSeq(datasets = d, modules = "DOU") usage_df <- calculateUsage( getDOU(d), gene_id = "ENSG00000119402.18" ) print(usage_df)# Load a required library library(SummarizedExperiment) # Load test data dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) flat <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = flat, flattened_bed = bed ) dou <- getDOU(d) dou <- dou[rowData(dou)$is_kept == TRUE, ] # Subset only one gene dou <- dou[rowData(dou)$gene_id == "ENSG00000119402.18", ] getDOU(d) <- dou d <- DOTSeq(datasets = d, modules = "DOU") usage_df <- calculateUsage( getDOU(d), gene_id = "ENSG00000119402.18" ) print(usage_df)
Uses summarizeOverlaps to count reads
overlapping genomic ranges, handling both single-end and paired-end
BAM files.
countReads( gr, bam_files, ignore.strand = list(single = TRUE, paired = FALSE), verbose = TRUE )countReads( gr, bam_files, ignore.strand = list(single = TRUE, paired = FALSE), verbose = TRUE )
gr |
A |
bam_files |
Character vector. Paths to BAM files. |
ignore.strand |
A named list with logical values for |
verbose |
Logical. Whether to print progress messages. |
A matrix of read counts with features as rows and samples as columns.
library(GenomicRanges) library(pasillaBamSubset) bam_list <- c(untreated1_chr4(), untreated3_chr4()) gr <- GRanges(seqnames = "chr4", ranges = IRanges(start = 233, end = 2300)) countReads(gr = gr, bam_files = bam_list)library(GenomicRanges) library(pasillaBamSubset) bam_list <- c(untreated1_chr4(), untreated3_chr4()) gr <- GRanges(seqnames = "chr4", ranges = IRanges(start = 233, end = 2300)) countReads(gr = gr, bam_files = bam_list)
Vectorized, memory-efficient counting from BAMs with CB/UB tags. Overlaps are computed against per-seqname NCList indexes; UMIs are deduplicated per (cell, feature) if provided. Results are sparse matrices (features × cells).
countReadsSingleCell( gr, bam, seqlevels_style = "UCSC", tags = list(cell = "CB", umi = "UB"), ignore.strand = TRUE, yieldSize = 1e+06, mapq = 10, dedup = TRUE, mode = c("Union", "IntersectionStrict", "IntersectionNotEmpty"), cells = NULL, verbose = TRUE, BPPARAM = MulticoreParam(workers = 12, stop.on.error = FALSE, progressbar = TRUE) )countReadsSingleCell( gr, bam, seqlevels_style = "UCSC", tags = list(cell = "CB", umi = "UB"), ignore.strand = TRUE, yieldSize = 1e+06, mapq = 10, dedup = TRUE, mode = c("Union", "IntersectionStrict", "IntersectionNotEmpty"), cells = NULL, verbose = TRUE, BPPARAM = MulticoreParam(workers = 12, stop.on.error = FALSE, progressbar = TRUE) )
gr |
GRanges of genomic features (e.g., ORFs). Must be on the same reference naming style as BAMs or convertible via seqlevelsStyle. |
bam |
Character vector of BAM paths (STARsolo/CellRanger or similar). |
seqlevels_style |
Character; the naming style for chromosome
identifiers (e.g., |
tags |
Named list with |
ignore.strand |
Logical: whether to ignore strand in overlaps. |
yieldSize |
Integer chunk size for streaming from BAM. |
mapq |
Minimum MAPQ to keep (default 10). |
dedup |
Logical: if TRUE and UMI tag present, deduplicate UMIs per (cell, feature). |
mode |
Overlap mode; currently supports "Union" (others placeholder for extension). |
cells |
Optional character vector of barcodes to keep; if provided, columns are restricted to these and ordered accordingly. |
verbose |
Logical for progress messages. |
BPPARAM |
A BiocParallelParam. Default:
|
A sparse dgCMatrix (features × cells), with rownames taken from names(gr) if present.
suppressPackageStartupMessages({ library(GenomicRanges) library(Rsamtools) library(GenomeInfoDb) library(Matrix) library(BiocParallel) }) # Minimal GRanges with two ORF-like features on chr1 gr <- GRanges("chr1", IRanges(c(90, 140), width = 40)) names(gr) <- c("orf1", "orf2") # Write a tiny SAM with CB/UB tags for two cells (cellA, cellB) # r1 and r2 share the same (cell, UMI) and overlap orf1 -> should deduplicate to 1 # r3 overlaps orf2 in cellA; r4 overlaps orf2 in cellB sam <- c( "@HD\tVN:1.6\tSO:coordinate", "@SQ\tSN:chr1\tLN:1000", "r1\t0\tchr1\t100\t60\t20M\t*\t0\t0\tACGTACGTACGTACGTACGT\tFFFFFFFFFFFFFFFFFFFF\tCB:Z:cellA\tUB:Z:umi1", "r2\t0\tchr1\t110\t60\t20M\t*\t0\t0\tACGTACGTACGTACGTACGT\tFFFFFFFFFFFFFFFFFFFF\tCB:Z:cellA\tUB:Z:umi1", # dup UMI/cell "r3\t0\tchr1\t150\t60\t20M\t*\t0\t0\tACGTACGTACGTACGTACGT\tFFFFFFFFFFFFFFFFFFFF\tCB:Z:cellA\tUB:Z:umi2", "r4\t0\tchr1\t150\t60\t20M\t*\t0\t0\tACGTACGTACGTACGTACGT\tFFFFFFFFFFFFFFFFFFFF\tCB:Z:cellB\tUB:Z:umi1" ) td <- tempdir() sam_path <- file.path(td, "toy.sam") writeLines(sam, sam_path) dest_base <- file.path(td, "toy") bam_path <- Rsamtools::asBam(sam_path, destination = dest_base, overwrite = TRUE) bam_path <- paste0(dest_base, ".bam") Rsamtools::indexBam(bam_path) # Count per (feature × cell) with UMI deduplication; use SerialParam for portability mat <- countReadsSingleCell( gr = gr, bam = bam_path, seqlevels_style = "UCSC", tags = list(cell = "CB", umi = "UB"), mapq = 10, dedup = TRUE, BPPARAM = BiocParallel::SerialParam() ) # Inspect the sparse matrix print(dim(mat)) print(colnames(mat)) as.matrix(mat)suppressPackageStartupMessages({ library(GenomicRanges) library(Rsamtools) library(GenomeInfoDb) library(Matrix) library(BiocParallel) }) # Minimal GRanges with two ORF-like features on chr1 gr <- GRanges("chr1", IRanges(c(90, 140), width = 40)) names(gr) <- c("orf1", "orf2") # Write a tiny SAM with CB/UB tags for two cells (cellA, cellB) # r1 and r2 share the same (cell, UMI) and overlap orf1 -> should deduplicate to 1 # r3 overlaps orf2 in cellA; r4 overlaps orf2 in cellB sam <- c( "@HD\tVN:1.6\tSO:coordinate", "@SQ\tSN:chr1\tLN:1000", "r1\t0\tchr1\t100\t60\t20M\t*\t0\t0\tACGTACGTACGTACGTACGT\tFFFFFFFFFFFFFFFFFFFF\tCB:Z:cellA\tUB:Z:umi1", "r2\t0\tchr1\t110\t60\t20M\t*\t0\t0\tACGTACGTACGTACGTACGT\tFFFFFFFFFFFFFFFFFFFF\tCB:Z:cellA\tUB:Z:umi1", # dup UMI/cell "r3\t0\tchr1\t150\t60\t20M\t*\t0\t0\tACGTACGTACGTACGTACGT\tFFFFFFFFFFFFFFFFFFFF\tCB:Z:cellA\tUB:Z:umi2", "r4\t0\tchr1\t150\t60\t20M\t*\t0\t0\tACGTACGTACGTACGTACGT\tFFFFFFFFFFFFFFFFFFFF\tCB:Z:cellB\tUB:Z:umi1" ) td <- tempdir() sam_path <- file.path(td, "toy.sam") writeLines(sam, sam_path) dest_base <- file.path(td, "toy") bam_path <- Rsamtools::asBam(sam_path, destination = dest_base, overwrite = TRUE) bam_path <- paste0(dest_base, ".bam") Rsamtools::indexBam(bam_path) # Count per (feature × cell) with UMI deduplication; use SerialParam for portability mat <- countReadsSingleCell( gr = gr, bam = bam_path, seqlevels_style = "UCSC", tags = list(cell = "CB", umi = "UB"), mapq = 10, dedup = TRUE, BPPARAM = BiocParallel::SerialParam() ) # Inspect the sparse matrix print(dim(mat)) print(colnames(mat)) as.matrix(mat)
DOTSeq provides a flexible beta-binomial generalized linear model (GLM) framework for modeling the expected proportion of ribosome profiling (Ribo-seq) to RNA-seq counts for individual open reading frames (ORFs) relative to other ORFs within the same gene. It also includes a negative binomial GLM framework for detecting changes in translation efficiency across experimental conditions.
DOTSeq is a statistical framework for modeling Differential ORF Translation using ribosome profiling (Ribo-seq) and RNA-seq data. It supports two modes of input:
A named list of raw input components: count_table,
condition_table, flattened_gtf, and bed.
A pre-constructed DOTSeqDataSets-class object containing processed data.
The function automatically detects the input type and proceeds with the
appropriate workflow. It performs ORF-level filtering, model fitting,
post hoc contrasts, and adaptive shrinkage of effect sizes. Plotting and
downstream analysis are handled separately via the plotDOT function.
DOTSeq( datasets, formula = ~condition * strategy, modules = c("DOU", "DTE"), target = NULL, baseline = NULL, min_count = 1, stringent = TRUE, dispersion_modeling = "auto", dispformula = NULL, lrt = FALSE, diagnostic = FALSE, parallel = list(n = 4L, autopar = TRUE), optimizers = FALSE, nullweight = 500, contrasts_method = "revpairwise", verbose = TRUE )DOTSeq( datasets, formula = ~condition * strategy, modules = c("DOU", "DTE"), target = NULL, baseline = NULL, min_count = 1, stringent = TRUE, dispersion_modeling = "auto", dispformula = NULL, lrt = FALSE, diagnostic = FALSE, parallel = list(n = 4L, autopar = TRUE), optimizers = FALSE, nullweight = 500, contrasts_method = "revpairwise", verbose = TRUE )
datasets |
Either:
|
formula |
A formula object specifying the design.
Default is |
modules |
Character vector specifying which DOTSeq modules
to run. Options include |
target |
Character string specifying the non-reference condition
level to extract the corresponding interaction term.
Default is |
baseline |
Character string specifying the desired reference level.
Default is |
min_count |
Minimum count threshold for filtering ORFs.
Default is |
stringent |
Logical or
|
dispersion_modeling |
String specifying the dispersion modeling
approach for DOU. Options include |
dispformula |
Optional formula object for custom dispersion modeling. |
lrt |
Logical; if |
diagnostic |
Logical; if |
parallel |
A list passed to |
optimizers |
Logical; if |
nullweight |
Numeric. Prior weight on the null hypothesis for empirical
Bayes shrinkage in DOU. Default is |
contrasts_method |
Character string specifying the method for post hoc
contrasts in DOU. Default is |
verbose |
Logical; if |
A DOTSeqDataSets-class object containing:
DOUA DOUData-class object with DOU results.
DTEA DTEData-class object with DTE results.
Maintainer: Chun Shen Lim [email protected] (ORCID)
Authors:
Gabrielle Chieng (ORCID) [contributor]
Other contributors:
Marsden [funder]
Brooks, M. E., Kristensen, K., van Benthem, K. J., Magnusson, A., Berg, C. W., Nielsen, A., Skaug, H. J., Mächler, M. and Bolker, B. M. (2017). glmmTMB balances speed and flexibility among packages for zero-inflated generalized linear mixed modeling. The R Journal, 378–400. DOI: 10.32614/RJ-2017-066
Love, M. I., Huber, W., Anders, S. (2014). Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology, 15:550. DOI: 10.1186/s13059-014-0550-8
Lenth, R., Piaskowski, J. (2025). emmeans: Estimated Marginal Means, aka Least-Squares Means. R package version 2.0.0. https://rvlenth.github.io/emmeans/
Stephens, M. (2016). False discovery rates: a new deal. Biostatistics, 18(2). DOI: 10.1093/biostatistics/kxw041
Hartig, F. (2025). DHARMa: Residual Diagnostics for Hierarchical (Multi-Level / Mixed) Regression Models. R package version 0.4.7. https://github.com/florianhartig/dharma
Useful links:
DOTSeqDataSets-class, fitDOU,
testDOU, plotDOT
# Load test data dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) gtf <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # Use raw input list d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = gtf, flattened_bed = bed ) d <- DOTSeq(datasets = d, modules = "DTE") show(d) # Get the DOUData object dou <- getDOU(d) # Subset DOUData and edit DOTSeqDataSets in place set.seed(42) random_rows <- sample(seq_len(nrow(dou)), size = 50) getDOU(d) <- dou[random_rows, ] # Run the DOU module d <- DOTSeq(datasets = d) # Create a DOTSeqDataSets object and use it as input d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = gtf, flattened_bed = bed ) d <- DOTSeq(datasets = d, modules = "DTE")# Load test data dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) gtf <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # Use raw input list d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = gtf, flattened_bed = bed ) d <- DOTSeq(datasets = d, modules = "DTE") show(d) # Get the DOUData object dou <- getDOU(d) # Subset DOUData and edit DOTSeqDataSets in place set.seed(42) random_rows <- sample(seq_len(nrow(dou)), size = 50) getDOU(d) <- dou[random_rows, ] # Run the DOU module d <- DOTSeq(datasets = d) # Create a DOTSeqDataSets object and use it as input d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = gtf, flattened_bed = bed ) d <- DOTSeq(datasets = d, modules = "DTE")
A wrapper class to store both DOU and DTE results from DOTSeq analysis.
Displays a summary of the DOTSeqDataSets object.
## S4 method for signature 'DOTSeqDataSets' show(object)## S4 method for signature 'DOTSeqDataSets' show(object)
object |
A |
A DOTSeqDataSets-class S4 object containing DOU and DTE results.
DOUA DOUData-class object.
DTEA DTEData-class object.
# Read in count matrix, condition table, and annotation files dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) flat <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # Create a DOTSeqDataSets object d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = flat, flattened_bed = bed ) getDOU(d) getDTE(d)# Read in count matrix, condition table, and annotation files dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) flat <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # Create a DOTSeqDataSets object d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = flat, flattened_bed = bed ) getDOU(d) getDTE(d)
This function initialize and construct the
DOTSeqDataSets-class object. This includes loading count
and metadata tables, parsing ORF annotations, filtering ORFs based on
count thresholds, and preparing objects for downstream differential
translation analysis using beta-binomial and negative binomial GLM.
This function initialize and construct the
DOTSeqDataSets-class object. This includes loading count
and metadata tables, read a GRanges object with ORF-level
annotation, filtering ORFs based on count thresholds, and preparing
objects for downstream differential translation analysis using
beta-binomial and negative binomial GLM.
DOTSeqDataSetsFromFeatureCounts( count_table, condition_table, flattened_gtf, flattened_bed, formula = ~condition * strategy, target = NULL, baseline = NULL, min_count = 1, stringent = TRUE, verbose = TRUE ) DOTSeqDataSetsFromSummarizeOverlaps( count_table, condition_table, annotation, formula = ~condition * strategy, target = NULL, baseline = NULL, min_count = 1, stringent = TRUE, verbose = TRUE )DOTSeqDataSetsFromFeatureCounts( count_table, condition_table, flattened_gtf, flattened_bed, formula = ~condition * strategy, target = NULL, baseline = NULL, min_count = 1, stringent = TRUE, verbose = TRUE ) DOTSeqDataSetsFromSummarizeOverlaps( count_table, condition_table, annotation, formula = ~condition * strategy, target = NULL, baseline = NULL, min_count = 1, stringent = TRUE, verbose = TRUE )
count_table |
A dataframe of read counts with features as rows and
samples as columns. Generated from |
condition_table |
Path to a sample metadata file or a data frame.
Must include columns: |
flattened_gtf |
Path to a flattened GFF/GTF file containing ORF annotations. |
flattened_bed |
Path to a flattened BED file with ORF annotations. |
formula |
A formula object specifying the design.
Default is |
target |
Character string specifying the non-reference condition
level to extract the corresponding interaction term. Contrasted
against the baseline condition. Default is |
baseline |
Character string specifying the desired reference
level. Default is |
min_count |
Minimum count threshold for filtering ORFs.
Default is |
stringent |
Logical or
|
verbose |
Logical; if |
annotation |
A GRanges object with ORF level annotation,
typically obtained from |
A DOTSeqDataSets-class object containing:
A DOUData-class object containing pre-filtered
raw counts (assay slot), sample metadata
(colData slot), and ORF-level annotation (rowRanges)
used for modeling Differential ORF Usage (DOU).
A DTEData-class object used for modeling
Differential Translation Efficiency (DTE). Stores all data above
except for rowRanges.
A DOTSeqDataSets-class object containing:
A DOUData-class object containing pre-filtered
raw counts (assay slot), sample metadata
(colData slot), and ORF-level annotation (rowRanges)
used for modeling Differential ORF Usage (DOU).
A DTEData-class object used for modeling
Differential Translation Efficiency (DTE). Stores all data above
except for rowRanges.
DOTSeqDataSetsFromFeatureCounts
DOTSeqDataSetsFromFeatureCounts
# Read in count matrix, condition table, and annotation files dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) gtf <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # Create a DOTSeqDataSets object d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = gtf, flattened_bed = bed ) show(d)# Read in count matrix, condition table, and annotation files dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) gtf <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # Create a DOTSeqDataSets object d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = gtf, flattened_bed = bed ) show(d)
A RangedSummarizedExperiment object extended to store modeling
components for Differential ORF Usage (DOU) analysis in the
DOTSeq framework.
This class contains raw counts, sample metadata, and additional slots
for the model formula, emmeans specifications,
and contrast results. It supports flexible modeling of
translation-specific effects using GLM / GLMM and post hoc contrasts.
formulaA formula object specifying the model design
(e.g., ~ condition * strategy).
specsA formula object used to generate
emmeans specifications for post hoc contrasts.
interactionA DFrame containing results
from interaction-specific contrasts.
strategyA DFrame containing results
from strategy-specific contrasts.
RangedSummarizedExperiment,
glmmTMB, emmeans
# Read in count matrix, condition table, and annotation files dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) flat <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # Create a DOTSeqDataSets object d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = flat, flattened_bed = bed ) getDOU(d) getDTE(d)# Read in count matrix, condition table, and annotation files dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) flat <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # Create a DOTSeqDataSets object d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = flat, flattened_bed = bed ) getDOU(d) getDTE(d)
Checks that the slots in a DOUData object are correctly populated.
object |
A |
TRUE if valid, or a character string describing the error.
A DESeqDataSet object extended to store modeling
components for Differential Translation Efficiency (DTE) analysis in
the DOTSeq framework.
This class contains raw counts, sample metadata, and additional slots
for the emmeans specifications, and contrast
results.
Inherits all slots from DESeqDataSet, and adds:
specs: A formula object used to generate
emmeans specifications for post hoc contrasts.
interaction: A DFrame or data.frame containing
results interaction-specific contrasts
strategy: A DFrame or data.frame containing
results strategy-specific contrasts
# Read in count matrix, condition table, and annotation files dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) flat <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # Create a DOTSeqDataSets object d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = flat, flattened_bed = bed ) getDOU(d) getDTE(d)# Read in count matrix, condition table, and annotation files dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) flat <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # Create a DOTSeqDataSets object d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = flat, flattened_bed = bed ) getDOU(d) getDTE(d)
This function identifies ORFs in DNA sequences from FASTA files,
DNAStringSet, or BSgenome objects. It supports both linear and
circular genomes, and can detect ORFs on the sense (+) or both strands.
findORFsFasta( sequences, start_codons = "ATG", stop_codons = "TAA", min_len = 0, longest_orf = TRUE, is_circular = FALSE, plus_strand_only = TRUE )findORFsFasta( sequences, start_codons = "ATG", stop_codons = "TAA", min_len = 0, longest_orf = TRUE, is_circular = FALSE, plus_strand_only = TRUE )
sequences |
Character path to a FASTA file, or a |
start_codons |
Character string of start codons (e.g., "ATG|GTG") |
stop_codons |
Character string of stop codons (e.g., "TAA|TAG") |
min_len |
Integer. Minimum ORF length in bases. Default is |
longest_orf |
Logical. If TRUE, return only the longest ORF per sequence. |
is_circular |
Logical. Whether the genome is circular (e.g., bacterial genomes). |
plus_strand_only |
Logical. If TRUE, scan only the forward strand; if FALSE, scan both strands. |
This method is optimized for prokaryotic genomes or transcript
sequences. Direct use for eukaryotic whole genomes is not suitable
due to the presence of splicing. See also ORFik's findMapORFs.
Each FASTA header is treated independently, and the name (up to the
first space) is used as the seqnames in the returned GRanges
object. Circular genome support is included, and ORFs that span the
start/end boundary are handled.
Note: Ensure your FASTA file is valid and headers are formatted as
>name info, with the name first and no hidden characters, as this
affects coordinate parsing.
A GRanges object containing the ORFs found.
Haakon Tjeldnes et al. (original), Chun Shen Lim (modification).
Tjeldnes, H., Labun, K., Torres Cleuren, Y. et al. ORFik: a comprehensive R toolkit for the analysis of translation. BMC Bioinformatics 22, 336 (2021). DOI: 10.1186/s12859-021-04254-w
This function fits beta-binomial generalized (mixed) linear models
(GLM or GLMM) for Differential ORF Usage (DOU) across all genes
(via glmmTMB). It supports multiple dispersion
modeling approaches and optional diagnostics using DHARMa.
fitDOU( data, formula = ~condition * strategy, specs = ~condition * strategy, dispformula = NULL, dispersion_modeling = "auto", lrt = FALSE, diagnostic = FALSE, parallel = list(n = 4L, autopar = TRUE), optimizers = FALSE, verbose = TRUE )fitDOU( data, formula = ~condition * strategy, specs = ~condition * strategy, dispformula = NULL, dispersion_modeling = "auto", lrt = FALSE, diagnostic = FALSE, parallel = list(n = 4L, autopar = TRUE), optimizers = FALSE, verbose = TRUE )
data |
A |
formula |
A formula object specifying the model design,
e.g., |
specs |
A formula specifying the structure of the estimated
marginal means. Default is |
dispformula |
Optional formula object specifying a custom dispersion
model (used when |
dispersion_modeling |
Character string specifying the dispersion modeling strategy. Options are:
|
lrt |
Logical; if |
diagnostic |
Logical; if |
parallel |
A list passed to |
optimizers |
Logical; if |
verbose |
Logical; if |
A named list of PostHoc objects, one per ORF.
Brooks, M. E., Kristensen, K., van Benthem, K. J., Magnusson, A., Berg, C. W., Nielsen, A., Skaug, H. J., Mächler, M. and Bolker, B. M. (2017). glmmTMB balances speed and flexibility among packages for zero-inflated generalized linear mixed modeling. The R Journal, 378–400. DOI: 10.32614/RJ-2017-066
Lenth R, Piaskowski J (2025). emmeans: Estimated Marginal Means, aka Least-Squares Means. R package version 2.0.0. https://rvlenth.github.io/emmeans/
Hartig F (2025). DHARMa: Residual Diagnostics for Hierarchical (Multi-Level / Mixed) Regression Models. R package version 0.4.7. https://github.com/florianhartig/dharma
DOTSeq, DOTSeqDataSets-class,
testDOU
# Load SummarizedExperiment to enable subsetting and access to components # like rowRanges and rowData library(SummarizedExperiment) # Read in count matrix, condition table, and annotation files dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) gtf <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # Create a DOTSeqDataSets object d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = gtf, flattened_bed = bed ) dou <- getDOU(d) # Keep ORFs where all replicates in at least one condition pass min_count # Single-ORF genes are removed dou <- dou[rowRanges(dou)$is_kept == TRUE, ] # Randomly sample 100 ORFs for fitDOU set.seed(42) random_rows <- sample(seq_len(nrow(dou)), size = 100) dou <- dou[random_rows, ] # Model fitting using fitDOU rowData(dou)[["DOUResults"]] <- fitDOU( data = dou, formula = ~ condition * strategy, specs = ~ condition * strategy, dispersion_modeling = "auto", lrt = FALSE, optimizers = FALSE, diagnostic = FALSE, parallel = list(n = 4L, autopar = TRUE), verbose = TRUE )# Load SummarizedExperiment to enable subsetting and access to components # like rowRanges and rowData library(SummarizedExperiment) # Read in count matrix, condition table, and annotation files dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) gtf <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # Create a DOTSeqDataSets object d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = gtf, flattened_bed = bed ) dou <- getDOU(d) # Keep ORFs where all replicates in at least one condition pass min_count # Single-ORF genes are removed dou <- dou[rowRanges(dou)$is_kept == TRUE, ] # Randomly sample 100 ORFs for fitDOU set.seed(42) random_rows <- sample(seq_len(nrow(dou)), size = 100) dou <- dou[random_rows, ] # Model fitting using fitDOU rowData(dou)[["DOUResults"]] <- fitDOU( data = dou, formula = ~ condition * strategy, specs = ~ condition * strategy, dispersion_modeling = "auto", lrt = FALSE, optimizers = FALSE, diagnostic = FALSE, parallel = list(n = 4L, autopar = TRUE), verbose = TRUE )
These methods retrieve or replace either interaction-specific or
strategy-specific contrast results from a DOUData-class,
DTEData-class, or DOTSeqDataSets-class object.
getContrasts(object, type = c("interaction", "strategy")) ## S4 method for signature 'DOUData' getContrasts(object, type = c("interaction", "strategy")) ## S4 method for signature 'DTEData' getContrasts(object, type = c("interaction", "strategy")) ## S4 method for signature 'DOTSeqDataSets' getContrasts(object, type = c("interaction", "strategy")) getContrasts(object, type = c("interaction", "strategy")) <- value ## S4 replacement method for signature 'DOUData' getContrasts(object, type = c("interaction", "strategy")) <- value ## S4 replacement method for signature 'DTEData' getContrasts(object, type = c("interaction", "strategy")) <- valuegetContrasts(object, type = c("interaction", "strategy")) ## S4 method for signature 'DOUData' getContrasts(object, type = c("interaction", "strategy")) ## S4 method for signature 'DTEData' getContrasts(object, type = c("interaction", "strategy")) ## S4 method for signature 'DOTSeqDataSets' getContrasts(object, type = c("interaction", "strategy")) getContrasts(object, type = c("interaction", "strategy")) <- value ## S4 replacement method for signature 'DOUData' getContrasts(object, type = c("interaction", "strategy")) <- value ## S4 replacement method for signature 'DTEData' getContrasts(object, type = c("interaction", "strategy")) <- value
object |
A |
type |
A character string, either |
value |
A |
For the accessor, a DFrame or data.frame
containing contrast results. For the replacement, an updated
DOUData-class, DTEData-class or DOTSeqDataSets-class object.
# Read in count matrix, condition table, and annotation files dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) flat <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # Create a DOTSeqDataSets object d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = flat, flattened_bed = bed ) getDOU(d) getDTE(d)# Read in count matrix, condition table, and annotation files dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) flat <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # Create a DOTSeqDataSets object d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = flat, flattened_bed = bed ) getDOU(d) getDTE(d)
These methods allow access to and replacement of the DOUData-class
object stored within a DOTSeqDataSets-class container.
getDOU(object) ## S4 method for signature 'DOTSeqDataSets' getDOU(object) getDOU(object) <- value ## S4 replacement method for signature 'DOTSeqDataSets' getDOU(object) <- valuegetDOU(object) ## S4 method for signature 'DOTSeqDataSets' getDOU(object) getDOU(object) <- value ## S4 replacement method for signature 'DOTSeqDataSets' getDOU(object) <- value
object |
A |
value |
A replacement object (e.g., a |
For the accessor, a DOUData-class object. For the replacement,
an updated DOTSeqDataSets-class object.
# Read in count matrix, condition table, and annotation files dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) gtf <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # Create a DOTSeqDataSets object d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = gtf, flattened_bed = bed ) getDOU(d) getDTE(d)# Read in count matrix, condition table, and annotation files dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) gtf <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # Create a DOTSeqDataSets object d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = gtf, flattened_bed = bed ) getDOU(d) getDTE(d)
These methods allow access to and replacement of the DTEData-class
object stored within a DOTSeqDataSets-class container.
getDTE(object) ## S4 method for signature 'DOTSeqDataSets' getDTE(object) getDTE(object) <- value ## S4 replacement method for signature 'DOTSeqDataSets' getDTE(object) <- valuegetDTE(object) ## S4 method for signature 'DOTSeqDataSets' getDTE(object) getDTE(object) <- value ## S4 replacement method for signature 'DOTSeqDataSets' getDTE(object) <- value
object |
A |
value |
A replacement object (e.g., a |
For the accessor, a DTEData-class object. For the replacement,
an updated DOTSeqDataSets-class object.
# Read in count matrix, condition table, and annotation files dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) flat <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # Create a DOTSeqDataSets object d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = flat, flattened_bed = bed ) getDOU(d) getDTE(d)# Read in count matrix, condition table, and annotation files dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) flat <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # Create a DOTSeqDataSets object d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = flat, flattened_bed = bed ) getDOU(d) getDTE(d)
Filters one or more BAM files to retain reads overlapping exonic regions
defined in a TxDb object stored in the metadata of a GRanges object.
Optionally restricts filtering to coding genes only (genes with CDS).
Filtered BAMs are sorted and saved with the suffix .exonic.sorted.bam
in a user-specified or temporary directory.
getExonicReads( gr, seqlevels_style = "UCSC", bam_files, bam_output_dir = tempdir(), coding_genes_only = TRUE, verbose = TRUE )getExonicReads( gr, seqlevels_style = "UCSC", bam_files, bam_output_dir = tempdir(), coding_genes_only = TRUE, verbose = TRUE )
gr |
A |
seqlevels_style |
Character; the naming style for chromosome
identifiers (e.g., |
bam_files |
A character vector of paths to BAM files to be filtered. |
bam_output_dir |
A writable directory where filtered BAM files will be
saved. Defaults to |
coding_genes_only |
Logical; if |
verbose |
Logical; if |
The function uses Rsamtools::filterBam() to extract reads overlapping
exonic regions and Rsamtools::sortBam() to sort the filtered BAM.
The output files are named based on the input BAM file with the suffix
.exonic.sorted.bam.
This function is called for its side effect of creating filtered and
sorted BAM files in bam_output_dir. It returns NULL invisibly.
library(TxDb.Dmelanogaster.UCSC.dm3.ensGene) library(pasillaBamSubset) library(AnnotationDbi) library(GenomeInfoDb) library(withr) # Save a subset of TxDb as an SQLite file txdb_chr4 <- keepSeqlevels( TxDb.Dmelanogaster.UCSC.dm3.ensGene, "chr4", pruning.mode = "coarse" ) txdb_path <- file.path(tempdir(), "dm3_chr4.sqlite") saveDb(txdb_chr4, file = txdb_path) # Create a GRanges object with a link to the TxDb SQLite file gr <- GRanges(seqnames = "chr4", ranges = IRanges(start = 233, end = 2300)) metadata(gr)$txdb <- txdb_path # Filter BAM file and save output in a temporary directory temp_dir <- tempdir() getExonicReads(gr, bam_files = untreated1_chr4(), bam_output_dir = temp_dir ) # Clean up withr::defer(unlink(txdb_path)) withr::defer(unlink(list.files(temp_dir, pattern = "exonic", full.names = TRUE)))library(TxDb.Dmelanogaster.UCSC.dm3.ensGene) library(pasillaBamSubset) library(AnnotationDbi) library(GenomeInfoDb) library(withr) # Save a subset of TxDb as an SQLite file txdb_chr4 <- keepSeqlevels( TxDb.Dmelanogaster.UCSC.dm3.ensGene, "chr4", pruning.mode = "coarse" ) txdb_path <- file.path(tempdir(), "dm3_chr4.sqlite") saveDb(txdb_chr4, file = txdb_path) # Create a GRanges object with a link to the TxDb SQLite file gr <- GRanges(seqnames = "chr4", ranges = IRanges(start = 233, end = 2300)) metadata(gr)$txdb <- txdb_path # Filter BAM file and save output in a temporary directory temp_dir <- tempdir() getExonicReads(gr, bam_files = untreated1_chr4(), bam_output_dir = temp_dir ) # Clean up withr::defer(unlink(txdb_path)) withr::defer(unlink(list.files(temp_dir, pattern = "exonic", full.names = TRUE)))
Identifies open reading frames (ORFs) from transcript sequences and maps
them to genomic coordinates using a GTF/GFF annotation file or a TxDb
object. Supports input sequences as a FASTA file, a DNAStringSet,
or a BSgenome object. Classifies small ORFs (sORFs) as upstream
(uORF), downstream (dORF), or overlapping (oORF) relative to the main
ORFs (mORFs).
getORFs( sequences, annotation, txdb_output_dir = NULL, organism = "Homo sapiens", require_ids = c("hgnc_id", "protein_id", "ccdsid"), source_filter = NULL, circ_seqs = NULL, start_codons = "ATG", stop_codons = "TAA|TAG|TGA", min_len = 0, longest_orf = TRUE, verbose = TRUE )getORFs( sequences, annotation, txdb_output_dir = NULL, organism = "Homo sapiens", require_ids = c("hgnc_id", "protein_id", "ccdsid"), source_filter = NULL, circ_seqs = NULL, start_codons = "ATG", stop_codons = "TAA|TAG|TGA", min_len = 0, longest_orf = TRUE, verbose = TRUE )
sequences |
Transcript sequences. Can be a character string (path
to a FASTA file),
a |
annotation |
Transcript annotation. Can be a character string (path
to a GTF or GFF file) or a |
txdb_output_dir |
TxDb output directory. The TxDb file path is
linked to the GRanges returned in the metadata slot. Default: |
organism |
Character string specifying the organism name (used
only when building a TxDb from a GTF/GFF file). Default is
|
require_ids |
Character vector. Metadata column names that must
be present and non-NA. Default: |
source_filter |
Character or NULL. If provided, filters
transcripts by the 'source' column. Default: |
circ_seqs |
Character vector of circular sequences to exclude
(e.g., |
start_codons |
Character vector of start codons to search for
(e.g., |
stop_codons |
Character string of stop codons separated by
|
min_len |
Integer specifying the minimum ORF length in bases.
Default is |
longest_orf |
Logical. If |
verbose |
Logical. If |
ORFs are identified in transcript space using findORFsFasta().
Coordinates are mapped to the genome using mapFromTranscripts()
and exon annotations.
Main ORFs are defined by overlap with annotated CDS regions.
Small ORFs are classified relative to mORFs based on strand-aware genomic position.
A GRanges object containing genomic coordinates of
ORFs, with metadata columns gene_id and orf_type.
Main ORFs are labeled as "mORF", and small ORFs are
classified as "uORF", "dORF", or "oORF".
Lawrence, M., Huber, W., Pagès, H., Aboyoun, P., Carlson, M., Gentleman, R., Morgan, M., Carey, V. (2013). Software for Computing and Annotating Genomic Ranges. PLoS Computational Biology, 9. DOI: 10.1371/journal.pcbi.1003118
Tjeldnes, H., Labun, K., Torres Cleuren, Y. et al. ORFik: a comprehensive R toolkit for the analysis of translation. BMC Bioinformatics 22, 336 (2021). DOI: 10.1186/s12859-021-04254-w
library(BSgenome.Hsapiens.UCSC.hg38) library(TxDb.Hsapiens.UCSC.hg38.knownGene) library(GenomicFeatures) # Load genome and TxDb genome <- BSgenome.Hsapiens.UCSC.hg38 txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene # Get exons grouped by transcript exons_by_tx <- exonsBy(txdb, by = "tx", use.names = TRUE) # Select a single transcript for demonstration tx1 <- head(exons_by_tx, 100) # Extract transcript sequence tx_seqs <- extractTranscriptSeqs(genome, tx1) # Run getORFs on the transcript sequence txdb_output_dir <- tempdir() gr <- getORFs( sequences = tx_seqs, annotation = txdb, txdb_output_dir = txdb_output_dir ) print(gr) # Clean up sqlite_files <- list.files(txdb_output_dir, pattern = "\\.sqlite$", full.names = TRUE) unlink(sqlite_files)library(BSgenome.Hsapiens.UCSC.hg38) library(TxDb.Hsapiens.UCSC.hg38.knownGene) library(GenomicFeatures) # Load genome and TxDb genome <- BSgenome.Hsapiens.UCSC.hg38 txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene # Get exons grouped by transcript exons_by_tx <- exonsBy(txdb, by = "tx", use.names = TRUE) # Select a single transcript for demonstration tx1 <- head(exons_by_tx, 100) # Extract transcript sequence tx_seqs <- extractTranscriptSeqs(genome, tx1) # Run getORFs on the transcript sequence txdb_output_dir <- tempdir() gr <- getORFs( sequences = tx_seqs, annotation = txdb, txdb_output_dir = txdb_output_dir ) print(gr) # Clean up sqlite_files <- list.files(txdb_output_dir, pattern = "\\.sqlite$", full.names = TRUE) unlink(sqlite_files)
Queries Ensembl or Ensembl Genomes BioMart databases to retrieve gene symbols, descriptions, and optionally Gene Ontology (GO) terms. Supports multiple organism groups including vertebrates, plants, fungi, protists, metazoa, and bacteria.
If the specified symbol_col returns only NA values,
the function automatically falls back to using the
description field instead.
mapIDs( ensembl_ids, dataset, symbol_col = "external_gene_name", include_go = FALSE, mart_source = "ensembl", host = NULL )mapIDs( ensembl_ids, dataset, symbol_col = "external_gene_name", include_go = FALSE, mart_source = "ensembl", host = NULL )
ensembl_ids |
A character vector of Ensembl gene IDs to query. |
dataset |
A string specifying the dataset name (e.g.,
|
symbol_col |
A string specifying the attribute to use as the
gene symbol. Common options include |
include_go |
Logical; if |
mart_source |
A string indicating the BioMart source. One of
|
host |
Optional. A custom host URL (e.g., for archived Ensembl versions). |
A data frame containing gene symbols for the input Ensembl
IDs. If symbol_col is unavailable, the description
field is used instead and renamed to match symbol_col.
Durinck S, Spellman P, Birney E, Huber W (2009). Mapping identifiers for the integration of genomic datasets with the R/Bioconductor package biomaRt. Nature Protocols, 4, 1184–1191. DOI: 10.1038/nprot.2009.97
Durinck S, Moreau Y, Kasprzyk A, Davis S, De Moor B, Brazma A, Huber W (2005). BioMart and Bioconductor: a powerful link between biological databases and microarray data analysis. Bioinformatics, 21, 3439–3440. DOI: 10.1093/bioinformatics/bti525
# Ping Ensembl REST to avoid timeouts when the service is down. is_ensembl_up <- function(timeout = 3) { url <- "https://rest.ensembl.org/info/ping" out <- try( curl::curl_fetch_memory(url, handle = curl::new_handle(timeout = timeout)), silent = TRUE ) is.list(out) && out$status_code >= 200 && out$status_code < 500 } if (is_ensembl_up()) { # Human gene example res <- mapIDs( ensembl_ids = c("ENSG00000139618"), dataset = "hsapiens_gene_ensembl", mart_source = "ensembl" ) head(res) # Arabidopsis example (uncomment to try) # res <- mapIDs( # ensembl_ids = c("AT1G01010"), # dataset = "athaliana_eg_gene", # symbol_col = "tair_symbol", # mart_source = "plants" # ) # head(res) # Plasmodium falciparum example (uncomment to try) # res <- mapIDs( # ensembl_ids = c("PF3D7_0100100"), # dataset = "pfalciparum_eg_gene", # mart_source = "protists" # ) # head(res) } else { message("Ensembl appears unavailable; skipping online example.") }# Ping Ensembl REST to avoid timeouts when the service is down. is_ensembl_up <- function(timeout = 3) { url <- "https://rest.ensembl.org/info/ping" out <- try( curl::curl_fetch_memory(url, handle = curl::new_handle(timeout = timeout)), silent = TRUE ) is.list(out) && out$status_code >= 200 && out$status_code < 500 } if (is_ensembl_up()) { # Human gene example res <- mapIDs( ensembl_ids = c("ENSG00000139618"), dataset = "hsapiens_gene_ensembl", mart_source = "ensembl" ) head(res) # Arabidopsis example (uncomment to try) # res <- mapIDs( # ensembl_ids = c("AT1G01010"), # dataset = "athaliana_eg_gene", # symbol_col = "tair_symbol", # mart_source = "plants" # ) # head(res) # Plasmodium falciparum example (uncomment to try) # res <- mapIDs( # ensembl_ids = c("PF3D7_0100100"), # dataset = "pfalciparum_eg_gene", # mart_source = "protists" # ) # head(res) } else { message("Ensembl appears unavailable; skipping online example.") }
Retrieves the model type string from a PostHoc object.
Retrieves model results, parameters, and diagnostics.
Retrieves the post hoc summary object (e.g. from emmeans).
modelType(object) ## S4 method for signature 'PostHoc' modelType(object) fitResults(object) ## S4 method for signature 'PostHoc' fitResults(object) posthoc(object) ## S4 method for signature 'PostHoc' posthoc(object)modelType(object) ## S4 method for signature 'PostHoc' modelType(object) fitResults(object) ## S4 method for signature 'PostHoc' fitResults(object) posthoc(object) ## S4 method for signature 'PostHoc' posthoc(object)
object |
A |
A character(1) string indicating the model type.
A list containing model results and diagnostics.
A post hoc summary object.
modelType(PostHoc): Access the model type from a PostHoc object.
fitResults(PostHoc): Access the results list.
posthoc(PostHoc): Access the post hoc summary.
ph <- PostHoc(type = "glmmTMB") modelType(ph) ph <- PostHoc(results = list(aic = 100)) fitResults(ph) ph <- PostHoc(posthoc = "dummy_emmeans") posthoc(ph)ph <- PostHoc(type = "glmmTMB") modelType(ph) ph <- PostHoc(results = list(aic = 100)) fitResults(ph) ph <- PostHoc(posthoc = "dummy_emmeans") posthoc(ph)
Can also be used generaly to get number of GRanges object per GRangesList group
numExonsPerGroup(grl, keep.names = TRUE)numExonsPerGroup(grl, keep.names = TRUE)
grl |
|
keep.names |
a logical, keep names or not, default: (TRUE) |
an integer vector of counts
Haakon Tjeldnes et al.
Tjeldnes, H., Labun, K., Torres Cleuren, Y. et al. ORFik: a comprehensive R toolkit for the analysis of translation. BMC Bioinformatics 22, 336 (2021). DOI: 10.1186/s12859-021-04254-w
Generates a suite of visualizations to explore Differential ORF Usage
(DOU) and Translation Efficiency (DTE) results. Supports volcano
plots, Venn diagrams, composite scatter plots, heatmaps, and usage
plots. Integrates Ensembl gene symbols and highlights significant
ORFs based on empirical Bayes shrinkage (via the ashr package's
ash function).
plotDOT( plot_type = "volcano", results = NULL, data = NULL, id_mapping = FALSE, include_go = FALSE, gene_id = NULL, dou_params = list(est_col = "posterior", est_thresh = 1, signif_col = "lfsr", signif_thresh = 0.05, signif_ceil = 10, extreme_thresh = NULL), dte_params = list(est_col = "log2FoldChange", signif_col = "padj", signif_thresh = 0.05), plot_params = list(top_hits = 20, color_by = "significance", rank_by = "significance", legend_position = "topright", order_by = NULL, flip_sign = FALSE), annotation_params = list(sorf_type = "uORF", dataset = "hsapiens_gene_ensembl", symbol_col = "hgnc_symbol", mart_source = "ensembl"), colors = list(dte = adjustcolor("#0072B2", alpha.f = 0.6), dou = adjustcolor("#E69F00", alpha.f = 0.6), both = adjustcolor("#CC79A7", alpha.f = 0.6), none = adjustcolor("grey80", alpha.f = 0.6), uorf = adjustcolor("#D73027", alpha.f = 0.6), morf = adjustcolor("#4575B4", alpha.f = 0.6), dorf = adjustcolor("#A6A6A6", alpha.f = 0.6), low = "blue", middle = "white", high = "red", usage = "Set2"), force_new_device = TRUE, verbose = TRUE )plotDOT( plot_type = "volcano", results = NULL, data = NULL, id_mapping = FALSE, include_go = FALSE, gene_id = NULL, dou_params = list(est_col = "posterior", est_thresh = 1, signif_col = "lfsr", signif_thresh = 0.05, signif_ceil = 10, extreme_thresh = NULL), dte_params = list(est_col = "log2FoldChange", signif_col = "padj", signif_thresh = 0.05), plot_params = list(top_hits = 20, color_by = "significance", rank_by = "significance", legend_position = "topright", order_by = NULL, flip_sign = FALSE), annotation_params = list(sorf_type = "uORF", dataset = "hsapiens_gene_ensembl", symbol_col = "hgnc_symbol", mart_source = "ensembl"), colors = list(dte = adjustcolor("#0072B2", alpha.f = 0.6), dou = adjustcolor("#E69F00", alpha.f = 0.6), both = adjustcolor("#CC79A7", alpha.f = 0.6), none = adjustcolor("grey80", alpha.f = 0.6), uorf = adjustcolor("#D73027", alpha.f = 0.6), morf = adjustcolor("#4575B4", alpha.f = 0.6), dorf = adjustcolor("#A6A6A6", alpha.f = 0.6), low = "blue", middle = "white", high = "red", usage = "Set2"), force_new_device = TRUE, verbose = TRUE )
plot_type |
Type of plot to generate. Options include
|
results |
A data frame containing DOU and DTE estimates and
significance values. Required for all plots except |
data |
A |
id_mapping |
Optional input controlling gene symbol annotation. Can be one of:
Used in volcano and heatmap plots to annotate genes with symbols.
If |
include_go |
Logical; if |
gene_id |
Character string specifying the gene ID for usage
plots. Default is |
dou_params |
A named list of parameters for DOU filtering and display:
|
dte_params |
A named list of parameters for DTE filtering:
|
plot_params |
A named list controlling labeling of top hits:
|
annotation_params |
A named list for BioMart annotation:
|
colors |
A named list of colors used across plots:
|
force_new_device |
Logical; if |
verbose |
Logical; if |
This function orchestrates multiple visualization components to
explore differential translation across ORFs. It uses
testDOU output to identify significant ORFs,
retrieves gene symbols via getBM, and generates
plots to summarize DOU and DTE relationships. The composite scatter
plot includes marginal distributions by ORF type, helping to
visualize the overlap and divergence between DTE and DOU signals.
The volcano plot highlights extreme and top-ranked ORFs, while
the heatmap summarizes DOU across top genes.
A data frame containing gene symbols retrieved from Ensembl, used for labeling and heatmap visualization.
Durinck S, Spellman P, Birney E, Huber W (2009). Mapping identifiers for the integration of genomic datasets with the R/Bioconductor package biomaRt. Nature Protocols, 4, 1184–1191. DOI: 10.1038/nprot.2009.97
Durinck S, Moreau Y, Kasprzyk A, Davis S, De Moor B, Brazma A, Huber W (2005). BioMart and Bioconductor: a powerful link between biological databases and microarray data analysis. Bioinformatics, 21, 3439–3440. DOI: 10.1093/bioinformatics/bti525
Larsson J, Gustafsson P (2018). “A Case Study in Fitting Area-Proportional Euler Diagrams with Ellipses Using eulerr.” In Proceedings of International Workshop on Set Visualization and Reasoning, volume 2116, 84–91. https://ceur-ws.org/Vol-2116/paper7.pdf
# Example ORF-level results results_df <- data.frame( orf_id = c( "ENSG00000139618.19:O001", "ENSG00000139618.19:O002", "ENSG00000157764.15:O003" ), lfsr = c(0.01, 0.2, 0.03), padj = c(0.02, 0.01, 0.1) ) plotDOT(plot_type = "venn", results = results_df)# Example ORF-level results results_df <- data.frame( orf_id = c( "ENSG00000139618.19:O001", "ENSG00000139618.19:O002", "ENSG00000157764.15:O003" ), lfsr = c(0.01, 0.2, 0.03), padj = c(0.02, 0.01, 0.1) ) plotDOT(plot_type = "venn", results = results_df)
This function constructs a new PostHoc object, which stores the
results of a statistical model fitted to ORF-level data. It is typically
used internally by the DOTSeq workflow or manually for testing and
diagnostics.
PostHoc(type = "fitError", results = list(), posthoc = NA_real_)PostHoc(type = "fitError", results = list(), posthoc = NA_real_)
type |
A |
results |
A |
posthoc |
An optional object storing post hoc summary objects
(e.g., from |
A PostHoc S4 object.
## Create a dummy PostHoc object PostHocRes <- PostHoc( type = "glmmTMB", results = list(x = 3, y = 7, b = 4) ) PostHocRes## Create a dummy PostHoc object PostHocRes <- PostHoc( type = "glmmTMB", results = list(x = 3, y = 7, b = 4) ) PostHocRes
The PostHoc class represents post hoc summaries derived from a
beta-binomial GLM/GLMM fitted to ORF-level data. It is also used to store
diagnostics and metadata for each ORF.
Objects of this class are typically created by the user-level function
fitDOU(), or manually using the PostHoc() constructor.
In the DOTSeq workflow, each ORF is assigned a PostHoc
object, which is stored in a DataFrame and embedded in the
rowData slot of a SummarizedExperiment.
typeA character(1) string indicating the model type.
Default is "fitError". If the model is successfully fitted,
the type is typically "glmmTMB".
resultsA list containing model results, parameters, and
test statistics.
posthocAn object of class ANY storing post hoc summary
objects (e.g., from emmeans).
## Create a dummy PostHoc object PostHocRes <- PostHoc( type = "glmmTMB", results = list( model_fit = list(aic = 96.03), tests = list(pvalue_best = 0.355) ) ) PostHocRes## Create a dummy PostHoc object PostHocRes <- PostHoc( type = "glmmTMB", results = list( model_fit = list(aic = 96.03), tests = list(pvalue_best = 0.355) ) ) PostHocRes
This function takes an R formula object that may contain random effect terms and returns a new formula with all such terms removed.
remove_random_effects(formula)remove_random_effects(formula)
formula |
An R formula object. Random effect terms are
identified by the pattern |
A new R formula object containing only the fixed effect terms.
Simulates ribosome profiling and matched RNA-seq count matrices with specified differential ORF translation (DOT) effects. The simulation can include batch effects and supports multiple experimental conditions and replicates.
simDOT( ribo, rna, annotation = NULL, regulation_type = NULL, te_genes = 10, bgenes = 10, num_samples = 2, conditions = 2, gcoeff = 1.5, bcoeff = 0.9, num_batches = 2, size_factor = NULL, min_size = NULL, scale_p0 = NULL, shape = 0.6, scale = 0.5, batch_scenario = "balanced", diagplot_ribo = FALSE, diagplot_rna = FALSE )simDOT( ribo, rna, annotation = NULL, regulation_type = NULL, te_genes = 10, bgenes = 10, num_samples = 2, conditions = 2, gcoeff = 1.5, bcoeff = 0.9, num_batches = 2, size_factor = NULL, min_size = NULL, scale_p0 = NULL, shape = 0.6, scale = 0.5, batch_scenario = "balanced", diagplot_ribo = FALSE, diagplot_rna = FALSE )
ribo |
A matrix or data frame of ribosome profiling counts (genes x samples). |
rna |
A matrix or data frame of RNA-seq counts (genes x samples). |
annotation |
A GRanges object with ORF level annotation,
typically obtained from |
regulation_type |
Character. Specifies the type of DOT effect to
simulate. Passed to the |
te_genes |
Numeric. Percentage of genes to be assigned as differentially translated (default: 10). |
bgenes |
Numeric. Percentage of genes to carry a batch effect (default: 10). |
num_samples |
Integer. Number of biological replicates per condition (default: 2). |
conditions |
Integer. Number of experimental conditions (default: 2). |
gcoeff |
Numeric. Magnitude of log-fold change for DOT effects (default: 1.5). |
bcoeff |
Numeric. Magnitude of batch effect coefficient (default: 0.9). |
num_batches |
Integer. Number of batches (default: 2). |
size_factor |
Numeric scalar. A multiplicative factor applied to the
estimated size parameter ( |
min_size |
Numeric scalar. A lower bound for the modified size
parameter ( |
scale_p0 |
Optional numeric scalar to scale the zero-inflation probabilities. |
shape |
Numeric. Shape parameter for gamma distribution used to simulate baseline coefficients (default: 0.6). |
scale |
Numeric. Scale parameter for gamma distribution used to simulate baseline coefficients (default: 0.5). |
batch_scenario |
Character. Specifies the batch effect design. Must be one of:
|
diagplot_ribo |
Logical. If |
diagplot_rna |
Logical. If |
A DOTSeqDataSets-class object containing:
A DOUData-class object containing simulated count
matrix (assay slot), sample metadata (colData slot), and ORF-level
annotation (rowRanges slot). The rowRanges slot also stores
labels (named binary vector indicating true positive (1)), and
logFC (log-fold changes for the simulated DOU effect) for modeling
Differential ORF Usage (DOU).
A DTEData-class object used for modeling
Differential Translation Efficiency (DTE). Stores all data above
except for rowRanges
Frazee, A. C., Jaffe, A. E., Langmead, B., & Leek, J. T. (2015). Polyester: Simulating RNA-seq datasets with differential transcript expression. Bioinformatics, 31(17), 2778-2784. DOI: 10.1093/bioinformatics/btv272
Chothani, S., Adami, E., Ouyang, J. F., Viswanathan, S., Hubner, N., Cook, S. A., Schafer, S., Rackham, O. J. L. (2019). deltaTE: Detection of translationally regulated genes by integrative analysis of Ribo-seq and RNA-seq data. Current Protocols in Molecular Biology, 129, e108. DOI: 10.1002/cpmb.108
library(SummarizedExperiment) dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table(file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) flat <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = flat, flattened_bed = bed ) raw_counts <- assay(getDOU(d)) raw_counts <- raw_counts[, grep("Cycling|Interphase", colnames(raw_counts))] ribo <- raw_counts[, grep("ribo", colnames(raw_counts))] rna <- raw_counts[, grep("rna", colnames(raw_counts))] rowranges <- rowRanges(getDOU(d)) r <- "uORF_up_mORF_down" g <- 1.5 d <- simDOT( ribo, rna, annotation = rowranges, regulation_type = r, gcoeff = g, num_samples = 1, num_batches = 2 ) show(d) rowData(getDOU(d))library(SummarizedExperiment) dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table(file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) flat <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = flat, flattened_bed = bed ) raw_counts <- assay(getDOU(d)) raw_counts <- raw_counts[, grep("Cycling|Interphase", colnames(raw_counts))] ribo <- raw_counts[, grep("ribo", colnames(raw_counts))] rna <- raw_counts[, grep("rna", colnames(raw_counts))] rowranges <- rowRanges(getDOU(d)) r <- "uORF_up_mORF_down" g <- 1.5 d <- simDOT( ribo, rna, annotation = rowranges, regulation_type = r, gcoeff = g, num_samples = 1, num_batches = 2 ) show(d) rowData(getDOU(d))
Performs Differential ORF Usage (DOU) analysis by computing
contrasts between ribosome profiling and RNA-seq modalities
using estimated marginal means (EMMs) from fitted models.
Supports interaction-specific and strategy-specific contrasts.
Applies empirical Bayes shrinkage via ash to
stabilize effect size estimates.
testDOU( data, contrasts_method = "revpairwise", nullweight = 500, verbose = TRUE )testDOU( data, contrasts_method = "revpairwise", nullweight = 500, verbose = TRUE )
data |
A |
contrasts_method |
Character string specifying the method
for computing contrasts. Default is |
nullweight |
Numeric. Prior weight on the null hypothesis
for empirical Bayes shrinkage. Higher values yield more
conservative lfsr estimates. Default is |
verbose |
Logical. If |
Results for post hoc contrasts are stored in long format using
explicit contrast and/or strategy columns.
Non-converged models are omitted.
A DOUData-class object with two new
S4Vectors::DataFrame slots. These tables contain
long-format results for all computed contrasts:
interactionA long-format S4Vectors::DataFrame containing DOU
effect sizes (log-odds of Ribo-seq minus RNA-seq) for all
interaction-specific contrasts. Columns include:
contrast, betahat (raw log-odds effect size),
sebetahat (standard error), waldpval
(Wald test p-value), waldpadj
(adjusted p-value), posterior (posterior mean from
shrinkage), and lfsr (Local False Sign Rate).
strategyA long-format S4Vectors::DataFrame containing
strategy-specific effect sizes (e.g., Ribo-seq only) for all
computed contrasts. Columns include:
strategy, contrast, and the same shrunken and
unshrunken metrics as above.
Lenth R, Piaskowski J (2025). emmeans: Estimated Marginal Means, aka Least-Squares Means. R package version 2.0.0, https://rvlenth.github.io/emmeans/
Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2. DOI: 10.1093/biostatistics/kxw041
DOTSeq, DOTSeqDataSets-class,
fitDOU, plotDOT
# Load SummarizedExperiment to enable subsetting and access to # components like rowRanges and rowData library(SummarizedExperiment) # Read in count matrix, condition table, and annotation files dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) gtf <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") # extract only samples processed using cyclohexamide cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # remove the treatment column # Create a DOTSeqDataSets object d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = gtf, flattened_bed = bed ) # Keep ORFs where all replicates in at least one condition pass min_count # Single-ORF genes are removed dou <- getDOU(d) dou <- dou[rowRanges(dou)$is_kept == TRUE, ] # Randomly sample 100 ORFs for fitDOU set.seed(42) random_rows <- sample(seq_len(nrow(dou)), size = 100) dou <- dou[random_rows, ] # Model fitting using fitDOU rowData(dou)[["DOUResults"]] <- fitDOU( data = dou, formula = ~ condition * strategy, specs = ~ condition * strategy, dispersion_modeling = "auto", lrt = FALSE, optimizers = FALSE, diagnostic = FALSE, parallel = list(n = 4L, autopar = TRUE), verbose = TRUE ) # Run post hoc contrasts, Wald tests, and effect size shrinkage dou <- testDOU(dou, verbose = TRUE)# Load SummarizedExperiment to enable subsetting and access to # components like rowRanges and rowData library(SummarizedExperiment) # Read in count matrix, condition table, and annotation files dir <- system.file("extdata", package = "DOTSeq") cnt <- read.table( file.path(dir, "featureCounts.cell_cycle_subset.txt.gz"), header = TRUE, comment.char = "#" ) names(cnt) <- gsub(".*(SRR[0-9]+).*", "\\1", names(cnt)) gtf <- file.path(dir, "gencode.v47.orf_flattened_subset.gtf.gz") bed <- file.path(dir, "gencode.v47.orf_flattened_subset.bed.gz") meta <- read.table(file.path(dir, "metadata.txt.gz")) names(meta) <- c("run", "strategy", "replicate", "treatment", "condition") # extract only samples processed using cyclohexamide cond <- meta[meta$treatment == "chx", ] cond$treatment <- NULL # remove the treatment column # Create a DOTSeqDataSets object d <- DOTSeqDataSetsFromFeatureCounts( count_table = cnt, condition_table = cond, flattened_gtf = gtf, flattened_bed = bed ) # Keep ORFs where all replicates in at least one condition pass min_count # Single-ORF genes are removed dou <- getDOU(d) dou <- dou[rowRanges(dou)$is_kept == TRUE, ] # Randomly sample 100 ORFs for fitDOU set.seed(42) random_rows <- sample(seq_len(nrow(dou)), size = 100) dou <- dou[random_rows, ] # Model fitting using fitDOU rowData(dou)[["DOUResults"]] <- fitDOU( data = dou, formula = ~ condition * strategy, specs = ~ condition * strategy, dispersion_modeling = "auto", lrt = FALSE, optimizers = FALSE, diagnostic = FALSE, parallel = list(n = 4L, autopar = TRUE), verbose = TRUE ) # Run post hoc contrasts, Wald tests, and effect size shrinkage dou <- testDOU(dou, verbose = TRUE)