| Title: | Simulation of Tumor Clonal Evolution with Realistic Sequencing Noise |
|---|---|
| Description: | ClonalSim generates realistic mutational profiles of tumor samples with hierarchical clonal structure. It simulates founder, shared, and private mutations with biologically realistic noise models including intra-tumor heterogeneity (Beta distribution) and technical sequencing noise (negative binomial depth variation, binomial read sampling, base errors). The package is designed for benchmarking variant callers, testing clonal deconvolution algorithms, and teaching tumor heterogeneity concepts. |
| Authors: | Gabriele Bucci [aut, cre] (ORCID: <https://orcid.org/0000-0001-9838-7204>) |
| Maintainer: | Gabriele Bucci <[email protected]> |
| License: | MIT + file LICENSE |
| Version: | 1.1.0 |
| Built: | 2026-05-29 08:18:30 UTC |
| Source: | https://github.com/bioc/ClonalSim |
Simulates intra-tumor heterogeneity using a Beta distribution. The Beta distribution is more realistic than Gaussian for frequency data as it is naturally bounded between 0 and 1.
applyBiologicalNoise(true_freq, n_mutations, concentration = 50)applyBiologicalNoise(true_freq, n_mutations, concentration = 50)
true_freq |
numeric, the true allele frequency (0-1) |
n_mutations |
integer, number of mutations to generate |
concentration |
numeric, concentration parameter for Beta distribution (alpha + beta). Higher values = less variability. Default: 50 |
For a clone with true frequency f, VAF values are sampled from
Beta(alpha, beta) where:
alpha = f * concentration
beta = (1-f) * concentration
Higher concentration values result in VAF closer to the expected frequency, while lower values increase spread, simulating spatial/temporal heterogeneity.
numeric vector of VAF values with biological noise
# Generate 100 mutations with true frequency 0.3 and moderate heterogeneity vaf <- applyBiologicalNoise(0.3, 100, concentration = 50) hist(vaf, main = "VAF with biological noise", xlab = "VAF") # High heterogeneity (low concentration) vaf_high_het <- applyBiologicalNoise(0.3, 100, concentration = 20) # Low heterogeneity (high concentration) vaf_low_het <- applyBiologicalNoise(0.3, 100, concentration = 100)# Generate 100 mutations with true frequency 0.3 and moderate heterogeneity vaf <- applyBiologicalNoise(0.3, 100, concentration = 50) hist(vaf, main = "VAF with biological noise", xlab = "VAF") # High heterogeneity (low concentration) vaf_high_het <- applyBiologicalNoise(0.3, 100, concentration = 20) # Low heterogeneity (high concentration) vaf_low_het <- applyBiologicalNoise(0.3, 100, concentration = 100)
Constructor for ClonalSimData
ClonalSimData( mutations = data.frame(), params = list(), clonal_structure = data.frame(), metadata = list() )ClonalSimData( mutations = data.frame(), params = list(), clonal_structure = data.frame(), metadata = list() )
mutations |
data.frame with mutation data |
params |
list with simulation parameters |
clonal_structure |
data.frame with clone hierarchy |
metadata |
list with additional metadata |
ClonalSimData object
# Typically created by simulateTumor(), but can be constructed manually mutations <- data.frame( Mutation = "mut1", Chromosome = "chr1", Position = 1000000, Ref = "A", Alt = "T", True_VAF = 0.5, VAF = 0.48, Depth = 100, Alt_reads = 48, Clone = "Clone1", Type = "private" ) params <- list(subclone_freqs = c(0.5)) sim_data <- ClonalSimData(mutations = mutations, params = params)# Typically created by simulateTumor(), but can be constructed manually mutations <- data.frame( Mutation = "mut1", Chromosome = "chr1", Position = 1000000, Ref = "A", Alt = "T", True_VAF = 0.5, VAF = 0.48, Depth = 100, Alt_reads = 48, Clone = "Clone1", Type = "private" ) params <- list(subclone_freqs = c(0.5)) sim_data <- ClonalSimData(mutations = mutations, params = params)
S4 class to store results from tumor clonal evolution simulation
object |
ClonalSimData object |
TRUE if valid, otherwise error message
mutationsdata.frame containing mutation information with columns: Mutation, Chromosome, Position, Ref, Alt, True_VAF, VAF, Depth, Alt_reads, Clone, Type, Clone_IDs
paramslist containing simulation parameters: subclone_freqs, n_mut_per_clone, n_mut_founder, n_mut_shared, biological_noise, sequencing_noise
clonal_structuredata.frame defining hierarchical clone relationships
metadatalist containing additional metadata (date, version, seed)
# Create a simple simulation sim <- simulateTumor( subclone_freqs = c(0.3, 0.4, 0.3), n_mut_per_clone = c(30, 40, 30) ) # Access mutations head(getMutations(sim)) # Access parameters getSimParams(sim)# Create a simple simulation sim <- simulateTumor( subclone_freqs = c(0.3, 0.4, 0.3), n_mut_per_clone = c(30, 40, 30) ) # Access mutations head(getMutations(sim)) # Access parameters getSimParams(sim)
Get clonal structure
getClonalStructure(object)getClonalStructure(object)
object |
ClonalSimData object |
data.frame defining clone hierarchy
sim <- simulateTumor() structure <- getClonalStructure(sim) print(structure)sim <- simulateTumor() structure <- getClonalStructure(sim) print(structure)
Get metadata
getMetadata(object)getMetadata(object)
object |
ClonalSimData object |
list with metadata
sim <- simulateTumor() metadata <- getMetadata(sim) metadata$datesim <- simulateTumor() metadata <- getMetadata(sim) metadata$date
Get mutations data frame
getMutations(object)getMutations(object)
object |
ClonalSimData object |
data.frame with mutation information
sim <- simulateTumor() mutations <- getMutations(sim) head(mutations)sim <- simulateTumor() mutations <- getMutations(sim) head(mutations)
Get observed VAF values (with sequencing noise)
getObservedVAF(object)getObservedVAF(object)
object |
ClonalSimData object |
numeric vector of observed VAF values
sim <- simulateTumor() obs_vaf <- getObservedVAF(sim) summary(obs_vaf)sim <- simulateTumor() obs_vaf <- getObservedVAF(sim) summary(obs_vaf)
Get simulation parameters
getSimParams(object)getSimParams(object)
object |
ClonalSimData object |
list with simulation parameters
sim <- simulateTumor() params <- getSimParams(sim) params$subclone_freqssim <- simulateTumor() params <- getSimParams(sim) params$subclone_freqs
Get true VAF values (biological, without sequencing noise)
getTrueVAF(object)getTrueVAF(object)
object |
ClonalSimData object |
numeric vector of true VAF values
sim <- simulateTumor() true_vaf <- getTrueVAF(sim) summary(true_vaf)sim <- simulateTumor() true_vaf <- getTrueVAF(sim) summary(true_vaf)
Plot method for ClonalSimData
## S4 method for signature 'ClonalSimData,missing' plot(x, y, type = "vaf_density", ...)## S4 method for signature 'ClonalSimData,missing' plot(x, y, type = "vaf_density", ...)
x |
ClonalSimData object |
y |
unused (for S4 compatibility) |
type |
character indicating plot type: "vaf_density", "vaf_scatter", "depth_histogram", "clone_matrix". Default is "vaf_density" |
... |
additional arguments passed to plotting functions |
Note: You need to load ggplot2 explicitly before using plot():
library(ggplot2)
ggplot2 object
library(ggplot2) sim <- simulateTumor() plot(sim, type = "vaf_density") plot(sim, type = "vaf_scatter")library(ggplot2) sim <- simulateTumor() plot(sim, type = "vaf_density") plot(sim, type = "vaf_scatter")
Print summary method
## S3 method for class 'summary.ClonalSimData' print(x, ...)## S3 method for class 'summary.ClonalSimData' print(x, ...)
x |
summary.ClonalSimData object |
... |
additional arguments (unused) |
NULL (prints to console)
Show method for ClonalSimData
## S4 method for signature 'ClonalSimData' show(object)## S4 method for signature 'ClonalSimData' show(object)
object |
ClonalSimData object |
NULL (prints to console)
sim <- simulateTumor() show(sim)sim <- simulateTumor() show(sim)
Simulates sequencing coverage with realistic overdispersion using negative binomial distribution, which better captures real NGS variability compared to Poisson.
simulateDepth( n_mutations, mean_depth = 100, distribution = "negative_binomial", dispersion = 20 )simulateDepth( n_mutations, mean_depth = 100, distribution = "negative_binomial", dispersion = 20 )
n_mutations |
integer, number of mutations |
mean_depth |
numeric, mean sequencing coverage. Default: 100 |
distribution |
character, distribution type: "negative_binomial" (realistic, default), "poisson" (simpler), or "uniform" (for testing) |
dispersion |
numeric, size parameter for negative binomial. Lower values = more variable coverage. Default: 20 |
The negative binomial distribution allows overdispersion (variance > mean), which occurs in real sequencing due to:
GC content bias
Mappability differences
PCR amplification artifacts
The dispersion parameter controls variability: lower values produce more variable coverage across positions.
integer vector of depth values
# Realistic depth with overdispersion depth_realistic <- simulateDepth(1000, mean_depth = 100, dispersion = 20) hist(depth_realistic, main = "Realistic depth", xlab = "Depth") # More variable coverage (low dispersion) depth_variable <- simulateDepth(1000, mean_depth = 100, dispersion = 10) # Uniform coverage (for testing) depth_uniform <- simulateDepth(1000, mean_depth = 100, distribution = "uniform")# Realistic depth with overdispersion depth_realistic <- simulateDepth(1000, mean_depth = 100, dispersion = 20) hist(depth_realistic, main = "Realistic depth", xlab = "Depth") # More variable coverage (low dispersion) depth_variable <- simulateDepth(1000, mean_depth = 100, dispersion = 10) # Uniform coverage (for testing) depth_uniform <- simulateDepth(1000, mean_depth = 100, distribution = "uniform")
Simulates stochastic read counts using binomial distribution, which models the independent sampling of each sequencing read.
simulateSequencingReads(true_vaf, depth, error_rate = 0.001)simulateSequencingReads(true_vaf, depth, error_rate = 0.001)
true_vaf |
numeric vector, true variant allele frequencies |
depth |
integer vector, sequencing depth at each position |
error_rate |
numeric, base miscall rate (0-1). Default: 0.001 (0.1 typical for Illumina) |
Each sequencing read is independently sampled from the DNA pool. Given a true VAF and depth D, the number of alternative reads follows a binomial distribution: alt_reads ~ Binomial(D, VAF).
This introduces natural sampling variation, with higher uncertainty at:
Low sequencing depth
Low variant frequency
The error_rate parameter simulates base miscalls from sequencer optical/ chemistry errors.
list with three components:
vaf: observed VAF values
alt_reads: alternative allele read counts
ref_reads: reference allele read counts
# Simulate reads for mutations at VAF 0.3 with depth 100 true_vaf <- rep(0.3, 100) depth <- rep(100, 100) reads <- simulateSequencingReads(true_vaf, depth) # Observed VAF varies due to sampling hist(reads$vaf, main = "Observed VAF", xlab = "VAF") abline(v = 0.3, col = "red", lty = 2) # Check alt_reads distribution hist(reads$alt_reads, main = "Alt read counts", xlab = "Alt reads")# Simulate reads for mutations at VAF 0.3 with depth 100 true_vaf <- rep(0.3, 100) depth <- rep(100, 100) reads <- simulateSequencingReads(true_vaf, depth) # Observed VAF varies due to sampling hist(reads$vaf, main = "Observed VAF", xlab = "VAF") abline(v = 0.3, col = "red", lty = 2) # Check alt_reads distribution hist(reads$alt_reads, main = "Alt read counts", xlab = "Alt reads")
Main function to simulate a heterogeneous tumor sample with hierarchical clonal structure, including realistic biological and technical sequencing noise.
simulateTumor( subclone_freqs = c(0.15, 0.25, 0.3, 0.3), n_mut_per_clone = c(20, 25, 30, 15), n_mut_founder = 10, n_mut_shared = list(`2 3 4` = 15, `3 4` = 12, `1 2` = 8), biological_noise = list(enabled = TRUE, concentration = 50), sequencing_noise = list(enabled = TRUE, mean_depth = 100, depth_variation = "negative_binomial", depth_dispersion = 20, error_rate = 0.001, binomial_sampling = TRUE), germline_variants = list(enabled = FALSE, n_variants = 50, vaf_expected = 0.5) )simulateTumor( subclone_freqs = c(0.15, 0.25, 0.3, 0.3), n_mut_per_clone = c(20, 25, 30, 15), n_mut_founder = 10, n_mut_shared = list(`2 3 4` = 15, `3 4` = 12, `1 2` = 8), biological_noise = list(enabled = TRUE, concentration = 50), sequencing_noise = list(enabled = TRUE, mean_depth = 100, depth_variation = "negative_binomial", depth_dispersion = 20, error_rate = 0.001, binomial_sampling = TRUE), germline_variants = list(enabled = FALSE, n_variants = 50, vaf_expected = 0.5) )
subclone_freqs |
numeric vector, frequencies of each subclone (must sum to <= 1). Default: c(0.15, 0.25, 0.30, 0.30) |
n_mut_per_clone |
integer vector, number of private mutations per clone. Default: c(20, 25, 30, 15) |
n_mut_founder |
integer, number of founder mutations (present in all clones). Default: 10 |
n_mut_shared |
named list, shared mutations between clone groups. Names indicate which clones (e.g., "2 3 4"), values are mutation counts. Default: list("2 3 4" = 15, "3 4" = 12, "1 2" = 8) |
biological_noise |
list with biological noise parameters:
|
sequencing_noise |
list with sequencing noise parameters:
|
germline_variants |
list with germline variant parameters:
Germline variants will have VAF adjusted by tumor purity: observed_VAF = germline_vaf * tumor_purity + 0.5 * (1 - tumor_purity) |
ClonalSimData object containing mutations, parameters, and metadata
# Basic simulation with default parameters sim <- simulateTumor() show(sim) plot(sim) # Custom clonal structure sim <- simulateTumor( subclone_freqs = c(0.2, 0.3, 0.5), n_mut_per_clone = c(30, 40, 30) ) # Low purity tumor (40% purity) sim <- simulateTumor( subclone_freqs = c(0.1, 0.15, 0.15), n_mut_per_clone = c(20, 25, 30) ) # High coverage sequencing sim <- simulateTumor( sequencing_noise = list( enabled = TRUE, mean_depth = 500, depth_dispersion = 100 ) ) # No noise (ideal data for testing) sim <- simulateTumor( biological_noise = list(enabled = FALSE), sequencing_noise = list(enabled = FALSE) ) # Include germline variants sim <- simulateTumor( subclone_freqs = c(0.3, 0.4), # 70% purity n_mut_per_clone = c(40, 50), germline_variants = list(enabled = TRUE, n_variants = 100) )# Basic simulation with default parameters sim <- simulateTumor() show(sim) plot(sim) # Custom clonal structure sim <- simulateTumor( subclone_freqs = c(0.2, 0.3, 0.5), n_mut_per_clone = c(30, 40, 30) ) # Low purity tumor (40% purity) sim <- simulateTumor( subclone_freqs = c(0.1, 0.15, 0.15), n_mut_per_clone = c(20, 25, 30) ) # High coverage sequencing sim <- simulateTumor( sequencing_noise = list( enabled = TRUE, mean_depth = 500, depth_dispersion = 100 ) ) # No noise (ideal data for testing) sim <- simulateTumor( biological_noise = list(enabled = FALSE), sequencing_noise = list(enabled = FALSE) ) # Include germline variants sim <- simulateTumor( subclone_freqs = c(0.3, 0.4), # 70% purity n_mut_per_clone = c(40, 50), germline_variants = list(enabled = TRUE, n_variants = 100) )
Summary method for ClonalSimData
## S4 method for signature 'ClonalSimData' summary(object)## S4 method for signature 'ClonalSimData' summary(object)
object |
ClonalSimData object |
list with summary statistics
sim <- simulateTumor() summary(sim)sim <- simulateTumor() summary(sim)
Simple export to data.frame (useful for compatibility with other tools like PyClone, SciClone)
toDataFrame(object, file = NULL, include_true_vaf = TRUE)toDataFrame(object, file = NULL, include_true_vaf = TRUE)
object |
ClonalSimData object |
file |
character, optional file path to write CSV. Default: NULL |
include_true_vaf |
logical, include True_VAF column. Default: TRUE |
data.frame with mutation data
sim <- simulateTumor() df <- toDataFrame(sim) head(df) # Export to CSV (use tempdir to avoid writing to user's filesystem) csv_file <- tempfile(fileext = ".csv") toDataFrame(sim, file = csv_file)sim <- simulateTumor() df <- toDataFrame(sim) head(df) # Export to CSV (use tempdir to avoid writing to user's filesystem) csv_file <- tempfile(fileext = ".csv") toDataFrame(sim, file = csv_file)
Converts mutation data to a GenomicRanges::GRanges object, the standard Bioconductor data structure for genomic intervals.
toGRanges(object, include_metadata = TRUE)toGRanges(object, include_metadata = TRUE)
object |
ClonalSimData object |
include_metadata |
logical, include all metadata columns. Default: TRUE |
GRanges object with mutation information
sim <- simulateTumor() gr <- toGRanges(sim) print(gr) # Access metadata columns GenomicRanges::mcols(gr) # Subset by chromosome gr_chr1 <- gr[GenomicRanges::seqnames(gr) == "chr1"]sim <- simulateTumor() gr <- toGRanges(sim) print(gr) # Access metadata columns GenomicRanges::mcols(gr) # Subset by chromosome gr_chr1 <- gr[GenomicRanges::seqnames(gr) == "chr1"]
Exports data in format suitable for PyClone clonal deconvolution analysis.
toPyClone(object, file, sample_id = "sample1")toPyClone(object, file, sample_id = "sample1")
object |
ClonalSimData object |
file |
character, file path to write TSV |
sample_id |
character, sample identifier. Default: "sample1" |
data.frame formatted for PyClone (invisibly)
sim <- simulateTumor() pyclone_file <- tempfile(fileext = ".tsv") toPyClone(sim, file = pyclone_file)sim <- simulateTumor() pyclone_file <- tempfile(fileext = ".tsv") toPyClone(sim, file = pyclone_file)
Exports data in format suitable for SciClone clonal analysis.
toSciClone(object, file)toSciClone(object, file)
object |
ClonalSimData object |
file |
character, file path to write TSV |
data.frame formatted for SciClone (invisibly)
sim <- simulateTumor() sciclone_file <- tempfile(fileext = ".tsv") toSciClone(sim, file = sciclone_file)sim <- simulateTumor() sciclone_file <- tempfile(fileext = ".tsv") toSciClone(sim, file = sciclone_file)
Exports mutation data to Variant Call Format (VCF), the standard format for variant data. Returns a VariantAnnotation::VRanges object which can be written to VCF file.
toVCF(object, sample_name = "TumorSample", output_file = NULL)toVCF(object, sample_name = "TumorSample", output_file = NULL)
object |
ClonalSimData object |
sample_name |
character, sample name for VCF. Default: "TumorSample" |
output_file |
character, optional file path to write VCF. If NULL, returns VRanges object without writing. Default: NULL |
VRanges object (invisibly if output_file is specified)
sim <- simulateTumor() # Get VRanges object vr <- suppressWarnings(toVCF(sim)) print(vr) # Write to VCF file (use tempdir to avoid writing to user's filesystem) vcf_file <- tempfile(fileext = ".vcf") suppressWarnings(toVCF(sim, output_file = vcf_file))sim <- simulateTumor() # Get VRanges object vr <- suppressWarnings(toVCF(sim)) print(vr) # Write to VCF file (use tempdir to avoid writing to user's filesystem) vcf_file <- tempfile(fileext = ".vcf") suppressWarnings(toVCF(sim, output_file = vcf_file))