Title: | Convenience Functions for Biscuit |
---|---|
Description: | A test harness for bsseq loading of Biscuit output, summarization of WGBS data over defined regions and in mappable samples, with or without imputation, dropping of mostly-NA rows, age estimates, etc. |
Authors: | Tim Triche [aut], Wanding Zhou [aut], Benjamin Johnson [aut], Jacob Morrison [aut, cre], Lyong Heo [aut], James Eapen [aut] |
Maintainer: | Jacob Morrison <[email protected]> |
License: | GPL-3 |
Version: | 1.21.0 |
Built: | 2024-11-20 06:13:21 UTC |
Source: | https://github.com/bioc/biscuiteer |
A test harness for bsseq loading of Biscuit output, summarization of WGBS data over defined regions and in mappable samples (with or without imputation, dropping mostly-NA rows, age estimates, etc.)
Timothy J Triche Jr [email protected], Wanding Zhou [email protected], Ben Johnson [email protected], Jacob Morrison [email protected], Lyong Heo [email protected]
Useful links:
Report bugs at https://github.com/trichelab/biscuiteer/issues
orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE)
orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE)
Calls summarizeBsSeqOver to summarize a bsseq object over provided DNA regions. Useful for exploring genomic data using cBioPortal.
atRegions(bsseq, regions, mappings = NULL, nm = "POETICname", ...)
atRegions(bsseq, regions, mappings = NULL, nm = "POETICname", ...)
bsseq |
A bsseq object |
regions |
A GRanges or GRangesList of regions |
mappings |
A mapping table with rownames(mappings) == colnames(bsseq) (DEFAULT: NULL) |
nm |
Column of the mapping table to map to (DEFAULT: "POETICname") |
... |
Other arguments to pass to summarizeBsSeqOver |
GRanges with summarized information about the bsseq object for the given DNA regions
orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) reg <- GRanges(seqnames = rep("chr11",5), strand = rep("*",5), ranges = IRanges(start = c(0,2.8e6,1.17e7,1.38e7,1.69e7), end= c(2.8e6,1.17e7,1.38e7,1.69e7,2.2e7)) ) regions <- atRegions(bsseq = bisc, regions = reg)
orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) reg <- GRanges(seqnames = rep("chr11",5), strand = rep("*",5), ranges = IRanges(start = c(0,2.8e6,1.17e7,1.38e7,1.69e7), end= c(2.8e6,1.17e7,1.38e7,1.69e7,2.2e7)) ) regions <- atRegions(bsseq = bisc, regions = reg)
Example usage for E-M
binCoverage( bsseq, bins, which = NULL, QDNAseq = TRUE, readLen = 100, paired = TRUE )
binCoverage( bsseq, bins, which = NULL, QDNAseq = TRUE, readLen = 100, paired = TRUE )
bsseq |
A bsseq object - supplied to getCoverage() |
bins |
Bins to summarize over - from tileGenome or QDNAseq.xxYY |
which |
Limit to specific regions? - functions as an import() (DEFAULT: NULL) |
QDNAseq |
Return a QDNAseqReadCounts? - if FALSE, returns a GRanges (DEFAULT: TRUE) |
readLen |
Correction factor for coverage - read length in bp (DEFAULT: 100) |
paired |
Whether the data are from paired-end sequencing (DEFAULT: TRUE) |
NOTE: As of early Sept 2019, QDNAseq did not have hg38 capabilities. If you desire to use the hg38 genome, biscuiteer suggests you use a GRanges object to define your bins.
NOTE: As of late July 2020, biscuiteer has started implemented support for hg38, hg19, mm10, and mm9 for bisulfite-specific features, including adaptive GC-content computation and SV integration for adjusting CNV ends.
Binned read counts
bins <- GRanges(seqnames = rep("chr11",10), strand = rep("*",10), ranges = IRanges(start=100000*0:9, width=100000) ) reg <- GRanges(seqnames = rep("chr11",5), strand = rep("*",5), ranges = IRanges(start = c(0,2.8e6,1.17e7,1.38e7,1.69e7), end= c(2.8e6,1.17e7,1.38e7,1.69e7,2.2e7)) ) orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) bc <- binCoverage(bsseq = bisc, bins = bins, which = reg, QDNAseq = FALSE)
bins <- GRanges(seqnames = rep("chr11",10), strand = rep("*",10), ranges = IRanges(start=100000*0:9, width=100000) ) reg <- GRanges(seqnames = rep("chr11",5), strand = rep("*",5), ranges = IRanges(start = c(0,2.8e6,1.17e7,1.38e7,1.69e7), end= c(2.8e6,1.17e7,1.38e7,1.69e7,2.2e7)) ) orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) bc <- binCoverage(bsseq = bisc, bins = bins, which = reg, QDNAseq = FALSE)
See biscuiteer manpage for package description
## S4 method for signature 'BSseq' samples(object) ## S4 method for signature 'BSseq' header(x) ## S4 method for signature 'BSseq' meta(x) ## S4 method for signature 'BSseq' fixed(x) ## S4 method for signature 'BSseq' info(x) ## S4 method for signature 'BSseq,ANY' geno(x)
## S4 method for signature 'BSseq' samples(object) ## S4 method for signature 'BSseq' header(x) ## S4 method for signature 'BSseq' meta(x) ## S4 method for signature 'BSseq' fixed(x) ## S4 method for signature 'BSseq' info(x) ## S4 method for signature 'BSseq,ANY' geno(x)
object |
A bsseq object, preferably with !is.null(metadata(x)$vcfHeader) |
x |
A bsseq object, preferably with !is.null(metadata(x)$vcfHeader) |
biscuiteer adds VariantAnnotation methods to BSseq objects with VCF headers:
samples
,header
,meta
,fixed
,info
,geno
Due to inherited method signatures, the argument (singular) to the method may
be named x
or it may be named object
. Either way, it is a BSseq object.
These add to the existing methods defined in package bsseq for class BSseq:
[
,length
,sampleNames
,sampleNames<-
,pData
,pData<-
,show
,combine
Those add to the methods BSseq inherits from SummarizedExperiment, such as:
colData
,rowRanges
,metadata
,subset
,subsetByOverlaps
,isDisjoint
,&c.
Most of the biscuiteer methods operate on the VCF header, which readBiscuit
likes to stuff into the metadata
slot of BSseq objects it produces. Some
may be handy for populating a BSseq object with QC stats, or querying those.
Depends on the method - usually a List-like object of some sort
RangedSummarizedExperiment
VCFHeader-class
BSseq-class
BSseq
Returns metadata from a Biscuit run using either a supplied VCF file or the vcfHeader metadata element from the bsseq object
biscuitMetadata(bsseq = NULL, VCF = NULL) getBiscuitMetadata(bsseq = NULL, VCF = NULL)
biscuitMetadata(bsseq = NULL, VCF = NULL) getBiscuitMetadata(bsseq = NULL, VCF = NULL)
bsseq |
A bsseq object with a vcfHeader element (DEFAULT: NULL) |
VCF |
A tabix'ed VCF file (can just be the header information) from which the bsseq vcfHeader element is derived (DEFAULT: NULL) |
Information regarding the Biscuit run
getBiscuitMetadata()
: Alias for biscuitMetadata
orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) meta <- biscuitMetadata(bisc)
orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) meta <- biscuitMetadata(bisc)
This function splits an object by chromosome arm, which tends to make parallelization much easier, as cross-arm dependencies are unusual. Therefore, the larger chromosomes can be split across processes or machines without worrying much about data starvation for processes on smaller chromosomes.
byChromArm(x, arms = NULL)
byChromArm(x, arms = NULL)
x |
Any object with a GRanges in it: bsseq, SummarizedExperiment... |
arms |
Another GRanges, but specifying chromosome arms (DEFAULT: NULL) |
A list, List, or *list, with pieces of x by chromosome arm
orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) reg <- GRanges(seqnames = rep("chr11",5), strand = rep("*",5), ranges = IRanges(start = c(0,2.8e6,1.17e7,1.38e7,1.69e7), end= c(2.8e6,1.17e7,1.38e7,1.69e7,2.2e7)) ) names(reg) <- as.character(reg) arms <- byChromArm(bisc, arms = reg)
orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) reg <- GRanges(seqnames = rep("chr11",5), strand = rep("*",5), ranges = IRanges(start = c(0,2.8e6,1.17e7,1.38e7,1.69e7), end= c(2.8e6,1.17e7,1.38e7,1.69e7,2.2e7)) ) names(reg) <- as.character(reg) arms <- byChromArm(bisc, arms = reg)
This function finds the k most extremal features (features above a certain fraction of the Bernoulli variance) in 'bsseq' and returns their values.
byExtremality(bsseq, r = NULL, k = 500)
byExtremality(bsseq, r = NULL, k = 500)
bsseq |
A bsseq object |
r |
Regions to consider - NULL covers all loci (DEFAULT: NULL) |
k |
How many rows/regions to return (DEFAULT: 500) |
For DNA methylation, particularly when summarized across regions, we can do better (a lot better) than MAD. Since we know: max(SD(X_j)) if X_j ~ Beta(a, b) < max(SD(X_j)) if X_j ~ Bernoulli(a/(a+b)) for X with a known mean and standard deviation (SD), then we can solve for (a+b) by MoM. We can then define the extremality by: extremality = sd(X_j) / bernoulliSD(mean(X_j))
A GRanges object with methylation values sorted by extremality
shuf_bed <- system.file("extdata", "MCF7_Cunha_chr11p15_shuffled.bed.gz", package="biscuiteer") orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") shuf_vcf <- system.file("extdata", "MCF7_Cunha_shuffled_header_only.vcf.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc1 <- readBiscuit(BEDfile = shuf_bed, VCFfile = shuf_vcf, merged = FALSE) bisc2 <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) reg <- GRanges(seqnames = rep("chr11",5), strand = rep("*",5), ranges = IRanges(start = c(0,2.8e6,1.17e7,1.38e7,1.69e7), end= c(2.8e6,1.17e7,1.38e7,1.69e7,2.2e7)) ) comb <- unionize(bisc1, bisc2) ext <- byExtremality(comb, r = reg)
shuf_bed <- system.file("extdata", "MCF7_Cunha_chr11p15_shuffled.bed.gz", package="biscuiteer") orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") shuf_vcf <- system.file("extdata", "MCF7_Cunha_shuffled_header_only.vcf.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc1 <- readBiscuit(BEDfile = shuf_bed, VCFfile = shuf_vcf, merged = FALSE) bisc2 <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) reg <- GRanges(seqnames = rep("chr11",5), strand = rep("*",5), ranges = IRanges(start = c(0,2.8e6,1.17e7,1.38e7,1.69e7), end= c(2.8e6,1.17e7,1.38e7,1.69e7,2.2e7)) ) comb <- unionize(bisc1, bisc2) ext <- byExtremality(comb, r = reg)
A BED checker for Biscuit CpG/CpH output (BED-like format with 2 or 3 columns per sample). By default, files with more than 50 million loci will be processed iteratively, since data.table tends to run into problems with gzipped joint CpH files.
checkBiscuitBED( BEDfile, VCFfile, merged, sampleNames = NULL, chunkSize = 5e+07, hdf5 = FALSE, sparse = TRUE, how = c("data.table", "readr"), chr = NULL )
checkBiscuitBED( BEDfile, VCFfile, merged, sampleNames = NULL, chunkSize = 5e+07, hdf5 = FALSE, sparse = TRUE, how = c("data.table", "readr"), chr = NULL )
BEDfile |
A BED-like file - must be compressed and tabix'ed |
VCFfile |
A VCF file - must be compressed and tabix'ed. Only the header information is needed. |
merged |
Is this merged CpG data? |
sampleNames |
Names of samples - NULL: create names, vector: assign names, data.frame: make pData (DEFAULT: NULL) |
chunkSize |
For files longer than |
hdf5 |
Use HDF5 arrays for backing the data? Using HDF5-backed arrays stores the data in a HDF5 file on disk, rather than loading entire object into memory. This allows for analyses to be done on memory-limited systems at the small cost of slightly reduced return times. (DEFAULT: FALSE) |
sparse |
Use sparse Matrix objects for the data? If TRUE, use a Matrix object for sparse matrices (matrices with many zeroes in them) (DEFAULT: TRUE) |
how |
How to load the data - "data.table" or "readr"? (DEFAULT: data.table) |
chr |
Load a specific chromosome to rbind() later? (DEFAULT: NULL) |
Input BED and VCF files must be tabix'ed. No exceptions!
Parameters to be supplied to makeBSseq
readBiscuit
orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") params <- checkBiscuitBED(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE)
orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") params <- checkBiscuitBED(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE)
Epigenetic clock data
data(clocks, package="biscuiteer")
data(clocks, package="biscuiteer")
Source: See inst/scripts/clocks.R for how the clocks data object was generated. For more information about sources, see the descriptions in ?getClock and ?WGBSage. Return type: data.frame
Utility function for extracting sample names from tabix'ed sample columns, assuming a VCF-naming scheme (such as Sample_1.foo, Sample_1.bar or Sample1_foo, Sample1_bar).
condenseSampleNames(tbx, stride, trailing = "\\.$")
condenseSampleNames(tbx, stride, trailing = "\\.$")
tbx |
A TabixFile instance to parse |
stride |
How many columns per sample |
trailing |
Trailing character to trim (DEFAULT: "\.$") |
A character vector of sample names (longest common substrings)
library(Rsamtools) orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") if (length(headerTabix(orig_bed)$header) > 0) { condenseSampleNames(orig_bed, 2) }
library(Rsamtools) orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") if (length(headerTabix(orig_bed)$header) > 0) { condenseSampleNames(orig_bed, 2) }
WARNING: This function will be deprecated in the next Bioconductor release
CpGindex(bsseq, CGIs = NULL, PRCs = NULL, WCGW = NULL, PMDs = NULL)
CpGindex(bsseq, CGIs = NULL, PRCs = NULL, WCGW = NULL, PMDs = NULL)
bsseq |
A BSseq object |
CGIs |
A GRanges of CpG island regions - HMM CGIs if NULL (DEFAULT: NULL) |
PRCs |
A GRanges of Polycomb targets - H9 state 23 low-meth if NULL (DEFAULT: NULL) |
WCGW |
A GRanges of solo-WCGW sites - PMD WCGWs if NULL (DEFAULT: NULL) |
PMDs |
A GRanges of hypomethylating regions - PMDs if NULL (DEFAULT: NULL) |
Measures hypermethylation at PRCs in CGIs or hypomethylation at WCGWs in PMDs
At some point in some conference call somewhere, a collaborator suggested that a simple index of Polycomb repressor complex (PRC) binding site hyper- methylation and CpG-poor "partially methylated domain" (PMD) hypomethylation would be a handy yardstick for both deterministic and stochastic changes associated with proliferation, aging, and cancer. This function provides such an index by compiling measures of aberrant hyper- and hypo-methylation along with the ratio of hyper- to hypo-methylation. (The logic for this is that while the phenomena tend to occur together, there are many exceptions) The resulting measures can provide a high-level summary of proliferation-, aging-, and/or disease-associated changes in DNA methylation across samples.
The choice of defaults is fairly straightforward: in 2006, three independent groups reported recurrent hypermethylation in cancer at sites marked by both H3K4me3 (activating) and H3K27me3 (repressive) histone marks in embryonic stem cells; these became known as "bivalent" sites. The Roadmap Epigenome project performed ChIP-seq on hundreds of normal primary tissues and cell line results from the ENCODE project to generate a systematic catalog of "chromatin states" alongside dozens of whole-genome bisulfite sequencing experiments in the same tissues. We used both to generate a default atlas of bivalent (Polycomb-associated and transcriptionally-poised) sites from H9 human embryonic stem cells which retain low DNA methylation across normal (non-placental) REMC tissues. In 2018, Zhou and Dinh (Nature Genetics) found isolated (AT)CG(AT) sites, or "solo-WCGW" motifs, in common PMDs as the most universal barometer of proliferation- and aging-associated methylation loss in mammalian cells, so we use their solo-WCGW sites in common PMDs as the default measure for hypomethylation. The resulting CpGindex is a vector of length 3 for each sample: hypermethylation, hypomethylation, and their ratio.
We suggest fitting a model for the composition of bulk samples (tumor/normal, tissue1/tissue2, or whatever is most appropriate) prior to drawing any firm conclusions from the results of this function. For example, a mixture of two-thirds normal tissue and one-third tumor tissue may produce the same or lower degree of hyper/hypomethylation than high-tumor-content cell-free DNA samples from the blood plasma of the same patient. Intuition is simply not a reliable guide in such situations, which occur with some regularity. If orthogonal estimates of purity/composition are available (flow cytometry, ploidy, yield of filtered cfDNA), it is a Very Good Idea to include them.
The default for this function is to use the HMM-defined CpG islands from Hao Wu's paper (Wu, Caffo, Jaffee, Irizarry & Feinberg, Biostatistics 2010) as generic "hypermethylation" targets inside of "bivalent" (H3K27me3+H3K4me3) sites (identified in H9 embryonic stem cells & unmethylated across normals), and the solo-WCGW sites within common partially methylated domains from Wanding Zhou and Huy Dinh's paper (Zhou, Dinh, et al, Nat Genetics 2018) as genetic "hypomethylation" targets (as above, obvious caveats about tissue specificity and user-supplied possibilities exist, but the defaults are sane for many purposes, and can be exchanged for whatever targets a user wishes).
The function returns all three components of the "CpG index", comprised of hyperCGI and hypoPMD (i.e. hyper, hypo, and their ratio). The PMD "score" is a base-coverage-weighted average of losses to solo-WCGW bases within PMDs; the PRC score is similarly base-coverage-weighted but across HMM CGI CpGs, within polycomb repressor complex sites (by default, the subset of state 23 segments in the 25-state, 12-mark ChromImpute model for H9 which have less than 10 percent CpG methylation across the CpG-island-overlapping segment in all normal primary tissues and cells from the Reference Epigenome project). By providing different targets and/or regions, users can customize as needed.
The return value is a CpGindex object, which is really just a DataFrame that
knows about the regions at which it was summarized, and reminds the user of
this when they implicitly call the show
method on it.
A CpGindex (DataFrame w/cols `hyper`, `hypo`, `ratio` + 2 GRs)
Subset of ENSEMBL regulatory build regions for hg19 genome
data(ENSR_subset.hg19, package="biscuiteer")
data(ENSR_subset.hg19, package="biscuiteer")
Source URL: homo_sapiens.GRCh37.Regulatory_Build.regulatory_features.20161117.gff.gz (regions that overlap Inifinium annotation manifests - described at http://zwdzwd.github.io/InfiniumAnnotation - are selected for final GRanges) Source type: GFF Return type: GRanges
Subset of ENSEMBL regulatory build regions for hg19 genome
data(ENSR_subset.hg38, package="biscuiteer")
data(ENSR_subset.hg38, package="biscuiteer")
Source URL: homo_sapiens.GRCh38.Regulatory_Build.regulatory_features.20161111.gff.gz (regions that overlap Inifinium annotation manifests - described at http://zwdzwd.github.io/InfiniumAnnotation - are selected for final GRanges) Source type: GFF Return type: GRanges
Works efficiently on matrices and DelayedMatrix objects. Note that it is possible for "raw" extremality to be greater than 1, so this function does a second pass to correct for this.
extremality(x, raw = FALSE)
extremality(x, raw = FALSE)
x |
A rectangular object with proportions in it |
raw |
Skip the correction pass? (DEFAULT: FALSE) |
The extremality of each row (if more than one) of the object
x <- rnorm(100, mean=0.5, sd=0.15) x <- matrix(x, nrow=50, ncol=2) ext <- extremality(x, raw=TRUE)
x <- rnorm(100, mean=0.5, sd=0.15) x <- matrix(x, nrow=50, ncol=2) ext <- extremality(x, raw=TRUE)
Helper function: expanded expit
fexpit(x, sqz = 1e-06)
fexpit(x, sqz = 1e-06)
x |
a vector of values between -Inf and +Inf |
sqz |
the amount by which we 'squoze', default is .000001 |
a vector of values between 0 and 1 inclusive
set.seed(1234) x <- rnorm(n=1000) summary(x) sqz <- 1 / (10**6) p <- fexpit(x, sqz=sqz) summary(p) all( (abs(x - flogit(p)) / x) < sqz ) all( abs(x - flogit(fexpit(x))) < sqz )
set.seed(1234) x <- rnorm(n=1000) summary(x) sqz <- 1 / (10**6) p <- fexpit(x, sqz=sqz) summary(p) all( (abs(x - flogit(p)) / x) < sqz ) all( abs(x - flogit(fexpit(x))) < sqz )
Function potentially used to be a part of dmrseq. Included here to avoid dmrseq failing due to any number of reasons related to lack of coverage.
filterLoci(bsseq, testCovariate)
filterLoci(bsseq, testCovariate)
bsseq |
A bsseq object for filtering |
testCovariate |
The name of the pData column dmrseq will test on |
The code is adapted from the precheck loop of dmrseq::dmrseq
A bsseq object ready for dmrseq to use
dmrseq
WGBSeq
RRBSeq
shuf_bed <- system.file("extdata", "MCF7_Cunha_chr11p15_shuffled.bed.gz", package="biscuiteer") orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") shuf_vcf <- system.file("extdata", "MCF7_Cunha_shuffled_header_only.vcf.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc1 <- readBiscuit(BEDfile = shuf_bed, VCFfile = shuf_vcf, merged = FALSE) bisc2 <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) comb <- unionize(bisc1, bisc2) filt <- filterLoci(comb, "sampleNames")
shuf_bed <- system.file("extdata", "MCF7_Cunha_chr11p15_shuffled.bed.gz", package="biscuiteer") orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") shuf_vcf <- system.file("extdata", "MCF7_Cunha_shuffled_header_only.vcf.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc1 <- readBiscuit(BEDfile = shuf_bed, VCFfile = shuf_vcf, merged = FALSE) bisc2 <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) comb <- unionize(bisc1, bisc2) filt <- filterLoci(comb, "sampleNames")
Uses Horvath-type 'epigenetic clock' raw output to project into actual ages
fixAge(x, adult = 21)
fixAge(x, adult = 21)
x |
Untransformed or raw prediction(s) |
adult |
Age of adulthood (DEFAULT: 21) |
The 'Epigenetic Clock' (Horvath 2012) and similar schemes use a number of CpG loci (or regions, or perhaps CpH loci – it doesn't really matter what) to estimate the chronological/biological age of samples from DNA methylation with pre-trained feature weights (coefficients) for each region/locus.
All of these type of clocks use a nonlinear output transformation which
switches from an exponential growth model for children into a linear model
for adults, where adult
is an arbitrary number (by default and custom,
that number is 21; elsewhere it can sometimes be seen as 20, but all known
epi-age transformation functions quietly add 1 to the constant internally).
This function implements the above standard output transformation step.
Transformed prediction(s)
clock <- getClock(genome="hg38") score <- clock$gr$score age <- fixAge(score)
clock <- getClock(genome="hg38") score <- clock$gr$score age <- fixAge(score)
Useful for coercing matrices into how bsseq is expecting the M matrix to be.
fixNAs(x, y = 0, sparseMatrix = FALSE)
fixNAs(x, y = 0, sparseMatrix = FALSE)
x |
The matrix-like object containing NAs to fix |
y |
The value to replace the NAs with (DEFAULT: 0) |
sparseMatrix |
Make the result a Matrix object? (DEFAULT: FALSE) |
x with no NAs (possibly a sparse Matrix)
nom <- c(rep(c(1,4,NA,9,NA,NA,7,NA), 5)) no_nas <- fixNAs(nom)
nom <- c(rep(c(1,4,NA,9,NA,NA,7,NA), 5)) no_nas <- fixNAs(nom)
Helper function: squeezed logit
flogit(p, sqz = 1e-06)
flogit(p, sqz = 1e-06)
p |
a vector of values between 0 and 1 inclusive |
sqz |
the amount by which to 'squeeze', default is .000001 |
a vector of values between -Inf and +Inf
set.seed(1234) p <- runif(n=1000) summary(p) sqz <- 1 / (10**6) x <- flogit(p, sqz=sqz) summary(x) all( abs(p - fexpit(x, sqz=sqz)) < sqz ) all( abs(p - fexpit(flogit(p, sqz=sqz), sqz=sqz)) < sqz )
set.seed(1234) p <- runif(n=1000) summary(p) sqz <- 1 / (10**6) x <- flogit(p, sqz=sqz) summary(x) all( abs(p - fexpit(x, sqz=sqz)) < sqz ) all( abs(p - fexpit(flogit(p, sqz=sqz), sqz=sqz)) < sqz )
Biscuiteer supports several 'epigenetic clock' models. This function retrieves the various models.
getClock( model = c("horvath", "horvathshrunk", "hannum", "skinandblood"), padding = 15, genome = c("hg19", "hg38", "GRCh37", "GRCh38"), useENSR = FALSE, useHMMI = FALSE )
getClock( model = c("horvath", "horvathshrunk", "hannum", "skinandblood"), padding = 15, genome = c("hg19", "hg38", "GRCh37", "GRCh38"), useENSR = FALSE, useHMMI = FALSE )
model |
One of "horvath", "horvathshrunk", "hannum", or "skinandblood" |
padding |
How many base pairs (+/-) to expand a feature's footprint (DEFAULT: 15) |
genome |
One of "hg19", "GRCh37", "hg38", or "GRCh38" (DEFAULT: "hg19") |
useENSR |
Substitute ENSEMBL regulatory feature boundaries? (DEFAULT: FALSE) |
useHMMI |
Substitute HMM-based CpG island boundaries? (DEFAULT: FALSE) |
The remapped coordinates for the Horvath (2012) and Hannum (2013) clocks, along with shrunken Horvath (2012) and improved Horvath (2018) models, are provided as part of biscuiteer (visit inst/scripts/clocks.R to find out how) along with some functionality to make them more usable in RRBS/WGBS data of varying coverage along varying genomes. For example, the HMM-based CpG island model introduced by Wu (2010) can be used to assign to within-island features the methylation rate of their associated island, and ENSEMBL regulatory build features (ENSR features, for short) such as CTCF binding sites can have their coordinates substituted for the default padded boundaries of a feature.
The net result of this process is that, while the default settings simply swap in a 30-bp stretch centered on the selected clock's CpG (and/or CpH) loci, add the intercept, and ship out the model, much more flexibility is available to the user. This function provides a single point for tuning of such options in the event that defaults don't work well for a user.
The precedence of options is as follows:
If a feature has neither ENSR nor HMMI IDs, it is padded (only) +/- bp.
If it has an HMMI but not ENSR ID or ENSR==FALSE, the HMM island is used.
If a feature has an ENSR ID, and ENSR==TRUE, the ENSR feature is used.
If a feature has both an ENSR ID and an HMMI ID, and both options are TRUE, then the ENSR start and end coordinates will take precedence over its HMMI.
The above shenanigans produce the GRanges object returned as gr
in a List.
The intercept
value returned with the model is its fixed (B0) coefficient.
The cleanup
function returned with the model transforms its raw output.
a List with elements `model`, `gr`, `intercept`, and `cleanup`
clock <- getClock(model="horvathshrunk", genome="hg38")
clock <- getClock(model="horvathshrunk", genome="hg38")
Want an object with nominally Gaussian error for compartment inference, so this function uses 'suitable' (defaults to to 3 or more reads in 2 or more samples) measurements. Using Dirichlet smoothing (adding 'k' reads to M and U), these measurements are then turned into lightly moderated, logit-transformed methylated-fraction estimates for compartment calling.
getLogitFracMeth(x, minCov = 3, minSamp = 2, k = 0.1, r = NULL) getMvals(x, minCov = 3, minSamp = 2, k = 0.1, r = NULL)
getLogitFracMeth(x, minCov = 3, minSamp = 2, k = 0.1, r = NULL) getMvals(x, minCov = 3, minSamp = 2, k = 0.1, r = NULL)
x |
A bsseq object with methylated and total reads |
minCov |
Minimum read coverage for landmarking samples (DEFAULT: 3) |
minSamp |
Minimum landmark samples with at least minCov (DEFAULT: 2) |
k |
Pseudoreads for smoothing (DEFAULT: 0.1) |
r |
Regions to collapse over - if NULL, do it by CpG (DEFAULT: NULL) |
Smoothed logit(M / Cov) GRanges with coordinates as row names
getMvals()
: Alias for getLogitFracMeth
orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) reg <- GRanges(seqnames = rep("chr11",5), strand = rep("*",5), ranges = IRanges(start = c(0,2.8e6,1.17e7,1.38e7,1.69e7), end= c(2.8e6,1.17e7,1.38e7,1.69e7,2.2e7)) ) frac <- getLogitFracMeth(bisc, minSamp = 1, r = reg)
orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) reg <- GRanges(seqnames = rep("chr11",5), strand = rep("*",5), ranges = IRanges(start = c(0,2.8e6,1.17e7,1.38e7,1.69e7), end= c(2.8e6,1.17e7,1.38e7,1.69e7,2.2e7)) ) frac <- getLogitFracMeth(bisc, minSamp = 1, r = reg)
Chromosome arm locations for GRCh37 genome
data(GRCh37.chromArm, package="biscuiteer")
data(GRCh37.chromArm, package="biscuiteer")
Source URL: https://genome.ucsc.edu/cgi-bin/hgTables (Cytogenic bands were retrieved using the UCSC Table Browser. The output was then exported to a TXT file, where the chromosome arms were combined and formed into a GRanges) Source type: TXT Return type: GRanges
Chromosome arm locations for GRCh38 genome
data(GRCh38.chromArm, package="biscuiteer")
data(GRCh38.chromArm, package="biscuiteer")
Source URL: https://genome.ucsc.edu/cgi-bin/hgTables (Cytogenic bands were retrieved using the UCSC Table Browser. The output was then exported to a TXT file, where the chromosome arms were combined and formed into a GRanges) Source type: TXT Return type: GRanges
Output data.frame can be written to a .seg file if supplied with filename input argument
grToSeg(gr, filename = NULL, minAbs = NULL)
grToSeg(gr, filename = NULL, minAbs = NULL)
gr |
A GRanges or GRangesList to dump to .seg file |
filename |
Where to save the result - unsaved if NULL (DEFAULT: NULL) |
minAbs |
Minimum absolute gain/loss cutoff (DEFAULT: NULL) |
A data.frame with columns: (ID, chrom, loc.start, loc.end, num.mark, seg.mean)
segToGr
clock <- getClock(model="horvathshrunk", genome="hg38") gr <- clock$gr df <- grToSeg(gr = gr)
clock <- getClock(model="horvathshrunk", genome="hg38") gr <- clock$gr df <- grToSeg(gr = gr)
Hypermethylated targets in bivalent histone sites from H9 embryonic stem cells which were unmethylated across normal cells for hg19 genome
data(H9state23unmeth.hg19, package="biscuiteer")
data(H9state23unmeth.hg19, package="biscuiteer")
GRanges was generated by taking the HMM-derived CpG islands (described in ?HMM_CpG_islands.hg19) and overlapping with regions that were unmethylated in normal H9 stem cells and had a ChromHMM state of 2 or 3 (see https://www.nature.com/articles/nmeth.1906#MOESM194 for a description of ChromHMM) Return type: GRanges
Hypermethylated targets in bivalent histone sites from H9 embryonic stem cells which were unmethylated across normal cells for hg38 genome
data(H9state23unmeth.hg38, package="biscuiteer")
data(H9state23unmeth.hg38, package="biscuiteer")
GRanges was generated by taking the HMM-derived CpG islands (described in ?HMM_CpG_islands.hg38) and overlapping with regions that were unmethylated in normal H9 stem cells and had a ChromHMM state of 2 or 3 (see https://www.nature.com/articles/nmeth.1906#MOESM194 for a description of ChromHMM) Return type: GRanges
Chromosome arm locations for hg19 genome
data(hg19.chromArm, package="biscuiteer")
data(hg19.chromArm, package="biscuiteer")
Source URL: http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz (Chromosome arms were combined to form the final GRanges) Source type: TXT Return type: GRanges
Chromosome arm locations for hg38 genome
data(hg38.chromArm, package="biscuiteer")
data(hg38.chromArm, package="biscuiteer")
Source URL: http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz (Chromosome arms were combined to form the final GRanges) Source type: TXT Return type: GRanges
Hidden Markov Model-derived CpG islands from hg19 genome
data(HMM_CpG_islands.hg19, package="biscuiteer")
data(HMM_CpG_islands.hg19, package="biscuiteer")
Source URL: https://www.ncbi.nlm.nih.gov/pubmed/20212320 (Hidden Markov Model CpG islands were produced using the method described in this paper. The hg19 genome was used for the CpG island production.) Source type: hg19 genome and procedure described in paper Return type: GRanges
Hidden Markov Model-derived CpG islands from hg38 genome
data(HMM_CpG_islands.hg38, package="biscuiteer")
data(HMM_CpG_islands.hg38, package="biscuiteer")
Source URL: https://www.ncbi.nlm.nih.gov/pubmed/20212320 (Hidden Markov Model CpG islands were produced using the method described in this paper. The hg19 genome was used for the CpG island production.) Source type: hg19 genome and procedure described in paper Return type: GRanges
Beware that any reasonably large BED files may not fit into memory!
makeBSseq(tbl, params, simplify = FALSE, verbose = FALSE)
makeBSseq(tbl, params, simplify = FALSE, verbose = FALSE)
tbl |
A tibble (from read_tsv) or a data.table (from fread) |
params |
Parameters from checkBiscuitBED |
simplify |
Simplify sample names by dropping .foo.bar.hg19? (or similar) (DEFAULT: FALSE) |
verbose |
Print extra statements? (DEFAULT: FALSE) |
An in-memory bsseq object
library(data.table) library(R.utils) orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") params <- checkBiscuitBED(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE, how = "data.table") select <- grep("\\.context", params$colNames, invert=TRUE) tbl <- fread(gunzip(params$tbx$path, remove = FALSE), sep="\t", sep2=",", fill=TRUE, na.strings=".", select=select) unzippedName <- sub("\\.gz$", "", params$tbx$path) if (file.exists(unzippedName)) { file.remove(unzippedName) } if (params$hasHeader == FALSE) names(tbl) <- params$colNames[select] names(tbl) <- sub("^#", "", names(tbl)) tbl <- tbl[rowSums(is.na(tbl)) == 0, ] bsseq <- makeBSseq(tbl = tbl, params = params)
library(data.table) library(R.utils) orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") params <- checkBiscuitBED(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE, how = "data.table") select <- grep("\\.context", params$colNames, invert=TRUE) tbl <- fread(gunzip(params$tbx$path, remove = FALSE), sep="\t", sep2=",", fill=TRUE, na.strings=".", select=select) unzippedName <- sub("\\.gz$", "", params$tbx$path) if (file.exists(unzippedName)) { file.remove(unzippedName) } if (params$hasHeader == FALSE) names(tbl) <- params$colNames[select] names(tbl) <- sub("^#", "", names(tbl)) tbl <- tbl[rowSums(is.na(tbl)) == 0, ] bsseq <- makeBSseq(tbl = tbl, params = params)
Takes BED-like format with 2 or 3 columns per sample. Unmerged CpG files have 2 columns (beta values and coverage), whereas merged CpG files have 3 columns (beta values, coverage, and context).
readBiscuit( BEDfile, VCFfile, merged, sampleNames = NULL, simplify = FALSE, genome = "hg19", how = c("data.table", "readr"), hdf5 = FALSE, hdf5dir = NULL, sparse = FALSE, chunkSize = 1e+06, chr = NULL, which = NULL, verbose = FALSE ) loadBiscuit( BEDfile, VCFfile, merged, sampleNames = NULL, simplify = FALSE, genome = "hg19", how = c("data.table", "readr"), hdf5 = FALSE, hdf5dir = NULL, sparse = FALSE, chunkSize = 1e+06, chr = NULL, which = NULL, verbose = FALSE )
readBiscuit( BEDfile, VCFfile, merged, sampleNames = NULL, simplify = FALSE, genome = "hg19", how = c("data.table", "readr"), hdf5 = FALSE, hdf5dir = NULL, sparse = FALSE, chunkSize = 1e+06, chr = NULL, which = NULL, verbose = FALSE ) loadBiscuit( BEDfile, VCFfile, merged, sampleNames = NULL, simplify = FALSE, genome = "hg19", how = c("data.table", "readr"), hdf5 = FALSE, hdf5dir = NULL, sparse = FALSE, chunkSize = 1e+06, chr = NULL, which = NULL, verbose = FALSE )
BEDfile |
A BED-like file - must be compressed and tabix'ed |
VCFfile |
A VCF file - must be compressed and tabix'ed. Only the header information is needed. |
merged |
Is this merged CpG data? |
sampleNames |
Names of samples - NULL: create names, vector: assign names, data.frame: make pData (DEFAULT: NULL) |
simplify |
Simplify sample names by dropping .foo.bar.hg19? (or similar) (DEFAULT: FALSE) |
genome |
Genome assembly the runs were aligned against (DEFAULT: "hg19") |
how |
How to load data - either data.table or readr (DEFAULT: "data.table") |
hdf5 |
Make the object HDF5-backed - CURRENTLY NOT AVAILABLE (DEFAULT: FALSE) |
hdf5dir |
Directory to store HDF5 files if 'hdf5' = TRUE (DEFAULT: NULL) |
sparse |
Use sparse Matrix objects for the data? (DEFAULT: FALSE) |
chunkSize |
Number of rows before readr reading becomes chunked (DEFAULT: 1e6) |
chr |
Load a specific chromosome? (DEFAULT: NULL) |
which |
A GRanges of regions to load - NULL loads them all (DEFAULT: NULL) |
verbose |
Print extra statements? (DEFAULT: FALSE) |
NOTE: Assumes alignment against hg19 (use genome argument to override). NOTE: Requires header from VCF file to detect sample names
A bsseq::BSseq object
loadBiscuit()
: Alias for readBiscuit
bsseq
checkBiscuitBED
orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE)
orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE)
Read in and decode the RLE representation of the epibed format out of biscuit epiread
readEpibed( epibed, genome = NULL, chr = NULL, start = 1, end = 2^28, fragment_level = TRUE )
readEpibed( epibed, genome = NULL, chr = NULL, start = 1, end = 2^28, fragment_level = TRUE )
epibed |
The path to the epibed file (must be bgzip and tabix indexed) |
genome |
What genome did this come from (e.g. 'hg19') (default: NULL) |
chr |
Which chromosome to retrieve (default: NULL) |
start |
The starting position for a region of interest (default: 1) |
end |
The end position for a region of interest (default: 2^28) |
fragment_level |
Whether to collapse reads to the fragment level (default: TRUE) |
A GRanges object
epibed.nome <- system.file("extdata", "hct116.nome.epibed.gz", package="biscuiteer") epibed.bsseq <- system.file("extdata", "hct116.bsseq.epibed.gz", package="biscuiteer") epibed.nome.gr <- readEpibed(epibed = epibed.nome, genome = "hg19", chr = "chr1") epibed.bsseq.gr <- readEpibed(epibed = epibed.bsseq, genome = "hg19", chr = "chr1")
epibed.nome <- system.file("extdata", "hct116.nome.epibed.gz", package="biscuiteer") epibed.bsseq <- system.file("extdata", "hct116.bsseq.epibed.gz", package="biscuiteer") epibed.nome.gr <- readEpibed(epibed = epibed.nome, genome = "hg19", chr = "chr1") epibed.bsseq.gr <- readEpibed(epibed = epibed.bsseq, genome = "hg19", chr = "chr1")
(e)RRBS settings for dmrseq
RRBSeq(bsseq, testCovariate, cutoff = 0.2, bpSpan = 750, ...)
RRBSeq(bsseq, testCovariate, cutoff = 0.2, bpSpan = 750, ...)
bsseq |
A bsseq object |
testCovariate |
The pData column to test on |
cutoff |
The minimum CpG-wise difference to use (DEFAULT: 0.2) |
bpSpan |
Span of smoother AND max gap in DMR CpGs (DEFAULT: 750) |
... |
Other arguments to pass along to dmrseq |
A GRanges object (same as from dmrseq)
data(BS.chr21, package="dmrseq") dat <- BS.chr21 rrbs <- RRBSeq(dat[1:500, ], "Rep", cutoff = 0.05, BPPARAM=BiocParallel::SerialParam())
data(BS.chr21, package="dmrseq") dat <- BS.chr21 rrbs <- RRBSeq(dat[1:500, ], "Rep", cutoff = 0.05, BPPARAM=BiocParallel::SerialParam())
Reverse of grToSeg
segToGr(seg, genome = "hg19", name = "ID", score = "seg.mean")
segToGr(seg, genome = "hg19", name = "ID", score = "seg.mean")
seg |
The .seg filename |
genome |
Genome against which segments were annotated (DEFAULT: "hg19") |
name |
.seg file column to use as $name metadata (DEFAULT: "ID") |
score |
.seg file column to use as $score metadata (DEFAULT: "seg.mean") |
A GRanges object
grToSeg
clock <- getClock(model="horvathshrunk", genome="hg38") gr <- clock$gr df <- grToSeg(gr = gr, file = "test_grToSeg.seg") segs <- segToGr("test_grToSeg.seg", genome="hg38") if (file.exists("test_grToSeg.seg")) file.remove("test_grToSeg.seg")
clock <- getClock(model="horvathshrunk", genome="hg38") gr <- clock$gr df <- grToSeg(gr = gr, file = "test_grToSeg.seg") segs <- segToGr("test_grToSeg.seg", genome="hg38") if (file.exists("test_grToSeg.seg")) file.remove("test_grToSeg.seg")
Seqinfo for hg19 genome
data(seqinfo.hg19, package="biscuiteer")
data(seqinfo.hg19, package="biscuiteer")
Source URL: http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.chrom.sizes (The output from this site was downloaded into a TXT file and then loaded into a sorted Seqinfo table) Source type: TXT Return type: Seqinfo
Seqinfo for hg38 genome
data(seqinfo.hg38, package="biscuiteer")
data(seqinfo.hg38, package="biscuiteer")
Source URL: http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes (The output from this site was downloaded into a TXT file and then loaded into a sorted Seqinfo table) Source type: TXT Return type: Seqinfo
Seqinfo for mm10 genome
data(seqinfo.mm10, package="biscuiteer")
data(seqinfo.mm10, package="biscuiteer")
Source URL: http://hgdownload.cse.ucsc.edu/goldenPath/mm10/bigZips/mm10.chrom.sizes (The output from this site was downloaded into a TXT file and then loaded into a sorted Seqinfo table) Source type: TXT Return type: Seqinfo
Tries using the longest common subsequence to figure out what can be dropped. Usually used for VCF columns.
simplifySampleNames(x)
simplifySampleNames(x)
x |
A SummarizedExperiment-derived object, or a character vector |
The input object, but with simplified sample names
orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) bisc <- simplifySampleNames(bisc)
orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) bisc <- simplifySampleNames(bisc)
Used for bsseq objects. Mostly a local wrapp for getMeth.
summarizeBsSeqOver(bsseq, segs, dropNA = FALSE, impute = FALSE)
summarizeBsSeqOver(bsseq, segs, dropNA = FALSE, impute = FALSE)
bsseq |
The bsseq object to summarize |
segs |
Regions to summarize over (GRanges object, no GRangesList yet) |
dropNA |
Whether to drop rows if more than half of samples are NA (DEFAULT: FALSE) |
impute |
Whether to impute NAs/NaNs (DEFAULT: FALSE) |
A matrix of regional methylation fractions
orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) reg <- GRanges(seqnames = rep("chr11",5), strand = rep("*",5), ranges = IRanges(start = c(0,2.8e6,1.17e7,1.38e7,1.69e7), end= c(2.8e6,1.17e7,1.38e7,1.69e7,2.2e7)) ) summary <- summarizeBsSeqOver(bsseq = bisc, segs = reg, dropNA = TRUE)
orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) reg <- GRanges(seqnames = rep("chr11",5), strand = rep("*",5), ranges = IRanges(start = c(0,2.8e6,1.17e7,1.38e7,1.69e7), end= c(2.8e6,1.17e7,1.38e7,1.69e7,2.2e7)) ) summary <- summarizeBsSeqOver(bsseq = bisc, segs = reg, dropNA = TRUE)
Read from tabix-indexed bed file to list objects
tabixRetrieve( paths, chr, start = 1, end = 2^28, sample_names = NULL, is.epibed = FALSE, BPPARAM = SerialParam() )
tabixRetrieve( paths, chr, start = 1, end = 2^28, sample_names = NULL, is.epibed = FALSE, BPPARAM = SerialParam() )
paths |
path(s) to the bed files |
chr |
chromosome name |
start |
start coordinate of region of interest |
end |
end coordinate of region of interest |
sample_names |
sample names, just use paths if not specified |
is.epibed |
whether the input is epibed format |
BPPARAM |
how to parallelize |
a list object with DNA methylation level and depth
Wrapper for the combine(bsseq1, ...) method in bsseq
unionize(bs1, ...)
unionize(bs1, ...)
bs1 |
A bsseq object |
... |
One or more bsseq objects to combine with bs1 |
Takes provided bsseq objects, the union of their GRanges, fills out the sites not in the union with 0M/0Cov, and returns the even-sparser bsseq holding all of them.
A larger and more sparse bsseq object
shuf_bed <- system.file("extdata", "MCF7_Cunha_chr11p15_shuffled.bed.gz", package="biscuiteer") orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") shuf_vcf <- system.file("extdata", "MCF7_Cunha_shuffled_header_only.vcf.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc1 <- readBiscuit(BEDfile = shuf_bed, VCFfile = shuf_vcf, merged = FALSE) bisc2 <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) comb <- unionize(bisc1, bisc2)
shuf_bed <- system.file("extdata", "MCF7_Cunha_chr11p15_shuffled.bed.gz", package="biscuiteer") orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") shuf_vcf <- system.file("extdata", "MCF7_Cunha_shuffled_header_only.vcf.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc1 <- readBiscuit(BEDfile = shuf_bed, VCFfile = shuf_vcf, merged = FALSE) bisc2 <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) comb <- unionize(bisc1, bisc2)
See Horvath, Genome Biology, 2013 for more information
WGBSage( bsseq, model = c("horvath", "horvathshrunk", "hannum", "skinandblood"), padding = 15, useENSR = FALSE, useHMMI = FALSE, minCovg = 5, impute = FALSE, minSamp = 5, genome = NULL, dropBad = FALSE, ... )
WGBSage( bsseq, model = c("horvath", "horvathshrunk", "hannum", "skinandblood"), padding = 15, useENSR = FALSE, useHMMI = FALSE, minCovg = 5, impute = FALSE, minSamp = 5, genome = NULL, dropBad = FALSE, ... )
bsseq |
A bsseq object (must have assays named |
model |
Which model ("horvath", "horvathshrunk", "hannum", "skinandblood") |
padding |
How many bases +/- to pad the target CpG by (DEFAULT: 15) |
useENSR |
Use ENSEMBL regulatory region bounds instead of CpGs (DEFAULT: FALSE) |
useHMMI |
Use HMM CpG island boundaries instead of padded CpGs (DEFAULT: FALSE) |
minCovg |
Minimum regional read coverage desired to estimate 5mC (DEFAULT: 5) |
impute |
Use k-NN imputation to fill in low-coverage regions? (DEFAULT: FALSE) |
minSamp |
Minimum number of non-NA samples to perform imputation (DEFAULT: 5) |
genome |
Genome to use as reference, if no genome(bsseq) is set (DEFAULT: NULL) |
dropBad |
Drop rows/cols with > half missing pre-imputation? (DEFAULT: FALSE) |
... |
Arguments to be passed to impute.knn, such as rng.seed |
Note: the accuracy of the prediction will increase or decrease depending on
how various hyper-parameters are set by the user. This is NOT a hands-off
procedure, and the defaults are only a starting point for exploration. It
will not be uncommon to tune padding
, minCovg
, and minSamp
for each
WGBS or RRBS experiment (and the latter may be impacted by whether dupes are
removed prior to importing data). Consider yourself forewarned. In the near
future we may add support for arbitrary region-coefficient inputs and result
transformation functions, which of course will just make the problems worse.
Also, please cite the appropriate papers for the Epigenetic Clock(s) you use:
For the 'horvath' or 'horvathshrunk' clocks, cite Horvath, Genome Biology 2013. For the 'hannum' clock, cite Hannum et al, Molecular Cell 2013. For the 'skinandblood' clock, cite Horvath et al, Aging 2018.
Last but not least, keep track of the parameters YOU used for YOUR estimates.
The call
element in the returned list of results is for this exact purpose.
If you need recover the GRanges object used to average(or impute) DNAme
values for the model, try granges(result$methcoefs)
on a result. The
methylation fraction and coefficients for each region can be found in the
GRanges object, result$methcoefs, where each sample has a corresponding
column with the methylation fraction and the coefficients have their own
column titled "coefs". Additionally, the age estimates are stored in
result$age (named, in case dropBad == TRUE).
A list with call, methylation estimates, coefs, age estimates
shuf_bed <- system.file("extdata", "MCF7_Cunha_chr11p15_shuffled.bed.gz", package="biscuiteer") orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") shuf_vcf <- system.file("extdata", "MCF7_Cunha_shuffled_header_only.vcf.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc1 <- readBiscuit(BEDfile = shuf_bed, VCFfile = shuf_vcf, merged = FALSE) bisc2 <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) comb <- unionize(bisc1, bisc2) ages <- WGBSage(comb, "horvath")
shuf_bed <- system.file("extdata", "MCF7_Cunha_chr11p15_shuffled.bed.gz", package="biscuiteer") orig_bed <- system.file("extdata", "MCF7_Cunha_chr11p15.bed.gz", package="biscuiteer") shuf_vcf <- system.file("extdata", "MCF7_Cunha_shuffled_header_only.vcf.gz", package="biscuiteer") orig_vcf <- system.file("extdata", "MCF7_Cunha_header_only.vcf.gz", package="biscuiteer") bisc1 <- readBiscuit(BEDfile = shuf_bed, VCFfile = shuf_vcf, merged = FALSE) bisc2 <- readBiscuit(BEDfile = orig_bed, VCFfile = orig_vcf, merged = FALSE) comb <- unionize(bisc1, bisc2) ages <- WGBSage(comb, "horvath")
Wrapper for WGBS settings for dmrseq
WGBSeq(bsseq, testCovariate, bpSpan = 1000, ...)
WGBSeq(bsseq, testCovariate, bpSpan = 1000, ...)
bsseq |
A bsseq object |
testCovariate |
The pData column to test on |
bpSpan |
Span of smoother AND 2x max gap in DMR CpGs (DEFAULT: 1000) |
... |
Other arguments to pass along to dmrseq |
A GRanges object (same as from dmrseq)
data(BS.chr21, package="dmrseq") dat <- BS.chr21 wgbs <- WGBSeq(dat[1:500, ], "CellType", cutoff = 0.05, BPPARAM=BiocParallel::SerialParam())
data(BS.chr21, package="dmrseq") dat <- BS.chr21 wgbs <- WGBSeq(dat[1:500, ], "CellType", cutoff = 0.05, BPPARAM=BiocParallel::SerialParam())