Title: | Integrating Cap Enrichment with Transcript Expression Analysis |
---|---|
Description: | icetea (Integrating Cap Enrichment with Transcript Expression Analysis) provides functions for end-to-end analysis of multiple 5'-profiling methods such as CAGE, RAMPAGE and MAPCap, beginning from raw reads to detection of transcription start sites using replicates. It also allows performing differential TSS detection between group of samples, therefore, integrating the mRNA cap enrichment information with transcript expression analysis. |
Authors: | Vivek Bhardwaj [aut, cre] |
Maintainer: | Vivek Bhardwaj <[email protected]> |
License: | GPL-3 + file LICENSE |
Version: | 1.25.0 |
Built: | 2024-11-19 04:43:45 UTC |
Source: | https://github.com/bioc/icetea |
Match BAM headers bw files and get active chromosome list (from restrict) (written by Aaron Lun, 12 Dec 2014, copied and modified here)
activeChrs(bam.files, restrict)
activeChrs(bam.files, restrict)
bam.files |
Character . bam files to check |
restrict |
character. Chromosomes to select |
Vector of selected chromosomes
This function annotates the provided TSS bed file to provide the number of TSS falling within the genomic features from a given TxDB object. In order to break ties between overlapping features, the function ranks the features by preference. By default, the following order is used: fiveUTR > promoter > intron > coding > spliceSite > threeUTR > intergenic. A custom order of feature ranks can also be provided.
annotateTSS( tssBED, txdb, featureRank = c("fiveUTR", "promoter", "intron", "coding", "spliceSite", "threeUTR", "intergenic"), plotValue = "number", outFile = NULL )
annotateTSS( tssBED, txdb, featureRank = c("fiveUTR", "promoter", "intron", "coding", "spliceSite", "threeUTR", "intergenic"), plotValue = "number", outFile = NULL )
tssBED |
A bed file with detected TSS/differential TSS coordinates |
txdb |
A txdb object. |
featureRank |
A vector with features to use for breaking ties, in decending order of preference (highest to lowest), |
plotValue |
What values to plot (choose from "number", "percent" or NULL for no plot) |
outFile |
Output file name. (filename extention would be used to determine type). If outfile not specified, the plot would be retured on the screen |
A data.frame with number of TSS falling into each feature
# load a txdb object library("TxDb.Dmelanogaster.UCSC.dm6.ensGene") seqlevelsStyle(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "ENSEMBL" # limiting the annotation to X chromosome seqlevels(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "X" # annotate a given TSS bed file dir <- system.file("extdata", package = "icetea") tss <- file.path(dir, "testTSS_merged.bed") annotations <- annotateTSS(tssBED = tss, TxDb.Dmelanogaster.UCSC.dm6.ensGene, plotValue = "number", outFile = "TSS_annot.pdf")
# load a txdb object library("TxDb.Dmelanogaster.UCSC.dm6.ensGene") seqlevelsStyle(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "ENSEMBL" # limiting the annotation to X chromosome seqlevels(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "X" # annotate a given TSS bed file dir <- system.file("extdata", package = "icetea") tss <- file.path(dir, "testTSS_merged.bed") annotations <- annotateTSS(tssBED = tss, TxDb.Dmelanogaster.UCSC.dm6.ensGene, plotValue = "number", outFile = "TSS_annot.pdf")
Check capset validity
check_capSet(object)
check_capSet(object)
object |
capset object |
boolean
Demultiplex and tag fastq files using sample barcodes
demultiplexFASTQ(CSobject, outdir, max_mismatch = 0, ncores = 1) ## S4 method for signature 'CapSet' demultiplexFASTQ(CSobject, outdir, max_mismatch = 0, ncores = 1)
demultiplexFASTQ(CSobject, outdir, max_mismatch = 0, ncores = 1) ## S4 method for signature 'CapSet' demultiplexFASTQ(CSobject, outdir, max_mismatch = 0, ncores = 1)
CSobject |
CapSet object created using |
outdir |
character. path to output directory |
max_mismatch |
integer. maximum allowed mismatches in the sample barcode |
ncores |
integrer. No. of cores/threads to use |
de-multiplxed fastq files corresponding to each barcode. The files are written on disk with the corresponding sample names as specified in the CapSet object
# load a previously saved CapSet object cs <- exampleCSobject() # demultiplex allowing one mismatch in sample indexes dir.create("demult_fastq") cs <- demultiplexFASTQ(cs, outdir = "demult_fastq", max_mismatch = 1)
# load a previously saved CapSet object cs <- exampleCSobject() # demultiplex allowing one mismatch in sample indexes dir.create("demult_fastq") cs <- demultiplexFASTQ(cs, outdir = "demult_fastq", max_mismatch = 1)
Detect differentially expressed Transcription Start Sites between two conditions (test)
detectDiffTSS(fit, testGroup, contGroup, TSSfile = NULL, MAplot_fdr = NA) ## S4 method for signature 'DGEGLM' detectDiffTSS(fit, testGroup, contGroup, TSSfile = NULL, MAplot_fdr = NA) ## S4 method for signature 'DESeqDataSet' detectDiffTSS(fit, testGroup, contGroup, MAplot_fdr = NA)
detectDiffTSS(fit, testGroup, contGroup, TSSfile = NULL, MAplot_fdr = NA) ## S4 method for signature 'DGEGLM' detectDiffTSS(fit, testGroup, contGroup, TSSfile = NULL, MAplot_fdr = NA) ## S4 method for signature 'DESeqDataSet' detectDiffTSS(fit, testGroup, contGroup, MAplot_fdr = NA)
fit |
DGEGLM object (output of |
testGroup |
Test group name |
contGroup |
Control group name |
TSSfile |
The TSS .bed file used for |
MAplot_fdr |
FDR threshold to mark differentially expressed TSS in MAplot (NA = Don't make an MAplot) |
A GRanges
object containing p-values of differential expression for each TSS.
# before running this # 1. Create a CapSet object # 2. de-multiplex the fastqs # 3. map them # 4. filter duplicate reads from mapped BAM # 5. detect TSS # 6. fit the diff TSS model. ## Not run: # load a previously saved DGEGLM object from step 5 csfit <- load("diffTSS_fit.Rdata") dir <- system.file("extdata", package = "icetea") # detect differentially expressed TSS between groups (return MA plot) detectDiffTSS(csfit, testGroup = "mut", controlGroup = "wt", tssFile = file.path(dir, "testTSS_merged.bed"), MAplot_fdr = 0.05) ## End(Not run) ## Not run: # load a previously saved DGEGLM object from step 5 csfit <- load("diffTSS_fit.Rdata") dir <- system.file("extdata", package = "icetea") # detect differentially expressed TSS between groups (return MA plot) detectDiffTSS(csfit, testGroup = "mut", controlGroup = "wt", MAplot_fdr = 0.05) ## End(Not run)
# before running this # 1. Create a CapSet object # 2. de-multiplex the fastqs # 3. map them # 4. filter duplicate reads from mapped BAM # 5. detect TSS # 6. fit the diff TSS model. ## Not run: # load a previously saved DGEGLM object from step 5 csfit <- load("diffTSS_fit.Rdata") dir <- system.file("extdata", package = "icetea") # detect differentially expressed TSS between groups (return MA plot) detectDiffTSS(csfit, testGroup = "mut", controlGroup = "wt", tssFile = file.path(dir, "testTSS_merged.bed"), MAplot_fdr = 0.05) ## End(Not run) ## Not run: # load a previously saved DGEGLM object from step 5 csfit <- load("diffTSS_fit.Rdata") dir <- system.file("extdata", package = "icetea") # detect differentially expressed TSS between groups (return MA plot) detectDiffTSS(csfit, testGroup = "mut", controlGroup = "wt", MAplot_fdr = 0.05) ## End(Not run)
Detection of Trancription start sites based on local enrichment
detectTSS( CSobject, groups, outfile_prefix = NULL, windowSize = 10L, sliding = TRUE, foldChange = 2, mergeLength = 1L, restrictChr = NULL, ncores = 1, readPos = "start" ) ## S4 method for signature 'CapSet' detectTSS( CSobject, groups, outfile_prefix = NULL, windowSize = 10L, sliding = TRUE, foldChange = 2, mergeLength = 1L, restrictChr = NULL, ncores = 1, readPos = "start" )
detectTSS( CSobject, groups, outfile_prefix = NULL, windowSize = 10L, sliding = TRUE, foldChange = 2, mergeLength = 1L, restrictChr = NULL, ncores = 1, readPos = "start" ) ## S4 method for signature 'CapSet' detectTSS( CSobject, groups, outfile_prefix = NULL, windowSize = 10L, sliding = TRUE, foldChange = 2, mergeLength = 1L, restrictChr = NULL, ncores = 1, readPos = "start" )
CSobject |
CapSet object created using |
groups |
a character vector that contains group name of the sample, for replicate-based TSS calling (see example) |
outfile_prefix |
Output name prefix for the .Rdata file containing window counts, background counts and filtering statistics calculated during TSS detection. |
windowSize |
Size of the window to bin the genome for TSS detection. By default, a window size of 10 is used for binning the genome, however smaller window sizes can optionally be provided for higher resolution TSS detection. Note that the background size is set to 200x the window size (2kb for 10bp windows) to calculate local enrichment. Subsequently enriched windows are merged, unless the mergeLength is increased. |
sliding |
TRUE/FALSE. Indicating whether or not to use sliding windows. The windows are shifted by length which is half of the specified window length. |
foldChange |
Numeric. A fold change cutoff of local enrichment to detect the TSS. If the samples have good signal enrichment over background (inspect in genome browser), a low cutoff of 2-fold can be used. For samples with low sequencing depth it's also desirable to have a low cutoff of 2-fold. The final "score" of detected TSS is the mean fold-change of all merged windows that passed the foldChange cutoff. TSSs can therefore also be filtered using this score after detectTSS is run. |
mergeLength |
Integer. Merge the windows within this distance that pass the foldChange cutoff. Default (1L) means that only subsequently enriched windows would be merged. |
restrictChr |
Chromosomes to restrict the analysis to. |
ncores |
No. of cores/threads to use |
readPos |
character. position of read to use. Options are "start", "end" and "center". For TSS detection, the "start" of reads are used (default). But center or end might be useful for detecting RNA-binding proteins (in iCLIP-like data) |
.bed files containing TSS position for each group, along with a bed file for consensus (union) TSS sites of all samples.
# before running this # 1. Create a CapSet object # 2. de-multiplex the fastqs # 3. map them # 4. filter duplicate reads from mapped BAM # load a previously saved CapSet object cs <- exampleCSobject() # detect TSS (samples in same group are treated as replicates) cs <- detectTSS(cs, groups = rep(c("wt","mut"), each = 2), outfile_prefix = "testTSS", foldChange = 6, restrictChr = "X", ncores = 1)
# before running this # 1. Create a CapSet object # 2. de-multiplex the fastqs # 3. map them # 4. filter duplicate reads from mapped BAM # load a previously saved CapSet object cs <- exampleCSobject() # detect TSS (samples in same group are treated as replicates) cs <- detectTSS(cs, groups = rep(c("wt","mut"), each = 2), outfile_prefix = "testTSS", foldChange = 6, restrictChr = "X", ncores = 1)
Make DESeq2 or edgeR QC plots
diffQCplots(method, fit, y, designMat)
diffQCplots(method, fit, y, designMat)
method |
one of "DESeq2" or "edgeR" |
fit |
output of fitDiffTSS (if method = "edgeR") |
y |
output of fitDiffTSS (if method = "DESeq2") |
designMat |
design matrix (data frame) |
Sparsity, dispersion and PCA plot (if method = DESeq2), BCV, dispersion and MDS plot (if method = "edgeR")
Create example CapSet object
exampleCSobject(expMethod = "MAPCap")
exampleCSobject(expMethod = "MAPCap")
expMethod |
Which experiment method to use (options : "RAMPAGE", "MAPCap") |
An object of class CapSet
cs <- exampleCSobject(expMethod = "MAPCap")
cs <- exampleCSobject(expMethod = "MAPCap")
Export the detected TSS from CapSet object as .bed files
exportTSS(CSobject, outfile_prefix, pergroup = FALSE, merged = TRUE) ## S4 method for signature 'CapSet' exportTSS(CSobject, outfile_prefix, pergroup = FALSE, merged = TRUE)
exportTSS(CSobject, outfile_prefix, pergroup = FALSE, merged = TRUE) ## S4 method for signature 'CapSet' exportTSS(CSobject, outfile_prefix, pergroup = FALSE, merged = TRUE)
CSobject |
The modified CapSet object after running |
outfile_prefix |
Prefix (with path) for output .bed files |
pergroup |
If TRUE, write output per group of samples |
merged |
If TRUE, write merged bed file (union of all groups) |
.bed file(s) containing detected TSS.
# load a previously saved CapSet object cs <- exampleCSobject() # export tss exportTSS(cs, merged = TRUE, outfile_prefix = "testTSS")
# load a previously saved CapSet object cs <- exampleCSobject() # export tss exportTSS(cs, merged = TRUE, outfile_prefix = "testTSS")
This script considers the read mapping start position and the UMI to determine whether a read is a PCR duplicate. All PCR duplicates are then removed and one entry per read is kept. In case of paired-end reads (MAPCap/RAMPAGE), only one end (R1) is kept after filtering, unless 'keepPairs“ is set to TRUE
filterDuplicates(CSobject, outdir, ncores = 1, keepPairs = FALSE) ## S4 method for signature 'CapSet' filterDuplicates(CSobject, outdir, ncores = 1, keepPairs = FALSE)
filterDuplicates(CSobject, outdir, ncores = 1, keepPairs = FALSE) ## S4 method for signature 'CapSet' filterDuplicates(CSobject, outdir, ncores = 1, keepPairs = FALSE)
CSobject |
an object of class |
outdir |
character. output directory for filtered BAM files |
ncores |
integer. No. of cores to use |
keepPairs |
logical. indicating whether to keep pairs in the paired-end data. (note: the pairs are treated as independent reads during duplicate removal). Also use keepPairs = TRUE for single-end data. |
modified CapSet object with filtering information. Filtered BAM files are saved in 'outdir'.
# before running this # 1. Create a CapSet object # 2. de-multiplex the fastqs # 3. map them # load a previously saved CapSet object cs <- exampleCSobject() # filter duplicate reads from mapped BAM files dir.create("filtered_bam") cs <- filterDuplicates(cs, outdir = "filtered_bam")
# before running this # 1. Create a CapSet object # 2. de-multiplex the fastqs # 3. map them # load a previously saved CapSet object cs <- exampleCSobject() # filter duplicate reads from mapped BAM files dir.create("filtered_bam") cs <- filterDuplicates(cs, outdir = "filtered_bam")
Filter PCR-duplicates from BAM file using internal UMIs
filterDups(bamFile, outFile, keepPairs)
filterDups(bamFile, outFile, keepPairs)
bamFile |
character. Input BAM file |
outFile |
character. Output (filtered) BAM file |
keepPairs |
logical. Keep R2 read? |
Filtered BAM file, after PCR duplicate removal
Detect differentially expressed Transcription Start Sites between two conditions (fit model)
fitDiffTSS( CSobject, TSSfile = NULL, groups, method = "DESeq2", normalization = NULL, normFactors = NULL, outplots = NULL, plotRefSample = NA, ncores = 1 ) ## S4 method for signature 'CapSet' fitDiffTSS( CSobject, TSSfile = NULL, groups, method = "DESeq2", normalization = NULL, normFactors = NULL, outplots = NULL, plotRefSample = NA, ncores = 1 )
fitDiffTSS( CSobject, TSSfile = NULL, groups, method = "DESeq2", normalization = NULL, normFactors = NULL, outplots = NULL, plotRefSample = NA, ncores = 1 ) ## S4 method for signature 'CapSet' fitDiffTSS( CSobject, TSSfile = NULL, groups, method = "DESeq2", normalization = NULL, normFactors = NULL, outplots = NULL, plotRefSample = NA, ncores = 1 )
CSobject |
An object of class |
TSSfile |
A .bed file with TSS positions to test for differential TSS analysis. If left empty, the union of detected TSS present within the provided CSobject would be plotted. |
groups |
Character vector indicating the group into which each sample within the CSobject falls. the groups would be use to create a design matrix. As an example, replicates for one condition could be in the same group. |
method |
Which method to use for differential expression analysis? options are "DESeq2" or "edgeR". If "DESeq2" is chosen, the library size is either estimated via DESeq2 (using "median of ratios") or can be provided via the "normFactors" option below. Setting the "normalization" (below) has no effect in that case. |
normalization |
A character indicating the type of normalization to perform. Options are "windowTMM", "TMM", "RLE", "upperquartile" or NULL (don't compute normalization factors). If "windowTMM" is chosen, the normalization factors are calculated using the TMM method on 10 kb windows of the genome. "TMM" computes TMM normalization using counts from all the evaluated TSSs. If NULL, the external normalization factors can be used (provided using 'normFactors'). |
normFactors |
external normalization factors (from Spike-Ins, for example). |
outplots |
Output pdf filename for plots. If provided, the plots for BCV, dispersion and MDS plot is created and saved in this file. |
plotRefSample |
Name of reference sample to plot for detection of composition bias in the data. Samples could be normalized using one of the provided normalization methods to control for composition bias. |
ncores |
No. of cores/threads to use |
An object of class DGEGLM-class or DESeqDataSet
# before running this # 1. Create a CapSet object # 2. de-multiplex the fastqs # 3. map them # 4. filter duplicate reads from mapped BAM # 5. detect TSS # 6. fit the diffTSS model ## Not run: # load a previously saved CapSet object cs <- exampleCSobject() # count reads on all TSS (union) and fit a model using replicates within groups csfit <- fitDiffTSS(cs, groups = rep(c("wt","mut"), each = 2), normalization = "internal", outplots = NULL, plotRefSample = "embryo1") save(csfit, file = "diffTSS_fit.Rdata") ## End(Not run)
# before running this # 1. Create a CapSet object # 2. de-multiplex the fastqs # 3. map them # 4. filter duplicate reads from mapped BAM # 5. detect TSS # 6. fit the diffTSS model ## Not run: # load a previously saved CapSet object cs <- exampleCSobject() # count reads on all TSS (union) and fit a model using replicates within groups csfit <- fitDiffTSS(cs, groups = rep(c("wt","mut"), each = 2), normalization = "internal", outplots = NULL, plotRefSample = "embryo1") save(csfit, file = "diffTSS_fit.Rdata") ## End(Not run)
Get data to create new ShortReadQ object after barcode trimming
get_newfastq(type, fq_R1, fq_R2)
get_newfastq(type, fq_R1, fq_R2)
type |
character. expType of the CapSet object |
fq_R1 |
character. fastq Read 1 |
fq_R2 |
character. fastq Read 2 |
A list with new R1 and R2 sequence, quality, barcode string and sample id
Get flags to read from bam
getBamFlags(countAll)
getBamFlags(countAll)
countAll |
logical. count all reads? |
bamFlags
Get chromosome bins from BAM files
getChromBins(bamFiles, restrictChr = NULL, binSize)
getChromBins(bamFiles, restrictChr = NULL, binSize)
bamFiles |
Character. bam files |
restrictChr |
character. Chromosomes to select |
binSize |
numeric. Size of bins |
GRanges (bins) for both strands
Get chromosome sliding windows from BAM files
getChromWindows(bamFiles, restrictChr = NULL, binSize, stepSize)
getChromWindows(bamFiles, restrictChr = NULL, binSize, stepSize)
bamFiles |
Character vector (bam files) |
restrictChr |
Chromosomes to select |
binSize |
Size of bins |
stepSize |
Size of window slide |
GRanges (sliding windows) for both strands
Get gene-level counts from TSS data
getGeneCounts( CSobject, transcriptGRL, regionAroundTSS = 500, outfile = NA, ncores = 1 ) ## S4 method for signature 'CapSet' getGeneCounts( CSobject, transcriptGRL, regionAroundTSS = 500, outfile = NA, ncores = 1 )
getGeneCounts( CSobject, transcriptGRL, regionAroundTSS = 500, outfile = NA, ncores = 1 ) ## S4 method for signature 'CapSet' getGeneCounts( CSobject, transcriptGRL, regionAroundTSS = 500, outfile = NA, ncores = 1 )
CSobject |
The |
transcriptGRL |
A GRangesList object containing transcripts, created using transcriptsBy(txdb) |
regionAroundTSS |
integer, indicating how many bases downstream of TSS to count |
outfile |
character. Tab-separated output file name (if required) |
ncores |
integer. No. of cores/threads to use |
data.frame with gene-level counts for all genes in the txdb object
# load a txdb object library("TxDb.Dmelanogaster.UCSC.dm6.ensGene") seqlevelsStyle(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "ENSEMBL" # get transcripts by gene (only X chromsome, for simplicity) seqlevels(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "X" dm6trans <- transcriptsBy(TxDb.Dmelanogaster.UCSC.dm6.ensGene, "gene") # load a CapSet object cs <- exampleCSobject() # get gene counts, counting reads around 500 bp of the TSS gcounts <- getGeneCounts(cs, dm6trans)
# load a txdb object library("TxDb.Dmelanogaster.UCSC.dm6.ensGene") seqlevelsStyle(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "ENSEMBL" # get transcripts by gene (only X chromsome, for simplicity) seqlevels(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "X" dm6trans <- transcriptsBy(TxDb.Dmelanogaster.UCSC.dm6.ensGene, "gene") # load a CapSet object cs <- exampleCSobject() # get gene counts, counting reads around 500 bp of the TSS gcounts <- getGeneCounts(cs, dm6trans)
Get platform-specific multicore params
getMCparams(cores)
getMCparams(cores)
cores |
integer. No. of cores to use. |
BPPARAM object
Calculate normalization factors from CapSet object
getNormFactors(CSobject, features, method = "TMM", ...) ## S4 method for signature 'CapSet' getNormFactors(CSobject, features, method = "TMM", ...)
getNormFactors(CSobject, features, method = "TMM", ...) ## S4 method for signature 'CapSet' getNormFactors(CSobject, features, method = "TMM", ...)
CSobject |
An object of class |
features |
A GRanges-class.object to count the reads on. |
method |
Method to use for normalization. Options : "TMM","RLE","upperquartile","none" |
... |
Additional arguments passed to calcNormFactors |
Numeric vector of calculated normalization factors.
# load a txdb object library("TxDb.Dmelanogaster.UCSC.dm6.ensGene") seqlevelsStyle(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "ENSEMBL" # get genes (only X chromsome, for simplicity) seqlevels(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "X" dm6genes <- genes(TxDb.Dmelanogaster.UCSC.dm6.ensGene) # get norm factors by counting reads on genes cs <- exampleCSobject() normfacs <- getNormFactors(cs, dm6genes, method = "RLE")
# load a txdb object library("TxDb.Dmelanogaster.UCSC.dm6.ensGene") seqlevelsStyle(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "ENSEMBL" # get genes (only X chromsome, for simplicity) seqlevels(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "X" dm6genes <- genes(TxDb.Dmelanogaster.UCSC.dm6.ensGene) # get norm factors by counting reads on genes cs <- exampleCSobject() normfacs <- getNormFactors(cs, dm6genes, method = "RLE")
Assign feature ranks on a VariantAnnotation output
getranks(x, rank_vec)
getranks(x, rank_vec)
x |
output from VariantAnnotation |
rank_vec |
the pre-set vector of ranks |
A vector of ranks of length = length of input features
Map the data from 5' profiling techniques
mapCaps( CSobject, genomeIndex, outdir, externalGTF = NULL, ncores = 1, logfile = NULL ) ## S4 method for signature 'CapSet' mapCaps( CSobject, genomeIndex, outdir, externalGTF = NULL, ncores = 1, logfile = NULL )
mapCaps( CSobject, genomeIndex, outdir, externalGTF = NULL, ncores = 1, logfile = NULL ) ## S4 method for signature 'CapSet' mapCaps( CSobject, genomeIndex, outdir, externalGTF = NULL, ncores = 1, logfile = NULL )
CSobject |
An object of class |
genomeIndex |
character. Path to the Subread index file. Should end with the basename of the index. |
outdir |
character. Output directory path |
externalGTF |
character. provide external annotation file in 'GTF' format , if present to increase alignment accuracy |
ncores |
integer. Number of cores/threads to use for mapping. |
logfile |
character. A log file to write the processing message. |
modified CapSet object with mapping information. Mapped and sorted BAM files are saved in 'outdir'.
## Not run: # before mapping : # 1. Create a CapSet object # 2. de-multiplex the fastqs # load a previously saved CapSet object cs <- exampleCSobject() # map the data (not available on windows) library(Rsubread) dir.create("bam") buildindex(basename = "dm6", reference = "/path/to/dm6_genome.fa") cs <- mapCaps(cs, genomeIndex = "dm6", outdir = "bam", nthreads = 10) ## End(Not run)
## Not run: # before mapping : # 1. Create a CapSet object # 2. de-multiplex the fastqs # load a previously saved CapSet object cs <- exampleCSobject() # map the data (not available on windows) library(Rsubread) dir.create("bam") buildindex(basename = "dm6", reference = "/path/to/dm6_genome.fa") cs <- mapCaps(cs, genomeIndex = "dm6", outdir = "bam", nthreads = 10) ## End(Not run)
The function creates an object of class 'CapSet', used for the TSS analysis. A CapSet object can be created using the the raw, multiplxed fastq files along with the list of sample indexes and corresponding sample names. In case the files are already de-multiplexed or mapped, the CapSet object can also be created using the path to demultiplexed fastq/mapped or filtered BAM files, along with corresponding sample names. In these cases statistics and operations for the missing files would not be possible.
newCapSet( expMethod, fastq_R1 = NULL, fastq_R2 = NULL, idxList = NULL, sampleNames, demult_R1 = NA, demult_R2 = NA, mapped_file = NA, filtered_file = NA, paired_end = TRUE )
newCapSet( expMethod, fastq_R1 = NULL, fastq_R2 = NULL, idxList = NULL, sampleNames, demult_R1 = NA, demult_R2 = NA, mapped_file = NA, filtered_file = NA, paired_end = TRUE )
expMethod |
experiment method ('CAGE', 'RAMPAGE' or 'MAPCap') |
fastq_R1 |
path for Read R1 (or file path for single end reads) |
fastq_R2 |
path for Read R2 (for paired end reads) |
idxList |
a vector of index sequences (for demultiplexing) |
sampleNames |
(required) a vector of sample names corresponding to the provided files |
demult_R1 |
a vector of file paths for demultiplexed R1 reads |
demult_R2 |
a vector of file paths for demultiplexed R2 reads |
mapped_file |
a vector of file paths for mapped BAM files. |
filtered_file |
a vector of file paths for de-duplicated BAM files. |
paired_end |
logical, indiciting whether the data is paired end |
An object of class CapSet
fastqType
Type of fastq ('single' or 'paired')
fastq_R1
Path to R1 fastq
fastq_R2
Path to R1 fastq (for paired-end data)
expMethod
Name of protocol (RAMPGE or MAPCap)
sampleInfo
A DataFrame object created using information from
newCapSet
function
tss_detected
A GRangesList object of detected TSS
# list of barcode IDs idxlist <- c("CAAGTG", "TTAGCC", "GTGGAA", "TGTGAG") dir <- system.file("extdata", package="icetea") # corresponding sample names fnames <- c("embryo1", "embryo2", "embryo3", "embryo4") ## CapSet object from raw (multiplexed) fastq files cs <- newCapSet(expMethod = 'MAPCap', fastq_R1 = file.path(dir, 'mapcap_test_R1.fastq.gz'), fastq_R2 = file.path(dir, 'mapcap_test_R2.fastq.gz'), idxList = idxlist, sampleNames = fnames) ## CapSet object from mapped BAM files bams <- list.files(file.path(dir, 'bam'), pattern = '.bam$', full.names = TRUE) cs <- newCapSet(expMethod = 'MAPCap', mapped_file = bams, sampleNames = fnames)
# list of barcode IDs idxlist <- c("CAAGTG", "TTAGCC", "GTGGAA", "TGTGAG") dir <- system.file("extdata", package="icetea") # corresponding sample names fnames <- c("embryo1", "embryo2", "embryo3", "embryo4") ## CapSet object from raw (multiplexed) fastq files cs <- newCapSet(expMethod = 'MAPCap', fastq_R1 = file.path(dir, 'mapcap_test_R1.fastq.gz'), fastq_R2 = file.path(dir, 'mapcap_test_R2.fastq.gz'), idxList = idxlist, sampleNames = fnames) ## CapSet object from mapped BAM files bams <- list.files(file.path(dir, 'bam'), pattern = '.bam$', full.names = TRUE) cs <- newCapSet(expMethod = 'MAPCap', mapped_file = bams, sampleNames = fnames)
Count the number of reads in a given GRanges
numReadsInBed(regions, bams = NA, countall = FALSE)
numReadsInBed(regions, bams = NA, countall = FALSE)
regions |
The GRanges object to count reads in. |
bams |
character. path to bam files from where the reads have to be counted |
countall |
logical. whether to keep both reads of paired-end data |
Total counts within given ranges per BAM file.
Plotprecision background script
plotPrecision(ref, tssData, distCut)
plotPrecision(ref, tssData, distCut)
ref |
GRanges. reference ranges to compare the precision with. |
tssData |
GRangesList object with TSS detected per sample |
distCut |
integer. max distance cutoff |
ggplot object
Plot read statistics from the CapSet object
plotReadStats( CSobject, plotType = "dodge", plotValue = "numbers", outFile = NULL ) ## S4 method for signature 'CapSet' plotReadStats( CSobject, plotType = "dodge", plotValue = "numbers", outFile = NULL )
plotReadStats( CSobject, plotType = "dodge", plotValue = "numbers", outFile = NULL ) ## S4 method for signature 'CapSet' plotReadStats( CSobject, plotType = "dodge", plotValue = "numbers", outFile = NULL )
CSobject |
The |
plotType |
character. The type of plot to make. Choose from "stack" or "dodge" for either a stacked barchart, or a bar chart with "dodged" positions (analogous to ggplot) |
plotValue |
character. What values to plot. Choose from "numbers" or "proportions". If "proportions" is selected, the proportion of reads w.r.t total demultiplexed reads per sample would be plotted |
outFile |
character. Output file name. (filename extention would be used to determine type). If outfile not specified, the plot would be retured on the screen |
A ggplot object, or a file. Plot showing the number/proportion of reads in each category, per sample
# load a previously saved CapSet object cs <- exampleCSobject() plotReadStats(cs, plotType = "dodge", plotValue = "numbers", outFile = "test_numbers.pdf")
# load a previously saved CapSet object cs <- exampleCSobject() plotReadStats(cs, plotType = "dodge", plotValue = "numbers", outFile = "test_numbers.pdf")
Plot precision of TSS detection from multiple samples (bed files) with respect to a given reference annotation.
Plot precision of TSS detection from multiple samples present within a
CapSet
object, with respect to a given reference annotation.
plotTSSprecision( reference, detectedTSS, distanceCutoff = 500, outFile = NULL, ... ) ## S4 method for signature 'GRanges,character' plotTSSprecision( reference, detectedTSS, distanceCutoff = 500, outFile = NULL, sampleNames ) ## S4 method for signature 'GRanges,CapSet' plotTSSprecision( reference, detectedTSS, distanceCutoff = 500, outFile = NULL, ... )
plotTSSprecision( reference, detectedTSS, distanceCutoff = 500, outFile = NULL, ... ) ## S4 method for signature 'GRanges,character' plotTSSprecision( reference, detectedTSS, distanceCutoff = 500, outFile = NULL, sampleNames ) ## S4 method for signature 'GRanges,CapSet' plotTSSprecision( reference, detectedTSS, distanceCutoff = 500, outFile = NULL, ... )
reference |
Reference Transcrips/Genes as a |
detectedTSS |
Either a CapSet object with TSS information (after running |
distanceCutoff |
integer. Maximum distance (in base pairs) from reference TSS to plot |
outFile |
character. Output file name (filename extention would be used to determine type) If outfile not specified, the plot would be returned on the screen |
... |
Additional arguments |
sampleNames |
character. Labels for input samples (in the same order as the input bed files) |
A ggplot object, or a file. Plot showing perent of TSS detected per sample with respect to their cumulative distance to TSS of the provided reference
# load a txdb object suppressMessages(library("TxDb.Dmelanogaster.UCSC.dm6.ensGene")) seqlevelsStyle(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "ENSEMBL" transcripts <- transcripts(TxDb.Dmelanogaster.UCSC.dm6.ensGene) # Plotting the precision using a pre computed set of TSS (.bed files) : tssfile <- system.file("extdata", "testTSS_merged.bed", package = "icetea") plotTSSprecision(reference = transcripts, detectedTSS = tssfile, sampleNames = "testTSS", distanceCutoff = 500, outFile = "TSS_detection_precision.png") ## Plotting the precision using a CapSet object : library("TxDb.Dmelanogaster.UCSC.dm6.ensGene") seqlevelsStyle(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "ENSEMBL" # only use chrX to make the analysis faster seqlevels(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "X" transcripts <- transcripts(TxDb.Dmelanogaster.UCSC.dm6.ensGene) # load a previously saved CapSet object cs <- exampleCSobject() # plot plotTSSprecision(reference = transcripts, detectedTSS = cs, outFile = "TSS_detection_precision.png")
# load a txdb object suppressMessages(library("TxDb.Dmelanogaster.UCSC.dm6.ensGene")) seqlevelsStyle(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "ENSEMBL" transcripts <- transcripts(TxDb.Dmelanogaster.UCSC.dm6.ensGene) # Plotting the precision using a pre computed set of TSS (.bed files) : tssfile <- system.file("extdata", "testTSS_merged.bed", package = "icetea") plotTSSprecision(reference = transcripts, detectedTSS = tssfile, sampleNames = "testTSS", distanceCutoff = 500, outFile = "TSS_detection_precision.png") ## Plotting the precision using a CapSet object : library("TxDb.Dmelanogaster.UCSC.dm6.ensGene") seqlevelsStyle(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "ENSEMBL" # only use chrX to make the analysis faster seqlevels(TxDb.Dmelanogaster.UCSC.dm6.ensGene) <- "X" transcripts <- transcripts(TxDb.Dmelanogaster.UCSC.dm6.ensGene) # load a previously saved CapSet object cs <- exampleCSobject() # plot plotTSSprecision(reference = transcripts, detectedTSS = cs, outFile = "TSS_detection_precision.png")
preprocess reads to count only 3' overlaps
readsTo3p(reads, width = 1, fix = "end")
readsTo3p(reads, width = 1, fix = "end")
reads |
GAlignment object to resize |
width |
integer. New read length |
fix |
character. 'Start' for 5' |
Resized reads as GRanges
preprocess reads to count only 5' overlaps
readsTo5p(reads, width = 1, fix = "start")
readsTo5p(reads, width = 1, fix = "start")
reads |
GAlignment object to resize |
width |
integer. New read length |
fix |
character. 'Start' for 5' |
Resized reads as GRanges
preprocess reads to count only center overlaps
readsToCenter(reads, width = 1, fix = "center")
readsToCenter(reads, width = 1, fix = "center")
reads |
GAlignment object to resize |
width |
integer. New read length |
fix |
character. 'Start' for 5' |
Resized reads as GRanges
Retrieve and replace sample information of a CapSet object
sampleInfo(object, ...) sampleInfo(object, ...) <- value ## S4 method for signature 'CapSet' sampleInfo(object) ## S4 replacement method for signature 'CapSet' sampleInfo(object) <- value
sampleInfo(object, ...) sampleInfo(object, ...) <- value ## S4 method for signature 'CapSet' sampleInfo(object) ## S4 replacement method for signature 'CapSet' sampleInfo(object) <- value
object |
The |
... |
Additional options |
value |
Replacement DataFrame object |
sample information data.frame
# load a previously saved CapSet object cs <- exampleCSobject() # get sampleinfo si <- sampleInfo(cs) # modify si$samples <- paste0("sample_", seq_along(1:nrow(si)) ) # replace sampleInfo(cs) <- si
# load a previously saved CapSet object cs <- exampleCSobject() # get sampleinfo si <- sampleInfo(cs) # modify si$samples <- paste0("sample_", seq_along(1:nrow(si)) ) # replace sampleInfo(cs) <- si
Split paired-end fastq by barcodes
split_fastq( expType, idx_name, outfile_R1, outfile_R2, fastq_R1, fastq_R2, max_mismatch )
split_fastq( expType, idx_name, outfile_R1, outfile_R2, fastq_R1, fastq_R2, max_mismatch )
expType |
character. experiment type (RAMPAGE, MAPCap or CAGE) |
idx_name |
character. barcode ID |
outfile_R1 |
character. output fastq file : Read 1 |
outfile_R2 |
character. output fastq file : Read 2 |
fastq_R1 |
character. input fastq file : Read 1 |
fastq_R2 |
character. input fastq file : Read 2 |
max_mismatch |
integer. max allowed mismatches |
kept reads corresponding to each barcode.
Split the composite BAM file using internal indexes (MAPCap)
splitBAM_byIndex( bamFile, index_list, outfile_list, max_mismatch = 0, ncores = 1 )
splitBAM_byIndex( bamFile, index_list, outfile_list, max_mismatch = 0, ncores = 1 )
bamFile |
character. Path to a mapped BAM file |
index_list |
character. A list of indexes for splitting |
outfile_list |
character. A list of output file names (with order corresponding to that of index_list) |
max_mismatch |
integer. No. of mismatches allowed in index (maxium 1 recommended) |
ncores |
integer. Number of cores to use for parallel processing |
Filtered files
bam <- system.file("extdata", "bam/embryo1.bam", package = "icetea") splitBAM_byIndex(bamFile = bam, index_list = c("CAAGTG", "CAAGTT"), outfile_list = c("test_filt1.bam","test_filt2.bam"), ncores = 1)
bam <- system.file("extdata", "bam/embryo1.bam", package = "icetea") splitBAM_byIndex(bamFile = bam, index_list = c("CAAGTG", "CAAGTT"), outfile_list = c("test_filt1.bam","test_filt2.bam"), ncores = 1)
Split the composite BAM file using replicate indexes (MAPCap data)
splitBAM_byRepindex(bamFile, outfile_prefix, ncores = 1)
splitBAM_byRepindex(bamFile, outfile_prefix, ncores = 1)
bamFile |
character. Path to a mapped BAM file |
outfile_prefix |
character. prefix for output file (replicates IDs will be added as RR/YY) |
ncores |
integer. Number of cores to use for parallel processing |
Filtered files by replicate Index
bam <- system.file("extdata", "bam/embryo1.bam", package = "icetea") splitBAM_byRepindex(bamFile = bam, outfile_prefix = "testSplit", ncores = 1)
bam <- system.file("extdata", "bam/embryo1.bam", package = "icetea") splitBAM_byRepindex(bamFile = bam, outfile_prefix = "testSplit", ncores = 1)
Get features with the best rank for each TSS
splitranks(x)
splitranks(x)
x |
output of getranks |
A data frame with counts
Perform stranded Bin counts
strandBinCounts( bam.files, restrictChrs, bam_param, bp_param, window_size, sliding = FALSE, func )
strandBinCounts( bam.files, restrictChrs, bam_param, bp_param, window_size, sliding = FALSE, func )
bam.files |
character vector. BAM files to use |
restrictChrs |
character vector. chromosomes to use |
bam_param |
ScanBAMParams |
bp_param |
BPPARAM |
window_size |
integer. size of window to use |
sliding |
logical. perform sliding window counts? |
func |
function to preprocess reads |
RangedSE object with forward and reverse strand counts