| Title: | Identification of putative mammalian orthologs to given enhancer |
|---|---|
| Description: | Get ENCODE data of enhancer region via H3K4me1 peaks and search homolog regions for given sequences. The candidates of enhancer homolog regions can be filtered by distance to target TSS. The top candidates from human and mouse will be aligned to each other and then exported as multiple alignments with given enhancer. |
| Authors: | Jianhong Ou [aut, cre] (ORCID: <https://orcid.org/0000-0002-8652-2488>), Valentina Cigliola [dtc], Kenneth Poss [fnd] |
| Maintainer: | Jianhong Ou <[email protected]> |
| License: | GPL (>= 2) |
| Version: | 1.19.0 |
| Built: | 2026-05-28 15:06:00 UTC |
| Source: | https://github.com/bioc/enhancerHomologSearch |
Do pairwise alignment for query enhancer to target genome
alignment( query, subject, method = c("ClustalW", "Muscle"), cluster = c("nj", "upgma", "upgmamax", "upgmamin", "upgmb"), substitutionMatrix = c("iub", "clustalw"), gapOpening = ifelse(method[1] == "ClustalW", 15, 400), gapExtension = ifelse(method[1] == "ClustalW", 6.66, 0), maxiters = ifelse(method[1] == "ClustalW", 3, 16), order = c("aligned", "input"), ... )alignment( query, subject, method = c("ClustalW", "Muscle"), cluster = c("nj", "upgma", "upgmamax", "upgmamin", "upgmb"), substitutionMatrix = c("iub", "clustalw"), gapOpening = ifelse(method[1] == "ClustalW", 15, 400), gapExtension = ifelse(method[1] == "ClustalW", 6.66, 0), maxiters = ifelse(method[1] == "ClustalW", 3, 16), order = c("aligned", "input"), ... )
query |
An object of DNAStringSet to represent enhancer |
subject |
An list of objects of Enhancers. |
method |
specifies the multiple sequence alignment to be used; currently, "ClustalW", and "Muscle" are supported. Default is "Muscle" |
cluster |
The clustering method which should be used. Possible values are "nj" (default) and "upgma". In the original ClustalW implementation, this parameter is called clustering. |
substitutionMatrix |
substitution matrix for scoring matches and mismatches; The valid choices for this parameter are "iub" and "clustalw". In the original ClustalW implementation, this parameter is called matrix. |
gapOpening |
gap opening penalty; the default is 400 for DNA sequences and 420 for RNA sequences. The default for amino acid sequences depends on the profile score settings: for the setting le=TRUE, the default is 2.9, for sp=TRUE, the default is 1,439, and for sv=TRUE, the default is 300. Note that these defaults may not be suitable if custom substitution matrices are being used. In such a case, a sensible choice of gap penalties that fits well to the substitution matrix must be made. |
gapExtension |
gap extension penalty; the default is 0. |
maxiters |
maximum number of iterations; the default is 16. |
order |
how the sequences should be ordered in the output object; if "aligned" is chosen, the sequences are ordered in the way the multiple sequence alignment algorithm orders them. If "input" is chosen, the sequences in the output object are ordered in the same way as the input sequences. |
... |
Parameters can be used by Muscle, or ClustalW. |
An object of Enhancers.
library(BSgenome.Hsapiens.UCSC.hg38) library(BSgenome.Mmusculus.UCSC.mm10) library(BSgenome.Drerio.UCSC.danRer10) LEN <- GRanges("chr4", IRanges(19050041, 19051709)) seqEN <- getSeq(BSgenome.Drerio.UCSC.danRer10, LEN) aln_hs <- readRDS(system.file("extdata", "aln_hs.rds", package="enhancerHomologSearch")) genome(aln_hs) <- Hsapiens aln_mm <- readRDS(system.file("extdata", "aln_mm.rds", package="enhancerHomologSearch")) genome(aln_mm) <- Mmusculus al <- alignment(seqEN, list(human=aln_hs, mouse=aln_mm), method="ClustalW", order="input")library(BSgenome.Hsapiens.UCSC.hg38) library(BSgenome.Mmusculus.UCSC.mm10) library(BSgenome.Drerio.UCSC.danRer10) LEN <- GRanges("chr4", IRanges(19050041, 19051709)) seqEN <- getSeq(BSgenome.Drerio.UCSC.danRer10, LEN) aln_hs <- readRDS(system.file("extdata", "aln_hs.rds", package="enhancerHomologSearch")) genome(aln_hs) <- Hsapiens aln_mm <- readRDS(system.file("extdata", "aln_mm.rds", package="enhancerHomologSearch")) genome(aln_mm) <- Mmusculus al <- alignment(seqEN, list(human=aln_hs, mouse=aln_mm), method="ClustalW", order="input")
Do pairwise alignment for query enhancer to target genome
alignmentOne(query, subject, block = 1000, bpparam = bpparam(), ...)alignmentOne(query, subject, block = 1000, bpparam = bpparam(), ...)
query |
An object of DNAStringSet to represent enhancer |
subject |
Output of getENCODEdata. An object of Enhancers |
block |
The size of sequences to do alignment. Increase the size will increase the memory cost. Default 1000. |
bpparam |
BiocParallel parameters. |
... |
not used. |
An object of Enhancers.
library(BiocParallel) bpparam <- MulticoreParam(workers = 1, tasks=200, progressbar=TRUE) library(BSgenome.Hsapiens.UCSC.hg38) peaks <- GRanges("chr1", IRanges(seq(5000, 50000, by=1000), width=1000)) peaks$id <- paste(seq_along(peaks), 1, sep="_") subj <- Enhancers(genome=Hsapiens, peaks=peaks) q <- getSeq(Hsapiens, GRanges("chr1", IRanges(90000, width=1000))) ao <- alignmentOne(q, subj, bpparam=bpparam)library(BiocParallel) bpparam <- MulticoreParam(workers = 1, tasks=200, progressbar=TRUE) library(BSgenome.Hsapiens.UCSC.hg38) peaks <- GRanges("chr1", IRanges(seq(5000, 50000, by=1000), width=1000)) peaks$id <- paste(seq_along(peaks), 1, sep="_") subj <- Enhancers(genome=Hsapiens, peaks=peaks) q <- getSeq(Hsapiens, GRanges("chr1", IRanges(90000, width=1000))) ao <- alignmentOne(q, subj, bpparam=bpparam)
Print the conserved motifs in the alignments
conservedMotifs( aln, aln_list, PWMs, queryGenome, background = "genome", ..., output_folder, format = c("txt", "html") )conservedMotifs( aln, aln_list, PWMs, queryGenome, background = "genome", ..., output_folder, format = c("txt", "html") )
aln |
alignment of multiple DNAs. Output of alignment function. |
aln_list |
The list of output of searchTFBPS such as for human and mouse. |
PWMs |
The Position Weight Matrix list represented as a numeric matrix. Object of PWMatrixList or PFMatrixList. |
queryGenome |
An object of BSgenome for query enhancer. |
background |
Background nucleotide frequencies. Default is "genome". Refer matchMotifs for details. |
... |
Other parameters can be passed to to matchMotifs. |
output_folder |
Output folder name. |
format |
The format of output files with motif match positions. Available formats are 'txt' and 'html'. Default is 'txt'. |
A list of XStringViews
library(BSgenome.Hsapiens.UCSC.hg38) library(BSgenome.Mmusculus.UCSC.mm10) library(BSgenome.Drerio.UCSC.danRer10) LEN <- GRanges("chr4", IRanges(19050041, 19051709)) seqEN <- getSeq(BSgenome.Drerio.UCSC.danRer10, LEN) aln_hs <- readRDS(system.file("extdata", "aln_hs.rds", package="enhancerHomologSearch")) genome(aln_hs) <- Hsapiens aln_mm <- readRDS(system.file("extdata", "aln_mm.rds", package="enhancerHomologSearch")) genome(aln_mm) <- Mmusculus al <- alignment(seqEN, list(human=aln_hs, mouse=aln_mm), method="ClustalW", order="input") data(motifs) conservedMotifs(al[[1]], list(human=aln_hs, mouse=aln_mm), motifs[["dist60"]], Drerio)library(BSgenome.Hsapiens.UCSC.hg38) library(BSgenome.Mmusculus.UCSC.mm10) library(BSgenome.Drerio.UCSC.danRer10) LEN <- GRanges("chr4", IRanges(19050041, 19051709)) seqEN <- getSeq(BSgenome.Drerio.UCSC.danRer10, LEN) aln_hs <- readRDS(system.file("extdata", "aln_hs.rds", package="enhancerHomologSearch")) genome(aln_hs) <- Hsapiens aln_mm <- readRDS(system.file("extdata", "aln_mm.rds", package="enhancerHomologSearch")) genome(aln_mm) <- Mmusculus al <- alignment(seqEN, list(human=aln_hs, mouse=aln_mm), method="ClustalW", order="input") data(motifs) conservedMotifs(al[[1]], list(human=aln_hs, mouse=aln_mm), motifs[["dist60"]], Drerio)
"Enhancers"
An object of class "Enhancers" represents the output of function getENCODEdata, which includes the sequences of enhancers and their genomic coordinates.
Enhancers(genome, peaks, TFBP, TFBP0) ## S4 method for signature 'Enhancers' x$name ## S4 replacement method for signature 'Enhancers' x$name <- value ## S4 method for signature 'Enhancers,ANY' distance(x) ## S4 replacement method for signature 'Enhancers' distance(x) <- value ## S4 method for signature 'Enhancers' tfbp(x) ## S4 method for signature 'Enhancers' query_tfbp(x) ## S4 method for signature 'Enhancers' getSeq(x, ...) ## S4 method for signature 'Enhancers,ANY' subsetByOverlaps( x, ranges, maxgap = -1L, minoverlap = 0L, type = c("any", "start", "end", "within", "equal"), invert = FALSE, ... ) ## S4 method for signature 'Enhancers' subset(x, ...) ## S4 method for signature 'Enhancers' seqinfo(x) ## S4 method for signature 'Enhancers' genome(x) ## S4 replacement method for signature 'Enhancers' genome(x) <- value ## S4 method for signature 'Enhancers' peaks(x) ## S4 replacement method for signature 'Enhancers' peaks(x) <- value ## S4 method for signature 'Enhancers' show(object)Enhancers(genome, peaks, TFBP, TFBP0) ## S4 method for signature 'Enhancers' x$name ## S4 replacement method for signature 'Enhancers' x$name <- value ## S4 method for signature 'Enhancers,ANY' distance(x) ## S4 replacement method for signature 'Enhancers' distance(x) <- value ## S4 method for signature 'Enhancers' tfbp(x) ## S4 method for signature 'Enhancers' query_tfbp(x) ## S4 method for signature 'Enhancers' getSeq(x, ...) ## S4 method for signature 'Enhancers,ANY' subsetByOverlaps( x, ranges, maxgap = -1L, minoverlap = 0L, type = c("any", "start", "end", "within", "equal"), invert = FALSE, ... ) ## S4 method for signature 'Enhancers' subset(x, ...) ## S4 method for signature 'Enhancers' seqinfo(x) ## S4 method for signature 'Enhancers' genome(x) ## S4 replacement method for signature 'Enhancers' genome(x) <- value ## S4 method for signature 'Enhancers' peaks(x) ## S4 replacement method for signature 'Enhancers' peaks(x) <- value ## S4 method for signature 'Enhancers' show(object)
genome |
An object of BSgenome. |
peaks |
An object of GRanges. |
TFBP |
An object of lgCMatrix. |
TFBP0 |
An vector of logical.
|
x |
An object of |
name |
Slot name. |
value |
The values. |
... |
parameters can be passed to upstream functions. |
ranges, maxgap, minoverlap, type, invert
|
parameters used by subsetByOverlaps |
object |
An object of |
An object of Enhancers.
genomeAn object of BSgenome.
peaksAn object of GRanges.
TFBPAn object of lgCMatrix.
TFBP0An vector of logical.
Enhancers()Enhancers()
Query enhancer peak and extract sequences from ENCODE
getENCODEdata( genome, markers = "H3K4me1", window_size = 1000L, step = 50L, output = c("Enhancer", "raw_peaks"), ... )getENCODEdata( genome, markers = "H3K4me1", window_size = 1000L, step = 50L, output = c("Enhancer", "raw_peaks"), ... )
genome |
An object of BSgenome. |
markers |
Enhancer markers. Default 'H3K4me1'. For active enhancer, it can be set as c('H3K4me1', 'H3K27ac'). If markers is a 'GRanges' object, the function will skip the download step. |
window_size, step
|
The size of windows and steps to split the peaks into small pieces. These parameter is used because the width of histone marker peaks are different sizes. Break the peaks into small pieces can increase the matching score and align the matching for different peaks into same size. The window_size is also be used for overlapping detection of multiple histone markers. |
output |
Output format. If it is 'raw_peaks', the raw peaks list will be returned. Otherwise, the Enhancer object will be returned. |
... |
Parameters can be passed to queryEncode |
An object of Enhancers with genome, and peaks. The peaks is an object of GRanges. The genome is an object of BSgenome.
library(BSgenome.Hsapiens.UCSC.hg38) hs <- getENCODEdata(genome=Hsapiens, partialMatch=c(biosample_summary="spinal cord"))library(BSgenome.Hsapiens.UCSC.hg38) hs <- getENCODEdata(genome=Hsapiens, partialMatch=c(biosample_summary="spinal cord"))
The data were extracted from MotifDb package (v 1.34.0) and grouped by motifStack package (v 1.37.2). The data were packaged as PFMatrixList object by TFBSTools (v 1.30.0)
data(motifs)data(motifs)
a list of PFMatrixList. The names of the list is the grouop distance.
MotifDb package. Source code for the data generation is in extdata folder
data(motifs) names(motifs) motifs[[1]]data(motifs) names(motifs) motifs[[1]]
Search ENCODE data by querying the ENCODE Portal using its REST API.
queryEncode( exactMatch, partialMatch = character(0), API_url = "https://www.encodeproject.org/search/", ... )queryEncode( exactMatch, partialMatch = character(0), API_url = "https://www.encodeproject.org/search/", ... )
exactMatch |
character. Exact-match keywords refer to search results that perfectly match all the keywords in the search query, exactly as entered. It is a named character vector. The names will be the keys and characters will be the values for search. |
partialMatch |
character. Partial-match refer to search results that contain the search query. It is a named character vector. The names will be the keys and characters will be the values for search. The value starting from '!' indicates logical negation(NOT). The value starting from '>', '>=', '<', '==', '<=' indicates string comparison. |
API_url |
character. The ENCODE REST API url. |
... |
Not used. |
A list of search results
res <- queryEncode( exactMatch=c( target.label="H3K4me1", replicates.library.biosample.donor.organism.scientific_name="Homo sapiens", assembly="GRCh38", assay_term_name="ChIP-seq"), partialMatch=c(biosample_summary="heart"))res <- queryEncode( exactMatch=c( target.label="H3K4me1", replicates.library.biosample.donor.organism.scientific_name="Homo sapiens", assembly="GRCh38", assay_term_name="ChIP-seq"), partialMatch=c(biosample_summary="heart"))
Save enhancer homologs to file in phylip format.
saveAlignments( al, output_folder = tempdir(), motifConsensus = NULL, format = c("txt", "html") )saveAlignments( al, output_folder = tempdir(), motifConsensus = NULL, format = c("txt", "html") )
al |
output of alignment. |
output_folder |
output folder. |
motifConsensus |
Transcription factor binding consensus. |
format |
The format of output files. Available formats are 'txt' and 'html'. Default is 'txt'. |
The I/O status.
al <- readRDS(system.file("extdata", "al.rds", package="enhancerHomologSearch")) tmpfolder <- tempdir() library(MotifDb) motifs <- query(MotifDb, "JASPAR_CORE") consensus <- sapply(motifs, consensusString) consensus <- DNAStringSet(gsub("\\?", "N", consensus)) saveAlignments(al, output_folder=tmpfolder, motifConsensus=consensus)al <- readRDS(system.file("extdata", "al.rds", package="enhancerHomologSearch")) tmpfolder <- tempdir() library(MotifDb) motifs <- query(MotifDb, "JASPAR_CORE") consensus <- sapply(motifs, consensusString) consensus <- DNAStringSet(gsub("\\?", "N", consensus)) saveAlignments(al, output_folder=tmpfolder, motifConsensus=consensus)
Search the TFBPs for query in subject.
searchTFBPS( query, subject, PWMs, queryGenome, background = "genome", ..., maximalShuffleEnhancers = 1000 )searchTFBPS( query, subject, PWMs, queryGenome, background = "genome", ..., maximalShuffleEnhancers = 1000 )
query |
An object of DNAStringSet to represent enhancer |
subject |
Output of getENCODEdata. An object of Enhancers |
PWMs |
The Position Weight Matrix list represented as a numeric matrix. Object of PWMatrixList or PFMatrixList. |
queryGenome |
An object of BSgenome for query data. |
background |
background nucleotide frequencies. Default is "genome". Refer matchMotifs for details. |
... |
Parameters will be passed to matchMotifs except 'out' and 'genome'. |
maximalShuffleEnhancers |
The maximal number of Shuffled enhancers. If the number of the input enhancer candidates is greater than maximalShuffleEnhancers, no shuffled enhancer sequences will be included. The shuffled enhancers will be created by shuffle. |
An object of Enhancers.
library(BSgenome.Hsapiens.UCSC.hg38) peaks <- GRanges("chr1", IRanges(seq(5000, 50000, by=1000), width=1000)) peaks$id <- paste(seq_along(peaks), 1, sep="_") subj <- Enhancers(genome=Hsapiens, peaks=peaks) q <- getSeq(Hsapiens, GRanges("chr1", IRanges(90000, width=1000))) data(motifs) ao <- searchTFBPS(q, subj, motifs[["dist60"]], queryGenome=Hsapiens, maximalShuffleEnhancers = 50)library(BSgenome.Hsapiens.UCSC.hg38) peaks <- GRanges("chr1", IRanges(seq(5000, 50000, by=1000), width=1000)) peaks$id <- paste(seq_along(peaks), 1, sep="_") subj <- Enhancers(genome=Hsapiens, peaks=peaks) q <- getSeq(Hsapiens, GRanges("chr1", IRanges(90000, width=1000))) data(motifs) ao <- searchTFBPS(q, subj, motifs[["dist60"]], queryGenome=Hsapiens, maximalShuffleEnhancers = 50)
Uses the uShuffle library to shuffle reads
shuffle(reads, k = 2, n = 2)shuffle(reads, k = 2, n = 2)
reads |
An object of BStringSet. |
k |
the k-let size. |
n |
the number of random sequences to generate. |
An object of BStringSet.
Jiang, M., Anderson, J., Gillespie, J. et al. uShuffle: A useful tool for shuffling biological sequences while preserving the k-let counts. BMC Bioinformatics 9, 192 (2008). https://doi.org/10.1186/1471-2105-9-192
library(Biostrings) f <- DNAStringSet(c("CTC-NACCAGTAT", "TTGA", "TACCTAGAG")) shuffle(f)library(Biostrings) f <- DNAStringSet(c("CTC-NACCAGTAT", "TTGA", "TACCTAGAG")) shuffle(f)