Title: | CNE Detection and Visualization |
---|---|
Description: | Large-scale identification and advanced visualization of sets of conserved noncoding elements. |
Authors: | Ge Tan <[email protected]> |
Maintainer: | Ge Tan <[email protected]> |
License: | GPL-2 | file LICENSE |
Version: | 1.43.0 |
Built: | 2024-11-03 06:28:20 UTC |
Source: | https://github.com/bioc/CNEr |
Given a list of GO IDs, add the corresponding ancestor GO IDs.
addAncestorGO(go)
addAncestorGO(go)
go |
A |
The ancestor GO IDs for each GO ID are added to the elements.
A list
of GO IDs with their ancestor GO IDs.
This function is mainly designed for processing the gff annotation generated from interproscan, where for each gene, a set of GO IDs are assigned. However, for GO enrichment analysis, we need a list of mapping from genes to the GO IDs and their ancestor GO IDs as well.
Ge Tan
## Not run: library(GO.db) go <- list(c("GO:0005215", "GO:0006810", "GO:0016020"), "GO:0016579") addAncestorGO(go) ## End(Not run)
## Not run: library(GO.db) go <- list(c("GO:0005215", "GO:0006810", "GO:0016020"), "GO:0016579") addAncestorGO(go) ## End(Not run)
Five annotation tracks for plotting in Gviz.
data(axisTrack) data(cpgIslands) data(refGenes)
data(axisTrack) data(cpgIslands) data(refGenes)
These tracks are based on genome="danRer10", chr = "chr6", start = 24000000, end = 27000000.
data(axisTrack) data(cpgIslands) data(refGenes)
data(axisTrack) data(cpgIslands) data(refGenes)
"Axt"
The Axt S4 object to hold a axt file.
## Constructors: Axt(targetRanges=GRanges(), targetSeqs=DNAStringSet(), queryRanges=GRanges(), querySeqs=DNAStringSet(), score=integer(0), symCount=integer(0), names=NULL) ## Accessor-like methods: ## S4 method for signature 'Axt' targetRanges(x) ## S4 method for signature 'Axt' targetSeqs(x) ## S4 method for signature 'Axt' queryRanges(x) ## S4 method for signature 'Axt' querySeqs(x) ## S4 method for signature 'Axt' score(x) ## S4 method for signature 'Axt' symCount(x) ## ... and more (see Methods)
## Constructors: Axt(targetRanges=GRanges(), targetSeqs=DNAStringSet(), queryRanges=GRanges(), querySeqs=DNAStringSet(), score=integer(0), symCount=integer(0), names=NULL) ## Accessor-like methods: ## S4 method for signature 'Axt' targetRanges(x) ## S4 method for signature 'Axt' targetSeqs(x) ## S4 method for signature 'Axt' queryRanges(x) ## S4 method for signature 'Axt' querySeqs(x) ## S4 method for signature 'Axt' score(x) ## S4 method for signature 'Axt' symCount(x) ## ... and more (see Methods)
targetRanges |
Object of class |
targetSeqs |
Object of class |
queryRanges |
Object of class |
querySeqs |
Object of class |
score |
Object of class |
symCount |
Object of class |
names |
|
x |
Object of class |
In ‘axt’ files and Axt
object, the ‘targetRanges’
also have the alignments on positive strands.
However, the ‘queryRanges’ can have alignments on negative strands,
and the coordinates are based on negative strands, which is quite
different from the convention in Bioconductor.
To convert the coordinates of alignments on the negative strand to
the positive strand, use normaliseStrand
.
signature(x = "Axt", i = "ANY", j = "ANY")
: Axt getter
signature(x = "Axt")
: Axt concatenator.
signature(x = "Axt")
: Get the number of alignments.
signature(x = "Axt")
:
Get the ranges of query genome.
signature(x = "Axt")
:
Get the alignment sequences of query genome.
signature(x = "Axt")
: Get the alignment score.
signature(x = "Axt")
: Get the alignment lengths.
signature(x = "Axt")
:
Get the ranges of reference genome.
signature(x = "Axt")
:
Get the alignment sequences of reference genome.
Ge Tan
readAxt
writeAxt
subAxt
fixCoordinates
makeAxtTracks
library(GenomicRanges) library(Biostrings) ## Constructor targetRanges <- GRanges(seqnames=c("chr1", "chr1", "chr2", "chr3"), ranges=IRanges(start=c(1, 20, 2, 3), end=c(10, 25, 10, 10)), strand="+") targetSeqs <- DNAStringSet(c("ATTTTATGTG", "GGGAAG", "GGGCTTTTG", "TTGTGTAG")) queryRanges <- GRanges(seqnames=c("chr1", "chr10", "chr10", "chr20"), ranges=IRanges(start=c(1, 25, 50, 5), end=c(10, 30, 58, 12)), strand="+") querySeqs <- DNAStringSet(c("ATTTAAAGTG", "GGAAAA", "GGGCTCTGG", "TTAAATAA")) score <- c(246L, 4422L, 5679L, 1743L) symCount <- c(10L, 6L, 9L, 8L) axt <- Axt(targetRanges=targetRanges, targetSeqs=targetSeqs, queryRanges=queryRanges, querySeqs=querySeqs, score=score, symCount=symCount) ## getters names(axt) length(axt) first(axt) last(axt) seqnames(axt) strand(axt) seqinfo(axt) ## Vector methods axt[1] ## List methods unlist(axt) ## Combining c(axt, axt)
library(GenomicRanges) library(Biostrings) ## Constructor targetRanges <- GRanges(seqnames=c("chr1", "chr1", "chr2", "chr3"), ranges=IRanges(start=c(1, 20, 2, 3), end=c(10, 25, 10, 10)), strand="+") targetSeqs <- DNAStringSet(c("ATTTTATGTG", "GGGAAG", "GGGCTTTTG", "TTGTGTAG")) queryRanges <- GRanges(seqnames=c("chr1", "chr10", "chr10", "chr20"), ranges=IRanges(start=c(1, 25, 50, 5), end=c(10, 30, 58, 12)), strand="+") querySeqs <- DNAStringSet(c("ATTTAAAGTG", "GGAAAA", "GGGCTCTGG", "TTAAATAA")) score <- c(246L, 4422L, 5679L, 1743L) symCount <- c(10L, 6L, 9L, 8L) axt <- Axt(targetRanges=targetRanges, targetSeqs=targetSeqs, queryRanges=queryRanges, querySeqs=querySeqs, score=score, symCount=symCount) ## getters names(axt) length(axt) first(axt) last(axt) seqnames(axt) strand(axt) seqinfo(axt) ## Vector methods axt[1] ## List methods unlist(axt) ## Combining c(axt, axt)
Wrapper function of axtChain
: chain together psl alignments.
If two matching alignments next to each other are close enough,
they are joined into one segment.
This function doesn't work on Windows platform since Kent utilities only
support Unix-based platforms.
axtChain(psls, chains=sub("\\.psl$", ".chain", psls, ignore.case=TRUE), assemblyTarget, assemblyQuery, distance=c("far", "medium", "near"), removePsl=TRUE, binary="axtChain")
axtChain(psls, chains=sub("\\.psl$", ".chain", psls, ignore.case=TRUE), assemblyTarget, assemblyQuery, distance=c("far", "medium", "near"), removePsl=TRUE, binary="axtChain")
psls |
|
chains |
|
assemblyTarget |
|
assemblyQuery |
|
distance |
It can be "far", "medium" or "close". It decides the score matrix used in lastz aligner. See '?scoringMatrix' for more details. |
removePsl |
|
binary |
|
character
(n): the file names of output chain files.
Ge Tan
http://hgdownload.cse.ucsc.edu/admin/exe/
## Not run: ## This example doesn't run because it requires two bit files and external ## Kent utilities. psls <- tools::list_files_with_exts( dir="/Users/gtan/OneDrive/Project/CSC/CNEr/axt", exts="psl") assemblyTarget <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/danRer10.2bit" assemblyQuery <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/hg38.2bit" axtChain(psls, assemblyTarget=assemblyTarget, assemblyQuery=assemblyQuery, distance="far", removePsl=FALSE, binary="axtChain") ## End(Not run)
## Not run: ## This example doesn't run because it requires two bit files and external ## Kent utilities. psls <- tools::list_files_with_exts( dir="/Users/gtan/OneDrive/Project/CSC/CNEr/axt", exts="psl") assemblyTarget <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/danRer10.2bit" assemblyQuery <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/hg38.2bit" axtChain(psls, assemblyTarget=assemblyTarget, assemblyQuery=assemblyQuery, distance="far", removePsl=FALSE, binary="axtChain") ## End(Not run)
Given the path of the axt file, this function retrieves information on the widths of the alignments.
axtInfo(axtFiles)
axtInfo(axtFiles)
axtFiles |
The filenames of axt files. |
A vector of integer
is returned.
It stores the widths of all the alignments.
Ge Tan
axtFile <- file.path(system.file("extdata", package="CNEr"), "hg38.danRer10.net.axt") axtInfo <- axtInfo(axtFile)
axtFile <- file.path(system.file("extdata", package="CNEr"), "hg38.danRer10.net.axt") axtInfo <- axtInfo(axtFile)
Utility functions for UCSC bin indexing system manipulation
binFromCoordRange(starts, ends) binRangesFromCoordRange(start, end) binRestrictionString(start, end, field="bin")
binFromCoordRange(starts, ends) binRangesFromCoordRange(start, end) binRestrictionString(start, end, field="bin")
starts , ends
|
A vector of integers. A set of ranges. |
start , end
|
A integer vector of length 1. A coordinate range. |
field |
Name of bin column. Default: "bin". |
The UCSC bin indexing system was initially suggested by Richard Durbin and Lincoln Stein to speed up the SELECT of a SQL query for the rows overlapping with certain genome coordinate. The system first used in UCSC genome browser is described by Kent et. al. (2002).
For binFromCoordRange
, it returns the bin number
that should be assigned to a feature spanning the given range.
Usually it is used when creating a database for the features.
For binRangesFromCoordRange
, it returns the set of bin ranges
that overlap a given coordinate range.
It is usually used to find out the bins overlapped with a range.
For SQL query, it is more convenient to use binRestrictionString
than to use this function directly.
For binRestrictionString
, it returns a string to be used
in the WHERE section of a SQL SELECT statement
that is to select features overlapping a certain range.
* USE THIS WHEN QUERYING A DB *
Ge Tan
Kent, W. J., Sugnet, C. W., Furey, T. S., Roskin, K. M., Pringle, T. H., Zahler, A. M., & Haussler, A. D. (2002). The Human Genome Browser at UCSC. Genome Research, 12(6), 996-1006. doi:10.1101/gr.229102
http://genomewiki.ucsc.edu/index.php/Bin_indexing_system
binFromCoordRange(starts=c(10003, 1000000), ends=c(10004, 1100000)) binRangesFromCoordRange(start=10000, end=2000000) binRestrictionString(start=10000, end=2000000, field="bin")
binFromCoordRange(starts=c(10003, 1000000), ends=c(10004, 1100000)) binRangesFromCoordRange(start=10000, end=2000000) binRestrictionString(start=10000, end=2000000, field="bin")
CNE
object
This wrapper function blat the CNEs against the reference genome. Note that blat must be installed on your system.
blatCNE(cne, blatOptions=NULL, cutIdentity=90)
blatCNE(cne, blatOptions=NULL, cutIdentity=90)
cne |
|
blatOptions |
|
cutIdentity |
|
When winSize > 45, the blat option is "-tileSize=11 -minScore=30 -repMatch=1024".
When 35 < winSize <= 45, the blat option is "-tileSize=10 -minScore=28 -repMatch=4096".
When the winSize <= 35, the blat option is "-tileSize=9 -minScore=24 -repMatch=16384".
A CNE
object with a final set of CNEs.
Ge Tan
## Not run: data(CNEDanRer10Hg38) data(CNEHg38DanRer10) cne <- CNE(assembly1Fn=file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit"), assembly2Fn=file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit"), window=50L, identity=45L, CNE12=CNEDanRer10Hg38[["45_50"]], CNE21=CNEHg38DanRer10[["45_50"]], aligner="blat") cne <- cneMerge(cne) cne <- blatCNE(cne) ## End(Not run)
## Not run: data(CNEDanRer10Hg38) data(CNEHg38DanRer10) cne <- CNE(assembly1Fn=file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit"), assembly2Fn=file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit"), window=50L, identity=45L, CNE12=CNEDanRer10Hg38[["45_50"]], CNE21=CNEHg38DanRer10[["45_50"]], aligner="blat") cne <- cneMerge(cne) cne <- blatCNE(cne) ## End(Not run)
This is the main function for conserved noncoding elements (CNEs) identification.
ceScan(x, tFilter=NULL, qFilter=NULL, tSizes=NULL, qSizes=NULL, window=50L, identity=50L)
ceScan(x, tFilter=NULL, qFilter=NULL, tSizes=NULL, qSizes=NULL, window=50L, identity=50L)
x |
|
tFilter , qFilter
|
|
tSizes , qSizes
|
|
window |
|
identity |
|
ceScan scan the axts alignments and identify the CNEs.
ceScan can accept axts in Axt
object and regions to
filter out as GRanges
objects,
or directly the ‘axt’ files and ‘bed’ files.
The details of the algorithm are described in the vignette.
A list
of GRangePairs
or CNE
object is returned.
Each element of the list corresponds to one user-specified threshold for identifying CNEs.
Ge Tan
library(BSgenome.Drerio.UCSC.danRer10) library(BSgenome.Hsapiens.UCSC.hg38) axtFnHg38DanRer10 <- file.path(system.file("extdata", package="CNEr"), "hg38.danRer10.net.axt") axtHg38DanRer10 <- readAxt(axtFnHg38DanRer10) axtFnDanRer10Hg38 <- file.path(system.file("extdata", package="CNEr"), "danRer10.hg38.net.axt") axtDanRer10Hg38 <- readAxt(axtFnDanRer10Hg38) bedHg38Fn <- file.path(system.file("extdata", package="CNEr"), "filter_regions.hg38.bed") bedHg38 <- readBed(bedHg38Fn) bedDanRer10Fn <- file.path(system.file("extdata", package="CNEr"), "filter_regions.danRer10.bed") bedDanRer10 <- readBed(bedDanRer10Fn) qSizesHg38 <- seqinfo(BSgenome.Hsapiens.UCSC.hg38) qSizesDanRer10 <- seqinfo(BSgenome.Drerio.UCSC.danRer10) ## Axt object windows <- c(50L, 50L, 50L) identities <- c(45L, 48L, 49L) CNEHg38DanRer10 <- ceScan(x=axtHg38DanRer10, tFilter=bedHg38, qFilter=bedDanRer10, tSizes=qSizesHg38, qSizes=qSizesDanRer10, window=windows, identity=identities) CNEDanRer10Hg38 <- ceScan(x=axtDanRer10Hg38, tFilter=bedDanRer10, qFilter=bedHg38, tSizes=qSizesDanRer10, qSizes=qSizesHg38, window=windows, identity=identities) ## CNE object cneDanRer10Hg38 <- CNE( assembly1Fn=file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit"), assembly2Fn=file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit"), axt12Fn=axtFnDanRer10Hg38, axt21Fn=axtFnHg38DanRer10, cutoffs1=8L, cutoffs2=4L) ## Here danRer10Filter is tFilter since danRer10 is assembly1 cneListDanRer10Hg38 <- ceScan(x=cneDanRer10Hg38, tFilter=bedDanRer10, qFilter=bedHg38, window=windows, identity=identities)
library(BSgenome.Drerio.UCSC.danRer10) library(BSgenome.Hsapiens.UCSC.hg38) axtFnHg38DanRer10 <- file.path(system.file("extdata", package="CNEr"), "hg38.danRer10.net.axt") axtHg38DanRer10 <- readAxt(axtFnHg38DanRer10) axtFnDanRer10Hg38 <- file.path(system.file("extdata", package="CNEr"), "danRer10.hg38.net.axt") axtDanRer10Hg38 <- readAxt(axtFnDanRer10Hg38) bedHg38Fn <- file.path(system.file("extdata", package="CNEr"), "filter_regions.hg38.bed") bedHg38 <- readBed(bedHg38Fn) bedDanRer10Fn <- file.path(system.file("extdata", package="CNEr"), "filter_regions.danRer10.bed") bedDanRer10 <- readBed(bedDanRer10Fn) qSizesHg38 <- seqinfo(BSgenome.Hsapiens.UCSC.hg38) qSizesDanRer10 <- seqinfo(BSgenome.Drerio.UCSC.danRer10) ## Axt object windows <- c(50L, 50L, 50L) identities <- c(45L, 48L, 49L) CNEHg38DanRer10 <- ceScan(x=axtHg38DanRer10, tFilter=bedHg38, qFilter=bedDanRer10, tSizes=qSizesHg38, qSizes=qSizesDanRer10, window=windows, identity=identities) CNEDanRer10Hg38 <- ceScan(x=axtDanRer10Hg38, tFilter=bedDanRer10, qFilter=bedHg38, tSizes=qSizesDanRer10, qSizes=qSizesHg38, window=windows, identity=identities) ## CNE object cneDanRer10Hg38 <- CNE( assembly1Fn=file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit"), assembly2Fn=file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit"), axt12Fn=axtFnDanRer10Hg38, axt21Fn=axtFnHg38DanRer10, cutoffs1=8L, cutoffs2=4L) ## Here danRer10Filter is tFilter since danRer10 is assembly1 cneListDanRer10Hg38 <- ceScan(x=cneDanRer10Hg38, tFilter=bedDanRer10, qFilter=bedHg38, window=windows, identity=identities)
Wrapper function of chainMergeSort
:
Combines sorted files into a larger sorted file.
This function doesn't work on Windows platform since Kent utilities only
support Linux and Unix platforms.
chainMergeSort(chains, assemblyTarget, assemblyQuery, allChain=paste0(sub("\\.2bit$", "", basename(assemblyTarget), ignore.case=TRUE), ".", sub("\\.2bit$", "", basename(assemblyQuery), ignore.case=TRUE), ".all.chain"), removeChains=TRUE, binary="chainMergeSort")
chainMergeSort(chains, assemblyTarget, assemblyQuery, allChain=paste0(sub("\\.2bit$", "", basename(assemblyTarget), ignore.case=TRUE), ".", sub("\\.2bit$", "", basename(assemblyQuery), ignore.case=TRUE), ".all.chain"), removeChains=TRUE, binary="chainMergeSort")
chains |
|
assemblyTarget |
|
assemblyQuery |
|
allChain |
|
removeChains |
|
binary |
|
This allChain file is what we get from UCSC download, e.g., hg19.danRer7.all.chain.gz.
character
(1): the file names of merged allChain file.
Ge Tan
http://hgdownload.cse.ucsc.edu/admin/exe/
## Not run: ## This example doesn't run because it requires two bit files and external ## Kent utilities. chains <- tools::list_files_with_exts( dir="/Users/gtan/OneDrive/Project/CSC/CNEr/axt", exts="chain") assemblyTarget <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/danRer10.2bit" assemblyQuery <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/hg38.2bit" chainMergeSort(chains, assemblyTarget, assemblyQuery, allChain=file.path("/Users/gtan/OneDrive/Project/CSC/CNEr/axt", paste0(sub("\\.2bit$", "", basename(assemblyTarget), ignore.case=TRUE), ".", sub("\\.2bit$", "", basename(assemblyQuery), ignore.case=TRUE), ".all.chain")), removeChains=FALSE, binary="chainMergeSort") ## End(Not run)
## Not run: ## This example doesn't run because it requires two bit files and external ## Kent utilities. chains <- tools::list_files_with_exts( dir="/Users/gtan/OneDrive/Project/CSC/CNEr/axt", exts="chain") assemblyTarget <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/danRer10.2bit" assemblyQuery <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/hg38.2bit" chainMergeSort(chains, assemblyTarget, assemblyQuery, allChain=file.path("/Users/gtan/OneDrive/Project/CSC/CNEr/axt", paste0(sub("\\.2bit$", "", basename(assemblyTarget), ignore.case=TRUE), ".", sub("\\.2bit$", "", basename(assemblyQuery), ignore.case=TRUE), ".all.chain")), removeChains=FALSE, binary="chainMergeSort") ## End(Not run)
Wrapper function of chainNetSyntenic: Makes alignment nets out of chains and adds synteny info to net. This function doesn't work on Windows platform since Kent utilities only support Linux and Unix platforms.
chainNetSyntenic(allPreChain, assemblyTarget, assemblyQuery, netSyntenicFile=paste0(sub("\\.2bit$", "", basename(assemblyTarget), ignore.case = TRUE), ".", sub("\\.2bit$", "", basename(assemblyQuery), ignore.case = TRUE), ".noClass.net"), binaryChainNet="chainNet", binaryNetSyntenic="netSyntenic")
chainNetSyntenic(allPreChain, assemblyTarget, assemblyQuery, netSyntenicFile=paste0(sub("\\.2bit$", "", basename(assemblyTarget), ignore.case = TRUE), ".", sub("\\.2bit$", "", basename(assemblyQuery), ignore.case = TRUE), ".noClass.net"), binaryChainNet="chainNet", binaryNetSyntenic="netSyntenic")
allPreChain |
|
assemblyTarget |
|
assemblyQuery |
|
netSyntenicFile |
|
binaryChainNet |
|
binaryNetSyntenic |
|
Add classification information using the database tables: actually this step is not necessary in this pipeline according to http://blog.gmane.org/gmane.science.biology.ucscgenome.general/month=20130301. The class information will only be used for Genome Browser. Since it needs some specific modification of the table names for certain species, we skip this step now. If this step is done, then the generated class.net is the gzipped net file that you see in UCSC Downloads area.
character
(1): the file names of generated net file.
Ge Tan
http://hgdownload.cse.ucsc.edu/admin/exe/
## Not run: ## This example doesn't run because it requires two bit files and external ## Kent utilities. allPreChain <- file.path("/Users/gtan/OneDrive/Project/CSC/CNEr/axt", "danRer10.hg38.all.pre.chain") assemblyTarget <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/danRer10.2bit" assemblyQuery <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/hg38.2bit" chainNetSyntenic(allPreChain, assemblyTarget, assemblyQuery, netSyntenicFile=file.path( "/Users/gtan/OneDrive/Project/CSC/CNEr/axt", paste0(sub("\\.2bit$", "", basename(assemblyTarget), ignore.case = TRUE), ".", sub("\\.2bit$", "", basename(assemblyQuery), ignore.case = TRUE), ".noClass.net")), binaryChainNet="chainNet", binaryNetSyntenic="netSyntenic") ## End(Not run)
## Not run: ## This example doesn't run because it requires two bit files and external ## Kent utilities. allPreChain <- file.path("/Users/gtan/OneDrive/Project/CSC/CNEr/axt", "danRer10.hg38.all.pre.chain") assemblyTarget <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/danRer10.2bit" assemblyQuery <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/hg38.2bit" chainNetSyntenic(allPreChain, assemblyTarget, assemblyQuery, netSyntenicFile=file.path( "/Users/gtan/OneDrive/Project/CSC/CNEr/axt", paste0(sub("\\.2bit$", "", basename(assemblyTarget), ignore.case = TRUE), ".", sub("\\.2bit$", "", basename(assemblyQuery), ignore.case = TRUE), ".noClass.net")), binaryChainNet="chainNet", binaryNetSyntenic="netSyntenic") ## End(Not run)
Wrapper function of chainPreNet
:
Removes chains that don't have a chance of being netted.
This function doesn't work on Windows platform since Kent utilities only
support Linux and Unix platforms.
chainPreNet(allChain, assemblyTarget, assemblyQuery, allPreChain=paste0(sub("\\.2bit$", "", basename(assemblyTarget), ignore.case = TRUE), ".", sub("\\.2bit$", "", basename(assemblyQuery), ignore.case = TRUE), ".all.pre.chain"), removeAllChain=TRUE, binary="chainPreNet")
chainPreNet(allChain, assemblyTarget, assemblyQuery, allPreChain=paste0(sub("\\.2bit$", "", basename(assemblyTarget), ignore.case = TRUE), ".", sub("\\.2bit$", "", basename(assemblyQuery), ignore.case = TRUE), ".all.pre.chain"), removeAllChain=TRUE, binary="chainPreNet")
allChain |
|
assemblyTarget |
|
assemblyQuery |
|
allPreChain |
|
removeAllChain |
|
binary |
|
character
(1): the file names of merged allPreChain file.
Ge Tan
http://hgdownload.cse.ucsc.edu/admin/exe/
## Not run: ## This example doesn't run because it requires two bit files and external ## Kent utilities. allChain <- file.path("/Users/gtan/OneDrive/Project/CSC/CNEr/axt", "danRer10.hg38.all.chain") assemblyTarget <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/danRer10.2bit" assemblyQuery <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/hg38.2bit" chainPreNet(allChain, assemblyTarget, assemblyQuery, allPreChain=file.path( "/Users/gtan/OneDrive/Project/CSC/CNEr/axt", paste0(sub("\\.2bit$", "", basename(assemblyTarget), ignore.case = TRUE), ".", sub("\\.2bit$", "", basename(assemblyQuery), ignore.case = TRUE), ".all.pre.chain")), removeAllChain=FALSE, binary="chainPreNet") ## End(Not run)
## Not run: ## This example doesn't run because it requires two bit files and external ## Kent utilities. allChain <- file.path("/Users/gtan/OneDrive/Project/CSC/CNEr/axt", "danRer10.hg38.all.chain") assemblyTarget <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/danRer10.2bit" assemblyQuery <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/hg38.2bit" chainPreNet(allChain, assemblyTarget, assemblyQuery, allPreChain=file.path( "/Users/gtan/OneDrive/Project/CSC/CNEr/axt", paste0(sub("\\.2bit$", "", basename(assemblyTarget), ignore.case = TRUE), ".", sub("\\.2bit$", "", basename(assemblyQuery), ignore.case = TRUE), ".all.pre.chain")), removeAllChain=FALSE, binary="chainPreNet") ## End(Not run)
"CNE"
CNE
class contains all the meta-data of CNEs, including the pair of
assemblies, the thresholds, the intermediate and final CNE sets.
### Constructors: CNE(assembly1Fn=character(1), assembly2Fn=character(1), axt12Fn=character(), axt21Fn=character(), window=50L, identity=50L, CNE12=GRangePairs(), CNE21=GRangePairs(), CNEMerged=GRangePairs(), CNEFinal=GRangePairs(), aligner="blat", cutoffs1=4L, cutoffs2=4L) ### Accessor-like methods: ## S4 method for signature 'CNE' thresholds(x) ## S4 method for signature 'CNE' CNE12(x) ## S4 method for signature 'CNE' CNE21(x) ## S4 method for signature 'CNE' CNEMerged(x) ## S4 method for signature 'CNE' CNEFinal(x) ## ... and more (see Methods)
### Constructors: CNE(assembly1Fn=character(1), assembly2Fn=character(1), axt12Fn=character(), axt21Fn=character(), window=50L, identity=50L, CNE12=GRangePairs(), CNE21=GRangePairs(), CNEMerged=GRangePairs(), CNEFinal=GRangePairs(), aligner="blat", cutoffs1=4L, cutoffs2=4L) ### Accessor-like methods: ## S4 method for signature 'CNE' thresholds(x) ## S4 method for signature 'CNE' CNE12(x) ## S4 method for signature 'CNE' CNE21(x) ## S4 method for signature 'CNE' CNEMerged(x) ## S4 method for signature 'CNE' CNEFinal(x) ## ... and more (see Methods)
assembly1Fn , assembly2Fn
|
Object of class |
axt12Fn , axt21Fn
|
Object of class |
window |
Object of class |
identity |
Object of class |
CNE12 |
Object of class |
CNE21 |
Object of class |
CNEMerged |
Object of class |
CNEFinal |
Object of class |
aligner |
Object of class |
cutoffs1 , cutoffs2
|
Object of class |
x |
Object of class |
signature(x = "CNE")
:
Get the CNE1 results.
signature(x = "CNE")
:
Get the CNE2 results.
signature(x = "CNE")
:
Get the merged CNE results.
signature(x = "CNE")
:
Get the final CNE results.
signature(x = "CNE")
:
Get the thresholds used for scanning CNEs.
Ge Tan
library(GenomicRanges) ## Constructor CNE12 <- GRangePairs(first=GRanges(seqnames=c("chr13", "chr4", "chr4"), ranges=IRanges(start=c(71727138,150679343, 146653164), end=c(71727224, 150679400, 146653221)), strand="+"), second=GRanges(seqnames=c("chr1"), ranges=IRanges(start=c(29854162, 23432387, 35711077), end=c(29854248, 23432444, 35711134)), strand="+") ) CNE21 <- GRangePairs(first=GRanges(seqnames=c("chr1"), ranges=IRanges(start=c(29854162, 23432387, 35711077), end=c(29854248, 23432444, 35711134)), strand="+"), second=GRanges(seqnames=c("chr13", "chr4", "chr4"), ranges=IRanges(start=c(71727138,150679343, 146653164), end=c(71727224, 150679400, 146653221)), strand="+") ) cne <- CNE(assembly1Fn=file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit"), assembly2Fn=file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit"), window=50L, identity=50L, CNE12=CNE12, CNE21=CNE21, CNEMerged=CNE12, CNEFinal=CNE12, aligner="blat", cutoffs1=4L, cutoffs2=4L) ## Accessor CNE12(cne) CNE21(cne) thresholds(cne) CNEMerged(cne) CNEFinal(cne)
library(GenomicRanges) ## Constructor CNE12 <- GRangePairs(first=GRanges(seqnames=c("chr13", "chr4", "chr4"), ranges=IRanges(start=c(71727138,150679343, 146653164), end=c(71727224, 150679400, 146653221)), strand="+"), second=GRanges(seqnames=c("chr1"), ranges=IRanges(start=c(29854162, 23432387, 35711077), end=c(29854248, 23432444, 35711134)), strand="+") ) CNE21 <- GRangePairs(first=GRanges(seqnames=c("chr1"), ranges=IRanges(start=c(29854162, 23432387, 35711077), end=c(29854248, 23432444, 35711134)), strand="+"), second=GRanges(seqnames=c("chr13", "chr4", "chr4"), ranges=IRanges(start=c(71727138,150679343, 146653164), end=c(71727224, 150679400, 146653221)), strand="+") ) cne <- CNE(assembly1Fn=file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit"), assembly2Fn=file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit"), window=50L, identity=50L, CNE12=CNE12, CNE21=CNE21, CNEMerged=CNE12, CNEFinal=CNE12, aligner="blat", cutoffs1=4L, cutoffs2=4L) ## Accessor CNE12(cne) CNE21(cne) thresholds(cne) CNEMerged(cne) CNEFinal(cne)
These two datasets are the direct output from ceScan
.
data(CNEHg38DanRer10)
data(CNEHg38DanRer10)
data(CNEHg38DanRer10)
data(CNEHg38DanRer10)
This function queries the database and generates the CNEs' density values.
CNEDensity(dbName, tableName, chr, start, end, whichAssembly=c("first", "second"), windowSize=300, minLength=NULL)
CNEDensity(dbName, tableName, chr, start, end, whichAssembly=c("first", "second"), windowSize=300, minLength=NULL)
dbName |
|
tableName |
|
chr |
|
start , end
|
|
whichAssembly |
|
windowSize |
|
minLength |
|
A GRanges
object with density values is returned.
signature(tableName = "character",
assembly1 = "character", assembly2 = "missing",
threshold = "missing")
signature(tableName = "missing", assembly1 = "character",
assembly2 = "character", threshold = "character")
Ge Tan
dbName <- file.path(system.file("extdata", package="CNEr"), "danRer10CNE.sqlite") genome <- "danRer10" chr <- "chr6" start <- 24000000L end <- 27000000L windowSize <- 200L minLength <- 50L cneDanRer10Hg38_45_50 <- CNEDensity(dbName=dbName, tableName="danRer10_hg38_45_50", whichAssembly="first", chr=chr, start=start, end=end, windowSize=windowSize, minLength=minLength) cneDanRer10Hg38_49_50 <- CNEDensity(dbName=dbName, tableName="danRer10_hg38_49_50", whichAssembly="first", chr=chr, start=start, end=end, windowSize=windowSize, minLength=minLength)
dbName <- file.path(system.file("extdata", package="CNEr"), "danRer10CNE.sqlite") genome <- "danRer10" chr <- "chr6" start <- 24000000L end <- 27000000L windowSize <- 200L minLength <- 50L cneDanRer10Hg38_45_50 <- CNEDensity(dbName=dbName, tableName="danRer10_hg38_45_50", whichAssembly="first", chr=chr, start=start, end=end, windowSize=windowSize, minLength=minLength) cneDanRer10Hg38_49_50 <- CNEDensity(dbName=dbName, tableName="danRer10_hg38_49_50", whichAssembly="first", chr=chr, start=start, end=end, windowSize=windowSize, minLength=minLength)
cneFinalListDanRer10Hg38 dataset contains the CNE between danRer10 and hg38 around chr6:24,000,000..27,000,000.
data("cneFinalListDanRer10Hg38")
data("cneFinalListDanRer10Hg38")
data(cneFinalListDanRer10Hg38)
data(cneFinalListDanRer10Hg38)
Removes the CNEs which overlap on both genomes.
cneMerge(cne12, cne21)
cneMerge(cne12, cne21)
cne12 |
A object of |
cne21 |
A object of |
A GRangePairs
of CNEs or a CNE
object is returned.
In this table, the order of columns is consistent with cne1.
For instance, if cne1 has the first three columns for zebrafish
and next three columns for human,
in the merged table, the first three columns are
still the coordinates for zebrafish
while the next three columns are the coordinates for human.
Ge Tan
library(GenomicRanges) firstGRange <- GRanges(seqnames=c("chr1", "chr1", "chr2", "chr2", "chr5"), ranges=IRanges(start=c(1, 20, 2, 3, 1), end=c(10, 25, 10, 10, 10)), strand="+") lastGRange <- GRanges(seqnames=c("chr15", "chr10", "chr10", "chr10", "chr15"), ranges=IRanges(start=c(1, 25, 50, 51, 5), end=c(8, 40, 55, 60, 10)), strand="+") cne12 <- GRangePairs(firstGRange[1:3], lastGRange[1:3]) cne21 <- GRangePairs(lastGRange[4:5], firstGRange[4:5]) ## GRangePairs, GRangePairs cneMerge(cne12, cne21) ## CNE, missing cne <- CNE(assembly1Fn=file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit"), assembly2Fn=file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit"), window=50L, identity=50L, CNE12=cne12, CNE21=cne21, aligner="blat") cneMerge(cne)
library(GenomicRanges) firstGRange <- GRanges(seqnames=c("chr1", "chr1", "chr2", "chr2", "chr5"), ranges=IRanges(start=c(1, 20, 2, 3, 1), end=c(10, 25, 10, 10, 10)), strand="+") lastGRange <- GRanges(seqnames=c("chr15", "chr10", "chr10", "chr10", "chr15"), ranges=IRanges(start=c(1, 25, 50, 51, 5), end=c(8, 40, 55, 60, 10)), strand="+") cne12 <- GRangePairs(firstGRange[1:3], lastGRange[1:3]) cne21 <- GRangePairs(lastGRange[4:5], firstGRange[4:5]) ## GRangePairs, GRangePairs cneMerge(cne12, cne21) ## CNE, missing cne <- CNE(assembly1Fn=file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit"), assembly2Fn=file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit"), window=50L, identity=50L, CNE12=cne12, CNE21=cne21, aligner="blat") cneMerge(cne)
This function tries to automate the fetch of chromosome sizes for assemblies from UCSC.
fetchChromSizes(assembly)
fetchChromSizes(assembly)
assembly |
A |
This function downloads ‘chromInfo.txt.gz’ from UCSC golden path for UCSC assemblies.
A object of Seqinfo
is returned.
Currently, the assemblies from UCSC are supported.
Ge Tan
fetchChromSizes("hg19") fetchChromSizes("mm10")
fetchChromSizes("hg19") fetchChromSizes("mm10")
Axt
object
In ‘axt’ file and Axt
object, the coordinates of negative
query alignments are relative to the reverse-complemented coordinates of
its chromosome.
This is different from the convention in Bioconductor.
This function fixes the coordinates which are always relative to the
positive strand.
fixCoordinates(x)
fixCoordinates(x)
x |
|
In Axt
, the ‘strand’ is for the aligning organism.
If the strand value is “-”, the values of the aligning organism's
start and end fields are relative to the reverse-complemented coordinates of
its chromosome.
A Axt
object.
Ge Tan
axtFnDanRer10Hg38 <- file.path(system.file("extdata", package="CNEr"), "danRer10.hg38.net.axt") qAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit") tAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit") axtDanRer10Hg38 <- readAxt(axtFnDanRer10Hg38, tAssemblyFn=tAssemblyFn, qAssemblyFn=qAssemblyFn) ## Fix the coordinates fixCoordinates(axtDanRer10Hg38) ## Restore it fixCoordinates(fixCoordinates(axtDanRer10Hg38))
axtFnDanRer10Hg38 <- file.path(system.file("extdata", package="CNEr"), "danRer10.hg38.net.axt") qAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit") tAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit") axtDanRer10Hg38 <- readAxt(axtFnDanRer10Hg38, tAssemblyFn=tAssemblyFn, qAssemblyFn=qAssemblyFn) ## Fix the coordinates fixCoordinates(axtDanRer10Hg38) ## Restore it fixCoordinates(fixCoordinates(axtDanRer10Hg38))
The GRangePairs
class is a container for a pair of
GRanges
objects that have the same lengths.
A GRangePairs
object is a list-like object where each element
describes a pair of genomic range.
They do not necessarily have the same seqinfo
, i.e.,
the coordinates from the same assembly.
GRangePairs(first=GRanges(), second=GRanges(), ..., names=NULL,
hits=NULL)
:
GRangePairs constructor.
In the code snippets below, x
is a GRangePairs object.
length(x)
:
Return the number of granges pairs in x
.
names(x)
, names(x) <- value
:
Get or set the names on x
.
first(x)
,
last(x)
,
second(x)
:
Get the ‘first’ or ‘last’/‘second’ GRange
for each grange pair in
x
. The result is a GRanges object of the same length
as x
.
first(x)<-
,
second(x)<-
:
Set the ‘first’ or ‘second’ GRange
for each grange pair in
x
. The result is a GRanges object of the same length
as x
.
seqnames(x)
:
Get the seqname of first GRanges and last GRanges and return in a
DataFrame
object.
strand(x)
:
Get the strand for each grange pair in x
.
seqinfo(x)
:
Get the information about the underlying sequences.
In the code snippets below, x
is a GRangePairs object.
x[i]
:
Return a new GRangePairs object made of the selected
genomic ranges pairs.
In the code snippets below, x
is a GRangePairs object.
unlist(x, use.names=TRUE)
:
Return the GRangePairs object conceptually defined
by c(x[[1]], x[[2]], ..., x[[length(x)]])
.
use.names
determines whether x
names should be
passed to the result or not.
In the code snippets below, x
is a GRangePairs object.
grglist(x, use.mcols=FALSE)
:
Return a GRangesList object of length length(x)
where the i-th element represents the ranges (with respect to the
reference) of the i-th grange pair in x
.
Note that this results in the ranges being always ordered consistently with the original "query template", that is, being in the order defined by walking the "query template" from the beginning to the end.
If use.mcols
is TRUE and x
has metadata columns on it
(accessible with mcols(x)
), they're propagated to the returned
object.
as(x, "GRangesList")
:
Alternate ways of doing grglist(x, use.mcols=TRUE)
.
as(x, "GRanges")
:
Equivalent of unlist(x, use.names=TRUE)
.
In the code snippets below, x
is a GRangesList object.
swap(x)
:
Swap the first, last GRanges.
unique(x)
:
Get the unique GRangePairs.
show(x)
:
By default, the show
method displays 5 head and 5 tail
elements. This can be changed by setting the global options
showHeadLines
and showTailLines
. If the object
length is less than (or equal to) the sum of these 2 options
plus 1, then the full object is displayed.
Ge Tan
## Constructor library(GenomicRanges) first <- GRanges(seqnames=c("chr1", "chr1", "chr2", "chr3"), ranges=IRanges(start=c(1, 20, 2, 3), end=c(10, 25, 10, 10)), strand="+") last <- GRanges(seqnames=c("chr1", "chr10", "chr10", "chr20"), ranges=IRanges(start=c(1, 25, 50, 5), end=c(8, 40, 55, 16)), strand="+") namesGRangePairs <- c("a","b","c","d") grangesPairs1 <- GRangePairs(first, last, names=namesGRangePairs) grangesPairs2 <- GRangePairs(first, last) ## getters and setters names(grangesPairs1) names(grangesPairs2) <- namesGRangePairs first(grangesPairs1) first(grangesPairs1) <- second(grangesPairs1) second(grangesPairs1) second(grangesPairs1) <- first(grangesPairs1) length(grangesPairs1) seqnames(grangesPairs1) strand(grangesPairs1) seqinfo(grangesPairs1) ## Vector methods grangesPairs1[1] ## List methods unlist(grangesPairs1) ## Coersion grglist(grangesPairs1) as(grangesPairs1, "GRangesList") as(grangesPairs1, "GRanges") as(grangesPairs1, "DataFrame") as.data.frame(grangesPairs1) ## Combining c(grangesPairs1, grangesPairs2) ## Swap swap(grangesPairs1) ## Unique unique(c(grangesPairs1, grangesPairs1))
## Constructor library(GenomicRanges) first <- GRanges(seqnames=c("chr1", "chr1", "chr2", "chr3"), ranges=IRanges(start=c(1, 20, 2, 3), end=c(10, 25, 10, 10)), strand="+") last <- GRanges(seqnames=c("chr1", "chr10", "chr10", "chr20"), ranges=IRanges(start=c(1, 25, 50, 5), end=c(8, 40, 55, 16)), strand="+") namesGRangePairs <- c("a","b","c","d") grangesPairs1 <- GRangePairs(first, last, names=namesGRangePairs) grangesPairs2 <- GRangePairs(first, last) ## getters and setters names(grangesPairs1) names(grangesPairs2) <- namesGRangePairs first(grangesPairs1) first(grangesPairs1) <- second(grangesPairs1) second(grangesPairs1) second(grangesPairs1) <- first(grangesPairs1) length(grangesPairs1) seqnames(grangesPairs1) strand(grangesPairs1) seqinfo(grangesPairs1) ## Vector methods grangesPairs1[1] ## List methods unlist(grangesPairs1) ## Coersion grglist(grangesPairs1) as(grangesPairs1, "GRangesList") as(grangesPairs1, "GRanges") as(grangesPairs1, "DataFrame") as.data.frame(grangesPairs1) ## Combining c(grangesPairs1, grangesPairs2) ## Swap swap(grangesPairs1) ## Unique unique(c(grangesPairs1, grangesPairs1))
Example of GrangePairs
object from
the collinear regions of Adineta vaga.
data("grangesPairsForDotplot")
data("grangesPairsForDotplot")
The collinear regions from “scaffold_1” and “scaffold_5”.
Example from own project.
data(grangesPairsForDotplot)
data(grangesPairsForDotplot)
Wrapper function of lastal
to do the pairwise whole genome alignment.
This function doesn't work on Windows platform.
lastal(db, queryFn, outputFn=sub("\\.(fa|fasta)$", ".maf", paste(basename(db), basename(queryFn), sep = ","), ignore.case = TRUE), distance=c("far", "medium", "near"), binary="lastal", mc.cores=getOption("mc.cores", 2L), echoCommand=FALSE)
lastal(db, queryFn, outputFn=sub("\\.(fa|fasta)$", ".maf", paste(basename(db), basename(queryFn), sep = ","), ignore.case = TRUE), distance=c("far", "medium", "near"), binary="lastal", mc.cores=getOption("mc.cores", 2L), echoCommand=FALSE)
db |
|
queryFn |
|
outputFn |
|
distance |
It can be "far", "medium" or "near". It decides the score matrix used in lastz aligner. See '?scoringMatrix' for more details. |
binary |
|
mc.cores |
|
echoCommand |
|
A character
(1) vector of ouput maf file names.
lastal
aligner must be installed on the machine to use this function.
Ge Tan
## Not run: assemblyDir <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit" ## Build the lastdb index system2(command="lastdb", args=c("-c", file.path(assemblyDir, "danRer10"), file.path(assemblyDir, "danRer10.fa"))) ## Run lastal aligner lastal(db=file.path(assemblyDir, "danRer10"), queryFn=file.path(assemblyDir, "hg38.fa"), outputFn=file.path(axtDir, "danRer10.hg38.maf"), distance="far", binary="lastal", mc.cores=4L) ## maf to psl psls <- file.path(axtDir, "danRer10.hg38.psl") system2(command="maf-convert", args=c("psl", file.path(axtDir, "danRer10.hg38.maf"), ">", psls)) ## End(Not run)
## Not run: assemblyDir <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit" ## Build the lastdb index system2(command="lastdb", args=c("-c", file.path(assemblyDir, "danRer10"), file.path(assemblyDir, "danRer10.fa"))) ## Run lastal aligner lastal(db=file.path(assemblyDir, "danRer10"), queryFn=file.path(assemblyDir, "hg38.fa"), outputFn=file.path(axtDir, "danRer10.hg38.maf"), distance="far", binary="lastal", mc.cores=4L) ## maf to psl psls <- file.path(axtDir, "danRer10.hg38.psl") system2(command="maf-convert", args=c("psl", file.path(axtDir, "danRer10.hg38.maf"), ">", psls)) ## End(Not run)
Wrapper function of lastz
to do the pairwise whole genome alignment.
This function doesn't work on Windows platform.
lastz(assemblyTarget, assemblyQuery, outputDir = ".", chrsTarget = NULL, chrsQuery = NULL, distance = c("far", "medium", "near"), binary = "lastz", mc.cores = getOption("mc.cores", 2L), echoCommand = FALSE)
lastz(assemblyTarget, assemblyQuery, outputDir = ".", chrsTarget = NULL, chrsQuery = NULL, distance = c("far", "medium", "near"), binary = "lastz", mc.cores = getOption("mc.cores", 2L), echoCommand = FALSE)
assemblyTarget |
|
assemblyQuery |
|
outputDir |
|
chrsTarget |
NULL or |
chrsQuery |
NULL or |
distance |
It can be "far", "medium" or "near". It decides the score matrix used in lastz aligner. See '?scoringMatrix' for more details. |
binary |
|
mc.cores |
|
echoCommand |
|
A character
(n) vector of ouput lav file names.
lastz
aligner must be installed on the machine to use this function.
Ge Tan
http://www.bx.psu.edu/~rsharris/lastz/
## Not run: ## This example doesn't run because it requires two bit files and external ## Kent utilities. assemblyTarget <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/danRer10.2bit" assemblyQuery <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/hg38.2bit" lavs <- lastz(assemblyTarget, assemblyQuery, outputDir="/Users/gtan/OneDrive/Project/CSC/CNEr/axt", chrsTarget=c("chr1", "chr2", "chr3"), chrsQuery=c("chr1", "chr2", "chr3"), distance="far", mc.cores=4) ## End(Not run)
## Not run: ## This example doesn't run because it requires two bit files and external ## Kent utilities. assemblyTarget <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/danRer10.2bit" assemblyQuery <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/hg38.2bit" lavs <- lastz(assemblyTarget, assemblyQuery, outputDir="/Users/gtan/OneDrive/Project/CSC/CNEr/axt", chrsTarget=c("chr1", "chr2", "chr3"), chrsQuery=c("chr1", "chr2", "chr3"), distance="far", mc.cores=4) ## End(Not run)
Wrapper function of lavToPsl
: Convert blastz lav to psl format.
This function doesn't work on Windows platform since Kent utilities only
support Linux and Unix platforms.
lavToPsl(lavs, psls=sub("\\.lav$", ".psl", lavs, ignore.case = TRUE), removeLav=TRUE, binary="lavToPsl")
lavToPsl(lavs, psls=sub("\\.lav$", ".psl", lavs, ignore.case = TRUE), removeLav=TRUE, binary="lavToPsl")
lavs |
|
psls |
codecharacter(n): file names of output psl files. By default, in the same folder of input lav files with same names. |
removeLav |
|
binary |
|
character
(n): the file names of output psl files.
Ge Tan
http://hgdownload.cse.ucsc.edu/admin/exe/
## Not run: ## This example doesn't run because it requires lav files from previous steps ## and external Kent utilities. lavs <- tools::list_files_with_exts( dir="/Users/gtan/OneDrive/Project/CSC/CNEr/axt", exts="lav") lavToPsl(lavs, removeLav=FALSE, binary="lavToPsl") ## End(Not run)
## Not run: ## This example doesn't run because it requires lav files from previous steps ## and external Kent utilities. lavs <- tools::list_files_with_exts( dir="/Users/gtan/OneDrive/Project/CSC/CNEr/axt", exts="lav") lavToPsl(lavs, removeLav=FALSE, binary="lavToPsl") ## End(Not run)
Make ancora format files from GRangePairs
of CNE
makeAncoraFiles(cne, outputDir = ".", genomeFirst = "first", genomeSecond = "second", threshold = "50_50")
makeAncoraFiles(cne, outputDir = ".", genomeFirst = "first", genomeSecond = "second", threshold = "50_50")
cne |
|
outputDir |
|
genomeFirst , genomeSecond
|
|
threshold |
|
The filenames of output.
This function is mainly for internal use in Lenhard group.
Ge Tan
data(cneFinalListDanRer10Hg38) cne <- CNEFinal(cneFinalListDanRer10Hg38[["45_50"]]) makeAncoraFiles(cne, genomeFirst = "danRer10", genomeSecond = "hg38", threshold = "45_50")
data(cneFinalListDanRer10Hg38) cne <- CNEFinal(cneFinalListDanRer10Hg38[["45_50"]]) makeAncoraFiles(cne, genomeFirst = "danRer10", genomeSecond = "hg38", threshold = "45_50")
Make the bed tracks for the ‘Axt’ alignment.
makeAxtTracks(x)
makeAxtTracks(x)
x |
A |
The coordinates of query ‘Axt’ alignment are fixed to be relative to positive strand before output into ‘bed’ file.
A list of GRanges for target and query alignments. The two output ‘bed’ files are “targetAxt.bed” and “queryAxt.bed”.
Ge Tan
tAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit") qAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit") axtFn <- file.path(system.file("extdata", package="CNEr"), "danRer10.hg38.net.axt") axt <- readAxt(axtFn, tAssemblyFn, qAssemblyFn) makeAxtTracks(axt)
tAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit") qAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit") axtFn <- file.path(system.file("extdata", package="CNEr"), "danRer10.hg38.net.axt") axt <- readAxt(axtFn, tAssemblyFn, qAssemblyFn) makeAxtTracks(axt)
Make ‘Bed’, ‘bedGraph’, ‘BigWig’ files
from GRangePairs
for
display in other Genome Browser.
makeCNEDensity(x, outputDir = ".", genomeFirst = "first", genomeSecond = "second", threshold = "50_50", windowSizeFirst = 300L, windowSizeSecond = 300L)
makeCNEDensity(x, outputDir = ".", genomeFirst = "first", genomeSecond = "second", threshold = "50_50", windowSizeFirst = 300L, windowSizeSecond = 300L)
x |
|
outputDir |
|
genomeFirst , genomeSecond
|
|
threshold |
|
windowSizeFirst , windowSizeSecond
|
|
The CNE density is defined as the percentage of regions covered by CNEs within the smoothing window.
The filenames of output ‘Bed’, ‘bedGraph’ and ‘BigWig’ files.
This function is mainly for internal use in Lenhard group.
Ge Tan
## Not run: dbName <- file.path(system.file("extdata", package="CNEr"), "danRer10CNE.sqlite") qAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit") tAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit") cneGRangePairs <- readCNERangesFromSQLite(dbName=dbName, tableName="danRer10_hg38_45_50", tAssemblyFn=tAssemblyFn, qAssemblyFn=qAssemblyFn) makeCNEDensity(cneGRangePairs[1:1000]) ## End(Not run)
## Not run: dbName <- file.path(system.file("extdata", package="CNEr"), "danRer10CNE.sqlite") qAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit") tAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit") cneGRangePairs <- readCNERangesFromSQLite(dbName=dbName, tableName="danRer10_hg38_45_50", tAssemblyFn=tAssemblyFn, qAssemblyFn=qAssemblyFn) makeCNEDensity(cneGRangePairs[1:1000]) ## End(Not run)
Make Genomic Regulatory Blocks (GRBs) boundaries prediction from a set of CNEs.
makeGRBs(x, winSize=NULL, genes=NULL, ratio=1, background=c("chromosome", "genome"), minCNEs=1L)
makeGRBs(x, winSize=NULL, genes=NULL, ratio=1, background=c("chromosome", "genome"), minCNEs=1L)
x |
|
winSize |
|
genes |
|
ratio |
|
background |
|
minCNEs |
|
First we calculate the CNE densities from the CNEs.
Then we segment the regions according to the values of CNE densities.
The regions with CNE densities above the expected CNE densities * ratio are
considered as putative GRBs.
Putative GRBs that do not encompass any gene are filtered out.
Finally, the GRBs that have fewer than minCNEs
number of CNEs will
be filtered out.
A GRanges
object of GRB coordinates is returned.
The numbers of CNEs and the coordinates of CNEs within each GRB are
returned as a metadata column.
Ge Tan
library(TxDb.Drerio.UCSC.danRer10.refGene) refGenesDanRer10 <- genes(TxDb.Drerio.UCSC.danRer10.refGene) ancoraCNEsFns <- file.path(system.file("extdata", package="CNEr"), c("cne2wBf_cypCar1_danRer10_100_100", "cne2wBf_cteIde1_danRer10_100_100", "cne2wBf_AstMex102_danRer10_48_50")) cneList <- do.call(GRangesList, lapply(ancoraCNEsFns, readAncora, assembly="danRer10")) names(cneList) <- c("Common carp", "Grass carp", "Blind cave fish") seqlengths(cneList) <- seqlengths(TxDb.Drerio.UCSC.danRer10.refGene)[ names(seqlengths(cneList))] makeGRBs(cneList, winSize=200, genes=refGenesDanRer10, ratio=1.2, background="genome") makeGRBs(cneList, winSize=200, genes=refGenesDanRer10, ratio=1.2, background="chromosome", minCNEs=3L)
library(TxDb.Drerio.UCSC.danRer10.refGene) refGenesDanRer10 <- genes(TxDb.Drerio.UCSC.danRer10.refGene) ancoraCNEsFns <- file.path(system.file("extdata", package="CNEr"), c("cne2wBf_cypCar1_danRer10_100_100", "cne2wBf_cteIde1_danRer10_100_100", "cne2wBf_AstMex102_danRer10_48_50")) cneList <- do.call(GRangesList, lapply(ancoraCNEsFns, readAncora, assembly="danRer10")) names(cneList) <- c("Common carp", "Grass carp", "Blind cave fish") seqlengths(cneList) <- seqlengths(TxDb.Drerio.UCSC.danRer10.refGene)[ names(seqlengths(cneList))] makeGRBs(cneList, winSize=200, genes=refGenesDanRer10, ratio=1.2, background="genome") makeGRBs(cneList, winSize=200, genes=refGenesDanRer10, ratio=1.2, background="chromosome", minCNEs=3L)
Given a Axt
alignment, plot a heatmap showing the percentage of
each matched alignments.
matchDistribution(x, size=10000, title=NULL)
matchDistribution(x, size=10000, title=NULL)
x |
|
size |
|
title |
|
By default, if there are more than 10,000 alignments, 10,000 alignments will be sampled and calculated for the distribution for speed purposes.
Only the four bases (A, C, G, T), gap (-) and any (N) are displayed. Other ambiguous bases are not considered.
A ggplot2 object will be returned.
Ge Tan
axtFile <- file.path(system.file("extdata", package="CNEr"), "hg38.danRer10.net.axt") axt <- readAxt(axtFile) matchDistribution(axt)
axtFile <- file.path(system.file("extdata", package="CNEr"), "hg38.danRer10.net.axt") axt <- readAxt(axtFile) matchDistribution(axt)
Calculate the N50, N90 values for a fasta or 2bit file.
N50(fn) N90(fn)
N50(fn) N90(fn)
fn |
|
This function calculates the N50, N90 values for an assembly. The N50 value is calculated by first ordering every contig/scaffold by length from longest to shortest. Next, starting from the longest contig/scaffold, the lengths of each contig are summed, until this running sum equals one-half of the total length of all contigs/scaffolds in the assembly. Then the length of shortest contig/scaffold in this list is the N50 value. Similar procedure is used for N90 but including 90% of the assembly.
An integer value of N50 or N90 value.
Ge Tan
twoBitFn <- file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit") N50(twoBitFn)
twoBitFn <- file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit") N50(twoBitFn)
Wrapper function of netToAxt
and axtSort
:
convert net (and chain) to axt, and sort axt files.
This function doesn't work on the Windows platform since Kent utilities only
support Linux and Unix platforms.
netToAxt(in.net, in.chain, assemblyTarget, assemblyQuery, axtFile=paste0(sub("\\.2bit$", "", basename(assemblyTarget), ignore.case = TRUE), ".", sub("\\.2bit$", "", basename(assemblyQuery), ignore.case = TRUE), ".net.axt"), removeFiles=FALSE, binaryNetToAxt="netToAxt", binaryAxtSort="axtSort")
netToAxt(in.net, in.chain, assemblyTarget, assemblyQuery, axtFile=paste0(sub("\\.2bit$", "", basename(assemblyTarget), ignore.case = TRUE), ".", sub("\\.2bit$", "", basename(assemblyQuery), ignore.case = TRUE), ".net.axt"), removeFiles=FALSE, binaryNetToAxt="netToAxt", binaryAxtSort="axtSort")
in.net |
|
in.chain |
|
assemblyTarget |
|
assemblyQuery |
|
axtFile |
|
removeFiles |
|
binaryNetToAxt |
|
binaryAxtSort |
|
character
(1): the file name of output axt file.
Ge Tan
http://hgdownload.cse.ucsc.edu/admin/exe/
## Not run: ## This example doesn't run because it requires two bit files and external ## Kent utilities. in.net <- file.path("/Users/gtan/OneDrive/Project/CSC/CNEr/axt", "danRer10.hg38.noClass.net") in.chain <- file.path("/Users/gtan/OneDrive/Project/CSC/CNEr/axt", "danRer10.hg38.all.pre.chain") assemblyTarget <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/danRer10.2bit" assemblyQuery <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/hg38.2bit" netToAxt(in.net, in.chain, assemblyTarget, assemblyQuery, axtFile=file.path("/Users/gtan/OneDrive/Project/CSC/CNEr/axt", paste0(sub("\\.2bit$", "", basename(assemblyTarget), ignore.case = TRUE), ".", sub("\\.2bit$", "", basename(assemblyQuery), ignore.case = TRUE), ".net.axt")), removeFiles=FALSE, binaryNetToAxt="netToAxt", binaryAxtSort="axtSort") ## End(Not run)
## Not run: ## This example doesn't run because it requires two bit files and external ## Kent utilities. in.net <- file.path("/Users/gtan/OneDrive/Project/CSC/CNEr/axt", "danRer10.hg38.noClass.net") in.chain <- file.path("/Users/gtan/OneDrive/Project/CSC/CNEr/axt", "danRer10.hg38.all.pre.chain") assemblyTarget <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/danRer10.2bit" assemblyQuery <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit/hg38.2bit" netToAxt(in.net, in.chain, assemblyTarget, assemblyQuery, axtFile=file.path("/Users/gtan/OneDrive/Project/CSC/CNEr/axt", paste0(sub("\\.2bit$", "", basename(assemblyTarget), ignore.case = TRUE), ".", sub("\\.2bit$", "", basename(assemblyQuery), ignore.case = TRUE), ".net.axt")), removeFiles=FALSE, binaryNetToAxt="netToAxt", binaryAxtSort="axtSort") ## End(Not run)
Given the desired organism name, fetch the mapping between KEGG IDs and Entrez gene IDs.
orgKEGGIds2EntrezIDs(organism="Homo sapiens")
orgKEGGIds2EntrezIDs(organism="Homo sapiens")
organism |
|
A list
of Entrez gene IDs with KEGG IDs as names.
Ge Tan
## Not run: orgKEGGIds2EntrezIDs(organism="Homo sapiens") ## End(Not run)
## Not run: orgKEGGIds2EntrezIDs(organism="Homo sapiens") ## End(Not run)
Plot the CNE genomic location distribution. It gives an overview of the tendency of CNEs to form clusters.
plotCNEDistribution(x, chrs = NULL, chrScale = c("Mb", "Kb"))
plotCNEDistribution(x, chrs = NULL, chrScale = c("Mb", "Kb"))
x |
|
chrs |
|
chrScale |
|
In the plot, x axis is the genomic location along each chromosome/scaffold. The y axis is the sequential CNE number. A typical CNE cluster can be spotted by the dramatic increase in y axis and small increase in x axis.
A ggplot
object.
Ge Tan
dbName <- file.path(system.file("extdata", package="CNEr"), "danRer10CNE.sqlite") qAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit") tAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit") cneGRangePairs <- readCNERangesFromSQLite(dbName=dbName, tableName="danRer10_hg38_45_50", tAssemblyFn=tAssemblyFn, qAssemblyFn=qAssemblyFn) plotCNEDistribution(first(cneGRangePairs))
dbName <- file.path(system.file("extdata", package="CNEr"), "danRer10CNE.sqlite") qAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit") tAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit") cneGRangePairs <- readCNERangesFromSQLite(dbName=dbName, tableName="danRer10_hg38_45_50", tAssemblyFn=tAssemblyFn, qAssemblyFn=qAssemblyFn) plotCNEDistribution(first(cneGRangePairs))
CNE widths can follow heavy tailed distribution that are associated with power-laws. This function plots the reverse cumulative density distribution of CNE widths, and fits a discrete power-law distribution. Goodness of fit can also be evaluated.
plotCNEWidth(x, ...)
plotCNEWidth(x, ...)
x |
|
... |
Additional points passed to |
The power-law distribution is associated with heavy tailed distribution.
A reverse cumulative density distribution plot will be generated with optimal lower bound xmin, scaling parameteralpha for power-law fit.
An invisible list of fitted model is returned.
The power-law distribution implementation is based on the poweRlaw package.
Ge Tan
Salerno, W., Havlak, P., and Miller, J. (2006). Scale-invariant structure of strongly conserved sequence in genomic intersections and alignments. Proc. Natl. Acad. Sci. U.S.A. 103, 13121-13125.
dbName <- file.path(system.file("extdata", package="CNEr"), "danRer10CNE.sqlite") cneGRangePairs <- readCNERangesFromSQLite(dbName=dbName, tableName="danRer10_hg38_45_50") plotCNEWidth(cneGRangePairs)
dbName <- file.path(system.file("extdata", package="CNEr"), "danRer10CNE.sqlite") cneGRangePairs <- readCNERangesFromSQLite(dbName=dbName, tableName="danRer10_hg38_45_50") plotCNEWidth(cneGRangePairs)
Given two GRanges
objects, select the Axt
alignments
whose the target and query alignments are both within each pair of ranges.
psubAxt(x, targetSearch, querySearch)
psubAxt(x, targetSearch, querySearch)
x |
|
targetSearch , querySearch
|
|
The ‘targetSearch’ and ‘querySearch’ have the coordinates relative to the positive strand. For each pair of the ranges, the alignments that lie within both the target and query range are kept.
A Axt
object.
Ge Tan
library(GenomicRanges) tAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit") qAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit") axtFn <- file.path(system.file("extdata", package="CNEr"), "danRer10.hg38.net.axt") axt <- readAxt(axtFn, tAssemblyFn, qAssemblyFn) targetSearch <- GRanges(seqnames=c("chr6"), ranges=IRanges(start=c(24000000, 26900000), end=c(24060000, 26905000)), strand="+" ) querySearch <- GRanges(seqnames=c("chr7", "chr2"), ranges=IRanges(start=c(12577000, 241262700), end=c(12579000, 241268600)), strand="+" ) psubAxt(axt, targetSearch, querySearch)
library(GenomicRanges) tAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit") qAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit") axtFn <- file.path(system.file("extdata", package="CNEr"), "danRer10.hg38.net.axt") axt <- readAxt(axtFn, tAssemblyFn, qAssemblyFn) targetSearch <- GRanges(seqnames=c("chr6"), ranges=IRanges(start=c(24000000, 26900000), end=c(24060000, 26905000)), strand="+" ) querySearch <- GRanges(seqnames=c("chr7", "chr2"), ranges=IRanges(start=c(12577000, 241262700), end=c(12579000, 241268600)), strand="+" ) psubAxt(axt, targetSearch, querySearch)
Query the CNEData package to fetch the CNEs based on target, query species, winSize and identity.
queryCNEData(dbName, target, query, winSize, identity, type=c("target", "all"))
queryCNEData(dbName, target, query, winSize, identity, type=c("target", "all"))
dbName |
The path of SQLite database. |
target , query
|
The CNEs between target and query species. |
winSize , identity
|
The thresholds of CNEs to fetch on identity over winSize. |
type |
Which set of CNEs are returned. When it is "all", the CNEs of target always on the left side of returned data.frame. |
A data.frame of CNEs coordinates in chr, start, end.
Ge Tan
Read a RepeatMasker .out file into a GRanges
object.
read.rmMask.GRanges(fn)
read.rmMask.GRanges(fn)
fn |
|
A GRanges
object with metadata columns containing the
name of the matching interspersed repeat,
the class of the repeat and the Smith-Waterman score of the match.
Ge Tan
http://www.repeatmasker.org/webrepeatmaskerhelp.html
fn <- system.file("extdata", "ce2chrM.fa.out", package="IRanges") read.rmMask.GRanges(fn)
fn <- system.file("extdata", "ce2chrM.fa.out", package="IRanges") read.rmMask.GRanges(fn)
Read a soft repeat masked fasta file into a GRanges
object.
read.rmskFasta(fn)
read.rmskFasta(fn)
fn |
|
Only the lower case based ("a", "c", "g", "t") are considered in the soft repeat masked fasta.
GRanges
object with coordinates of repeat masked regions.
Ge Tan
fn <- file.path(system.file("extdata", package="CNEr"), "rmsk.fa") read.rmskFasta(fn)
fn <- file.path(system.file("extdata", package="CNEr"), "rmsk.fa") read.rmskFasta(fn)
Read the Ancora CNE file into a GRanges
or GRangePairs
object.
readAncora(fn, assembly=NULL, tAssemblyFn=NULL, qAssemblyFn=NULL)
readAncora(fn, assembly=NULL, tAssemblyFn=NULL, qAssemblyFn=NULL)
fn |
|
assembly |
|
tAssemblyFn , qAssemblyFn
|
|
The Ancora CNE filename has its own naming style. For example, "cne2wBf_hg38_mm10_50_50" denotes human coordinates for the first three columns of the file and mouse coordinates from the forth to the sixth column.
The start coordinate system is 0-based.
A GRanges
object of the CNE ranges when assembly is specified,
or a GRangePairs
object when assembly is NULL.
This function is mainly for internal use in Lenhard group.
Ge Tan
fn <- file.path(system.file("extdata", package="CNEr"), "cne2wBf_danRer10_hg38_45_50") zebrafishCNEs <- readAncora(fn, "danRer10") humanCNEs <- readAncora(fn, "hg38") zebrafishHumanCNEs <- readAncora(fn)
fn <- file.path(system.file("extdata", package="CNEr"), "cne2wBf_danRer10_hg38_45_50") zebrafishCNEs <- readAncora(fn, "danRer10") humanCNEs <- readAncora(fn, "hg38") zebrafishHumanCNEs <- readAncora(fn)
Read Ancora legacy CNE format into a SQLite database.
readAncoraIntoSQLite(cneFns, dbName, overwrite=FALSE)
readAncoraIntoSQLite(cneFns, dbName, overwrite=FALSE)
cneFns |
|
dbName |
|
overwrite |
|
The Ancora legacy CNE file has the filename in the format of "cne2wBf_AstMex102_danRer10_48_50". The first six columns are the coordinates of pairs of CNEs. The start coordinate system is 0-based and is converted into 1-based when it is imported into the SQLite database.
A character
vector of table names.
This function is mainly for internal use in Lenhard group.
Ge Tan
ancoraCNEsFns <- file.path(system.file("extdata", package="CNEr"), c("cne2wBf_cypCar1_danRer10_100_100", "cne2wBf_cteIde1_danRer10_100_100", "cne2wBf_AstMex102_danRer10_48_50")) dbName <- tempfile() readAncoraIntoSQLite(ancoraCNEsFns, dbName, overwrite=FALSE)
ancoraCNEsFns <- file.path(system.file("extdata", package="CNEr"), c("cne2wBf_cypCar1_danRer10_100_100", "cne2wBf_cteIde1_danRer10_100_100", "cne2wBf_AstMex102_danRer10_48_50")) dbName <- tempfile() readAncoraIntoSQLite(ancoraCNEsFns, dbName, overwrite=FALSE)
This function reads the ‘Axt’ files into an Axt
object.
readAxt(axtFiles, tAssemblyFn=NULL, qAssemblyFn=NULL)
readAxt(axtFiles, tAssemblyFn=NULL, qAssemblyFn=NULL)
axtFiles |
|
tAssemblyFn , qAssemblyFn
|
|
This function reads the ‘Axt’ files of two assemblies.
It can be a single big ‘Axt’ file or several small ‘Axt’ files.
Contrary to the start coordinate in ‘Axt’ file,
the start coordinate in Axt
object is 1-based.
When ‘tAssemblyFn’ and ‘qAssemblyFn’ are
not NULL
, the corresponding Seqinfo
will be added into the returned Axt
object.
A object Axt
is returned.
Ge Tan
axtFile <- file.path(system.file("extdata", package="CNEr"), "hg38.danRer10.net.axt") tAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit") qAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit") axt <- readAxt(axtFile, tAssemblyFn, qAssemblyFn)
axtFile <- file.path(system.file("extdata", package="CNEr"), "hg38.danRer10.net.axt") tAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit") qAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit") axt <- readAxt(axtFile, tAssemblyFn, qAssemblyFn)
Read the coordinates information from a bed file.
readBed(bedFile, assemblyFn=NULL)
readBed(bedFile, assemblyFn=NULL)
bedFile |
|
assemblyFn |
|
This function is designed to read the bed file as ‘chrom’, ‘chromStart’, ‘chromEnd’. The strand information is also stored where available.
In the bed file, the ‘chromStart’ is on the 0-based coordinate system
while ‘chromEnd’ is on the 1-based coordinate system.
For example, the first 100 bases of a chromosome are
defined as ‘chromStart=0’, ‘chromEnd=100’,
and span the bases numbered 0-99.
When it is read into GRanges
,
both the ‘chromStart’ and ‘chromEnd’ are on 1-based coordinate,
i.e.,
‘chromStart=1’ and ‘chromEnd=100’.
When ‘assemblyFn’ is not NULL
, the corresponding Seqinfo
will be added into the returned GRanges
.
A GRanges
object is returned.
When no strand information is available in the bed file,
all the ranges are assumed to be on the positive strand.
Ge Tan
https://genome.ucsc.edu/FAQ/FAQformat.html#format1
bedFn <- file.path(system.file("extdata", package="CNEr"), "filter_regions.hg38.bed") assemblyFn <- file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit") bed <- readBed(bedFn, assemblyFn=assemblyFn)
bedFn <- file.path(system.file("extdata", package="CNEr"), "filter_regions.hg38.bed") assemblyFn <- file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit") bed <- readBed(bedFn, assemblyFn=assemblyFn)
Query the SQLite database based on chromosome,
coordinates and some other criteria.
Primarily not intended to be used directly.
For the CNE density plot, fetchCNEDensity
function should be used.
readCNERangesFromSQLite(dbName, tableName, chr=NULL, start=NULL, end=NULL, whichAssembly=c("first","second"), minLength=NULL, tAssemblyFn=NULL, qAssemblyFn=NULL)
readCNERangesFromSQLite(dbName, tableName, chr=NULL, start=NULL, end=NULL, whichAssembly=c("first","second"), minLength=NULL, tAssemblyFn=NULL, qAssemblyFn=NULL)
dbName |
A object of |
tableName |
A object of |
chr |
|
start , end
|
|
whichAssembly |
|
minLength |
|
tAssemblyFn , qAssemblyFn
|
|
An object of GRangePairs
is returned.
Ge Tan
dbName <- file.path(system.file("extdata", package="CNEr"), "danRer10CNE.sqlite") tableName <- "danRer10_hg38_45_50" qAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit") tAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit") ## single chr, start, end chr <- "chr6" start <- 24000000L end <- 27000000 minLength <- 50L fetchedCNERanges <- readCNERangesFromSQLite(dbName, tableName, chr, start, end, whichAssembly="first", minLength=minLength, tAssemblyFn=tAssemblyFn, qAssemblyFn=qAssemblyFn) ## multiple chr, start, end chr=c("chr1", "chr3") start=c(90730248, 137523122) end=c(90730300, 137523190) fetchedCNERanges <- readCNERangesFromSQLite(dbName, tableName, chr, start, end, whichAssembly="second", minLength=minLength) ## chr, NULL, NULL fetchedCNERanges <- readCNERangesFromSQLite(dbName, tableName, chr, start=NULL, end=NULL, whichAssembly="second", minLength=minLength)
dbName <- file.path(system.file("extdata", package="CNEr"), "danRer10CNE.sqlite") tableName <- "danRer10_hg38_45_50" qAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit") tAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit") ## single chr, start, end chr <- "chr6" start <- 24000000L end <- 27000000 minLength <- 50L fetchedCNERanges <- readCNERangesFromSQLite(dbName, tableName, chr, start, end, whichAssembly="first", minLength=minLength, tAssemblyFn=tAssemblyFn, qAssemblyFn=qAssemblyFn) ## multiple chr, start, end chr=c("chr1", "chr3") start=c(90730248, 137523122) end=c(90730300, 137523190) fetchedCNERanges <- readCNERangesFromSQLite(dbName, tableName, chr, start, end, whichAssembly="second", minLength=minLength) ## chr, NULL, NULL fetchedCNERanges <- readCNERangesFromSQLite(dbName, tableName, chr, start=NULL, end=NULL, whichAssembly="second", minLength=minLength)
This function reverses the cigar string, i.e., 20M15I10D will be reversed to 10D15I20M.
reverseCigar(cigar, ops=CIGAR_OPS)
reverseCigar(cigar, ops=CIGAR_OPS)
cigar |
A character vector of cigar strings. |
ops |
A character vector of the extended CIGAR operations. By default, CIGAR_OPS is used. |
A character vector contains the reversed cigar strings.
Ge Tan
cigar = c("20M15I10D", "10D15I20M") reverseCigar(cigar)
cigar = c("20M15I10D", "10D15I20M") reverseCigar(cigar)
This function saves the CNE results into a local SQLite database.
saveCNEToSQLite(x, dbName, tableName=NULL, overwrite=FALSE)
saveCNEToSQLite(x, dbName, tableName=NULL, overwrite=FALSE)
x |
An object of |
dbName |
|
tableName |
|
overwrite |
|
before loading into an SQLite database, a bin indexing system is used to index the CNE range, which provides faster SQL query.
Ge Tan
dbName <- tempfile() data(cneFinalListDanRer10Hg38) tableNames <- paste("danRer10", "hg38", names(cneFinalListDanRer10Hg38), sep="_") for(i in 1:length(cneFinalListDanRer10Hg38)){ saveCNEToSQLite(cneFinalListDanRer10Hg38[[i]], dbName, tableNames[i], overwrite=TRUE) }
dbName <- tempfile() data(cneFinalListDanRer10Hg38) tableNames <- paste("danRer10", "hg38", names(cneFinalListDanRer10Hg38), sep="_") for(i in 1:length(cneFinalListDanRer10Hg38)){ saveCNEToSQLite(cneFinalListDanRer10Hg38[[i]], dbName, tableNames[i], overwrite=TRUE) }
Generates the scoring matrix for lastz aligner.
scoringMatrix(distance = c("far", "medium", "near"))
scoringMatrix(distance = c("far", "medium", "near"))
distance |
It can be "far", "medium" or "close". It defines the scoring matrix used in lastz aligner. Generally, if two species are close to each other, for example human and chimp, "close" should be used. If two species have a divergence time of 100 MYA, "far" should be used. In other cases, "medium" should be used. |
A matrix
of the scoring matrix is returned.
HOXD70 is medium. HoxD55 is far. human-chimp.v2 is close.
Ge Tan
http://genomewiki.ucsc.edu/index.php/Hg38_17-way_conservation_lastz_parameters
scoringMatrix(distance="far")
scoringMatrix(distance="far")
Axt
objectA ‘subAxt’ method for extracting a set of alignments from
an Axt
object.
subAxt(x, chr, start, end, select=c("target", "query"), qSize=NULL)
subAxt(x, chr, start, end, select=c("target", "query"), qSize=NULL)
x |
An object of |
chr |
An object of |
start , end
|
An object of |
select |
When select is ‘target’,
the subset criteria are applied on target alignments in |
qSize |
|
Usually when we want to subset some axts from a Axt
object,
we care about all the axts within a certain range.
The axts can come from the axt file with chr as reference
(i.e., target sequence),
or the axt file with chr as query sequence.
When the chr is query sequence, it can be on the negative strand.
Hence, the size of chromosome is necessary to
convert the search range to a range on negative strand coordinate.
When one Axt alignment partially overlaps the range, the whole Axt alignment will be extracted.
An extracted Axt
object is returned.
Ge Tan
library(GenomicRanges) library(rtracklayer) ## Prepare the axt object tAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit") qAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit") axtFilesHg38DanRer10 <- file.path(system.file("extdata", package="CNEr"), "hg38.danRer10.net.axt") axtHg38DanRer10 <- readAxt(axtFilesHg38DanRer10, tAssemblyFn, qAssemblyFn) ## "character", "integer", "integer" on "target" sequence subAxt(axtHg38DanRer10, chr="chr1", start=148165963L, end=222131835L, select="target") ## "GRanges"" on "target" sequence searchGRanges <- GRanges(seqnames="chr1", ranges=IRanges(start=148165963L, end=222131835L), strand="+") subAxt(axtHg38DanRer10, searchGRanges, select="target") ## multiple "character", "integer", "integer" on "target" sequence subAxt(axtHg38DanRer10, chr=c("chr1", "chr13"), start=c(148165963L, 94750629L), end=c(222131835L, 94966991L), select="target") ## "character" only on "target" sequence subAxt(axtHg38DanRer10, chr="chr1", select="target") ## GRanges on "query" sequence searchGRanges <- GRanges(seqnames="chr6", ranges=IRanges(start=25825774, end=26745499), strand="+") subAxt(axtHg38DanRer10, searchGRanges, select="query")
library(GenomicRanges) library(rtracklayer) ## Prepare the axt object tAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Hsapiens.UCSC.hg38"), "single_sequences.2bit") qAssemblyFn <- file.path(system.file("extdata", package="BSgenome.Drerio.UCSC.danRer10"), "single_sequences.2bit") axtFilesHg38DanRer10 <- file.path(system.file("extdata", package="CNEr"), "hg38.danRer10.net.axt") axtHg38DanRer10 <- readAxt(axtFilesHg38DanRer10, tAssemblyFn, qAssemblyFn) ## "character", "integer", "integer" on "target" sequence subAxt(axtHg38DanRer10, chr="chr1", start=148165963L, end=222131835L, select="target") ## "GRanges"" on "target" sequence searchGRanges <- GRanges(seqnames="chr1", ranges=IRanges(start=148165963L, end=222131835L), strand="+") subAxt(axtHg38DanRer10, searchGRanges, select="target") ## multiple "character", "integer", "integer" on "target" sequence subAxt(axtHg38DanRer10, chr=c("chr1", "chr13"), start=c(148165963L, 94750629L), end=c(222131835L, 94966991L), select="target") ## "character" only on "target" sequence subAxt(axtHg38DanRer10, chr="chr1", select="target") ## GRanges on "query" sequence searchGRanges <- GRanges(seqnames="chr6", ranges=IRanges(start=25825774, end=26745499), strand="+") subAxt(axtHg38DanRer10, searchGRanges, select="query")
A collection of different functions used to deal with Axt
object.
summary(object, ...) ## mismatch number and proportion
summary(object, ...) ## mismatch number and proportion
object |
An |
... |
Currently not used. |
A table
object with the counts of mismatches, insertions, deletions
and the matches of each base.
Ge Tan
axtFilesHg38DanRer10 <- file.path(system.file("extdata", package="CNEr"), "hg38.danRer10.net.axt") axtHg38DanRer10 <- readAxt(axtFilesHg38DanRer10) summary(axtHg38DanRer10)
axtFilesHg38DanRer10 <- file.path(system.file("extdata", package="CNEr"), "hg38.danRer10.net.axt") axtHg38DanRer10 <- readAxt(axtFilesHg38DanRer10) summary(axtHg38DanRer10)
Syntenic dotplot for Axt
alignment object or GRangePairs
.
syntenicDotplot(x, firstSeqlengths=NULL, secondSeqlengths=NULL, firstChrs=NULL, secondChrs=NULL, col=c("blue", "red"), type=c("line", "dot"))
syntenicDotplot(x, firstSeqlengths=NULL, secondSeqlengths=NULL, firstChrs=NULL, secondChrs=NULL, col=c("blue", "red"), type=c("line", "dot"))
x |
|
firstSeqlengths , secondSeqlengths
|
|
firstChrs , secondChrs
|
|
col |
|
type |
“line” or “dot” plot type: When plotting massive number of ranges, “dot” should be used. Otherwise, “line” should be used. |
This syntenic dotplot is a type of scatter plot for Axt
object,
and line plot for GRangePairs
object.
In the case of possibly massive number of Axt
alignments,
the line plots will make it invisible at a large genome scale.
Each axis represents concatenated selected chromosomes laid end-to-end, and each dot in the scatter-plot represents a putative homologous match between the two genomes. These dotplots are used for whole genome comparisons within the same genome or across two genomes from different taxa in order to identify synteny.
A ggplot
object.
For highly fragmented assemblies, the synteny is invisible on the dotplot.
Ge Tan
library(GenomeInfoDb) library(BSgenome.Ggallus.UCSC.galGal3) library(BSgenome.Hsapiens.UCSC.hg19) ## dotplot for Axt object fn <- file.path(system.file("extdata", package="CNEr"), "chr4.hg19.galGal3.net.axt.gz") axt <- readAxt(fn) firstSeqlengths <- seqlengths(BSgenome.Hsapiens.UCSC.hg19) secondSeqlengths <- seqlengths(BSgenome.Ggallus.UCSC.galGal3) firstChrs <- c("chr4") secondChrs <- c("chr4") syntenicDotplot(axt, firstSeqlengths, secondSeqlengths, firstChrs=firstChrs, secondChrs=secondChrs, type="dot") ## dotplot for GRangePairs object data(grangesPairsForDotplot) syntenicDotplot(grangesPairsForDotplot, type="line")
library(GenomeInfoDb) library(BSgenome.Ggallus.UCSC.galGal3) library(BSgenome.Hsapiens.UCSC.hg19) ## dotplot for Axt object fn <- file.path(system.file("extdata", package="CNEr"), "chr4.hg19.galGal3.net.axt.gz") axt <- readAxt(fn) firstSeqlengths <- seqlengths(BSgenome.Hsapiens.UCSC.hg19) secondSeqlengths <- seqlengths(BSgenome.Ggallus.UCSC.galGal3) firstChrs <- c("chr4") secondChrs <- c("chr4") syntenicDotplot(axt, firstSeqlengths, secondSeqlengths, firstChrs=firstChrs, secondChrs=secondChrs, type="dot") ## dotplot for GRangePairs object data(grangesPairsForDotplot) syntenicDotplot(grangesPairsForDotplot, type="line")
writeAxt
function
Write an axt object into a file.
writeAxt(axt, con)
writeAxt(axt, con)
axt |
An |
con |
A connection object or a character string. |
Ge Tan
axtFile <- file.path(system.file("extdata", package="CNEr"), "hg38.danRer10.net.axt") axt <- readAxt(axtFile) writeAxt(axt, con=tempfile())
axtFile <- file.path(system.file("extdata", package="CNEr"), "hg38.danRer10.net.axt") axt <- readAxt(axtFile) writeAxt(axt, con=tempfile())