| Title: | Utilities for manipulating chromosome names, including modifying them to follow a particular naming style |
|---|---|
| Description: | Contains data and functions that define and allow translation between different chromosome sequence naming conventions (e.g., "chr1" versus "1"), including a function that attempts to place sequence names in their natural, rather than lexicographic, order. |
| Authors: | Sonali Arora [aut], Martin Morgan [aut], Marc Carlson [aut], Hervé Pagès [aut, cre], Prisca Chidimma Maduka [ctb], Atuhurira Kirabo Kakopo [ctb], Haleema Khan [ctb] (vignette translation from Sweave to Rmarkdown / HTML), Emmanuel Chigozie Elendu [ctb] |
| Maintainer: | Hervé Pagès <[email protected]> |
| License: | Artistic-2.0 |
| Version: | 1.49.1 |
| Built: | 2026-05-22 22:12:26 UTC |
| Source: | https://github.com/bioc/GenomeInfoDb |
getChromInfoFromEnsembl returns chromosome information like
sequence names, lengths and circularity flags for a given Ensembl
species e.g. Human, Cow, Saccharomyces cerevisiae, etc...
getChromInfoFromEnsembl(species, release=NA, division=NA, use.grch37=FALSE, assembled.molecules.only=FALSE, include.non_ref.sequences=FALSE, include.contigs=FALSE, include.clones=FALSE, map.NCBI=FALSE, recache=FALSE, as.Seqinfo=FALSE)getChromInfoFromEnsembl(species, release=NA, division=NA, use.grch37=FALSE, assembled.molecules.only=FALSE, include.non_ref.sequences=FALSE, include.contigs=FALSE, include.clones=FALSE, map.NCBI=FALSE, recache=FALSE, as.Seqinfo=FALSE)
species |
A single string specifying the name of an Ensembl species e.g.
Alternatively the name of an assembly (e.g. |
release |
The Ensembl release to query e.g. 89. If set to |
division |
|
use.grch37 |
NOT TESTED YET!
|
assembled.molecules.only |
NOT IMPLEMENTED YET! |
include.non_ref.sequences |
TODO: DOCUMENT THIS! |
include.contigs |
Whether or not sequences for which |
include.clones |
Whether or not sequences for which |
map.NCBI |
If Note that not all rows in the "Ensembl chrom info" data frame are
necessarily mapped to a row in the "NCBI chrom info" data frame.
For the unmapped rows the NCBI columns in the final data frame
are filled with NAs ( The primary use case for using |
recache |
|
as.Seqinfo |
|
COMING SOON...
For getChromInfoFromEnsembl: By default, a 7-column data frame
with columns:
name: character.
length: integer.
coord_system: factor.
synonyms: list.
toplevel: logical.
non_ref: logical.
circular: logical.
and with attribute species_info which contains details about
the species that was used to obtaine the data.
If map.NCBI is TRUE, then 7 "NCBI columns" are added
to the result:
NCBI.SequenceName: character.
NCBI.SequenceRole: factor.
NCBI.AssignedMolecule: factor.
NCBI.GenBankAccn: character.
NCBI.Relationship: factor.
NCBI.RefSeqAccn: character.
NCBI.AssemblyUnit: factor.
Note that the names of the "NCBI columns" are those returned
by getChromInfoFromNCBI but with the NCBI.
prefix added to them.
H. Pagès
getChromInfoFromNCBI and
getChromInfoFromUCSC for getting chromosome
information for an NCBI assembly or UCSC genome.
Seqinfo objects.
## --------------------------------------------------------------------- ## A. BASIC EXAMPLES ## --------------------------------------------------------------------- ## Internet access required! ## === Worm === ## https://uswest.ensembl.org/Caenorhabditis_elegans celegans <- getChromInfoFromEnsembl("celegans") attr(celegans, "species_info") getChromInfoFromEnsembl("celegans", as.Seqinfo=TRUE) celegans <- getChromInfoFromEnsembl("celegans", map.NCBI=TRUE) ## === Yeast === ## https://uswest.ensembl.org/Saccharomyces_cerevisiae scerevisiae <- getChromInfoFromEnsembl("scerevisiae") attr(scerevisiae, "species_info") getChromInfoFromEnsembl("scerevisiae", as.Seqinfo=TRUE) scerevisiae <- getChromInfoFromEnsembl("scerevisiae", map.NCBI=TRUE) ## Arabidopsis thaliana: athaliana <- getChromInfoFromEnsembl("athaliana", division="plants", map.NCBI=TRUE) attr(athaliana, "species_info") ## --------------------------------------------------------------------- ## Temporary stuff that needs to go away... ## --------------------------------------------------------------------- ## TODO: Check all species for which an NCBI assembly is registered! ## Checked so far (with current Ensembl release i.e. 99): ## - celegans OK ## - scerevisiae OK ## - athaliana OK ## - btaurus OK ## - sscrofa OK ## Not run: ## WORK IN PROGRESS!!! library(GenomeInfoDb) .do_join <- GenomeInfoDb:::.do_join .map_Ensembl_seqlevels_to_NCBI_seqlevels <- GenomeInfoDb:::.map_Ensembl_seqlevels_to_NCBI_seqlevels .map_Ensembl_seqlevels_to_NCBI_seqlevels( paste0("ENS_", 1:26), CharacterList(c(list(c(aa="INSDC1", bb="GNBK7"), c("INSDC2", "RefSeq3")), rep(list(NULL), 23), list("NCBI_7"))), paste0("NCBI_", 1:10), paste0("GNBK", c(1:8, NA, 9)), c(paste0("REFSEQ", c(1:7, 1, 1)), NA), verbose=TRUE ) map_to_NCBI <- function(Ensembl_chrom_info, NCBI_chrom_info, special_mappings=NULL) { .map_Ensembl_seqlevels_to_NCBI_seqlevels( Ensembl_chrom_info[ , "name"], Ensembl_chrom_info[ , "synonyms"], NCBI_chrom_info[ , "SequenceName"], NCBI_chrom_info[ , "GenBankAccn"], NCBI_chrom_info[ , "RefSeqAccn"], special_mappings=special_mappings, verbose=TRUE) } ## ------------------------------------------------------------------------ ## Human ## https://uswest.ensembl.org/Homo_sapiens/ ## Based on GRCh38.p13 (GCA_000001405.28) ## Return 944 rows human_chrom_info <- getChromInfoFromEnsembl("hsapiens") # 1 id: 131550 <- ref chromosome # CHR_HSCHR1_1_CTG3 id: 131561 <- non-ref chromosome # HSCHR1_1_CTG3 id: 131562 <- scaffold (no scaffold is non_ref) ## Map to NCBI ## Summary: ## - 639/640 NCBI sequences are reverse-mapped. ## - Restricted mapping is one-to-one. GRCh38.p13 <- getChromInfoFromNCBI("GRCh38.p13") L2R <- map_to_NCBI(human_chrom_info, GRCh38.p13) ## The only sequence in GRCh38.p13 that cannot be mapped to Ensembl is ## HG2139_PATCH (was introduced in GRCh38.p2)! Why? What's special about ## this patch? GRCh38.p13$mapped <- tabulate(L2R, nbins=nrow(GRCh38.p13)) != 0L table(GRCh38.p13$SequenceRole, GRCh38.p13$mapped) # FALSE TRUE # assembled-molecule 0 25 # alt-scaffold 0 261 # unlocalized-scaffold 0 42 # unplaced-scaffold 0 127 # pseudo-scaffold 0 0 # fix-patch 1 112 # novel-patch 0 72 human_chrom_info <- .do_join(human_chrom_info, GRCh38.p13, L2R) table(human_chrom_info$SequenceRole, human_chrom_info$toplevel) # FALSE TRUE # assembled-molecule 0 25 # alt-scaffold 261 0 # unlocalized-scaffold 0 42 # unplaced-scaffold 0 127 # pseudo-scaffold 0 0 # fix-patch 112 0 # novel-patch 72 0 #hsa_seqlevels <- readRDS("hsapiens_gene_ensembl_txdb_seqlevels.rds") ## ------------------------------------------------------------------------ ## Mouse ## https://uswest.ensembl.org/Mus_musculus/ ## Based on GRCm38.p6 (GCA_000001635.8) ## Return 258 rows mouse_chrom_info <- getChromInfoFromEnsembl("mmusculus") ## Map to NCBI ## Summary: ## - 139/239 NCBI sequences are reverse-mapped. ## - Restricted mapping is NOT one-to-one: 2 Ensembl sequences (NC_005089.1 ## and MT) are both mapped to NCBI MT. GRCm38.p6 <- getChromInfoFromNCBI("GRCm38.p6") L2R <- map_to_NCBI(mouse_chrom_info, GRCm38.p6) ## 100 sequences in GRCm38.p6 are not mapped: GRCm38.p6$mapped <- tabulate(L2R, nbins=nrow(GRCm38.p6)) != 0L table(GRCm38.p6$SequenceRole, GRCm38.p6$mapped) # FALSE TRUE # assembled-molecule 0 22 # alt-scaffold 99 0 # unlocalized-scaffold 0 22 # unplaced-scaffold 0 22 # pseudo-scaffold 0 0 # fix-patch 1 64 # novel-patch 0 9 ## OK so Ensembl doesn't include the alt-scaffolds for Mouse. BUT WHAT ## HAPPENED TO THIS ONE fix-patch SEQUENCE (MG4237_PATCH) THAT IS NOT ## MAPPED? Found it in seq_region_synonym table! It's seq_region_id=100405. ## Hey but that seq_region_id is **NOT** in the seq_region table!!! THIS ## VIOLATES FOREIGN KEY CONSTRAINT!!!! mouse_chrom_info <- .do_join(mouse_chrom_info, GRCm38.p6, L2R) ## Ensembl does NOT comsider NC_005089.1 (duplicate entry for MT) toplevel: mouse_chrom_info[mouse_chrom_info$SequenceName # name length coord_system synonyms toplevel # 184 NC_005089.1 16299 scaffold FALSE # 201 MT 16299 chromosome NC_005089.1, chrM, AY172335.1 TRUE # SequenceName GenBankAccn RefSeqAccn # 184 MT AY172335.1 NC_005089.1 # 201 MT AY172335.1 NC_005089.1 ## ------------------------------------------------------------------------ ## Rat ## https://uswest.ensembl.org/Rattus_norvegicus/ ## Based on Rnor_6.0 (GCA_000001895.4) # Return 1418 rows rat_chrom_info <- getChromInfoFromEnsembl("rnorvegicus") ## Map to NCBI ## Summary: ## - 955/955 NCBI sequences are reverse-mapped. ## - Reverse mapping is one-to-many: 2 Ensembl sequences (NC_001665.2 and MT) ## are mapped to NCBI MT. Rnor_6.0 <- getChromInfoFromNCBI("Rnor_6.0") L2R <- map_to_NCBI(rat_chrom_info, Rnor_6.0) rat_chrom_info <- .do_join(rat_chrom_info, Rnor_6.0, L2R) ## Ensembl does NOT comsider NC_001665.2 (duplicate entry for MT) toplevel: rat_chrom_info[rat_chrom_info$SequenceName # name length coord_system synonyms toplevel # 1417 NC_001665.2 16313 scaffold FALSE # 1418 MT 16313 chromosome NC_001665.2, AY172581.1, chrM TRUE # SequenceName GenBankAccn RefSeqAccn # 1417 MT AY172581.1 NC_001665.2 # 1418 MT AY172581.1 NC_001665.2 table(rat_chrom_info$SequenceRole, rat_chrom_info$toplevel) # FALSE TRUE # assembled-molecule 1 23 # alt-scaffold 0 0 # unlocalized-scaffold 0 354 # unplaced-scaffold 0 578 # pseudo-scaffold 0 0 # fix-patch 0 0 # novel-patch 0 0 ## End(Not run)## --------------------------------------------------------------------- ## A. BASIC EXAMPLES ## --------------------------------------------------------------------- ## Internet access required! ## === Worm === ## https://uswest.ensembl.org/Caenorhabditis_elegans celegans <- getChromInfoFromEnsembl("celegans") attr(celegans, "species_info") getChromInfoFromEnsembl("celegans", as.Seqinfo=TRUE) celegans <- getChromInfoFromEnsembl("celegans", map.NCBI=TRUE) ## === Yeast === ## https://uswest.ensembl.org/Saccharomyces_cerevisiae scerevisiae <- getChromInfoFromEnsembl("scerevisiae") attr(scerevisiae, "species_info") getChromInfoFromEnsembl("scerevisiae", as.Seqinfo=TRUE) scerevisiae <- getChromInfoFromEnsembl("scerevisiae", map.NCBI=TRUE) ## Arabidopsis thaliana: athaliana <- getChromInfoFromEnsembl("athaliana", division="plants", map.NCBI=TRUE) attr(athaliana, "species_info") ## --------------------------------------------------------------------- ## Temporary stuff that needs to go away... ## --------------------------------------------------------------------- ## TODO: Check all species for which an NCBI assembly is registered! ## Checked so far (with current Ensembl release i.e. 99): ## - celegans OK ## - scerevisiae OK ## - athaliana OK ## - btaurus OK ## - sscrofa OK ## Not run: ## WORK IN PROGRESS!!! library(GenomeInfoDb) .do_join <- GenomeInfoDb:::.do_join .map_Ensembl_seqlevels_to_NCBI_seqlevels <- GenomeInfoDb:::.map_Ensembl_seqlevels_to_NCBI_seqlevels .map_Ensembl_seqlevels_to_NCBI_seqlevels( paste0("ENS_", 1:26), CharacterList(c(list(c(aa="INSDC1", bb="GNBK7"), c("INSDC2", "RefSeq3")), rep(list(NULL), 23), list("NCBI_7"))), paste0("NCBI_", 1:10), paste0("GNBK", c(1:8, NA, 9)), c(paste0("REFSEQ", c(1:7, 1, 1)), NA), verbose=TRUE ) map_to_NCBI <- function(Ensembl_chrom_info, NCBI_chrom_info, special_mappings=NULL) { .map_Ensembl_seqlevels_to_NCBI_seqlevels( Ensembl_chrom_info[ , "name"], Ensembl_chrom_info[ , "synonyms"], NCBI_chrom_info[ , "SequenceName"], NCBI_chrom_info[ , "GenBankAccn"], NCBI_chrom_info[ , "RefSeqAccn"], special_mappings=special_mappings, verbose=TRUE) } ## ------------------------------------------------------------------------ ## Human ## https://uswest.ensembl.org/Homo_sapiens/ ## Based on GRCh38.p13 (GCA_000001405.28) ## Return 944 rows human_chrom_info <- getChromInfoFromEnsembl("hsapiens") # 1 id: 131550 <- ref chromosome # CHR_HSCHR1_1_CTG3 id: 131561 <- non-ref chromosome # HSCHR1_1_CTG3 id: 131562 <- scaffold (no scaffold is non_ref) ## Map to NCBI ## Summary: ## - 639/640 NCBI sequences are reverse-mapped. ## - Restricted mapping is one-to-one. GRCh38.p13 <- getChromInfoFromNCBI("GRCh38.p13") L2R <- map_to_NCBI(human_chrom_info, GRCh38.p13) ## The only sequence in GRCh38.p13 that cannot be mapped to Ensembl is ## HG2139_PATCH (was introduced in GRCh38.p2)! Why? What's special about ## this patch? GRCh38.p13$mapped <- tabulate(L2R, nbins=nrow(GRCh38.p13)) != 0L table(GRCh38.p13$SequenceRole, GRCh38.p13$mapped) # FALSE TRUE # assembled-molecule 0 25 # alt-scaffold 0 261 # unlocalized-scaffold 0 42 # unplaced-scaffold 0 127 # pseudo-scaffold 0 0 # fix-patch 1 112 # novel-patch 0 72 human_chrom_info <- .do_join(human_chrom_info, GRCh38.p13, L2R) table(human_chrom_info$SequenceRole, human_chrom_info$toplevel) # FALSE TRUE # assembled-molecule 0 25 # alt-scaffold 261 0 # unlocalized-scaffold 0 42 # unplaced-scaffold 0 127 # pseudo-scaffold 0 0 # fix-patch 112 0 # novel-patch 72 0 #hsa_seqlevels <- readRDS("hsapiens_gene_ensembl_txdb_seqlevels.rds") ## ------------------------------------------------------------------------ ## Mouse ## https://uswest.ensembl.org/Mus_musculus/ ## Based on GRCm38.p6 (GCA_000001635.8) ## Return 258 rows mouse_chrom_info <- getChromInfoFromEnsembl("mmusculus") ## Map to NCBI ## Summary: ## - 139/239 NCBI sequences are reverse-mapped. ## - Restricted mapping is NOT one-to-one: 2 Ensembl sequences (NC_005089.1 ## and MT) are both mapped to NCBI MT. GRCm38.p6 <- getChromInfoFromNCBI("GRCm38.p6") L2R <- map_to_NCBI(mouse_chrom_info, GRCm38.p6) ## 100 sequences in GRCm38.p6 are not mapped: GRCm38.p6$mapped <- tabulate(L2R, nbins=nrow(GRCm38.p6)) != 0L table(GRCm38.p6$SequenceRole, GRCm38.p6$mapped) # FALSE TRUE # assembled-molecule 0 22 # alt-scaffold 99 0 # unlocalized-scaffold 0 22 # unplaced-scaffold 0 22 # pseudo-scaffold 0 0 # fix-patch 1 64 # novel-patch 0 9 ## OK so Ensembl doesn't include the alt-scaffolds for Mouse. BUT WHAT ## HAPPENED TO THIS ONE fix-patch SEQUENCE (MG4237_PATCH) THAT IS NOT ## MAPPED? Found it in seq_region_synonym table! It's seq_region_id=100405. ## Hey but that seq_region_id is **NOT** in the seq_region table!!! THIS ## VIOLATES FOREIGN KEY CONSTRAINT!!!! mouse_chrom_info <- .do_join(mouse_chrom_info, GRCm38.p6, L2R) ## Ensembl does NOT comsider NC_005089.1 (duplicate entry for MT) toplevel: mouse_chrom_info[mouse_chrom_info$SequenceName # name length coord_system synonyms toplevel # 184 NC_005089.1 16299 scaffold FALSE # 201 MT 16299 chromosome NC_005089.1, chrM, AY172335.1 TRUE # SequenceName GenBankAccn RefSeqAccn # 184 MT AY172335.1 NC_005089.1 # 201 MT AY172335.1 NC_005089.1 ## ------------------------------------------------------------------------ ## Rat ## https://uswest.ensembl.org/Rattus_norvegicus/ ## Based on Rnor_6.0 (GCA_000001895.4) # Return 1418 rows rat_chrom_info <- getChromInfoFromEnsembl("rnorvegicus") ## Map to NCBI ## Summary: ## - 955/955 NCBI sequences are reverse-mapped. ## - Reverse mapping is one-to-many: 2 Ensembl sequences (NC_001665.2 and MT) ## are mapped to NCBI MT. Rnor_6.0 <- getChromInfoFromNCBI("Rnor_6.0") L2R <- map_to_NCBI(rat_chrom_info, Rnor_6.0) rat_chrom_info <- .do_join(rat_chrom_info, Rnor_6.0, L2R) ## Ensembl does NOT comsider NC_001665.2 (duplicate entry for MT) toplevel: rat_chrom_info[rat_chrom_info$SequenceName # name length coord_system synonyms toplevel # 1417 NC_001665.2 16313 scaffold FALSE # 1418 MT 16313 chromosome NC_001665.2, AY172581.1, chrM TRUE # SequenceName GenBankAccn RefSeqAccn # 1417 MT AY172581.1 NC_001665.2 # 1418 MT AY172581.1 NC_001665.2 table(rat_chrom_info$SequenceRole, rat_chrom_info$toplevel) # FALSE TRUE # assembled-molecule 1 23 # alt-scaffold 0 0 # unlocalized-scaffold 0 354 # unplaced-scaffold 0 578 # pseudo-scaffold 0 0 # fix-patch 0 0 # novel-patch 0 0 ## End(Not run)
getChromInfoFromNCBI returns chromosome information
like sequence names, lengths and circularity flags for a given
NCBI assembly e.g. for GRCh38, ARS-UCD1.2, R64, etc...
Note that getChromInfoFromNCBI behaves slightly differently
depending on whether the assembly is registered in the
GenomeInfoDb package or not. See below for the details.
Use registered_NCBI_assemblies to list all the NCBI assemblies
currently registered in the GenomeInfoDb package.
getChromInfoFromNCBI(assembly, assembled.molecules.only=FALSE, assembly.units=NULL, recache=FALSE, as.Seqinfo=FALSE) registered_NCBI_assemblies(organism=NA)getChromInfoFromNCBI(assembly, assembled.molecules.only=FALSE, assembly.units=NULL, recache=FALSE, as.Seqinfo=FALSE) registered_NCBI_assemblies(organism=NA)
assembly |
A single string specifying the name of an NCBI assembly (e.g.
|
assembled.molecules.only |
If If |
assembly.units |
If
|
recache |
|
as.Seqinfo |
|
organism |
When |
*** registered vs unregistered NCBI assemblies ***
All NCBI assemblies can be looked up by assembly accession (GenBank or RefSeq) but only registered assemblies can also be looked up by assembly name.
For registered assemblies, the returned circularity flags are guaranteed to be accurate. For unregistered assemblies, a heuristic is used to determine the circular sequences.
Please contact the maintainer of the GenomeInfoDb package to request registration of additional assemblies.
*** Offline mode ***
The GenomeInfoDb package includes a small database that contains
chromosome information for the most commonly used NCBI and UCSC genome
assemblies e.g. GRCh38.p14, GRCh38.p13, GRCm39, hg38, mm39, etc...
This makes calls like getChromInfoFromNCBI("GRCh38.p14") or
getChromInfoFromUCSC("hg38") work offline. Plus now they are fast
and reliable.
Note that calling getChromInfoFromNCBI() with recache=TRUE
will trigger retrieval of the chromosome info from NCBI, and will issue
a warning if this info no longer matches the chromosome info stored in
the package. Please open an issue on GitHub if you get such warning or
want to request adding chromosome information of your favorite NCBI or
UCSC genome assembly to GenomeInfoDb's internal database.
For getChromInfoFromNCBI: By default, a 10-column data frame
with columns:
SequenceName: character.
SequenceRole: factor.
AssignedMolecule: factor.
GenBankAccn: character.
Relationship: factor.
RefSeqAccn: character.
AssemblyUnit: factor.
SequenceLength: integer. Note that this column **can**
contain NAs! For example this is the case in assembly Amel_HAv3.1
where the length of sequence MT is missing or in assembly
Release 5 where the length of sequence Un is missing.
UCSCStyleName: character.
circular: logical.
For registered_NCBI_assemblies: A data frame summarizing all the
NCBI assemblies currently registered in the GenomeInfoDb
package.
H. Pagès
getChromInfoFromUCSC for getting chromosome
information for a UCSC genome.
getChromInfoFromEnsembl for getting chromosome
information for an Ensembl species.
Seqinfo objects.
## All registered NCBI assemblies for Triticum aestivum (bread wheat): registered_NCBI_assemblies("tri")[1:4] ## All registered NCBI assemblies for Homo sapiens: registered_NCBI_assemblies("homo")[1:4] ## Internet access required! getChromInfoFromNCBI("GRCh37") getChromInfoFromNCBI("GRCh37", as.Seqinfo=TRUE) getChromInfoFromNCBI("GRCh37", assembled.molecules.only=TRUE) ## The GRCh38.p14 assembly only adds "patch sequences" to the GRCh38 ## assembly: GRCh38 <- getChromInfoFromNCBI("GRCh38") table(GRCh38$SequenceRole) GRCh38.p14 <- getChromInfoFromNCBI("GRCh38.p14") table(GRCh38.p14$SequenceRole) # 254 patch sequences (164 fix + 90 novel) ## All registered NCBI assemblies for Arabidopsis thaliana: registered_NCBI_assemblies("arabi")[1:4] getChromInfoFromNCBI("TAIR10.1") getChromInfoFromNCBI("TAIR10.1", assembly.units="non-nuclear") ## Sanity checks: idx <- match(GRCh38$SequenceName, GRCh38.p14$SequenceName) stopifnot(!anyNA(idx)) tmp1 <- GRCh38.p14[idx, ] rownames(tmp1) <- NULL tmp2 <- GRCh38.p14[-idx, ] stopifnot( identical(tmp1[ , -(5:7)], GRCh38[ , -(5:7)]), identical(tmp2, GRCh38.p14[GRCh38.p14$AssemblyUnit == "PATCHES", ]) )## All registered NCBI assemblies for Triticum aestivum (bread wheat): registered_NCBI_assemblies("tri")[1:4] ## All registered NCBI assemblies for Homo sapiens: registered_NCBI_assemblies("homo")[1:4] ## Internet access required! getChromInfoFromNCBI("GRCh37") getChromInfoFromNCBI("GRCh37", as.Seqinfo=TRUE) getChromInfoFromNCBI("GRCh37", assembled.molecules.only=TRUE) ## The GRCh38.p14 assembly only adds "patch sequences" to the GRCh38 ## assembly: GRCh38 <- getChromInfoFromNCBI("GRCh38") table(GRCh38$SequenceRole) GRCh38.p14 <- getChromInfoFromNCBI("GRCh38.p14") table(GRCh38.p14$SequenceRole) # 254 patch sequences (164 fix + 90 novel) ## All registered NCBI assemblies for Arabidopsis thaliana: registered_NCBI_assemblies("arabi")[1:4] getChromInfoFromNCBI("TAIR10.1") getChromInfoFromNCBI("TAIR10.1", assembly.units="non-nuclear") ## Sanity checks: idx <- match(GRCh38$SequenceName, GRCh38.p14$SequenceName) stopifnot(!anyNA(idx)) tmp1 <- GRCh38.p14[idx, ] rownames(tmp1) <- NULL tmp2 <- GRCh38.p14[-idx, ] stopifnot( identical(tmp1[ , -(5:7)], GRCh38[ , -(5:7)]), identical(tmp2, GRCh38.p14[GRCh38.p14$AssemblyUnit == "PATCHES", ]) )
getChromInfoFromUCSC returns chromosome information like
sequence names, lengths and circularity flags for a given UCSC genome
e.g. for hg19, panTro6, sacCer3, etc...
Note that getChromInfoFromUCSC behaves slightly differently
depending on whether a genome is registered in the
GenomeInfoDb package or not. See below for the details.
Use registered_UCSC_genomes to list all the UCSC genomes
currently registered in the GenomeInfoDb package.
getChromInfoFromUCSC(genome, assembled.molecules.only=FALSE, map.NCBI=FALSE, add.ensembl.col=FALSE, goldenPath.url=getOption("UCSC.goldenPath.url"), recache=FALSE, as.Seqinfo=FALSE) registered_UCSC_genomes(organism=NA)getChromInfoFromUCSC(genome, assembled.molecules.only=FALSE, map.NCBI=FALSE, add.ensembl.col=FALSE, goldenPath.url=getOption("UCSC.goldenPath.url"), recache=FALSE, as.Seqinfo=FALSE) registered_UCSC_genomes(organism=NA)
genome |
A single string specifying the name of a UCSC genome
e.g. |
assembled.molecules.only |
If If Note that |
map.NCBI |
If Note that not all rows in the "UCSC chrom info" data frame are
necessarily mapped to a row in the "NCBI chrom info" data frame.
For example chrM in hg19 has no corresponding sequence in the GRCh37
assembly (the mitochondrial chromosome was omitted from GRCh37).
For the unmapped rows the NCBI columns in the final data frame
are filled with NAs ( The primary use case for using |
add.ensembl.col |
|
goldenPath.url |
A single string specifying the URL to the UCSC goldenPath location where the chromosome sizes are expected to be found. |
recache |
|
as.Seqinfo |
|
organism |
When |
*** Registered vs unregistered UCSC genomes ***
For registered genomes, the returned data frame contains
information about which sequences are assembled molecules and which
are not, and the assembled.molecules.only argument is
supported. For unregistered genomes, this information is
missing, and the assembled.molecules.only argument is ignored
with a warning.
For registered genomes, the returned circularity flags are guaranteed to be accurate. For unregistered genomes, a heuristic is used to determine the circular sequences.
For registered genomes, special care is taken to make sure that the sequences are returned in a sensible order. For unregistered genomes, a heuristic is used to return the sequences in a sensible order.
Please contact the maintainer of the GenomeInfoDb package to request registration of additional genomes.
*** Offline mode ***
The GenomeInfoDb package includes a small database that contains
chromosome information for the most commonly used NCBI and UCSC genome
assemblies e.g. GRCh38.p14, GRCh38.p13, GRCm39, hg38, mm39, etc...
This makes calls like getChromInfoFromNCBI("GRCh38.p14") or
getChromInfoFromUCSC("hg38") work offline. Plus now they are fast
and reliable.
Note that calling getChromInfoFromUCSC() with recache=TRUE
will trigger retrieval of the chromosome info from UCSC, and will issue
a warning if this info no longer matches the chromosome info stored in
the package. Please open an issue on GitHub if you get such warning or
want to request adding chromosome information of your favorite NCBI or
UCSC genome assembly to GenomeInfoDb's internal database.
For getChromInfoFromUCSC: By default, a 4-column data frame
with columns:
chrom: character.
size: integer.
assembled: logical.
circular: logical.
If map.NCBI is TRUE, then 7 "NCBI columns" are added
to the result:
NCBI.SequenceName: character.
NCBI.SequenceRole: factor.
NCBI.AssignedMolecule: factor.
NCBI.GenBankAccn: character.
NCBI.Relationship: factor.
NCBI.RefSeqAccn: character.
NCBI.AssemblyUnit: factor.
Note that the names of the "NCBI columns" are those returned
by getChromInfoFromNCBI but with the NCBI.
prefix added to them.
If add.ensembl.col is TRUE, the column ensembl
is added to the result.
For registered_UCSC_genomes: A data frame summarizing all the UCSC
genomes currently registered in the GenomeInfoDb package.
H. Pagès
getChromInfoFromNCBI for getting chromosome
information for an NCBI assembly.
getChromInfoFromEnsembl for getting chromosome
information for an Ensembl species.
Seqinfo objects.
The getBSgenome convenience utility in
the BSgenome package for getting a BSgenome
object from an installed BSgenome data package.
## --------------------------------------------------------------------- ## A. BASIC EXAMPLES ## --------------------------------------------------------------------- getChromInfoFromUCSC("hg19") getChromInfoFromUCSC("hg19", as.Seqinfo=TRUE) ## Map the hg38 sequences to their corresponding sequences in ## the GRCh38.p13 assembly: getChromInfoFromUCSC("hg38", map.NCBI=TRUE)[c(1, 5)] ## Note that some NCBI-based UCSC genomes contain sequences that ## are not mapped. For example this is the case for chrM in hg19: hg19 <- getChromInfoFromUCSC("hg19", map.NCBI=TRUE) hg19[is.na(hg19$NCBI.SequenceName), ] ## Map the hg19 sequences to the Ensembl sequence names: getChromInfoFromUCSC("hg19", add.ensembl.col=TRUE) ## --- List of UCSC genomes currently registered in the package --- registered_UCSC_genomes() ## All registered UCSC genomes for Felis catus (domestic cat): registered_UCSC_genomes(organism = "Felis catus") ## All registered UCSC genomes for Homo sapiens: registered_UCSC_genomes("homo") ## --------------------------------------------------------------------- ## B. USING getChromInfoFromUCSC() TO SET UCSC SEQUENCE NAMES ON THE ## GRCh38 GENOME ## --------------------------------------------------------------------- ## Load the BSgenome.Hsapiens.NCBI.GRCh38 package: library(BSgenome) genome <- getBSgenome("GRCh38") # this loads the # BSgenome.Hsapiens.NCBI.GRCh38 package genome ## Get the chromosome info for the hg38 genome: hg38_chrom_info <- getChromInfoFromUCSC("hg38", map.NCBI=TRUE) ncbi2ucsc <- setNames(hg38_chrom_info$chrom, hg38_chrom_info$NCBI.SequenceName) ## Set the UCSC sequence names on 'genome': seqlevels(genome) <- ncbi2ucsc[seqlevels(genome)] genome ## Sanity check: check that the sequence lengths in 'genome' are the same ## as in 'hg38_chrom_info': m <- match(seqlevels(genome), hg38_chrom_info$chrom) stopifnot(identical(unname(seqlengths(genome)), hg38_chrom_info$size[m]))## --------------------------------------------------------------------- ## A. BASIC EXAMPLES ## --------------------------------------------------------------------- getChromInfoFromUCSC("hg19") getChromInfoFromUCSC("hg19", as.Seqinfo=TRUE) ## Map the hg38 sequences to their corresponding sequences in ## the GRCh38.p13 assembly: getChromInfoFromUCSC("hg38", map.NCBI=TRUE)[c(1, 5)] ## Note that some NCBI-based UCSC genomes contain sequences that ## are not mapped. For example this is the case for chrM in hg19: hg19 <- getChromInfoFromUCSC("hg19", map.NCBI=TRUE) hg19[is.na(hg19$NCBI.SequenceName), ] ## Map the hg19 sequences to the Ensembl sequence names: getChromInfoFromUCSC("hg19", add.ensembl.col=TRUE) ## --- List of UCSC genomes currently registered in the package --- registered_UCSC_genomes() ## All registered UCSC genomes for Felis catus (domestic cat): registered_UCSC_genomes(organism = "Felis catus") ## All registered UCSC genomes for Homo sapiens: registered_UCSC_genomes("homo") ## --------------------------------------------------------------------- ## B. USING getChromInfoFromUCSC() TO SET UCSC SEQUENCE NAMES ON THE ## GRCh38 GENOME ## --------------------------------------------------------------------- ## Load the BSgenome.Hsapiens.NCBI.GRCh38 package: library(BSgenome) genome <- getBSgenome("GRCh38") # this loads the # BSgenome.Hsapiens.NCBI.GRCh38 package genome ## Get the chromosome info for the hg38 genome: hg38_chrom_info <- getChromInfoFromUCSC("hg38", map.NCBI=TRUE) ncbi2ucsc <- setNames(hg38_chrom_info$chrom, hg38_chrom_info$NCBI.SequenceName) ## Set the UCSC sequence names on 'genome': seqlevels(genome) <- ncbi2ucsc[seqlevels(genome)] genome ## Sanity check: check that the sequence lengths in 'genome' are the same ## as in 'hg38_chrom_info': m <- match(seqlevels(genome), hg38_chrom_info$chrom) stopifnot(identical(unname(seqlengths(genome)), hg38_chrom_info$size[m]))
NCBI maintains a collection of unique taxonomy IDs and pairs these with associated genus and species designations. This function returns the set of pre-processed values that we use to check that something is a valid Taxonomy ID (or organism).
Requires the GenomeInfoDbData package.
loadTaxonomyDb()loadTaxonomyDb()
A data frame with 1 row per genus/species designation and three columns. The 1st column is the taxonomy ID. The second columns is the genus and the third is the species name.
Marc Carlson
library(GenomeInfoDbData) ## get the data taxdb <- loadTaxonomyDb() tail(taxdb) ## which can then be searched etc. taxdb[grepl('yoelii', taxdb$species), ]library(GenomeInfoDbData) ## get the data taxdb <- loadTaxonomyDb() tail(taxdb) ## which can then be searched etc. taxdb[grepl('yoelii', taxdb$species), ]
genomeBuilds lists the available genomes for a given
species while mapGenomeBuilds maps between UCSC and Ensemble
genome builds.
genomeBuilds(organism, style = c("UCSC", "Ensembl")) mapGenomeBuilds(genome, style = c("UCSC", "Ensembl")) listOrganisms()genomeBuilds(organism, style = c("UCSC", "Ensembl")) mapGenomeBuilds(genome, style = c("UCSC", "Ensembl")) listOrganisms()
organism |
A character vector of common names or organism |
genome |
A character vector of genomes equivalent to UCSC version or Ensembl Assemblies |
style |
A single value equivalent to "UCSC" or "Ensembl" specifying the output genome |
genomeBuilds lists the currently available genomes for a given list of
organisms. The genomes can be shown as "UCSC" or "Ensembl" IDs determined
by style. organism must be specified as a character
vector and match common names (i.e "Dog", "Mouse") or organism name
(i.e "Homo sapiens", "Mus musculus") . A list of
available organisms can be shown using listOrganisms().
mapGenomeBuilds provides a mapping between "UCSC" builds and
"Ensembl" builds. genome must be specified as a character
vector and match either a "UCSC" ID or an "Ensembl"
Id. genomeBuilds can be used to get a list of available build
Ids for a given organism. NA's may be present in the output. This would
occur when the current genome build removed a previously defined
genome for an organism.
In both functions, if style is not specified, "UCSC" is used as
default.
A data.frame of builds for a given organism or genome in the specified
style. If style == "UCSC", ucscID, ucscDate and
ensemblID are given. If style == "Ensembl", ensemblID,
ensemblVersion, ensemblDate, and ucscID are given. The opposing ID is
given so that it is possible to distinguish between many-to-one
mappings.
Valerie Obenchain [email protected] and Lori Shepherd [email protected]
UCSC genome builds https://genome.ucsc.edu/FAQ/FAQreleases.html Ensembl genome builds http://useast.ensembl.org/info/website/archives/assembly.html
listOrganisms() genomeBuilds("mouse") genomeBuilds(c("Mouse", "dog", "human"), style="Ensembl") mapGenomeBuilds(c("canFam3", "GRCm38", "mm9")) mapGenomeBuilds(c("canFam3", "GRCm38", "mm9"), style="Ensembl")listOrganisms() genomeBuilds("mouse") genomeBuilds(c("Mouse", "dog", "human"), style="Ensembl") mapGenomeBuilds(c("canFam3", "GRCm38", "mm9")) mapGenomeBuilds(c("canFam3", "GRCm38", "mm9"), style="Ensembl")
Low-level utility functions to access NCBI resources. Not intended to be used directly by the end user.
find_NCBI_assembly_ftp_dir(assembly_accession, assembly_name=NA) fetch_assembly_report(assembly_accession, assembly_name=NA, AssemblyUnits=NULL)find_NCBI_assembly_ftp_dir(assembly_accession, assembly_name=NA) fetch_assembly_report(assembly_accession, assembly_name=NA, AssemblyUnits=NULL)
assembly_accession |
A single string containing either a GenBank assembly accession
(e.g. Alternatively, for |
assembly_name |
A single string or |
AssemblyUnits |
By default, all the assembly units are included in the data frame
returned by |
For find_NCBI_assembly_ftp_dir(): A length-2 character vector:
The 1st element in the vector is the URL to the FTP dir, without the trailing slash.
The 2nd element in the vector is the prefix used in the names of most of the files in the FTP dir.
For fetch_assembly_report(): A data frame with 1 row per sequence
in the assembly and 10 columns:
SequenceName
SequenceRole
AssignedMolecule
AssignedMoleculeLocationOrType
GenBankAccn
Relationship
RefSeqAccn
AssemblyUnit
SequenceLength
UCSCStyleName
fetch_assembly_report is the workhorse behind higher-level
and more user-friendly getChromInfoFromNCBI.
H. Pagès
getChromInfoFromNCBI for a higher-level and
more user-friendly version of fetch_assembly_report.
ftp_dir <- find_NCBI_assembly_ftp_dir("GCA_000001405.15") ftp_dir url <- ftp_dir[[1]] # URL to the FTP dir prefix <- ftp_dir[[2]] # prefix used in names of most files list_ftp_dir(url) assembly_report_url <- paste0(url, "/", prefix, "_assembly_report.txt") ## To fetch the assembly report for assembly GCA_000001405.15, you can ## call fetch_assembly_report() on the assembly accession or directly ## on the URL to the assembly report: assembly_report <- fetch_assembly_report("GCA_000001405.15") dim(assembly_report) head(assembly_report) ## Sanity check: assembly_report2 <- fetch_assembly_report(assembly_report_url) stopifnot(identical(assembly_report, assembly_report2))ftp_dir <- find_NCBI_assembly_ftp_dir("GCA_000001405.15") ftp_dir url <- ftp_dir[[1]] # URL to the FTP dir prefix <- ftp_dir[[2]] # prefix used in names of most files list_ftp_dir(url) assembly_report_url <- paste0(url, "/", prefix, "_assembly_report.txt") ## To fetch the assembly report for assembly GCA_000001405.15, you can ## call fetch_assembly_report() on the assembly accession or directly ## on the URL to the assembly report: assembly_report <- fetch_assembly_report("GCA_000001405.15") dim(assembly_report) head(assembly_report) ## Sanity check: assembly_report2 <- fetch_assembly_report(assembly_report_url) stopifnot(identical(assembly_report, assembly_report2))
Keep, drop or rename seqlevels in objects with a Seqinfo class.
keepSeqlevels(x, value, pruning.mode=c("error", "coarse", "fine", "tidy")) dropSeqlevels(x, value, pruning.mode=c("error", "coarse", "fine", "tidy")) renameSeqlevels(x, value) standardChromosomes(x, species=NULL) keepStandardChromosomes(x, species=NULL, pruning.mode=c("error", "coarse", "fine", "tidy"))keepSeqlevels(x, value, pruning.mode=c("error", "coarse", "fine", "tidy")) dropSeqlevels(x, value, pruning.mode=c("error", "coarse", "fine", "tidy")) renameSeqlevels(x, value) standardChromosomes(x, species=NULL) keepStandardChromosomes(x, species=NULL, pruning.mode=c("error", "coarse", "fine", "tidy"))
x |
Any object having a Seqinfo class in which the seqlevels will be kept, dropped or renamed. |
value |
A named or unnamed character vector. Names are ignored by In the case of |
pruning.mode |
See |
species |
The genus and species of the organism. Supported species can be seen with
|
Matching and overlap operations on range objects often require that the
seqlevels match before a comparison can be made (e.g., findOverlaps).
keepSeqlevels, dropSeqlevels and renameSeqlevels are
high-level convenience functions that wrap the low-level seqlevels
setter.
keepSeqlevels, dropSeqlevels: Subsetting operations
that modify the size of x. keepSeqlevels keeps only
the seqlevels in value and removes all others.
dropSeqlevels drops the levels in value and retains
all others. If value does not match any seqlevels in x
an empty object is returned.
When x is a GRangesList it is possible to have 'mixed'
list elements that have ranges from different chromosomes.
keepSeqlevels will not keep 'mixed' list elements
renameSeqlevels: Rename the seqlevels in x to those in
value. If value is a named character vector, the names
are used to map the new seqlevels to the old. When value is
unnamed, the replacement vector must be the same length and in the
same order as the original seqlevels(x).
standardChromosomes: Lists the 'standard' chromosomes defined
as sequences in the assembly that are not scaffolds; also referred
to as an 'assembly molecule' in NCBI. standardChromosomes
attempts to detect the seqlevel style and if more than one style is
matched, e.g., 'UCSC' and 'Ensembl', the first is chosen.
x must have a Seqinfo object. species can be
specified as a character string; supported species are listed with
names(genomeStyles()).
When x contains seqlevels from multiple organisms all
those considered standard will be kept. For example, if
seqlevels are "chr1" and "chr3R" from human and fly both will be
kept. If species="Homo sapiens" is specified then only
"chr1" is kept.
keepStandardChromosomes: Subsetting operation that returns
only the 'standard' chromosomes.
x must have a Seqinfo object. species can be
specified as a character string; supported species are listed with
names(genomeStyles()).
When x contains seqlevels from multiple organisms all
those considered standard will be kept. For example, if
seqlevels are "chr1" and "chr3R" from human and fly both will be
kept. If species="Homo sapiens" is specified then only
"chr1" is kept.
The x object with seqlevels removed or renamed. If x has
no seqlevels (empty object) or no replacement values match the current
seqlevels in x the unchanged x is returned.
Valerie Obenchain, Sonali Arora
## --------------------------------------------------------------------- ## keepSeqlevels / dropSeqlevels ## --------------------------------------------------------------------- ## ## GRanges / GAlignments: ## library(GenomicRanges) gr <- GRanges(c("chr1", "chr1", "chr2", "chr3"), IRanges(1:4, width=3)) seqlevels(gr) ## Keep only 'chr1' gr1 <- keepSeqlevels(gr, "chr1", pruning.mode="coarse") ## Drop 'chr1'. Both 'chr2' and 'chr3' are kept. gr2 <- dropSeqlevels(gr, "chr1", pruning.mode="coarse") library(Rsamtools) # for the ex1.bam file library(GenomicAlignments) # for readGAlignments() fl <- system.file("extdata", "ex1.bam", package="Rsamtools") gal <- readGAlignments(fl) ## If 'value' is named, the names are ignored. seq2 <- keepSeqlevels(gal, c(foo="seq2"), pruning.mode="coarse") seqlevels(seq2) ## ## List-like objects: ## grl0 <- GRangesList(A=GRanges("chr2", IRanges(3:2, 5)), B=GRanges(c("chr2", "chrMT"), IRanges(7:6, 15)), C=GRanges(c("chrY", "chrMT"), IRanges(17:16, 25)), D=GRanges()) ## See ?seqinfo for a description of the pruning modes. keepSeqlevels(grl0, "chr2", pruning.mode="coarse") keepSeqlevels(grl0, "chr2", pruning.mode="fine") keepSeqlevels(grl0, "chr2", pruning.mode="tidy") library(TxDb.Dmelanogaster.UCSC.dm3.ensGene) txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene ## Pruning mode "coarse" is particularly well suited on a GRangesList ## object that contains exons grouped by transcript: ex_by_tx <- exonsBy(txdb, by="tx") seqlevels(ex_by_tx) ex_by_tx2 <- keepSeqlevels(ex_by_tx, "chr2L", pruning.mode="coarse") seqlevels(ex_by_tx2) ## Pruning mode "tidy" is particularly well suited on a GRangesList ## object that contains transcripts grouped by gene: tx_by_gene <- transcriptsBy(txdb, by="gene") seqlevels(tx_by_gene) tx_by_gene2 <- keepSeqlevels(tx_by_gene, "chr2L", pruning.mode="tidy") seqlevels(tx_by_gene2) ## --------------------------------------------------------------------- ## renameSeqlevels ## --------------------------------------------------------------------- ## ## GAlignments: ## seqlevels(gal) ## Rename 'seq2' to 'chr2' with a named vector. gal2a <- renameSeqlevels(gal, c(seq2="chr2")) ## Rename 'seq2' to 'chr2' with an unnamed vector that includes all ## seqlevels as they appear in the object. gal2b <- renameSeqlevels(gal, c("seq1", "chr2")) ## Names that do not match existing seqlevels are ignored. ## This attempt at renaming does nothing. gal3 <- renameSeqlevels(gal, c(foo="chr2")) stopifnot(identical(gal, gal3)) ## ## TxDb: ## seqlevels(txdb) ## When the seqlevels of a TxDb are renamed, all future ## extractions reflect the modified seqlevels. renameSeqlevels(txdb, sub("chr", "CH", seqlevels(txdb))) renameSeqlevels(txdb, c(CHM="M")) seqlevels(txdb) transcripts <- transcripts(txdb) identical(seqlevels(txdb), seqlevels(transcripts)) ## --------------------------------------------------------------------- ## keepStandardChromosomes ## --------------------------------------------------------------------- ## ## GRanges: ## gr <- GRanges(c(paste0("chr",c(1:3)), "chr1_gl000191_random", "chr1_gl000192_random"), IRanges(1:5, width=3)) gr keepStandardChromosomes(gr, pruning.mode="coarse") ## ## List-like objects: ## grl <- GRangesList(GRanges("chr1", IRanges(1:2, 5)), GRanges(c("chr1_GL383519v1_alt", "chr1"), IRanges(5:6, 5))) ## Use pruning.mode="coarse" to drop list elements with mixed seqlevels: keepStandardChromosomes(grl, pruning.mode="coarse") ## Use pruning.mode="tidy" to keep all list elements with ranges on ## standard chromosomes: keepStandardChromosomes(grl, pruning.mode="tidy") ## ## The set of standard chromosomes should not be affected by the ## particular seqlevel style currently in use: ## ## NCBI worm <- GRanges(c("I", "II", "foo", "X", "MT"), IRanges(1:5, width=5)) keepStandardChromosomes(worm, pruning.mode="coarse") ## UCSC seqlevelsStyle(worm) <- "UCSC" keepStandardChromosomes(worm, pruning.mode="coarse") ## Ensembl seqlevelsStyle(worm) <- "Ensembl" keepStandardChromosomes(worm, pruning.mode="coarse")## --------------------------------------------------------------------- ## keepSeqlevels / dropSeqlevels ## --------------------------------------------------------------------- ## ## GRanges / GAlignments: ## library(GenomicRanges) gr <- GRanges(c("chr1", "chr1", "chr2", "chr3"), IRanges(1:4, width=3)) seqlevels(gr) ## Keep only 'chr1' gr1 <- keepSeqlevels(gr, "chr1", pruning.mode="coarse") ## Drop 'chr1'. Both 'chr2' and 'chr3' are kept. gr2 <- dropSeqlevels(gr, "chr1", pruning.mode="coarse") library(Rsamtools) # for the ex1.bam file library(GenomicAlignments) # for readGAlignments() fl <- system.file("extdata", "ex1.bam", package="Rsamtools") gal <- readGAlignments(fl) ## If 'value' is named, the names are ignored. seq2 <- keepSeqlevels(gal, c(foo="seq2"), pruning.mode="coarse") seqlevels(seq2) ## ## List-like objects: ## grl0 <- GRangesList(A=GRanges("chr2", IRanges(3:2, 5)), B=GRanges(c("chr2", "chrMT"), IRanges(7:6, 15)), C=GRanges(c("chrY", "chrMT"), IRanges(17:16, 25)), D=GRanges()) ## See ?seqinfo for a description of the pruning modes. keepSeqlevels(grl0, "chr2", pruning.mode="coarse") keepSeqlevels(grl0, "chr2", pruning.mode="fine") keepSeqlevels(grl0, "chr2", pruning.mode="tidy") library(TxDb.Dmelanogaster.UCSC.dm3.ensGene) txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene ## Pruning mode "coarse" is particularly well suited on a GRangesList ## object that contains exons grouped by transcript: ex_by_tx <- exonsBy(txdb, by="tx") seqlevels(ex_by_tx) ex_by_tx2 <- keepSeqlevels(ex_by_tx, "chr2L", pruning.mode="coarse") seqlevels(ex_by_tx2) ## Pruning mode "tidy" is particularly well suited on a GRangesList ## object that contains transcripts grouped by gene: tx_by_gene <- transcriptsBy(txdb, by="gene") seqlevels(tx_by_gene) tx_by_gene2 <- keepSeqlevels(tx_by_gene, "chr2L", pruning.mode="tidy") seqlevels(tx_by_gene2) ## --------------------------------------------------------------------- ## renameSeqlevels ## --------------------------------------------------------------------- ## ## GAlignments: ## seqlevels(gal) ## Rename 'seq2' to 'chr2' with a named vector. gal2a <- renameSeqlevels(gal, c(seq2="chr2")) ## Rename 'seq2' to 'chr2' with an unnamed vector that includes all ## seqlevels as they appear in the object. gal2b <- renameSeqlevels(gal, c("seq1", "chr2")) ## Names that do not match existing seqlevels are ignored. ## This attempt at renaming does nothing. gal3 <- renameSeqlevels(gal, c(foo="chr2")) stopifnot(identical(gal, gal3)) ## ## TxDb: ## seqlevels(txdb) ## When the seqlevels of a TxDb are renamed, all future ## extractions reflect the modified seqlevels. renameSeqlevels(txdb, sub("chr", "CH", seqlevels(txdb))) renameSeqlevels(txdb, c(CHM="M")) seqlevels(txdb) transcripts <- transcripts(txdb) identical(seqlevels(txdb), seqlevels(transcripts)) ## --------------------------------------------------------------------- ## keepStandardChromosomes ## --------------------------------------------------------------------- ## ## GRanges: ## gr <- GRanges(c(paste0("chr",c(1:3)), "chr1_gl000191_random", "chr1_gl000192_random"), IRanges(1:5, width=3)) gr keepStandardChromosomes(gr, pruning.mode="coarse") ## ## List-like objects: ## grl <- GRangesList(GRanges("chr1", IRanges(1:2, 5)), GRanges(c("chr1_GL383519v1_alt", "chr1"), IRanges(5:6, 5))) ## Use pruning.mode="coarse" to drop list elements with mixed seqlevels: keepStandardChromosomes(grl, pruning.mode="coarse") ## Use pruning.mode="tidy" to keep all list elements with ranges on ## standard chromosomes: keepStandardChromosomes(grl, pruning.mode="tidy") ## ## The set of standard chromosomes should not be affected by the ## particular seqlevel style currently in use: ## ## NCBI worm <- GRanges(c("I", "II", "foo", "X", "MT"), IRanges(1:5, width=5)) keepStandardChromosomes(worm, pruning.mode="coarse") ## UCSC seqlevelsStyle(worm) <- "UCSC" keepStandardChromosomes(worm, pruning.mode="coarse") ## Ensembl seqlevelsStyle(worm) <- "Ensembl" keepStandardChromosomes(worm, pruning.mode="coarse")
The seqlevelsStyle getter and setter can be used to get the current
seqlevels style of an object and to rename its seqlevels according to a given
style.
seqlevelsStyle(x) seqlevelsStyle(x) <- value ## Related low-level utilities: genomeStyles(species) extractSeqlevels(species, style) extractSeqlevelsByGroup(species, style, group) mapSeqlevels(seqnames, style, best.only=TRUE, drop=TRUE) seqlevelsInGroup(seqnames, group, species, style)seqlevelsStyle(x) seqlevelsStyle(x) <- value ## Related low-level utilities: genomeStyles(species) extractSeqlevels(species, style) extractSeqlevelsByGroup(species, style, group) mapSeqlevels(seqnames, style, best.only=TRUE, drop=TRUE) seqlevelsInGroup(seqnames, group, species, style)
x |
The object from/on which to get/set the seqlevels style. |
value |
A single character string that sets the seqlevels style for |
species |
The genus and species of the organism in question separated by a single space. Don't forget to capitalize the genus. |
style |
a character vector with a single element to specify the style. |
group |
Group can be 'auto' for autosomes, 'sex' for sex chromosomes/allosomes, 'circular' for circular chromosomes. The default is 'all' which returns all the chromosomes. |
best.only |
if |
drop |
if |
seqnames |
a character vector containing the labels attached to the chromosomes in a given genome for a given style. For example : For Homo sapiens, NCBI style - they are "1","2","3",...,"X","Y","MT" |
seqlevelsStyle(x), seqlevelsStyle(x) <- value:
Get the current seqlevels style of an object, or rename its seqlevels
according to the supplied style.
genomeStyles:
Different organizations have different naming conventions for how they
name the biologically defined sequence elements (usually chromosomes)
for each organism they support. The Seqnames package contains a
database that defines these different conventions.
genomeStyles() returns the list of all supported seqname mappings, one per supported organism. Each mapping is represented as a data frame with 1 column per seqname style and 1 row per chromosome name (not all chromosomes of a given organism necessarily belong to the mapping).
genomeStyles(species) returns a data.frame only for the given organism with all its supported seqname mappings.
extractSeqlevels:
Returns a character vector of the seqnames for a single style and species.
extractSeqlevelsByGroup:
Returns a character vector of the seqnames for a single style and species
by group. Group can be 'auto' for autosomes, 'sex' for sex chromosomes/
allosomes, 'circular' for circular chromosomes. The default is 'all' which
returns all the chromosomes.
mapSeqlevels:
Returns a matrix with 1 column per supplied sequence name and 1 row
per sequence renaming map compatible with the specified style.
If best.only is TRUE (the default), only the "best"
renaming maps (i.e. the rows with less NAs) are returned.
seqlevelsInGroup:
It takes a character vector along with a group and optional style and
species.If group is not specified , it returns "all" or standard/top level
seqnames.
Returns a character vector of seqnames after subsetting for the group
specified by the user. See examples for more details.
For seqlevelsStyle: A single string containing the style of the
seqlevels in x, or a character vector containing the styles of the
seqlevels in x if the current style cannot be determined
unambiguously. Note that this information is not stored in x
but inferred from its seqlevels using a heuristic helped by a seqlevels
style database stored in the GenomeInfoDb package.
If the underlying genome is known (i.e. if unique(genome(x)) is
not NA), the name of the genome or assembly (e.g. ce11 or
WBcel235) is also used by the heuristic.
For extractSeqlevels , extractSeqlevelsByGroup and
seqlevelsInGroup: A character vector of seqlevels
for given supported species and group.
For mapSeqlevels: A matrix with 1 column per supplied sequence
name and 1 row per sequence renaming map compatible with the specified style.
For genomeStyle: If species is specified returns a data.frame
containg the seqlevels style and its mapping for a given organism. If species
is not specified, a list is returned with one list per species containing
the seqlevels style with the corresponding mappings.
Sonali Arora, Martin Morgan, Marc Carlson, H. Pagès
## --------------------------------------------------------------------- ## seqlevelsStyle() getter and setter ## --------------------------------------------------------------------- ## On a character vector: x <- paste0("chr", 1:5) seqlevelsStyle(x) seqlevelsStyle(x) <- "NCBI" x ## On a GRanges object: library(GenomicRanges) gr <- GRanges(rep(c("chr2", "chr3", "chrM"), 2), IRanges(1:6, 10)) seqlevelsStyle(gr) seqlevelsStyle(gr) <- "NCBI" gr seqlevelsStyle(gr) seqlevelsStyle(gr) <- "dbSNP" gr seqlevelsStyle(gr) seqlevelsStyle(gr) <- "UCSC" gr ## In general the seqlevelsStyle() setter doesn't know how to rename ## scaffolds. However, if the genome is specified, it's very likely ## that seqlevelsStyle() will be able to take advantage of that: gr <- GRanges(rep(c("2", "Y", "Hs6_111610_36"), 2), IRanges(1:6, 10)) genome(gr) <- "NCBI36" seqlevelsStyle(gr) <- "UCSC" gr ## On a Seqinfo object: si <- si0 <- Seqinfo(genome="apiMel2") si seqlevelsStyle(si) <- "NCBI" si seqlevelsStyle(si) <- "RefSeq" si seqlevelsStyle(si) <- "UCSC" stopifnot(identical(si0, si)) si <- si0 <- Seqinfo(genome="WBcel235") si seqlevelsStyle(si) <- "UCSC" si seqlevelsStyle(si) <- "RefSeq" si seqlevelsStyle(si) <- "NCBI" stopifnot(identical(si0, si)) si <- Seqinfo(genome="macFas5") si seqlevelsStyle(si) <- "NCBI" si ## --------------------------------------------------------------------- ## Related low-level utilities ## --------------------------------------------------------------------- ## Genome styles: names(genomeStyles()) genomeStyles("Homo_sapiens") "UCSC" %in% names(genomeStyles("Homo_sapiens")) ## Extract seqlevels based on species, style and group: ## The 'group' argument can be 'sex', 'auto', 'circular' or 'all'. ## All: extractSeqlevels(species="Drosophila_melanogaster", style="Ensembl") ## Sex chromosomes: extractSeqlevelsByGroup(species="Homo_sapiens", style="UCSC", group="sex") ## Autosomes: extractSeqlevelsByGroup(species="Homo_sapiens", style="UCSC", group="auto") ## Identify which seqnames belong to a particular 'group': newchr <- paste0("chr",c(1:22,"X","Y","M","1_gl000192_random","4_ctg9")) seqlevelsInGroup(newchr, group="sex") newchr <- as.character(c(1:22,"X","Y","MT")) seqlevelsInGroup(newchr, group="all","Homo_sapiens","NCBI") ## Identify which seqnames belong to a species and style: seqnames <- c("chr1","chr9", "chr2", "chr3", "chr10") all(seqnames %in% extractSeqlevels("Homo_sapiens", "UCSC")) ## Find mapped seqlevelsStyles for exsiting seqnames: mapSeqlevels(c("chrII", "chrIII", "chrM"), "NCBI") mapSeqlevels(c("chrII", "chrIII", "chrM"), "Ensembl")## --------------------------------------------------------------------- ## seqlevelsStyle() getter and setter ## --------------------------------------------------------------------- ## On a character vector: x <- paste0("chr", 1:5) seqlevelsStyle(x) seqlevelsStyle(x) <- "NCBI" x ## On a GRanges object: library(GenomicRanges) gr <- GRanges(rep(c("chr2", "chr3", "chrM"), 2), IRanges(1:6, 10)) seqlevelsStyle(gr) seqlevelsStyle(gr) <- "NCBI" gr seqlevelsStyle(gr) seqlevelsStyle(gr) <- "dbSNP" gr seqlevelsStyle(gr) seqlevelsStyle(gr) <- "UCSC" gr ## In general the seqlevelsStyle() setter doesn't know how to rename ## scaffolds. However, if the genome is specified, it's very likely ## that seqlevelsStyle() will be able to take advantage of that: gr <- GRanges(rep(c("2", "Y", "Hs6_111610_36"), 2), IRanges(1:6, 10)) genome(gr) <- "NCBI36" seqlevelsStyle(gr) <- "UCSC" gr ## On a Seqinfo object: si <- si0 <- Seqinfo(genome="apiMel2") si seqlevelsStyle(si) <- "NCBI" si seqlevelsStyle(si) <- "RefSeq" si seqlevelsStyle(si) <- "UCSC" stopifnot(identical(si0, si)) si <- si0 <- Seqinfo(genome="WBcel235") si seqlevelsStyle(si) <- "UCSC" si seqlevelsStyle(si) <- "RefSeq" si seqlevelsStyle(si) <- "NCBI" stopifnot(identical(si0, si)) si <- Seqinfo(genome="macFas5") si seqlevelsStyle(si) <- "NCBI" si ## --------------------------------------------------------------------- ## Related low-level utilities ## --------------------------------------------------------------------- ## Genome styles: names(genomeStyles()) genomeStyles("Homo_sapiens") "UCSC" %in% names(genomeStyles("Homo_sapiens")) ## Extract seqlevels based on species, style and group: ## The 'group' argument can be 'sex', 'auto', 'circular' or 'all'. ## All: extractSeqlevels(species="Drosophila_melanogaster", style="Ensembl") ## Sex chromosomes: extractSeqlevelsByGroup(species="Homo_sapiens", style="UCSC", group="sex") ## Autosomes: extractSeqlevelsByGroup(species="Homo_sapiens", style="UCSC", group="auto") ## Identify which seqnames belong to a particular 'group': newchr <- paste0("chr",c(1:22,"X","Y","M","1_gl000192_random","4_ctg9")) seqlevelsInGroup(newchr, group="sex") newchr <- as.character(c(1:22,"X","Y","MT")) seqlevelsInGroup(newchr, group="all","Homo_sapiens","NCBI") ## Identify which seqnames belong to a species and style: seqnames <- c("chr1","chr9", "chr2", "chr3", "chr10") all(seqnames %in% extractSeqlevels("Homo_sapiens", "UCSC")) ## Find mapped seqlevelsStyles for exsiting seqnames: mapSeqlevels(c("chrII", "chrIII", "chrM"), "NCBI") mapSeqlevels(c("chrII", "chrIII", "chrM"), "Ensembl")