Title: | Explore and download data from the recount3 project |
---|---|
Description: | The recount3 package enables access to a large amount of uniformly processed RNA-seq data from human and mouse. You can download RangedSummarizedExperiment objects at the gene, exon or exon-exon junctions level with sample metadata and QC statistics. In addition we provide access to sample coverage BigWig files. |
Authors: | Leonardo Collado-Torres [aut, cre] |
Maintainer: | Leonardo Collado-Torres <[email protected]> |
License: | Artistic-2.0 |
Version: | 1.17.0 |
Built: | 2024-11-19 04:14:43 UTC |
Source: | https://github.com/bioc/recount3 |
Given an organism
and an annotation
, this function returns the
corresponding file extension used in the recount3
files.
annotation_ext( organism = c("human", "mouse"), annotation = annotation_options(organism) )
annotation_ext( organism = c("human", "mouse"), annotation = annotation_options(organism) )
organism |
A |
annotation |
A |
A character(1)
with the annotation file extension to be used.
Other internal functions for accessing the recount3 data:
create_rse_manual()
,
file_retrieve()
,
locate_url_ann()
,
locate_url()
,
project_homes()
,
read_counts()
,
read_metadata()
annotation_ext("human") annotation_ext("human", "fantom6_cat") annotation_ext("human", "refseq") annotation_ext("mouse")
annotation_ext("human") annotation_ext("human", "fantom6_cat") annotation_ext("human", "refseq") annotation_ext("mouse")
This function will return the available annotation options for a given
organism
.
annotation_options(organism = c("human", "mouse"))
annotation_options(organism = c("human", "mouse"))
organism |
A |
A character()
vector with the supported annotation options for the
given organism
.
annotation_options("human") annotation_options("mouse")
annotation_options("human") annotation_options("mouse")
List available projects in recount3
available_projects( organism = c("human", "mouse"), recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3"), bfc = recount3_cache(), available_homes = project_homes(organism = organism, recount3_url = recount3_url) )
available_projects( organism = c("human", "mouse"), recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3"), bfc = recount3_cache(), available_homes = project_homes(organism = organism, recount3_url = recount3_url) )
organism |
A |
recount3_url |
A |
bfc |
A BiocFileCache-class
object where the files will be cached to, typically created by
|
available_homes |
A |
A data.frame()
with the project ID (project
), the organism
, the
file_source
from where the data was accessed,
the recount3
project home location (project_home
), the project
project_type
that differentiates between data_sources
and compilations
,
the n_samples
with the number of samples in the given project.
## Find all the human projects human_projects <- available_projects() ## Explore the results dim(human_projects) head(human_projects) ## How many are from a data source vs a compilation? table(human_projects$project_type, useNA = "ifany") ## What are the unique file sources? table( human_projects$file_source[human_projects$project_type == "data_sources"] ) ## Note that big projects are broken up to make them easier to access ## For example, GTEx and TCGA are broken up by tissue head(subset(human_projects, file_source == "gtex")) head(subset(human_projects, file_source == "tcga")) ## Find all the mouse projects mouse_projects <- available_projects(organism = "mouse") ## Explore the results dim(mouse_projects) head(mouse_projects) ## How many are from a data source vs a compilation? table(mouse_projects$project_type, useNA = "ifany") ## What are the unique file sources? table( mouse_projects$file_source[mouse_projects$project_type == "data_sources"] ) ## Not run: ## Use with a custom recount3_url: available_projects( recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3test", available_homes = "data_sources/sra" ) ## You can also rely on project_homes() if the custom URL has a text file ## that can be read with readLines() at: ## <recount3_url>/<organism>/homes_index available_projects( recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3test" ) ## End(Not run)
## Find all the human projects human_projects <- available_projects() ## Explore the results dim(human_projects) head(human_projects) ## How many are from a data source vs a compilation? table(human_projects$project_type, useNA = "ifany") ## What are the unique file sources? table( human_projects$file_source[human_projects$project_type == "data_sources"] ) ## Note that big projects are broken up to make them easier to access ## For example, GTEx and TCGA are broken up by tissue head(subset(human_projects, file_source == "gtex")) head(subset(human_projects, file_source == "tcga")) ## Find all the mouse projects mouse_projects <- available_projects(organism = "mouse") ## Explore the results dim(mouse_projects) head(mouse_projects) ## How many are from a data source vs a compilation? table(mouse_projects$project_type, useNA = "ifany") ## What are the unique file sources? table( mouse_projects$file_source[mouse_projects$project_type == "data_sources"] ) ## Not run: ## Use with a custom recount3_url: available_projects( recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3test", available_homes = "data_sources/sra" ) ## You can also rely on project_homes() if the custom URL has a text file ## that can be read with readLines() at: ## <recount3_url>/<organism>/homes_index available_projects( recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3test" ) ## End(Not run)
This function returns a data.frame()
with the samples that are available
from recount3
. Note that a specific sample might be available from a
given data_source
and none or many collections
.
available_samples( organism = c("human", "mouse"), recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3"), bfc = recount3_cache(), verbose = getOption("recount3_verbose", TRUE), available_homes = project_homes(organism = organism, recount3_url = recount3_url) )
available_samples( organism = c("human", "mouse"), recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3"), bfc = recount3_cache(), verbose = getOption("recount3_verbose", TRUE), available_homes = project_homes(organism = organism, recount3_url = recount3_url) )
organism |
A |
recount3_url |
A |
bfc |
A BiocFileCache-class
object where the files will be cached to, typically created by
|
verbose |
A |
available_homes |
A |
A data.frame()
with the sample ID used by the original source of
the data (external_id
), the project ID (project
), the organism
, the
file_source
from where the data was accessed, the date the sample
was processed (date_processed
) in YYYY-MM-DD
format,
the recount3
project home location (project_home
), and the project
project_type
that differentiates between data_sources
and compilations
.
## Find all the human samples available from recount3 human_samples <- available_samples() dim(human_samples) head(human_samples) ## How many are from a data source vs a compilation? table(human_samples$project_type, useNA = "ifany") ## What are the unique file sources? table( human_samples$file_source[human_samples$project_type == "data_sources"] ) ## Find all the mouse samples available from recount3 mouse_samples <- available_samples("mouse") dim(mouse_samples) head(mouse_samples) ## How many are from a data source vs a compilation? table(mouse_samples$project_type, useNA = "ifany")
## Find all the human samples available from recount3 human_samples <- available_samples() dim(human_samples) head(human_samples) ## How many are from a data source vs a compilation? table(human_samples$project_type, useNA = "ifany") ## What are the unique file sources? table( human_samples$file_source[human_samples$project_type == "data_sources"] ) ## Find all the mouse samples available from recount3 mouse_samples <- available_samples("mouse") dim(mouse_samples) head(mouse_samples) ## How many are from a data source vs a compilation? table(mouse_samples$project_type, useNA = "ifany")
As described in the recount workflow, the counts provided by the recount2
project are base-pair counts. You can scale them using transform_counts()
or compute the read counts using the area under coverage information (AUC).
compute_read_counts( rse, round = TRUE, avg_mapped_read_length = "recount_qc.star.average_mapped_length" )
compute_read_counts( rse, round = TRUE, avg_mapped_read_length = "recount_qc.star.average_mapped_length" )
rse |
A
RangedSummarizedExperiment-class
created by |
round |
A |
avg_mapped_read_length |
A |
This function is similar to
recount::read_counts(use_paired_end = TRUE, round = TRUE)
but more general
and with a different name to avoid NAMESPACE conflicts. Note that the default
value of round
is different than in recount::read_counts()
. This
was done to match the default value of round
in transform_counts()
.
A matrix()
with the read counts. By default this function uses
the average read length to the QC annotation.
Collado-Torres L, Nellore A and Jaffe AE. recount workflow: Accessing over 70,000 human RNA-seq samples with Bioconductor version 1; referees: 1 approved, 2 approved with reservations. F1000Research 2017, 6:1558 doi: 10.12688/f1000research.12223.1.
Other count transformation functions:
compute_scale_factors()
,
is_paired_end()
,
transform_counts()
## Create a RSE object at the gene level rse_gene_SRP009615 <- create_rse_manual("SRP009615") colSums(compute_read_counts(rse_gene_SRP009615)) / 1e6 ## Create a RSE object at the gene level rse_gene_DRP000499 <- create_rse_manual("DRP000499") colSums(compute_read_counts(rse_gene_DRP000499)) / 1e6 ## You can compare the read counts against those from recount::read_counts() ## from the recount2 project which used a different RNA-seq aligner ## If needed, install recount, the R/Bioconductor package for recount2: # BiocManager::install("recount") recount2_readsums <- colSums(assay(recount::read_counts( recount::rse_gene_SRP009615 ), "counts")) / 1e6 recount3_readsums <- colSums(compute_read_counts(rse_gene_SRP009615)) / 1e6 recount_readsums <- data.frame( recount2 = recount2_readsums[order(names(recount2_readsums))], recount3 = recount3_readsums[order(names(recount3_readsums))] ) plot(recount2 ~ recount3, data = recount_readsums) abline(a = 0, b = 1, col = "purple", lwd = 2, lty = 2) ## Repeat for DRP000499, a paired-end study recount::download_study("DRP000499", outdir = tempdir()) load(file.path(tempdir(), "rse_gene.Rdata"), verbose = TRUE) recount2_readsums <- colSums(assay(recount::read_counts( rse_gene ), "counts")) / 1e6 recount3_readsums <- colSums(compute_read_counts(rse_gene_DRP000499)) / 1e6 recount_readsums <- data.frame( recount2 = recount2_readsums[order(names(recount2_readsums))], recount3 = recount3_readsums[order(names(recount3_readsums))] ) plot(recount2 ~ recount3, data = recount_readsums) abline(a = 0, b = 1, col = "purple", lwd = 2, lty = 2)
## Create a RSE object at the gene level rse_gene_SRP009615 <- create_rse_manual("SRP009615") colSums(compute_read_counts(rse_gene_SRP009615)) / 1e6 ## Create a RSE object at the gene level rse_gene_DRP000499 <- create_rse_manual("DRP000499") colSums(compute_read_counts(rse_gene_DRP000499)) / 1e6 ## You can compare the read counts against those from recount::read_counts() ## from the recount2 project which used a different RNA-seq aligner ## If needed, install recount, the R/Bioconductor package for recount2: # BiocManager::install("recount") recount2_readsums <- colSums(assay(recount::read_counts( recount::rse_gene_SRP009615 ), "counts")) / 1e6 recount3_readsums <- colSums(compute_read_counts(rse_gene_SRP009615)) / 1e6 recount_readsums <- data.frame( recount2 = recount2_readsums[order(names(recount2_readsums))], recount3 = recount3_readsums[order(names(recount3_readsums))] ) plot(recount2 ~ recount3, data = recount_readsums) abline(a = 0, b = 1, col = "purple", lwd = 2, lty = 2) ## Repeat for DRP000499, a paired-end study recount::download_study("DRP000499", outdir = tempdir()) load(file.path(tempdir(), "rse_gene.Rdata"), verbose = TRUE) recount2_readsums <- colSums(assay(recount::read_counts( rse_gene ), "counts")) / 1e6 recount3_readsums <- colSums(compute_read_counts(rse_gene_DRP000499)) / 1e6 recount_readsums <- data.frame( recount2 = recount2_readsums[order(names(recount2_readsums))], recount3 = recount3_readsums[order(names(recount3_readsums))] ) plot(recount2 ~ recount3, data = recount_readsums) abline(a = 0, b = 1, col = "purple", lwd = 2, lty = 2)
This function computes the count scaling factors used by
transform_counts()
. This function is similar to
recount::scale_counts(factor_only = TRUE)
, but it is more general.
compute_scale_factors( x, by = c("auc", "mapped_reads"), targetSize = 4e+07, L = 100, auc = "recount_qc.bc_auc.all_reads_all_bases", avg_mapped_read_length = "recount_qc.star.average_mapped_length", mapped_reads = "recount_qc.star.all_mapped_reads", paired_end = is_paired_end(x, avg_mapped_read_length) )
compute_scale_factors( x, by = c("auc", "mapped_reads"), targetSize = 4e+07, L = 100, auc = "recount_qc.bc_auc.all_reads_all_bases", avg_mapped_read_length = "recount_qc.star.average_mapped_length", mapped_reads = "recount_qc.star.all_mapped_reads", paired_end = is_paired_end(x, avg_mapped_read_length) )
x |
Either a
RangedSummarizedExperiment-class
created by |
by |
Either |
targetSize |
A |
L |
A |
auc |
A |
avg_mapped_read_length |
A |
mapped_reads |
A |
paired_end |
A |
A numeric()
with the sample scale factors that are used by
transform_counts()
.
Other count transformation functions:
compute_read_counts()
,
is_paired_end()
,
transform_counts()
## Download the metadata for SRP009615, a single-end study SRP009615_meta <- read_metadata( metadata_files = file_retrieve( locate_url( "SRP009615", "data_sources/sra", ) ) ) ## Compute the scaling factors compute_scale_factors(SRP009615_meta, by = "auc") compute_scale_factors(SRP009615_meta, by = "mapped_reads") ## Download the metadata for DRP000499, a paired-end study DRP000499_meta <- read_metadata( metadata_files = file_retrieve( locate_url( "DRP000499", "data_sources/sra", ) ) ) ## Compute the scaling factors compute_scale_factors(DRP000499_meta, by = "auc") compute_scale_factors(DRP000499_meta, by = "mapped_reads") ## You can compare the factors against those from recount::scale_counts() ## from the recount2 project which used a different RNA-seq aligner ## If needed, install recount, the R/Bioconductor package for recount2: # BiocManager::install("recount") recount2_factors <- recount::scale_counts( recount::rse_gene_SRP009615, by = "auc", factor_only = TRUE ) recount3_factors <- compute_scale_factors(SRP009615_meta, by = "auc") recount_factors <- data.frame( recount2 = recount2_factors[order(names(recount2_factors))], recount3 = recount3_factors[order(names(recount3_factors))] ) plot(recount2 ~ recount3, data = recount_factors) abline(a = 0, b = 1, col = "purple", lwd = 2, lty = 2)
## Download the metadata for SRP009615, a single-end study SRP009615_meta <- read_metadata( metadata_files = file_retrieve( locate_url( "SRP009615", "data_sources/sra", ) ) ) ## Compute the scaling factors compute_scale_factors(SRP009615_meta, by = "auc") compute_scale_factors(SRP009615_meta, by = "mapped_reads") ## Download the metadata for DRP000499, a paired-end study DRP000499_meta <- read_metadata( metadata_files = file_retrieve( locate_url( "DRP000499", "data_sources/sra", ) ) ) ## Compute the scaling factors compute_scale_factors(DRP000499_meta, by = "auc") compute_scale_factors(DRP000499_meta, by = "mapped_reads") ## You can compare the factors against those from recount::scale_counts() ## from the recount2 project which used a different RNA-seq aligner ## If needed, install recount, the R/Bioconductor package for recount2: # BiocManager::install("recount") recount2_factors <- recount::scale_counts( recount::rse_gene_SRP009615, by = "auc", factor_only = TRUE ) recount3_factors <- compute_scale_factors(SRP009615_meta, by = "auc") recount_factors <- data.frame( recount2 = recount2_factors[order(names(recount2_factors))], recount3 = recount3_factors[order(names(recount3_factors))] ) plot(recount2 ~ recount3, data = recount_factors) abline(a = 0, b = 1, col = "purple", lwd = 2, lty = 2)
This function creates a directory with the configuration files required for creating a UCSC track hub for the BigWig files from a given project. These files can then be hosted on GitHub or elsewhere. For more details about UCSC track hubs, check https://genome.ucsc.edu/goldenpath/help/hgTrackHubHelp.html.
create_hub( x, output_dir = file.path(tempdir(), x$project[1]), hub_name = "recount3", email = "someone@somewhere", show_max = 5, hub_short_label = "recount3 coverage", hub_long_label = "recount3 summaries and queries for large-scaleRNA-seq expression and splicing", hub_description_url = "https://rna.recount.bio/index.html", recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3") )
create_hub( x, output_dir = file.path(tempdir(), x$project[1]), hub_name = "recount3", email = "someone@somewhere", show_max = 5, hub_short_label = "recount3 coverage", hub_long_label = "recount3 summaries and queries for large-scaleRNA-seq expression and splicing", hub_description_url = "https://rna.recount.bio/index.html", recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3") )
x |
A |
output_dir |
A |
hub_name |
A |
email |
A |
show_max |
An |
hub_short_label |
A |
hub_long_label |
A |
hub_description_url |
A |
recount3_url |
A |
See https://github.com/LieberInstitute/recount3-docs/blob/master/UCSC_hubs/create_hubs.R for an example of how this function was used.
A directory at output_dir
with the files needed for a UCSC
track hub.
## Find all the mouse samples available from recount3 mouse_samples <- available_samples("mouse") ## Subset to project DRP001299 info_DRP001299 <- subset(mouse_samples, project == "DRP001299") hub_dir <- create_hub(info_DRP001299) ## List the files created by create_hub() hub_files <- list.files(hub_dir, full.names = TRUE, recursive = TRUE) hub_files ## Check the files contents sapply(hub_files, function(x) { cat(paste(readLines(x), collapse = "\n")) }) ## You can also check the file contents for this example project at ## https://github.com/LieberInstitute/recount3-docs/tree/master/UCSC_hubs/mouse/sra_DRP001299 ## or test it out on UCSC directly through the following URL: ## https://genome.ucsc.edu/cgi-bin/hgTracks?db=mm10&lastVirtModeType=default&lastVirtModeExtraState=&virtModeType=default&virtMode=0&nonVirtPosition=&position=chr1&hubUrl=https://raw.githubusercontent.com/LieberInstitute/recount3-docs/master/UCSC_hubs/mouse/sra_DRP001299/hub.txt
## Find all the mouse samples available from recount3 mouse_samples <- available_samples("mouse") ## Subset to project DRP001299 info_DRP001299 <- subset(mouse_samples, project == "DRP001299") hub_dir <- create_hub(info_DRP001299) ## List the files created by create_hub() hub_files <- list.files(hub_dir, full.names = TRUE, recursive = TRUE) hub_files ## Check the files contents sapply(hub_files, function(x) { cat(paste(readLines(x), collapse = "\n")) }) ## You can also check the file contents for this example project at ## https://github.com/LieberInstitute/recount3-docs/tree/master/UCSC_hubs/mouse/sra_DRP001299 ## or test it out on UCSC directly through the following URL: ## https://genome.ucsc.edu/cgi-bin/hgTracks?db=mm10&lastVirtModeType=default&lastVirtModeExtraState=&virtModeType=default&virtMode=0&nonVirtPosition=&position=chr1&hubUrl=https://raw.githubusercontent.com/LieberInstitute/recount3-docs/master/UCSC_hubs/mouse/sra_DRP001299/hub.txt
Once you have identified a project you want to work with, you can use this
function to construct a recount3
RangedSummarizedExperiment-class
(RSE) object at the gene or exon expression feature level. This function will
retrieve the data, cache it, then assemble the RSE object.
create_rse( project_info, type = c("gene", "exon", "jxn"), annotation = annotation_options(project_info$organism), bfc = recount3_cache(), jxn_format = c("ALL", "UNIQUE"), recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3"), verbose = getOption("recount3_verbose", TRUE) )
create_rse( project_info, type = c("gene", "exon", "jxn"), annotation = annotation_options(project_info$organism), bfc = recount3_cache(), jxn_format = c("ALL", "UNIQUE"), recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3"), verbose = getOption("recount3_verbose", TRUE) )
project_info |
A |
type |
A |
annotation |
A |
bfc |
A BiocFileCache-class
object where the files will be cached to, typically created by
|
jxn_format |
A |
recount3_url |
A |
verbose |
A |
A RangedSummarizedExperiment-class object.
## Find all available human projects human_projects <- available_projects() ## Find the project you are interested in proj_info <- subset( human_projects, project == "SRP009615" & project_type == "data_sources" ) ## Create a RSE object at the gene level rse_gene_SRP009615 <- create_rse(proj_info) ## Explore the resulting RSE gene object rse_gene_SRP009615 ## Information about how this RSE object was made metadata(rse_gene_SRP009615) ## Number of genes by number of samples dim(rse_gene_SRP009615) ## Information about the genes rowRanges(rse_gene_SRP009615) ## Sample metadata colnames(colData(rse_gene_SRP009615)) ## Check how much memory this RSE object uses pryr::object_size(rse_gene_SRP009615) ## Create an RSE object using gencode_v29 instead of gencode_v26 rse_gene_SRP009615_gencode_v29 <- create_rse( proj_info, annotation = "gencode_v29", verbose = FALSE ) rowRanges(rse_gene_SRP009615_gencode_v29) ## Create an RSE object using FANTOM6_CAT instead of gencode_v26 rse_gene_SRP009615_fantom6_cat <- create_rse( proj_info, annotation = "fantom6_cat" ) rowRanges(rse_gene_SRP009615_fantom6_cat) ## Create an RSE object using RefSeq instead of gencode_v26 rse_gene_SRP009615_refseq <- create_rse( proj_info, annotation = "refseq" ) rowRanges(rse_gene_SRP009615_refseq) ## Create an RSE object using ERCC instead of gencode_v26 rse_gene_SRP009615_ercc <- create_rse( proj_info, annotation = "ercc" ) rowRanges(rse_gene_SRP009615_ercc) ## Create an RSE object using SIRV instead of gencode_v26 rse_gene_SRP009615_sirv <- create_rse( proj_info, annotation = "sirv" ) rowRanges(rse_gene_SRP009615_sirv) ## Obtain a list of RSE objects for all gene annotations rses_gene <- lapply(annotation_options(), function(x) { create_rse(proj_info, type = "gene", annotation = x) }) names(rses_gene) <- annotation_options() rses_gene ## Create a RSE object at the exon level rse_exon_SRP009615 <- create_rse( proj_info, type = "exon" ) ## Explore the resulting RSE exon object rse_exon_SRP009615 dim(rse_exon_SRP009615) rowRanges(rse_exon_SRP009615) pryr::object_size(rse_exon_SRP009615) ## Create a RSE object at the exon-exon junction level rse_jxn_SRP009615 <- create_rse( proj_info, type = "jxn" ) ## Explore the resulting RSE exon-exon junctions object rse_jxn_SRP009615 dim(rse_jxn_SRP009615) rowRanges(rse_jxn_SRP009615) pryr::object_size(rse_jxn_SRP009615) ## Obtain a list of RSE objects for all exon annotations ## Not run: rses_exon <- lapply(annotation_options(), function(x) { create_rse(proj_info, type = "exon", annotation = x, verbose = FALSE) }) names(rses_exon) <- annotation_options() ## End(Not run)
## Find all available human projects human_projects <- available_projects() ## Find the project you are interested in proj_info <- subset( human_projects, project == "SRP009615" & project_type == "data_sources" ) ## Create a RSE object at the gene level rse_gene_SRP009615 <- create_rse(proj_info) ## Explore the resulting RSE gene object rse_gene_SRP009615 ## Information about how this RSE object was made metadata(rse_gene_SRP009615) ## Number of genes by number of samples dim(rse_gene_SRP009615) ## Information about the genes rowRanges(rse_gene_SRP009615) ## Sample metadata colnames(colData(rse_gene_SRP009615)) ## Check how much memory this RSE object uses pryr::object_size(rse_gene_SRP009615) ## Create an RSE object using gencode_v29 instead of gencode_v26 rse_gene_SRP009615_gencode_v29 <- create_rse( proj_info, annotation = "gencode_v29", verbose = FALSE ) rowRanges(rse_gene_SRP009615_gencode_v29) ## Create an RSE object using FANTOM6_CAT instead of gencode_v26 rse_gene_SRP009615_fantom6_cat <- create_rse( proj_info, annotation = "fantom6_cat" ) rowRanges(rse_gene_SRP009615_fantom6_cat) ## Create an RSE object using RefSeq instead of gencode_v26 rse_gene_SRP009615_refseq <- create_rse( proj_info, annotation = "refseq" ) rowRanges(rse_gene_SRP009615_refseq) ## Create an RSE object using ERCC instead of gencode_v26 rse_gene_SRP009615_ercc <- create_rse( proj_info, annotation = "ercc" ) rowRanges(rse_gene_SRP009615_ercc) ## Create an RSE object using SIRV instead of gencode_v26 rse_gene_SRP009615_sirv <- create_rse( proj_info, annotation = "sirv" ) rowRanges(rse_gene_SRP009615_sirv) ## Obtain a list of RSE objects for all gene annotations rses_gene <- lapply(annotation_options(), function(x) { create_rse(proj_info, type = "gene", annotation = x) }) names(rses_gene) <- annotation_options() rses_gene ## Create a RSE object at the exon level rse_exon_SRP009615 <- create_rse( proj_info, type = "exon" ) ## Explore the resulting RSE exon object rse_exon_SRP009615 dim(rse_exon_SRP009615) rowRanges(rse_exon_SRP009615) pryr::object_size(rse_exon_SRP009615) ## Create a RSE object at the exon-exon junction level rse_jxn_SRP009615 <- create_rse( proj_info, type = "jxn" ) ## Explore the resulting RSE exon-exon junctions object rse_jxn_SRP009615 dim(rse_jxn_SRP009615) rowRanges(rse_jxn_SRP009615) pryr::object_size(rse_jxn_SRP009615) ## Obtain a list of RSE objects for all exon annotations ## Not run: rses_exon <- lapply(annotation_options(), function(x) { create_rse(proj_info, type = "exon", annotation = x, verbose = FALSE) }) names(rses_exon) <- annotation_options() ## End(Not run)
This function is used internally by create_rse()
to construct a recount3
RangedSummarizedExperiment-class
object that contains the base-pair coverage counts at the gene
or exon
feature level for a given annotation.
create_rse_manual( project, project_home = project_homes(organism = organism, recount3_url = recount3_url), type = c("gene", "exon", "jxn"), organism = c("human", "mouse"), annotation = annotation_options(organism), bfc = recount3_cache(), jxn_format = c("ALL", "UNIQUE"), recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3"), verbose = getOption("recount3_verbose", TRUE) )
create_rse_manual( project, project_home = project_homes(organism = organism, recount3_url = recount3_url), type = c("gene", "exon", "jxn"), organism = c("human", "mouse"), annotation = annotation_options(organism), bfc = recount3_cache(), jxn_format = c("ALL", "UNIQUE"), recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3"), verbose = getOption("recount3_verbose", TRUE) )
project |
A |
project_home |
A |
type |
A |
organism |
A |
annotation |
A |
bfc |
A BiocFileCache-class
object where the files will be cached to, typically created by
|
jxn_format |
A |
recount3_url |
A |
verbose |
A |
A RangedSummarizedExperiment-class object.
https://doi.org/10.12688/f1000research.12223.1 for details on the base-pair coverage counts used in recount2 and recount3.
Other internal functions for accessing the recount3 data:
annotation_ext()
,
file_retrieve()
,
locate_url_ann()
,
locate_url()
,
project_homes()
,
read_counts()
,
read_metadata()
## Unlike create_rse(), here we create an RSE object by ## fully specifying all the arguments for locating this study rse_gene_SRP009615_manual <- create_rse_manual( "SRP009615", "data_sources/sra" ) rse_gene_SRP009615_manual ## Check how much memory this RSE object uses pryr::object_size(rse_gene_SRP009615_manual) ## Test with a collection that has a single sample ## NOTE: this requires loading the full data for this study when ## creating the RSE object rse_gene_ERP110066_collection_manual <- create_rse_manual( "ERP110066", "collections/geuvadis_smartseq", recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3" ) rse_gene_ERP110066_collection_manual ## Check how much memory this RSE object uses pryr::object_size(rse_gene_ERP110066_collection_manual) ## Mouse example rse_gene_DRP002367_manual <- create_rse_manual( "DRP002367", "data_sources/sra", organism = "mouse" ) rse_gene_DRP002367_manual ## Information about how this RSE was made metadata(rse_gene_DRP002367_manual) ## Test with a collection that has one sample, at the exon level ## NOTE: this requires loading the full data for this study (nearly 6GB!) ## Not run: rse_exon_ERP110066_collection_manual <- create_rse_manual( "ERP110066", "collections/geuvadis_smartseq", type = "exon", recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3" ) rse_exon_ERP110066_collection_manual ## Check how much memory this RSE object uses pryr::object_size(rse_exon_ERP110066_collection_manual) # 409 MB ## Test with a collection that has one sample, at the junction level ## NOTE: this requires loading the full data for this study system.time(rse_jxn_ERP110066_collection_manual <- create_rse_manual( "ERP110066", "collections/geuvadis_smartseq", type = "jxn", recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3" )) rse_jxn_ERP110066_collection_manual ## Check how much memory this RSE object uses ## NOTE: this doesn't run since 2 files are missing on the test site! pryr::object_size(rse_jxn_ERP110066_collection_manual) ## End(Not run) ## Not run: ## For testing and debugging project <- "ERP110066" project_home <- "collections/geuvadis_smartseq" project <- "SRP009615" project_home <- "data_sources/sra" type <- "gene" organism <- "human" annotation <- "gencode_v26" jxn_format <- "ALL" bfc <- recount3_cache() recount3_url <- "http://idies.jhu.edu/recount3/data" verbose <- TRUE ## End(Not run)
## Unlike create_rse(), here we create an RSE object by ## fully specifying all the arguments for locating this study rse_gene_SRP009615_manual <- create_rse_manual( "SRP009615", "data_sources/sra" ) rse_gene_SRP009615_manual ## Check how much memory this RSE object uses pryr::object_size(rse_gene_SRP009615_manual) ## Test with a collection that has a single sample ## NOTE: this requires loading the full data for this study when ## creating the RSE object rse_gene_ERP110066_collection_manual <- create_rse_manual( "ERP110066", "collections/geuvadis_smartseq", recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3" ) rse_gene_ERP110066_collection_manual ## Check how much memory this RSE object uses pryr::object_size(rse_gene_ERP110066_collection_manual) ## Mouse example rse_gene_DRP002367_manual <- create_rse_manual( "DRP002367", "data_sources/sra", organism = "mouse" ) rse_gene_DRP002367_manual ## Information about how this RSE was made metadata(rse_gene_DRP002367_manual) ## Test with a collection that has one sample, at the exon level ## NOTE: this requires loading the full data for this study (nearly 6GB!) ## Not run: rse_exon_ERP110066_collection_manual <- create_rse_manual( "ERP110066", "collections/geuvadis_smartseq", type = "exon", recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3" ) rse_exon_ERP110066_collection_manual ## Check how much memory this RSE object uses pryr::object_size(rse_exon_ERP110066_collection_manual) # 409 MB ## Test with a collection that has one sample, at the junction level ## NOTE: this requires loading the full data for this study system.time(rse_jxn_ERP110066_collection_manual <- create_rse_manual( "ERP110066", "collections/geuvadis_smartseq", type = "jxn", recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3" )) rse_jxn_ERP110066_collection_manual ## Check how much memory this RSE object uses ## NOTE: this doesn't run since 2 files are missing on the test site! pryr::object_size(rse_jxn_ERP110066_collection_manual) ## End(Not run) ## Not run: ## For testing and debugging project <- "ERP110066" project_home <- "collections/geuvadis_smartseq" project <- "SRP009615" project_home <- "data_sources/sra" type <- "gene" organism <- "human" annotation <- "gencode_v26" jxn_format <- "ALL" bfc <- recount3_cache() recount3_url <- "http://idies.jhu.edu/recount3/data" verbose <- TRUE ## End(Not run)
This function expands the SRA attributes stored in sra.sample_attributes
variable in the colData()
slot of a
RangedSummarizedExperiment-class
produced by create_rse()
.
expand_sra_attributes(rse)
expand_sra_attributes(rse)
rse |
A RangedSummarizedExperiment-class
object created by |
Note that this function will work on projects available from SRA only.
Furthermore, SRA attributes are project-specific. Thus, if you use this
function in more than one RSE object, you won't be able to combine them
easily with cbind()
and will need to manually merge the colData()
slots
from your set of RSE files before being able to run cbind()
.
A RangedSummarizedExperiment-class object with expanded metadata columns.
Andrew E Jaffe modified by Leonardo Collado-Torres.
## Find all available human projects human_projects <- available_projects() ## Find the project you are interested in proj_info <- subset( human_projects, project == "SRP009615" & project_type == "data_sources" ) ## Create a RSE object at the gene level rse_gene_SRP009615 <- create_rse(proj_info) ## Expand the SRA attributes (see details for more information) rse_gene_SRP009615 <- expand_sra_attributes(rse_gene_SRP009615)
## Find all available human projects human_projects <- available_projects() ## Find the project you are interested in proj_info <- subset( human_projects, project == "SRP009615" & project_type == "data_sources" ) ## Create a RSE object at the gene level rse_gene_SRP009615 <- create_rse(proj_info) ## Expand the SRA attributes (see details for more information) rse_gene_SRP009615 <- expand_sra_attributes(rse_gene_SRP009615)
Download a remote file and cache it to re-use later
file_retrieve( url, bfc = recount3_cache(), verbose = getOption("recount3_verbose", TRUE) )
file_retrieve( url, bfc = recount3_cache(), verbose = getOption("recount3_verbose", TRUE) )
url |
A |
bfc |
A BiocFileCache-class
object where the files will be cached to, typically created by
|
verbose |
A |
A character(1)
with the path to the cached file.
Other internal functions for accessing the recount3 data:
annotation_ext()
,
create_rse_manual()
,
locate_url_ann()
,
locate_url()
,
project_homes()
,
read_counts()
,
read_metadata()
## Download the metadata file for project SRP009615 url_SRP009615_meta <- locate_url( "SRP009615", "data_sources/sra" ) local_SRP009615_meta <- file_retrieve( url = url_SRP009615_meta ) local_SRP009615_meta ## Download the gene counts file for project SRP009615 url_SRP009615_gene <- locate_url( "SRP009615", "data_sources/sra", type = "gene" ) local_SRP009615_gene <- file_retrieve( url = url_SRP009615_gene ) local_SRP009615_gene
## Download the metadata file for project SRP009615 url_SRP009615_meta <- locate_url( "SRP009615", "data_sources/sra" ) local_SRP009615_meta <- file_retrieve( url = url_SRP009615_meta ) local_SRP009615_meta ## Download the gene counts file for project SRP009615 url_SRP009615_gene <- locate_url( "SRP009615", "data_sources/sra", type = "gene" ) local_SRP009615_gene <- file_retrieve( url = url_SRP009615_gene ) local_SRP009615_gene
Based on two alignment metrics, this function guesses the samples are paired end or not.
is_paired_end( x, avg_mapped_read_length = "recount_qc.star.average_mapped_length", avg_read_length = "recount_seq_qc.avg_len" )
is_paired_end( x, avg_mapped_read_length = "recount_qc.star.average_mapped_length", avg_read_length = "recount_seq_qc.avg_len" )
x |
Either a
RangedSummarizedExperiment-class
created by |
avg_mapped_read_length |
A |
avg_read_length |
A |
A logical()
vector specifying whether each sample was likely
paired-end or not.
Other count transformation functions:
compute_read_counts()
,
compute_scale_factors()
,
transform_counts()
## Download the metadata for SRP009615, a single-end study SRP009615_meta <- read_metadata( metadata_files = file_retrieve( locate_url( "SRP009615", "data_sources/sra", ) ) ) ## Are the samples paired end? is_paired_end(SRP009615_meta) ## Download the metadata for DRP000499, a paired-end study DRP000499_meta <- read_metadata( metadata_files = file_retrieve( locate_url( "DRP000499", "data_sources/sra", ) ) ) is_paired_end(DRP000499_meta)
## Download the metadata for SRP009615, a single-end study SRP009615_meta <- read_metadata( metadata_files = file_retrieve( locate_url( "SRP009615", "data_sources/sra", ) ) ) ## Are the samples paired end? is_paired_end(SRP009615_meta) ## Download the metadata for DRP000499, a paired-end study DRP000499_meta <- read_metadata( metadata_files = file_retrieve( locate_url( "DRP000499", "data_sources/sra", ) ) ) is_paired_end(DRP000499_meta)
recount3
fileGiven an organism of interest, this function constructs the URL for accessing
one of the output files from the recount3
project. You can then download
the file using file_retrieve()
.
locate_url( project, project_home = project_homes(organism = organism, recount3_url = recount3_url), type = c("metadata", "gene", "exon", "jxn", "bw"), organism = c("human", "mouse"), sample = NULL, annotation = annotation_options(organism), jxn_format = c("ALL", "UNIQUE"), recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3") )
locate_url( project, project_home = project_homes(organism = organism, recount3_url = recount3_url), type = c("metadata", "gene", "exon", "jxn", "bw"), organism = c("human", "mouse"), sample = NULL, annotation = annotation_options(organism), jxn_format = c("ALL", "UNIQUE"), recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3") )
project |
A |
project_home |
A |
type |
A |
organism |
A |
sample |
A |
annotation |
A |
jxn_format |
A |
recount3_url |
A |
A character()
with the URL(s) for the file(s) of interest.
Other internal functions for accessing the recount3 data:
annotation_ext()
,
create_rse_manual()
,
file_retrieve()
,
locate_url_ann()
,
project_homes()
,
read_counts()
,
read_metadata()
## Example for metadata files from a project from SRA locate_url( "SRP009615", "data_sources/sra" ) ## Example for metadata files from a project that is part of a collection locate_url( "ERP110066", "collections/geuvadis_smartseq", recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3" ) ## Example for a BigWig file locate_url( "SRP009615", "data_sources/sra", "bw", "human", "SRR387777" ) ## Locate example gene count files locate_url( "SRP009615", "data_sources/sra", "gene" ) locate_url( "SRP009615", "data_sources/sra", "gene", annotation = "refseq" ) ## Example for a gene count file from a project that is part of a collection locate_url( "ERP110066", "collections/geuvadis_smartseq", "gene", recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3" ) ## Locate example junction files locate_url( "SRP009615", "data_sources/sra", "jxn" ) ## Example for metadata files from a project from SRA locate_url( "ERP001942", "data_sources/sra" )
## Example for metadata files from a project from SRA locate_url( "SRP009615", "data_sources/sra" ) ## Example for metadata files from a project that is part of a collection locate_url( "ERP110066", "collections/geuvadis_smartseq", recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3" ) ## Example for a BigWig file locate_url( "SRP009615", "data_sources/sra", "bw", "human", "SRR387777" ) ## Locate example gene count files locate_url( "SRP009615", "data_sources/sra", "gene" ) locate_url( "SRP009615", "data_sources/sra", "gene", annotation = "refseq" ) ## Example for a gene count file from a project that is part of a collection locate_url( "ERP110066", "collections/geuvadis_smartseq", "gene", recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3" ) ## Locate example junction files locate_url( "SRP009615", "data_sources/sra", "jxn" ) ## Example for metadata files from a project from SRA locate_url( "ERP001942", "data_sources/sra" )
Given a expression feature type
, organism
and annotation
, this
function constructs the URL (or file path) to access a recount3
annotation file. This function is used by create_rse_manual()
.
locate_url_ann( type = c("gene", "exon"), organism = c("human", "mouse"), annotation = annotation_options(organism), recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3") )
locate_url_ann( type = c("gene", "exon"), organism = c("human", "mouse"), annotation = annotation_options(organism), recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3") )
type |
A |
organism |
A |
annotation |
A |
recount3_url |
A |
A character(1)
with the URL (or file path) to access the
recount3 annotation file.
Other internal functions for accessing the recount3 data:
annotation_ext()
,
create_rse_manual()
,
file_retrieve()
,
locate_url()
,
project_homes()
,
read_counts()
,
read_metadata()
locate_url_ann() locate_url_ann(organism = "mouse")
locate_url_ann() locate_url_ann(organism = "mouse")
This function finds the home for a given project (study) of interest based
on the organism
and the home_type
.
project_homes( organism = c("human", "mouse"), recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3") )
project_homes( organism = c("human", "mouse"), recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3") )
organism |
A |
recount3_url |
A |
By default it reads a small text file from
recount3_url
/organism
/homes_index using readLines()
. This text file
should contain each possible project home per line. See
http://duffel.rail.bio/recount3/human/homes_index for an example.
A character()
vector with the available project_home
options.
Other internal functions for accessing the recount3 data:
annotation_ext()
,
create_rse_manual()
,
file_retrieve()
,
locate_url_ann()
,
locate_url()
,
read_counts()
,
read_metadata()
## List the different available `project_home` options for the default ## arguments project_homes("human") project_homes("mouse") ## Test files project_homes("human", recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3" )
## List the different available `project_home` options for the default ## arguments project_homes("human") project_homes("mouse") ## Test files project_homes("human", recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3" )
This function reads in a recount3
gene or gexon counts file into R. You can
first locate the file using locate_url()
then download it to your
computer using file_retrieve()
.
read_counts(counts_file, samples = NULL)
read_counts(counts_file, samples = NULL)
counts_file |
A |
samples |
A |
A data.frame()
with sample IDs as the column names.
https://doi.org/10.12688/f1000research.12223.1 for details on the base-pair coverage counts used in recount2 and recount3.
Other internal functions for accessing the recount3 data:
annotation_ext()
,
create_rse_manual()
,
file_retrieve()
,
locate_url_ann()
,
locate_url()
,
project_homes()
,
read_metadata()
## Download the gene counts file for project SRP009615 url_SRP009615_gene <- locate_url( "SRP009615", "data_sources/sra", type = "gene" ) local_SRP009615_gene <- file_retrieve(url = url_SRP009615_gene) ## Read the gene counts, take about 3 seconds system.time(SRP009615_gene_counts <- read_counts(local_SRP009615_gene)) dim(SRP009615_gene_counts) ## Explore the top left corner SRP009615_gene_counts[seq_len(6), seq_len(6)] ## Explore the first 6 samples. summary(SRP009615_gene_counts[, seq_len(6)]) ## Note that the count units are in ## base-pair coverage counts just like in the recount2 project. ## See https://doi.org/10.12688/f1000research.12223.1 for more details ## about this type of counts. ## They can be converted to reads per 40 million reads, RPKM and other ## counts. This is more easily done once assembled into a ## RangedSummarizedExperiment object. ## Locate and retrieve an exon counts file local_SRP009615_exon <- file_retrieve( locate_url( "SRP009615", "data_sources/sra", type = "exon" ) ) local_SRP009615_exon ## Read the exon counts, takes about 50-60 seconds system.time( SRP009615_exon_counts <- read_counts( local_SRP009615_exon ) ) dim(SRP009615_exon_counts) pryr::object_size(SRP009615_exon_counts) ## Explore the top left corner SRP009615_exon_counts[seq_len(6), seq_len(6)] ## Explore the first 6 samples. summary(SRP009615_exon_counts[, seq_len(6)])
## Download the gene counts file for project SRP009615 url_SRP009615_gene <- locate_url( "SRP009615", "data_sources/sra", type = "gene" ) local_SRP009615_gene <- file_retrieve(url = url_SRP009615_gene) ## Read the gene counts, take about 3 seconds system.time(SRP009615_gene_counts <- read_counts(local_SRP009615_gene)) dim(SRP009615_gene_counts) ## Explore the top left corner SRP009615_gene_counts[seq_len(6), seq_len(6)] ## Explore the first 6 samples. summary(SRP009615_gene_counts[, seq_len(6)]) ## Note that the count units are in ## base-pair coverage counts just like in the recount2 project. ## See https://doi.org/10.12688/f1000research.12223.1 for more details ## about this type of counts. ## They can be converted to reads per 40 million reads, RPKM and other ## counts. This is more easily done once assembled into a ## RangedSummarizedExperiment object. ## Locate and retrieve an exon counts file local_SRP009615_exon <- file_retrieve( locate_url( "SRP009615", "data_sources/sra", type = "exon" ) ) local_SRP009615_exon ## Read the exon counts, takes about 50-60 seconds system.time( SRP009615_exon_counts <- read_counts( local_SRP009615_exon ) ) dim(SRP009615_exon_counts) pryr::object_size(SRP009615_exon_counts) ## Explore the top left corner SRP009615_exon_counts[seq_len(6), seq_len(6)] ## Explore the first 6 samples. summary(SRP009615_exon_counts[, seq_len(6)])
This function reads in the recount3
metadata files into R. You can first
locate the files using locate_url()
then download it to your computer
using file_retrieve()
.
read_metadata(metadata_files)
read_metadata(metadata_files)
metadata_files |
A |
A data.frame()
with all lower case column names for the sample
metadata.
Other internal functions for accessing the recount3 data:
annotation_ext()
,
create_rse_manual()
,
file_retrieve()
,
locate_url_ann()
,
locate_url()
,
project_homes()
,
read_counts()
## Download the metadata files for project ERP110066 url_ERP110066_meta <- locate_url( "ERP110066", "data_sources/sra" ) local_ERP110066_meta <- file_retrieve( url = url_ERP110066_meta ) ## Read the metadata ERP110066_meta <- read_metadata(local_ERP110066_meta) dim(ERP110066_meta) colnames(ERP110066_meta) ## Read the metadata files for a project in a collection ## Note: using the test files since I can't access collections right now ## for this collection ERP110066_collection_meta <- read_metadata( metadata_files = file_retrieve( locate_url( "ERP110066", "collections/geuvadis_smartseq", recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3" ) ) ) dim(ERP110066_collection_meta) ## New columns for this collection colnames(ERP110066_collection_meta)[!colnames(ERP110066_collection_meta) %in% colnames(ERP110066_meta)] ## Read the metadata for a mouse project DRP002367_meta <- read_metadata( metadata_files = file_retrieve( locate_url("DRP002367", "data_sources/sra", organism = "mouse") ) ) dim(DRP002367_meta) ## Locate and read the GTEx bladder metadata gtex_bladder_meta <- read_metadata( file_retrieve( locate_url("BLADDER", "data_sources/gtex") ) ) dim(gtex_bladder_meta) colnames(gtex_bladder_meta)
## Download the metadata files for project ERP110066 url_ERP110066_meta <- locate_url( "ERP110066", "data_sources/sra" ) local_ERP110066_meta <- file_retrieve( url = url_ERP110066_meta ) ## Read the metadata ERP110066_meta <- read_metadata(local_ERP110066_meta) dim(ERP110066_meta) colnames(ERP110066_meta) ## Read the metadata files for a project in a collection ## Note: using the test files since I can't access collections right now ## for this collection ERP110066_collection_meta <- read_metadata( metadata_files = file_retrieve( locate_url( "ERP110066", "collections/geuvadis_smartseq", recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3" ) ) ) dim(ERP110066_collection_meta) ## New columns for this collection colnames(ERP110066_collection_meta)[!colnames(ERP110066_collection_meta) %in% colnames(ERP110066_meta)] ## Read the metadata for a mouse project DRP002367_meta <- read_metadata( metadata_files = file_retrieve( locate_url("DRP002367", "data_sources/sra", organism = "mouse") ) ) dim(DRP002367_meta) ## Locate and read the GTEx bladder metadata gtex_bladder_meta <- read_metadata( file_retrieve( locate_url("BLADDER", "data_sources/gtex") ) ) dim(gtex_bladder_meta) colnames(gtex_bladder_meta)
This function allows you to specify where the recount3 files will be cached to. It is powered by BiocFileCache-class.
recount3_cache(cache_dir = getOption("recount3_cache", NULL))
recount3_cache(cache_dir = getOption("recount3_cache", NULL))
cache_dir |
A |
A BiocFileCache-class object where the recount3 files will be cached to.
Other recount3 cache functions:
recount3_cache_files()
,
recount3_cache_rm()
## Locate the recount3 cache default directory recount3_cache()
## Locate the recount3 cache default directory recount3_cache()
This function returns the list of URLs of the recount3 files you have
stored in your recount3_cache()
.
recount3_cache_files(bfc = recount3_cache())
recount3_cache_files(bfc = recount3_cache())
bfc |
A BiocFileCache-class
object where the files will be cached to, typically created by
|
A character()
with the URLs of the recount3 files you have
downloaded.
Other recount3 cache functions:
recount3_cache_rm()
,
recount3_cache()
## List the URLs you have downloaded recount3_cache_files()
## List the URLs you have downloaded recount3_cache_files()
This function removes the recount3 files you have
stored in your recount3_cache()
.
recount3_cache_rm(bfc = recount3_cache())
recount3_cache_rm(bfc = recount3_cache())
bfc |
A BiocFileCache-class
object where the files will be cached to, typically created by
|
A character(0)
if the removal of files was successful.
Other recount3 cache functions:
recount3_cache_files()
,
recount3_cache()
## List the URLs you have downloaded recount3_cache_files() ## Not run: ## Now delete the cached files recount3_cache_rm() ## List againt your recount3 files (should be empty) recount3_cache_files() ## End(Not run)
## List the URLs you have downloaded recount3_cache_files() ## Not run: ## Now delete the cached files recount3_cache_rm() ## List againt your recount3 files (should be empty) recount3_cache_files() ## End(Not run)
In preparation for a differential expression analysis, you will have to
choose how to scale the raw counts provided by the recount3 project. These
raw counts are similar to those provided by the recount2 project, except
that they were generated with a different aligner and a modified counting
approach. The raw coverage counts for recount2 are described with
illustrative figures at https://doi.org/10.12688/f1000research.12223.1.
Note that the raw counts are the sum of the base level coverage so you have
to take into account the total base-pair coverage for the given sample
(default option) by using the area under the coverage (AUC), or alternatively
use the mapped read lengths. You might want to do some further scaling to
take into account the gene or exon lengths. If you prefer to calculate read
counts without scaling check the function compute_read_counts()
.
transform_counts( rse, by = c("auc", "mapped_reads"), targetSize = 4e+07, L = 100, round = TRUE, ... )
transform_counts( rse, by = c("auc", "mapped_reads"), targetSize = 4e+07, L = 100, round = TRUE, ... )
rse |
A
RangedSummarizedExperiment-class
created by |
by |
Either |
targetSize |
A |
L |
A |
round |
A |
... |
Further arguments passed to |
This function is similar to
recount::scale_counts()
but more general and with a different name to
avoid NAMESPACE conflicts.
A matrix()
with the transformed (scaled) counts.
Other count transformation functions:
compute_read_counts()
,
compute_scale_factors()
,
is_paired_end()
## Create a RSE object at the gene level rse_gene_SRP009615 <- create_rse_manual("SRP009615") ## Scale the counts using the AUC assays(rse_gene_SRP009615)$counts <- transform_counts(rse_gene_SRP009615) ## See that now we have two assayNames() rse_gene_SRP009615 assayNames(rse_gene_SRP009615) ## You can compare the scaled counts against those from ## recount::scale_counts() from the recount2 project ## which used a different RNA-seq aligner ## If needed, install recount, the R/Bioconductor package for recount2: # BiocManager::install("recount") recount2_sizes <- colSums(assay(recount::scale_counts( recount::rse_gene_SRP009615, by = "auc" ), "counts")) / 1e6 recount3_sizes <- colSums(assay(rse_gene_SRP009615, "counts")) / 1e6 recount_sizes <- data.frame( recount2 = recount2_sizes[order(names(recount2_sizes))], recount3 = recount3_sizes[order(names(recount3_sizes))] ) plot(recount2 ~ recount3, data = recount_sizes) abline(a = 0, b = 1, col = "purple", lwd = 2, lty = 2) ## Compute RPKMs assays(rse_gene_SRP009615)$RPKM <- recount::getRPKM(rse_gene_SRP009615) colSums(assay(rse_gene_SRP009615, "RPKM")) ## Compute TPMs assays(rse_gene_SRP009615)$TPM <- recount::getTPM(rse_gene_SRP009615) colSums(assay(rse_gene_SRP009615, "TPM")) / 1e6 ## Should all be equal to 1
## Create a RSE object at the gene level rse_gene_SRP009615 <- create_rse_manual("SRP009615") ## Scale the counts using the AUC assays(rse_gene_SRP009615)$counts <- transform_counts(rse_gene_SRP009615) ## See that now we have two assayNames() rse_gene_SRP009615 assayNames(rse_gene_SRP009615) ## You can compare the scaled counts against those from ## recount::scale_counts() from the recount2 project ## which used a different RNA-seq aligner ## If needed, install recount, the R/Bioconductor package for recount2: # BiocManager::install("recount") recount2_sizes <- colSums(assay(recount::scale_counts( recount::rse_gene_SRP009615, by = "auc" ), "counts")) / 1e6 recount3_sizes <- colSums(assay(rse_gene_SRP009615, "counts")) / 1e6 recount_sizes <- data.frame( recount2 = recount2_sizes[order(names(recount2_sizes))], recount3 = recount3_sizes[order(names(recount3_sizes))] ) plot(recount2 ~ recount3, data = recount_sizes) abline(a = 0, b = 1, col = "purple", lwd = 2, lty = 2) ## Compute RPKMs assays(rse_gene_SRP009615)$RPKM <- recount::getRPKM(rse_gene_SRP009615) colSums(assay(rse_gene_SRP009615, "RPKM")) ## Compute TPMs assays(rse_gene_SRP009615)$TPM <- recount::getTPM(rse_gene_SRP009615) colSums(assay(rse_gene_SRP009615, "TPM")) / 1e6 ## Should all be equal to 1