Introduction to the AnVIL package

AnVIL

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager", repos = "https://cran.r-project.org")
BiocManager::install("AnVIL")
library(AnVILGCP)
library(AnVIL)
dir(file.path(Sys.getenv("GCLOUD_SDK_PATH"), "bin"), "^(gcloud|gsutil)$")
## [1] "gcloud" "gsutil"
## the code chunks in this vignette are fully evaluated when
## gcloud_exists() returns TRUE
AnVILGCP::gcloud_exists()
## [1] FALSE
BiocManager::install("GenomicFeatures")
add_libpaths("~/my/project")
gcloud_account() # authentication account
gcloud_project() # billing project information
gcloud_cmd("projects", "list") %>%
    readr::read_table() %>%
    filter(startsWith(PROJECT_ID, "anvil"))
gcloud_help("projects")
src <- "gs://genomics-public-data/1000-genomes/"
gsutil_ls(src)

other <- paste0(src, "other")
gsutil_ls(other, recursive = TRUE)

sample_info <- paste0(src, "other/sample_info/sample_info.csv")
gsutil_stat(sample_info)
fl <- tempfile()
gsutil_cp(sample_info, fl)

csv <- readr::read_csv(fl, guess_max = 5000L, col_types = readr::cols())
csv
## Stream the remote CSV straight from the bucket; no intermediate
## local copy is needed. gsutil_pipe() is given the gs:// URI held in
## 'sample_info', not the local temporary file 'fl'.
pipe <- gsutil_pipe(sample_info, "rb")
readr::read_csv(pipe, guess_max = 5000L, col_types = readr::cols()) %>%
    dplyr::select("Sample", "Family_ID", "Population", "Gender")
destination <- tempfile()
stopifnot(dir.create(destination))
source <- paste0(src, "other/sample_info")

## dry run
gsutil_rsync(source, destination)

gsutil_rsync(source, destination, dry = FALSE)
dir(destination, recursive = TRUE)

## nothing to synchronize
gsutil_rsync(source, destination, dry = FALSE)

## one file requires synchronization
unlink(file.path(destination, "README"))
gsutil_rsync(source, destination, dry = FALSE)
avworkspace_namespace()
avworkspace_name()
## N.B.: IT MAY NOT BE NECESSARY TO SET THESE WHEN ON ANVIL
avworkspace_namespace("pathogen-genomic-surveillance")
avworkspace_name("COVID-19")
avtables()
sample <- avtable("sample")
sample
sample %>%
    select("sample_id", contains("fasta")) %>%
    filter(!is.na(final_assembly_fasta))
my_cars <-
    mtcars |>
    as_tibble(rownames = "model") |>
    mutate(model = gsub(" ", "_", model))
job_status <- avtable_import(my_cars)
avtable_import_status(job_status)
(job_status <- avtable_import(my_cars, pageSize = 10))
## pageSize = 10 rows (4 pages)
##   |======================================================================| 100%
## # A tibble: 4 × 5
##    page from_row to_row job_id                               status
##   <int>    <int>  <int> <chr>                                <chr>
## 1     1        1     10 a32e9706-f63c-49ed-9620-b214746b9392 Uploaded
## 2     2       11     20 f2910ac2-0954-4fb9-b36c-970845a266b7 Uploaded
## 3     3       21     30 e18adc5b-d26f-4a8a-a0d7-a232e17ac8d2 Uploaded
## 4     4       31     32 d14efb89-e2dd-4937-b80a-169520b5f563 Uploaded
(job_status <- avtable_import_status(job_status))
## checking status of 4 avtable import jobs
##   |======================================================================| 100%
## # A tibble: 4 × 5
##    page from_row to_row job_id                               status
##   <int>    <int>  <int> <chr>                                <chr>
## 1     1        1     10 a32e9706-f63c-49ed-9620-b214746b9392 Done
## 2     2       11     20 f2910ac2-0954-4fb9-b36c-970845a266b7 Done
## 3     3       21     30 e18adc5b-d26f-4a8a-a0d7-a232e17ac8d2 ReadyForUpsert
## 4     4       31     32 d14efb89-e2dd-4937-b80a-169520b5f563 ReadyForUpsert
(job_status <- avtable_import_status(job_status))
## checking status of 4 avtable import jobs
##   |======================================================================| 100%
## # A tibble: 4 × 5
##    page from_row to_row job_id                               status
##   <int>    <int>  <int> <chr>                                <chr>
## 1     1        1     10 a32e9706-f63c-49ed-9620-b214746b9392 Done
## 2     2       11     20 f2910ac2-0954-4fb9-b36c-970845a266b7 Done
## 3     3       21     30 e18adc5b-d26f-4a8a-a0d7-a232e17ac8d2 Done
## 4     4       31     32 d14efb89-e2dd-4937-b80a-169520b5f563 Done
## editable copy of '1000G-high-coverage-2019' workspace
avworkspace("anvil-datastorage/1000G-high-coverage-2019")
sample <-
    avtable("sample") %>%                               # existing table
    mutate(set = sample(head(LETTERS), nrow(.), TRUE))  # arbitrary groups
sample %>%                                   # new 'participant_set' table
    avtable_import_set("participant", "set", "participant")
sample %>%                                   # new 'sample_set' table
    avtable_import_set("sample", "set", "name")
avdata()
bucket <- avbucket()
bucket
avfiles_ls()
## requires workspace ownership
uri <- avbucket()                             # discover bucket
bucket <- file.path(uri, "mtcars.tab")
write.table(mtcars, gsutil_pipe(bucket, "w")) # write to bucket
## backup all files and folders in the current working directory
avfiles_backup(getwd(), recursive = TRUE)

## backup all files in the current directory
avfiles_backup(dir())

## backup all files to gs://<avbucket()>/scratch/
## N.B.: call dir() so that a character vector of file names is passed;
## the bare symbol 'dir' is the function itself, not a file listing
avfiles_backup(dir(), paste0(avbucket(), "/scratch"))
uri <- c(
    vcf = "drs://dg.ANV0/6f633518-f2de-4460-aaa4-a27ee6138ab5",
    tbi = "drs://dg.ANV0/4fb9e77f-c92a-4deb-ac90-db007dc633aa"
)
tbl <- drs_stat(uri)
## # A tibble: 2 × 9
##   drs      fileName   size gsUri accessUrl timeUpdated hashes       bucket name
##   <chr>    <chr>     <dbl> <chr> <chr>     <chr>       <list>       <chr>  <chr>
## 1 drs://d… NA21144… 7.06e9 gs:/… NA        2020-07-08… <named list> fc-56… CCDG…
## 2 drs://d… NA21144… 4.08e6 gs:/… NA        2020-07-08… <named list> fc-56… CCDG…
drs_cp(uri, "/tmp")     # local temporary directory
drs_cp(uri, avbucket()) # workspace bucket
suppressPackageStartupMessages({
    library(VariantAnnotation)
})
https <- drs_access_url(uri)
vcffile <- VcfFile(https[["vcf"]], https[["tbi"]])
scanVcfHeader(vcffile)
## class: VCFHeader
## samples(1): NA21144
## meta(3): fileformat reference contig
## fixed(2): FILTER ALT
## info(16): BaseQRankSum ClippingRankSum ... ReadPosRankSum VariantType
## geno(11): GT AB ... PL SB

variants <- readVcf(vcffile, param = GRanges("chr1:1-1000000"))
nrow(variants)
## [1] 123077
terra <- Terra()
terra
terra %>% tags("Status")
terra$status
terra$status()
args(terra$createBillingProjectFull)
args(terra$overwriteWorkspaceMethodConfig)
status <- terra$status()
class(status)
str(status)
lst <- status %>% as.list()
lengths(lst)
lengths(lst$systems)
str(lst$systems)
> .api_test_check(Terra(), "Terra") |> lengths()
        common          added        removed        updated  common_in_use
           135             24              3             11              9
removed_in_use updated_in_use
             0              3
> .api_test_check(Terra(), "Terra")[c("removed_in_use", "updated_in_use")]
$removed_in_use
character(0)

$updated_in_use
[1] "cloneWorkspace"         "entityQuery"            "flexibleImportEntities"
Imports: AnVIL
importFrom AnVIL, Service
importMethodsFrom AnVIL, "$"   # perhaps also `tags()`, etc
importClassesFrom AnVIL, Service
## S4 generator for 'MyService', a class that extends 'Service'
.MyService <- setClass("MyService", contains = "Service")

## Constructor: build a Service() from the settings below, then wrap it
## in a 'MyService' instance
MyService <- function() {
    service <- Service(
        "myservice",
        host = "api.firecloud.org",
        api_url = "https://api.firecloud.org/api-docs.yaml",
        authenticate = FALSE
    )
    .MyService(service)
}
git clone https://git.bioconductor.org/packages/AnVIL

Workspace	AnVIL function
TABLES	`avtables()`
REFERENCE DATA	None
OTHER DATA	`avbucket()`
Workspace Data	`avdata()`
Files	`avfiles_ls()`, `avfiles_backup()`, `avfiles_restore()`

Introduction to the AnVIL package

Installation

Quick start

Up to speed with AnVIL

Use in the AnVIL cloud

Local use

Graphical interfaces

For end users

Fast binary package installation

Working with Google cloud-based resources

Using `gcloud_*()` for account management

Using `gsutil_*()` for file and bucket management

Using `av*()` to work with AnVIL tables and data

Tables, reference data, and persistent files

Using `avtable*()` for accessing tables

Using `avdata()` for accessing Workspace Data

Using `avbucket()` and workspace files

Using `avnotebooks*()` for notebook management

Using `avworkflows_*()` for workflows

Using `avworkspace_*()` for workspaces

Using `drs_*()` for resolving DRS (Data Repository Service) URIs

For developers

Set-up

Service APIs

Construction

Invoke endpoints

Process responses

Test endpoints

Service implementations

Extending the `Service` class to implement your own RESTful interface

Support, bug reports, and source code availability

Appendix

Acknowledgments

Session info

Introduction to the AnVIL package

Installation

Quick start

Up to speed with AnVIL

Use in the AnVIL cloud

Local use

Graphical interfaces

For end users

Fast binary package installation

Working with Google cloud-based resources

Using gcloud_*() for account management

Using gsutil_*() for file and bucket management

Using av*() to work with AnVIL tables and data

Tables, reference data, and persistent files

Using avtable*() for accessing tables

Using avdata() for accessing Workspace Data

Using avbucket() and workspace files

Using avnotebooks*() for notebook management

Using avworkflows_*() for workflows

Using avworkspace_*() for workspaces

Using drs_*() for resolving DRS (Data Repository Service) URIs

For developers

Set-up

Service APIs

Construction

Invoke endpoints

Process responses

Test endpoints

Service implementations

Extending the Service class to implement your own RESTful interface

Support, bug reports, and source code availability

Appendix

Acknowledgments

Session info

Using `gcloud_*()` for account management

Using `gsutil_*()` for file and bucket management

Using `av*()` to work with AnVIL tables and data

Using `avtable*()` for accessing tables

Using `avdata()` for accessing Workspace Data

Using `avbucket()` and workspace files

Using `avnotebooks*()` for notebook management

Using `avworkflows_*()` for workflows

Using `avworkspace_*()` for workspaces

Using `drs_*()` for resolving DRS (Data Repository Service) URIs

Extending the `Service` class to implement your own RESTful interface