Title: | basilisk and hail |
---|---|
Description: | Use hail via basilisk when appropriate, or via reticulate. This package can be used in terra.bio to interact with UK Biobank resources processed by hail.is. |
Authors: | Vincent Carey [aut, cre] |
Maintainer: | Vincent Carey <[email protected]> |
License: | Artistic-2.0 |
Version: | 1.7.1 |
Built: | 2024-11-21 03:05:04 UTC |
Source: | https://github.com/bioc/BiocHail |
S3 support
as.data.frame(x, row.names = NULL, optional = FALSE, ...)
as.data.frame(x, row.names = NULL, optional = FALSE, ...)
x |
entity coercible to data.frame |
row.names |
character or NULL |
optional |
logical |
... |
any args |
data.frame
hl <- hail_init() annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") as.data.frame(tab$head(3L))
hl <- hail_init() annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") as.data.frame(tab$head(3L))
S3 support
## Default S3 method: as.data.frame(x, row.names = NULL, optional = FALSE, ...)
## Default S3 method: as.data.frame(x, row.names = NULL, optional = FALSE, ...)
x |
entity coercible to data.frame |
row.names |
character or NULL |
optional |
logical |
... |
any args |
data.frame
hl <- hail_init() annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") as.data.frame(tab$head(3L))
hl <- hail_init() annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") as.data.frame(tab$head(3L))
convert hail.table.Table to R data frame
## S3 method for class 'hail.table.Table' as.data.frame(x, row.names = NULL, optional = FALSE, ...)
## S3 method for class 'hail.table.Table' as.data.frame(x, row.names = NULL, optional = FALSE, ...)
x |
instance of "hail.table.Table" |
row.names |
not used |
optional |
not used |
... |
not used |
data.frame
only use on small table because collect is used
hl <- hail_init() annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") as.data.frame(tab$head(3L))
hl <- hail_init() annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") as.data.frame(tab$head(3L))
bare interface to hail using reticulate
bare_hail()
bare_hail()
python reference to hail module
'/home/jupyter/.local/share/r-miniconda/envs/r-reticulate/bin/pip3 install...' is used to ensure that reticulate's python ecosystem is what we want
# assumes terra if (nchar(Sys.getenv("WORKSPACE_NAMESPACE"))>0) { hl = bare_hail() hl$init(idempotent=TRUE, spark_conf=list( 'spark.hadoop.fs.gs.requester.pays.mode'= 'CUSTOM', 'spark.hadoop.fs.gs.requester.pays.buckets'= 'ukb-diverse-pops-public', 'spark.hadoop.fs.gs.requester.pays.project.id'= Sys.getenv("GOOGLE_PROJECT"))) hl$read_matrix_table('gs://ukb-diverse-pops-public/sumstats_release/results_full.mt')$describe() ## Not run: # this is supposed to get us some LD data but xx.shape fails, issue filed hli = reticulate::import("hail.linalg") upa = reticulate::import("ukbb_pan_ancestry") xx = hli$BlockMatrix$read(upa$get_ld_matrix_path('AFR')) ## End(Not run) }
# assumes terra if (nchar(Sys.getenv("WORKSPACE_NAMESPACE"))>0) { hl = bare_hail() hl$init(idempotent=TRUE, spark_conf=list( 'spark.hadoop.fs.gs.requester.pays.mode'= 'CUSTOM', 'spark.hadoop.fs.gs.requester.pays.buckets'= 'ukb-diverse-pops-public', 'spark.hadoop.fs.gs.requester.pays.project.id'= Sys.getenv("GOOGLE_PROJECT"))) hl$read_matrix_table('gs://ukb-diverse-pops-public/sumstats_release/results_full.mt')$describe() ## Not run: # this is supposed to get us some LD data but xx.shape fails, issue filed hli = reticulate::import("hail.linalg") upa = reticulate::import("ukbb_pan_ancestry") xx = hli$BlockMatrix$read(upa$get_ld_matrix_path('AFR')) ## End(Not run) }
extract field names from hail.table.Table
## S4 method for signature 'hail.table.Table' colnames(x, do.NULL = TRUE, prefix = "col")
## S4 method for signature 'hail.table.Table' colnames(x, do.NULL = TRUE, prefix = "col")
x |
hail.table.Table instance |
do.NULL |
ignored |
prefix |
ignored |
character vector
hl <- hail_init() annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") colnames(tab)
hl <- hail_init() annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") colnames(tab)
S3 support
filter(.data, ..., .by = NULL, .preserve = FALSE)
filter(.data, ..., .by = NULL, .preserve = FALSE)
.data |
instance of hail.table.Table |
... |
should include named components 'filter' which is a logical vector with same number of rows as '.data', 'hl', a reference to a hail environment (Module), and 'placeholder' an arbitrary character(1) |
.by |
not used |
.preserve |
not used |
filtered hail.table.Table reference
hl <- hail_init() annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") pick <- rep(FALSE, 3500) pick[seq_len(10)] <- TRUE ft <- filter(tab, filter = pick, hl = hl) ft$count() ft$head(2L)$collect()
hl <- hail_init() annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") pick <- rep(FALSE, 3500) pick[seq_len(10)] <- TRUE ft <- filter(tab, filter = pick, hl = hl) ft$count() ft$head(2L)$collect()
filter rows of a hail Table
## S3 method for class 'hail.table.Table' filter(.data, ..., .by = NULL, .preserve = FALSE)
## S3 method for class 'hail.table.Table' filter(.data, ..., .by = NULL, .preserve = FALSE)
.data |
instance of hail.table.Table |
... |
should include named components 'filter' which is a logical vector with same number of rows as '.data', 'hl', a reference to a hail environment (Module), and 'placeholder' an arbitrary character(1) |
.by |
not used |
.preserve |
not used |
filtered hail.table.Table reference
writes one line of table to disk to retrieve field names
FIXME: uses disk because I don't know how to create a BooleanExpression except by importing.
hl <- hail_init() annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") pick <- rep(FALSE, 3500) pick[seq_len(10)] <- TRUE ft <- filter(tab, filter = pick, hl = hl) ft$count() ft$head(2L)$collect()
hl <- hail_init() annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") pick <- rep(FALSE, 3500) pick[seq_len(10)] <- TRUE ft <- filter(tab, filter = pick, hl = hl) ft$count() ft$head(2L)$collect()
interface to 1kg import
get_1kg( hl, retrieve_import_write = FALSE, path_1kg_zip = osn_1kg_path(), folder = tempdir(), cache = BiocFileCache::BiocFileCache() )
get_1kg( hl, retrieve_import_write = FALSE, path_1kg_zip = osn_1kg_path(), folder = tempdir(), cache = BiocFileCache::BiocFileCache() )
hl |
hail object |
retrieve_import_write |
logical(1) if TRUE, use hl.utils.get_1kg to retrieve data, otherwise acquire a previously written zip file, either from a cache, or, if no file found in cache, from web, followed by caching |
path_1kg_zip |
character(1) path to zip of MatrixTable, defaults to 'osn_1kg_path()'. |
folder |
character(1) destination of 1kg.mt as retrieved using hl.utils.get_1kg, import_vcf, write |
cache |
a BiocFileCache-type cache |
"hail.matrixtable.MatrixTable" instance
overwrite is permitted in the import_vcf.write event
hl <- hail_init() mt <- get_1kg(hl) mt mt$rows()$select()$show(5L) # must use integer annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") tab$describe() tab$show(width = 100L)
hl <- hail_init() mt <- get_1kg(hl) mt mt$rows()$select()$show(5L) # must use integer annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") tab$describe() tab$show(width = 100L)
S3 generic for get_key
get_key(x)
get_key(x)
x |
anything |
typically a list
hl <- hail_init() annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") get_key(tab)
hl <- hail_init() annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") get_key(tab)
S3 method for get_key
## S3 method for class 'hail.table.Table' get_key(x)
## S3 method for class 'hail.table.Table' get_key(x)
x |
instance of hail.table.Table |
a list with elements names (names of keys) and key_df (data.frame of key values, with column names)
hl <- hail_init() annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") get_key(tab)
hl <- hail_init() annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") get_key(tab)
interface to a small subset of UKBB summary stats in MatrixTable format
get_ukbb_sumstat_10kloci_mt( hl, folder = tempdir(), cache = BiocFileCache::BiocFileCache(), timeout.ukbb = 3600 )
get_ukbb_sumstat_10kloci_mt( hl, folder = tempdir(), cache = BiocFileCache::BiocFileCache(), timeout.ukbb = 3600 )
hl |
hail object |
folder |
character(1) destination folder into which the zipped UKBB summary statistics MatrixTable is unzipped |
cache |
a BiocFileCache-type cache |
timeout.ukbb |
numeric(1) defaults to 3600 for timeout setting in 'options()'; option value is reset on exit |
"hail.matrixtable.MatrixTable" instance
The loci were selected by sampling a fraction of .000345 of the over 28 million loci recorded in the UKBB pan-ancestry record. The sample is made available a) to assess some issues with data volume (the full resource is about 12.78 TB according to [this doc](https://pan-dev.ukbb.broadinstitute.org/docs/hail-format/index.html)), and b) to provide full information on the scope of phenotypes and populations available.
This function will unzip 5GB of MatrixTable data. It may be desirable to cache the unzipped image to a persistent location. If this has been done and the environment variable 'HAIL_UKBB_SUMSTAT_10K_PATH' has been set to this location, this function will use the MatrixTable content found there.
hl <- hail_init() ss <- get_ukbb_sumstat_10kloci_mt(hl) # consider saving the unzipped image and recaching ss$count()
hl <- hail_init() ss <- get_ukbb_sumstat_10kloci_mt(hl) # consider saving the unzipped image and recaching ss$count()
initialize hail, using more options
hail_init( quiet = FALSE, min_block_size = 0L, branching_factor = 50L, default_reference = "GRCh37", global_seed = 1234L, log = tempfile(), spark_conf = NULL, gcs_requester_pays_configuration = NULL )
hail_init( quiet = FALSE, min_block_size = 0L, branching_factor = 50L, default_reference = "GRCh37", global_seed = 1234L, log = tempfile(), spark_conf = NULL, gcs_requester_pays_configuration = NULL )
quiet |
logical(1) defaults to FALSE |
min_block_size |
integer(1) defaults to 0L |
branching_factor |
integer(1) defaults to 50L |
default_reference |
character(1) defaults to "GRCh37", for compatibility with earlier 'hail_init' |
global_seed |
integer(1) defaults to 1234L |
log |
character(1) path of the file to which hail writes its log, defaults to tempfile() |
spark_conf |
list, defaults to NULL |
gcs_requester_pays_configuration |
list, defaults to NULL |
python reference to hail module
hail object may be passed around. See hail documentation for details on all args.
proj = Sys.getenv("GOOGLE_PROJECT") buck = Sys.getenv("GCS_BUCKET") if (nchar(buck)>0) { # conf = list(proj, c(buck)) doesn't seem to generate tuple[str,Sequence[str]] hl <- hail_init() #gcs_requester_pays_configuration=conf) hl$default_reference() }
proj = Sys.getenv("GOOGLE_PROJECT") buck = Sys.getenv("GCS_BUCKET") if (nchar(buck)>0) { # conf = list(proj, c(buck)) doesn't seem to generate tuple[str,Sequence[str]] hl <- hail_init() #gcs_requester_pays_configuration=conf) hl$default_reference() }
initialize hail
hail_init_simple()
hail_init_simple()
python reference to hail module
hail object may be passed around
hc <- hail_init_simple() hc
hc <- hail_init_simple() hc
stop hail
hail_stop(hl)
hail_stop(hl)
hl |
a hail object produced by hail_init() |
result of stop() method for Hail module
hail_stop
hail_stop
data.frame with metadata about 3202 samples genotyped against T2T reference
data("kg_3202")
data("kg_3202")
data.frame
data.frame
Source: index files described at 'https://www.internationalgenome.org/data-portal/data-collection/30x-grch38'
data(kg_3202) dim(kg_3202)
data(kg_3202) dim(kg_3202)
pheno_data component harvesting from columns of summary stats MatrixTable, allowing for info on multiple populations in the pheno_data component
multipop_df( x, top2get = c("trait_type", "phenocode", "description", "modifier", "coding_description", "coding"), pheno2get = c("n_cases", "n_controls", "heritability", "pop") )
multipop_df( x, top2get = c("trait_type", "phenocode", "description", "modifier", "coding_description", "coding"), pheno2get = c("n_cases", "n_controls", "heritability", "pop") )
x |
Struct - a single element of the list returned by mt$cols()$collect() |
top2get |
character() vector of general fields to retrieve |
pheno2get |
character() vector of fields to be retrieved for each subpopulation |
data.frame
# following are too time-consuming but can be of interest # if (nchar(Sys.getenv("HAIL_UKBB_SUMSTAT_10K_PATH"))>0) { # hl = hail_init() # ss = get_ukbb_sumstat_10kloci_mt(hl) # sscol = ss$cols()$collect() # may take a bit of time # print(length(sscol)) # multipop_df(sscol[[1]]) # } # # if (nchar(Sys.getenv("HAIL_UKBB_SUMSTAT_10K_PATH"))>0) { # # to get an overview of all phenotype-cohort combinations in a searchable table # mmm = lapply(sscol, multipop_df ) # mymy = do.call(rbind, mmm) # over 16k rows # DT::datatable(mymy) # } # # this runs quickly and is demonstrative hl <- hail_init() litzip <- system.file("extdata", "myss2.zip", package = "BiocHail") td <- tempdir() unzip(litzip, exdir = td) ntab <- hl$read_matrix_table(paste0(td, "/myss2.mt")) ntab$describe() nt2 <- ntab$col$collect() multipop_df(nt2[[1]]) # must select one element
# following are too time-consuming but can be of interest # if (nchar(Sys.getenv("HAIL_UKBB_SUMSTAT_10K_PATH"))>0) { # hl = hail_init() # ss = get_ukbb_sumstat_10kloci_mt(hl) # sscol = ss$cols()$collect() # may take a bit of time # print(length(sscol)) # multipop_df(sscol[[1]]) # } # # if (nchar(Sys.getenv("HAIL_UKBB_SUMSTAT_10K_PATH"))>0) { # # to get an overview of all phenotype-cohort combinations in a searchable table # mmm = lapply(sscol, multipop_df ) # mymy = do.call(rbind, mmm) # over 16k rows # DT::datatable(mymy) # } # # this runs quickly and is demonstrative hl <- hail_init() litzip <- system.file("extdata", "myss2.zip", package = "BiocHail") td <- tempdir() unzip(litzip, exdir = td) ntab <- hl$read_matrix_table(paste0(td, "/myss2.mt")) ntab$describe() nt2 <- ntab$col$collect() multipop_df(nt2[[1]]) # must select one element
Open Storage Network path to a zip of hail MatrixTable with some 1kg data for the Hail.is GWAS tutorial
osn_1kg_path()
osn_1kg_path()
character(1) URL to zip
osn_1kg_path()
osn_1kg_path()
Open Storage Network path to a zip of hail MatrixTable with a small subset of UKBB summary statistics as of 12/25/2022
osn_ukbb_sumst10k_path()
osn_ukbb_sumst10k_path()
character(1) path to zip
osn_ukbb_sumst10k_path()
osn_ukbb_sumst10k_path()
generate path to installed annotations file
path_1kg_annotations()
path_1kg_annotations()
character(1) path to annotations
.txt file retrieved from extraction on 'https://storage.googleapis.com/hail-1kg/tutorial_data.tar'
path_1kg_annotations()
path_1kg_annotations()
HWE-normalized PCA scores for 3202 thousand-genomes samples genotyped with the telomere-to-telomere reference
data("pcs_191k")
data("pcs_191k")
data.frame
data.frame
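The genotypes are from a 5% sample. (The original sentence is truncated after "5", most likely by an unescaped "%" in the Rd source; confirm the full wording against the package source.)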
data(pcs_191k) dim(pcs_191k)
data(pcs_191k) dim(pcs_191k)
HWE-normalized PCA scores for 3202 thousand-genomes samples genotyped with the telomere-to-telomere reference
data("pcs_38k")
data("pcs_38k")
data.frame
data.frame
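The genotypes are from a 1% sample. (The original sentence is truncated after "1", most likely by an unescaped "%" in the Rd source; confirm the full wording against the package source.)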
data(pcs_38k) dim(pcs_38k)
data(pcs_38k) dim(pcs_38k)
pheno_data component harvesting from columns of summary stats MatrixTable
pheno_data_sec_2df( m, section = 1, toget = c("n_cases", "n_controls", "heritability", "pop"), verbose = FALSE )
pheno_data_sec_2df( m, section = 1, toget = c("n_cases", "n_controls", "heritability", "pop"), verbose = FALSE )
m |
Struct returned from mt$cols()$collect() |
section |
numeric(1) element of pheno_data list to be transformed to data.frame |
toget |
character() vector of field names to retrieve |
verbose |
logical(1) if TRUE (NOT default) will message that there are multiple 'pheno_data' components returned |
1 row data.frame
applies top2df to the pheno_data component of input
update the reference genome for a hail instance
rg_update( hc, init = "GRCh38", newjson = system.file("json/t2tAnVIL.json", package = "BiocHail") )
rg_update( hc, init = "GRCh38", newjson = system.file("json/t2tAnVIL.json", package = "BiocHail") )
hc |
hail context |
init |
character(1) valid name for a reference genome, defaults to "GRCh38" |
newjson |
character(1) path to a json spec of a reference genome [needs doc] |
a python list; the function is used for its side effect
hl <- hail_init() rg_update(hl)
hl <- hail_init() rg_update(hl)
acquire row names of a Hail Table, assuming key has been set
## S4 method for signature 'hail.table.Table' rownames(x, do.NULL = TRUE, prefix = "row")
## S4 method for signature 'hail.table.Table' rownames(x, do.NULL = TRUE, prefix = "row")
x |
instance of hail.table.Table |
do.NULL |
not used |
prefix |
not used |
character()
character vector
To try example, run 'example("rownames,hail.table.Table-method")'
hl <- hail_init() annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") rt <- rownames(tab) length(rt) head(rt)
hl <- hail_init() annopath <- path_1kg_annotations() tab <- hl$import_table(annopath, impute = TRUE)$key_by("Sample") rt <- rownames(tab) length(rt) head(rt)
top-level annotation harvesting from columns of summary statistics MatrixTable
top2df( x, toget = c("trait_type", "phenocode", "description", "modifier", "coding_description") )
top2df( x, toget = c("trait_type", "phenocode", "description", "modifier", "coding_description") )
x |
a Struct returned from mt$cols()$collect() – which can be slow |
toget |
character() vector of field names to retrieve |
1-row data.frame
Python None values are transformed to NA
initialize ukbb
ukbb_init()
ukbb_init()
python module reference
ukbb module may be passed around
ukbb <- ukbb_init() names(ukbb)
ukbb <- ukbb_init() names(ukbb)