Title: | interface to python sklearn via Rstudio reticulate |
---|---|
Description: | This package provides interfaces to selected sklearn elements, and demonstrates fault tolerant use of python modules requiring extensive iteration. |
Authors: | Vince Carey [cre, aut] |
Maintainer: | Vince Carey <[email protected]> |
License: | Artistic-2.0 |
Version: | 1.29.0 |
Built: | 2024-12-09 06:32:02 UTC |
Source: | https://github.com/bioc/BiocSklearn |
create a file connection to HDF5 matrix
h5mat(infile, mode = "r", ...)
h5mat(infile, mode = "r", ...)
infile |
a pathname to an HDF5 file |
mode |
character(1) defaults to "r", see py_help for h5py.File |
... |
unused |
instance of (S3) h5py._hl.files.File
The result of this function must be used with basiliskRun with the env argument set to bsklenv, or there is a risk of inconsistent python modules being invoked. This should only be used with the persistent environment discipline of basilisk.
if (interactive()) { # not clear why fn = system.file("ban_6_17/assays.h5", package="BiocSklearn") proc = basilisk::basiliskStart(BiocSklearn:::bsklenv) basilisk::basiliskRun(proc, function(infile, mode="r") { h5py = reticulate::import("h5py") hh = h5py$File( infile, mode=mode ) cat("File reference:\n ") print(hh) cat("File attributes in python:\n ") print(head(names(hh))) cat("File keys in python:\n ") print(hh$keys()) cat("HDF5 dataset in python:\n ") print(hh['assay001']) }, infile=fn, mode="r") basilisk::basiliskStop(proc) }
if (interactive()) { # not clear why fn = system.file("ban_6_17/assays.h5", package="BiocSklearn") proc = basilisk::basiliskStart(BiocSklearn:::bsklenv) basilisk::basiliskRun(proc, function(infile, mode="r") { h5py = reticulate::import("h5py") hh = h5py$File( infile, mode=mode ) cat("File reference:\n ") print(hh) cat("File attributes in python:\n ") print(head(names(hh))) cat("File keys in python:\n ") print(hh$keys()) cat("HDF5 dataset in python:\n ") print(hh['assay001']) }, infile=fn, mode="r") basilisk::basiliskStop(proc) }
obtain an HDF5 dataset reference suitable for handling as numpy matrix
H5matref(filename, dsname = "assay001")
H5matref(filename, dsname = "assay001")
filename |
a pathname to an HDF5 file |
dsname |
internal name of HDF5 matrix to use, defaults to 'assay001' |
instance of (S3) "h5py._hl.dataset.Dataset"
This should only be used with persistent environment discipline of basilisk. Additional support is planned in Bioc 3.12.
fn = system.file("ban_6_17/assays.h5", package="BiocSklearn") ban = H5matref(fn) ban proc = basilisk::basiliskStart(BiocSklearn:::bsklenv) fullpca = basilisk::basiliskRun(proc, function() { np = import("numpy", convert=FALSE) # ensure print(ban$shape) print(np$take(ban, 0:3, 0L)) fullpca = skPCA(ban) tx = getTransformed(fullpca) print(dim(tx)) fullpca }) basilisk::basiliskStop(proc) # project samples np = reticulate::import("numpy", convert=FALSE, delay_load=TRUE) np$take(ban, 0:20, 0L)$shape st = skPartialPCA_step(np$take(ban, 0:20, 0L), n_comp=4L) st = skPartialPCA_step(np$take(ban, 21:40, 0L), n_comp=4L, obj=st) st = skPartialPCA_step(np$take(ban, 41:63, 0L), n_comp=4L, obj=st) oo = st$transform(ban) dim(oo) cor(oo[,1:4], getTransformed(fullpca)[,1:4])
fn = system.file("ban_6_17/assays.h5", package="BiocSklearn") ban = H5matref(fn) ban proc = basilisk::basiliskStart(BiocSklearn:::bsklenv) fullpca = basilisk::basiliskRun(proc, function() { np = import("numpy", convert=FALSE) # ensure print(ban$shape) print(np$take(ban, 0:3, 0L)) fullpca = skPCA(ban) tx = getTransformed(fullpca) print(dim(tx)) fullpca }) basilisk::basiliskStop(proc) # project samples np = reticulate::import("numpy", convert=FALSE, delay_load=TRUE) np$take(ban, 0:20, 0L)$shape st = skPartialPCA_step(np$take(ban, 0:20, 0L), n_comp=4L) st = skPartialPCA_step(np$take(ban, 21:40, 0L), n_comp=4L, obj=st) st = skPartialPCA_step(np$take(ban, 41:63, 0L), n_comp=4L, obj=st) oo = st$transform(ban) dim(oo) cor(oo[,1:4], getTransformed(fullpca)[,1:4])
constructor for SkDecomp
SkDecomp(transform, method)
SkDecomp(transform, method)
transform |
typically a numerical matrix representing a projection of the input |
method |
character(1) arbitrary tag describing the decomposition |
container for sklearn objects and transforms
## S4 method for signature 'SkDecomp' getTransformed(x)
## S4 method for signature 'SkDecomp' getTransformed(x)
x |
instance of SkDecomp |
the getTransformed method returns a matrix
transform
stored as R matrix
method
string identifying method
In Bioc 3.11, the object
slot is removed. This is a consequence
of adoption of basilisk discipline for acquiring and using python resources,
which greatly increases reliability, at the expense of added complication in
handling python objects interactively in R. We are working on restoring
this functionality but it will take time.
use basilisk discipline to perform partial (n_components) incremental (chunk.size) PCA with scikit.decomposition
skIncrPartialPCA(mat, n_components, chunk.size = 10)
skIncrPartialPCA(mat, n_components, chunk.size = 10)
mat |
a matrix |
n_components |
integer(1) number of PCs to compute |
chunk.size |
integer(1) number of rows to use each step |
A good source for capabilities and examples is at the sklearn doc site.
lk = skIncrPartialPCA(iris[,1:4], n_components=3L) lk head(getTransformed(lk))
lk = skIncrPartialPCA(iris[,1:4], n_components=3L) lk head(getTransformed(lk))
use sklearn IncrementalPCA procedure
skIncrPCA(mat, n_components = 2L, batch_size = 5L, ...)
skIncrPCA(mat, n_components = 2L, batch_size = 5L, ...)
mat |
a matrix – can be R matrix or numpy.ndarray, if the latter it must be set up in a basilisk persistent environment, and that is not currently demonstrated for this package. |
n_components |
number of PCA to retrieve |
batch_size |
number of records to use at each iteration |
... |
passed to python IncrementalPCA |
matrix with rotation
dem = skIncrPCA(iris[,1:4], batch_size=25L) dem2 = skIncrPCA(iris[,1:4], batch_size=25L, n_components=2L) dem dem2
dem = skIncrPCA(iris[,1:4], batch_size=25L) dem2 = skIncrPCA(iris[,1:4], batch_size=25L, n_components=2L) dem dem2
demo of HDF5 processing with incremental PCA/batch_size/fit_transform
skIncrPCA_h5(fn, dsname = "assay001", n_components, chunk.size = 10L)
skIncrPCA_h5(fn, dsname = "assay001", n_components, chunk.size = 10L)
fn |
character(1) path to HDF5 file |
dsname |
character(1) name of dataset within HDF5 file, assumed to be 2-dimensional array |
n_components |
numeric(1) passed to IncrementalPCA |
chunk.size |
numeric(1) passed to IncrementalPCA as batch_size |
Here we use IncrementalPCA$fit_transform and let python take care of chunk retrieval.
skIncrPartialPCA
acquires chunks from R matrix and uses IncrementalPCA$partial_fit.
if (interactive()) { fn = system.file("hdf5/irmatt.h5", package="BiocSklearn") # 'transposed' relative to R iris dem = skIncrPCA_h5(fn, n_components=3L, dsname="tquants") dem head(getTransformed(dem)) }
if (interactive()) { fn = system.file("hdf5/irmatt.h5", package="BiocSklearn") # 'transposed' relative to R iris dem = skIncrPCA_h5(fn, n_components=3L, dsname="tquants") dem head(getTransformed(dem)) }
optionally fault tolerant incremental partial PCA for projection of samples from SummarizedExperiment
skIncrPPCA( se, chunksize, n_components, assayind = 1, picklePath = "./skIdump.pkl", matTx = force, ... )
skIncrPPCA( se, chunksize, n_components, assayind = 1, picklePath = "./skIdump.pkl", matTx = force, ... )
se |
instance of SummarizedExperiment |
chunksize |
integer number of samples per step |
n_components |
integer number of PCs to compute |
assayind |
not used, assumed set to 1 |
picklePath |
if non-null, incremental results saved here via joblib.dump, for each chunk. If NULL, no saving of incremental results. |
matTx |
a function defaulting to force() that accepts a matrix and returns a matrix with identical dimensions, e.g., |
... |
not used |
python instance of sklearn.decomposition.incremental_pca.IncrementalPCA
Will treat samples as records and all features (rows)
as attributes, projecting.
to an n_components
-dimensional space. Method will
acquire chunk of assay data
and transpose before computing PCA contributions.
In case of crash, restore from picklePath
using
joblib$load
after loading reticulate. You can
use the n_samples_seen_
component of the restored
python reference to determine where to restart.
You can manage resumption using skPartialPCA_step
.
# demo SE made with TENxGenomics: # mm = matrixSummarizedExperiment(h5path, 1:27998, 1:750) # saveHDF5SummarizedExperiment(mm, "tenx_750") # if (FALSE) { if (requireNamespace("HDF5Array")) { se750 = HDF5Array::loadHDF5SummarizedExperiment( system.file("hdf5/tenx_750", package="BiocSklearn")) lit = skIncrPPCA(se750[, 1:50], chunksize=5, n_components=4) round(cor(pypc <- lit$transform(dat <- t(as.matrix(assay(se750[,1:50]))))),3) rpc = prcomp(dat) round(cor(rpc$x[,1:4], pypc), 3) } # this has to be made basilisk-compliant } # and is blocked until then
# demo SE made with TENxGenomics: # mm = matrixSummarizedExperiment(h5path, 1:27998, 1:750) # saveHDF5SummarizedExperiment(mm, "tenx_750") # if (FALSE) { if (requireNamespace("HDF5Array")) { se750 = HDF5Array::loadHDF5SummarizedExperiment( system.file("hdf5/tenx_750", package="BiocSklearn")) lit = skIncrPPCA(se750[, 1:50], chunksize=5, n_components=4) round(cor(pypc <- lit$transform(dat <- t(as.matrix(assay(se750[,1:50]))))),3) rpc = prcomp(dat) round(cor(rpc$x[,1:4], pypc), 3) } # this has to be made basilisk-compliant } # and is blocked until then
interface to sklearn.cluster.KMeans using basilisk discipline
skKMeans(mat, ...)
skKMeans(mat, ...)
mat |
a matrix-like datum or reference to such |
... |
arguments to sklearn.cluster.KMeans |
a list with cluster assignments (integers starting with zero) and asserted cluster centers.
This is a demonstrative interface to the resources of sklearn.cluster. In this particular interface, we are using sklearn.cluster.k_means_.KMeans. There are many other possibilities in sklearn.cluster: _dbscan_inner, feature_agglomeration, hierarchical, k_means, k_means_elkan, affinity_propagation, bicluster, birch, dbscan, hierarchical, k_means, mean_shift, setup, spectral.
Basilisk discipline has not been used for this function, 1 June 2022.
irloc = system.file("csv/iris.csv", package="BiocSklearn") np = reticulate::import("numpy", delay_load=TRUE, convert=FALSE) h5py = reticulate::import("h5py", delay_load=TRUE) irismat = np$genfromtxt(irloc, delimiter=',') ans = skKMeans(irismat, n_clusters=2L) names(ans) # names of available result components table(iris$Species, ans$labels) # now use an HDF5 reference irh5 = system.file("hdf5/irmat.h5", package="BiocSklearn") fref = h5py$File(irh5) ds = fref$`__getitem__`("quants") ans2 = skKMeans(np$array(ds)$T, n_clusters=2L) # HDF5 matrix is transposed relative to python array layout! Is the np$array conversion unduly costly? table(ans$labels, ans2$labels) ans3 = skKMeans(np$array(ds)$T, n_clusters=8L, max_iter=200L, algorithm="lloyd", random_state=20L) dem = skKMeans(iris[,1:4], n_clusters=3L, max_iter=100L, algorithm="lloyd", random_state=20L) str(dem) tab = table(iris$Species, dem$labels) tab plot(iris[,1], iris[,3], col=as.numeric(factor(iris$Species))) points(dem$centers[,1], dem$centers[,3], pch=19, col=apply(tab,2,which.max))
irloc = system.file("csv/iris.csv", package="BiocSklearn") np = reticulate::import("numpy", delay_load=TRUE, convert=FALSE) h5py = reticulate::import("h5py", delay_load=TRUE) irismat = np$genfromtxt(irloc, delimiter=',') ans = skKMeans(irismat, n_clusters=2L) names(ans) # names of available result components table(iris$Species, ans$labels) # now use an HDF5 reference irh5 = system.file("hdf5/irmat.h5", package="BiocSklearn") fref = h5py$File(irh5) ds = fref$`__getitem__`("quants") ans2 = skKMeans(np$array(ds)$T, n_clusters=2L) # HDF5 matrix is transposed relative to python array layout! Is the np$array conversion unduly costly? table(ans$labels, ans2$labels) ans3 = skKMeans(np$array(ds)$T, n_clusters=8L, max_iter=200L, algorithm="lloyd", random_state=20L) dem = skKMeans(iris[,1:4], n_clusters=3L, max_iter=100L, algorithm="lloyd", random_state=20L) str(dem) tab = table(iris$Species, dem$labels) tab plot(iris[,1], iris[,3], col=as.numeric(factor(iris$Species))) points(dem$centers[,1], dem$centers[,3], pch=19, col=apply(tab,2,which.max))
take a step in sklearn IncrementalPCA partial fit procedure
skPartialPCA_step(mat, n_components, obj)
skPartialPCA_step(mat, n_components, obj)
mat |
a matrix – can be R matrix or numpy.ndarray |
n_components |
number of PCA to retrieve |
obj |
sklearn.decomposition.IncrementalPCA instance |
trained IncrementalPCA reference, to which 'transform' method can be applied to obtain projection for any compliant input
if obj is missing, the process is initialized with the matrix provided
# these steps are not basilisk-compliant, you need to acquire references irloc = system.file("csv/iris.csv", package="BiocSklearn") np = reticulate::import("numpy", delay_load=TRUE, convert=FALSE) irismat = np$genfromtxt(irloc, delimiter=',') ta = np$take ipc = skPartialPCA_step(ta(irismat,0:49,0L)) ipc = skPartialPCA_step(ta(irismat,50:99,0L), obj=ipc) ipc = skPartialPCA_step(ta(irismat,100:149,0L), obj=ipc) head(names(ipc)) ipc$transform(ta(irismat,0:5,0L)) fullproj = ipc$transform(irismat) fullpc = prcomp(data.matrix(iris[,1:4]))$x round(cor(fullpc,fullproj),3)
# these steps are not basilisk-compliant, you need to acquire references irloc = system.file("csv/iris.csv", package="BiocSklearn") np = reticulate::import("numpy", delay_load=TRUE, convert=FALSE) irismat = np$genfromtxt(irloc, delimiter=',') ta = np$take ipc = skPartialPCA_step(ta(irismat,0:49,0L)) ipc = skPartialPCA_step(ta(irismat,50:99,0L), obj=ipc) ipc = skPartialPCA_step(ta(irismat,100:149,0L), obj=ipc) head(names(ipc)) ipc$transform(ta(irismat,0:5,0L)) fullproj = ipc$transform(irismat) fullpc = prcomp(data.matrix(iris[,1:4]))$x round(cor(fullpc,fullproj),3)
use sklearn PCA procedure
skPCA(mat, ...)
skPCA(mat, ...)
mat |
a matrix – can be R matrix or numpy.ndarray |
... |
additional parameters passed to sklearn.decomposition.PCA, for additional information use |
matrix with rotation
If no additional arguments are passed, all defaults are used.
#irloc = system.file("csv/iris.csv", package="BiocSklearn") #irismat = SklearnEls()$np$genfromtxt(irloc, delimiter=',') #skpi = skPCA(irismat) #getTransformed(skpi)[1:5,] chk = skPCA(data.matrix(iris[,1:4])) chk head(getTransformed(chk)) head(prcomp(data.matrix(iris[,1:4]))$x)
#irloc = system.file("csv/iris.csv", package="BiocSklearn") #irismat = SklearnEls()$np$genfromtxt(irloc, delimiter=',') #skpi = skPCA(irismat) #getTransformed(skpi)[1:5,] chk = skPCA(data.matrix(iris[,1:4])) chk head(getTransformed(chk)) head(prcomp(data.matrix(iris[,1:4]))$x)
use sklearn pairwise_distances procedure
skPWD(mat, ...)
skPWD(mat, ...)
mat |
a matrix – can be R matrix or numpy.ndarray |
... |
additional parameters passed to sklearn.metrics.pairwise_distances, for additional information use |
matrix with rotation
If no additional arguments are passed, all defaults are used.
irloc = system.file("csv/iris.csv", package="BiocSklearn") data(iris) irismat = as.matrix(iris[,1:4]) chk1 = skPWD(irismat) chk1[1:4,1:5] chk2 = skPWD(irismat, metric='manhattan') chk2[1:4,1:5]
irloc = system.file("csv/iris.csv", package="BiocSklearn") data(iris) irismat = as.matrix(iris[,1:4]) chk1 = skPWD(irismat) chk1[1:4,1:5] chk2 = skPWD(irismat, metric='manhattan') chk2[1:4,1:5]