Title: | PathoStat Statistical Microbiome Analysis Package |
---|---|
Description: | The purpose of this package is to perform Statistical Microbiome Analysis on metagenomics results from sequencing data samples. In particular, it supports analyses on the PathoScope generated report files. PathoStat provides various functionalities including Relative Abundance charts, Diversity estimates and plots, tests of Differential Abundance, Time Series visualization, and Core OTU analysis. |
Authors: | Solaiappan Manimaran <[email protected]>, Matthew Bendall <[email protected]>, Sandro Valenzuela Diaz <[email protected]>, Eduardo Castro <[email protected]>, Tyler Faits <[email protected]>, Yue Zhao <[email protected]>, Anthony Nicholas Federico <[email protected]>, W. Evan Johnson <[email protected]> |
Maintainer: | Solaiappan Manimaran <[email protected]>, Yue Zhao <[email protected]> |
License: | GPL (>= 2) |
Version: | 1.33.0 |
Built: | 2024-10-30 09:21:39 UTC |
Source: | https://github.com/bioc/PathoStat |
Do bootstrap and LOOCV
Bootstrap_LOOCV_LR_AUC(df, targetVec, nboot = 50)
Bootstrap_LOOCV_LR_AUC(df, targetVec, nboot = 50)
df |
Row is sample, column is feature. Required |
targetVec |
y vector. Required |
nboot |
number of BOOTSTRAP |
bootstrap loocv result dataframe
data('iris') Bootstrap_LOOCV_LR_AUC(iris[,1:4], c(rep(1,100), rep(0,50)), nboot = 3)
data('iris') Bootstrap_LOOCV_LR_AUC(iris[,1:4], c(rep(1,100), rep(0,50)), nboot = 3)
Given PAM and disease/control annotation, do Chi-square test for each row of PAM
Chisq_Test_Pam(pam, label.vec.num, pvalue.cutoff = 0.05)
Chisq_Test_Pam(pam, label.vec.num, pvalue.cutoff = 0.05)
pam |
Input data object that contains the data to be tested. Required |
label.vec.num |
The target binary condition. Required |
pvalue.cutoff |
choose p-value cut-off |
df.output object
tmp <- matrix(rbinom(12,1,0.5), nrow = 3) rownames(tmp) <- c("a", "b", "c") Chisq_Test_Pam(tmp, c(1,1,0,0))
tmp <- matrix(rbinom(12,1,0.5), nrow = 3) rownames(tmp) <- c("a", "b", "c") Chisq_Test_Pam(tmp, c(1,1,0,0))
Return the Relative Abundance (RA) data for the given count OTU table
findRAfromCount(count_otu)
findRAfromCount(count_otu)
count_otu |
Count OTU table |
ra_otu Relative Abundance (RA) OTU table
data_dir <- system.file("data", package = "PathoStat") infileName <- "pstat_data.rda" pstat_test <- loadPstat(data_dir, infileName) ra_otu <- findRAfromCount(phyloseq::otu_table(pstat_test))
data_dir <- system.file("data", package = "PathoStat") infileName <- "pstat_data.rda" pstat_test <- loadPstat(data_dir, infileName) ra_otu <- findRAfromCount(phyloseq::otu_table(pstat_test))
Find the Taxonomy Information Matrix
findTaxonMat(names, taxonLevels)
findTaxonMat(names, taxonLevels)
names |
Row names of the taxonomy matrix |
taxonLevels |
Taxon Levels of all tids |
taxmat Taxonomy Information Matrix
example_data_dir <- system.file("example/data", package = "PathoStat") pathoreport_file_suffix <- "-sam-report.tsv" datlist <- readPathoscopeData(example_data_dir, pathoreport_file_suffix, input.files.name.vec = as.character(1:6)) dat <- datlist$data ids <- rownames(dat) tids <- unlist(lapply(ids, FUN = grepTid)) # taxonLevels <- findTaxonomy(tids[1:5]) # taxmat <- findTaxonMat(ids[1:5], taxonLevels)
example_data_dir <- system.file("example/data", package = "PathoStat") pathoreport_file_suffix <- "-sam-report.tsv" datlist <- readPathoscopeData(example_data_dir, pathoreport_file_suffix, input.files.name.vec = as.character(1:6)) dat <- datlist$data ids <- rownames(dat) tids <- unlist(lapply(ids, FUN = grepTid)) # taxonLevels <- findTaxonomy(tids[1:5]) # taxmat <- findTaxonMat(ids[1:5], taxonLevels)
Find the taxonomy for unlimited tids
findTaxonomy(tids)
findTaxonomy(tids)
tids |
Given taxonomy ids |
taxondata Data with the taxonomy information
example_data_dir <- system.file("example/data", package = "PathoStat") pathoreport_file_suffix <- "-sam-report.tsv" datlist <- readPathoscopeData(example_data_dir, pathoreport_file_suffix, input.files.name.vec = as.character(1:6)) dat <- datlist$data ids <- rownames(dat) tids <- unlist(lapply(ids, FUN = grepTid)) # taxonLevels <- findTaxonomy(tids[1:5])
example_data_dir <- system.file("example/data", package = "PathoStat") pathoreport_file_suffix <- "-sam-report.tsv" datlist <- readPathoscopeData(example_data_dir, pathoreport_file_suffix, input.files.name.vec = as.character(1:6)) dat <- datlist$data ids <- rownames(dat) tids <- unlist(lapply(ids, FUN = grepTid)) # taxonLevels <- findTaxonomy(tids[1:5])
Find the taxonomy for maximum 300 tids
findTaxonomy300(tids)
findTaxonomy300(tids)
tids |
Given taxonomy ids |
taxondata Data with the taxonomy information
example_data_dir <- system.file("example/data", package = "PathoStat") pathoreport_file_suffix <- "-sam-report.tsv" datlist <- readPathoscopeData(example_data_dir, pathoreport_file_suffix, input.files.name.vec = as.character(1:6)) dat <- datlist$data ids <- rownames(dat) tids <- unlist(lapply(ids, FUN = grepTid)) # taxonLevels <- findTaxonomy300(tids[1:5])
example_data_dir <- system.file("example/data", package = "PathoStat") pathoreport_file_suffix <- "-sam-report.tsv" datlist <- readPathoscopeData(example_data_dir, pathoreport_file_suffix, input.files.name.vec = as.character(1:6)) dat <- datlist$data ids <- rownames(dat) tids <- unlist(lapply(ids, FUN = grepTid)) # taxonLevels <- findTaxonomy300(tids[1:5])
Given PAM and disease/control annotation, do Chi-square test for each row of PAM
Fisher_Test_Pam(pam, label.vec.num, pvalue.cutoff = 0.05)
Fisher_Test_Pam(pam, label.vec.num, pvalue.cutoff = 0.05)
pam |
Input data object that contains the data to be tested. Required |
label.vec.num |
The target binary condition. Required |
pvalue.cutoff |
choose p-value cut-off |
df.output object
tmp <- matrix(rbinom(12,1,0.5), nrow = 3) rownames(tmp) <- c("a", "b", "c") Fisher_Test_Pam(tmp, c(1,1,0,0))
tmp <- matrix(rbinom(12,1,0.5), nrow = 3) rownames(tmp) <- c("a", "b", "c") Fisher_Test_Pam(tmp, c(1,1,0,0))
Format taxonomy table for rendering
formatTaxTable(ttable)
formatTaxTable(ttable)
ttable |
Taxonomy table |
Formatted table suitable for rendering with. DT::renderDataTable
transform cpm counts to presence-absence matrix
GET_PAM(df)
GET_PAM(df)
df |
Input data object that contains the data to be tested. Required |
df.output object
GET_PAM(data.frame(a = c(1,3,0), b = c(0,0.1,2)))
GET_PAM(data.frame(a = c(1,3,0), b = c(0,0.1,2)))
Getter function to get the shinyInput option
getShinyInput()
getShinyInput()
shinyInput option
getShinyInput()
getShinyInput()
Getter function to get the shinyInputCombat option
getShinyInputCombat()
getShinyInputCombat()
shinyInputCombat option
getShinyInputCombat()
getShinyInputCombat()
Getter function to get the shinyInputOrig option
getShinyInputOrig()
getShinyInputOrig()
shinyInputOrig option
getShinyInputOrig()
getShinyInputOrig()
Use Lasso to do feature selection
getSignatureFromMultipleGlmnet(df.input, target.vec, nfolds = 10, logisticRegression = TRUE, nRun = 100, alpha = 1)
getSignatureFromMultipleGlmnet(df.input, target.vec, nfolds = 10, logisticRegression = TRUE, nRun = 100, alpha = 1)
df.input |
Row is sample, column is feature. Required |
target.vec |
y vector. Required |
nfolds |
glmnet CV nfolds |
logisticRegression |
doing logistic regression or linear regression. |
nRun |
number of glmnet runs |
alpha |
same as in glmnet |
signature
data('iris') getSignatureFromMultipleGlmnet(iris[,1:4], c(rep(1,100), rep(0,50)), nfolds = 3, nRun = 10)
data('iris') getSignatureFromMultipleGlmnet(iris[,1:4], c(rep(1,100), rep(0,50)), nfolds = 3, nRun = 10)
Greps the tid from the given identifier string
grepTid(id)
grepTid(id)
id |
Given identifier string |
tid string
grepTid("ti|700015|org|Coriobacterium_glomerans_PW2")
grepTid("ti|700015|org|Coriobacterium_glomerans_PW2")
Loads all data from a set of PathoID reports. For each column in the PathoID report, construct a matrix where the rows are genomes and the columns are samples. Returns a list where each element is named according to the PathoID column. For example, ret[["Final.Best.Hit.Read.Numbers"]] on the result of this function will get you the final count matrix. Also includes elements "total_reads" and "total_genomes" from the first line of the PathoID report.
loadPathoscopeReports(reportfiles, nrows = NULL)
loadPathoscopeReports(reportfiles, nrows = NULL)
reportfiles |
Paths to report files |
nrows |
Option to read first N rows of PathoScope reports |
Returns a list where each element is named according to the PathoID column. For example, ret[["Final.Best.Hit.Read.Numbers"]] on the result of this function will get you the final count matrix. Also includes elements "total_reads" and "total_genomes" from the first line of the PathoID report.
input_dir <- system.file("example/data", package = "PathoStat") reportfiles <- list.files(input_dir, pattern = "*-sam-report.tsv", full.names = TRUE)
input_dir <- system.file("example/data", package = "PathoStat") reportfiles <- list.files(input_dir, pattern = "*-sam-report.tsv", full.names = TRUE)
Load the R data(.rda) file with pathostat object
loadPstat(indir = ".", infileName = "pstat_data.rda")
loadPstat(indir = ".", infileName = "pstat_data.rda")
indir |
Input Directory of the .rda file |
infileName |
File name of the .rda file |
pstat pathostat object (NULL if it does not exist)
data_dir <- system.file("data", package = "PathoStat") infileName <- "pstat_data.rda" pstat <- loadPstat(data_dir, infileName)
data_dir <- system.file("data", package = "PathoStat") infileName <- "pstat_data.rda" pstat <- loadPstat(data_dir, infileName)
Compute log2(counts per mil reads) and library size for each sample
log2CPM(qcounts, lib.size = NULL)
log2CPM(qcounts, lib.size = NULL)
qcounts |
quantile normalized counts |
lib.size |
default is colsums(qcounts) |
list containing log2(quantile counts per mil reads) and library sizes
log2CPM(matrix(1:12, nrow = 3))
log2CPM(matrix(1:12, nrow = 3))
LOOCV
LOOAUC_simple_multiple_noplot_one_df(df, targetVec)
LOOAUC_simple_multiple_noplot_one_df(df, targetVec)
df |
Row is sample, column is feature. Required |
targetVec |
y vector. Required |
mean auc
data('iris') LOOAUC_simple_multiple_noplot_one_df(iris[,1:4], c(rep(1,100), rep(0,50)))
data('iris') LOOAUC_simple_multiple_noplot_one_df(iris[,1:4], c(rep(1,100), rep(0,50)))
LOOCV with ROC curve
LOOAUC_simple_multiple_one_df(df, targetVec)
LOOAUC_simple_multiple_one_df(df, targetVec)
df |
Row is sample, column is feature. Required |
targetVec |
y vector. Required |
the ROC
data('iris') LOOAUC_simple_multiple_one_df(iris[,1:4], c(rep(1,100), rep(0,50)))
data('iris') LOOAUC_simple_multiple_one_df(iris[,1:4], c(rep(1,100), rep(0,50)))
Contains all currently-supported BatchQC output data classes:
slots:
a single object of class otu_tableOrNULL
a single object of class otu_tableOrNULL
a single object of class otu_tableOrNULL
a single object of class otu_tableOrNULL
otumat = matrix(sample(1:100, 100, replace = TRUE), nrow = 10, ncol = 10) rownames(otumat) <- paste0("OTU", 1:nrow(otumat)) colnames(otumat) <- paste0("Sample", 1:ncol(otumat)) taxmat = matrix(sample(letters, 70, replace = TRUE), nrow = nrow(otumat), ncol = 7) rownames(taxmat) <- rownames(otumat) colnames(taxmat) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species") OTU = phyloseq::otu_table(otumat, taxa_are_rows = TRUE) TAX = phyloseq::tax_table(taxmat) physeq = phyloseq::phyloseq(OTU, TAX) pathostat1(physeq)
otumat = matrix(sample(1:100, 100, replace = TRUE), nrow = 10, ncol = 10) rownames(otumat) <- paste0("OTU", 1:nrow(otumat)) colnames(otumat) <- paste0("Sample", 1:ncol(otumat)) taxmat = matrix(sample(letters, 70, replace = TRUE), nrow = nrow(otumat), ncol = 7) rownames(taxmat) <- rownames(otumat) colnames(taxmat) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species") OTU = phyloseq::otu_table(otumat, taxa_are_rows = TRUE) TAX = phyloseq::tax_table(taxmat) physeq = phyloseq::phyloseq(OTU, TAX) pathostat1(physeq)
Compute percentage
percent(x, digits = 2, format = "f")
percent(x, digits = 2, format = "f")
x |
a number or a vector |
digits |
how many digit of percentage |
format |
numeric format, "f" for float |
the percentage
pecent.vec <- percent(c(0.9, 0.98))
pecent.vec <- percent(c(0.9, 0.98))
Further details.
phyloseq_to_edgeR(physeq, group, method = "RLE", ...)
phyloseq_to_edgeR(physeq, group, method = "RLE", ...)
physeq |
(Required). |
group |
(Required). A character vector or factor giving the experimental group/condition for each sample/library. |
method |
(Optional). |
... |
Additional arguments passed on to |
dispersion
data_dir_test <- system.file("data", package = "PathoStat") pstat_test <- loadPstat(indir=data_dir_test, infileName="pstat_data_2_L1.rda") phyloseq_to_edgeR(pstat_test, group="Sex")
data_dir_test <- system.file("data", package = "PathoStat") pstat_test <- loadPstat(indir=data_dir_test, infileName="pstat_data_2_L1.rda") phyloseq_to_edgeR(pstat_test, group="Sex")
Plot PCA
plotPCAPlotly(df.input, condition.color.vec, condition.color.name = "condition", condition.shape.vec = NULL, condition.shape.name = "condition", columnTitle = "Title", pc.a = "PC1", pc.b = "PC2")
plotPCAPlotly(df.input, condition.color.vec, condition.color.name = "condition", condition.shape.vec = NULL, condition.shape.name = "condition", columnTitle = "Title", pc.a = "PC1", pc.b = "PC2")
df.input |
Input data object that contains the data to be plotted. Required |
condition.color.vec |
color vector. Required |
condition.color.name |
color variable name. Required |
condition.shape.vec |
shape vector. Required |
condition.shape.name |
shape variable name. Required |
columnTitle |
Title to be displayed at top of heatmap. |
pc.a |
pc.1 |
pc.b |
pc.2 |
the plot
data('iris') plotPCAPlotly(t(iris[,1:4]), condition.color.vec = c(rep(1,100), rep(0,50)), condition.shape.vec = c(rep(0,100), rep(1,50)))
data('iris') plotPCAPlotly(t(iris[,1:4]), condition.color.vec = c(rep(1,100), rep(0,50)), condition.shape.vec = c(rep(0,100), rep(1,50)))
Plot PCoA
plotPCoAPlotly(physeq.input, condition.color.vec, condition.color.name = "condition", condition.shape.vec = NULL, condition.shape.name = "condition", method = "bray", columnTitle = "Title", pc.a = "Axis.1", pc.b = "Axis.2")
plotPCoAPlotly(physeq.input, condition.color.vec, condition.color.name = "condition", condition.shape.vec = NULL, condition.shape.name = "condition", method = "bray", columnTitle = "Title", pc.a = "Axis.1", pc.b = "Axis.2")
physeq.input |
Input data object that contains the data to be plotted. Required |
condition.color.vec |
color vector. Required |
condition.color.name |
color variable name. Required |
condition.shape.vec |
shape vector. Required |
condition.shape.name |
shape variable name. Required |
method |
which distance metric |
columnTitle |
Title to be displayed at top of heatmap. |
pc.a |
pc.1 |
pc.b |
pc.2 |
the plot
data_dir_test <- system.file("data", package = "PathoStat") pstat_test <- loadPstat(indir=data_dir_test, infileName="pstat_data_2_L1.rda") plotPCoAPlotly(pstat_test, condition.color.vec = rbinom(33,1,0.5), condition.shape.vec = rbinom(33,1,0.5))
data_dir_test <- system.file("data", package = "PathoStat") pstat_test <- loadPstat(indir=data_dir_test, infileName="pstat_data_2_L1.rda") plotPCoAPlotly(pstat_test, condition.color.vec = rbinom(33,1,0.5), condition.shape.vec = rbinom(33,1,0.5))
This example data consists of 33 samples from a diet study with 11 subjects taking 3 different diets in random order
pstat
pstat
pathostat object extension of phyloseq-class experiment-level object:
OTU table with 41 taxa and 33 samples
Sample Data with 33 samples by 18 sample variables
Taxonomy Table with 41 taxa by 9 taxonomic ranks
Phylogenetic Tree with 41 tips and 40 internal nodes
pathostat object
Reads the data from PathoScope reports and returns a list of final guess relative abundance and count data
readPathoscopeData(input_dir = ".", pathoreport_file_suffix = "-sam-report.tsv", use.input.files = FALSE, input.files.path.vec = NULL, input.files.name.vec = NULL)
readPathoscopeData(input_dir = ".", pathoreport_file_suffix = "-sam-report.tsv", use.input.files = FALSE, input.files.path.vec = NULL, input.files.name.vec = NULL)
input_dir |
Directory where the tsv files from PathoScope are located |
pathoreport_file_suffix |
PathoScope report files suffix |
use.input.files |
whether input dir to pathoscope files or directly pathoscope files |
input.files.path.vec |
vector of pathoscope file paths |
input.files.name.vec |
vector of pathoscope file names |
List of final guess relative abundance and count data
example_data_dir <- system.file("example/data", package = "PathoStat") pathoreport_file_suffix <- "-sam-report.tsv" datlist <- readPathoscopeData(example_data_dir, pathoreport_file_suffix, input.files.name.vec = as.character(1:6))
example_data_dir <- system.file("example/data", package = "PathoStat") pathoreport_file_suffix <- "-sam-report.tsv" datlist <- readPathoscopeData(example_data_dir, pathoreport_file_suffix, input.files.name.vec = as.character(1:6))
Statistical Microbiome Analysis on the pathostat input and generates a html report and produces interactive shiny app plots
runPathoStat(pstat = NULL, report_dir = ".", report_option_binary = "111111111", interactive = TRUE)
runPathoStat(pstat = NULL, report_dir = ".", report_option_binary = "111111111", interactive = TRUE)
pstat |
phyloseq extension pathostat object |
report_dir |
Output report directory path |
report_option_binary |
9 bits Binary String representing the plots to display and hide in the report |
interactive |
when TRUE, opens the interactive shinyApp |
outputfile The output file with all the statistical plots
runPathoStat(interactive = FALSE)
runPathoStat(interactive = FALSE)
Save the pathostat object to R data(.rda) file
savePstat(pstat, outdir = ".", outfileName = "pstat_data.rda")
savePstat(pstat, outdir = ".", outfileName = "pstat_data.rda")
pstat |
pathostat object |
outdir |
Output Directory of the .rda file |
outfileName |
File name of the .rda file |
outfile .rda file
data_dir_test <- system.file("data", package = "PathoStat") pstat_test <- loadPstat(indir=data_dir_test, infileName="pstat_data_2_L1.rda") outfile <- savePstat(pstat_test)
data_dir_test <- system.file("data", package = "PathoStat") pstat_test <- loadPstat(indir=data_dir_test, infileName="pstat_data_2_L1.rda") outfile <- savePstat(pstat_test)
Setter function to set the shinyInput option
setShinyInput(x)
setShinyInput(x)
x |
shinyInput option |
shinyInput option
setShinyInput(NULL)
setShinyInput(NULL)
Setter function to set the shinyInputCombat option
setShinyInputCombat(x)
setShinyInputCombat(x)
x |
shinyInputCombat option |
shinyInputCombat option
setShinyInputCombat(NULL)
setShinyInputCombat(NULL)
Setter function to set the shinyInputOrig option
setShinyInputOrig(x)
setShinyInputOrig(x)
x |
shinyInputOrig option |
shinyInputOrig option
setShinyInputOrig(NULL)
setShinyInputOrig(NULL)
Creates a table of summary metrics
summarizeTable(pstat)
summarizeTable(pstat)
pstat |
Input pstat |
A data.frame object of summary metrics.
data_dir_test <- system.file("data", package = "PathoStat") pstat_test <- loadPstat(indir=data_dir_test, infileName="pstat_data_2_L1.rda") st.tmp <- summarizeTable(pstat_test)
data_dir_test <- system.file("data", package = "PathoStat") pstat_test <- loadPstat(indir=data_dir_test, infileName="pstat_data_2_L1.rda") st.tmp <- summarizeTable(pstat_test)
Find the taxonomy for the given taxon id name
TranslateIdToTaxLevel(pstat, input.id.vec, tax.level)
TranslateIdToTaxLevel(pstat, input.id.vec, tax.level)
pstat |
pathostat object |
input.id.vec |
names containing id |
tax.level |
target taxon level |
target taxon level names
data_dir_test <- system.file("data", package = "PathoStat") pstat_test <- loadPstat(indir=data_dir_test, infileName="pstat_data_2_L1.rda") names.new <- TranslateIdToTaxLevel(pstat_test, c("ti|862962|org|Bacteroides_fragilis_638R", "ti|697329|org|Ruminococcus_albus_7" ), "genus")
data_dir_test <- system.file("data", package = "PathoStat") pstat_test <- loadPstat(indir=data_dir_test, infileName="pstat_data_2_L1.rda") names.new <- TranslateIdToTaxLevel(pstat_test, c("ti|862962|org|Bacteroides_fragilis_638R", "ti|697329|org|Ruminococcus_albus_7" ), "genus")
Mann-whitney test for a dataframe
Wilcox_Test_df(df, label.vec.num, pvalue.cutoff = 0.05)
Wilcox_Test_df(df, label.vec.num, pvalue.cutoff = 0.05)
df |
Input data object that contains the data to be tested. Required |
label.vec.num |
The target binary condition. Required |
pvalue.cutoff |
choose p-value cut-off |
df.output object
data('iris') Wilcox_Test_df(t(iris[,1:4]), c(rep(1,100), rep(0,50)))
data('iris') Wilcox_Test_df(t(iris[,1:4]), c(rep(1,100), rep(0,50)))