Filter phylogenetic profiles
Description
Create the filtered data needed for plotting or clustering
phylogenetic profiles. NOTE: this function requires some intermediate steps
that use the results of other functions. If you would like to get fully
processed data from the raw input, please use the function
fromInputToProfile() instead!
Usage
filterProfileData(DF, taxaCount, refTaxon = NULL,
percentCO = c(0, 1), coorthoCOMax = 9999,
var1CO = c(0, 1), var2CO = c(0, 1), var1Rel = "protein",
var2Rel = "protein", groupByCat = FALSE, catDt = NULL,
var1AggregateBy = "max", var2AggregateBy = "max")
filterProfileData(DF, taxaCount, refTaxon = NULL,
percentCO = c(0, 1), coorthoCOMax = 9999,
var1CO = c(0, 1), var2CO = c(0, 1), var1Rel = "protein",
var2Rel = "protein", groupByCat = FALSE, catDt = NULL,
var1AggregateBy = "max", var2AggregateBy = "max")
Arguments
DF |
a reduced dataframe containing info for all phylogenetic
profiles in the selected taxonomy rank.
|
taxaCount |
dataframe counting present taxa in each supertaxon
|
refTaxon |
selected reference taxon. NOTE: This taxon will not be
affected by the filtering. If you want to filter all, set refTaxon <- NULL.
Default = NULL.
|
percentCO |
min and max cutoffs for percentage of species present
in a supertaxon. Default = c(0, 1).
|
coorthoCOMax |
maximum number of co-orthologs allowed. Default =
9999.
|
var1CO |
min and max cutoffs for var1. Default = c(0, 1).
|
var2CO |
min and max cutoffs for var2. Default = c(0, 1).
|
var1Rel |
relation of var1 ("protein" for protein-protein or
"species" for protein-species). Default = "protein".
|
var2Rel |
relation of var2 ("protein" for protein-protein or
"species" for protein-species). Default = "protein".
|
groupByCat |
group genes by their categories (TRUE or FALSE). Default =
FALSE.
|
catDt |
dataframe containing gene categories
(optional, NULL if groupByCat = FALSE or no info provided). Default = NULL.
|
var1AggregateBy |
aggregate method for VAR1 (max, min, mean
or median), applied for calculating var1 of supertaxa. Default = "max".
|
var2AggregateBy |
aggregate method for VAR2 (max, min, mean
or median), applied for calculating var2 of supertaxa. Default = "max".
|
Value
A filtered dataframe for generating the profile plot, including seed gene
IDs (or orthologous group IDs), their ortholog IDs and the corresponding
(super)taxa, (super)taxon IDs, the number of co-orthologs in each (super)taxon,
the values of the two additional variables var1 and var2,
the supertaxon, and the categories of seed genes (or ortholog groups).
Author(s)
Vinh Tran [email protected]
See Also
parseInfoProfile and reduceProfile for generating the input dataframe,
fullProcessedProfile for a demo fully processed profile dataframe,
fromInputToProfile for generating fully processed data from raw input.
Examples
# NOTE: this function requires some intermediate steps that use the results of
# other functions. If you would like to get fully processed data from the
# raw input, please use the function fromInputToProfile() instead!
library(dplyr)
data("fullProcessedProfile", package="PhyloProfile")
rankName <- "class"
refTaxon <- "Mammalia"
percentCutoff <- c(0.0, 1.0)
coorthologCutoffMax <- 10
var1Cutoff <- c(0.75, 1.0)
var2Cutoff <- c(0.5, 1.0)
var1Relation <- "protein"
var2Relation <- "species"
groupByCat <- FALSE
catDt <- NULL
var1AggregateBy <- "max"
var2AggregateBy <- "max"
taxonIDs <- levels(as.factor(fullProcessedProfile$ncbiID))
sortedInputTaxa <- sortInputTaxa(
taxonIDs, rankName, refTaxon, NULL, NULL
)
taxaCount <- sortedInputTaxa %>% dplyr::group_by(supertaxon) %>%
summarise(n = n(), .groups = "drop")
filterProfileData(
fullProcessedProfile,
taxaCount,
refTaxon,
percentCutoff,
coorthologCutoffMax,
var1Cutoff,
var2Cutoff,
var1Relation,
var2Relation,
groupByCat,
catDt,
var1AggregateBy,
var2AggregateBy
)
library(dplyr)
data("fullProcessedProfile", package="PhyloProfile")
rankName <- "class"
refTaxon <- "Mammalia"
percentCutoff <- c(0.0, 1.0)
coorthologCutoffMax <- 10
var1Cutoff <- c(0.75, 1.0)
var2Cutoff <- c(0.5, 1.0)
var1Relation <- "protein"
var2Relation <- "species"
groupByCat <- FALSE
catDt <- NULL
var1AggregateBy <- "max"
var2AggregateBy <- "max"
taxonIDs <- levels(as.factor(fullProcessedProfile$ncbiID))
sortedInputTaxa <- sortInputTaxa(
taxonIDs, rankName, refTaxon, NULL, NULL
)
taxaCount <- sortedInputTaxa %>% dplyr::group_by(supertaxon) %>%
summarise(n = n(), .groups = "drop")
filterProfileData(
fullProcessedProfile,
taxaCount,
refTaxon,
percentCutoff,
coorthologCutoffMax,
var1Cutoff,
var2Cutoff,
var1Relation,
var2Relation,
groupByCat,
catDt,
var1AggregateBy,
var2AggregateBy
)