Title: | Search harmonized metadata created under the OmicsMLRepo project |
---|---|
Description: | This package provides functions to browse the harmonized metadata for large omics databases. This package also supports data navigation if the metadata incorporates ontology. |
Authors: | Sehyun Oh [aut, cre] |
Maintainer: | Sehyun Oh <[email protected]> |
License: | Artistic-2.0 |
Version: | 1.1.5 |
Built: | 2025-03-23 03:43:45 UTC |
Source: | https://github.com/bioc/OmicsMLRepoR |
Extract all the terms used in a queried attribute/column
availableTerms(attribute, db = "cMD")
availableTerms(attribute, db = "cMD")
attribute |
A character (1). Name of the attribute/column you want to extract the terms used under. |
db |
A character(1). Currently, only 'cMD' (curatedMetagenomicData) is supported. |
A data frame with two columns - existing values under the queried attribute ('allowedvalues' column) and their ontology term id ('ontology' column).
availableTerms("age_group") availableTerms("disease")
availableTerms("age_group") availableTerms("disease")
Extract ontology from the ontology term ids
get_ontologies(terms, delim = ":")
get_ontologies(terms, delim = ":")
terms |
A character vector |
delim |
A character. Delimiter between ontology and its id. Default is ':'. |
A character vector containing the ontology names of the input 'terms'. Its length is the same as that of the 'terms' input.
terms <- c("HP:0001824", "MONDO:0010200", "NCIT:C122328", "4471000175100") get_ontologies(terms = terms)
terms <- c("HP:0001824", "MONDO:0010200", "NCIT:C122328", "4471000175100") get_ontologies(terms = terms)
Same as the 'getWideMetaTb' function, this function accepts a single target column ('targetCol'). The target columns (and linked, accessory columns) should have the same number of elements separated by the 'delim', and the multiple values for each column belong to the same column/attribute/field, i.e., no additional column name is required/provided.
getLongMetaTb(meta, targetCol = NULL, delim = NULL)
getLongMetaTb(meta, targetCol = NULL, delim = NULL)
meta |
A data frame. Each column (and associated 'ontology_term_id' column) should use the same delimiter to separate multiple, same-numbered values. |
targetCol |
A character (1). The column name to expand if present. |
delim |
Optional. A character (1) of a delimiter used to separate multiple values in the metadata table. |
A data frame of metadata expanded so that each individual treatment has its own row.
data(mini_cmd) lmeta <- getLongMetaTb(mini_cmd, "hla") dim(mini_cmd) dim(lmeta) data(mini_cmd2) lmeta2 <- getLongMetaTb(mini_cmd2, "target_condition") head(lmeta2, 3) data(mini_cbio) trt_cols <- grep("^treatment_", colnames(mini_cbio), value = TRUE) lmeta3 <- getLongMetaTb(mini_cbio, targetCol = trt_cols) short_tb <- data.frame( ind = c("A", "B", "C", "D", "E"), aval = c("cat;dog", "chicken", "horse", "frog;pig", "snake"), cval = c(1, NA, 3, 4, 5), bval = c("red;blue", "yellow", "NA", "green;NA", "brown")) getLongMetaTb(short_tb, c("aval", "bval"), delim = ";")
data(mini_cmd) lmeta <- getLongMetaTb(mini_cmd, "hla") dim(mini_cmd) dim(lmeta) data(mini_cmd2) lmeta2 <- getLongMetaTb(mini_cmd2, "target_condition") head(lmeta2, 3) data(mini_cbio) trt_cols <- grep("^treatment_", colnames(mini_cbio), value = TRUE) lmeta3 <- getLongMetaTb(mini_cbio, targetCol = trt_cols) short_tb <- data.frame( ind = c("A", "B", "C", "D", "E"), aval = c("cat;dog", "chicken", "horse", "frog;pig", "snake"), cval = c(1, NA, 3, 4, 5), bval = c("red;blue", "yellow", "NA", "green;NA", "brown")) getLongMetaTb(short_tb, c("aval", "bval"), delim = ";")
Download a harmonized metadata table
getMetadata(database = NULL, load = TRUE)
getMetadata(database = NULL, load = TRUE)
database |
Name of the database to get the metadata from. Currently, there are two available options.
|
load |
Default is 'TRUE'. If 'FALSE', the file cache location is returned instead of the loaded metadata table. |
Curated metadata table or file cache location, if 'load = FALSE'.
cmd <- getMetadata("cMD")
cmd <- getMetadata("cMD")
Collapse values from multiple columns into one
getNarrowMetaTb( meta, newCol = NULL, targetCol = NULL, sep = ":", delim = ";", remove = TRUE, na.rm = TRUE, sort = TRUE )
getNarrowMetaTb( meta, newCol = NULL, targetCol = NULL, sep = ":", delim = ";", remove = TRUE, na.rm = TRUE, sort = TRUE )
meta |
A data frame. |
newCol |
A character (1). Name of the new column to store collapsed values. |
targetCol |
A character vector. Names of the columns to be collapsed into one column. |
sep |
A character (1). Delimiter used to concatenate column name and its value. Default is a single colon, ':'. |
delim |
A character(1). Separator to use between values/columns. Default is ';'. |
remove |
With the default, 'TRUE', this function will remove input columns from output data frame. |
na.rm |
With the default, 'TRUE', missing values will be removed prior to uniting each value. |
sort |
With the default, 'TRUE', the united columns will be ordered alphabetically. |
A data frame where target columns ('targetCol') are collapsed
into a single column. The original column name and its value are
concatenated with the 'sep' input and the column:value pairs are separated
by the 'delim' input. Target columns will be merged in the alphabetical
order of their names.
wide_tb <- data.frame(fruit = c("apple", "banana", "pear", "watermelon", "grape"), shape = c("round", "long", NA, "round", NA), color = c("red", "yellow", NA, "green", "purple"), size = c("medium", "medium", NA, "large", "small")) getNarrowMetaTb(wide_tb, newCol = "feature", targetCol = c("color", "shape", "size"), sep = ":", delim = ";")
wide_tb <- data.frame(fruit = c("apple", "banana", "pear", "watermelon", "grape"), shape = c("round", "long", NA, "round", NA), color = c("red", "yellow", NA, "green", "purple"), size = c("medium", "medium", NA, "large", "small")) getNarrowMetaTb(wide_tb, newCol = "feature", targetCol = c("color", "shape", "size"), sep = ":", delim = ";")
Extract identical or similar ontology terms across different ontologies
getOntoInfo(query, ontology = "", exact = FALSE, rows = 20)
getOntoInfo(query, ontology = "", exact = FALSE, rows = 20)
query |
A character (1) containing the search query, either a term label or term id. |
ontology |
A character vector defining the ontology to be queried. Default is the empty character, to search all ontologies. |
exact |
A logical (1) defining if OLS search is restricted to exact matches. Default is 'FALSE'. |
rows |
An integer (1) defining the number of query returns. Default is 20L. Maximum number of values returned by the server is 1000. |
A tibble containing ontology term label and description
getOntoInfo("NCIT:C4872") getOntoInfo("NCIT:C4872", ontology = c("EFO", "MONDO")) getOntoInfo("Skin Infection") getOntoInfo("Sitagliptin", ontology = c("Chebi", "apple")) ## Multiple query values getOntoInfo("plasma,membrane") getOntoInfo(c("plasma", "membrane"))
getOntoInfo("NCIT:C4872") getOntoInfo("NCIT:C4872", ontology = c("EFO", "MONDO")) getOntoInfo("Skin Infection") getOntoInfo("Sitagliptin", ontology = c("Chebi", "apple")) ## Multiple query values getOntoInfo("plasma,membrane") getOntoInfo(c("plasma", "membrane"))
Compresses expanded metadata columns to one row per sample
getShortMetaTb(meta, idCols = NULL, targetCol = NULL, delim = "<;>")
getShortMetaTb(meta, idCols = NULL, targetCol = NULL, delim = "<;>")
meta |
A data frame with expanded treatment columns. |
idCols |
Optional. A character vector of columns that identify single samples, such as 'curation_id' and 'sampleId'. Defaults to standard ID columns. |
targetCol |
Optional. A character vector of columns to compress if present. Default is names of all cBioPortal treatment-related columns. |
delim |
Optional. A delimiter string. Default is '<;>'. |
A data frame where each sample gets a single row
data(mini_cmd) lmeta <- getLongMetaTb(mini_cmd, "hla") res <- getShortMetaTb(lmeta, targetCol = "hla") dim(res) # 200 x 3 table long_tb <- data.frame(ind = c("A", "A", "B", "C", "D", "D", "E"), aval = c("cat", "dog", "chicken", "horse", "frog", "pig", "snake"), cval = c(1, 1, NA, 3, 4, 4, 5), bval = c("red", "blue", "yellow", NA, "green", NA, "brown")) getShortMetaTb(long_tb, idCols = "ind", targetCol = c("aval", "bval"))
data(mini_cmd) lmeta <- getLongMetaTb(mini_cmd, "hla") res <- getShortMetaTb(lmeta, targetCol = "hla") dim(res) # 200 x 3 table long_tb <- data.frame(ind = c("A", "A", "B", "C", "D", "D", "E"), aval = c("cat", "dog", "chicken", "horse", "frog", "pig", "snake"), cval = c(1, 1, NA, 3, 4, 4, 5), bval = c("red", "blue", "yellow", NA, "green", NA, "brown")) getShortMetaTb(long_tb, idCols = "ind", targetCol = c("aval", "bval"))
The values stored in one column should include their potential column names to use this function.
getWideMetaTb(meta, targetCol = NULL, sep = ":", delim = "<;>", remove = TRUE)
getWideMetaTb(meta, targetCol = NULL, sep = ":", delim = "<;>", remove = TRUE)
meta |
A data frame. |
targetCol |
A character (1). The column name to expand if present. Multiple attributes should be separated by the 'delim' and the column name and its value should be separated by the provided 'sep'. |
sep |
A character (1). Delimiter used to concatenate column name and its value. Default is a single colon, ':'. |
delim |
A character(1). Separator used between values. Default '<;>'. |
remove |
If 'TRUE', remove input columns from output data frame. |
A data frame where the contents under 'targetCol' are split into individual columns in alphabetical order. The data type of all expanded columns is character.
## Narrow-table example narrow_tb <- data.frame(fruit = c("apple", "banana", "pear", "watermelon", "grape"), feature = c("color:red;shape:round;size:medium", "color:yellow;shape:long;size:medium", "color:brown;shape:NA;size:NA", "color:green;shape:round;size:large", "color:purple;shape:NA;size:small")) getWideMetaTb(narrow_tb, targetCol = "feature", sep = ":", delim = ";") ## Narrow-table example with missing columns narrow_tb2 <- data.frame(fruit = c("apple", "banana", "pear", "watermelon", "grape"), feature = c("color:red;shape:round;size:medium", "color:yellow;shape:long;size:medium", NA, "color:green;size:large", "color:purple;shape:NA;size:small")) getWideMetaTb(narrow_tb2, targetCol = "feature", sep = ":", delim = ";") ## Subset of cMD metadata data(mini_cmd3) wtb <- getWideMetaTb(mini_cmd3, targetCol = "probing_pocket_depth") head(wtb)
## Narrow-table example narrow_tb <- data.frame(fruit = c("apple", "banana", "pear", "watermelon", "grape"), feature = c("color:red;shape:round;size:medium", "color:yellow;shape:long;size:medium", "color:brown;shape:NA;size:NA", "color:green;shape:round;size:large", "color:purple;shape:NA;size:small")) getWideMetaTb(narrow_tb, targetCol = "feature", sep = ":", delim = ";") ## Narrow-table example with missing columns narrow_tb2 <- data.frame(fruit = c("apple", "banana", "pear", "watermelon", "grape"), feature = c("color:red;shape:round;size:medium", "color:yellow;shape:long;size:medium", NA, "color:green;size:large", "color:purple;shape:NA;size:small")) getWideMetaTb(narrow_tb2, targetCol = "feature", sep = ":", delim = ";") ## Subset of cMD metadata data(mini_cmd3) wtb <- getWideMetaTb(mini_cmd3, targetCol = "probing_pocket_depth") head(wtb)
This function is designed for a group of collapsible metadata attributes (e.g., 'biomarker' for curatedMetagenomicData).
merge_vectors(base, update, sep = ":", delim = ";")
merge_vectors(base, update, sep = ":", delim = ";")
base |
A character. A placeholder version of the key:value concatenation (e.g., 'column1:NA;column2:NA;column3:NA') |
update |
A character. The target string to be compared and filled with 'base' if there are missing pairs (e.g., 'column1:value1;column3:value3'). |
sep |
A character string to separate the column name and value. Default is ':' |
delim |
A character string to separate the column:value pairs. Default is ';' |
A character string: the target string ('update') updated to follow the reference string ('base').
x <- "color:NA;shape:NA;size:NA" y <- "color:green;size:large" merge_vectors(x, y)
x <- "color:NA;shape:NA;size:NA" y <- "color:green;size:large" merge_vectors(x, y)
A subset of curated version of cBioPortal's clinical metadata.
mini_cbio
mini_cbio
A data frame with 10 samples and 9 columns ('curation_id', 'acronym', 'acronym_ontology_term_id', 'sex', 'package', 'treatment_name', 'treatment_name_ontology_term_id', 'treatment_type', 'treatment_type_ontology_term_id')
Sehyun Oh [email protected]
A subset of curated version of 'sampleMetadata' from the curatedMetagenomicData (cMD, ver.3.8.0) package.
mini_cmd
mini_cmd
A data frame with 200 samples and 3 columns ('curation_id', 'hla', and 'package')
Sehyun Oh [email protected]
A subset of curated version of 'sampleMetadata' from the curatedMetagenomicData (cMD, ver.3.8.0) package.
mini_cmd2
mini_cmd2
A data frame with 200 samples and 7 columns ('curation_id', 'target_condition', 'target_condition_ontology_term_id', 'pmid', 'disease', 'disease_ontology_term_id', 'package'). The two key exemplary attributes ('target_condition' and 'disease') selected here contain multiple values.
Sehyun Oh [email protected]
A subset of curated version of 'sampleMetadata' from the curatedMetagenomicData (cMD, ver.3.8.0) package.
mini_cmd3
mini_cmd3
A data frame with 200 samples and 9 columns ('curation_id', 'pmid', 'package', 'target_condition', 'feces_phenotype', 'probing_pocket_depth', 'target_condition_ontology_term_id', 'feces_phenotype_ontology_term_id', 'probing_pocket_depth_ontology_term_id'). The 'target_condition' is a multi-valued attribute, and 'feces_phenotype' and 'probing_pocket_depth' are composite attributes.
Sehyun Oh [email protected]
The OmicsMLRepoR package provides functions to browse the harmonized metadata created under the OmicsMLRepo project. It supports data navigation if the metadata incorporates ontology.
- getMetadata: Download a harmonized metadata table
- tree_filter: Find samples including the queried terms and their descendants
For more detailed information, see the vignette:
vignette("Quickstart", package = "OmicsMLRepoR")
Groups ontology terms by the child term of a provided "parent" they fall under
ontoSummarize(parent, descendants, ontology)
ontoSummarize(parent, descendants, ontology)
parent |
Character; Term to use as parent of summarized children |
descendants |
Character vector; Terms to summarize as children of parent |
ontology |
Character; Ontology database ID |
A dataframe containing the descendants summarized into groups. Name of each group is the child of the parent that the descendants are grouped under. Both IDs and labels of the ontology terms are provided.
ontology <- "ncit" parent <- "NCIT:C17049" descendants <- c("NCIT:C44265", "NCIT:C77811", "NCIT:C43856", "NCIT:C43672", "NCIT:C2991", "NCIT:C43860") ontoSummarize(parent, descendants, ontology)
ontology <- "ncit" parent <- "NCIT:C17049" descendants <- c("NCIT:C44265", "NCIT:C77811", "NCIT:C43856", "NCIT:C43672", "NCIT:C2991", "NCIT:C43860") ontoSummarize(parent, descendants, ontology)
Plot ontology tree
ontoTreePlot(term, display = c("Term", "Text"))
ontoTreePlot(term, display = c("Term", "Text"))
term |
A character (1). Ontology term id (obo_id) |
display |
A character (1) specifying a node labeling option. Two available options are 'Term' for ontology term or IRI (Internationalized Resource Identifier) and 'Text' for the label or preferred name. |
An ontology tree plot. All the terms used in the output plot are ancestors of the queried term, so the queried term is the tip.
ontoTreePlot("NCIT:C2852", "Term")
ontoTreePlot("NCIT:C2852", "Term")
A small data table for demonstrating the data reshaping functions in OmicsMLRepoR.
sample_metadata
sample_metadata
A data frame with 4 rows and 7 columns
Sehyun Oh [email protected]
This function spreads out multiple values per attribute ('multi-valued' attribute) or multiple features under a single generic attribute ('composite' attribute). A multi-valued attribute will return a 'long' table, while a composite attribute will return a 'wide' table.
spreadMeta(meta, targetCol)
spreadMeta(meta, targetCol)
meta |
A data frame. Harmonized metadata available through the OmicsMLRepoR package. It should have the 'package' column. |
targetCol |
A character (1). The column name to expand if present. |
A metadata table where the 'targetCol' is updated to a long form (if it is a multi-valued attribute) or spread into multiple columns (if it is a composite attribute).
data(mini_cmd2) # multi-valued attribute data(mini_cmd3) # composite attribute spreadMeta(mini_cmd2, "target_condition") spreadMeta(mini_cmd3, "probing_pocket_depth") data(mini_cbio) trt_cols <- grep("^treatment_", colnames(mini_cbio), value = TRUE) spreadMeta(mini_cbio, targetCol = trt_cols)
data(mini_cmd2) # multi-valued attribute data(mini_cmd3) # composite attribute spreadMeta(mini_cmd2, "target_condition") spreadMeta(mini_cmd3, "probing_pocket_depth") data(mini_cbio) trt_cols <- grep("^treatment_", colnames(mini_cbio), value = TRUE) spreadMeta(mini_cbio, targetCol = trt_cols)
Similar to the 'filter' function, but its filtering
includes descendants and synonyms of the query term in addition to ontology
terms and ids identical or similar to the query term across different
ontologies collected through OLS search.
tree_filter(.data, col, query, logic = "OR", delim = NULL)
tree_filter(.data, col, query, logic = "OR", delim = NULL)
.data |
A data frame |
col |
A character (1). Column name to filter by. |
query |
A character vector containing words or ids to be used in the ontology search |
logic |
A character (1). Operator used to determine filtering method. Values allowed: "AND", "OR", "NOT". Defaults to "OR" |
delim |
A character (1) used to separate multiple values. If your
'.data' input is obtained from |
Data frame filtered by provided queries along with child terms in the specified column
meta <- getMetadata("cMD") tree_filter(meta, disease, c("pancreatic disease", "cancer"))
meta <- getMetadata("cMD") tree_filter(meta, disease, c("pancreatic disease", "cancer"))