Title: | Processing Various Types of Data on GEO and TCGA |
---|---|
Description: | Gene Expression Omnibus(GEO) and The Cancer Genome Atlas (TCGA) provide us with a wealth of data, such as RNA-seq, DNA Methylation, SNP and Copy number variation data. It's easy to download data from TCGA using the gdc tool, but processing these data into a format suitable for bioinformatics analysis requires more work. This R package was developed to handle these data. |
Authors: | Erqiang Hu [aut, cre] |
Maintainer: | Erqiang Hu <[email protected]> |
License: | Artistic-2.0 |
Version: | 2.7.0 |
Built: | 2024-10-30 07:45:20 UTC |
Source: | https://github.com/bioc/GeoTcgaData |
Preprocess of Microarray data
array_preprocess(x, missing_value = "knn", string = " /// ")
array_preprocess(x, missing_value = "knn", string = " /// ")
x |
matrix of Microarray data, each column is a sample, and each row is a gene. |
missing_value |
Method to impute missing expression data, one of "zero" and "knn". |
string |
a string, sep of the gene |
matrix
arraylist <- get_geo_array("GSE781") arraylist <- lapply(arraylist, array_preprocess)
arraylist <- get_geo_array("GSE781") arraylist <- lapply(arraylist, array_preprocess)
Find the mean value of the gene in each module
cal_mean_module(geneExpress, module)
cal_mean_module(geneExpress, module)
geneExpress |
a data.frame of gene expression data. Each column is a sample, and each row is a gene. |
module |
a data.frame of two column. The first column is module name, the second column are genes in this module. |
a data.frame, means the mean of gene expression value in the same module
data(geneExpress) data(module) result <- cal_mean_module(geneExpress, module)
data(geneExpress) data(module) result <- cal_mean_module(geneExpress, module)
cluster probes of Microarray data
cluster_array(x, clusterCutoff = 0.7)
cluster_array(x, clusterCutoff = 0.7)
x |
matrix of Microarray data, the first is the name of the gene, and the others are the expression value. |
clusterCutoff |
Pearson correlation threshold to cut off the hierarchical tree. |
data.frame
arraylist <- get_geo_array("GSE781") arraylist <- lapply(arraylist, array_preprocess) arraylist_cluster <- lapply(arraylist, cluster_array)
arraylist <- get_geo_array("GSE781") arraylist <- lapply(arraylist, array_preprocess) arraylist_cluster <- lapply(arraylist, cluster_array)
combine pvalues of SNP difference analysis result
combine_pvalue(snpResult, snp2gene, combineMethod = min)
combine_pvalue(snpResult, snp2gene, combineMethod = min)
snpResult |
data.frame of SNP difference analysis result. |
snp2gene |
data frame of two column: snp and gene. |
combineMethod |
Method of combining the pvalue of multiple snp in a gene. |
data.frame
snpResult <- data.frame(pvalue = runif(100), estimate = runif(100)) rownames(snpResult) <- paste0("snp", seq_len(100)) snp2gene <- data.frame(snp = rownames(snpResult), gene = rep(paste0("gene", seq_len(20)), 5)) result <- combine_pvalue(snpResult, snp2gene)
snpResult <- data.frame(pvalue = runif(100), estimate = runif(100)) rownames(snpResult) <- paste0("snp", seq_len(100)) snp2gene <- data.frame(snp = rownames(snpResult), gene = rep(paste0("gene", seq_len(20)), 5)) result <- combine_pvalue(snpResult, snp2gene)
Convert count to FPKM
countToFpkm(counts_matrix, keyType = "SYMBOL", gene_cov)
countToFpkm(counts_matrix, keyType = "SYMBOL", gene_cov)
counts_matrix |
a matrix, colnames of counts_matrix are sample name, rownames of counts_matrix are gene symbols |
keyType |
keyType, one of keytypes(org.Hs.eg.db). |
gene_cov |
data.frame of two column, the first column is gene length, the second column is gene GC content |
a matrix
data(gene_cov) lung_squ_count2 <- matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3) rownames(lung_squ_count2) <- c("DISC1", "TCOF1", "SPPL3") colnames(lung_squ_count2) <- c("sample1", "sample2", "sample3") result <- countToFpkm(lung_squ_count2, keyType = "SYMBOL", gene_cov = gene_cov )
data(gene_cov) lung_squ_count2 <- matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3) rownames(lung_squ_count2) <- c("DISC1", "TCOF1", "SPPL3") colnames(lung_squ_count2) <- c("sample1", "sample2", "sample3") result <- countToFpkm(lung_squ_count2, keyType = "SYMBOL", gene_cov = gene_cov )
Convert count to Tpm
countToTpm(counts_matrix, keyType = "SYMBOL", gene_cov)
countToTpm(counts_matrix, keyType = "SYMBOL", gene_cov)
counts_matrix |
a matrix, colnames of counts_matrix are sample name, rownames of counts_matrix are gene symbols |
keyType |
keyType, one of keytypes(org.Hs.eg.db). |
gene_cov |
data.frame of two column, the first column is gene length, the second column is gene GC content |
a matrix
data(gene_cov) lung_squ_count2 <- matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3) rownames(lung_squ_count2) <- c("DISC1", "TCOF1", "SPPL3") colnames(lung_squ_count2) <- c("sample1", "sample2", "sample3") result <- countToTpm(lung_squ_count2, keyType = "SYMBOL", gene_cov = gene_cov )
data(gene_cov) lung_squ_count2 <- matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3) rownames(lung_squ_count2) <- c("DISC1", "TCOF1", "SPPL3") colnames(lung_squ_count2) <- c("sample1", "sample2", "sample3") result <- countToTpm(lung_squ_count2, keyType = "SYMBOL", gene_cov = gene_cov )
Differential analysis of Microarray data
differential_array(df, group, method = "limma", adjust.method = "BH")
differential_array(df, group, method = "limma", adjust.method = "BH")
df |
data.frame of the omic data, each column is a sample, and each row is a gene. |
group |
a vector, group of samples. |
method |
method to do differential analysis, one of "limma", "ttest", "wilcox". |
adjust.method |
adjust.method, one of "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", and "none". |
data.frame
library(GeoTcgaData) library(data.table) # Use real GEO data as example arrayData <- read.table("GSE54807_series_matrix.txt.gz", sep = "\t", header = TRUE, fill=TRUE, comment.char = "!", check.names=FALSE) gpl <- fread("GPL6244-17930.txt", sep = "\t", header = TRUE) gpl <- gpl[, c("ID", "gene_assignment")] class(gpl) <- "data.frame" for (i in seq_len(nrow(gpl))) { aa <- strsplit(gpl[i, 2], " // ")[[1]][5] gpl[i, 2] <- as.character(strsplit(aa, " /// ")[[1]][1]) } gpl[,1] <- as.character(gpl[,1]) arrayData[, 1] <- as.character(arrayData[, 1]) rownames(gpl) <- gpl[, 1] arrayData[, 1] <- gpl[arrayData[, 1], 2] arrayData <- repRemove(arrayData," /// ") # Remove rows that do not correspond to genes arrayData <- arrayData[!is.na(arrayData[, 1]), ] arrayData <- arrayData[!arrayData[, 1] == "", ] arrayData <- arrayData[!arrayData[, 1] == "---", ] arrayData <- arrayData[order(arrayData[, 1]), ] arrayData <- gene_ave(arrayData, 1) keep <- apply(arrayData, 1, function(x) sum(x < 1) < (length(x)/2)) arrayData <- arrayData[keep, ] group <- c(rep("group1", 12), rep("group2", 12)) result <- differential_array(df = arrayData, group = group) # Use random data as example arrayData <- matrix(runif(200), 25, 8) rownames(arrayData) <- paste0("gene", 1:25) colnames(arrayData) <- paste0("sample", 1:8) group <- c(rep("group1", 4), rep("group2", 4)) names(group) <- colnames(arrayData) result <- differential_array(df = arrayData, group = group)
library(GeoTcgaData) library(data.table) # Use real GEO data as example arrayData <- read.table("GSE54807_series_matrix.txt.gz", sep = "\t", header = TRUE, fill=TRUE, comment.char = "!", check.names=FALSE) gpl <- fread("GPL6244-17930.txt", sep = "\t", header = TRUE) gpl <- gpl[, c("ID", "gene_assignment")] class(gpl) <- "data.frame" for (i in seq_len(nrow(gpl))) { aa <- strsplit(gpl[i, 2], " // ")[[1]][5] gpl[i, 2] <- as.character(strsplit(aa, " /// ")[[1]][1]) } gpl[,1] <- as.character(gpl[,1]) arrayData[, 1] <- as.character(arrayData[, 1]) rownames(gpl) <- gpl[, 1] arrayData[, 1] <- gpl[arrayData[, 1], 2] arrayData <- repRemove(arrayData," /// ") # Remove rows that do not correspond to genes arrayData <- arrayData[!is.na(arrayData[, 1]), ] arrayData <- arrayData[!arrayData[, 1] == "", ] arrayData <- arrayData[!arrayData[, 1] == "---", ] arrayData <- arrayData[order(arrayData[, 1]), ] arrayData <- gene_ave(arrayData, 1) keep <- apply(arrayData, 1, function(x) sum(x < 1) < (length(x)/2)) arrayData <- arrayData[keep, ] group <- c(rep("group1", 12), rep("group2", 12)) result <- differential_array(df = arrayData, group = group) # Use random data as example arrayData <- matrix(runif(200), 25, 8) rownames(arrayData) <- paste0("gene", 1:25) colnames(arrayData) <- paste0("sample", 1:8) group <- c(rep("group1", 4), rep("group2", 4)) names(group) <- colnames(arrayData) result <- differential_array(df = arrayData, group = group)
Do difference analysis of gene level copy number variation data
differential_CNV( cnvData, sampleGroup, method = "Chisquare", adjust.method = "BH", ... )
differential_CNV( cnvData, sampleGroup, method = "Chisquare", adjust.method = "BH", ... )
cnvData |
data.frame of CNV data, each column is a sample, and each row is a CNV. |
sampleGroup |
vector of sample group |
method |
method to do diffenenital analysis, one of "Chisquare", "fisher", and "CATT"(Cochran-Armitage trend test) |
adjust.method |
adjust.method, one of "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", and "none". |
... |
parameters for "Chisquare", "fisher", and "CATT"(Cochran-Armitage trend test) |
data.frame with pvalue and estimate
# use TCGAbiolinks data as example library(TCGAbiolinks) query <- GDCquery( project = "TCGA-ACC", data.category = "Copy Number Variation", data.type = "Gene Level Copy Number", access = "open" ) GDCdownload(query) cnvData <- GDCprepare(query) aa <- assays(cnvData)$copy_number bb <- aa aa[bb == 2] <- 0 aa[bb < 2] <- -1 aa[bb > 2] <- 1 sampleGroup <- sample(c("A", "B"), ncol(cnvData), replace = TRUE) diffCnv <- differential_CNV(aa, sampleGroup) # Use sangerbox CNV data as example cnvData <- fread("Merge_GeneLevelCopyNumber.txt") class(cnvData) <- "data.frame" rownames(cnvData) <- cnvData[, 1] cnvData <- cnvData[, -c(1, 2, 3)] sampleGroup <- sample(c("A", "B"), ncol(cnvData), replace = TRUE) diffCnv <- differential_CNV(cnvData, sampleGroup) # use random data as example aa <- matrix(sample(c(0, 1, -1), 200, replace = TRUE), 25, 8) rownames(aa) <- paste0("gene", 1:25) colnames(aa) <- paste0("sample", 1:8) sampleGroup <- sample(c("A", "B"), ncol(aa), replace = TRUE) diffCnv <- differential_CNV(aa, sampleGroup)
# use TCGAbiolinks data as example library(TCGAbiolinks) query <- GDCquery( project = "TCGA-ACC", data.category = "Copy Number Variation", data.type = "Gene Level Copy Number", access = "open" ) GDCdownload(query) cnvData <- GDCprepare(query) aa <- assays(cnvData)$copy_number bb <- aa aa[bb == 2] <- 0 aa[bb < 2] <- -1 aa[bb > 2] <- 1 sampleGroup <- sample(c("A", "B"), ncol(cnvData), replace = TRUE) diffCnv <- differential_CNV(aa, sampleGroup) # Use sangerbox CNV data as example cnvData <- fread("Merge_GeneLevelCopyNumber.txt") class(cnvData) <- "data.frame" rownames(cnvData) <- cnvData[, 1] cnvData <- cnvData[, -c(1, 2, 3)] sampleGroup <- sample(c("A", "B"), ncol(cnvData), replace = TRUE) diffCnv <- differential_CNV(cnvData, sampleGroup) # use random data as example aa <- matrix(sample(c(0, 1, -1), 200, replace = TRUE), 25, 8) rownames(aa) <- paste0("gene", 1:25) colnames(aa) <- paste0("sample", 1:8) sampleGroup <- sample(c("A", "B"), ncol(aa), replace = TRUE) diffCnv <- differential_CNV(aa, sampleGroup)
differential_limma
differential_limma(df, group, adjust.method = "BH")
differential_limma(df, group, adjust.method = "BH")
df |
data.frame of the omic data |
group |
a vector, group of samples. |
adjust.method |
adjust.method. |
data.frame
df <- matrix(runif(200), 25, 8) df <- as.data.frame(df) rownames(df) <- paste0("gene", 1:25) colnames(df) <- paste0("sample", 1:8) group <- sample(c("group1", "group2"), 8, replace = TRUE) result <- differential_limma(df = df, group = group)
df <- matrix(runif(200), 25, 8) df <- as.data.frame(df) rownames(df) <- paste0("gene", 1:25) colnames(df) <- paste0("sample", 1:8) group <- sample(c("group1", "group2"), 8, replace = TRUE) result <- differential_limma(df = df, group = group)
Get methylation difference gene
differential_methy( cpgData, sampleGroup, groupCol, combineMethod = "stouffer", missing_value = "knn", cpg2gene = NULL, normMethod = "PBC", region = "TSS1500", model = "gene", adjust.method = "BH", adjPvalCutoff = 0.05, ucscData = FALSE )
differential_methy( cpgData, sampleGroup, groupCol, combineMethod = "stouffer", missing_value = "knn", cpg2gene = NULL, normMethod = "PBC", region = "TSS1500", model = "gene", adjust.method = "BH", adjPvalCutoff = 0.05, ucscData = FALSE )
cpgData |
data.frame of cpg beta value, , or SummarizedExperiment object |
sampleGroup |
vector of sample group |
groupCol |
group column |
combineMethod |
method to combine the cpg pvalues, a function or one of "stouffer", "fisher" and "rhoScores". |
missing_value |
Method to impute missing expression data, one of "zero" and "knn". |
cpg2gene |
data.frame to annotate cpg locus to gene |
normMethod |
Method to do normalization: "PBC" or "BMIQ". |
region |
region of genes, one of "Body", "TSS1500", "TSS200", "3'UTR", "1stExon", "5'UTR", and "IGR". Only used when cpg2gene is NULL. |
model |
if "cpg", step1: calculate difference cpgs; step2: calculate difference genes. if "gene", step1: calculate the methylation level of genes; step2: calculate difference genes. |
adjust.method |
character string specifying the method used to adjust p-values for multiple testing. See p.adjust for possible values. |
adjPvalCutoff |
adjusted pvalue cutoff |
ucscData |
Logical, whether the data comes from UCSC Xena. |
data.frame
# use TCGAbiolinks data library(TCGAbiolinks) query <- GDCquery(project = "TCGA-ACC", data.category = "DNA Methylation", data.type = "Methylation Beta Value", platform = "Illumina Human Methylation 450") GDCdownload(query, method = "api", files.per.chunk = 5, directory = Your_Path) merge_result <- Merge_methy_tcga(Your_Path_to_DNA_Methylation_data) library(ChAMP) # To avoid reporting errors differential_gene <- differential_methy(cpgData = merge_result, sampleGroup = sample(c("C","T"), ncol(merge_result[[1]]), replace = TRUE)) # use user defined data library(ChAMP) cpgData <- matrix(runif(2000), nrow = 200, ncol = 10) rownames(cpgData) <- paste0("cpg", seq_len(200)) colnames(cpgData) <- paste0("sample", seq_len(10)) sampleGroup <- c(rep("group1", 5), rep("group2", 5)) names(sampleGroup) <- colnames(cpgData) cpg2gene <- data.frame(cpg = rownames(cpgData), gene = rep(paste0("gene", seq_len(20)), 10)) result <- differential_methy(cpgData, sampleGroup, cpg2gene = cpg2gene, normMethod = NULL) # use SummarizedExperiment object input library(ChAMP) cpgData <- matrix(runif(2000), nrow = 200, ncol = 10) rownames(cpgData) <- paste0("cpg", seq_len(200)) colnames(cpgData) <- paste0("sample", seq_len(10)) sampleGroup <- c(rep("group1", 5), rep("group2", 5)) names(sampleGroup) <- colnames(cpgData) cpg2gene <- data.frame(cpg = rownames(cpgData), gene = rep(paste0("gene", seq_len(20)), 10)) colData <- S4Vectors::DataFrame( row.names = colnames(cpgData), group = sampleGroup ) data <- SummarizedExperiment::SummarizedExperiment( assays=S4Vectors::SimpleList(counts=cpgData), colData = colData) result <- differential_methy(cpgData = data, groupCol = "group", normMethod = NULL, cpg2gene = cpg2gene)
# use TCGAbiolinks data library(TCGAbiolinks) query <- GDCquery(project = "TCGA-ACC", data.category = "DNA Methylation", data.type = "Methylation Beta Value", platform = "Illumina Human Methylation 450") GDCdownload(query, method = "api", files.per.chunk = 5, directory = Your_Path) merge_result <- Merge_methy_tcga(Your_Path_to_DNA_Methylation_data) library(ChAMP) # To avoid reporting errors differential_gene <- differential_methy(cpgData = merge_result, sampleGroup = sample(c("C","T"), ncol(merge_result[[1]]), replace = TRUE)) # use user defined data library(ChAMP) cpgData <- matrix(runif(2000), nrow = 200, ncol = 10) rownames(cpgData) <- paste0("cpg", seq_len(200)) colnames(cpgData) <- paste0("sample", seq_len(10)) sampleGroup <- c(rep("group1", 5), rep("group2", 5)) names(sampleGroup) <- colnames(cpgData) cpg2gene <- data.frame(cpg = rownames(cpgData), gene = rep(paste0("gene", seq_len(20)), 10)) result <- differential_methy(cpgData, sampleGroup, cpg2gene = cpg2gene, normMethod = NULL) # use SummarizedExperiment object input library(ChAMP) cpgData <- matrix(runif(2000), nrow = 200, ncol = 10) rownames(cpgData) <- paste0("cpg", seq_len(200)) colnames(cpgData) <- paste0("sample", seq_len(10)) sampleGroup <- c(rep("group1", 5), rep("group2", 5)) names(sampleGroup) <- colnames(cpgData) cpg2gene <- data.frame(cpg = rownames(cpgData), gene = rep(paste0("gene", seq_len(20)), 10)) colData <- S4Vectors::DataFrame( row.names = colnames(cpgData), group = sampleGroup ) data <- SummarizedExperiment::SummarizedExperiment( assays=S4Vectors::SimpleList(counts=cpgData), colData = colData) result <- differential_methy(cpgData = data, groupCol = "group", normMethod = NULL, cpg2gene = cpg2gene)
Do difference analysis of RNA-seq data
differential_RNA( counts, group, groupCol, method = "limma", geneLength = NULL, gccontent = NULL, filter = TRUE, edgeRNorm = TRUE, adjust.method = "BH", useTopconfects = TRUE, ucscData = FALSE )
differential_RNA( counts, group, groupCol, method = "limma", geneLength = NULL, gccontent = NULL, filter = TRUE, edgeRNorm = TRUE, adjust.method = "BH", useTopconfects = TRUE, ucscData = FALSE )
counts |
a dataframe or numeric matrix of raw counts data, or SummarizedExperiment object |
group |
sample groups |
groupCol |
group column |
method |
one of "DESeq2", "edgeR" , "limma", "dearseq", "NOISeq", "Wilcoxon", and "auto". |
geneLength |
a vector of gene length. |
gccontent |
a vector of gene GC content. |
filter |
if TRUE, use filterByExpr to filter genes. |
edgeRNorm |
if TRUE, use edgeR to do normalization for dearseq method. |
adjust.method |
character string specifying the method used to adjust p-values for multiple testing. See p.adjust for possible values. |
useTopconfects |
if TRUE, use topconfects to provide a more biologically useful ranked gene list. |
ucscData |
Logical, whether the data comes from UCSC Xena. |
data.frame
library(TCGAbiolinks) query <- GDCquery( project = "TCGA-ACC", data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = "STAR - Counts" ) GDCdownload(query, method = "api", files.per.chunk = 3, directory = Your_Path ) dataRNA <- GDCprepare( query = query, directory = Your_Path, save = TRUE, save.filename = "dataRNA.RData" ) ## get raw count matrix dataPrep <- TCGAanalyze_Preprocessing( object = dataRNA, cor.cut = 0.6, datatype = "STAR - Counts" ) # Use `differential_RNA` to do difference analysis. # We provide the data of human gene length and GC content in `gene_cov`. group <- sample(c("grp1", "grp2"), ncol(dataPrep), replace = TRUE) library(cqn) # To avoid reporting errors: there is no function "rq" ## get gene length and GC content library(org.Hs.eg.db) genes_bitr <- bitr(rownames(gene_cov), fromType = "ENTREZID", toType = "ENSEMBL", OrgDb = org.Hs.eg.db, drop = TRUE ) genes_bitr <- genes_bitr[!duplicated(genes_bitr[, 2]), ] gene_cov2 <- gene_cov[genes_bitr$ENTREZID, ] rownames(gene_cov2) <- genes_bitr$ENSEMBL genes <- intersect(rownames(dataPrep), rownames(gene_cov2)) dataPrep <- dataPrep[genes, ] geneLength <- gene_cov2(genes, "length") gccontent <- gene_cov2(genes, "GC") names(geneLength) <- names(gccontent) <- genes ## Difference analysis DEGAll <- differential_RNA( counts = dataPrep, group = group, geneLength = geneLength, gccontent = gccontent ) # Use `clusterProfiler` to do enrichment analytics: diffGenes <- DEGAll$logFC names(diffGenes) <- rownames(DEGAll) diffGenes <- sort(diffGenes, decreasing = TRUE) library(clusterProfiler) library(enrichplot) library(org.Hs.eg.db) gsego <- gseGO(gene = diffGenes, OrgDb = org.Hs.eg.db, keyType = "ENSEMBL") dotplot(gsego) # use user-defined data df <- matrix(rnbinom(400, mu = 4, size = 10), 25, 16) df <- as.data.frame(df) rownames(df) <- paste0("gene", 1:25) colnames(df) <- paste0("sample", 1:16) group <- sample(c("group1", "group2"), 16, replace = TRUE) result <- differential_RNA(counts = df, group = group, filte = FALSE, method = "Wilcoxon") # use SummarizedExperiment object input df <- matrix(rnbinom(400, mu = 4, size = 10), 25, 16) rownames(df) <- paste0("gene", 1:25) colnames(df) <- paste0("sample", 1:16) group <- sample(c("group1", "group2"), 16, replace = TRUE) nrows <- 200; ncols <- 20 counts <- matrix( runif(nrows * ncols, 1, 1e4), nrows, dimnames = list(paste0("cg",1:200),paste0("S",1:20)) ) colData <- S4Vectors::DataFrame( row.names = paste0("sample", 1:16), group = group ) data <- SummarizedExperiment::SummarizedExperiment( assays=S4Vectors::SimpleList(counts=df), colData = colData) result <- differential_RNA(counts = data, groupCol = "group", filte = FALSE, method = "Wilcoxon")
library(TCGAbiolinks) query <- GDCquery( project = "TCGA-ACC", data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = "STAR - Counts" ) GDCdownload(query, method = "api", files.per.chunk = 3, directory = Your_Path ) dataRNA <- GDCprepare( query = query, directory = Your_Path, save = TRUE, save.filename = "dataRNA.RData" ) ## get raw count matrix dataPrep <- TCGAanalyze_Preprocessing( object = dataRNA, cor.cut = 0.6, datatype = "STAR - Counts" ) # Use `differential_RNA` to do difference analysis. # We provide the data of human gene length and GC content in `gene_cov`. group <- sample(c("grp1", "grp2"), ncol(dataPrep), replace = TRUE) library(cqn) # To avoid reporting errors: there is no function "rq" ## get gene length and GC content library(org.Hs.eg.db) genes_bitr <- bitr(rownames(gene_cov), fromType = "ENTREZID", toType = "ENSEMBL", OrgDb = org.Hs.eg.db, drop = TRUE ) genes_bitr <- genes_bitr[!duplicated(genes_bitr[, 2]), ] gene_cov2 <- gene_cov[genes_bitr$ENTREZID, ] rownames(gene_cov2) <- genes_bitr$ENSEMBL genes <- intersect(rownames(dataPrep), rownames(gene_cov2)) dataPrep <- dataPrep[genes, ] geneLength <- gene_cov2(genes, "length") gccontent <- gene_cov2(genes, "GC") names(geneLength) <- names(gccontent) <- genes ## Difference analysis DEGAll <- differential_RNA( counts = dataPrep, group = group, geneLength = geneLength, gccontent = gccontent ) # Use `clusterProfiler` to do enrichment analytics: diffGenes <- DEGAll$logFC names(diffGenes) <- rownames(DEGAll) diffGenes <- sort(diffGenes, decreasing = TRUE) library(clusterProfiler) library(enrichplot) library(org.Hs.eg.db) gsego <- gseGO(gene = diffGenes, OrgDb = org.Hs.eg.db, keyType = "ENSEMBL") dotplot(gsego) # use user-defined data df <- matrix(rnbinom(400, mu = 4, size = 10), 25, 16) df <- as.data.frame(df) rownames(df) <- paste0("gene", 1:25) colnames(df) <- paste0("sample", 1:16) group <- sample(c("group1", "group2"), 16, replace = TRUE) result <- differential_RNA(counts = df, group = group, filte = FALSE, method = "Wilcoxon") # use SummarizedExperiment object input df <- matrix(rnbinom(400, mu = 4, size = 10), 25, 16) rownames(df) <- paste0("gene", 1:25) colnames(df) <- paste0("sample", 1:16) group <- sample(c("group1", "group2"), 16, replace = TRUE) nrows <- 200; ncols <- 20 counts <- matrix( runif(nrows * ncols, 1, 1e4), nrows, dimnames = list(paste0("cg",1:200),paste0("S",1:20)) ) colData <- S4Vectors::DataFrame( row.names = paste0("sample", 1:16), group = group ) data <- SummarizedExperiment::SummarizedExperiment( assays=S4Vectors::SimpleList(counts=df), colData = colData) result <- differential_RNA(counts = data, groupCol = "group", filte = FALSE, method = "Wilcoxon")
Do difference analysis of SNP data
differential_SNP(snpDf, sampleGroup, combineMethod = min)
differential_SNP(snpDf, sampleGroup, combineMethod = min)
snpDf |
data.frame of SNP data, each column is a sample, and each row is a SNP. |
sampleGroup |
vector of sample group. |
combineMethod |
Method of combining the pvalue of multiple snp in a gene. |
data.frame
library(TCGAbiolinks) query <- GDCquery( project = "TCGA-CHOL", data.category = "Simple Nucleotide Variation", access = "open", legacy = FALSE, data.type = "Masked Somatic Mutation", workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking" ) GDCdownload(query) data_snp <- GDCprepare(query) samples <- unique(data_snp$Tumor_Sample_Barcode) sampleGroup <- sample(c("A", "B"), length(samples), replace = TRUE) names(sampleGroup) <- samples pvalue <- differential_SNP_tcga(snpData = data_snp, sampleGroup = sampleGroup) # use demo data snpDf <- matrix(sample(c("mutation", NA), 100, replace = TRUE), 10, 10) snpDf <- as.data.frame(snpDf) sampleGroup <- sample(c("A", "B"), 10, replace = TRUE) result <- differential_SNP(snpDf, sampleGroup)
library(TCGAbiolinks) query <- GDCquery( project = "TCGA-CHOL", data.category = "Simple Nucleotide Variation", access = "open", legacy = FALSE, data.type = "Masked Somatic Mutation", workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking" ) GDCdownload(query) data_snp <- GDCprepare(query) samples <- unique(data_snp$Tumor_Sample_Barcode) sampleGroup <- sample(c("A", "B"), length(samples), replace = TRUE) names(sampleGroup) <- samples pvalue <- differential_SNP_tcga(snpData = data_snp, sampleGroup = sampleGroup) # use demo data snpDf <- matrix(sample(c("mutation", NA), 100, replace = TRUE), 10, 10) snpDf <- as.data.frame(snpDf) sampleGroup <- sample(c("A", "B"), 10, replace = TRUE) result <- differential_SNP(snpDf, sampleGroup)
Do difference analysis of SNP data downloaded from GEO
differential_SNP_GEO(snpData, sampleGroup, method = "Chisquare")
differential_SNP_GEO(snpData, sampleGroup, method = "Chisquare")
snpData |
data.frame of SNP data downloaded from GEO |
sampleGroup |
vector of sample group |
method |
one of "Chisquare", "fisher", and "CATT"(Cochran-Armitage trend test) |
data.frame
file1 <- read.table("GSE66903_series_matrix.txt.gz", fill=TRUE, comment.char="!", header = TRUE) rownames(file1) <- file1[, 1] snpData <- file1[, -1] sampleGroup <- sample(c("A", "B"), ncol(snpData ), replace = TRUE) names(sampleGroup) <- colnames(snpData) snpData <- SNP_QC(snpData) sampleGroup <- sample(c("A", "B"), ncol(snpData ), replace = TRUE) result1 <- differential_SNP_GEO(snpData = snpData, sampleGroup = sampleGroup, method = "Chisquare") # use demo data snpDf <- matrix(sample(c("AA", "Aa", "aa"), 100, replace = TRUE), 10, 10) snpDf <- as.data.frame(snpDf) sampleGroup <- sample(c("A", "B"), 10, replace = TRUE) result <- differential_SNP_GEO(snpDf, sampleGroup, method = "fisher")
file1 <- read.table("GSE66903_series_matrix.txt.gz", fill=TRUE, comment.char="!", header = TRUE) rownames(file1) <- file1[, 1] snpData <- file1[, -1] sampleGroup <- sample(c("A", "B"), ncol(snpData ), replace = TRUE) names(sampleGroup) <- colnames(snpData) snpData <- SNP_QC(snpData) sampleGroup <- sample(c("A", "B"), ncol(snpData ), replace = TRUE) result1 <- differential_SNP_GEO(snpData = snpData, sampleGroup = sampleGroup, method = "Chisquare") # use demo data snpDf <- matrix(sample(c("AA", "Aa", "aa"), 100, replace = TRUE), 10, 10) snpDf <- as.data.frame(snpDf) sampleGroup <- sample(c("A", "B"), 10, replace = TRUE) result <- differential_SNP_GEO(snpDf, sampleGroup, method = "fisher")
Do difference analysis of SNP data downloaded from TCGAbiolinks
differential_SNP_tcga(snpData, sampleGroup, combineMethod = NULL)
differential_SNP_tcga(snpData, sampleGroup, combineMethod = NULL)
snpData |
data.frame of SNP data downloaded from TCGAbiolinks |
sampleGroup |
vector of sample group |
combineMethod |
Method of combining the pvalue of multiple snp in a gene. |
data.frame
library(TCGAbiolinks) query <- GDCquery( project = "TCGA-CHOL", data.category = "Simple Nucleotide Variation", access = "open", legacy = FALSE, data.type = "Masked Somatic Mutation", workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking" ) GDCdownload(query) data_snp <- GDCprepare(query) samples <- unique(data_snp$Tumor_Sample_Barcode) sampleGroup <- sample(c("A", "B"), length(samples), replace = TRUE) names(sampleGroup) <- samples pvalue <- differential_SNP_tcga(snpData = data_snp, sampleGroup = sampleGroup) # use demo data snpDf <- matrix(sample(c("mutation", NA), 100, replace = TRUE), 10, 10) snpDf <- as.data.frame(snpDf) sampleGroup <- sample(c("A", "B"), 10, replace = TRUE) result <- differential_SNP(snpDf, sampleGroup)
library(TCGAbiolinks) query <- GDCquery( project = "TCGA-CHOL", data.category = "Simple Nucleotide Variation", access = "open", legacy = FALSE, data.type = "Masked Somatic Mutation", workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking" ) GDCdownload(query) data_snp <- GDCprepare(query) samples <- unique(data_snp$Tumor_Sample_Barcode) sampleGroup <- sample(c("A", "B"), length(samples), replace = TRUE) names(sampleGroup) <- samples pvalue <- differential_SNP_tcga(snpData = data_snp, sampleGroup = sampleGroup) # use demo data snpDf <- matrix(sample(c("mutation", NA), 100, replace = TRUE), 10, 10) snpDf <- as.data.frame(snpDf) sampleGroup <- sample(c("A", "B"), 10, replace = TRUE) result <- differential_SNP(snpDf, sampleGroup)
Convert fpkm to Tpm
fpkmToTpm(fpkm_matrix)
fpkmToTpm(fpkm_matrix)
fpkm_matrix |
a matrix, colnames of fpkm_matrix are sample name, rownames of fpkm_matrix are genes |
a matrix
lung_squ_count2 <- matrix(c(0.11, 0.22, 0.43, 0.14, 0.875, 0.66, 0.77, 0.18, 0.29), ncol = 3) rownames(lung_squ_count2) <- c("DISC1", "TCOF1", "SPPL3") colnames(lung_squ_count2) <- c("sample1", "sample2", "sample3") result <- fpkmToTpm(lung_squ_count2)
lung_squ_count2 <- matrix(c(0.11, 0.22, 0.43, 0.14, 0.875, 0.66, 0.77, 0.18, 0.29), ncol = 3) rownames(lung_squ_count2) <- c("DISC1", "TCOF1", "SPPL3") colnames(lung_squ_count2) <- c("sample1", "sample2", "sample3") result <- fpkmToTpm(lung_squ_count2)
Average the values of same genes in gene expression profile
gene_ave(file_gene_ave, k = 1)
gene_ave(file_gene_ave, k = 1)
file_gene_ave |
a data.frame of gene expression data, each column is a sample, and each row is a gene. |
k |
a number, indicates which is the gene column. |
a data.frame, the values of same genes in gene expression profile
aa <- c("MARCH1", "MARC1", "MARCH1", "MARCH1", "MARCH1") bb <- c(2.969058399, 4.722410064, 8.165514853, 8.24243893, 8.60815086) cc <- c(3.969058399, 5.722410064, 7.165514853, 6.24243893, 7.60815086) file_gene_ave <- data.frame(aa = aa, bb = bb, cc = cc) colnames(file_gene_ave) <- c("Gene", "GSM1629982", "GSM1629983") result <- gene_ave(file_gene_ave, 1)
aa <- c("MARCH1", "MARC1", "MARCH1", "MARCH1", "MARCH1") bb <- c(2.969058399, 4.722410064, 8.165514853, 8.24243893, 8.60815086) cc <- c(3.969058399, 5.722410064, 7.165514853, 6.24243893, 7.60815086) file_gene_ave <- data.frame(aa = aa, bb = bb, cc = cc) colnames(file_gene_ave) <- c("Gene", "GSM1629982", "GSM1629983") result <- gene_ave(file_gene_ave, 1)
the gene length and GC content data comes from TxDb.Hsapiens.UCSC.hg38.knownGene and BSgenome.Hsapiens.UCSC.hg38
gene_cov
gene_cov
A data.frame with 27341 rows and 2 column
It is a randomly generated expression data used as an example of functions in this package. the rowname is gene symbols the columns are gene expression values
geneExpress
geneExpress
A data.frame with 10779 rows and 2 column
Get Microarray matrix data from GEO
get_geo_array(gse)
get_geo_array(gse)
gse |
GSE number, such as GSE781. |
a list of matrix
arraylist <- get_geo_array("GSE781")
arraylist <- get_geo_array("GSE781")
the first column represents the gene symbol
GSE66705_sample2
GSE66705_sample2
A matrix with 999 rows and 3 column
the other columns represent the expression of genes
Convert ENSEMBL gene id to gene Symbol in TCGA
id_conversion_TCGA(profiles, toType = "SYMBOL")
id_conversion_TCGA(profiles, toType = "SYMBOL")
profiles |
a data.frame of gene expression data, each column is a sample, and each row is a gene. |
toType |
one of 'keytypes(org.Hs.eg.db)' |
a data.frame, gene symbols and their expression value
library(org.Hs.eg.db) data(profile) result <- id_conversion_TCGA(profile)
library(org.Hs.eg.db) data(profile) result <- id_conversion_TCGA(profile)
It is a randomly generated expression data used as an example of functions in this package. the first column represents the gene symbol
kegg_liver
kegg_liver
A matrix with 100 rows and 150 column
the other columns represent the expression(count) of genes
When the methylation data is downloaded from TCGA, each sample is saved in a folder, which contains the methylation value file and the descriptive file. This function can directly extract and consolidate all folders.
Merge_methy_tcga(dirr = NULL)
Merge_methy_tcga(dirr = NULL)
dirr |
a string for the directory of methylation data download from tcga useing the tools gdc |
a matrix, a combined methylation expression spectrum matrix
merge_result <- Merge_methy_tcga(system.file(file.path("extdata", "methy"), package = "GeoTcgaData"))
merge_result <- Merge_methy_tcga(system.file(file.path("extdata", "methy"), package = "GeoTcgaData"))
It is a randomly generated expression data used as an example of functions in this package.
module
module
A matrix with 176 rows and 3 column
Preparer file for chi-square test
prepare_chi(cnv)
prepare_chi(cnv)
cnv |
result of ann_merge() |
a matrix
cnv <- matrix(c( -1.09150, -1.47120, -0.87050, -0.50880, -0.50880, 2.0, 2.0, 2.0, 2.0, 2.0, 2.601962, 2.621332, 2.621332, 2.621332, 2.621332, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0 ), nrow = 5) cnv <- as.data.frame(cnv) rownames(cnv) <- c("AJAP1", "FHAD1", "CLCNKB", "CROCCP2", "AL137798.3") colnames(cnv) <- c( "TCGA-DD-A4NS-10A-01D-A30U-01", "TCGA-ED-A82E-01A-11D-A34Y-01", "TCGA-WQ-A9G7-01A-11D-A36W-01", "TCGA-DD-AADN-01A-11D-A40Q-01", "TCGA-ZS-A9CD-10A-01D-A36Z-01", "TCGA-DD-A1EB-11A-11D-A12Y-01" ) cnv_chi_file <- prepare_chi(cnv)
cnv <- matrix(c( -1.09150, -1.47120, -0.87050, -0.50880, -0.50880, 2.0, 2.0, 2.0, 2.0, 2.0, 2.601962, 2.621332, 2.621332, 2.621332, 2.621332, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0 ), nrow = 5) cnv <- as.data.frame(cnv) rownames(cnv) <- c("AJAP1", "FHAD1", "CLCNKB", "CROCCP2", "AL137798.3") colnames(cnv) <- c( "TCGA-DD-A4NS-10A-01D-A30U-01", "TCGA-ED-A82E-01A-11D-A34Y-01", "TCGA-WQ-A9G7-01A-11D-A36W-01", "TCGA-DD-AADN-01A-11D-A40Q-01", "TCGA-ZS-A9CD-10A-01D-A36Z-01", "TCGA-DD-A1EB-11A-11D-A12Y-01" ) cnv_chi_file <- prepare_chi(cnv)
It is a randomly generated expression data used as an example of functions in this package. the first column represents the gene symbol
profile
profile
A matrix with 10 rows and 10 column
the other columns represent the expression(FPKM) of genes
Handle the case where one id corresponds to multiple genes
repAssign(input_file, string)
repAssign(input_file, string)
input_file |
input file, a data.frame or a matrix, the first column should be genes. |
string |
a string, sep of the gene |
a data.frame, when an id corresponds to multiple genes, the expression value is assigned to each gene
aa <- c("MARCH1 /// MMA", "MARC1", "MARCH2 /// MARCH3", "MARCH3 /// MARCH4", "MARCH1") bb <- c("2.969058399", "4.722410064", "8.165514853", "8.24243893", "8.60815086") cc <- c("3.969058399", "5.722410064", "7.165514853", "6.24243893", "7.60815086") input_file <- data.frame(aa = aa, bb = bb, cc = cc) repAssign_result <- repAssign(input_file, " /// ")
aa <- c("MARCH1 /// MMA", "MARC1", "MARCH2 /// MARCH3", "MARCH3 /// MARCH4", "MARCH1") bb <- c("2.969058399", "4.722410064", "8.165514853", "8.24243893", "8.60815086") cc <- c("3.969058399", "5.722410064", "7.165514853", "6.24243893", "7.60815086") input_file <- data.frame(aa = aa, bb = bb, cc = cc) repAssign_result <- repAssign(input_file, " /// ")
Handle the case where one id corresponds to multiple genes
repRemove(input_file, string)
repRemove(input_file, string)
input_file |
input file, a data.frame or a matrix, the first column should be genes. |
string |
a string,sep of the gene |
a data.frame, when an id corresponds to multiple genes, the expression value is deleted
aa <- c("MARCH1 /// MMA", "MARC1", "MARCH2 /// MARCH3", "MARCH3 /// MARCH4", "MARCH1") bb <- c("2.969058399", "4.722410064", "8.165514853", "8.24243893", "8.60815086") cc <- c("3.969058399", "5.722410064", "7.165514853", "6.24243893", "7.60815086") input_file <- data.frame(aa = aa, bb = bb, cc = cc) repRemove_result <- repRemove(input_file, " /// ")
aa <- c("MARCH1 /// MMA", "MARC1", "MARCH2 /// MARCH3", "MARCH3 /// MARCH4", "MARCH1") bb <- c("2.969058399", "4.722410064", "8.165514853", "8.24243893", "8.60815086") cc <- c("3.969058399", "5.722410064", "7.165514853", "6.24243893", "7.60815086") input_file <- data.frame(aa = aa, bb = bb, cc = cc) repRemove_result <- repRemove(input_file, " /// ")
Do quality control of SNP data downloaded from TCGAbiolinks
SNP_QC( snpData, geon = 0.02, mind = 0.02, maf = 0.05, hwe = 1e-06, miss = "NoCall" )
SNP_QC( snpData, geon = 0.02, mind = 0.02, maf = 0.05, hwe = 1e-06, miss = "NoCall" )
snpData |
data.frame of SNP data downloaded from TCGAbiolinks |
geon |
filters out all variants with missing call rates exceeding the provided value (default 0.02) to be removed |
mind |
filters out all samples with missing call rates exceeding the provided value (default 0.02) to be removed |
maf |
filters out all variants with minor allele frequency below the provided threshold |
hwe |
filters out all variants which have Hardy-Weinberg equilibrium exact test p-value below the provided threshold |
miss |
character of miss value |
data.frame
# use demo data snpDf <- matrix(sample(c("AA", "Aa", "aa"), 100, replace = TRUE), 10, 10) snpDf <- as.data.frame(snpDf) sampleGroup <- sample(c("A", "B"), 10, replace = TRUE) result <- SNP_QC(snpDf)
# use demo data snpDf <- matrix(sample(c("AA", "Aa", "aa"), 100, replace = TRUE), 10, 10) snpDf <- as.data.frame(snpDf) sampleGroup <- sample(c("A", "B"), 10, replace = TRUE) result <- SNP_QC(snpDf)
It is a randomly generated expression data used as an example of functions in this package. the first column represents the gene symbol
ventricle
ventricle
A matrix with 32 rows and 20 column
the other columns represent the expression of genes