Title: | Evaluating SNV-Induced Disruption of G-Quadruplex Structures |
---|---|
Description: | G-quadruplexes (G4s) are unique nucleic acid secondary structures predominantly found in guanine-rich regions and have been shown to be involved in various biological regulatory processes. G4SNVHunter is an R package designed to rapidly identify genomic sequences with G4-forming potential and accurately screen user-provided single nucleotide variants (also applicable to single nucleotide polymorphisms) that may destabilize these structures. This enables users to screen key variants for further experimental study, investigating how these variants may influence biological functions, such as gene regulation, by altering G4 formation. |
Authors: | Rongxin Zhang [cre, aut] |
Maintainer: | Rongxin Zhang <[email protected]> |
License: | MIT + file LICENSE |
Version: | 0.99.4 |
Built: | 2024-12-14 03:07:26 UTC |
Source: | https://github.com/bioc/G4SNVHunter |
This function checks whether the user-provided SNVs (or SNPs) are single nucleotide substitutions.
checkSNV(snv_gr = NULL, mode = "wra", ref_col = NULL, alt_col = NULL)
checkSNV(snv_gr = NULL, mode = "wra", ref_col = NULL, alt_col = NULL)
snv_gr |
A |
mode |
A character string specifying the checks to be performed.
|
ref_col |
Column name for the ref bases in |
alt_col |
Column name for the alt bases in |
A logical value indicating whether the user-provided SNVs (or SNPs) passed all checks.
if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } if (!requireNamespace("BSgenome.Hsapiens.UCSC.hg19", quietly = TRUE)) { BiocManager::install("GenomicRanges") } library(GenomicRanges) gr1 <- GRanges("chr1", IRanges(start = 100, width = 1)) # check width ('w') checkSNV(gr1, mode = "w") gr2 <- GRanges( seqnames = Rle("seq1"), ranges = IRanges(c(100, 200, 300), width = 1), ref = c("A", "C", "G"), alt = c("T", "T", "A") ) # check width ('w'), ref ('r'), and alt ('a') checkSNV(gr2, mode = "wra", ref_col = "ref", alt_col = "alt") # check width ('w') and alt ('a') checkSNV(gr2, mode = "wa", alt_col = "alt") gr3 <- GRanges("chr1", IRanges(start = 100, width = 10)) # widths should be all one checkSNV(gr3, mode = "w") gr4 <- GRanges( seqnames = Rle("seq1"), ranges = IRanges(start = 100, width = 1), ref = "AG", alt = "T" ) # ref should be all one checkSNV(gr4, mode = "wr", ref_col = "ref")
if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } if (!requireNamespace("BSgenome.Hsapiens.UCSC.hg19", quietly = TRUE)) { BiocManager::install("GenomicRanges") } library(GenomicRanges) gr1 <- GRanges("chr1", IRanges(start = 100, width = 1)) # check width ('w') checkSNV(gr1, mode = "w") gr2 <- GRanges( seqnames = Rle("seq1"), ranges = IRanges(c(100, 200, 300), width = 1), ref = c("A", "C", "G"), alt = c("T", "T", "A") ) # check width ('w'), ref ('r'), and alt ('a') checkSNV(gr2, mode = "wra", ref_col = "ref", alt_col = "alt") # check width ('w') and alt ('a') checkSNV(gr2, mode = "wa", alt_col = "alt") gr3 <- GRanges("chr1", IRanges(start = 100, width = 10)) # widths should be all one checkSNV(gr3, mode = "w") gr4 <- GRanges( seqnames = Rle("seq1"), ranges = IRanges(start = 100, width = 1), ref = "AG", alt = "T" ) # ref should be all one checkSNV(gr4, mode = "wr", ref_col = "ref")
This function filters the SNV Impact GRanges
object returned by the
SNVImpactG4
function based on user-defined thresholds for the
G4.info.score
, mut.score
, and score.diff
parameters.
This function filters SNVs that may significantly impact the formation of G4
structures using customizable filtering criteria. You are not required to
specify all three threshold parameters. However, at least one threshold
parameter must be provided.
filterSNVImpact( gr, raw_score_threshold = NULL, mut_score_threshold = NULL, score_diff_threshold = NULL )
filterSNVImpact( gr, raw_score_threshold = NULL, mut_score_threshold = NULL, score_diff_threshold = NULL )
gr |
A |
raw_score_threshold |
A positive numeric value no greater than 4 used
as the threshold for the absolute value of |
mut_score_threshold |
A positive numeric value no greater than 4 used
as the threshold for the absolute value of |
score_diff_threshold |
A negative numeric value no less than -4 used as
the threshold for |
A filtered GRanges
object, containing only the records that
meet the specified threshold criteria.
SNVImpactG4
for assessing the impact of SNVs on
G4 formation.
if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } if (!requireNamespace("GenomicRanges", quietly = TRUE)) { BiocManager::install("GenomicRanges") } if (!requireNamespace("BSgenome.Hsapiens.UCSC.hg19", quietly = TRUE)) { BiocManager::install("BSgenome.Hsapiens.UCSC.hg19") } library(GenomicRanges) library(BSgenome.Hsapiens.UCSC.hg19) hg19 <- BSgenome.Hsapiens.UCSC.hg19 chr21_seq <- DNAStringSet(hg19$chr21) # Chromosome name is needed names(chr21_seq) <- "chr21" G4 <- G4HunterDetect(chr21_seq) data(snp_gr) res_snp <- SNVImpactG4(G4, snp_gr, alt_col = "alt") filtered_snv_eff <- filterSNVImpact(res_snp, mut_score_threshold = 1.2, score_diff_threshold = -0.2 ) print(filtered_snv_eff)
if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } if (!requireNamespace("GenomicRanges", quietly = TRUE)) { BiocManager::install("GenomicRanges") } if (!requireNamespace("BSgenome.Hsapiens.UCSC.hg19", quietly = TRUE)) { BiocManager::install("BSgenome.Hsapiens.UCSC.hg19") } library(GenomicRanges) library(BSgenome.Hsapiens.UCSC.hg19) hg19 <- BSgenome.Hsapiens.UCSC.hg19 chr21_seq <- DNAStringSet(hg19$chr21) # Chromosome name is needed names(chr21_seq) <- "chr21" G4 <- G4HunterDetect(chr21_seq) data(snp_gr) res_snp <- SNVImpactG4(G4, snp_gr, alt_col = "alt") filtered_snv_eff <- filterSNVImpact(res_snp, mut_score_threshold = 1.2, score_diff_threshold = -0.2 ) print(filtered_snv_eff)
This function detects G4 sequences for a given DNAStringSet
object
using the G4Hunter algorithm.
G4HunterDetect( sequences = NULL, threshold = 1.5, window_size = 25, include_sequences = TRUE, strands = "b" )
G4HunterDetect( sequences = NULL, threshold = 1.5, window_size = 25, include_sequences = TRUE, strands = "b" )
sequences |
A |
threshold |
A numeric value specifying the threshold for the G4Hunter score (absolute value). Default is 1.5. We do not recommend setting the threshold below 1.2. |
window_size |
An integer specifying the window size (bp) for prediction. Default is 25. Another commonly used window size is 20. However, 25 is generally preferred unless otherwise required. |
include_sequences |
A logical value ( |
strands |
A character string ( |
A GRanges
object containing the predicted G4 sequences.
The GRanges object includes the following metadata columns:
score
The actual G4Hunter score for the G4 sequence detected (after merging, pruning, etc.).
max_score
The maximum G4Hunter score for the window covering this G4.
sequence
The G4 sequence after merging, pruning, etc.
If no G4 sequences were detected, it will return an empty GRanges
object.
loadSequence
for loading genome sequences into a
DNAStringSet
object.
if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } if (!requireNamespace("Biostrings", quietly = TRUE)) { BiocManager::install("Biostrings") } library(Biostrings) sequences <- DNAStringSet(c( "AGTGAATGGGATGGGAGGAGGGACGGGGTAGTACAGCATAGCATG", "TAGGTAGCTACGACACCCTGCCCTACCCCTACCCCTATCTA" )) names(sequences) <- c("seq1", "seq2") G4s <- G4HunterDetect(sequences, threshold = 1.5, window_size = 25) print(G4s) seq_path <- system.file("extdata", "seq.fa", package = "G4SNVHunter") G4s <- G4HunterDetect(loadSequence(seq_path = seq_path)) print(G4s) seq_path <- system.file("extdata", "seq.txt", package = "G4SNVHunter") G4s <- G4HunterDetect(loadSequence(seq_path = seq_path)) print(G4s)
if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } if (!requireNamespace("Biostrings", quietly = TRUE)) { BiocManager::install("Biostrings") } library(Biostrings) sequences <- DNAStringSet(c( "AGTGAATGGGATGGGAGGAGGGACGGGGTAGTACAGCATAGCATG", "TAGGTAGCTACGACACCCTGCCCTACCCCTACCCCTATCTA" )) names(sequences) <- c("seq1", "seq2") G4s <- G4HunterDetect(sequences, threshold = 1.5, window_size = 25) print(G4s) seq_path <- system.file("extdata", "seq.fa", package = "G4SNVHunter") G4s <- G4HunterDetect(loadSequence(seq_path = seq_path)) print(G4s) seq_path <- system.file("extdata", "seq.txt", package = "G4SNVHunter") G4s <- G4HunterDetect(loadSequence(seq_path = seq_path)) print(G4s)
This function calculates the G4Hunter score for a given nucleotide sequence, which reflects the ability of that sequence to form a G4 structure.
G4HunterScore(seq = NULL)
G4HunterScore(seq = NULL)
seq |
A single character string representing the nucleotide sequence.
Must contain only the characters |
A numeric value representing the G4Hunter score for the provided sequence.
G4HunterDetect
for detecting the G4 sequences in a
given DNAStringSet
object.
sequence <- "GGGTAAGGGATGGGTCGGG" score <- G4HunterScore(sequence) print(score) # A negative value indicates that the G4 sequence # is located on the reverse strand sequence <- "GGGTAAGGGATGGGTCGGG" score <- G4HunterScore(sequence) print(score)
sequence <- "GGGTAAGGGATGGGTCGGG" score <- G4HunterScore(sequence) print(score) # A negative value indicates that the G4 sequence # is located on the reverse strand sequence <- "GGGTAAGGGATGGGTCGGG" score <- G4HunterScore(sequence) print(score)
This function loads genomic sequences from various sources, including a FASTA file, a text file with sequence identifiers and corresponding sequences, or a data frame object.
loadSequence(genome_seq = NULL, seq_path = NULL)
loadSequence(genome_seq = NULL, seq_path = NULL)
genome_seq |
A data frame containing sequence identifiers and corresponding sequences. |
seq_path |
A character string specifying the file path to a FASTA file
(suffixed with |
A DNAStringSet
object containing the genome sequences.
# File path for sequences in fasta format fa_path <- system.file("extdata", "seq.fa", package = "G4SNVHunter") seq <- loadSequence(seq_path = fa_path) print(seq) # Another example # Load sequences from data.frame seq_df <- data.frame( chr = c("seq1", "seq2"), sequence = c( paste0(rep("G", 100), collapse = ""), paste0(rep("A", 100), collapse = "") ) ) seq <- loadSequence(genome_seq = seq_df) print(seq)
# File path for sequences in fasta format fa_path <- system.file("extdata", "seq.fa", package = "G4SNVHunter") seq <- loadSequence(seq_path = fa_path) print(seq) # Another example # Load sequences from data.frame seq_df <- data.frame( chr = c("seq1", "seq2"), sequence = c( paste0(rep("G", 100), collapse = ""), paste0(rep("A", 100), collapse = "") ) ) seq <- loadSequence(genome_seq = seq_df) print(seq)
This function generates a series of plots to visualize basic statistics of G4 sequences predicted by the G4Hunter algorithm. The function produces the following plots:
Distribution of max scores (absolute values).
Distribution of max scores, split by strand.
Distribution of scores (absolute values).
Distribution of scores, split by strand.
Distribution of sequence lengths (with lengths greater than 50 bp grouped into a single bin).
Distribution of sequence lengths, split by strand (with lengths greater than 50 bp grouped into a single bin).
plotG4Info( G4, p_colors = c("#6B9ECC", "#D91117", "#0E619C", "#58AC7B", "#D91117", "#0E619C", "#F39C40", "#D91117", "#0E619C") )
plotG4Info( G4, p_colors = c("#6B9ECC", "#D91117", "#0E619C", "#58AC7B", "#D91117", "#0E619C", "#F39C40", "#D91117", "#0E619C") )
G4 |
A GRanges object containing G4 sequences. The object must include at least the following metadata columns:
The GRanges object must also have a |
p_colors |
A vector of colors to be used for the plots. It should contain nine color values for the different plot elements. |
A combined plot displays all the generated plots arranged in a grid layout.
G4HunterDetect
for detecting the G4 sequences in a
given DNAStringSet
object.
seq_path <- system.file("extdata", "seq.txt", package = "G4SNVHunter") G4 <- G4HunterDetect(loadSequence(seq_path = seq_path)) plotG4Info(G4)
seq_path <- system.file("extdata", "seq.txt", package = "G4SNVHunter") G4 <- G4HunterDetect(loadSequence(seq_path = seq_path)) plotG4Info(G4)
This function plot sequence logos to visualize sequence variants caused by SNVs or SNPs, with the location of the variants highlighted by rectangles and arrows.
plotImpactSeq(filtered_gr, ncol = 1)
plotImpactSeq(filtered_gr, ncol = 1)
filtered_gr |
A |
ncol |
An integer specifying the number of columns in the output plot grid. Default is 1. |
A plot that displays the grid of sequence logos, showing the differences between the original and mutated sequences.
SNVImpactG4
for evaluating the impact of SNVs on G4
formation, and filterSNVImpact
for filtering G4s that are
significantly affected by SNVs.
if (!requireNamespace("GenomicRanges", quietly = TRUE)) { BiocManager::install("GenomicRanges") } library(GenomicRanges) seq <- data.frame(chr = c("seq1", "seq2"), seq = c("ATTTGGGAGGGAGGGAGGGATGATGAAAATTTTATTTATTTTATTTA", "TTTATACTATTCCCTTACCCTCCCATCCCCATACGGCATCTAGATC")) seq_gr <- loadSequence(seq) G4 <- G4HunterDetect(seq_gr) snv_gr <- GRanges(seqnames = c("seq1", "seq2"), ranges = IRanges(start = c(18, 23), width = 1), ref = c("G", "C"), alt = c("C", "G")) effect <- SNVImpactG4(G4, snv_gr, alt_col = "alt") plotImpactSeq(effect, ncol = 2)
if (!requireNamespace("GenomicRanges", quietly = TRUE)) { BiocManager::install("GenomicRanges") } library(GenomicRanges) seq <- data.frame(chr = c("seq1", "seq2"), seq = c("ATTTGGGAGGGAGGGAGGGATGATGAAAATTTTATTTATTTTATTTA", "TTTATACTATTCCCTTACCCTCCCATCCCCATACGGCATCTAGATC")) seq_gr <- loadSequence(seq) G4 <- G4HunterDetect(seq_gr) snv_gr <- GRanges(seqnames = c("seq1", "seq2"), ranges = IRanges(start = c(18, 23), width = 1), ref = c("G", "C"), alt = c("C", "G")) effect <- SNVImpactG4(G4, snv_gr, alt_col = "alt") plotImpactSeq(effect, ncol = 2)
This function generates two plots for visualizing the impact of SNVs on G4 formation: 1. A scatter plot with density shading comparing the original G4Hunter score and the mutant G4Hunter score. 2. A density plot showing the distribution of score changes between the original and mutant G4 sequences.
plotSNVImpact(gr, p_colors = c("#b22d2d", "#6ca4d6", "#2d69b0", "#1f77b4"))
plotSNVImpact(gr, p_colors = c("#b22d2d", "#6ca4d6", "#2d69b0", "#1f77b4"))
gr |
A
|
p_colors |
A vector of four colors used for plotting. |
A combined plot (using plot_grid
) containing two subplots:
- Density scatter plot comparing original vs mutant G4Hunter scores.
- Density plot of the G4Hunter score differences.
SNVImpactG4
for evaluating the impact of SNVs on G4
formation, and filterSNVImpact
for filtering G4s that are
significantly affected by SNVs.
if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } if (!requireNamespace("GenomicRanges", quietly = TRUE)) { BiocManager::install("GenomicRanges") } if (!requireNamespace("BSgenome.Hsapiens.UCSC.hg19", quietly = TRUE)) { BiocManager::install("BSgenome.Hsapiens.UCSC.hg19") } library(GenomicRanges) library(BSgenome.Hsapiens.UCSC.hg19) hg19 <- BSgenome.Hsapiens.UCSC.hg19 chr21_seq <- DNAStringSet(hg19$chr21) # Chromosome name is needed names(chr21_seq) <- "chr21" G4 <- G4HunterDetect(chr21_seq) # Load SNVs data(snp_gr) res_snp <- SNVImpactG4(G4, snp_gr, alt_col = "alt") plotSNVImpact(res_snp)
if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } if (!requireNamespace("GenomicRanges", quietly = TRUE)) { BiocManager::install("GenomicRanges") } if (!requireNamespace("BSgenome.Hsapiens.UCSC.hg19", quietly = TRUE)) { BiocManager::install("BSgenome.Hsapiens.UCSC.hg19") } library(GenomicRanges) library(BSgenome.Hsapiens.UCSC.hg19) hg19 <- BSgenome.Hsapiens.UCSC.hg19 chr21_seq <- DNAStringSet(hg19$chr21) # Chromosome name is needed names(chr21_seq) <- "chr21" G4 <- G4HunterDetect(chr21_seq) # Load SNVs data(snp_gr) res_snp <- SNVImpactG4(G4, snp_gr, alt_col = "alt") plotSNVImpact(res_snp)
This dataset contains a GRanges
object storing single nucleotide
polymorphisms (SNPs).
data(snp_gr)
data(snp_gr)
A GRanges
object with 30,000 SNPs.
This dataset contains a GRanges
object storing single nucleotide
variants (SNVs).
data(snv_gr)
data(snv_gr)
A GRanges
object with 50,000 SNVs.
This function evaluates the impact of SNVs on G4 formation.
SNVImpactG4( G4 = NULL, snvs = NULL, alt_col = "alt", mode = "s", sampleid_col = NULL, snvid_col = NULL )
SNVImpactG4( G4 = NULL, snvs = NULL, alt_col = "alt", mode = "s", sampleid_col = NULL, snvid_col = NULL )
G4 |
A |
snvs |
A |
alt_col |
A character string specifying the name of the column in
|
mode |
A character string indicating the mode of operation.
Set to |
sampleid_col |
A character string specifying the name of the column in
|
snvid_col |
A character string specifying the column name in
|
A GRanges
object depending on the mode
parameter:
For mode = "s"
, the returned GRanges
object includes the
following metadata columns:
seqnames
Identifiers for SNVs.
ranges
Position of the SNVs.
strand
Strand of the SNVs.
SNV.info.*
Metadata columns related to each SNV.
G4.info.*
Metadata columns from the original G4 object.
mut.G4.seq
The mutated G4 sequence after applying the
SNV change.
mut.G4.anno.seq
The mutated G4 sequence, with the
mutated bases annotated using square brackets.
mut.score
The G4Hunter score of the mutated sequence.
score.diff
The difference between the mutated G4Hunter
score and the original G4Hunter score.
The value is calculated as the absolute value of the mutated G4Hunter
score minus the absolute value of the original G4Hunter score.
For mode = "m"
, the returned GRanges
object includes the
following metadata columns:
seqnames
Identifiers for G4 sequences.
ranges
Position of the G4 sequences (start and end).
strand
Strand of the G4 sequences.
G4.info.*
Metadata columns from the original G4 object.
snv.ids
Concatenated SNV IDs for all SNVs affecting the
G4 region.
sample.ids
A semicolon-separated list of sample IDs
overlapping each G4 region.
mut.G4.seq
The mutated G4 sequence after applying the
combined SNV changes.
mut.G4.anno.seq
The mutated G4 sequence, with the
mutated bases annotated using square brackets.
mut.score
The G4Hunter score of the mutated sequence.
score.diff
The difference between the mutated G4Hunter
score and the original G4Hunter score. This value is calculated as the
absolute value of the mutated G4Hunter score minus the absolute value
of the original G4Hunter score.
G4HunterDetect
for detecting the G4 sequences in a
given DNAStringSet
object.
G4HunterScore
for calculating the G4Hunter scores
for a given sequence.
filterSNVImpact
for filtering out SNVs with
significant impact.
if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } if (!requireNamespace("GenomicRanges", quietly = TRUE)) { BiocManager::install("GenomicRanges") } if (!requireNamespace("BSgenome.Hsapiens.UCSC.hg19", quietly = TRUE)) { BiocManager::install("BSgenome.Hsapiens.UCSC.hg19") } library(GenomicRanges) library(BSgenome.Hsapiens.UCSC.hg19) # Load sequence for chromosome 21 (hg19) hg19 <- BSgenome.Hsapiens.UCSC.hg19 chr21_seq <- DNAStringSet(hg19$chr21) # Chromosome name is needed names(chr21_seq) <- "chr21" # Detect G4s in human chromosome 21 G4 <- G4HunterDetect(chr21_seq) # 's' mode # Load SNPs data(snp_gr) # Obtain SNPs that overlap with G4 regions and assess their impact on G4. # In variant-centric mode ('s'), evaluating each SNP individually. res_snp <- SNVImpactG4(G4, snp_gr, alt_col = "alt") print(res_snp) # 'm' mode # Load SNVs data(snv_gr) # Obtain SNVs that overlap with G4 regions and assess their impact on G4. # In sample-centric mode ('m'), evaluate the combined impact of SNVs on G4s. # Grouped by the sample IDs specified in 'sampleid_col'. res_snv <- SNVImpactG4(G4, snv_gr, alt_col = "alt", mode = "m", sampleid_col = "sampleid", snvid_col = "snv_id" ) print(res_snv)
if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } if (!requireNamespace("GenomicRanges", quietly = TRUE)) { BiocManager::install("GenomicRanges") } if (!requireNamespace("BSgenome.Hsapiens.UCSC.hg19", quietly = TRUE)) { BiocManager::install("BSgenome.Hsapiens.UCSC.hg19") } library(GenomicRanges) library(BSgenome.Hsapiens.UCSC.hg19) # Load sequence for chromosome 21 (hg19) hg19 <- BSgenome.Hsapiens.UCSC.hg19 chr21_seq <- DNAStringSet(hg19$chr21) # Chromosome name is needed names(chr21_seq) <- "chr21" # Detect G4s in human chromosome 21 G4 <- G4HunterDetect(chr21_seq) # 's' mode # Load SNPs data(snp_gr) # Obtain SNPs that overlap with G4 regions and assess their impact on G4. # In variant-centric mode ('s'), evaluating each SNP individually. res_snp <- SNVImpactG4(G4, snp_gr, alt_col = "alt") print(res_snp) # 'm' mode # Load SNVs data(snv_gr) # Obtain SNVs that overlap with G4 regions and assess their impact on G4. # In sample-centric mode ('m'), evaluate the combined impact of SNVs on G4s. # Grouped by the sample IDs specified in 'sampleid_col'. res_snv <- SNVImpactG4(G4, snv_gr, alt_col = "alt", mode = "m", sampleid_col = "sampleid", snvid_col = "snv_id" ) print(res_snv)