Title: | FASTQ Quality Control |
---|---|
Description: | Assessment of FASTQ file format with multiple metrics including quality score, sequence content, overrepresented sequence and Kmers. |
Authors: | Wenyue Xing [aut], August Guang [aut, cre] |
Maintainer: | August Guang <[email protected]> |
License: | Artistic-2.0 |
Version: | 1.23.0 |
Built: | 2024-11-19 06:12:24 UTC |
Source: | https://github.com/bioc/qckitfastq |
Creates a sorted from most frequent to least frequent abundance table of adapters that are found to be present in the reads at greater than 0.1% of the reads. If output_file is selected then will save the entire set of adapters and counts. Only available for macOS/Linux due to dependency on C++14.
adapter_content(infile, adapter_file = system.file("extdata", "adapters.txt", package = "qckitfastq"), output_file = NA)
adapter_content(infile, adapter_file = system.file("extdata", "adapters.txt", package = "qckitfastq"), output_file = NA)
infile |
the path to a gzipped FASTQ file |
adapter_file |
Path to adapters.txt file. Default from package. |
output_file |
File to save data frame to. Default NA. |
Sorted table of adapters and counts.
if(.Platform$OS.type != "windows") { infile <- system.file("extdata","test.fq.gz", package = "qckitfastq") adapter_content(infile)[1:5] }
if(.Platform$OS.type != "windows") { infile <- system.file("extdata","test.fq.gz", package = "qckitfastq") adapter_content(infile)[1:5] }
Compute adapter content in reads. This function is only available for macOS/Linux.
calc_adapter_content(infile, adapters)
calc_adapter_content(infile, adapters)
infile |
filepath to fastq sequence |
adapters |
filepath to adapters |
map object with adapter names as the key and the number of times the adapters appears in the reads as the value
if(.Platform$OS.type != "windows") { adapter_file <- system.file("extdata", "adapters.txt", package = "qckitfastq") infile <- system.file("extdata", "test.fq.gz", package = "qckitfastq") content <- calc_adapter_content(infile, adapter_file) }
if(.Platform$OS.type != "windows") { adapter_file <- system.file("extdata", "adapters.txt", package = "qckitfastq") infile <- system.file("extdata", "test.fq.gz", package = "qckitfastq") content <- calc_adapter_content(infile, adapter_file) }
Calculate score based on Illumina format
calc_format_score(score, score_format)
calc_format_score(score, score_format)
score |
An ascii quality score from the fastq |
score_format |
The illumina format |
a string as with the best guess as to the illumina format
calc_format_score("A","Sanger")
calc_format_score("A","Sanger")
Calculate sequece counts for each unique sequence and create a table with unique sequences and corresponding counts
calc_over_rep_seq(infile, min_size = 5L, buffer_size = 1000000L)
calc_over_rep_seq(infile, min_size = 5L, buffer_size = 1000000L)
infile |
A string giving the path for the fastqfile |
min_size |
An int for thhresholding over representation |
buffer_size |
An int for the number of lines to keep in memory |
calculate overrepresented sequence count
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") calc_over_rep_seq(infile)[seq_len(5)]
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") calc_over_rep_seq(infile)[seq_len(5)]
Extract the number of columns and rows for a FASTQ file using seqTools.
dimensions(fseq, sel)
dimensions(fseq, sel)
fseq |
an object that is the read result of the seq.read function |
sel |
'reads' for #reads/rows, 'positions' for #positions/columns |
a numeric value of the number of reads or the number of positions
infile <- system.file("extdata","10^5_reads_test.fq.gz", package = "qckitfastq") fseq <- seqTools::fastqq(infile,k=6) dimensions(fseq,"reads")
infile <- system.file("extdata","10^5_reads_test.fq.gz", package = "qckitfastq") fseq <- seqTools::fastqq(infile,k=6) dimensions(fseq,"reads")
Gets quality score encoding format from the FASTQ file. Return possibilities are Sanger(/Illumina1.8), Solexa(/Illumina1.0), Illumina1.3, and Illumina1.5. This encoding is heuristic based and may not be 100 since there is overlap in the encodings used, so it is best if you already know the format.
find_format(infile, reads_used)
find_format(infile, reads_used)
infile |
A string giving the path for the fastq file |
reads_used |
int, the number of reads to use to determine the encoding format. |
A string denoting the read format. Possibilities are Sanger, Solexa, Illumina1.3, and Illumina1.5.
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") find_format(infile,100)
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") find_format(infile,100)
Calculates GC content percentage for each read in the dataset.
GC_content(infile, output_file = NA)
GC_content(infile, output_file = NA)
infile |
the object that is the path to the FASTQ file |
output_file |
File to write results to. Default NA. |
Data frame with read ID and GC content of each read.
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") head(GC_content(infile))
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") head(GC_content(infile))
Calculate GC nucleotide sequence content per read of the FASTQ gzipped file
gc_per_read(infile)
gc_per_read(infile)
infile |
A string giving the path for the fastqfile |
GC content perncentage per read
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") gc_per_read(infile)[1:10]
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") gc_per_read(infile)[1:10]
Return kmer count per sequence for the length of kmer desired
kmer_count(infile, k, output_file = NA)
kmer_count(infile, k, output_file = NA)
infile |
the object that is the path to gzippped FASTQ file |
k |
the length of kmer |
output_file |
File to save plot to. Default NA. |
kmers counts per sequence
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") km<-kmer_count(infile,k=4) km[1:20,1:10]
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") km<-kmer_count(infile,k=4) km[1:20,1:10]
Generate overrepresented kmers of length k based on their observed to expected ratio at each position across all sequences in the dataset. The expected proportion of a length k kmer assumes site independence and is computed as the sum of the count of each base pair in the kmer times the probability of observing that base pair in the data set, i.e. P(A)count_in_kmer(A)+P(C)count_in_kmer(C)+... The observed to expected ratio is computed as log2(obs/exp). Those with obsexp_ratio > 2 are considered to be overrepresented and appear in the returned data frame along with their position in the sequence.
overrep_kmer(infile, k, output_file = NA)
overrep_kmer(infile, k, output_file = NA)
infile |
path to gzipped FASTQ file |
k |
the kmer length |
output_file |
File to save plot to. Default NA. |
Data frame with columns: Position (in read), Obsexp_ratio, & Kmer
infile <-system.file("extdata", "test.fq.gz", package = "qckitfastq") overrep_kmer(infile,k=4)
infile <-system.file("extdata", "test.fq.gz", package = "qckitfastq") overrep_kmer(infile,k=4)
Sort all sequences per read by count.
overrep_reads(infile, output_file = NA)
overrep_reads(infile, output_file = NA)
infile |
Path to gzippped FASTQ file. |
output_file |
File to save data frame to. Default NA. |
Table of sequences sorted by count.
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") overrep_reads(infile)[1:5,]
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") overrep_reads(infile)[1:5,]
Compute the mean, median, and percentiles of quality score per base. This is returned as a data frame.
per_base_quality(infile, output_file = NA)
per_base_quality(infile, output_file = NA)
infile |
Path to a gzippped FASTQ file |
output_file |
File to write results in CSV format to. Default NA. |
A dataframe of the mean, median and quantiles of the FASTQ file
Wenyue Xing, [email protected]
August Guang, [email protected]
per_base_quality(system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq"))
per_base_quality(system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq"))
per_read_quality
Compute the mean quality score per read.
per_read_quality
per_read_quality(infile, output_file = NA)
per_read_quality(infile, output_file = NA)
infile |
Path to FASTQ file |
output_file |
File to write plot to. Will not write to file if NA. Default NA. |
Data frame of mean quality score per read
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") prq <- per_read_quality(infile)
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") prq <- per_read_quality(infile)
Creates a bar plot of the top 5 most present adapter sequences.
plot_adapter_content(ac_sorted, output_file = NA)
plot_adapter_content(ac_sorted, output_file = NA)
ac_sorted |
Sorted table of adapters and counts. |
output_file |
File to save data frame to. Default NA. |
Barplot of top 5 most frequent adapter sequences.
if(.Platform$OS.type != "windows") { infile <- system.file("extdata", "test.fq.gz", package = "qckitfastq") ac_sorted <- adapter_content(infile) plot_adapter_content(ac_sorted) }
if(.Platform$OS.type != "windows") { infile <- system.file("extdata", "test.fq.gz", package = "qckitfastq") ac_sorted <- adapter_content(infile) plot_adapter_content(ac_sorted) }
Generate mean GC content histogram.
plot_GC_content(gc_df, output_file = NA)
plot_GC_content(gc_df, output_file = NA)
gc_df |
the object that is the GC content vectors generated from GC content function |
output_file |
File to write plot to. Will not write to file if NA. Default NA. |
A histogram of mean GC content.
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") gc_df<-GC_content(infile) plot_GC_content(gc_df)
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") gc_df<-GC_content(infile) plot_GC_content(gc_df)
Determine how to plot outliers. Heuristic used is whether their obsexp_ratio differs by more than 1 and whether they fall into the same bin or not. If for 2 outliers, obsexp_ratio differs by less than .4 and they are in the same bin, then combine into a single plotting point. NOT FULLY FUNCTIONAL
plot_outliers(overkm, top_num)
plot_outliers(overkm, top_num)
overkm |
data frame with columns pos, obsexp_ratio, and kmer that has already been reordered by descending obsexp_ratio |
top_num |
number of most overrepresented kmers to plot. Default is 5. |
currently 0 as function is not fully working.
Create a box plot of the log2(observed/expected) ratio across the length of the sequence as well as top overrepresented kmers. Only ratios greater than 2 are included in the box plot. Default is 20 bins across the length of the sequence and the top 2 overrepresented kmers, but this can be changed by the user.
plot_overrep_kmer(overkm, bins = 20, top_num = 2, output_file = NA)
plot_overrep_kmer(overkm, bins = 20, top_num = 2, output_file = NA)
overkm |
data frame with columns pos, obsexp_ratio, and kmer |
bins |
number of intervals across the length of the sequence |
top_num |
number of most overrepresented kmers to plot |
output_file |
File to write plot to. Will not write to file if NA. Default NA. |
A box plot of the log2(observed/expected ratio) across the length of the sequence
infile <- system.file("extdata", "test.fq.gz", package = "qckitfastq") over_km <- overrep_kmer(infile,k=4) plot_overrep_kmer(over_km)
infile <- system.file("extdata", "test.fq.gz", package = "qckitfastq") over_km <- overrep_kmer(infile,k=4) plot_overrep_kmer(over_km)
Plot the top 5 seqeunces
plot_overrep_reads(overrep_reads, output_file = NA)
plot_overrep_reads(overrep_reads, output_file = NA)
overrep_reads |
the table that sorts the sequence content and corresponding counts in descending order |
output_file |
File to save plot to. Will not write to file if NA. Default NA. |
plot of the top 5 overrepresented sequences
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") overrep_df <- overrep_reads(infile) plot_overrep_reads(overrep_df)
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") overrep_df <- overrep_reads(infile) plot_overrep_reads(overrep_df)
Generate a boxplot of the per position quality score.
plot_per_base_quality(per_base_quality, output_file = NA)
plot_per_base_quality(per_base_quality, output_file = NA)
per_base_quality |
a data frame of the mean, median and quantiles of sequence quality per base. Most likely generated with the 'per_base_quality' function. |
output_file |
File to save plot to. Will not write to file if NA. Default NA. |
A boxplot of per position quality score distribution.
pbq <- per_base_quality(system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq")) plot_per_base_quality(pbq)
pbq <- per_base_quality(system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq")) plot_per_base_quality(pbq)
plot_per_read_quality
Plot the mean quality score per sequence as a histogram.
High quality sequences are those mostly distributed over 30.
Low quality sequences are those mostly under 30.
plot_per_read_quality
plot_per_read_quality(prq, output_file = NA)
plot_per_read_quality(prq, output_file = NA)
prq |
Data frame from per_read_quality function |
output_file |
File to write plot to. Will not write to file if NA. Default NA. |
Plot of mean quality score per read
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") prq <- per_read_quality(infile) plot_per_read_quality(prq)
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") prq <- per_read_quality(infile) plot_per_read_quality(prq)
Plot the per position nucleotide content.
plot_read_content(read_content, output_file = NA)
plot_read_content(read_content, output_file = NA)
read_content |
Data frame produced by read_content function. |
output_file |
File to save plot to. Will not write to file if NA. Default NA. |
ggplot line plot of all nucleotide content inclding A, T, G, C and N
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") fseq <- seqTools::fastqq(infile,k=6) read_content <- read_content(fseq) plot_read_content(read_content)
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") fseq <- seqTools::fastqq(infile,k=6) read_content <- read_content(fseq) plot_read_content(read_content)
Plot a histogram of the number of reads with each read length.
plot_read_length(read_len, output_file = NA)
plot_read_length(read_len, output_file = NA)
read_len |
Data frame of read lengths and number of reads with that length. |
output_file |
File to save plot to. Default is NA, i.e. do not write to file. |
A histogram of the read length distribution.
Wenyue Xing, [email protected], August Guang, [email protected]
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") fseq <- seqTools::fastqq(infile,k=6) read_len <- read_length(fseq) plot_read_length(read_len)
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") fseq <- seqTools::fastqq(infile,k=6) read_len <- read_length(fseq) plot_read_length(read_len)
Calculate the mean quality score per read of the FASTQ gzipped file
qual_score_per_read(infile)
qual_score_per_read(infile)
infile |
A string giving the path for the fastqfile |
mean quality per read
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") qual_score_per_read(infile)$q50_per_position[1:10]
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") qual_score_per_read(infile)$q50_per_position[1:10]
Compute nucleotide content per position for a single base pair. Wrapper function around seqTools.
read_base_content(fseq, content)
read_base_content(fseq, content)
fseq |
a seqTools::fastqq object |
content |
nucleotide. Options are "A","T","G","C","N"(either capital or lower case) |
Nucleotide sequence content per position.
Wenyue Xing, [email protected], August Guang [email protected]
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") fseq <- seqTools::fastqq(infile,k=6) read_base_content(fseq,"A")
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") fseq <- seqTools::fastqq(infile,k=6) read_base_content(fseq,"A")
Compute nucleotide content per position. Wrapper function around seqTools.
read_content(fseq, output_file = NA)
read_content(fseq, output_file = NA)
fseq |
a seqTools::fastqq object |
output_file |
File to write results in CSV format to. Will not write to file if NA. Default NA. |
Data frame of nucleotide sequence content per position
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") fseq <- seqTools::fastqq(infile,k=6) read_content(fseq)
infile <- system.file("extdata", "10^5_reads_test.fq.gz", package = "qckitfastq") fseq <- seqTools::fastqq(infile,k=6) read_content(fseq)
Creates a data frame of read lengths and the number of reads with that read length.
read_length(fseq, output_file = NA)
read_length(fseq, output_file = NA)
fseq |
a seqTools object produced by seqTools::fastqq on the raw FASTQ file |
output_file |
File to save data frame to. Default NA. |
Data frame of read lengths and number of reads with that length.
infile <- system.file("extdata","test.fq.gz", package = "qckitfastq") fseq <- seqTools::fastqq(infile,k=6) read_len <- read_length(fseq)
infile <- system.file("extdata","test.fq.gz", package = "qckitfastq") fseq <- seqTools::fastqq(infile,k=6) read_len <- read_length(fseq)
Will run all functions in the qckitfastq suite and save the data frames and plots to a user-provided directory. Plot names are supplied by default.
run_all(infile, dir)
run_all(infile, dir)
infile |
Path to gzipped FASTQ file |
dir |
Directory to save results to |
Generate files from all functions
infile <- system.file("extdata", "test.fq.gz", package = "qckitfastq") testfolder <- tempdir() run_all(infile, testfolder)
infile <- system.file("extdata", "test.fq.gz", package = "qckitfastq") testfolder <- tempdir() run_all(infile, testfolder)