Title: | Accurate Inference of Genetic Ancestry from Cancer Sequences |
---|---|
Description: | This package implements specialized algorithms that enable genetic ancestry inference from various cancer sequences sources (RNA, Exome and Whole-Genome sequences). This package also implements a simulation algorithm that generates synthetic cancer-derived data. This code and analysis pipeline was designed and developed for the following publication: Belleau, P et al. Genetic Ancestry Inference from Cancer-Derived Molecular Data across Genomic and Transcriptomic Platforms. Cancer Res 1 January 2023; 83 (1): 49–58. |
Authors: | Pascal Belleau [cre, aut] , Astrid Deschênes [aut] , David A. Tuveson [aut] , Alexander Krasnitz [aut] |
Maintainer: | Pascal Belleau <[email protected]> |
License: | Apache License (>= 2) |
Version: | 1.5.0 |
Built: | 2024-11-19 04:14:47 UTC |
Source: | https://github.com/bioc/RAIDS |
The function extracts the information about the pruned SNVs from the 1KG GDS file and adds entries related to the pruned SNVs in the Profile GDS file. The following nodes are added to the Profile GDS file: 'sample.id', 'snp.id', 'snp.chromosome', 'snp.position', 'snp.index', 'genotype' and 'lap'.
add1KG2SampleGDS(gdsReference, fileProfileGDS, currentProfile, studyID)
add1KG2SampleGDS(gdsReference, fileProfileGDS, currentProfile, studyID)
gdsReference |
an object of class gds.class (a GDS file), the opened 1KG GDS file. |
fileProfileGDS |
a |
currentProfile |
a |
studyID |
a |
The function returns 0L
when successful.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Required library for GDS library(SNPRelate) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") ## The data.frame containing the information about the study ## The 3 mandatory columns: "studyID", "study.desc", "study.platform" ## The entries should be strings, not factors (stringsAsFactors=FALSE) studyDF <- data.frame(study.id="MYDATA", study.desc="Description", study.platform="PLATFORM", stringsAsFactors=FALSE) ## Temporary Profile file fileProfile <- file.path(tempdir(), "ex2.gds") ## Copy required file file.copy(file.path(dataDir, "ex1_demo_with_pruning.gds"), fileProfile) ## Open 1KG file gds1KG <- snpgdsOpen(fileGDS) ## Compute the list of pruned SNVs for a specific profile 'ex1' ## and save it in the Profile GDS file 'ex2.gds' add1KG2SampleGDS(gdsReference=gds1KG, fileProfileGDS=fileProfile, currentProfile=c("ex1"), studyID=studyDF$study.id) ## Close the 1KG GDS file (important) closefn.gds(gds1KG) ## Check content of Profile GDS file ## The 'pruned.study' entry should be present content <- openfn.gds(fileProfile) content ## Close the Profile GDS file (important) closefn.gds(content) ## Remove Profile GDS file (created for demo purpose) unlink(fileProfile, force=TRUE)
## Required library for GDS library(SNPRelate) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") ## The data.frame containing the information about the study ## The 3 mandatory columns: "studyID", "study.desc", "study.platform" ## The entries should be strings, not factors (stringsAsFactors=FALSE) studyDF <- data.frame(study.id="MYDATA", study.desc="Description", study.platform="PLATFORM", stringsAsFactors=FALSE) ## Temporary Profile file fileProfile <- file.path(tempdir(), "ex2.gds") ## Copy required file file.copy(file.path(dataDir, "ex1_demo_with_pruning.gds"), fileProfile) ## Open 1KG file gds1KG <- snpgdsOpen(fileGDS) ## Compute the list of pruned SNVs for a specific profile 'ex1' ## and save it in the Profile GDS file 'ex2.gds' add1KG2SampleGDS(gdsReference=gds1KG, fileProfileGDS=fileProfile, currentProfile=c("ex1"), studyID=studyDF$study.id) ## Close the 1KG GDS file (important) closefn.gds(gds1KG) ## Check content of Profile GDS file ## The 'pruned.study' entry should be present content <- openfn.gds(fileProfile) content ## Close the Profile GDS file (important) closefn.gds(content) ## Remove Profile GDS file (created for demo purpose) unlink(fileProfile, force=TRUE)
The function appends the information about the LD blocks into the Population Reference SNV Annotation GDS file. The information is extracted from the Population Reference GDS file and from the '.det' files.
addBlockFromDetFile( fileReferenceGDS, gdsRefAnnotFile, pathBlock, superPop, blockName = "ldBlock", blockDesc = "Not Define", verbose = FALSE )
addBlockFromDetFile( fileReferenceGDS, gdsRefAnnotFile, pathBlock, superPop, blockName = "ldBlock", blockDesc = "Not Define", verbose = FALSE )
fileReferenceGDS |
a |
gdsRefAnnotFile |
a |
pathBlock |
a |
superPop |
a |
blockName |
a |
blockDesc |
a |
verbose |
a |
More information about GDS file format can be found at the Bioconductor gdsfmt website: https://bioconductor.org/packages/gdsfmt/
The integer 0L
when the function is successful.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds") ## Demo of of output file det from the plink block ## command for chromosome 1 fileLdBlock <- file.path(dirname(fileAnnotGDS), "block.sp.EUR.Ex.chr1.blocks.det") file.copy(file.path(dataDir, "tests", "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS) file.copy(file.path(dataDir, "block.sp.EUR.Ex.chr1.blocks.det"), fileLdBlock) ## GDS Reference file fileReferenceGDS <- file.path(dataDir, "tests", "ex1_good_small_1KG.gds") ## Append information associated to blocks addBlockFromDetFile(fileReferenceGDS=fileReferenceGDS, gdsRefAnnotFile=fileAnnotGDS, pathBlock=dirname(fileAnnotGDS), superPop="EUR") gdsAnnot1KG <- openfn.gds(fileAnnotGDS) print(gdsAnnot1KG) closefn.gds(gdsAnnot1KG) ## Remove temporary file unlink(fileAnnotGDS, force=TRUE) unlink(fileLdBlock, force=TRUE)
## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds") ## Demo of of output file det from the plink block ## command for chromosome 1 fileLdBlock <- file.path(dirname(fileAnnotGDS), "block.sp.EUR.Ex.chr1.blocks.det") file.copy(file.path(dataDir, "tests", "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS) file.copy(file.path(dataDir, "block.sp.EUR.Ex.chr1.blocks.det"), fileLdBlock) ## GDS Reference file fileReferenceGDS <- file.path(dataDir, "tests", "ex1_good_small_1KG.gds") ## Append information associated to blocks addBlockFromDetFile(fileReferenceGDS=fileReferenceGDS, gdsRefAnnotFile=fileAnnotGDS, pathBlock=dirname(fileAnnotGDS), superPop="EUR") gdsAnnot1KG <- openfn.gds(fileAnnotGDS) print(gdsAnnot1KG) closefn.gds(gdsAnnot1KG) ## Remove temporary file unlink(fileAnnotGDS, force=TRUE) unlink(fileLdBlock, force=TRUE)
The function appends the information about the blocks into the Population Reference SNV Annotation GDS file. The information is extracted from the Population Reference GDS file.
addGeneBlockGDSRefAnnot( gdsReference, gdsRefAnnotFile, winSize = 10000, ensDb, suffixBlockName )
addGeneBlockGDSRefAnnot( gdsReference, gdsRefAnnotFile, winSize = 10000, ensDb, suffixBlockName )
gdsReference |
an object of class gds.class (a GDS file), the opened Reference GDS file. |
gdsRefAnnotFile |
a |
winSize |
a single positive |
ensDb |
An object with the Ensembl genome annotation
Default: |
suffixBlockName |
a |
The integer 0L
when the function is successful.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Required library library(SNPRelate) ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds") ## Required library if (requireNamespace("EnsDb.Hsapiens.v86", quietly=TRUE)) { file.copy(file.path(dataDir, "tests", "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS) ## Making a "short cut" on the ensDb object edb <- EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86 ## GDS Reference file fileReferenceGDS <- file.path(dataDir, "tests", "ex1_good_small_1KG.gds") ## Open the reference GDS file (demo version) gds1KG <- snpgdsOpen(fileReferenceGDS) ## Append information associated to blocks addGeneBlockGDSRefAnnot(gdsReference=gds1KG, gdsRefAnnotFile=fileAnnotGDS, ensDb=edb, suffixBlockName="EnsDb.Hsapiens.v86") gdsAnnot1KG <- openfn.gds(fileAnnotGDS) print(gdsAnnot1KG) print(read.gdsn(index.gdsn(gdsAnnot1KG, "block.annot"))) ## Close GDS files closefn.gds(gds1KG) closefn.gds(gdsAnnot1KG) ## Remove temporary file unlink(fileAnnotGDS, force=TRUE) }
## Required library library(SNPRelate) ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds") ## Required library if (requireNamespace("EnsDb.Hsapiens.v86", quietly=TRUE)) { file.copy(file.path(dataDir, "tests", "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS) ## Making a "short cut" on the ensDb object edb <- EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86 ## GDS Reference file fileReferenceGDS <- file.path(dataDir, "tests", "ex1_good_small_1KG.gds") ## Open the reference GDS file (demo version) gds1KG <- snpgdsOpen(fileReferenceGDS) ## Append information associated to blocks addGeneBlockGDSRefAnnot(gdsReference=gds1KG, gdsRefAnnotFile=fileAnnotGDS, ensDb=edb, suffixBlockName="EnsDb.Hsapiens.v86") gdsAnnot1KG <- openfn.gds(fileAnnotGDS) print(gdsAnnot1KG) print(read.gdsn(index.gdsn(gdsAnnot1KG, "block.annot"))) ## Close GDS files closefn.gds(gds1KG) closefn.gds(gdsAnnot1KG) ## Remove temporary file unlink(fileAnnotGDS, force=TRUE) }
The function appends the information about the blocks into the Population Reference SNV Annotation GDS file. The information is extracted from the Population Reference GDS file.
addGeneBlockRefAnnot( fileReferenceGDS, gdsRefAnnotFile, winSize = 10000, ensDb, suffixBlockName )
addGeneBlockRefAnnot( fileReferenceGDS, gdsRefAnnotFile, winSize = 10000, ensDb, suffixBlockName )
fileReferenceGDS |
a |
gdsRefAnnotFile |
a |
winSize |
a single positive |
ensDb |
An object with the Ensembl genome annotation
Default: |
suffixBlockName |
a |
The integer 0L
when the function is successful.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds") ## Required library if (requireNamespace("EnsDb.Hsapiens.v86", quietly=TRUE)) { file.copy(file.path(dataDir, "tests", "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS) ## Making a "short cut" on the ensDb object edb <- EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86 ## GDS Reference file fileReferenceGDS <- file.path(dataDir, "tests", "ex1_good_small_1KG.gds") ## Append information associated to blocks addGeneBlockRefAnnot(fileReferenceGDS=fileReferenceGDS, gdsRefAnnotFile=fileAnnotGDS, ensDb=edb, suffixBlockName="EnsDb.Hsapiens.v86") gdsAnnot1KG <- openfn.gds(fileAnnotGDS) print(gdsAnnot1KG) print(read.gdsn(index.gdsn(gdsAnnot1KG, "block.annot"))) closefn.gds(gdsAnnot1KG) ## Remove temporary file unlink(fileAnnotGDS, force=TRUE) }
## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds") ## Required library if (requireNamespace("EnsDb.Hsapiens.v86", quietly=TRUE)) { file.copy(file.path(dataDir, "tests", "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS) ## Making a "short cut" on the ensDb object edb <- EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86 ## GDS Reference file fileReferenceGDS <- file.path(dataDir, "tests", "ex1_good_small_1KG.gds") ## Append information associated to blocks addGeneBlockRefAnnot(fileReferenceGDS=fileReferenceGDS, gdsRefAnnotFile=fileAnnotGDS, ensDb=edb, suffixBlockName="EnsDb.Hsapiens.v86") gdsAnnot1KG <- openfn.gds(fileAnnotGDS) print(gdsAnnot1KG) print(read.gdsn(index.gdsn(gdsAnnot1KG, "block.annot"))) closefn.gds(gdsAnnot1KG) ## Remove temporary file unlink(fileAnnotGDS, force=TRUE) }
This function adds the information about the unrelated patients
to the Reference GDS file. More specifically, it creates the field
sample.ref
which has the value 1
when the sample
is unrelated and the value 0
otherwise.
The sample.ref
is filled based on the information present in the
input RDS file.
addRef2GDS1KG(fileNameGDS, filePart)
addRef2GDS1KG(fileNameGDS, filePart)
fileNameGDS |
a |
filePart |
a |
The integer 0L
when successful.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Locate RDS with unrelated/related status for 1KG samples dataDir <- system.file("extdata", package="RAIDS") rdsFilePath <- file.path(dataDir, "unrelatedPatientsInfo_Demo.rds") ## Create a temporary GDS file in an test directory dataDir <- system.file("extdata/tests", package="RAIDS") gdsFilePath <- file.path(dataDir, "GDS_TEMP_201.gds") ## Create and open the GDS file tmpGDS <- createfn.gds(filename=gdsFilePath) ## Create "sample.id" node (the node must be present) sampleIDs <- c("HG00104", "HG00109", "HG00110") add.gdsn(node=tmpGDS, name="sample.id", val=sampleIDs) ## Create "snp.id" node (the node must be present) snpIDs <- c("s1", "s2", "s3", "s4", "s5", "s6") add.gdsn(node=tmpGDS, name="snp.id", val=snpIDs) ## Create "snp.position" node (the node must be present) snpPositions <- c(16102, 51478, 51897, 51927, 54489, 54707) add.gdsn(node=tmpGDS, name="snp.position", val=snpPositions) ## Create "snp.chromosome" node (the node must be present) snpPositions <- c(1, 1, 1, 1, 1, 1) add.gdsn(node=tmpGDS, name="snp.chromosome", val=snpPositions) ## Create "genotype" node (the node must be present) genotype <- matrix(rep(1, 18), ncol = 3) add.gdsn(node=tmpGDS, name="genotype", val=genotype) ## Close GDS file closefn.gds(tmpGDS) ## Create "sample.ref" node in GDS file using RDS information addRef2GDS1KG(fileNameGDS=gdsFilePath, filePart=rdsFilePath) ## Read sample reference data.frame fileGDS <- openfn.gds(gdsFilePath, readonly=TRUE) read.gdsn(index.gdsn(node=fileGDS, path="sample.ref")) closefn.gds(gdsfile=fileGDS) ## Delete the temporary GDS file unlink(x=gdsFilePath, force=TRUE)
## Locate RDS with unrelated/related status for 1KG samples dataDir <- system.file("extdata", package="RAIDS") rdsFilePath <- file.path(dataDir, "unrelatedPatientsInfo_Demo.rds") ## Create a temporary GDS file in an test directory dataDir <- system.file("extdata/tests", package="RAIDS") gdsFilePath <- file.path(dataDir, "GDS_TEMP_201.gds") ## Create and open the GDS file tmpGDS <- createfn.gds(filename=gdsFilePath) ## Create "sample.id" node (the node must be present) sampleIDs <- c("HG00104", "HG00109", "HG00110") add.gdsn(node=tmpGDS, name="sample.id", val=sampleIDs) ## Create "snp.id" node (the node must be present) snpIDs <- c("s1", "s2", "s3", "s4", "s5", "s6") add.gdsn(node=tmpGDS, name="snp.id", val=snpIDs) ## Create "snp.position" node (the node must be present) snpPositions <- c(16102, 51478, 51897, 51927, 54489, 54707) add.gdsn(node=tmpGDS, name="snp.position", val=snpPositions) ## Create "snp.chromosome" node (the node must be present) snpPositions <- c(1, 1, 1, 1, 1, 1) add.gdsn(node=tmpGDS, name="snp.chromosome", val=snpPositions) ## Create "genotype" node (the node must be present) genotype <- matrix(rep(1, 18), ncol = 3) add.gdsn(node=tmpGDS, name="genotype", val=genotype) ## Close GDS file closefn.gds(tmpGDS) ## Create "sample.ref" node in GDS file using RDS information addRef2GDS1KG(fileNameGDS=gdsFilePath, filePart=rdsFilePath) ## Read sample reference data.frame fileGDS <- openfn.gds(gdsFilePath, readonly=TRUE) read.gdsn(index.gdsn(node=fileGDS, path="sample.ref")) closefn.gds(gdsfile=fileGDS) ## Delete the temporary GDS file unlink(x=gdsFilePath, force=TRUE)
The information about the samples present in the 1KG GDS file is added into the GDS Sample file. Only the information about the unrelated samples from the 1000 Genomes Study is copied into the GDS Sample file. The information is only added to the GDS Sample file when the 1KG Study is not already present in the GDS Sample file. The sample information for all selected samples is appended to the GDS Sample file "study.annot" node. The study information is appended to the GDS Sample file "study.list" node.
addStudy1Kg(gdsReference, fileProfileGDS, verbose = FALSE)
addStudy1Kg(gdsReference, fileProfileGDS, verbose = FALSE)
gdsReference |
an object of class gds.class (a GDS file), the opened 1KG GDS file. |
fileProfileGDS |
a |
verbose |
a |
The integer 0L
when successful.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Required library for GDS library(gdsfmt) ## Get the temp folder tempDir <- tempdir() ## Create a temporary 1KG GDS file and add needed information fileName1KG <- file.path(tempDir, "GDS_TEMP_addStudy1Kg_1KG.gds") gds1KG <- createfn.gds(filename=fileName1KG) add.gdsn(gds1KG, "sample.id", c("HTT101", "HTT102", "HTT103")) samples <- data.frame(sex=c(1, 1, 2), pop.group=c("GBR", "GIH", "GBR"), superPop=c("EUR", "SAS", "EUR"), batch=rep(0, 3), stringsAsFactors = FALSE) add.gdsn(gds1KG, "sample.annot", samples) add.gdsn(gds1KG, "sample.ref", c(1,0, 1)) sync.gds(gds1KG) ## Create a temporary Profile GDS file fileNameProfile <- file.path(tempDir, "GDS_TEMP_addStudy1Kg_Sample.gds") gdsProfile <- createfn.gds(fileNameProfile) study.list <- data.frame(study.id=c("HTT Study"), study.desc=c("Important Study"), study.platform=c("Panel"), stringsAsFactors=FALSE) add.gdsn(gdsProfile, "study.list", study.list) study.annot <- data.frame(data.id=c("TOTO1"), case.id=c("TOTO1"), sample.type=c("Study"), diagnosis=c("Study"), source=rep("IGSR"), study.id=c("Study"), stringsAsFactors=FALSE) add.gdsn(gdsProfile, "study.annot", study.annot) sync.gds(gdsProfile) closefn.gds(gdsProfile) ## Append information about the 1KG samples into the Profile GDS file ## The Profile GDS file will contain 'study.list' and 'study.annot' entries addStudy1Kg(gdsReference=gds1KG, fileProfileGDS=fileNameProfile, verbose=TRUE) closefn.gds(gds1KG) unlink(fileNameProfile, recursive=TRUE, force=TRUE) unlink(fileName1KG, recursive=TRUE, force=TRUE) unlink(tempDir)
## Required library for GDS library(gdsfmt) ## Get the temp folder tempDir <- tempdir() ## Create a temporary 1KG GDS file and add needed information fileName1KG <- file.path(tempDir, "GDS_TEMP_addStudy1Kg_1KG.gds") gds1KG <- createfn.gds(filename=fileName1KG) add.gdsn(gds1KG, "sample.id", c("HTT101", "HTT102", "HTT103")) samples <- data.frame(sex=c(1, 1, 2), pop.group=c("GBR", "GIH", "GBR"), superPop=c("EUR", "SAS", "EUR"), batch=rep(0, 3), stringsAsFactors = FALSE) add.gdsn(gds1KG, "sample.annot", samples) add.gdsn(gds1KG, "sample.ref", c(1,0, 1)) sync.gds(gds1KG) ## Create a temporary Profile GDS file fileNameProfile <- file.path(tempDir, "GDS_TEMP_addStudy1Kg_Sample.gds") gdsProfile <- createfn.gds(fileNameProfile) study.list <- data.frame(study.id=c("HTT Study"), study.desc=c("Important Study"), study.platform=c("Panel"), stringsAsFactors=FALSE) add.gdsn(gdsProfile, "study.list", study.list) study.annot <- data.frame(data.id=c("TOTO1"), case.id=c("TOTO1"), sample.type=c("Study"), diagnosis=c("Study"), source=rep("IGSR"), study.id=c("Study"), stringsAsFactors=FALSE) add.gdsn(gdsProfile, "study.annot", study.annot) sync.gds(gdsProfile) closefn.gds(gdsProfile) ## Append information about the 1KG samples into the Profile GDS file ## The Profile GDS file will contain 'study.list' and 'study.annot' entries addStudy1Kg(gdsReference=gds1KG, fileProfileGDS=fileNameProfile, verbose=TRUE) closefn.gds(gds1KG) unlink(fileNameProfile, recursive=TRUE, force=TRUE) unlink(fileName1KG, recursive=TRUE, force=TRUE) unlink(tempDir)
The function selects the optimal K and D parameters for a specific profile. The results on the synthetic data are used for the parameter selection. Once the optimal parameters are selected, the ancestry is inferred for the specific profile.
computeAncestryFromSynthetic( gdsReference, gdsProfile, syntheticKNN, pedSyn, currentProfile, spRef, studyIDSyn, np = 1L, listCatPop = c("EAS", "EUR", "AFR", "AMR", "SAS"), fieldPopIn1KG = "superPop", fieldPopInfAnc = "SuperPop", kList = seq(2, 15, 1), pcaList = seq(2, 15, 1), algorithm = c("exact", "randomized"), eigenCount = 32L, missingRate = NaN, verbose = FALSE )
computeAncestryFromSynthetic( gdsReference, gdsProfile, syntheticKNN, pedSyn, currentProfile, spRef, studyIDSyn, np = 1L, listCatPop = c("EAS", "EUR", "AFR", "AMR", "SAS"), fieldPopIn1KG = "superPop", fieldPopInfAnc = "SuperPop", kList = seq(2, 15, 1), pcaList = seq(2, 15, 1), algorithm = c("exact", "randomized"), eigenCount = 32L, missingRate = NaN, verbose = FALSE )
gdsReference |
an object of class gds.class (a GDS file), the opened 1KG GDS file. |
gdsProfile |
an object of class |
syntheticKNN |
a |
pedSyn |
a |
currentProfile |
a |
spRef |
a |
studyIDSyn |
a |
np |
a single positive |
listCatPop |
a |
fieldPopIn1KG |
a |
fieldPopInfAnc |
a |
kList |
a |
pcaList |
a |
algorithm |
a |
eigenCount |
a single |
missingRate |
a |
verbose |
a |
a list
containing 4 entries:
pcaSample
a list
containing the information related
to the eigenvectors. The list
contains those 3 entries:
sample.id
a character
string representing the unique
identifier of the current profile.
eigenvector.ref
a matrix
of numeric
containing
the eigenvectors for the reference profiles.
eigenvector
a matrix
of numeric
containing the
eigenvectors for the current profile projected on the PCA from the
reference profiles.
paraSample
a list
containing the results with
different D
and K
values that lead to optimal parameter
selection. The list
contains those entries:
dfPCA
a data.frame
containing statistical results
on all combined synthetic results done with a fixed value of D
(the
number of dimensions). The data.frame
contains those columns:
D
a numeric
representing the value of D
(the
number of dimensions).
median
a numeric
representing the median of the
minimum AUROC obtained (within super populations) for all combinations of
the fixed D
value and all tested K
values.
mad
a numeric
representing the MAD of the minimum
AUROC obtained (within super populations) for all combinations of the fixed
D
value and all tested K
values.
upQuartile
a numeric
representing the upper quartile
of the minimum AUROC obtained (within super populations) for all
combinations of the fixed D
value and all tested K
values.
k
a numeric
representing the optimal K
value
(the number of neighbors) for a fixed D
value.
dfPop
a data.frame
containing statistical results on
all combined synthetic results done with different values of D
(the
number of dimensions) and K
(the number of neighbors).
The data.frame
contains those columns:
D
a numeric
representing the value of D
(the
number of dimensions).
K
a numeric
representing the value of K
(the
number of neighbors).
AUROC.min
a numeric
representing the minimum accuracy
obtained by grouping all the synthetic results by super-populations, for
the specified values of D
and K
.
AUROC
a numeric
representing the accuracy obtained
by grouping all the synthetic results for the specified values of D
and K
.
Accu.CM
a numeric
representing the value of accuracy
of the confusion matrix obtained by grouping all the synthetic results for
the specified values of D
and K
.
dfAUROC
a data.frame
the summary of the results by
super-population. The data.frame
contains
those columns:
D
a numeric
representing the value of D
(the
number of dimensions).
K
a numeric
representing the value of K
(the
number of neighbors).
Call
a character
string representing the
super-population.
L
a numeric
representing the lower value of the 95%
confidence interval for the AUROC obtained for the fixed values of
super-population, D
and K
.
AUROC
a numeric
representing the AUROC obtained for the
fixed values of super-population, D
and K
.
H
a numeric
representing the higher value of the 95%
confidence interval for the AUROC obtained for the fixed values of
super-population, D
and K
.
D
a numeric
representing the optimal D
value
(the number of dimensions) for the specific profile.
K
a numeric
representing the optimal K
value
(the number of neighbors) for the specific profile.
listD
a numeric
representing the optimal D
values (the number of dimensions) for the specific profile. More than one
D
is possible.
KNNSample
a list
containing the inferred ancestry
using different D
and K
values. The list
contains
those entries:
sample.id
a character
string representing the unique
identifier of the current profile.
matKNN
a data.frame
containing the inferred ancestry
for different values of K
and D
. The data.frame
contains those columns:
sample.id
a character
string representing the unique
identifier of the current profile.
D
a numeric
representing the value of D
(the
number of dimensions) used to infer the ancestry.
K
a numeric
representing the value of K
(the
number of neighbors) used to infer the ancestry.
SuperPop
a character
string representing the inferred
ancestry for the specified D
and K
values.
Ancestry
a data.frame
containing the inferred
ancestry for the current profile. The data.frame
contains those
columns:
sample.id
a character
string representing the unique
identifier of the current profile.
D
a numeric
representing the value of D
(the
number of dimensions) used to infer the ancestry.
K
a numeric
representing the value of K
(the
number of neighbors) used to infer the ancestry.
SuperPop
a character
string representing the inferred
ancestry.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
## Required library library(gdsfmt) ## Load the known ancestry for the demo 1KG reference profiles data(demoKnownSuperPop1KG) ## The Reference GDS file path1KG <- system.file("extdata/tests", package="RAIDS") ## Open the Reference GDS file gdsRef <- snpgdsOpen(file.path(path1KG, "ex1_good_small_1KG.gds")) ## Path to the demo synthetic results files ## List of the KNN result files from PCA run on synthetic data dataDirRes <- system.file("extdata/demoAncestryCall/ex1", package="RAIDS") listFilesName <- dir(file.path(dataDirRes), ".rds") listFiles <- file.path(file.path(dataDirRes) , listFilesName) syntheticKNN <- lapply(listFiles, FUN=function(x){return(readRDS(x))}) syntheticKNN <- do.call(rbind, syntheticKNN) # The name of the synthetic study studyID <- "MYDATA.Synthetic" ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoAncestryCall", package="RAIDS") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) ## Not run: pedSyn <- RAIDS:::prepPedSynthetic1KG(gdsReference=gdsRef, gdsSample=gdsProfile, studyID=studyID, popName="superPop") ## Run the ancestry inference on one profile called 'ex1' ## The values of K and D used for the inference are selected using the ## synthetic results listFiles=listFiles, resCall <- RAIDS:::computeAncestryFromSynthetic(gdsReference=gdsRef, gdsProfile=gdsProfile, syntheticKNN = syntheticKNN, pedSyn = pedSyn, currentProfile=c("ex1"), spRef=demoKnownSuperPop1KG, studyIDSyn=studyID, np=1L) ## The ancestry called with the optimal D and K values resCall$Ancestry ## End(Not run) ## Close the GDS files (important) closefn.gds(gdsProfile) closefn.gds(gdsRef)
## Required library library(gdsfmt) ## Load the known ancestry for the demo 1KG reference profiles data(demoKnownSuperPop1KG) ## The Reference GDS file path1KG <- system.file("extdata/tests", package="RAIDS") ## Open the Reference GDS file gdsRef <- snpgdsOpen(file.path(path1KG, "ex1_good_small_1KG.gds")) ## Path to the demo synthetic results files ## List of the KNN result files from PCA run on synthetic data dataDirRes <- system.file("extdata/demoAncestryCall/ex1", package="RAIDS") listFilesName <- dir(file.path(dataDirRes), ".rds") listFiles <- file.path(file.path(dataDirRes) , listFilesName) syntheticKNN <- lapply(listFiles, FUN=function(x){return(readRDS(x))}) syntheticKNN <- do.call(rbind, syntheticKNN) # The name of the synthetic study studyID <- "MYDATA.Synthetic" ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoAncestryCall", package="RAIDS") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) ## Not run: pedSyn <- RAIDS:::prepPedSynthetic1KG(gdsReference=gdsRef, gdsSample=gdsProfile, studyID=studyID, popName="superPop") ## Run the ancestry inference on one profile called 'ex1' ## The values of K and D used for the inference are selected using the ## synthetic results listFiles=listFiles, resCall <- RAIDS:::computeAncestryFromSynthetic(gdsReference=gdsRef, gdsProfile=gdsProfile, syntheticKNN = syntheticKNN, pedSyn = pedSyn, currentProfile=c("ex1"), spRef=demoKnownSuperPop1KG, studyIDSyn=studyID, np=1L) ## The ancestry called with the optimal D and K values resCall$Ancestry ## End(Not run) ## Close the GDS files (important) closefn.gds(gdsProfile) closefn.gds(gdsRef)
The function selects the optimal K and D parameters for a specific profile. The results on the synthetic data are used for the parameter selection. Once the optimal parameters are selected, the ancestry is inferred for the specific profile.
computeAncestryFromSyntheticFile( gdsReference, gdsProfile, listFiles, currentProfile, spRef, studyIDSyn, np = 1L, listCatPop = c("EAS", "EUR", "AFR", "AMR", "SAS"), fieldPopIn1KG = "superPop", fieldPopInfAnc = "SuperPop", kList = seq(2, 15, 1), pcaList = seq(2, 15, 1), algorithm = c("exact", "randomized"), eigenCount = 32L, missingRate = NaN, verbose = FALSE )
computeAncestryFromSyntheticFile( gdsReference, gdsProfile, listFiles, currentProfile, spRef, studyIDSyn, np = 1L, listCatPop = c("EAS", "EUR", "AFR", "AMR", "SAS"), fieldPopIn1KG = "superPop", fieldPopInfAnc = "SuperPop", kList = seq(2, 15, 1), pcaList = seq(2, 15, 1), algorithm = c("exact", "randomized"), eigenCount = 32L, missingRate = NaN, verbose = FALSE )
gdsReference |
an object of class gds.class (a GDS file), the opened 1KG GDS file. |
gdsProfile |
an object of class |
listFiles |
a |
currentProfile |
a |
spRef |
a |
studyIDSyn |
a |
np |
a single positive |
listCatPop |
a |
fieldPopIn1KG |
a |
fieldPopInfAnc |
a |
kList |
a |
pcaList |
a |
algorithm |
a |
eigenCount |
a single |
missingRate |
a |
verbose |
a |
a list
containing 4 entries:
pcaSample
a list
containing the information related
to the eigenvectors. The list
contains those 3 entries:
sample.id
a character
string representing the unique
identifier of the current profile.
eigenvector.ref
a matrix
of numeric
containing
the eigenvectors for the reference profiles.
eigenvector
a matrix
of numeric
containing the
eigenvectors for the current profile projected on the PCA from the
reference profiles.
paraSample
a list
containing the results with
different D
and K
values that lead to optimal parameter
selection. The list
contains those entries:
dfPCA
a data.frame
containing statistical results
on all combined synthetic results done with a fixed value of D
(the
number of dimensions). The data.frame
contains those columns:
D
a numeric
representing the value of D
(the
number of dimensions).
median
a numeric
representing the median of the
minimum AUROC obtained (within super populations) for all combinations of
the fixed D
value and all tested K
values.
mad
a numeric
representing the MAD of the minimum
AUROC obtained (within super populations) for all combinations of the fixed
D
value and all tested K
values.
upQuartile
a numeric
representing the upper quartile
of the minimum AUROC obtained (within super populations) for all
combinations of the fixed D
value and all tested K
values.
k
a numeric
representing the optimal K
value
(the number of neighbors) for a fixed D
value.
dfPop
a data.frame
containing statistical results on
all combined synthetic results done with different values of D
(the
number of dimensions) and K
(the number of neighbors).
The data.frame
contains those columns:
D
a numeric
representing the value of D
(the
number of dimensions).
K
a numeric
representing the value of K
(the
number of neighbors).
AUROC.min
a numeric
representing the minimum accuracy
obtained by grouping all the synthetic results by super-populations, for
the specified values of D
and K
.
AUROC
a numeric
representing the accuracy obtained
by grouping all the synthetic results for the specified values of D
and K
.
Accu.CM
a numeric
representing the value of accuracy
of the confusion matrix obtained by grouping all the synthetic results for
the specified values of D
and K
.
dfAUROC
a data.frame
containing the summary of the results by
super-population. The data.frame
contains
those columns:
pcaD
a numeric
representing the value of D
(the
number of dimensions).
K
a numeric
representing the value of K
(the
number of neighbors).
Call
a character
string representing the
super-population.
L
a numeric
representing the lower value of the 95%
confidence interval for the AUROC obtained for the fixed values of
super-population, D
and K
.
AUR
a numeric
representing the AUROC obtained for the
fixed values of super-population, D
and K
.
H
a numeric
representing the higher value of the 95%
confidence interval for the AUROC obtained for the fixed values of
super-population, D
and K
.
D
a numeric
representing the optimal D
value
(the number of dimensions) for the specific profile.
K
a numeric
representing the optimal K
value
(the number of neighbors) for the specific profile.
listD
a numeric
representing the optimal D
values (the number of dimensions) for the specific profile. More than one
D
is possible.
KNNSample
a list
containing the inferred ancestry
using different D
and K
values. The list
contains
those entries:
sample.id
a character
string representing the unique
identifier of the current profile.
matKNN
a data.frame
containing the inferred ancestry
for different values of K
and D
. The data.frame
contains those columns:
sample.id
a character
string representing the unique
identifier of the current profile.
D
a numeric
representing the value of D
(the
number of dimensions) used to infer the ancestry.
K
a numeric
representing the value of K
(the
number of neighbors) used to infer the ancestry.
SuperPop
a character
string representing the inferred
ancestry for the specified D
and K
values.
Ancestry
a data.frame
containing the inferred
ancestry for the current profile. The data.frame
contains those
columns:
sample.id
a character
string representing the unique
identifier of the current profile.
D
a numeric
representing the value of D
(the
number of dimensions) used to infer the ancestry.
K
a numeric
representing the value of K
(the
number of neighbors) used to infer the ancestry.
SuperPop
a character
string representing the inferred
ancestry.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
## Required library library(gdsfmt) ## Load the known ancestry for the demo 1KG reference profiles data(demoKnownSuperPop1KG) ## The Reference GDS file path1KG <- system.file("extdata/tests", package="RAIDS") ## Open the Reference GDS file gdsRef <- snpgdsOpen(file.path(path1KG, "ex1_good_small_1KG.gds")) ## Path to the demo synthetic results files ## List of the KNN result files from PCA run on synthetic data dataDirRes <- system.file("extdata/demoAncestryCall/ex1", package="RAIDS") listFilesName <- dir(file.path(dataDirRes), ".rds") listFiles <- file.path(file.path(dataDirRes) , listFilesName) # The name of the synthetic study studyID <- "MYDATA.Synthetic" ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoAncestryCall", package="RAIDS") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) ## Run the ancestry inference on one profile called 'ex1' ## The values of K and D used for the inference are selected using the ## synthetic results resCall <- computeAncestryFromSyntheticFile(gdsReference=gdsRef, gdsProfile=gdsProfile, listFiles=listFiles, currentProfile=c("ex1"), spRef=demoKnownSuperPop1KG, studyIDSyn=studyID, np=1L) ## The ancestry called with the optimal D and K values resCall$Ancestry ## Close the GDS files (important) closefn.gds(gdsProfile) closefn.gds(gdsRef)
## Required library library(gdsfmt) ## Load the known ancestry for the demo 1KG reference profiles data(demoKnownSuperPop1KG) ## The Reference GDS file path1KG <- system.file("extdata/tests", package="RAIDS") ## Open the Reference GDS file gdsRef <- snpgdsOpen(file.path(path1KG, "ex1_good_small_1KG.gds")) ## Path to the demo synthetic results files ## List of the KNN result files from PCA run on synthetic data dataDirRes <- system.file("extdata/demoAncestryCall/ex1", package="RAIDS") listFilesName <- dir(file.path(dataDirRes), ".rds") listFiles <- file.path(file.path(dataDirRes) , listFilesName) # The name of the synthetic study studyID <- "MYDATA.Synthetic" ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoAncestryCall", package="RAIDS") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) ## Run the ancestry inference on one profile called 'ex1' ## The values of K and D used for the inference are selected using the ## synthetic results resCall <- computeAncestryFromSyntheticFile(gdsReference=gdsRef, gdsProfile=gdsProfile, listFiles=listFiles, currentProfile=c("ex1"), spRef=demoKnownSuperPop1KG, studyIDSyn=studyID, np=1L) ## The ancestry called with the optimal D and K values resCall$Ancestry ## Close the GDS files (important) closefn.gds(gdsProfile) closefn.gds(gdsRef)
The function runs k-nearest neighbors analysis on one specific profile. The function uses the 'knn' package.
computeKNNRefSample( listEigenvector, listCatPop = c("EAS", "EUR", "AFR", "AMR", "SAS"), spRef, fieldPopInfAnc = "SuperPop", kList = seq(2, 15, 1), pcaList = seq(2, 15, 1) )
computeKNNRefSample( listEigenvector, listCatPop = c("EAS", "EUR", "AFR", "AMR", "SAS"), spRef, fieldPopInfAnc = "SuperPop", kList = seq(2, 15, 1), pcaList = seq(2, 15, 1) )
listEigenvector |
a |
listCatPop |
a |
spRef |
|
fieldPopInfAnc |
a |
kList |
a |
pcaList |
a |
a list
containing 2 entries:
sample.id
a vector
of character
strings
representing the identifier of the profile analysed.
matKNN
a data.frame
containing the super population
inference for the profile for different values of PCA
dimensions D
and k-neighbors values K
. The fourth column title
corresponds to the fieldPopInfAnc
parameter.
The data.frame
contains 4 columns:
sample.id
a character
string representing
the identifier of the profile analysed.
D
a numeric
representing
the value of the PCA dimension used to infer the ancestry.
K
a numeric
representing
the value of the k-neighbors used to infer the ancestry.
fieldPopInfAnc
a character
string representing
the inferred ancestry.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Load the demo PCA on the synthetic profiles projected on the ## demo 1KG reference PCA data(demoPCASyntheticProfiles) ## Load the known ancestry for the demo 1KG reference profiles data(demoKnownSuperPop1KG) ## The PCA with 1 profile projected on the 1KG reference PCA ## Only one profile is retained pca <- demoPCASyntheticProfiles pca$sample.id <- pca$sample.id[1] pca$eigenvector <- pca$eigenvector[1, , drop=FALSE] ## Projects profile on 1KG PCA results <- computeKNNRefSample(listEigenvector=pca, listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), spRef=demoKnownSuperPop1KG, fieldPopInfAnc="SuperPop", kList=seq(10, 15, 1), pcaList=seq(10, 15, 1)) ## The assigned ancestry to the profile for different values of K and D head(results$matKNN)
## Load the demo PCA on the synthetic profiles projected on the ## demo 1KG reference PCA data(demoPCASyntheticProfiles) ## Load the known ancestry for the demo 1KG reference profiles data(demoKnownSuperPop1KG) ## The PCA with 1 profile projected on the 1KG reference PCA ## Only one profile is retained pca <- demoPCASyntheticProfiles pca$sample.id <- pca$sample.id[1] pca$eigenvector <- pca$eigenvector[1, , drop=FALSE] ## Projects profile on 1KG PCA results <- computeKNNRefSample(listEigenvector=pca, listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), spRef=demoKnownSuperPop1KG, fieldPopInfAnc="SuperPop", kList=seq(10, 15, 1), pcaList=seq(10, 15, 1)) ## The assigned ancestry to the profile for different values of K and D head(results$matKNN)
The function runs k-nearest neighbors analysis on a subset of the synthetic data set. The function uses the 'knn' package.
computeKNNRefSynthetic( gdsProfile, listEigenvector, listCatPop = c("EAS", "EUR", "AFR", "AMR", "SAS"), studyIDSyn, spRef, fieldPopInfAnc = "SuperPop", kList = seq(2, 15, 1), pcaList = seq(2, 15, 1) )
computeKNNRefSynthetic( gdsProfile, listEigenvector, listCatPop = c("EAS", "EUR", "AFR", "AMR", "SAS"), studyIDSyn, spRef, fieldPopInfAnc = "SuperPop", kList = seq(2, 15, 1), pcaList = seq(2, 15, 1) )
gdsProfile |
an object of class
|
listEigenvector |
a |
listCatPop |
a |
studyIDSyn |
a |
spRef |
|
fieldPopInfAnc |
a |
kList |
a |
pcaList |
a |
a list
containing 4 entries:
sample.id
a vector
of character
strings
representing the identifiers of the synthetic profiles analysed.
sample1Kg
a vector
of character
strings
representing the identifiers of the 1KG reference profiles used to
generate the synthetic profiles.
sp
a vector
of character
strings representing
the known super population ancestry of the 1KG reference profiles used
to generate the synthetic profiles.
matKNN
a data.frame
containing the super population
inference for each synthetic profile for different values of PCA
dimensions D
and k-neighbors values K
. The fourth column title
corresponds to the fieldPopInfAnc
parameter.
The data.frame
contains 4 columns:
sample.id
a character
string representing
the identifier of the synthetic profile analysed.
D
a numeric
representing
the value of the PCA dimension used to infer the super population.
K
a numeric
representing
the value of the k-neighbors used to infer the super population.
fieldPopInfAnc
a character
string representing
the inferred ancestry.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Required library library(gdsfmt) ## Load the demo PCA on the synthetic profiles projected on the ## demo 1KG reference PCA data(demoPCASyntheticProfiles) ## Load the known ancestry for the demo 1KG reference profiles data(demoKnownSuperPop1KG) ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) # The name of the synthetic study studyID <- "MYDATA.Synthetic" ## Projects synthetic profiles on 1KG PCA results <- computeKNNRefSynthetic(gdsProfile=gdsProfile, listEigenvector=demoPCASyntheticProfiles, listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), studyIDSyn=studyID, spRef=demoKnownSuperPop1KG) ## The inferred ancestry for the synthetic profiles for different values ## of D and K head(results$matKNN) ## Close Profile GDS file (important) closefn.gds(gdsProfile)
## Required library library(gdsfmt) ## Load the demo PCA on the synthetic profiles projected on the ## demo 1KG reference PCA data(demoPCASyntheticProfiles) ## Load the known ancestry for the demo 1KG reference profiles data(demoKnownSuperPop1KG) ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) # The name of the synthetic study studyID <- "MYDATA.Synthetic" ## Projects synthetic profiles on 1KG PCA results <- computeKNNRefSynthetic(gdsProfile=gdsProfile, listEigenvector=demoPCASyntheticProfiles, listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), studyIDSyn=studyID, spRef=demoKnownSuperPop1KG) ## The inferred ancestry for the synthetic profiles for different values ## of D and K head(results$matKNN) ## Close Profile GDS file (important) closefn.gds(gdsProfile)
The function projects the synthetic profiles onto existing principal component axes generated using the reference 1KG profiles. The reference profiles used to generate the synthetic profiles have previously been removed from the set of reference profiles.
computePCAMultiSynthetic( gdsProfile, listPCA, sampleRef, studyIDSyn, verbose = FALSE )
computePCAMultiSynthetic( gdsProfile, listPCA, sampleRef, studyIDSyn, verbose = FALSE )
gdsProfile |
an object of class gds.class (a GDS file), an opened Profile GDS file. |
listPCA |
a |
sampleRef |
a |
studyIDSyn |
a |
verbose |
a |
a list
containing 3 entries:
a vector
of character
strings representing
the identifiers of the synthetic profiles that have been projected onto
the 1KG PCA.
a matrix
of numeric
with the
eigenvectors of the 1KG reference profiles used to generate the PCA.
a matrix
of numeric
with the
eigenvectors of the synthetic profiles projected onto the 1KG PCA.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Required library library(gdsfmt) ## Loading demo PCA on subset of 1KG reference dataset data(demoPCA1KG) ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") # The name of the synthetic study studyID <- "MYDATA.Synthetic" samplesRM <- c("HG00246", "HG00325", "HG00611", "HG01173", "HG02165", "HG01112", "HG01615", "HG01968", "HG02658", "HG01850", "HG02013", "HG02465", "HG02974", "HG03814", "HG03445", "HG03689", "HG03789", "NA12751", "NA19107", "NA18548", "NA19075", "NA19475", "NA19712", "NA19731", "NA20528", "NA20908") names(samplesRM) <- c("GBR", "FIN", "CHS","PUR", "CDX", "CLM", "IBS", "PEL", "PJL", "KHV", "ACB", "GWD", "ESN", "BEB", "MSL", "STU", "ITU", "CEU", "YRI", "CHB", "JPT", "LWK", "ASW", "MXL", "TSI", "GIH") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) ## Projects synthetic profiles on 1KG PCA results <- computePCAMultiSynthetic(gdsProfile=gdsProfile, listPCA=demoPCA1KG, sampleRef=samplesRM, studyIDSyn=studyID, verbose=FALSE) ## The eigenvectors for the synthetic profiles head(results$eigenvector) ## Close Profile GDS file (important) closefn.gds(gdsProfile)
## Required library library(gdsfmt) ## Loading demo PCA on subset of 1KG reference dataset data(demoPCA1KG) ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") # The name of the synthetic study studyID <- "MYDATA.Synthetic" samplesRM <- c("HG00246", "HG00325", "HG00611", "HG01173", "HG02165", "HG01112", "HG01615", "HG01968", "HG02658", "HG01850", "HG02013", "HG02465", "HG02974", "HG03814", "HG03445", "HG03689", "HG03789", "NA12751", "NA19107", "NA18548", "NA19075", "NA19475", "NA19712", "NA19731", "NA20528", "NA20908") names(samplesRM) <- c("GBR", "FIN", "CHS","PUR", "CDX", "CLM", "IBS", "PEL", "PJL", "KHV", "ACB", "GWD", "ESN", "BEB", "MSL", "STU", "ITU", "CEU", "YRI", "CHB", "JPT", "LWK", "ASW", "MXL", "TSI", "GIH") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) ## Projects synthetic profiles on 1KG PCA results <- computePCAMultiSynthetic(gdsProfile=gdsProfile, listPCA=demoPCA1KG, sampleRef=samplesRM, studyIDSyn=studyID, verbose=FALSE) ## The eigenvectors for the synthetic profiles head(results$eigenvector) ## Close Profile GDS file (important) closefn.gds(gdsProfile)
This function generates a PCA using the known reference profiles. Then, it projects the specified profile onto the PCA axes.
computePCARefSample( gdsProfile, currentProfile, studyIDRef = "Ref.1KG", np = 1L, algorithm = c("exact", "randomized"), eigenCount = 32L, missingRate = NaN, verbose = FALSE )
computePCARefSample( gdsProfile, currentProfile, studyIDRef = "Ref.1KG", np = 1L, algorithm = c("exact", "randomized"), eigenCount = 32L, missingRate = NaN, verbose = FALSE )
gdsProfile |
an object of class gds.class, an opened Profile GDS file. |
currentProfile |
a single |
studyIDRef |
a single |
np |
a single positive |
algorithm |
a |
eigenCount |
a single |
missingRate |
a |
verbose |
a |
a list
containing 3 entries:
sample.id
a character
string representing the unique
identifier of the analyzed profile.
eigenvector.ref
a matrix
of numeric
representing the eigenvectors of the reference profiles.
eigenvector
a matrix
of numeric
representing
the eigenvectors of the analyzed profile.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
## Required library library(gdsfmt) ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoAncestryCall", package="RAIDS") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) ## Project a profile onto a PCA generated using reference profiles ## The reference profiles come from 1KG resPCA <- computePCARefSample(gdsProfile=gdsProfile, currentProfile=c("ex1"), studyIDRef="Ref.1KG", np=1L, verbose=FALSE) resPCA$sample.id resPCA$eigenvector ## Close the GDS files (important) closefn.gds(gdsProfile)
## Required library library(gdsfmt) ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoAncestryCall", package="RAIDS") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) ## Project a profile onto a PCA generated using reference profiles ## The reference profiles come from 1KG resPCA <- computePCARefSample(gdsProfile=gdsProfile, currentProfile=c("ex1"), studyIDRef="Ref.1KG", np=1L, verbose=FALSE) resPCA$sample.id resPCA$eigenvector ## Close the GDS files (important) closefn.gds(gdsProfile)
The function runs a PCA analysis using 1 synthetic profile from each sub-continental population. The reference profiles used to create those synthetic profiles are first removed from the list of 1KG reference profiles that generates the reference PCA. Then, the retained synthetic profiles are projected on the 1KG PCA space. Finally, a K-nearest neighbors analysis using a range of K and D values is done.
computePoolSyntheticAncestryGr( gdsProfile, sampleRM, spRef, studyIDSyn, np = 1L, listCatPop = c("EAS", "EUR", "AFR", "AMR", "SAS"), fieldPopInfAnc = "SuperPop", kList = seq(2, 15, 1), pcaList = seq(2, 15, 1), algorithm = c("exact", "randomized"), eigenCount = 32L, missingRate = 0.025, verbose = FALSE )
computePoolSyntheticAncestryGr( gdsProfile, sampleRM, spRef, studyIDSyn, np = 1L, listCatPop = c("EAS", "EUR", "AFR", "AMR", "SAS"), fieldPopInfAnc = "SuperPop", kList = seq(2, 15, 1), pcaList = seq(2, 15, 1), algorithm = c("exact", "randomized"), eigenCount = 32L, missingRate = 0.025, verbose = FALSE )
gdsProfile |
an object of class
|
sampleRM |
a |
spRef |
|
studyIDSyn |
a |
np |
a single positive |
listCatPop |
a |
fieldPopInfAnc |
a |
kList |
a |
pcaList |
a |
algorithm |
a |
eigenCount |
a single |
missingRate |
a |
verbose |
a |
a list
containing the following entries:
a vector
of character
strings representing
the identifiers of the synthetic profiles.
a vector
of character
strings representing
the identifiers of the reference 1KG profiles used to generate the
synthetic profiles.
a vector
of character
strings representing the
known ancestry for the reference 1KG profiles used to generate the
synthetic profiles.
a data.frame
containing 4 columns. The first column
'sample.id' contains the name of the synthetic profile. The second column
'D' represents the dimension D used to infer the ancestry. The third column
'K' represents the number of neighbors K used to infer the ancestry. The
fourth column 'SuperPop' contains the inferred ancestry.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
## Required library library(gdsfmt) ## Load the known ancestry for the demo 1KG reference profiles data(demoKnownSuperPop1KG) # The name of the synthetic study studyID <- "MYDATA.Synthetic" samplesRM <- c("HG00246", "HG00325", "HG00611", "HG01173", "HG02165", "HG01112", "HG01615", "HG01968", "HG02658", "HG01850", "HG02013", "HG02465", "HG02974", "HG03814", "HG03445", "HG03689", "HG03789", "NA12751", "NA19107", "NA18548", "NA19075", "NA19475", "NA19712", "NA19731", "NA20528", "NA20908") names(samplesRM) <- c("GBR", "FIN", "CHS","PUR", "CDX", "CLM", "IBS", "PEL", "PJL", "KHV", "ACB", "GWD", "ESN", "BEB", "MSL", "STU", "ITU", "CEU", "YRI", "CHB", "JPT", "LWK", "ASW", "MXL", "TSI", "GIH") ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) ## Run a PCA analysis and a K-nearest neighbors analysis on a small set ## of synthetic data results <- computePoolSyntheticAncestryGr(gdsProfile=gdsProfile, sampleRM=samplesRM, studyIDSyn=studyID, np=1L, spRef=demoKnownSuperPop1KG, kList=seq(10,15,1), pcaList=seq(10,15,1), eigenCount=15L) ## The ancestry inference for the synthetic data using ## different K and D values head(results$matKNN) ## Close Profile GDS file (important) closefn.gds(gdsProfile)
## Required library library(gdsfmt) ## Load the known ancestry for the demo 1KG reference profiles data(demoKnownSuperPop1KG) # The name of the synthetic study studyID <- "MYDATA.Synthetic" samplesRM <- c("HG00246", "HG00325", "HG00611", "HG01173", "HG02165", "HG01112", "HG01615", "HG01968", "HG02658", "HG01850", "HG02013", "HG02465", "HG02974", "HG03814", "HG03445", "HG03689", "HG03789", "NA12751", "NA19107", "NA18548", "NA19075", "NA19475", "NA19712", "NA19731", "NA20528", "NA20908") names(samplesRM) <- c("GBR", "FIN", "CHS","PUR", "CDX", "CLM", "IBS", "PEL", "PJL", "KHV", "ACB", "GWD", "ESN", "BEB", "MSL", "STU", "ITU", "CEU", "YRI", "CHB", "JPT", "LWK", "ASW", "MXL", "TSI", "GIH") ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) ## Run a PCA analysis and a K-nearest neighbors analysis on a small set ## of synthetic data results <- computePoolSyntheticAncestryGr(gdsProfile=gdsProfile, sampleRM=samplesRM, studyIDSyn=studyID, np=1L, spRef=demoKnownSuperPop1KG, kList=seq(10,15,1), pcaList=seq(10,15,1), eigenCount=15L) ## The ancestry inference for the synthetic data using ## different K and D values head(results$matKNN) ## Close Profile GDS file (important) closefn.gds(gdsProfile)
The function calculates the AUROC of the inferences for specific values of D and K using the inferred ancestry results from the synthetic profiles. The calculations are done on each super-population separately as well as on all the results together.
computeSyntheticROC( matKNN, matKNNAncestryColumn, pedCall, pedCallAncestryColumn, listCall = c("EAS", "EUR", "AFR", "AMR", "SAS") )
computeSyntheticROC( matKNN, matKNNAncestryColumn, pedCall, pedCallAncestryColumn, listCall = c("EAS", "EUR", "AFR", "AMR", "SAS") )
matKNN |
a |
matKNNAncestryColumn |
a |
pedCall |
a |
pedCallAncestryColumn |
a |
listCall |
a |
list
containing 3 entries:
matAUROC.All
a data.frame
containing the AUROC for all
the ancestry results.
matAUROC.Call
a data.frame
containing the AUROC
information for each super-population.
listROC.Call
a list
containing the output from the
roc
function for each super-population.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Loading demo dataset containing pedigree information for synthetic ## profiles and known ancestry of the profiles used to generate the ## synthetic profiles data(pedSynthetic) ## Loading demo dataset containing the inferred ancestry results ## for the synthetic data data(matKNNSynthetic) ## The inferred ancestry results for the synthetic data using ## values of D=5 and K=6 matKNN <- matKNNSynthetic[matKNNSynthetic$K == 6 & matKNNSynthetic$D == 5, ] ## Compile statistics from the ## synthetic profiles for fixed values of D and K results <- RAIDS:::computeSyntheticROC(matKNN=matKNN, matKNNAncestryColumn="SuperPop", pedCall=pedSynthetic, pedCallAncestryColumn="superPop", listCall=c("EAS", "EUR", "AFR", "AMR", "SAS")) results$matAUROC.All results$matAUROC.Call results$listROC.Call
## Loading demo dataset containing pedigree information for synthetic ## profiles and known ancestry of the profiles used to generate the ## synthetic profiles data(pedSynthetic) ## Loading demo dataset containing the inferred ancestry results ## for the synthetic data data(matKNNSynthetic) ## The inferred ancestry results for the synthetic data using ## values of D=5 and K=6 matKNN <- matKNNSynthetic[matKNNSynthetic$K == 6 & matKNNSynthetic$D == 5, ] ## Compile statistics from the ## synthetic profiles for fixed values of D and K results <- RAIDS:::computeSyntheticROC(matKNN=matKNN, matKNNAncestryColumn="SuperPop", pedCall=pedSynthetic, pedCallAncestryColumn="superPop", listCall=c("EAS", "EUR", "AFR", "AMR", "SAS")) results$matAUROC.All results$matAUROC.Call results$listROC.Call
This function extracts the required information from an output generated by RAIDS to create a graphic representation of the accuracy for different values of PCA dimensions and K-neighbors through all tested ancestries.
createAccuracyGraph( fileRDS, title = "", selectD = c(3, 7, 11), selectColor = c("#5e688a", "#cd5700", "#CC79A7") )
createAccuracyGraph( fileRDS, title = "", selectD = c(3, 7, 11), selectColor = c("#5e688a", "#cd5700", "#CC79A7") )
fileRDS |
a |
title |
a |
selectD |
a |
selectColor |
a |
a ggplot
object containing the graphic representation of the
accuracy for different values of PCA dimensions and K-neighbors through
all tested ancestries.
Astrid Deschênes and Pascal Belleau
## Required library library(ggplot2) ## Path to RDS file with ancestry information generated by RAIDS (demo file) dataDir <- system.file("extdata", package="RAIDS") fileRDS <- file.path(dataDir, "TEST_01.infoCall.RDS") ## Create accuracy graph accuracyGraph <- createAccuracyGraph(fileRDS=fileRDS, title="Test 01", selectD=c(3,6,9,12,15), selectColor=c("steelblue", "darkorange", "violet", "pink", "gray80")) accuracyGraph
## Required library library(ggplot2) ## Path to RDS file with ancestry information generated by RAIDS (demo file) dataDir <- system.file("extdata", package="RAIDS") fileRDS <- file.path(dataDir, "TEST_01.infoCall.RDS") ## Create accuracy graph accuracyGraph <- createAccuracyGraph(fileRDS=fileRDS, title="Test 01", selectD=c(3,6,9,12,15), selectColor=c("steelblue", "darkorange", "violet", "pink", "gray80")) accuracyGraph
This function extracts the required information from an output generated by RAIDS to create a graphic representation of the AUROC for different values of PCA dimensions and K-neighbors through all tested ancestries.
createAUROCGraph( dfAUROC, title = "", selectD = c(3, 7, 11), selectColor = c("#5e688a", "#cd5700", "#CC79A7") )
createAUROCGraph( dfAUROC, title = "", selectD = c(3, 7, 11), selectColor = c("#5e688a", "#cd5700", "#CC79A7") )
dfAUROC |
a |
title |
a |
selectD |
a |
selectColor |
a |
a ggplot
object containing the graphic representation of the
AUROC for different values of PCA dimensions and K-neighbors through
all tested ancestries.
Astrid Deschênes and Pascal Belleau
## Required library library(ggplot2) ## Path to RDS file with ancestry information generated by RAIDS (demo file) dataDir <- system.file("extdata", package="RAIDS") fileRDS <- file.path(dataDir, "TEST_01.infoCall.RDS") info <- readRDS(fileRDS) dfAUROC <- info$paraSample$dfAUROC colnames(dfAUROC) <- c("D", "K", "Call", "L", "AUROC", "H") ## Create accuracy graph accuracyGraph <- createAUROCGraph(dfAUROC=dfAUROC, title="Test 01", selectD=c(3,6,9,12,15), selectColor=c("steelblue", "darkorange", "violet", "pink", "gray80")) accuracyGraph
## Required library library(ggplot2) ## Path to RDS file with ancestry information generated by RAIDS (demo file) dataDir <- system.file("extdata", package="RAIDS") fileRDS <- file.path(dataDir, "TEST_01.infoCall.RDS") info <- readRDS(fileRDS) dfAUROC <- info$paraSample$dfAUROC colnames(dfAUROC) <- c("D", "K", "Call", "L", "AUROC", "H") ## Create accuracy graph accuracyGraph <- createAUROCGraph(dfAUROC=dfAUROC, title="Test 01", selectD=c(3,6,9,12,15), selectColor=c("steelblue", "darkorange", "violet", "pink", "gray80")) accuracyGraph
The function uses the information from the Reference GDS file
and the RDS Sample Description file to create the Profile GDS file. One
Profile GDS file is created per profile. One Profile GDS file will be
created for each entry present in the listProfiles
parameter.
createStudy2GDS1KG( pathGeno = file.path("data", "sampleGeno"), filePedRDS = NULL, pedStudy = NULL, fileNameGDS, batch = 1, studyDF, listProfiles = NULL, pathProfileGDS = NULL, genoSource = c("snp-pileup", "generic", "VCF"), verbose = FALSE )
createStudy2GDS1KG( pathGeno = file.path("data", "sampleGeno"), filePedRDS = NULL, pedStudy = NULL, fileNameGDS, batch = 1, studyDF, listProfiles = NULL, pathProfileGDS = NULL, genoSource = c("snp-pileup", "generic", "VCF"), verbose = FALSE )
pathGeno |
a |
filePedRDS |
a |
pedStudy |
a |
fileNameGDS |
a |
batch |
a single positive |
studyDF |
a |
listProfiles |
a |
pathProfileGDS |
a |
genoSource |
a |
verbose |
a |
The function returns 0L
when successful.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") ## The data.frame containing the information about the study ## The 3 mandatory columns: "study.id", "study.desc", "study.platform" ## The entries should be strings, not factors (stringsAsFactors=FALSE) studyDF <- data.frame(study.id = "MYDATA", study.desc = "Description", study.platform = "PLATFORM", stringsAsFactors = FALSE) ## The data.frame containing the information about the samples ## The entries should be strings, not factors (stringsAsFactors=FALSE) samplePED <- data.frame(Name.ID=c("ex1", "ex2"), Case.ID=c("Patient_h11", "Patient_h12"), Diagnosis=rep("Cancer", 2), Sample.Type=rep("Primary Tumor", 2), Source=rep("Databank B", 2), stringsAsFactors=FALSE) rownames(samplePED) <- samplePED$Name.ID ## Create the Profile GDS File for samples in 'listProfiles' vector ## (in this case, samples "ex1") ## The Profile GDS file is created in the pathProfileGDS directory result <- createStudy2GDS1KG(pathGeno=dataDir, pedStudy=samplePED, fileNameGDS=fileGDS, studyDF=studyDF, listProfiles=c("ex1"), pathProfileGDS=tempdir(), genoSource="snp-pileup", verbose=FALSE) ## The function returns 0L when successful result ## The Profile GDS file 'ex1.gds' has been created in the ## specified directory list.files(tempdir()) ## Remove Profile GDS file (created for demo purpose) unlink(file.path(tempdir(), "ex1.gds"), force=TRUE)
## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") ## The data.frame containing the information about the study ## The 3 mandatory columns: "study.id", "study.desc", "study.platform" ## The entries should be strings, not factors (stringsAsFactors=FALSE) studyDF <- data.frame(study.id = "MYDATA", study.desc = "Description", study.platform = "PLATFORM", stringsAsFactors = FALSE) ## The data.frame containing the information about the samples ## The entries should be strings, not factors (stringsAsFactors=FALSE) samplePED <- data.frame(Name.ID=c("ex1", "ex2"), Case.ID=c("Patient_h11", "Patient_h12"), Diagnosis=rep("Cancer", 2), Sample.Type=rep("Primary Tumor", 2), Source=rep("Databank B", 2), stringsAsFactors=FALSE) rownames(samplePED) <- samplePED$Name.ID ## Create the Profile GDS File for samples in 'listProfiles' vector ## (in this case, samples "ex1") ## The Profile GDS file is created in the pathProfileGDS directory result <- createStudy2GDS1KG(pathGeno=dataDir, pedStudy=samplePED, fileNameGDS=fileGDS, studyDF=studyDF, listProfiles=c("ex1"), pathProfileGDS=tempdir(), genoSource="snp-pileup", verbose=FALSE) ## The function returns 0L when successful result ## The Profile GDS file 'ex1.gds' has been created in the ## specified directory list.files(tempdir()) ## Remove Profile GDS file (created for demo purpose) unlink(file.path(tempdir(), "ex1.gds"), force=TRUE)
The object is a vector
.
data(demoKnownSuperPop1KG)
data(demoKnownSuperPop1KG)
The vector
containing the known super population ancestry
for the demo 1KG reference profiles.
This object can be
used to test the computeKNNRefSynthetic
and
computePoolSyntheticAncestryGr
functions.
The vector
containing the known super population ancestry
for the demo 1KG reference profiles.
computeKNNRefSynthetic
for running a k-nearest neighbors analysis on a subset of the synthetic data set.
computePoolSyntheticAncestryGr
for running a PCA analysis using 1 synthetic profile from each sub-continental population.
## Required library library(gdsfmt) ## Load the demo PCA on the synthetic profiles projected on the ## demo 1KG reference PCA data(demoPCASyntheticProfiles) ## Load the known ancestry for the demo 1KG reference profiles data(demoKnownSuperPop1KG) ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) # The name of the synthetic study studyID <- "MYDATA.Synthetic" ## Projects synthetic profiles on 1KG PCA results <- computeKNNRefSynthetic(gdsProfile=gdsProfile, listEigenvector=demoPCASyntheticProfiles, listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), studyIDSyn=studyID, spRef=demoKnownSuperPop1KG) ## The inferred ancestry for the synthetic profiles for different values ## of D and K head(results$matKNN) ## Close Profile GDS file (important) closefn.gds(gdsProfile)
## Required library library(gdsfmt) ## Load the demo PCA on the synthetic profiles projected on the ## demo 1KG reference PCA data(demoPCASyntheticProfiles) ## Load the known ancestry for the demo 1KG reference profiles data(demoKnownSuperPop1KG) ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) # The name of the synthetic study studyID <- "MYDATA.Synthetic" ## Projects synthetic profiles on 1KG PCA results <- computeKNNRefSynthetic(gdsProfile=gdsProfile, listEigenvector=demoPCASyntheticProfiles, listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), studyIDSyn=studyID, spRef=demoKnownSuperPop1KG) ## The inferred ancestry for the synthetic profiles for different values ## of D and K head(results$matKNN) ## Close Profile GDS file (important) closefn.gds(gdsProfile)
The object is a list
.
data(demoPCA1KG)
data(demoPCA1KG)
The list
containing the PCA results for a small subset of
the reference 1KG dataset. The list
contains 2 entries:
a vector
of SNV identifiers specifying selected SNVs
for the PCA analysis.
a snpgdsPCAClass
object containing the eigenvalues
as generated by snpgdsPCA function.
This object can be
used to test the computePCAMultiSynthetic
function.
The list
containing the PCA results for a small subset of
the reference 1KG dataset. The list
contains 2 entries:
a vector
of SNV identifiers specifying selected SNVs
for the PCA analysis.
a snpgdsPCAClass
object containing the eigenvalues
as generated by snpgdsPCA function.
## Required library library(gdsfmt) ## Loading demo PCA on subset of 1KG reference dataset data(demoPCA1KG) ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") # The name of the synthetic study studyID <- "MYDATA.Synthetic" samplesRM <- c("HG00246", "HG00325", "HG00611", "HG01173", "HG02165", "HG01112", "HG01615", "HG01968", "HG02658", "HG01850", "HG02013", "HG02465", "HG02974", "HG03814", "HG03445", "HG03689", "HG03789", "NA12751", "NA19107", "NA18548", "NA19075", "NA19475", "NA19712", "NA19731", "NA20528", "NA20908") names(samplesRM) <- c("GBR", "FIN", "CHS","PUR", "CDX", "CLM", "IBS", "PEL", "PJL", "KHV", "ACB", "GWD", "ESN", "BEB", "MSL", "STU", "ITU", "CEU", "YRI", "CHB", "JPT", "LWK", "ASW", "MXL", "TSI", "GIH") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) ## Projects synthetic profiles on demo 1KG PCA results <- computePCAMultiSynthetic(gdsProfile=gdsProfile, listPCA=demoPCA1KG, sampleRef=samplesRM, studyIDSyn=studyID, verbose=FALSE) ## The eigenvectors for the synthetic profiles head(results$eigenvector) ## Close Profile GDS file (important) closefn.gds(gdsProfile)
## Required library library(gdsfmt) ## Loading demo PCA on subset of 1KG reference dataset data(demoPCA1KG) ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") # The name of the synthetic study studyID <- "MYDATA.Synthetic" samplesRM <- c("HG00246", "HG00325", "HG00611", "HG01173", "HG02165", "HG01112", "HG01615", "HG01968", "HG02658", "HG01850", "HG02013", "HG02465", "HG02974", "HG03814", "HG03445", "HG03689", "HG03789", "NA12751", "NA19107", "NA18548", "NA19075", "NA19475", "NA19712", "NA19731", "NA20528", "NA20908") names(samplesRM) <- c("GBR", "FIN", "CHS","PUR", "CDX", "CLM", "IBS", "PEL", "PJL", "KHV", "ACB", "GWD", "ESN", "BEB", "MSL", "STU", "ITU", "CEU", "YRI", "CHB", "JPT", "LWK", "ASW", "MXL", "TSI", "GIH") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) ## Projects synthetic profiles on demo 1KG PCA results <- computePCAMultiSynthetic(gdsProfile=gdsProfile, listPCA=demoPCA1KG, sampleRef=samplesRM, studyIDSyn=studyID, verbose=FALSE) ## The eigenvectors for the synthetic profiles head(results$eigenvector) ## Close Profile GDS file (important) closefn.gds(gdsProfile)
The object is a list
.
data(demoPCASyntheticProfiles)
data(demoPCASyntheticProfiles)
The list
containing the PCA result of demo synthetic
profiles projected on the demo subset 1KG reference PCA.
The list
contains 3 entries:
a character
string representing the unique
identifier of the synthetic profiles.
a matrix
of numeric
containing
the eigenvectors for the reference profiles.
a matrix
of numeric
containing the
eigenvectors for the current synthetic profiles projected on the demo
PCA 1KG reference profiles.
This object can be
used to test the computeKNNRefSynthetic
function.
The list
containing the PCA result of demo synthetic
profiles projected on the demo subset 1KG reference PCA.
The list
contains 3 entries:
a character
string representing the unique
identifier of the synthetic profiles.
a matrix
of numeric
containing
the eigenvectors for the reference profiles.
a matrix
of numeric
containing the
eigenvectors for the current synthetic profiles projected on the demo
PCA 1KG reference profiles.
computeKNNRefSynthetic
for running a k-nearest neighbors analysis on a subset of the synthetic data set.
## Required library library(gdsfmt) ## Load the demo PCA on the synthetic profiles projected on the ## demo 1KG reference PCA data(demoPCASyntheticProfiles) ## Load the known ancestry for the demo 1KG reference profiles data(demoKnownSuperPop1KG) ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) # The name of the synthetic study studyID <- "MYDATA.Synthetic" ## Projects synthetic profiles on 1KG PCA results <- computeKNNRefSynthetic(gdsProfile=gdsProfile, listEigenvector=demoPCASyntheticProfiles, listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), studyIDSyn=studyID, spRef=demoKnownSuperPop1KG) ## The inferred ancestry for the synthetic profiles for different values ## of D and K head(results$matKNN) ## Close Profile GDS file (important) closefn.gds(gdsProfile)
## Required library library(gdsfmt) ## Load the demo PCA on the synthetic profiles projected on the ## demo 1KG reference PCA data(demoPCASyntheticProfiles) ## Load the known ancestry for the demo 1KG reference profiles data(demoKnownSuperPop1KG) ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) # The name of the synthetic study studyID <- "MYDATA.Synthetic" ## Projects synthetic profiles on 1KG PCA results <- computeKNNRefSynthetic(gdsProfile=gdsProfile, listEigenvector=demoPCASyntheticProfiles, listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), studyIDSyn=studyID, spRef=demoKnownSuperPop1KG) ## The inferred ancestry for the synthetic profiles for different values ## of D and K head(results$matKNN) ## Close Profile GDS file (important) closefn.gds(gdsProfile)
The object is a data.frame
.
data(demoPedigreeEx1)
data(demoPedigreeEx1)
The data.frame
containing the information about a demo
profile called 'ex1'. The data.frame
has 5 columns:
a character
string representing the unique
identifier of the profile.
a character
string representing the unique
identifier of the case associated to the profile.
a character
string describing the type of
profile.
a character
string describing the diagnosis of the
profile.
a character
string describing the source of the
profile.
This object can be
used to test the runExomeAncestry
function.
The data.frame
containing the information about a demo
profile called 'ex1'. The data.frame
has 5 columns:
a character
string representing the unique
identifier of the profile.
a character
string representing the unique
identifier of the case associated to the profile.
a character
string describing the type of
profile.
a character
string describing the diagnosis of the
profile.
a character
string describing the source of the
profile.
runExomeAncestry
for running most of the steps leading to the ancestry inference call on a specific exome profile.
## Required library for GDS library(SNPRelate) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata", package="RAIDS") ################################################################# ## Load the information about the profile ################################################################# data(demoPedigreeEx1) head(demoPedigreeEx1) ################################################################# ## The 1KG GDS file and the 1KG SNV Annotation GDS file ## need to be located in the same directory ## Note that the 1KG GDS file used for this example is a ## simplified version and CANNOT be used for any real analysis ################################################################# path1KG <- file.path(dataDir, "tests") fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") ################################################################# ## The Sample SNP pileup files (one per sample) need ## to be located in the same directory. ################################################################# pathGeno <- file.path(dataDir, "example", "snpPileup") ################################################################# ## The path where the Profile GDS Files (one per sample) ## will be created need to be specified. ################################################################# pathProfileGDS <- file.path(tempdir(), "out.tmp") pathOut <- file.path(tempdir(), "res.out") ################################################################# ## A data frame containing general information about the study ## is also required. 
The data frame must have ## those 3 columns: "study.id", "study.desc", "study.platform" ################################################################# studyDF <- data.frame(study.id="MYDATA", study.desc="Description", study.platform="PLATFORM", stringsAsFactors=FALSE) #################################################################### ## Fix seed to ensure reproducible results #################################################################### set.seed(2043) gds1KG <- snpgdsOpen(fileReferenceGDS) dataRef <- select1KGPop(gds1KG, nbProfiles=2L) closefn.gds(gds1KG) ## Required library for this example to run correctly if (requireNamespace("GenomeInfoDb", quietly=TRUE) && requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { ## Chromosome length information ## chr23 is chrX, chr24 is chrY and chrM is 25 chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] runExomeAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, pathProfileGDS=pathProfileGDS, pathGeno=pathGeno, pathOut=pathOut, fileReferenceGDS=fileReferenceGDS, fileReferenceAnnotGDS=fileAnnotGDS, chrInfo=chrInfo, syntheticRefDF=dataRef, genoSource="snp-pileup") unlink(pathProfileGDS, recursive=TRUE, force=TRUE) unlink(pathOut, recursive=TRUE, force=TRUE) }
## Required library for GDS library(SNPRelate) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata", package="RAIDS") ################################################################# ## Load the information about the profile ################################################################# data(demoPedigreeEx1) head(demoPedigreeEx1) ################################################################# ## The 1KG GDS file and the 1KG SNV Annotation GDS file ## need to be located in the same directory ## Note that the 1KG GDS file used for this example is a ## simplified version and CANNOT be used for any real analysis ################################################################# path1KG <- file.path(dataDir, "tests") fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") ################################################################# ## The Sample SNP pileup files (one per sample) need ## to be located in the same directory. ################################################################# pathGeno <- file.path(dataDir, "example", "snpPileup") ################################################################# ## The path where the Profile GDS Files (one per sample) ## will be created need to be specified. ################################################################# pathProfileGDS <- file.path(tempdir(), "out.tmp") pathOut <- file.path(tempdir(), "res.out") ################################################################# ## A data frame containing general information about the study ## is also required. 
The data frame must have ## those 3 columns: "study.id", "study.desc", "study.platform" ################################################################# studyDF <- data.frame(study.id="MYDATA", study.desc="Description", study.platform="PLATFORM", stringsAsFactors=FALSE) #################################################################### ## Fix seed to ensure reproducible results #################################################################### set.seed(2043) gds1KG <- snpgdsOpen(fileReferenceGDS) dataRef <- select1KGPop(gds1KG, nbProfiles=2L) closefn.gds(gds1KG) ## Required library for this example to run correctly if (requireNamespace("GenomeInfoDb", quietly=TRUE) && requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { ## Chromosome length information ## chr23 is chrX, chr24 is chrY and chrM is 25 chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] runExomeAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, pathProfileGDS=pathProfileGDS, pathGeno=pathGeno, pathOut=pathOut, fileReferenceGDS=fileReferenceGDS, fileReferenceAnnotGDS=fileAnnotGDS, chrInfo=chrInfo, syntheticRefDF=dataRef, genoSource="snp-pileup") unlink(pathProfileGDS, recursive=TRUE, force=TRUE) unlink(pathOut, recursive=TRUE, force=TRUE) }
The function estimates the allelic fraction of the SNVs for a specific profile and adds the information to the associated Profile GDS file. The allelic fraction estimation method is adapted to the type of study (DNA or RNA).
estimateAllelicFraction( gdsReference, gdsProfile, currentProfile, studyID, chrInfo, studyType = c("DNA", "RNA"), minCov = 10L, minProb = 0.999, eProb = 0.001, cutOffLOH = -5, cutOffHomoScore = -3, wAR = 9, cutOffAR = 3, gdsRefAnnot = NULL, blockID = NULL, verbose = FALSE )
estimateAllelicFraction( gdsReference, gdsProfile, currentProfile, studyID, chrInfo, studyType = c("DNA", "RNA"), minCov = 10L, minProb = 0.999, eProb = 0.001, cutOffLOH = -5, cutOffHomoScore = -3, wAR = 9, cutOffAR = 3, gdsRefAnnot = NULL, blockID = NULL, verbose = FALSE )
gdsReference |
an object of class |
gdsProfile |
an object of class |
currentProfile |
a |
studyID |
a |
chrInfo |
a |
studyType |
a |
minCov |
a single positive |
minProb |
a single |
eProb |
a single |
cutOffLOH |
a single |
cutOffHomoScore |
a single |
wAR |
a single positive |
cutOffAR |
a single |
gdsRefAnnot |
an object of class |
blockID |
a |
verbose |
a |
The chrInfo
parameter contains the length of the chromosomes. The
length of the chromosomes can be obtained through the
seqlengths
function.
As an example, for the hg38 genome:
if (requireNamespace("GenomeInfoDb", quietly=TRUE) && requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] }
The integer 0L
when successful.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Required library for GDS library(gdsfmt) ## Path to the demo 1KG GDS file located in this package dataDir <- system.file("extdata/tests", package="RAIDS") fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") ## Profile GDS file for one profile fileProfile <- file.path(tempdir(), "ex1.gds") ## Copy the Profile GDS file demo that has been pruned and annotated ## into current directory file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), fileProfile) ## Open the reference GDS file (demo version) gds1KG <- snpgdsOpen(fileGDS) ## Profile GDS file for one profile profileGDS <- openfn.gds(fileProfile, readonly=FALSE) ## Required library for this example to run correctly if (requireNamespace("GenomeInfoDb", quietly=TRUE) && requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { ## Chromosome length information ## chr23 is chrX, chr24 is chrY and chrM is 25 chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] ## Estimate the allelic fraction of the pruned SNVs estimateAllelicFraction(gdsReference=gds1KG, gdsProfile=profileGDS, currentProfile="ex1", studyID="MYDATA", chrInfo=chrInfo, studyType="DNA", minCov=10L, minProb=0.999, eProb=0.001, cutOffLOH=-5, cutOffHomoScore=-3, wAR=9, cutOffAR=3, gdsRefAnnot=NULL, blockID=NULL) ## The allelic fraction is saved in the 'lap' node of Profile GDS file ## The 'lap' entry should be present profileGDS ## Close both GDS files (important) closefn.gds(profileGDS) closefn.gds(gds1KG) ## Remove Profile GDS file (created for demo purpose) unlink(fileProfile, force=TRUE) }
## Required library for GDS library(gdsfmt) ## Path to the demo 1KG GDS file located in this package dataDir <- system.file("extdata/tests", package="RAIDS") fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") ## Profile GDS file for one profile fileProfile <- file.path(tempdir(), "ex1.gds") ## Copy the Profile GDS file demo that has been pruned and annotated ## into current directory file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), fileProfile) ## Open the reference GDS file (demo version) gds1KG <- snpgdsOpen(fileGDS) ## Profile GDS file for one profile profileGDS <- openfn.gds(fileProfile, readonly=FALSE) ## Required library for this example to run correctly if (requireNamespace("GenomeInfoDb", quietly=TRUE) && requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { ## Chromosome length information ## chr23 is chrX, chr24 is chrY and chrM is 25 chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] ## Estimate the allelic fraction of the pruned SNVs estimateAllelicFraction(gdsReference=gds1KG, gdsProfile=profileGDS, currentProfile="ex1", studyID="MYDATA", chrInfo=chrInfo, studyType="DNA", minCov=10L, minProb=0.999, eProb=0.001, cutOffLOH=-5, cutOffHomoScore=-3, wAR=9, cutOffAR=3, gdsRefAnnot=NULL, blockID=NULL) ## The allelic fraction is saved in the 'lap' node of Profile GDS file ## The 'lap' entry should be present profileGDS ## Close both GDS files (important) closefn.gds(profileGDS) closefn.gds(gds1KG) ## Remove Profile GDS file (created for demo purpose) unlink(fileProfile, force=TRUE) }
This function generates the GDS file that will contain the information from the Reference. The function also adds the samples information, the SNP information and the genotyping information into the GDS file.
generateGDS1KG( pathGeno = file.path("data", "sampleGeno"), filePedRDS, fileSNVIndex, fileSNVSelected, fileNameGDS, listSamples = NULL, verbose = FALSE )
generateGDS1KG( pathGeno = file.path("data", "sampleGeno"), filePedRDS, fileSNVIndex, fileSNVSelected, fileNameGDS, listSamples = NULL, verbose = FALSE )
pathGeno |
a |
filePedRDS |
a |
fileSNVIndex |
a |
fileSNVSelected |
a |
fileNameGDS |
a |
listSamples |
a |
verbose |
a |
More information about GDS file format can be found at the Bioconductor gdsfmt website: https://bioconductor.org/packages/gdsfmt/
The integer 0L
when successful.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") ## Path to the CSV genotype files pathGeno <- file.path(dataDir, "demoProfileGenotypes") ## The RDS file containing the pedigree information pedigreeFile <- file.path(dataDir, "PedigreeReferenceDemo.rds") ## The RDS file containing the indexes of the retained SNPs snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") ## The RDS file containing the filtered SNP information filterSNVFile <- file.path(dataDir, "mapSNVSelected_Demo.rds") ## Temporary Reference GDS file tempRefGDS <- file.path(tempdir(), "1KG_TEMP.gds") ## Create a temporary Reference GDS file generateGDS1KG(pathGeno=pathGeno, filePedRDS=pedigreeFile, fileSNVIndex=snpIndexFile, fileSNVSelected=filterSNVFile, fileNameGDS=tempRefGDS, listSamples=NULL) ## Remove temporary files unlink(tempRefGDS, force=TRUE)
## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") ## Path to the CSV genotype files pathGeno <- file.path(dataDir, "demoProfileGenotypes") ## The RDS file containing the pedigree information pedigreeFile <- file.path(dataDir, "PedigreeReferenceDemo.rds") ## The RDS file containing the indexes of the retained SNPs snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") ## The RDS file containing the filtered SNP information filterSNVFile <- file.path(dataDir, "mapSNVSelected_Demo.rds") ## Temporary Reference GDS file tempRefGDS <- file.path(tempdir(), "1KG_TEMP.gds") ## Create a temporary Reference GDS file generateGDS1KG(pathGeno=pathGeno, filePedRDS=pedigreeFile, fileSNVIndex=snpIndexFile, fileSNVSelected=filterSNVFile, fileNameGDS=tempRefGDS, listSamples=NULL) ## Remove temporary files unlink(tempRefGDS, force=TRUE)
The function applies a cut-off filter to the SNP information file to retain only the SNPs that have a frequency greater than or equal to the specified cut-off in at least one super population. The information about the retained SNPs is saved in an RDS format file. An RDS file containing the indexes of the retained SNPs is also created.
generateMapSnvSel(cutOff = 0.01, fileSNV, fileSNPsRDS, fileFREQ)
generateMapSnvSel(cutOff = 0.01, fileSNV, fileSNPsRDS, fileFREQ)
cutOff |
a single |
fileSNV |
a |
fileSNPsRDS |
a |
fileFREQ |
a |
The filtered SNP information RDS file (parameter fileFREQ
), contains
a data.frame
with those columns:
a character
string representing the chromosome where
the SNV is located.
a character
string representing the SNV position on the
chromosome.
a character
string representing the reference DNA base
for the SNV.
a character
string representing the alternative DNA base
for the SNV.
a character
string representing the allele frequency
of the EAS super population.
a character
string representing the allele frequency
of the AFR super population.
a character
string representing the allele frequency
of the AMR super population.
a character
string representing the allele frequency
of the SAS super population.
The integer 0
when successful.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") ## Demo SNV information file used as input snvFile <- file.path(dataDir, "matFreqSNV_Demo.txt.bz2") ## Temporary output files ## The first file contains the indexes of the retained SNPs ## The second file contains the filtered SNP information snpIndexFile <- file.path(tempdir(), "listSNP_TEMP.rds") filterSNVFile <- file.path(tempdir(), "mapSNVSel_TEMP.rds") ## Create a data.frame containing the information of the retained ## samples (samples with existing genotyping files) generateMapSnvSel(cutOff=0.01, fileSNV=snvFile, fileSNPsRDS=snpIndexFile, fileFREQ=filterSNVFile) ## Remove temporary files unlink(snpIndexFile, force=TRUE) unlink(filterSNVFile, force=TRUE)
## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") ## Demo SNV information file used as input snvFile <- file.path(dataDir, "matFreqSNV_Demo.txt.bz2") ## Temporary output files ## The first file contains the indexes of the retained SNPs ## The second file contains the filtered SNP information snpIndexFile <- file.path(tempdir(), "listSNP_TEMP.rds") filterSNVFile <- file.path(tempdir(), "mapSNVSel_TEMP.rds") ## Create a data.frame containing the information of the retained ## samples (samples with existing genotyping files) generateMapSnvSel(cutOff=0.01, fileSNV=snvFile, fileSNPsRDS=snpIndexFile, fileFREQ=filterSNVFile) ## Remove temporary files unlink(snpIndexFile, force=TRUE) unlink(filterSNVFile, force=TRUE)
The function adds the phase information into the Reference Phase GDS file. The phase information is extracted from a Reference GDS file and is added into a Reference Phase GDS file. An entry called 'phase' is added to the Reference Phase GDS file.
generatePhase1KG2GDS( gdsReference, gdsReferencePhase, pathGeno, fileSNVIndex, verbose = FALSE )
generatePhase1KG2GDS( gdsReference, gdsReferencePhase, pathGeno, fileSNVIndex, verbose = FALSE )
gdsReference |
an object of class gds.class (GDS file), an opened Reference GDS file. |
gdsReferencePhase |
an object of class gds.class (GDS file), an opened Reference Phase GDS file. |
pathGeno |
a |
fileSNVIndex |
a |
verbose |
a |
The function returns 0L
when successful.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Required package library(gdsfmt) ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") ## Path where the demo genotype CSV files are located pathGeno <- file.path(dataDir, "demoProfileGenotypes") ## The RDS file containing the pedigree information pedigreeFile <- file.path(dataDir, "PedigreeReferenceDemo.rds") ## The RDS file containing the indexes of the retained SNPs snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") ## The RDS file containing the filtered SNP information filterSNVFile <- file.path(dataDir, "mapSNVSelected_Demo.rds") ## Temporary Reference GDS file containing reference information fileReferenceGDS <- file.path(tempdir(), "1KG_TEMP_02.gds") ## Create a temporary Reference GDS file containing information from 1KG generateGDS1KG(pathGeno=pathGeno, filePedRDS=pedigreeFile, fileSNVIndex=snpIndexFile, fileSNVSelected=filterSNVFile, fileNameGDS=fileReferenceGDS, listSamples=NULL) ## Temporary Phase GDS file that will contain the 1KG Phase information fileRefPhaseGDS <- file.path(tempdir(), "1KG_TEMP_Phase_02.gds") ## Create Reference Phase GDS file gdsPhase <- createfn.gds(fileRefPhaseGDS) ## Open Reference GDS file gdsRef <- openfn.gds(fileReferenceGDS) ## Fill temporary Reference Phase GDS file if (FALSE) { generatePhase1KG2GDS(gdsReference=gdsRef, gdsReferencePhase=gdsPhase, pathGeno=pathGeno, fileSNVIndex=snpIndexFile, verbose=FALSE) } ## Close Reference Phase information file closefn.gds(gdsPhase) ## Close Reference information file closefn.gds(gdsRef) ## Remove temporary files unlink(fileReferenceGDS, force=TRUE) unlink(fileRefPhaseGDS, force=TRUE)
## Required package library(gdsfmt) ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") ## Path where the demo genotype CSV files are located pathGeno <- file.path(dataDir, "demoProfileGenotypes") ## The RDS file containing the pedigree information pedigreeFile <- file.path(dataDir, "PedigreeReferenceDemo.rds") ## The RDS file containing the indexes of the retained SNPs snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") ## The RDS file containing the filtered SNP information filterSNVFile <- file.path(dataDir, "mapSNVSelected_Demo.rds") ## Temporary Reference GDS file containing reference information fileReferenceGDS <- file.path(tempdir(), "1KG_TEMP_02.gds") ## Create a temporary Reference GDS file containing information from 1KG generateGDS1KG(pathGeno=pathGeno, filePedRDS=pedigreeFile, fileSNVIndex=snpIndexFile, fileSNVSelected=filterSNVFile, fileNameGDS=fileReferenceGDS, listSamples=NULL) ## Temporary Phase GDS file that will contain the 1KG Phase information fileRefPhaseGDS <- file.path(tempdir(), "1KG_TEMP_Phase_02.gds") ## Create Reference Phase GDS file gdsPhase <- createfn.gds(fileRefPhaseGDS) ## Open Reference GDS file gdsRef <- openfn.gds(fileReferenceGDS) ## Fill temporary Reference Phase GDS file if (FALSE) { generatePhase1KG2GDS(gdsReference=gdsRef, gdsReferencePhase=gdsPhase, pathGeno=pathGeno, fileSNVIndex=snpIndexFile, verbose=FALSE) } ## Close Reference Phase information file closefn.gds(gdsPhase) ## Close Reference information file closefn.gds(gdsRef) ## Remove temporary files unlink(fileReferenceGDS, force=TRUE) unlink(fileRefPhaseGDS, force=TRUE)
The function adds the phase information into the Reference Phase GDS file. The phase information is extracted from a Reference GDS file and is added into a Reference Phase GDS file. An entry called 'phase' is added to the Reference Phase GDS file.
generatePhaseRef( fileReferenceGDS, fileReferenceAnnotGDS, pathGeno, fileSNVIndex, verbose = FALSE )
generatePhaseRef( fileReferenceGDS, fileReferenceAnnotGDS, pathGeno, fileSNVIndex, verbose = FALSE )
fileReferenceGDS |
a |
fileReferenceAnnotGDS |
a |
pathGeno |
a |
fileSNVIndex |
a |
verbose |
a |
The function returns 0L
when successful.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") ## Path where the demo genotype CSV files are located pathGeno <- file.path(dataDir, "demoProfileGenotypes") ## The RDS file containing the pedigree information pedigreeFile <- file.path(dataDir, "PedigreeReferenceDemo.rds") ## The RDS file containing the indexes of the retained SNPs snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") ## The RDS file containing the filtered SNP information filterSNVFile <- file.path(dataDir, "mapSNVSelected_Demo.rds") ## Temporary Reference GDS file containing reference information fileReferenceGDS <- file.path(tempdir(), "1KG_TEMP_02.gds") ## Create a temporary Reference GDS file containing information from 1KG generateGDS1KG(pathGeno=pathGeno, filePedRDS=pedigreeFile, fileSNVIndex=snpIndexFile, fileSNVSelected=filterSNVFile, fileNameGDS=fileReferenceGDS, listSamples=NULL) ## Temporary Phase GDS file that will contain the 1KG Phase information fileRefPhaseGDS <- file.path(tempdir(), "1KG_TEMP_Phase_02.gds") ## Fill temporary Reference Phase GDS file if (FALSE) { generatePhaseRef(fileReferenceGDS=fileReferenceGDS, fileReferenceAnnotGDS=fileRefPhaseGDS, pathGeno=pathGeno, fileSNVIndex=snpIndexFile, verbose=FALSE) } ## Remove temporary files unlink(fileReferenceGDS, force=TRUE) unlink(fileRefPhaseGDS, force=TRUE)
## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") ## Path where the demo genotype CSV files are located pathGeno <- file.path(dataDir, "demoProfileGenotypes") ## The RDS file containing the pedigree information pedigreeFile <- file.path(dataDir, "PedigreeReferenceDemo.rds") ## The RDS file containing the indexes of the retained SNPs snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") ## The RDS file containing the filtered SNP information filterSNVFile <- file.path(dataDir, "mapSNVSelected_Demo.rds") ## Temporary Reference GDS file containing reference information fileReferenceGDS <- file.path(tempdir(), "1KG_TEMP_02.gds") ## Create a temporary Reference GDS file containing information from 1KG generateGDS1KG(pathGeno=pathGeno, filePedRDS=pedigreeFile, fileSNVIndex=snpIndexFile, fileSNVSelected=filterSNVFile, fileNameGDS=fileReferenceGDS, listSamples=NULL) ## Temporary Phase GDS file that will contain the 1KG Phase information fileRefPhaseGDS <- file.path(tempdir(), "1KG_TEMP_Phase_02.gds") ## Fill temporary Reference Phase GDS file if (FALSE) { generatePhaseRef(fileReferenceGDS=fileReferenceGDS, fileReferenceAnnotGDS=fileRefPhaseGDS, pathGeno=pathGeno, fileSNVIndex=snpIndexFile, verbose=FALSE) } ## Remove temporary files unlink(fileReferenceGDS, force=TRUE) unlink(fileRefPhaseGDS, force=TRUE)
The function extracts the specified column from the 'sample.ref'
node present in the Reference GDS file. The column must be present in the
data.frame
saved in the 'sample.ref' node. Only the information for
the reference profiles is returned. The values
represent the known ancestry assignation.
getRef1KGPop(gdsReference, popName = "superPop")
getRef1KGPop(gdsReference, popName = "superPop")
gdsReference |
an object of class gds.class (a GDS file), the opened Reference GDS file. |
popName |
a |
vector
of character
strings representing the content
of the extracted column for the 1KG GDS 'sample.ref' node. The values
represent the known ancestry assignation. The profile
identifiers are used as names for the vector
.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Required library library(gdsfmt) ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") ## Open existing demo 1K GDS file with "sample.ref" node nameFileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") fileGDS <- snpgdsOpen(nameFileGDS) ## Extract super population information for the 1KG profiles getRef1KGPop(gdsReference=fileGDS, popName="superPop") ## Close 1K GDS file closefn.gds(fileGDS)
## Required library library(gdsfmt) ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") ## Open existing demo 1K GDS file with "sample.ref" node nameFileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") fileGDS <- snpgdsOpen(nameFileGDS) ## Extract super population information for the 1KG profiles getRef1KGPop(gdsReference=fileGDS, popName="superPop") ## Close 1K GDS file closefn.gds(fileGDS)
The function extracts the specified column from the 'sample.ref'
node present in the Reference GDS file. The column must be present in the
data.frame
saved in the 'sample.ref' node. Only the information for
the reference profiles is returned. The values
represent the known ancestry assignation.
getRefSuperPop(fileReferenceGDS)
getRefSuperPop(fileReferenceGDS)
fileReferenceGDS |
a |
vector
of character
strings representing the content
of the extracted column for the 1KG GDS 'sample.ref' node. The values
represent the known ancestry assignation. The profile
identifiers are used as names for the vector
.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") ## Open existing demo 1K GDS file with "sample.ref" node nameFileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") ## Extract super population information for the 1KG profiles getRefSuperPop(fileReferenceGDS=nameFileGDS)
## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") ## Open existing demo 1K GDS file with "sample.ref" node nameFileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") ## Extract super population information for the 1KG profiles getRefSuperPop(fileReferenceGDS=nameFileGDS)
This function merges all the genotyping files associated with one specific sample into one file. That merged VCF file will be saved in a specified directory and will have the name of the sample. It will also be compressed (bzip). The function will merge the files for all samples present in the input directory.
groupChr1KGSNV(pathGenoChr, pathOut)
groupChr1KGSNV(pathGenoChr, pathOut)
pathGenoChr |
a |
pathOut |
a |
The integer 0L
when successful or FALSE
if not.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Path to the demo vcf files in this package dataDir <- system.file("extdata", package="RAIDS") pathGenoTar <- file.path(dataDir, "demoGenoChr", "demoGenoChr.tar") ## Path where the chromosomes files will be located pathGeno <- file.path(tempdir(), "tempGeno") dir.create(pathGeno, showWarnings=FALSE) ## Untar the file that contains the VCF files for 3 samples split by ## chromosome (one directory per chromosome) untar(tarfile=pathGenoTar, exdir=pathGeno) ## Path where the output VCF file will be created is ## the same where the split VCF are (pathGeno) ## The files must not exist if (!file.exists(file.path(pathGeno, "NA12003.csv.bz2")) && !file.exists(file.path(pathGeno, "NA12004.csv.bz2")) && !file.exists(file.path(pathGeno, "NA12005.csv.bz2"))) { ## Return 0 when successful ## The files "NA12003.csv.bz2", "NA12004.csv.bz2" and ## "NA12005.csv.bz2" should not be present in the current directory groupChr1KGSNV(pathGenoChr=pathGeno, pathOut=pathGeno) ## Validate that files have been created file.exists(file.path(pathGeno, "NA12003.csv.bz2")) file.exists(file.path(pathGeno, "NA12004.csv.bz2")) file.exists(file.path(pathGeno, "NA12005.csv.bz2")) } ## Remove temporary directory unlink(pathGeno, recursive=TRUE, force=TRUE)
## Path to the demo vcf files in this package dataDir <- system.file("extdata", package="RAIDS") pathGenoTar <- file.path(dataDir, "demoGenoChr", "demoGenoChr.tar") ## Path where the chromosomes files will be located pathGeno <- file.path(tempdir(), "tempGeno") dir.create(pathGeno, showWarnings=FALSE) ## Untar the file that contains the VCF files for 3 samples split by ## chromosome (one directory per chromosome) untar(tarfile=pathGenoTar, exdir=pathGeno) ## Path where the output VCF file will be created is ## the same where the split VCF are (pathGeno) ## The files must not exist if (!file.exists(file.path(pathGeno, "NA12003.csv.bz2")) && !file.exists(file.path(pathGeno, "NA12004.csv.bz2")) && !file.exists(file.path(pathGeno, "NA12005.csv.bz2"))) { ## Return 0 when successful ## The files "NA12003.csv.bz2", "NA12004.csv.bz2" and ## "NA12005.csv.bz2" should not be present in the current directory groupChr1KGSNV(pathGenoChr=pathGeno, pathOut=pathGeno) ## Validate that files have been created file.exists(file.path(pathGeno, "NA12003.csv.bz2")) file.exists(file.path(pathGeno, "NA12004.csv.bz2")) file.exists(file.path(pathGeno, "NA12005.csv.bz2")) } ## Remove temporary directory unlink(pathGeno, recursive=TRUE, force=TRUE)
The function identifies patients that are genetically related in the Reference file. It generates a first RDS file with the list of unrelated patients. It also generates a second RDS file with the kinship coefficient between the patients.
identifyRelative(gds, maf = 0.05, thresh = 2^(-11/2), fileIBD, filePart)
identifyRelative(gds, maf = 0.05, thresh = 2^(-11/2), fileIBD, filePart)
gds |
an object of class
|
maf |
a single |
thresh |
a single |
fileIBD |
a |
filePart |
a |
NULL
invisibly.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Required package library(gdsfmt) ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") ## Open existing demo Reference GDS file fileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") tmpGDS <- snpgdsOpen(fileGDS) ## Temporary output files ## The first RDS file will contain the list of unrelated patients ## The second RDS file will contain the kinship information between patients patientTmpFile <- "unrelatedPatients_TEMP.rds" ibdTmpFile <- "ibd_TEMP.rds" ## Different code depending of the withr package availability if (requireNamespace("withr", quietly=TRUE)) { ## Temporary output files ## The first RDS file will contain the list of unrelated patients ## The second RDS file will contain the kinship information ## between patients patientTmpFileLocal <- withr::local_file(patientTmpFile) ibdTmpFileLocal <- withr::local_file(ibdTmpFile) ## Identify unrelated patients in demo Reference GDS file identifyRelative(gds=tmpGDS, maf=0.05, thresh=2^(-11/2), fileIBD=ibdTmpFileLocal, filePart=patientTmpFileLocal) ## Close demo Reference GDS file closefn.gds(tmpGDS) ## Remove temporary files withr::deferred_run() } else { ## Identify unrelated patients in demo Reference GDS file identifyRelative(gds=tmpGDS, maf=0.05, thresh=2^(-11/2), fileIBD=ibdTmpFile, filePart=patientTmpFile) ## Close demo Reference GDS file closefn.gds(tmpGDS) ## Remove temporary files unlink(patientTmpFile, force=TRUE) unlink(ibdTmpFile, force=TRUE) }
## Required package library(gdsfmt) ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") ## Open existing demo Reference GDS file fileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") tmpGDS <- snpgdsOpen(fileGDS) ## Temporary output files ## The first RDS file will contain the list of unrelated patients ## The second RDS file will contain the kinship information between patients patientTmpFile <- "unrelatedPatients_TEMP.rds" ibdTmpFile <- "ibd_TEMP.rds" ## Different code depending of the withr package availability if (requireNamespace("withr", quietly=TRUE)) { ## Temporary output files ## The first RDS file will contain the list of unrelated patients ## The second RDS file will contain the kinship information ## between patients patientTmpFileLocal <- withr::local_file(patientTmpFile) ibdTmpFileLocal <- withr::local_file(ibdTmpFile) ## Identify unrelated patients in demo Reference GDS file identifyRelative(gds=tmpGDS, maf=0.05, thresh=2^(-11/2), fileIBD=ibdTmpFileLocal, filePart=patientTmpFileLocal) ## Close demo Reference GDS file closefn.gds(tmpGDS) ## Remove temporary files withr::deferred_run() } else { ## Identify unrelated patients in demo Reference GDS file identifyRelative(gds=tmpGDS, maf=0.05, thresh=2^(-11/2), fileIBD=ibdTmpFile, filePart=patientTmpFile) ## Close demo Reference GDS file closefn.gds(tmpGDS) ## Remove temporary files unlink(patientTmpFile, force=TRUE) unlink(ibdTmpFile, force=TRUE) }
The function identifies patients that are genetically related in the Reference file. It generates a first RDS file with the list of unrelated patients. It also generates a second RDS file with the kinship coefficient between the patients.
identifyRelativeRef( fileReferenceGDS, maf = 0.05, thresh = 2^(-11/2), fileIBD, filePart )
identifyRelativeRef( fileReferenceGDS, maf = 0.05, thresh = 2^(-11/2), fileIBD, filePart )
fileReferenceGDS |
a |
maf |
a single |
thresh |
a single |
fileIBD |
a |
filePart |
a |
NULL
invisibly.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") ## Open existing demo Reference GDS file fileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") ## Temporary output files ## The first RDS file will contain the list of unrelated patients ## The second RDS file will contain the kinship information between patients patientTmpFile <- "unrelatedPatients_TEMP.rds" ibdTmpFile <- "ibd_TEMP.rds" ## Different code depending of the withr package availability if (requireNamespace("withr", quietly=TRUE)) { ## Temporary output files ## The first RDS file will contain the list of unrelated patients ## The second RDS file will contain the kinship information ## between patients patientTmpFileLocal <- withr::local_file(patientTmpFile) ibdTmpFileLocal <- withr::local_file(ibdTmpFile) ## Identify unrelated patients in demo Reference GDS file identifyRelativeRef(fileReferenceGDS=fileGDS, maf=0.05, thresh=2^(-11/2), fileIBD=ibdTmpFileLocal, filePart=patientTmpFileLocal) ## Remove temporary files withr::deferred_run() } else { ## Identify unrelated patients in demo Reference GDS file identifyRelativeRef(fileReferenceGDS=fileGDS, maf=0.05, thresh=2^(-11/2), fileIBD=ibdTmpFile, filePart=patientTmpFile) ## Remove temporary files unlink(patientTmpFile, force=TRUE) unlink(ibdTmpFile, force=TRUE) }
## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") ## Open existing demo Reference GDS file fileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") ## Temporary output files ## The first RDS file will contain the list of unrelated patients ## The second RDS file will contain the kinship information between patients patientTmpFile <- "unrelatedPatients_TEMP.rds" ibdTmpFile <- "ibd_TEMP.rds" ## Different code depending of the withr package availability if (requireNamespace("withr", quietly=TRUE)) { ## Temporary output files ## The first RDS file will contain the list of unrelated patients ## The second RDS file will contain the kinship information ## between patients patientTmpFileLocal <- withr::local_file(patientTmpFile) ibdTmpFileLocal <- withr::local_file(ibdTmpFile) ## Identify unrelated patients in demo Reference GDS file identifyRelativeRef(fileReferenceGDS=fileGDS, maf=0.05, thresh=2^(-11/2), fileIBD=ibdTmpFileLocal, filePart=patientTmpFileLocal) ## Remove temporary files withr::deferred_run() } else { ## Identify unrelated patients in demo Reference GDS file identifyRelativeRef(fileReferenceGDS=fileGDS, maf=0.05, thresh=2^(-11/2), fileIBD=ibdTmpFile, filePart=patientTmpFile) ## Remove temporary files unlink(patientTmpFile, force=TRUE) unlink(ibdTmpFile, force=TRUE) }
This function runs most steps leading to the ancestry inference call on a specific RNA profile. First, the function creates the Profile GDS file for the specific profile using the information from an RDS Sample description file and the Population Reference GDS file.
inferAncestry( profileFile, pathProfileGDS, fileReferenceGDS, fileReferenceAnnotGDS, chrInfo, syntheticRefDF, genoSource = c("snp-pileup", "generic", "VCF", "bam"), np = 1L, verbose = FALSE )
inferAncestry( profileFile, pathProfileGDS, fileReferenceGDS, fileReferenceAnnotGDS, chrInfo, syntheticRefDF, genoSource = c("snp-pileup", "generic", "VCF", "bam"), np = 1L, verbose = FALSE )
profileFile |
a |
pathProfileGDS |
a |
fileReferenceGDS |
a |
fileReferenceAnnotGDS |
a |
chrInfo |
a |
syntheticRefDF |
a
|
genoSource |
a |
np |
a single positive |
verbose |
a |
a list
containing 5 entries:
pcaSample
a list
containing the information related
to the eigenvectors. The list
contains those 3 entries:
sample.id
a character
string representing the unique
identifier of the current profile.
eigenvector.ref
a matrix
of numeric
containing
the eigenvectors for the reference profiles.
eigenvector
a matrix
of numeric
containing the
eigenvectors for the current profile projected on the PCA from the
reference profiles.
paraSample
a list
containing the results with
different D
and K
values that lead to optimal parameter
selection. The list
contains those entries:
dfPCA
a data.frame
containing statistical results
on all combined synthetic results done with a fixed value of D
(the
number of dimensions). The data.frame
contains those columns:
D
a numeric
representing the value of D
(the
number of dimensions).
median
a numeric
representing the median of the
minimum AUROC obtained (within super populations) for all combination of
the fixed D
value and all tested K
values.
mad
a numeric
representing the MAD of the minimum
AUROC obtained (within super populations) for all combination of the fixed
D
value and all tested K
values.
upQuartile
a numeric
representing the upper quartile
of the minimum AUROC obtained (within super populations) for all
combination of the fixed D
value and all tested K
values.
k
a numeric
representing the optimal K
value
(the number of neighbors) for a fixed D
value.
dfPop
a data.frame
containing statistical results on
all combined synthetic results done with different values of D
(the
number of dimensions) and K
(the number of neighbors).
The data.frame
contains those columns:
D
a numeric
representing the value of D
(the
number of dimensions).
K
a numeric
representing the value of K
(the
number of neighbors).
AUROC.min
a numeric
representing the minimum accuracy
obtained by grouping all the synthetic results by super-populations, for
the specified values of D
and K
.
AUROC
a numeric
representing the accuracy obtained
by grouping all the synthetic results for the specified values of D
and K
.
Accu.CM
a numeric
representing the value of accuracy
of the confusion matrix obtained by grouping all the synthetic results for
the specified values of D
and K
.
dfAUROC
a data.frame
the summary of the results by
super-population. The data.frame
contains
those columns:
D
a numeric
representing the value of D
(the
number of dimensions).
K
a numeric
representing the value of K
(the
number of neighbors).
Call
a character
string representing the
super-population.
L
a numeric
representing the lower value of the 95%
confidence interval for the AUROC obtained for the fixed values of
super-population, D
and K
.
AUROC
a numeric
representing the AUROC obtained for the
fixed values of super-population, D
and K
.
H
a numeric
representing the higher value of the 95%
confidence interval for the AUROC obtained for the fixed values of
super-population, D
and K
.
D
a numeric
representing the optimal D
value
(the number of dimensions) for the specific profile.
K
a numeric
representing the optimal K
value
(the number of neighbors) for the specific profile.
listD
a numeric
representing the optimal D
values (the number of dimensions) for the specific profile. More than one
D
is possible.
KNNSample
a data.frame
containing the inferred ancestry
for different values of K
and D
. The data.frame
contains those columns:
sample.id
a character
string representing the unique
identifier of the current profile.
D
a numeric
representing the value of D
(the
number of dimensions) used to infer the ancestry.
K
a numeric
representing the value of K
(the
number of neighbors) used to infer the ancestry.
SuperPop
a character
string representing the inferred
ancestry for the specified D
and K
values.
KNNSynthetic
a data.frame
containing the inferred ancestry
for each synthetic data for different values of K
and D
.
The data.frame
contains those columns: "sample.id", "D", "K", "infer.superPop", "ref.superPop"
sample.id
a character
string representing the unique
identifier of the current synthetic data.
D
a numeric
representing the value of D
(the
number of dimensions) used to infer the ancestry.
K
a numeric
representing the value of K
(the
number of neighbors) used to infer the ancestry.
infer.superPop
a character
string representing the inferred
ancestry for the specified D
and K
values.
ref.superPop
a character
string representing the known
ancestry from the reference
Ancestry
a data.frame
containing the inferred
ancestry for the current profile. The data.frame
contains those
columns:
sample.id
a character
string representing the unique
identifier of the current profile.
D
a numeric
representing the value of D
(the
number of dimensions) used to infer the ancestry.
K
a numeric
representing the value of K
(the
number of neighbors) used to infer the ancestry.
SuperPop
a character
string representing the inferred
ancestry.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
## Required library for GDS library(SNPRelate) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata", package="RAIDS") ################################################################# ## The 1KG GDS file and the 1KG SNV Annotation GDS file ## need to be located in the same directory ## Note that the 1KG GDS file used for this example is a ## simplified version and CANNOT be used for any real analysis ################################################################# path1KG <- file.path(dataDir, "tests") fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") ################################################################# ## The Sample SNP pileup files (one per sample) need ## to be located in the same directory. ################################################################# demoProfileEx1 <- file.path(dataDir, "example", "snpPileup", "ex1.txt.gz") ################################################################# ## The path where the Profile GDS Files (one per sample) ## will be created need to be specified. 
################################################################# pathProfileGDS <- file.path(tempdir(), "out.tmp") #################################################################### ## Fix seed to ensure reproducible results #################################################################### set.seed(3043) gds1KG <- snpgdsOpen(fileReferenceGDS) dataRef <- select1KGPop(gds1KG, nbProfiles=2L) closefn.gds(gds1KG) ## Required library for this example to run correctly if (requireNamespace("GenomeInfoDb", quietly=TRUE) && requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { ## Chromosome length information ## chr23 is chrX, chr24 is chrY and chrM is 25 chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] res <- inferAncestry(profileFile=demoProfileEx1, pathProfileGDS=pathProfileGDS, fileReferenceGDS=fileReferenceGDS, fileReferenceAnnotGDS=fileAnnotGDS, chrInfo=chrInfo, syntheticRefDF=dataRef, genoSource="snp-pileup") unlink(pathProfileGDS, recursive=TRUE, force=TRUE) }
## Required library for GDS library(SNPRelate) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata", package="RAIDS") ################################################################# ## The 1KG GDS file and the 1KG SNV Annotation GDS file ## need to be located in the same directory ## Note that the 1KG GDS file used for this example is a ## simplified version and CANNOT be used for any real analysis ################################################################# path1KG <- file.path(dataDir, "tests") fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") ################################################################# ## The Sample SNP pileup files (one per sample) need ## to be located in the same directory. ################################################################# demoProfileEx1 <- file.path(dataDir, "example", "snpPileup", "ex1.txt.gz") ################################################################# ## The path where the Profile GDS Files (one per sample) ## will be created need to be specified. 
################################################################# pathProfileGDS <- file.path(tempdir(), "out.tmp") #################################################################### ## Fix seed to ensure reproducible results #################################################################### set.seed(3043) gds1KG <- snpgdsOpen(fileReferenceGDS) dataRef <- select1KGPop(gds1KG, nbProfiles=2L) closefn.gds(gds1KG) ## Required library for this example to run correctly if (requireNamespace("GenomeInfoDb", quietly=TRUE) && requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { ## Chromosome length information ## chr23 is chrX, chr24 is chrY and chrM is 25 chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] res <- inferAncestry(profileFile=demoProfileEx1, pathProfileGDS=pathProfileGDS, fileReferenceGDS=fileReferenceGDS, fileReferenceAnnotGDS=fileAnnotGDS, chrInfo=chrInfo, syntheticRefDF=dataRef, genoSource="snp-pileup") unlink(pathProfileGDS, recursive=TRUE, force=TRUE) }
This function runs most steps leading to the ancestry inference call on a specific RNA profile. First, the function creates the Profile GDS file for the specific profile using the information from an RDS Sample description file and the Population Reference GDS file.
inferAncestryGeneAware( profileFile, pathProfileGDS, fileReferenceGDS, fileReferenceAnnotGDS, chrInfo, syntheticRefDF, genoSource = c("snp-pileup", "generic", "VCF", "bam"), np = 1L, blockTypeID, verbose = FALSE )
inferAncestryGeneAware( profileFile, pathProfileGDS, fileReferenceGDS, fileReferenceAnnotGDS, chrInfo, syntheticRefDF, genoSource = c("snp-pileup", "generic", "VCF", "bam"), np = 1L, blockTypeID, verbose = FALSE )
profileFile |
a |
pathProfileGDS |
a |
fileReferenceGDS |
a |
fileReferenceAnnotGDS |
a |
chrInfo |
a |
syntheticRefDF |
a
|
genoSource |
a |
np |
a single positive |
blockTypeID |
a |
verbose |
a |
The runExomeAncestry() function generates 3 types of files in the OUTPUT directory.
The ancestry inference CSV file (".Ancestry.csv" file)
The inference information RDS file (".infoCall.rds" file)
The parameter information RDS files from the synthetic inference ("KNN.synt.*.rds" files in a sub-directory)
In addition, a sub-directory (named using the profile ID) is also created.
a list
containing 4 entries:
pcaSample
a list
containing the information related
to the eigenvectors. The list
contains those 3 entries:
sample.id
a character
string representing the unique
identifier of the current profile.
eigenvector.ref
a matrix
of numeric
containing
the eigenvectors for the reference profiles.
eigenvector
a matrix
of numeric
containing the
eigenvectors for the current profile projected on the PCA from the
reference profiles.
paraSample
a list
containing the results with
different D
and K
values that lead to optimal parameter
selection. The list
contains those entries:
dfPCA
a data.frame
containing statistical results
on all combined synthetic results done with a fixed value of D
(the
number of dimensions). The data.frame
contains those columns:
D
a numeric
representing the value of D
(the
number of dimensions).
median
a numeric
representing the median of the
minimum AUROC obtained (within super populations) for all combinations of
the fixed D
value and all tested K
values.
mad
a numeric
representing the MAD of the minimum
AUROC obtained (within super populations) for all combinations of the fixed
D
value and all tested K
values.
upQuartile
a numeric
representing the upper quartile
of the minimum AUROC obtained (within super populations) for all
combinations of the fixed D
value and all tested K
values.
k
a numeric
representing the optimal K
value
(the number of neighbors) for a fixed D
value.
dfPop
a data.frame
containing statistical results on
all combined synthetic results done with different values of D
(the
number of dimensions) and K
(the number of neighbors).
The data.frame
contains those columns:
D
a numeric
representing the value of D
(the
number of dimensions).
K
a numeric
representing the value of K
(the
number of neighbors).
AUROC.min
a numeric
representing the minimum accuracy
obtained by grouping all the synthetic results by super-populations, for
the specified values of D
and K
.
AUROC
a numeric
representing the accuracy obtained
by grouping all the synthetic results for the specified values of D
and K
.
Accu.CM
a numeric
representing the value of accuracy
of the confusion matrix obtained by grouping all the synthetic results for
the specified values of D
and K
.
dfAUROC
a data.frame
the summary of the results by
super-population. The data.frame
contains
those columns:
D
a numeric
representing the value of D
(the
number of dimensions).
K
a numeric
representing the value of K
(the
number of neighbors).
Call
a character
string representing the
super-population.
L
a numeric
representing the lower value of the 95%
confidence interval for the AUROC obtained for the fixed values of
super-population, D
and K
.
AUROC
a numeric
representing the AUROC obtained for
the fixed values of super-population, D
and K
.
H
a numeric
representing the higher value of the 95%
confidence interval for the AUROC obtained for the fixed values of
super-population, D
and K
.
D
a numeric
representing the optimal D
value
(the number of dimensions) for the specific profile.
K
a numeric
representing the optimal K
value
(the number of neighbors) for the specific profile.
listD
a numeric
representing the optimal D
values (the number of dimensions) for the specific profile. More than one
D
is possible.
KNNSample
a data.frame
containing the inferred
ancestry for different values of K
and D
. The
data.frame
contains those columns:
sample.id
a character
string representing the unique
identifier of the current profile.
D
a numeric
representing the value of D
(the
number of dimensions) used to infer the ancestry.
K
a numeric
representing the value of K
(the
number of neighbors) used to infer the ancestry.
SuperPop
a character
string representing the inferred
ancestry for the specified D
and K
values.
KNNSynthetic
a data.frame
containing the inferred
ancestry for each synthetic data for different values of K
and
D
.
The data.frame
contains those columns:
sample.id
a character
string representing the unique
identifier of the current synthetic data.
D
a numeric
representing the value of D
(the
number of dimensions) used to infer the ancestry.
K
a numeric
representing the value of K
(the
number of neighbors) used to infer the ancestry.
infer.superPop
a character
string representing the
inferred ancestry for the specified D
and K
values.
ref.superPop
a character
string representing the known
ancestry from the reference.
Ancestry
a data.frame
containing the inferred
ancestry for the current profile. The data.frame
contains those
columns:
sample.id
a character
string representing the unique
identifier of the current profile.
D
a numeric
representing the value of D
(the
number of dimensions) used to infer the ancestry.
K
a numeric
representing the value of K
(the
number of neighbors) used to infer the ancestry.
SuperPop
a character
string representing the inferred
ancestry.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
## Required library for GDS library(SNPRelate) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata", package="RAIDS") ################################################################# ## The 1KG GDS file and the 1KG SNV Annotation GDS file ## need to be located in the same directory ## Note that the 1KG GDS file used for this example is a ## simplified version and CANNOT be used for any real analysis ################################################################# path1KG <- file.path(dataDir, "tests") fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") ################################################################# ## The Sample SNP pileup files (one per sample) need ## to be located in the same directory. ################################################################# demoProfileEx1 <- file.path(dataDir, "example", "snpPileup", "ex1.txt.gz") ################################################################# ## The path where the Profile GDS Files (one per sample) ## will be created need to be specified. 
################################################################# pathProfileGDS <- file.path(tempdir(), "out.tmp") #################################################################### ## Fix seed to ensure reproducible results #################################################################### set.seed(3043) gds1KG <- snpgdsOpen(fileReferenceGDS) dataRef <- select1KGPop(gds1KG, nbProfiles=2L) closefn.gds(gds1KG) ## Required library for this example to run correctly if (requireNamespace("GenomeInfoDb", quietly=TRUE) && requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { ## Chromosome length information ## chr23 is chrX, chr24 is chrY and chrM is 25 chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] res <- inferAncestryGeneAware(profileFile=demoProfileEx1, pathProfileGDS=pathProfileGDS, fileReferenceGDS=fileReferenceGDS, fileReferenceAnnotGDS=fileAnnotGDS, chrInfo=chrInfo, syntheticRefDF=dataRef, blockTypeID="GeneS.Ensembl.Hsapiens.v86", genoSource="snp-pileup") unlink(pathProfileGDS, recursive=TRUE, force=TRUE) }
## Required library for GDS library(SNPRelate) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata", package="RAIDS") ################################################################# ## The 1KG GDS file and the 1KG SNV Annotation GDS file ## need to be located in the same directory ## Note that the 1KG GDS file used for this example is a ## simplified version and CANNOT be used for any real analysis ################################################################# path1KG <- file.path(dataDir, "tests") fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") ################################################################# ## The Sample SNP pileup files (one per sample) need ## to be located in the same directory. ################################################################# demoProfileEx1 <- file.path(dataDir, "example", "snpPileup", "ex1.txt.gz") ################################################################# ## The path where the Profile GDS Files (one per sample) ## will be created need to be specified. 
################################################################# pathProfileGDS <- file.path(tempdir(), "out.tmp") #################################################################### ## Fix seed to ensure reproducible results #################################################################### set.seed(3043) gds1KG <- snpgdsOpen(fileReferenceGDS) dataRef <- select1KGPop(gds1KG, nbProfiles=2L) closefn.gds(gds1KG) ## Required library for this example to run correctly if (requireNamespace("GenomeInfoDb", quietly=TRUE) && requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { ## Chromosome length information ## chr23 is chrX, chr24 is chrY and chrM is 25 chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] res <- inferAncestryGeneAware(profileFile=demoProfileEx1, pathProfileGDS=pathProfileGDS, fileReferenceGDS=fileReferenceGDS, fileReferenceAnnotGDS=fileAnnotGDS, chrInfo=chrInfo, syntheticRefDF=dataRef, blockTypeID="GeneS.Ensembl.Hsapiens.v86", genoSource="snp-pileup") unlink(pathProfileGDS, recursive=TRUE, force=TRUE) }
data.frame
containing the
inferred ancestry on the synthetic profiles. The object is a data.frame
with 4 columns.
data(matKNNSynthetic)
data(matKNNSynthetic)
The data.frame
containing the information about the
synthetic profiles. The data.frame
contains 4 columns:
sample.id
a character
string representing the unique
synthetic profile identifier.
D
a numeric
representing the number of dimensions used
to infer the ancestry of the synthetic profile.
K
a numeric
representing the number of neighbors used
to infer the ancestry of the synthetic profile.
SuperPop
a character
string representing the
inferred ancestry of the synthetic profile for the specific D and K values.
This dataset can be
used to test the computeSyntheticROC
function.
The data.frame
containing the information about the
synthetic profiles. The data.frame
contains 4 columns:
sample.id
a character
string representing the unique
synthetic profile identifier.
D
a numeric
representing the number of dimensions used
to infer the ancestry of the synthetic profile.
K
a numeric
representing the number of neighbors used
to infer the ancestry of the synthetic profile.
SuperPop
a character
string representing the
inferred ancestry of the synthetic profile for the specific D and K values.
computeSyntheticROC
for calculating the AUROC of the inferences for specific values of D and K using the inferred ancestry results from the synthetic profiles
## Loading demo dataset containing pedigree information for synthetic
## profiles
data(pedSynthetic)

## Loading demo dataset containing the inferred ancestry results
## for the synthetic data
data(matKNNSynthetic)

## Retain one K and one D value
matKNN <- matKNNSynthetic[matKNNSynthetic$D == 5 & matKNNSynthetic$K == 4, ]

## Compile statistics from the
## synthetic profiles for fixed values of D and K
results <- RAIDS:::computeSyntheticROC(matKNN=matKNN,
    matKNNAncestryColumn="SuperPop", pedCall=pedSynthetic,
    pedCallAncestryColumn="superPop",
    listCall=c("EAS", "EUR", "AFR", "AMR", "SAS"))

results$matAUROC.All
results$matAUROC.Call
results$listROC.Call
## Loading demo dataset containing pedigree information for synthetic
## profiles
data(pedSynthetic)

## Loading demo dataset containing the inferred ancestry results
## for the synthetic data
data(matKNNSynthetic)

## Retain one K and one D value
matKNN <- matKNNSynthetic[matKNNSynthetic$D == 5 & matKNNSynthetic$K == 4, ]

## Compile statistics from the
## synthetic profiles for fixed values of D and K
results <- RAIDS:::computeSyntheticROC(matKNN=matKNN,
    matKNNAncestryColumn="SuperPop", pedCall=pedSynthetic,
    pedCallAncestryColumn="superPop",
    listCall=c("EAS", "EUR", "AFR", "AMR", "SAS"))

results$matAUROC.All
results$matAUROC.Call
results$listROC.Call
data.frame
containing the information related to
synthetic profiles. The ancestry of the profiles used to generate the
synthetic profiles must be present. The object is a data.frame
with 7 columns. The row names of
the data.frame
must be the profile unique identifiers.
data(pedSynthetic)
data(pedSynthetic)
The data.frame
containing the information about the
synthetic profiles. The row names of
the data.frame
correspond to the profile unique identifiers.
The data.frame
contains 7 columns:
data.id
a character
string representing the unique
synthetic profile identifier.
case.id
a character
string representing the unique
profile identifier that was used to generate the synthetic profile.
sample.type
a character
string representing the type
of profile.
diagnosis
a character
string representing the
diagnosis of the profile that was used to generate the synthetic profile.
source
a character
string representing the
source of the synthetic profile.
study.id
a character
string representing the
name of the study to which the synthetic profile is associated.
superPop
a character
string representing the
super population of the profile that was used to generate the synthetic
profile.
This dataset can be
used to test the computeSyntheticROC
function.
The data.frame
containing the information about the
synthetic profiles. The row names of
the data.frame
correspond to the profile unique identifiers.
The data.frame
contains 7 columns:
data.id
a character
string representing the unique
synthetic profile identifier.
case.id
a character
string representing the unique
profile identifier that was used to generate the synthetic profile.
sample.type
a character
string representing the type
of profile.
diagnosis
a character
string representing the
diagnosis of the profile that was used to generate the synthetic profile.
source
a character
string representing the
source of the synthetic profile.
study.id
a character
string representing the
name of the study to which the synthetic profile is associated.
superPop
a character
string representing the
super population of the profile that was used to generate the synthetic
profile.
computeSyntheticROC
for calculating the AUROC of the inferences for specific values of D and K using the inferred ancestry results from the synthetic profiles
## Loading demo dataset containing pedigree information for synthetic
## profiles
data(pedSynthetic)

## Loading demo dataset containing the inferred ancestry results
## for the synthetic data
data(matKNNSynthetic)

## Retain one K and one D value
matKNN <- matKNNSynthetic[matKNNSynthetic$D == 5 & matKNNSynthetic$K == 4, ]

## Compile statistics from the
## synthetic profiles for fixed values of D and K
results <- RAIDS:::computeSyntheticROC(matKNN=matKNN,
    matKNNAncestryColumn="SuperPop", pedCall=pedSynthetic,
    pedCallAncestryColumn="superPop",
    listCall=c("EAS", "EUR", "AFR", "AMR", "SAS"))

results$matAUROC.All
results$matAUROC.Call
results$listROC.Call
## Loading demo dataset containing pedigree information for synthetic
## profiles
data(pedSynthetic)

## Loading demo dataset containing the inferred ancestry results
## for the synthetic data
data(matKNNSynthetic)

## Retain one K and one D value
matKNN <- matKNNSynthetic[matKNNSynthetic$D == 5 & matKNNSynthetic$K == 4, ]

## Compile statistics from the
## synthetic profiles for fixed values of D and K
results <- RAIDS:::computeSyntheticROC(matKNN=matKNN,
    matKNNAncestryColumn="SuperPop", pedCall=pedSynthetic,
    pedCallAncestryColumn="superPop",
    listCall=c("EAS", "EUR", "AFR", "AMR", "SAS"))

results$matAUROC.All
results$matAUROC.Call
results$listROC.Call
Using the pedigree file from Reference, this function extracts
needed information and formats it into a data.frame
so it can
be used in following steps of the ancestry inference process. The
function also requires that the genotyping files associated to each
sample be available in a specified directory.
prepPed1KG(filePed, pathGeno = file.path("data", "sampleGeno"), batch = 0L)
prepPed1KG(filePed, pathGeno = file.path("data", "sampleGeno"), batch = 0L)
filePed |
a |
pathGeno |
a |
batch |
a |
a data.frame
containing the needed pedigree information
from Reference. The data.frame
contains those columns:
a character
string representing the profile unique
ID.
a character
string representing the profile name.
a character
string representing the sex of the profile.
a character
string representing the
sub-continental ancestry of the profile.
a character
string representing the continental
ancestry of the profile.
an integer
representing the batch of the profile.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Path to the demo pedigree file is located in this package
dataDir <- system.file("extdata", package="RAIDS")

## Path where the demo genotype CSV files are located
pathGeno <- file.path(dataDir, "demoProfileGenotypes")

## Demo pedigree file
pedDemoFile <- file.path(dataDir, "PedigreeDemo.ped")

## Create a data.frame containing the information of the retained
## samples (samples with existing genotyping files)
prepPed1KG(filePed=pedDemoFile, pathGeno=pathGeno, batch=0L)
## Path to the demo pedigree file is located in this package
dataDir <- system.file("extdata", package="RAIDS")

## Path where the demo genotype CSV files are located
pathGeno <- file.path(dataDir, "demoProfileGenotypes")

## Demo pedigree file
pedDemoFile <- file.path(dataDir, "PedigreeDemo.ped")

## Create a data.frame containing the information of the retained
## samples (samples with existing genotyping files)
prepPed1KG(filePed=pedDemoFile, pathGeno=pathGeno, batch=0L)
This function adds entries related to synthetic profiles into a Profile GDS file. The entries are related to two types of information: the synthetic study and the synthetic profiles.
The study information is appended to the Profile GDS file "study.list" node. The "study.platform" entry is always set to 'Synthetic'.
The profile information, for all selected synthetic profiles, is appended to the Profile GDS file "study.annot" node. Both the "Source" and the "Sample.Type" entries are always set to 'Synthetic'.
The synthetic profiles are assigned unique names by combining:
prefix
.data.id.profile
.listSampleRef
.simulation
number (1 to nbSim)
prepSynthetic( fileProfileGDS, listSampleRef, profileID, studyDF, nbSim = 1L, prefix = "", verbose = FALSE )
prepSynthetic( fileProfileGDS, listSampleRef, profileID, studyDF, nbSim = 1L, prefix = "", verbose = FALSE )
fileProfileGDS |
a |
listSampleRef |
a |
profileID |
a |
studyDF |
a |
nbSim |
a single positive |
prefix |
a single |
verbose |
a |
0L
when successful.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Required library library(gdsfmt) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") ## Temporary Profile GDS file fileNameGDS <- file.path(tempdir(), "ex1.gds") ## Copy the Profile GDS file demo that has been pruned and annotated file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), fileNameGDS) ## Information about the synthetic data set syntheticStudyDF <- data.frame(study.id="MYDATA.Synthetic", study.desc="MYDATA synthetic data", study.platform="PLATFORM", stringsAsFactors=FALSE) ## Add information related to the synthetic profiles into the Profile GDS prepSynthetic(fileProfileGDS=fileNameGDS, listSampleRef=c("HG00243", "HG00150"), profileID="ex1", studyDF=syntheticStudyDF, nbSim=1L, prefix="synthetic", verbose=FALSE) ## Open Profile GDS file profileGDS <- openfn.gds(fileNameGDS) ## The synthetic profiles should be added in the 'study.annot' entry tail(read.gdsn(index.gdsn(profileGDS, "study.annot"))) ## The synthetic study information should be added to ## the 'study.list' entry tail(read.gdsn(index.gdsn(profileGDS, "study.list"))) ## Close GDS file (important) closefn.gds(profileGDS) ## Remove Profile GDS file (created for demo purpose) unlink(fileNameGDS, force=TRUE)
## Required library library(gdsfmt) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") ## Temporary Profile GDS file fileNameGDS <- file.path(tempdir(), "ex1.gds") ## Copy the Profile GDS file demo that has been pruned and annotated file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), fileNameGDS) ## Information about the synthetic data set syntheticStudyDF <- data.frame(study.id="MYDATA.Synthetic", study.desc="MYDATA synthetic data", study.platform="PLATFORM", stringsAsFactors=FALSE) ## Add information related to the synthetic profiles into the Profile GDS prepSynthetic(fileProfileGDS=fileNameGDS, listSampleRef=c("HG00243", "HG00150"), profileID="ex1", studyDF=syntheticStudyDF, nbSim=1L, prefix="synthetic", verbose=FALSE) ## Open Profile GDS file profileGDS <- openfn.gds(fileNameGDS) ## The synthetic profiles should be added in the 'study.annot' entry tail(read.gdsn(index.gdsn(profileGDS, "study.annot"))) ## The synthetic study information should be added to ## the 'study.list' entry tail(read.gdsn(index.gdsn(profileGDS, "study.list"))) ## Close GDS file (important) closefn.gds(profileGDS) ## Remove Profile GDS file (created for demo purpose) unlink(fileNameGDS, force=TRUE)
This function computes the list of pruned SNVs for a
specific profile. When
a group of SNVs is in linkage disequilibrium, only one SNV from that group
is retained. The linkage disequilibrium is calculated with the
snpgdsLDpruning() function. The initial list of
SNVs that is passed to the snpgdsLDpruning()
function can be specified by the user.
pruningSample( gdsReference, method = c("corr", "r", "dprime", "composite"), currentProfile, studyID, listSNP = NULL, slideWindowMaxBP = 500000L, thresholdLD = sqrt(0.1), np = 1L, verbose = FALSE, chr = NULL, superPopMinAF = NULL, keepPrunedGDS = TRUE, pathProfileGDS = NULL, keepFile = FALSE, pathPrunedGDS = ".", outPrefix = "pruned" )
pruningSample( gdsReference, method = c("corr", "r", "dprime", "composite"), currentProfile, studyID, listSNP = NULL, slideWindowMaxBP = 500000L, thresholdLD = sqrt(0.1), np = 1L, verbose = FALSE, chr = NULL, superPopMinAF = NULL, keepPrunedGDS = TRUE, pathProfileGDS = NULL, keepFile = FALSE, pathPrunedGDS = ".", outPrefix = "pruned" )
gdsReference |
an object of class gds.class (a GDS file), the 1 KG GDS file (reference data set). |
method |
a |
currentProfile |
a |
studyID |
a |
listSNP |
a |
slideWindowMaxBP |
a single positive |
thresholdLD |
a single |
np |
a single positive |
verbose |
a |
chr |
a |
superPopMinAF |
a single positive |
keepPrunedGDS |
a |
pathProfileGDS |
a |
keepFile |
a |
pathPrunedGDS |
a |
outPrefix |
a |
The function returns 0L
when successful.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Required library for GDS library(gdsfmt) ## Path to the demo Reference GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") ## The data.frame containing the information about the study ## The 3 mandatory columns: "study.id", "study.desc", "study.platform" ## The entries should be strings, not factors (stringsAsFactors=FALSE) studyDF <- data.frame(study.id = "MYDATA", study.desc = "Description", study.platform = "PLATFORM", stringsAsFactors = FALSE) ## The data.frame containing the information about the samples ## The entries should be strings, not factors (stringsAsFactors=FALSE) samplePED <- data.frame(Name.ID = c("ex1", "ex2"), Case.ID = c("Patient_h11", "Patient_h12"), Diagnosis = rep("Cancer", 2), Sample.Type = rep("Primary Tumor", 2), Source = rep("Databank B", 2), stringsAsFactors = FALSE) rownames(samplePED) <- samplePED$Name.ID ## Temporary Profile GDS file profileFile <- file.path(tempdir(), "ex1.gds") ## Copy the Profile GDS file demo that has not been pruned yet file.copy(file.path(dataDir, "ex1_demo.gds"), profileFile) ## Open 1KG file gds1KG <- snpgdsOpen(fileGDS) ## Compute the list of pruned SNVs for a specific profile 'ex1' ## and save it in the Profile GDS file 'ex1.gds' pruningSample(gdsReference=gds1KG, currentProfile=c("ex1"), studyID = studyDF$study.id, pathProfileGDS=tempdir()) ## Close the Reference GDS file (important) closefn.gds(gds1KG) ## Check content of Profile GDS file ## The 'pruned.study' entry should be present content <- openfn.gds(profileFile) content ## Close the Profile GDS file (important) closefn.gds(content) ## Remove Profile GDS file (created for demo purpose) unlink(profileFile, force=TRUE)
## Required library for GDS library(gdsfmt) ## Path to the demo Reference GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") ## The data.frame containing the information about the study ## The 3 mandatory columns: "study.id", "study.desc", "study.platform" ## The entries should be strings, not factors (stringsAsFactors=FALSE) studyDF <- data.frame(study.id = "MYDATA", study.desc = "Description", study.platform = "PLATFORM", stringsAsFactors = FALSE) ## The data.frame containing the information about the samples ## The entries should be strings, not factors (stringsAsFactors=FALSE) samplePED <- data.frame(Name.ID = c("ex1", "ex2"), Case.ID = c("Patient_h11", "Patient_h12"), Diagnosis = rep("Cancer", 2), Sample.Type = rep("Primary Tumor", 2), Source = rep("Databank B", 2), stringsAsFactors = FALSE) rownames(samplePED) <- samplePED$Name.ID ## Temporary Profile GDS file profileFile <- file.path(tempdir(), "ex1.gds") ## Copy the Profile GDS file demo that has not been pruned yet file.copy(file.path(dataDir, "ex1_demo.gds"), profileFile) ## Open 1KG file gds1KG <- snpgdsOpen(fileGDS) ## Compute the list of pruned SNVs for a specific profile 'ex1' ## and save it in the Profile GDS file 'ex1.gds' pruningSample(gdsReference=gds1KG, currentProfile=c("ex1"), studyID = studyDF$study.id, pathProfileGDS=tempdir()) ## Close the Reference GDS file (important) closefn.gds(gds1KG) ## Check content of Profile GDS file ## The 'pruned.study' entry should be present content <- openfn.gds(profileFile) content ## Close the Profile GDS file (important) closefn.gds(content) ## Remove Profile GDS file (created for demo purpose) unlink(profileFile, force=TRUE)
This function runs most steps leading to the ancestry inference call on a specific exome profile. First, the function creates the Profile GDS file for the specific profile using the information from an RDS Sample description file and the Population reference GDS file.
runExomeAncestry( pedStudy, studyDF, pathProfileGDS, pathGeno, pathOut, fileReferenceGDS, fileReferenceAnnotGDS, chrInfo, syntheticRefDF, genoSource = c("snp-pileup", "generic", "VCF"), np = 1L, verbose = FALSE )
runExomeAncestry( pedStudy, studyDF, pathProfileGDS, pathGeno, pathOut, fileReferenceGDS, fileReferenceAnnotGDS, chrInfo, syntheticRefDF, genoSource = c("snp-pileup", "generic", "VCF"), np = 1L, verbose = FALSE )
pedStudy |
a |
studyDF |
a |
pathProfileGDS |
a |
pathGeno |
a |
pathOut |
a |
fileReferenceGDS |
a |
fileReferenceAnnotGDS |
a |
chrInfo |
a |
syntheticRefDF |
a
|
genoSource |
a |
np |
a single positive |
verbose |
a |
The runExomeAncestry() function generates 3 types of files in the OUTPUT directory.
The ancestry inference CSV file (".Ancestry.csv" file)
The inference information RDS file (".infoCall.rds" file)
The parameter information RDS files from the synthetic inference ("KNN.synt.*.rds" files in a sub-directory)
In addition, a sub-directory (named using the profile ID) is also created.
The integer 0L
when successful. See details section for
more information about the generated output files.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
## Required library for GDS library(SNPRelate) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata", package="RAIDS") ################################################################# ## Load the information about the profile ################################################################# data(demoPedigreeEx1) head(demoPedigreeEx1) ################################################################# ## The 1KG GDS file and the 1KG SNV Annotation GDS file ## need to be located in the same directory ## Note that the 1KG GDS file used for this example is a ## simplified version and CANNOT be used for any real analysis ################################################################# path1KG <- file.path(dataDir, "tests") fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") ################################################################# ## The Sample SNP pileup files (one per sample) need ## to be located in the same directory. ################################################################# pathGeno <- file.path(dataDir, "example", "snpPileup") ################################################################# ## The path where the Profile GDS Files (one per sample) ## will be created need to be specified. ################################################################# pathProfileGDS <- file.path(tempdir(), "out.tmp") pathOut <- file.path(tempdir(), "res.out") ################################################################# ## A data frame containing general information about the study ## is also required. 
The data frame must have ## those 3 columns: "study.id", "study.desc", "study.platform" ################################################################# studyDF <- data.frame(study.id="MYDATA", study.desc="Description", study.platform="PLATFORM", stringsAsFactors=FALSE) #################################################################### ## Fix seed to ensure reproducible results #################################################################### set.seed(3043) gds1KG <- snpgdsOpen(fileReferenceGDS) dataRef <- select1KGPop(gds1KG, nbProfiles=2L) closefn.gds(gds1KG) ## Required library for this example to run correctly if (requireNamespace("GenomeInfoDb", quietly=TRUE) && requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { ## Chromosome length information ## chr23 is chrX, chr24 is chrY and chrM is 25 chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] runExomeAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, pathProfileGDS=pathProfileGDS, pathGeno=pathGeno, pathOut=pathOut, fileReferenceGDS=fileReferenceGDS, fileReferenceAnnotGDS=fileAnnotGDS, chrInfo=chrInfo, syntheticRefDF=dataRef, genoSource="snp-pileup") unlink(pathProfileGDS, recursive=TRUE, force=TRUE) unlink(pathOut, recursive=TRUE, force=TRUE) }
## Required library for GDS library(SNPRelate) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata", package="RAIDS") ################################################################# ## Load the information about the profile ################################################################# data(demoPedigreeEx1) head(demoPedigreeEx1) ################################################################# ## The 1KG GDS file and the 1KG SNV Annotation GDS file ## need to be located in the same directory ## Note that the 1KG GDS file used for this example is a ## simplified version and CANNOT be used for any real analysis ################################################################# path1KG <- file.path(dataDir, "tests") fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") ################################################################# ## The Sample SNP pileup files (one per sample) need ## to be located in the same directory. ################################################################# pathGeno <- file.path(dataDir, "example", "snpPileup") ################################################################# ## The path where the Profile GDS Files (one per sample) ## will be created need to be specified. ################################################################# pathProfileGDS <- file.path(tempdir(), "out.tmp") pathOut <- file.path(tempdir(), "res.out") ################################################################# ## A data frame containing general information about the study ## is also required. 
The data frame must have ## those 3 columns: "study.id", "study.desc", "study.platform" ################################################################# studyDF <- data.frame(study.id="MYDATA", study.desc="Description", study.platform="PLATFORM", stringsAsFactors=FALSE) #################################################################### ## Fix seed to ensure reproducible results #################################################################### set.seed(3043) gds1KG <- snpgdsOpen(fileReferenceGDS) dataRef <- select1KGPop(gds1KG, nbProfiles=2L) closefn.gds(gds1KG) ## Required library for this example to run correctly if (requireNamespace("GenomeInfoDb", quietly=TRUE) && requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { ## Chromosome length information ## chr23 is chrX, chr24 is chrY and chrM is 25 chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] runExomeAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, pathProfileGDS=pathProfileGDS, pathGeno=pathGeno, pathOut=pathOut, fileReferenceGDS=fileReferenceGDS, fileReferenceAnnotGDS=fileAnnotGDS, chrInfo=chrInfo, syntheticRefDF=dataRef, genoSource="snp-pileup") unlink(pathProfileGDS, recursive=TRUE, force=TRUE) unlink(pathOut, recursive=TRUE, force=TRUE) }
This function runs most steps leading to the ancestry inference call on a specific RNA profile. First, the function creates the Profile GDS file for the specific profile using the information from an RDS Sample description file and the Population Reference GDS file.
runRNAAncestry( pedStudy, studyDF, pathProfileGDS, pathGeno, pathOut, fileReferenceGDS, fileReferenceAnnotGDS, chrInfo, syntheticRefDF, genoSource = c("snp-pileup", "generic", "VCF"), np = 1L, blockTypeID, verbose = FALSE )
runRNAAncestry( pedStudy, studyDF, pathProfileGDS, pathGeno, pathOut, fileReferenceGDS, fileReferenceAnnotGDS, chrInfo, syntheticRefDF, genoSource = c("snp-pileup", "generic", "VCF"), np = 1L, blockTypeID, verbose = FALSE )
pedStudy |
a |
studyDF |
a |
pathProfileGDS |
a |
pathGeno |
a |
pathOut |
a |
fileReferenceGDS |
a |
fileReferenceAnnotGDS |
a |
chrInfo |
a |
syntheticRefDF |
a
|
genoSource |
a |
np |
a single positive |
blockTypeID |
a |
verbose |
a |
The runRNAAncestry() function generates 3 types of files in the OUTPUT directory.
The ancestry inference CSV file (".Ancestry.csv" file)
The inference information RDS file (".infoCall.rds" file)
The parameter information RDS files from the synthetic inference ("KNN.synt.*.rds" files in a sub-directory)
In addition, a sub-directory (named using the profile ID) is also created.
The integer 0L
when successful. See details section for
more information about the generated output files.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
## Required library for GDS library(SNPRelate) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata", package="RAIDS") ################################################################# ## Load the information about the profile ################################################################# data(demoPedigreeEx1) head(demoPedigreeEx1) ################################################################# ## The 1KG GDS file and the 1KG SNV Annotation GDS file ## need to be located in the same directory ## Note that the 1KG GDS file used for this example is a ## simplified version and CANNOT be used for any real analysis ################################################################# path1KG <- file.path(dataDir, "tests") fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") ################################################################# ## The Sample SNP pileup files (one per sample) need ## to be located in the same directory. ################################################################# pathGeno <- file.path(dataDir, "example", "snpPileup") ################################################################# ## The path where the Profile GDS Files (one per sample) ## will be created need to be specified. ################################################################# pathProfileGDS <- file.path(tempdir(), "out.tmp") pathOut <- file.path(tempdir(), "res.out") ################################################################# ## A data frame containing general information about the study ## is also required. 
The data frame must have ## those 3 columns: "study.id", "study.desc", "study.platform" ################################################################# studyDF <- data.frame(study.id="MYDATA", study.desc="Description", study.platform="PLATFORM", stringsAsFactors=FALSE) #################################################################### ## Fix seed to ensure reproducible results #################################################################### set.seed(3043) gds1KG <- snpgdsOpen(fileReferenceGDS) dataRef <- select1KGPop(gds1KG, nbProfiles=2L) closefn.gds(gds1KG) ## Required library for this example to run correctly if (requireNamespace("GenomeInfoDb", quietly=TRUE) && requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { ## Chromosome length information ## chr23 is chrX, chr24 is chrY and chrM is 25 chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] runRNAAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, pathProfileGDS=pathProfileGDS, pathGeno=pathGeno, pathOut=pathOut, fileReferenceGDS=fileReferenceGDS, fileReferenceAnnotGDS=fileAnnotGDS, chrInfo=chrInfo, syntheticRefDF=dataRef, blockTypeID="GeneS.Ensembl.Hsapiens.v86", genoSource="snp-pileup") unlink(pathProfileGDS, recursive=TRUE, force=TRUE) unlink(pathOut, recursive=TRUE, force=TRUE) }
## Required library for GDS library(SNPRelate) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata", package="RAIDS") ################################################################# ## Load the information about the profile ################################################################# data(demoPedigreeEx1) head(demoPedigreeEx1) ################################################################# ## The 1KG GDS file and the 1KG SNV Annotation GDS file ## need to be located in the same directory ## Note that the 1KG GDS file used for this example is a ## simplified version and CANNOT be used for any real analysis ################################################################# path1KG <- file.path(dataDir, "tests") fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") ################################################################# ## The Sample SNP pileup files (one per sample) need ## to be located in the same directory. ################################################################# pathGeno <- file.path(dataDir, "example", "snpPileup") ################################################################# ## The path where the Profile GDS Files (one per sample) ## will be created need to be specified. ################################################################# pathProfileGDS <- file.path(tempdir(), "out.tmp") pathOut <- file.path(tempdir(), "res.out") ################################################################# ## A data frame containing general information about the study ## is also required. 
The data frame must have ## those 3 columns: "study.id", "study.desc", "study.platform" ################################################################# studyDF <- data.frame(study.id="MYDATA", study.desc="Description", study.platform="PLATFORM", stringsAsFactors=FALSE) #################################################################### ## Fix seed to ensure reproducible results #################################################################### set.seed(3043) gds1KG <- snpgdsOpen(fileReferenceGDS) dataRef <- select1KGPop(gds1KG, nbProfiles=2L) closefn.gds(gds1KG) ## Required library for this example to run correctly if (requireNamespace("GenomeInfoDb", quietly=TRUE) && requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { ## Chromosome length information ## chr23 is chrX, chr24 is chrY and chrM is 25 chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] runRNAAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, pathProfileGDS=pathProfileGDS, pathGeno=pathGeno, pathOut=pathOut, fileReferenceGDS=fileReferenceGDS, fileReferenceAnnotGDS=fileAnnotGDS, chrInfo=chrInfo, syntheticRefDF=dataRef, blockTypeID="GeneS.Ensembl.Hsapiens.v86", genoSource="snp-pileup") unlink(pathProfileGDS, recursive=TRUE, force=TRUE) unlink(pathOut, recursive=TRUE, force=TRUE) }
The function randomly selects a fixed number of reference samples for each subcontinental population present in the 1KG GDS file. When a subcontinental population has fewer samples than the fixed number, all samples from the subcontinental population are selected.
select1KGPop(gdsReference, nbProfiles)
select1KGPop(gdsReference, nbProfiles)
gdsReference |
an object of class gds.class (a GDS file), the opened 1KG GDS file. |
nbProfiles |
a single positive |
a data.frame
containing those columns:
a character
string representing the sample
identifier.
a character
string representing the
subcontinental population assigned to the sample.
a character
string representing the
super-population assigned to the sample.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Required library library(gdsfmt) ## The number of samples needed by subcontinental population ## The number is small for demonstration purpose nbProfiles <- 5L ## Open 1KG GDS Demo file ## This file contains only one superpopulation (for demonstration purpose) dataDir <- system.file("extdata", package="RAIDS") fileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") gdsFileOpen <- openfn.gds(fileGDS, readonly=TRUE) ## Extract a selected number of random samples ## for each subcontinental population ## In the 1KG GDS Demo file, there is one subcontinental population dataR <- select1KGPop(gdsReference=gdsFileOpen, nbProfiles=nbProfiles) ## Close the 1KG GDS Demo file (important) closefn.gds(gdsFileOpen)
## Required library library(gdsfmt) ## The number of samples needed by subcontinental population ## The number is small for demonstration purpose nbProfiles <- 5L ## Open 1KG GDS Demo file ## This file contains only one superpopulation (for demonstration purpose) dataDir <- system.file("extdata", package="RAIDS") fileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") gdsFileOpen <- openfn.gds(fileGDS, readonly=TRUE) ## Extract a selected number of random samples ## for each subcontinental population ## In the 1KG GDS Demo file, there is one subcontinental population dataR <- select1KGPop(gdsReference=gdsFileOpen, nbProfiles=nbProfiles) ## Close the 1KG GDS Demo file (important) closefn.gds(gdsFileOpen)
The function randomly selects a fixed number of reference samples for each subcontinental population present in the 1KG GDS file. When a subcontinental population has fewer samples than the fixed number, all samples from the subcontinental population are selected.
select1KGPopForSynthetic(fileReferenceGDS, nbProfiles)
select1KGPopForSynthetic(fileReferenceGDS, nbProfiles)
fileReferenceGDS |
a |
nbProfiles |
a single positive |
a data.frame
containing those columns:
a character
string representing the sample
identifier.
a character
string representing the
subcontinental population assigned to the sample.
a character
string representing the
super-population assigned to the sample.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Required library library(gdsfmt) ## The number of samples needed by subcontinental population ## The number is small for demonstration purpose nbProfiles <- 5L ## 1KG GDS Demo file ## This file contains only one superpopulation (for demonstration purpose) dataDir <- system.file("extdata", package="RAIDS") fileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") ## Extract a selected number of random samples ## for each subcontinental population ## In the 1KG GDS Demo file, there is one subcontinental population dataR <- select1KGPopForSynthetic(fileReferenceGDS=fileGDS, nbProfiles=nbProfiles)
## Required library library(gdsfmt) ## The number of samples needed by subcontinental population ## The number is small for demonstration purpose nbProfiles <- 5L ## 1KG GDS Demo file ## This file contains only one superpopulation (for demonstration purpose) dataDir <- system.file("extdata", package="RAIDS") fileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") ## Extract a selected number of random samples ## for each subcontinental population ## In the 1KG GDS Demo file, there is one subcontinental population dataR <- select1KGPopForSynthetic(fileReferenceGDS=fileGDS, nbProfiles=nbProfiles)
data.frame
containing the
SNV information. The object is a data.frame
with 17 columns.
data(snpPositionDemo)
data(snpPositionDemo)
The data.frame
containing the SNV
information. The data.frame
contains 17 columns:
cnt.tot
an integer
representing the number of reads at
the SNV position.
cnt.ref
an integer
representing the number of reads
corresponding to the reference at the SNV position.
cnt.alt
an integer
representing the number of reads
different from the reference at the SNV position.
snp.pos
an integer
representing the position of the
SNV on the chromosome.
snp.chr
an integer
representing the chromosome on which
the SNV is located.
normal.geno
an integer
representing the genotype
(0=wild-type reference; 1=heterozygote; 2=homozygote alternative;
3=unknown).
pruned
a logical
indicating whether the SNV is pruned.
snp.index
an integer
representing the index of the
SNV in the reference SNV GDS file.
keep
a logical
indicating whether the genotype
exists for the SNV.
hetero
a logical
indicating whether the SNV is
heterozygous.
homo
a logical
indicating whether the SNV is homozygous.
block.id
an integer
representing the block identifier
associated with the current SNV.
phase
an integer
representing the block identifier
associated with the current SNV.
lap
a numeric
representing the lower allelic
fraction.
LOH
an integer
indicating whether the SNV is in an LOH
region (0=not LOH, 1=in LOH).
imbAR
an integer
indicating whether the SNV is in an
imbalanced region (-1=not classified as imbalanced or LOH, 0=in LOH;
1=tested positive for imbalance in at least 1 window).
freq
a numeric
representing the frequency of the
variant in the reference.
This dataset can be
used to test the calcAFMLRNA
and tableBlockAF
internal functions.
The data.frame
containing the SNV
information. The data.frame
contains 17 columns:
cnt.tot
an integer
representing the number of reads at
the SNV position.
cnt.ref
an integer
representing the number of reads
corresponding to the reference at the SNV position.
cnt.alt
an integer
representing the number of reads
different from the reference at the SNV position.
snp.pos
an integer
representing the position of the
SNV on the chromosome.
snp.chr
an integer
representing the chromosome on which
the SNV is located.
normal.geno
an integer
representing the genotype
(0=wild-type reference; 1=heterozygote; 2=homozygote alternative; 3=unknown).
pruned
a logical
indicating whether the SNV is pruned.
snp.index
an integer
representing the index of the
SNV in the reference SNV GDS file.
keep
a logical
indicating whether the genotype
exists for the SNV.
hetero
a logical
indicating whether the SNV is heterozygous.
homo
a logical
indicating whether the SNV is homozygous.
block.id
an integer
representing the block identifier
associated with the current SNV.
phase
an integer
representing the block identifier
associated with the current SNV.
lap
a numeric
representing the lower allelic fraction.
LOH
an integer
indicating whether the SNV is in an LOH region
(0=not LOH, 1=in LOH).
imbAR
an integer
indicating whether the SNV is in an
imbalanced region (-1=not classified as imbalanced or LOH, 0=in LOH;
1=tested positive for imbalance in at least 1 window).
freq
a numeric
representing the frequency of the
variant in the reference.
## Loading demo dataset containing SNV information data(snpPositionDemo) ## Only use a subset of heterozygote SNVs related to one block subset <- snpPositionDemo[which(snpPositionDemo$block.id == 2750 & snpPositionDemo$hetero), c("cnt.ref", "cnt.alt", "phase")] ## Compute the log likelihood ratio based on the coverage of ## each allele in a specific block result <- RAIDS:::calcAFMLRNA(subset) head(result)
## Loading demo dataset containing SNV information data(snpPositionDemo) ## Only use a subset of heterozygote SNVs related to one block subset <- snpPositionDemo[which(snpPositionDemo$block.id == 2750 & snpPositionDemo$hetero), c("cnt.ref", "cnt.alt", "phase")] ## Compute the log likelihood ratio based on the coverage of ## each allele in a specific block result <- RAIDS:::calcAFMLRNA(subset) head(result)
This function extracts the SNPs that pass a frequency cut-off in at least one super population from a GDS SNP information file and saves the retained SNP information into a VCF file.
snvListVCF(gdsReference, fileOut, offset = 0L, freqCutoff = NULL)
snvListVCF(gdsReference, fileOut, offset = 0L, freqCutoff = NULL)
gdsReference |
an object of class |
fileOut |
a |
offset |
a single |
freqCutoff |
a single positive |
The integer 0L
when successful.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Required library library(gdsfmt) ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") ## Demo 1KG Reference GDS file fileGDS <- openfn.gds(file.path(dataDir, "PopulationReferenceDemo.gds")) ## Output VCF file that will be created (temporary) vcfFile <- file.path(tempdir(), "Demo_TMP_01.vcf") ## Create a VCF file with the SNV dataset present in the GDS file ## No cutoff on frequency, so all SNVs are saved snvListVCF(gdsReference=fileGDS, fileOut=vcfFile, offset=0L, freqCutoff=NULL) ## Close GDS file (IMPORTANT) closefn.gds(fileGDS) ## Remove temporary VCF file unlink(vcfFile, force=TRUE)
## Required library library(gdsfmt) ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") ## Demo 1KG Reference GDS file fileGDS <- openfn.gds(file.path(dataDir, "PopulationReferenceDemo.gds")) ## Output VCF file that will be created (temporary) vcfFile <- file.path(tempdir(), "Demo_TMP_01.vcf") ## Create a VCF file with the SNV dataset present in the GDS file ## No cutoff on frequency, so all SNVs are saved snvListVCF(gdsReference=fileGDS, fileOut=vcfFile, offset=0L, freqCutoff=NULL) ## Close GDS file (IMPORTANT) closefn.gds(fileGDS) ## Remove temporary VCF file unlink(vcfFile, force=TRUE)
The function groups the samples per subcontinental population and generates a matrix containing the sample identifiers and where each column is a subcontinental population.
splitSelectByPop(dataRef)
splitSelectByPop(dataRef)
dataRef |
a
|
a matrix
containing the sample identifiers and where
each column is the name of a subcontinental population. The number of
rows corresponds to the number of samples for each subcontinental population.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## A data.frame containing samples from 2 subcontinental populations demo <- data.frame(sample.id=c("SampleA", "SampleB", "SampleC", "SampleD"), pop.group=c("TSI", "TSI", "YRI", "YRI"), superPop=c("EUR", "EUR", "AFR", "AFR")) ## Generate a matrix populated with the sample identifiers and where ## each column is a subcontinental population splitSelectByPop(dataRef=demo)
## A data.frame containing samples from 2 subcontinental populations demo <- data.frame(sample.id=c("SampleA", "SampleB", "SampleC", "SampleD"), pop.group=c("TSI", "TSI", "YRI", "YRI"), superPop=c("EUR", "EUR", "AFR", "AFR")) ## Generate a matrix populated with the sample identifiers and where ## each column is a subcontinental population splitSelectByPop(dataRef=demo)
The function uses one cancer profile in combination with one 1KG reference profile to generate a synthetic profile that is saved in the Profile GDS file.
When more than one 1KG reference profile is specified, the function recursively generates synthetic profiles for each cancer profile + 1KG reference profile combination.
The number of synthetic profiles generated per combination is specified by the number of simulations requested.
syntheticGeno( gdsReference, gdsRefAnnot, fileProfileGDS, profileID, listSampleRef, nbSim = 1L, prefix = "", pRecomb = 0.01, minProb = 0.999, seqError = 0.001 )
syntheticGeno( gdsReference, gdsRefAnnot, fileProfileGDS, profileID, listSampleRef, nbSim = 1L, prefix = "", pRecomb = 0.01, minProb = 0.999, seqError = 0.001 )
gdsReference |
an object of class |
gdsRefAnnot |
an object of class |
fileProfileGDS |
a |
profileID |
a |
listSampleRef |
a |
nbSim |
a single positive |
prefix |
a |
pRecomb |
a single positive |
minProb |
a single positive |
seqError |
a single positive |
The integer 0L
when the function is successful.
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
## Required library library(gdsfmt) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") ## Profile GDS file (temporary) fileNameGDS <- file.path(tempdir(), "ex1.gds") ## Copy the Profile GDS file demo that has been pruned and annotated file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), fileNameGDS) ## Information about the synthetic data set syntheticStudyDF <- data.frame(study.id="MYDATA.Synthetic", study.desc="MYDATA synthetic data", study.platform="PLATFORM", stringsAsFactors=FALSE) ## Add information related to the synthetic profiles into the Profile GDS prepSynthetic(fileProfileGDS=fileNameGDS, listSampleRef=c("HG00243", "HG00150"), profileID="ex1", studyDF=syntheticStudyDF, nbSim=1L, prefix="synthTest", verbose=FALSE) ## The 1KG files gds1KG <- snpgdsOpen(file.path(dataDir, "ex1_good_small_1KG.gds")) gds1KGAnnot <- openfn.gds(file.path(dataDir, "ex1_good_small_1KG_Annot.gds")) ## Generate the synthetic profiles and add them into the Profile GDS syntheticGeno(gdsReference=gds1KG, gdsRefAnnot=gds1KGAnnot, fileProfileGDS=fileNameGDS, profileID="ex1", listSampleRef=c("HG00243", "HG00150"), nbSim=1, prefix="synthTest", pRecomb=0.01, minProb=0.999, seqError=0.001) ## Open Profile GDS file profileGDS <- openfn.gds(fileNameGDS) tail(read.gdsn(index.gdsn(profileGDS, "sample.id"))) ## Close GDS files (important) closefn.gds(profileGDS) closefn.gds(gds1KG) closefn.gds(gds1KGAnnot) ## Remove Profile GDS file (created for demo purpose) unlink(fileNameGDS, force=TRUE)
## Required library library(gdsfmt) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") ## Profile GDS file (temporary) fileNameGDS <- file.path(tempdir(), "ex1.gds") ## Copy the Profile GDS file demo that has been pruned and annotated file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), fileNameGDS) ## Information about the synthetic data set syntheticStudyDF <- data.frame(study.id="MYDATA.Synthetic", study.desc="MYDATA synthetic data", study.platform="PLATFORM", stringsAsFactors=FALSE) ## Add information related to the synthetic profiles into the Profile GDS prepSynthetic(fileProfileGDS=fileNameGDS, listSampleRef=c("HG00243", "HG00150"), profileID="ex1", studyDF=syntheticStudyDF, nbSim=1L, prefix="synthTest", verbose=FALSE) ## The 1KG files gds1KG <- snpgdsOpen(file.path(dataDir, "ex1_good_small_1KG.gds")) gds1KGAnnot <- openfn.gds(file.path(dataDir, "ex1_good_small_1KG_Annot.gds")) ## Generate the synthetic profiles and add them into the Profile GDS syntheticGeno(gdsReference=gds1KG, gdsRefAnnot=gds1KGAnnot, fileProfileGDS=fileNameGDS, profileID="ex1", listSampleRef=c("HG00243", "HG00150"), nbSim=1, prefix="synthTest", pRecomb=0.01, minProb=0.999, seqError=0.001) ## Open Profile GDS file profileGDS <- openfn.gds(fileNameGDS) tail(read.gdsn(index.gdsn(profileGDS, "sample.id"))) ## Close GDS files (important) closefn.gds(profileGDS) closefn.gds(gds1KG) closefn.gds(gds1KGAnnot) ## Remove Profile GDS file (created for demo purpose) unlink(fileNameGDS, force=TRUE)