--- title: "Example Data for RNAseqCovarImpute" author: - name: Brennan H Baker affiliation: University of Washington email: brennanhilton@gmail.com package: RNAseqCovarImpute output: BiocStyle::html_document vignette: | %\VignetteIndexEntry{Example Data for RNAseqCovarImpute} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} editor_options: chunk_output_type: console --- # Generate random data Normally you would have your own covariate and RNA-sequencing data. We generate random data below for the purpose of demonstrating how the functions in the RNAseqCovarImpute package work. ```{r, message=FALSE, warning=FALSE, eval = requireNamespace(c("stringr", "tidyr"))} library(dplyr) library(stringr) library(tidyr) library(edgeR) library(mice) # Generate random covariate data set.seed(2023) x1 <- rnorm(n = 500) y1 <- rnorm(n = 500) z1 <- rnorm(n = 500) a1 <- rbinom(n = 500, prob = .25, size = 1) b1 <- rbinom(n = 500, prob = .5, size = 1) data <- data.frame(x = x1, y = y1, z = z1, a = a1, b = b1) # Generate random count data from Poisson distribution nsamp <- nrow(data) ngene <- 500 mat <- matrix(stats::rpois(n = nsamp * ngene, lambda = sample(1:500, ngene, replace = TRUE)), nrow = ngene, ncol = nsamp ) # Make fake ENSEMBL gene numbers annot <- tibble(number = seq_len(ngene), name1 = "ENS") %>% mutate(ENSEMBL = str_c(name1, number)) %>% dplyr::select(ENSEMBL) rownames(mat) <- annot$ENSEMBL # Make DGE list and set rownames to ENSEMBL gene numbers above example_DGE <- DGEList(mat, genes = annot) rownames(example_DGE) <- annot$ENSEMBL ``` # Simulate missingness in the random data Now we simulate missingness in the random data generated above using the mice ampute function. The below code simulates missingness in the data for all but the first variable (x), which we consider the main predictor of interest in this example. See the ampute documentation for more details. ```{r, message=FALSE, warning=FALSE} # First get all combos of 0 or 1 for the 4 other variables (y, z, a, and b) pattern_vars <- expand.grid(c(0, 1), c(0, 1), c(0, 1), c(0, 1)) # Then add back a column for the predictor of interest, which is never amputed, so the first col =1 the whole way down pattern2 <- matrix(1, nrow = nrow(pattern_vars), ncol = 1) pattern1 <- cbind(pattern2, pattern_vars) # Remove last row which is all 1s (all 1s means no missingness induced) pattern <- pattern1[seq_len(15), ] # Specify proportion of missing data and missingness mechanism for the ampute function prop_miss <- 25 miss_mech <- "MAR" result <- ampute(data = data, prop = prop_miss, mech = miss_mech, patterns = pattern) # Extract the missing data example_data <- result$amp # Ampute function turns factors to numeric, so convert back to factors example_data <- example_data %>% mutate( a = as.factor(a), b = as.factor(b) ) # Calculate the new sample size if NAs are dropped as in a complete case analysis sample_size <- example_data %>% drop_na() %>% nrow() # As shown below, we have 24.2% missingness. 100 * (nsamp - sample_size) / nsamp ``` ```{r, message=FALSE, warning=FALSE, echo=FALSE} example_data %>% head(10) %>% knitr::kable(digits = 3, caption = "The first 10 rows of simulated covariate data with missingness") ``` # Session info {.unnumbered} ```{r sessionInfo, echo=FALSE} sessionInfo() ```