---
title: "Guided tutorial to COTAN V.2"
author:
- name: "Silvia Giulia Galfrè"
  affiliation: "Department of Computer Science, University of Pisa"

package: COTAN
output: BiocStyle::html_document
vignette: >
  %\VignetteIndexEntry{Guided tutorial to COTAN V.2}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

## Preamble

```{r, include = FALSE}
knitr::opts_chunk[["set"]](
  collapse = TRUE,
  comment = "#>"
#  fig.width = 5L,
#  fig.height = 5L
)
```

```{r message=FALSE, warning=FALSE}
library(COTAN)
library(zeallot)
library(rlang)
library(data.table)
library(Rtsne)
library(qpdf)
library(GEOquery)

options(parallelly.fork.enable = TRUE)
```

## Introduction

This tutorial covers the same functionalities as the first release of the
COTAN tutorial, but uses the new and updated functions.

## Get the data-set

Download the data-set for `"mouse cortex E17.5"`.

```{r eval=TRUE, include=TRUE}
dataDir <- tempdir()

GEO <- "GSM2861514"
fName <- "GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz"
dataSetFile <- file.path(dataDir, GEO, fName)

dir.create(file.path(dataDir, GEO), showWarnings = FALSE)

if (!file.exists(dataSetFile)) {
  getGEOSuppFiles(GEO, makeDirectory = TRUE,
                  baseDir = dataDir, fetch_files = TRUE,
                  filter_regex = fName)
}

sample.dataset <- read.csv(dataSetFile, sep = "\t", row.names = 1L)
```

Define a directory where the output will be stored.

```{r}
outDir <- dataDir

# Log-level 2 was chosen to showcase better how the package works
# In normal usage a level of 0 or 1 is more appropriate
setLoggingLevel(2L)

# This file will contain all the logs produced by the package
# as if at the highest logging level
setLoggingFile(file.path(outDir, "vignette_v2.log"))

message("COTAN uses the `torch` library when asked to `optimizeForSpeed`")
message("Run the command 'options(COTAN.UseTorch = FALSE)'",
        " in your session to disable `torch` completely!")

# this command does check whether the torch library is properly installed
c(useTorch, deviceStr) %<-% COTAN:::canUseTorch(TRUE, "cuda")
if (useTorch) {
  message("The `torch` library is available and ready to use")
  if (deviceStr == "cuda") {
    message("The `torch` library can use the `CUDA` GPU")
  } else {
    message("The `torch` library can only use the CPU")
    message("Please ensure you have the `OpenBLAS` libraries",
            " installed on the system")
  }
}
rm(useTorch, deviceStr)
```

# Analytical pipeline

Initialize the `COTAN` object with the raw count table and the metadata for
the experiment.

```{r}
cond <- "mouse_cortex_E17.5"

obj <- COTAN(raw = sample.dataset)
obj <- initializeMetaDataset(obj,
                             GEO = GEO,
                             sequencingMethod = "Drop_seq",
                             sampleCondition = cond)

logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])),
        logLevel = 1L)
```

Before we proceed to the analysis, we need to clean the data. The analysis
will use a matrix of raw UMI counts as the input. To obtain this matrix, we
have to remove any potential cell doublets or multiplets, as well as any
low-quality or dying cells.

## Data cleaning

We can check the library size (UMI number) with an empirical cumulative
distribution function.

```{r}
plot(ECDPlot(obj))
```

```{r}
plot(cellSizePlot(obj))
```

```{r}
plot(genesSizePlot(obj))
```

```{r}
plot(scatterPlot(obj))
```

During the cleaning, every time we want to remove cells or genes we can use
the `dropGenesCells()` function.
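To choose sensible values for the thresholds used in the next steps, it can
help to look at a few quantiles of the library sizes and of the number of
expressed genes per cell. Below is a minimal sketch; the chosen probabilities
are illustrative and not part of the original pipeline.

```{r eval=FALSE, include=TRUE}
# Illustrative quantiles of the cells' library sizes and of the numbers of
# expressed genes per cell: they can guide the thresholds chosen below
quantile(getCellsSize(obj), probs = c(0.5, 0.9, 0.99, 0.999))
quantile(getNumExpressedGenes(obj), probs = c(0.5, 0.9, 0.99, 0.999))
```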
Drop cells with too many reads, as they are probably doublets or multiplets.

```{r}
cellsSizeThr <- 6000L
obj <- addElementToMetaDataset(obj, "Cells size threshold", cellsSizeThr)

cells_to_rem <- getCells(obj)[getCellsSize(obj) > cellsSizeThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)

plot(cellSizePlot(obj))
```

To drop cells by gene number: a high number of expressed genes might also
indicate doublets...

```{r}
genesSizeHighThr <- 3000L
obj <- addElementToMetaDataset(obj, "Num genes high threshold",
                               genesSizeHighThr)

cells_to_rem <- getCells(obj)[getNumExpressedGenes(obj) > genesSizeHighThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)

plot(genesSizePlot(obj))
```

Drop cells expressing too few genes, as they are probably dead.

```{r}
genesSizeLowThr <- 500L
obj <- addElementToMetaDataset(obj, "Num genes low threshold",
                               genesSizeLowThr)

cells_to_rem <- getCells(obj)[getNumExpressedGenes(obj) < genesSizeLowThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)

plot(genesSizePlot(obj))
```

Cells with a too high percentage of mitochondrial genes are likely dead (or
at least problematic) cells, so we drop them!

```{r}
c(mitPlot, mitSizes) %<-% mitochondrialPercentagePlot(obj, genePrefix = "^Mt")

plot(mitPlot)
```

```{r}
mitPercThr <- 1.0
obj <- addElementToMetaDataset(obj, "Mitoc. perc. threshold", mitPercThr)

cells_to_rem <- rownames(mitSizes)[mitSizes[["mit.percentage"]] > mitPercThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)

c(mitPlot, mitSizes) %<-% mitochondrialPercentagePlot(obj, genePrefix = "^Mt")

plot(mitPlot)
```

If we do not want to consider the mitochondrial genes, we can remove them
before starting the analysis.

```{r}
genes_to_rem <- getGenes(obj)[grep("^Mt", getGenes(obj))]
cells_to_rem <- getCells(obj)[which(getCellsSize(obj) == 0L)]
obj <- dropGenesCells(obj, genes_to_rem, cells_to_rem)
```

We also want to log the current status.

```{r}
logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)
```

The `clean()` function estimates all the parameters for the data. Therefore,
we have to run it again every time we remove any genes or cells from the
data.

```{r}
obj <- addElementToMetaDataset(obj, "Num drop B group", 0)

obj <- clean(obj)

c(pcaCellsPlot, pcaCellsData, genesPlot,
  UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj)

plot(pcaCellsPlot)
```

```{r}
plot(genesPlot)
```

We can observe here that the red cells are really enriched in hemoglobin
genes, so we prefer to remove them. They can be extracted from the
`pcaCellsData` object and dropped.

```{r eval=TRUE, include=TRUE}
cells_to_rem <- rownames(pcaCellsData)[pcaCellsData[["groups"]] == "B"]
obj <- dropGenesCells(obj, cells = cells_to_rem)

obj <- addElementToMetaDataset(obj, "Num drop B group", 1)

obj <- clean(obj)

c(pcaCellsPlot, pcaCellsData, genesPlot,
  UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj)

plot(pcaCellsPlot)
```

To color the PCA based on `nu` (i.e. the cells' efficiency):

```{r}
plot(UDEPlot)
```

The UDE (color) should not correlate with the principal components! This is
very important. The next part is used to remove the cells with too low an
efficiency.

```{r}
plot(nuPlot)
```

We can zoom in on the smallest values and, if a clear elbow is detected, we
can decide to remove the corresponding cells.
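Before (or alongside) the zoomed plot, the lowest `nu` values can also be
inspected numerically. Here is a minimal sketch using the `getNu()` accessor
(already used below to apply the threshold); the number of values shown is
arbitrary.

```{r eval=FALSE, include=TRUE}
# The lowest UDE (nu) values: a sharp jump suggests an elbow and thus a
# natural threshold for dropping low-efficiency cells
head(sort(getNu(obj)), n = 20L)
```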
```{r}
plot(zoomedNuPlot)
```

We also save the chosen threshold in the metadata and drop the cells below
it.

```{r}
UDELowThr <- 0.30
obj <- addElementToMetaDataset(obj, "Low UDE cells' threshold", UDELowThr)

obj <- addElementToMetaDataset(obj, "Num drop B group", 2)

cells_to_rem <- getCells(obj)[getNu(obj) < UDELowThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)
```

Repeat the estimation after the cells are removed.

```{r}
obj <- clean(obj)

c(pcaCellsPlot, pcaCellsData, genesPlot,
  UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj)

plot(pcaCellsPlot)
```

```{r}
plot(scatterPlot(obj))
```

```{r}
logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)
```

## COTAN analysis

In this part, all the contingency tables are computed and used to obtain the
statistics necessary for the `COEX` evaluation and storage.

```{r}
obj <- proceedToCoex(obj, calcCoex = TRUE,
                     optimizeForSpeed = TRUE, cores = 6L, deviceStr = "cuda",
                     saveObj = FALSE, outDir = outDir)
```

When `saveObj == TRUE` in the previous step, the following manual saving step
can be skipped, as the `COTAN` object has already been saved in the `outDir`.

```{r eval=FALSE, include=TRUE}
# saving the structure
saveRDS(obj, file = file.path(outDir, paste0(cond, ".cotan.RDS")))
```

## Automatic run

It is also possible to run a single function directly if we don't want to
clean anything.

```{r eval=FALSE, include=TRUE}
obj2 <- automaticCOTANObjectCreation(
  raw = sample.dataset,
  GEO = GEO,
  sequencingMethod = "Drop_seq",
  sampleCondition = cond,
  calcCoex = TRUE, cores = 6L,
  optimizeForSpeed = TRUE,
  saveObj = TRUE, outDir = outDir)
```

# Analysis of the elaborated data

## GDI

To calculate and store the Global Differentiation Index (`GDI`) we can run:

```{r}
gdiDF <- calculateGDI(obj)
head(gdiDF)

# This will store only the $GDI column
obj <- storeGDI(obj, genesGDI = gdiDF)
```

The next function can either plot the `GDI` for the dataset directly or use
the pre-computed data-frame. It marks the given threshold (in red; `1.40` in
the example below) and the 50% and 75% quantiles (in blue). We can also
specify some gene sets (three in this case) that we want to label explicitly
in the `GDI` plot.

```{r}
genesList <- list(
  "NPGs" = c("Nes", "Vim", "Sox2", "Sox1", "Notch1", "Hes1", "Hes5", "Pax6"),
  "PNGs" = c("Map2", "Tubb3", "Neurod1", "Nefm", "Nefl", "Dcx", "Tbr1"),
  "hk"   = c("Calm1", "Cox6b1", "Ppia", "Rpl18", "Cox7c", "Erh", "H3f3a",
             "Taf1", "Taf2", "Gapdh", "Actb", "Golph3", "Zfr", "Sub1",
             "Tars", "Amacr")
)

GDIPlot(obj, cond = cond, genes = genesList, GDIThreshold = 1.40)
```

The third column of the `GDI` data-frame reports the percentage of cells
expressing the corresponding gene.

## Heatmaps

To perform a Gene Pair Analysis, we can plot a heatmap of the `COEX` values
between two gene sets. We have to define the different gene sets in a list
(here `genesList`). Then we can choose which sets to use via the `sets`
parameter of the function (for example, from 1 to 3; see the sketch after the
next chunk). We can also provide an array of file name prefixes, one for each
condition (for example, "mouse_cortex_E17.5"): in fact, this function can
plot gene relationships across many different conditions to give a complete
overview.

```{r}
plot(heatmapPlot(obj, genesLists = genesList))
```
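As mentioned above, the `sets` parameter can restrict the heatmap to a subset
of the gene lists. The sketch below assumes the call accepts the same
arguments as the one above; restricting the plot to the first two sets is an
arbitrary choice made only for illustration.

```{r eval=FALSE, include=TRUE}
# Illustrative only: plot the gene-pair heatmap for the first two gene sets
plot(heatmapPlot(obj, genesLists = genesList, sets = 1L:2L))
```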
We can also plot a general heatmap of `COEX` values based on some markers,
like the following one.

```{r}
plot(genesHeatmapPlot(obj, primaryMarkers = c("Satb2", "Bcl11b", "Vim", "Hes1"),
                      pValueThreshold = 0.001, symmetric = TRUE))
```

```{r eval=FALSE, include=TRUE}
plot(genesHeatmapPlot(obj, primaryMarkers = c("Satb2", "Bcl11b", "Fezf2"),
                      secondaryMarkers = c("Gabra3", "Meg3", "Cux1", "Neurod6"),
                      pValueThreshold = 0.001, symmetric = FALSE))
```

## Get data tables

Sometimes we may also be interested in the numbers present directly in the
contingency tables for two specific genes. To get them we can use two
functions:

`contingencyTables()` produces the observed and expected data.

```{r}
print("Contingency Tables:")

contingencyTables(obj, g1 = "Satb2", g2 = "Bcl11b")

print("Corresponding Coex")

getGenesCoex(obj)["Satb2", "Bcl11b"]
```

Another useful function is `getGenesCoex()`. This can be used to extract the
whole `COEX` matrix, or part of it, from a `COTAN` object.

```{r}
# For the whole matrix
coex <- getGenesCoex(obj, zeroDiagonal = FALSE)
coex[1000L:1005L, 1000L:1005L]
```

```{r}
# For a partial matrix
coex <- getGenesCoex(obj, genes = c("Satb2", "Bcl11b", "Fezf2"))
coex[1000L:1005L, ]
```

## Establishing genes' clusters

`COTAN` provides a way to establish genes' clusters given some lists of
markers.

```{r eval=TRUE, include=TRUE}
layersGenes <- list(
  "L1"   = c("Reln",   "Lhx5"),
  "L2/3" = c("Satb2",  "Cux1"),
  "L4"   = c("Rorb",   "Sox5"),
  "L5/6" = c("Bcl11b", "Fezf2"),
  "Prog" = c("Vim",    "Hes1")
)

c(gSpace, eigPlot, pcaGenesClDF, treePlot) %<-%
  establishGenesClusters(obj, groupMarkers = layersGenes,
                         numGenesPerMarker = 25L, kCuts = 5L)

plot(eigPlot)
```

```{r eval=TRUE, include=TRUE}
plot(treePlot)
```

```{r eval=TRUE, include=TRUE}
colSelection <- vapply(pcaGenesClDF, is.numeric, logical(1L))
genesUmapPl <- UMAPPlot(pcaGenesClDF[, colSelection, drop = FALSE],
                        clusters = getColumnFromDF(pcaGenesClDF, "hclust"),
                        elements = layersGenes,
                        title = "Genes' clusters UMAP Plot",
                        numNeighbors = 32L, minPointsDist = 0.25)

plot(genesUmapPl)
```

## Uniform Clustering

It is possible to obtain a cell clusterization based on the concept of
*uniformity* of expression of the genes across the cells. That is, each
cluster satisfies the null hypothesis of the `COTAN` model: the genes'
expression does not depend on the particular cell under consideration.

There are two functions involved in obtaining a proper clusterization: the
first is `cellsUniformClustering()`, which uses standard clustering methods,
but then discards and re-clusters any *non-uniform* cluster.

Please note that the most important parameters for the users are the
`GDIThreshold`s inside the **Uniform Transcript** checkers: they define how
strict the check is. A default-constructed advanced check gives a pretty
strong guarantee of uniformity for the *cluster* (a sketch on how to relax it
follows the next chunk).

```{r eval=FALSE, include=TRUE}
# This code is a little too computationally heavy to be used in an example
# So we stored the result and we can load it in the next section

# default constructed checker is OK
advChecker <- new("AdvancedGDIUniformityCheck")

c(splitClusters, splitCoexDF) %<-%
  cellsUniformClustering(obj, initialResolution = 0.8,
                         checker = advChecker,
                         optimizeForSpeed = TRUE, deviceStr = "cuda",
                         cores = 6L, saveObj = TRUE, outDir = outDir)

obj <- addClusterization(obj, clName = "split",
                         clusters = splitClusters, coexDF = splitCoexDF)

table(splitClusters)
```
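As an illustrative aside (not part of the original pipeline), the strictness
of a checker can be tuned with `shiftCheckerThresholds()`, the same helper
used for the merge step later in this vignette; the `0.05` shift below is an
arbitrary value.

```{r eval=FALSE, include=TRUE}
# `advChecker` is the default-constructed checker from the chunk above;
# a larger shift yields a less stringent (more permissive) checker.
# The 0.05 shift is purely illustrative.
relaxedChecker <- shiftCheckerThresholds(advChecker, 0.05)
```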
In the case one already has an existing *clusterization*, it is possible to
calculate the *DEA* `data.frame` and add it to the `COTAN` object.

```{r eval=TRUE, include=TRUE}
data("vignette.split.clusters", package = "COTAN")
splitClusters <- vignette.split.clusters[getCells(obj)]

splitCoexDF <- DEAOnClusters(obj, clusters = splitClusters)

obj <- addClusterization(obj, clName = "split", clusters = splitClusters,
                         coexDF = splitCoexDF, override = FALSE)
```

It is possible to obtain some statistics about the *clusterization*.

```{r eval=TRUE, include=TRUE}
c(summaryData, summaryPlot) %<-%
  clustersSummaryPlot(obj, clName = "split", plotTitle = "split summary")

summaryData
```

The `ExpGenes` column contains the number of genes that are expressed in any
cell of the relevant *cluster*, while the `ExpGenes25` column contains the
number of genes expressed in at least 25% of the cells of the relevant
*cluster*.

```{r eval=TRUE, include=TRUE}
plot(summaryPlot)
```

It is possible to visualize how relevant the *marker genes'* lists are with
respect to the given *clusterization*.

```{r eval=TRUE, include=TRUE}
c(., scoreDF) %<-% clustersMarkersHeatmapPlot(obj, groupMarkers = layersGenes,
                                              clName = "split", kCuts = 5L)
```

Since the algorithm that creates the *clusters* is not directly geared to
achieve cluster uniformity, there might be some *clusters* that can be merged
back together and still be **uniform**. This is the purpose of the function
`mergeUniformCellsClusters()`, which takes a *clusterization* and tries to
merge cluster pairs, after checking that together each pair forms a uniform
cluster.

In order to avoid running all the possible checks (their number can explode
quickly with the number of *clusters*), the function relies on a distance
between clusters to find the cluster pairs with the highest chance of being
merged.

```{r eval=FALSE, include=TRUE}
c(mergedClusters, mergedCoexDF) %<-%
  mergeUniformCellsClusters(obj, clusters = splitClusters,
                            checkers = advChecker,
                            optimizeForSpeed = TRUE, deviceStr = "cuda",
                            cores = 6L, saveObj = TRUE, outDir = outDir)

# merges are:
# 1 <- 06 + 07
# 2 <- '-1' + 08 + 11
# 3 <- 09 + 10
# 4 <- 01
# 5 <- 02
# 6 <- 12 + 13
# 7 <- 03 + 05
# 8 <- 04

obj <- addClusterization(obj, clName = "merge", override = TRUE,
                         clusters = mergedClusters, coexDF = mergedCoexDF)

table(mergedClusters)
```

Again, here, the most important parameters for the users are the
`GDIThreshold`s inside the **Uniform Transcript** checkers: they define how
strict the check is. A default-constructed advanced check gives a pretty
strong guarantee of uniformity for the *cluster*. If one wants to achieve a
more *relaxed* merge, it is possible to define a sequence of checkers, each
less stringent than the previous, that will be applied sequentially: first
all the merges that satisfy the strictest checker are performed, then those
that satisfy the second, and so on.

```{r eval=FALSE, include=TRUE}
checkersList <- list(advChecker,
                     shiftCheckerThresholds(advChecker, 0.01),
                     shiftCheckerThresholds(advChecker, 0.03))

prevCheckRes <- data.frame()
# In this case we want to re-use the already calculated merge checks
# So we reload them from the output files. This, along with a similar facility
# for the split method, is also useful in case the execution is interrupted
# prematurely...
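# Note (assumption): the check-result files read below were produced by the
# earlier `mergeUniformCellsClusters()` call because it was run with
# `saveObj = TRUE`; they are expected as `all_check_results_XX.csv` files
# under `outDir/<condition>/leafs_merge/`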
if (TRUE) {
  # read from the last file among those named all_check_results_XX.csv
  mergeDir <- file.path(outDir, cond, "leafs_merge")
  resFiles <- list.files(path = mergeDir, pattern = "all_check_results_.*csv",
                         full.names = TRUE)
  prevCheckRes <- read.csv(resFiles[length(resFiles)],
                           header = TRUE, row.names = 1L)
}

c(mergedClusters2, mergedCoexDF2) %<-%
  mergeUniformCellsClusters(obj, clusters = splitClusters,
                            checkers = checkersList,
                            allCheckResults = prevCheckRes,
                            optimizeForSpeed = TRUE, deviceStr = "cuda",
                            cores = 6L, saveObj = TRUE, outDir = outDir)

# merges are:
# 1 <- '-1' + 06 + 09
# 2 <- 07
# 3 <- 03 + 04 + 05 + 10 + 13
# 4 <- 12
# 5 <- 01 + 08 + 11
# 6 <- 02

obj <- addClusterization(obj, clName = "merge2", override = TRUE,
                         clusters = mergedClusters2, coexDF = mergedCoexDF2)

table(mergedClusters2)
```

In the case one already has existing *clusterizations*, it is possible to
calculate their *DEA* `data.frame`s and add them to the `COTAN` object.

```{r eval=TRUE, include=TRUE}
data("vignette.merge.clusters", package = "COTAN")
mergedClusters <- vignette.merge.clusters[getCells(obj)]

mergedCoexDF <- DEAOnClusters(obj, clusters = mergedClusters)

obj <- addClusterization(obj, clName = "merge", clusters = mergedClusters,
                         coexDF = mergedCoexDF, override = FALSE)

data("vignette.merge2.clusters", package = "COTAN")
mergedClusters2 <- vignette.merge2.clusters[getCells(obj)]

mergedCoexDF2 <- DEAOnClusters(obj, clusters = mergedClusters2)

obj <- addClusterization(obj, clName = "merge2", clusters = mergedClusters2,
                         coexDF = mergedCoexDF2, override = FALSE)
```

Here it is possible to visualize how the `split` clusters are merged into
`merge` and `merge2`.

```{r eval=TRUE, include=TRUE}
invisible(
  clustersMarkersHeatmapPlot(obj, clName = "merge",
                             condNameList = "split",
                             conditionsList = list(splitClusters)))

invisible(
  clustersMarkersHeatmapPlot(obj, clName = "merge2",
                             condNameList = "split",
                             conditionsList = list(splitClusters)))
```

In the graphs above, it is possible to check that the found clusters align
very well with the expression of the layers' genes defined above...

It is possible to plot the *clusterization* on a `UMAP` plot.

```{r}
c(umapPlot, cellsPCA) %<-% cellsUMAPPlot(obj, clName = "merge2",
                                         dataMethod = "AdjBinarized",
                                         genesSel = "HGDI",
                                         colors = NULL,
                                         numNeighbors = 15L,
                                         minPointsDist = 0.2)

plot(umapPlot)
```

## Vignette clean-up stage

The next few lines just clean up the files and directories created by this
vignette.

```{r}
if (file.exists(file.path(outDir, paste0(cond, ".cotan.RDS")))) {
  # Delete the file if it exists
  file.remove(file.path(outDir, paste0(cond, ".cotan.RDS")))
}

if (file.exists(file.path(outDir, paste0(cond, "_times.csv")))) {
  # Delete the file if it exists
  file.remove(file.path(outDir, paste0(cond, "_times.csv")))
}

if (dir.exists(file.path(outDir, cond))) {
  unlink(file.path(outDir, cond), recursive = TRUE)
}

if (dir.exists(file.path(outDir, GEO))) {
  unlink(file.path(outDir, GEO), recursive = TRUE)
}

# stop logging to file
setLoggingFile("")
file.remove(file.path(outDir, "vignette_v2.log"))

options(parallelly.fork.enable = FALSE)
```

```{r}
Sys.time()
```

```{r}
sessionInfo()
```