Introduction

SpatialDE by Svensson et al., 2018, is a method to identify spatially variable genes (SVGs) in spatially resolved transcriptomics data.

Installation

You can install spatialDE from Bioconductor with the following code:

if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}
BiocManager::install("spatialDE")

Example: Mouse Olfactory Bulb

Reproducing the example analysis from the original SpatialDE Python package.

library(spatialDE)
library(ggplot2)

Load data

Files originally retrieved from SpatialDE GitHub repository from the following links: https://github.com/Teichlab/SpatialDE/blob/master/Analysis/MouseOB/data/Rep11_MOB_0.csv https://github.com/Teichlab/SpatialDE/blob/master/Analysis/MouseOB/MOB_sample_info.csv

# Expression file used in python SpatialDE. 
data("Rep11_MOB_0")

# Sample Info file used in python SpatialDE
data("MOB_sample_info")

The Rep11_MOB_0 object contains spatial expression data for 16218 genes on 262 spots, with genes as rows and spots as columns.

Rep11_MOB_0[1:5, 1:5]
#>         16.92x9.015 16.945x11.075 16.97x10.118 16.939x12.132 16.949x13.055
#> Nrf1              1             0            0             1             0
#> Zbtb5             1             0            1             0             0
#> Ccnl1             1             3            1             1             0
#> Lrrfip1           2             2            0             0             3
#> Bbs1              1             2            0             4             0
dim(Rep11_MOB_0)
#> [1] 16218   262

The MOB_sample_info object contains a data.frame with coordinates for each spot.

head(MOB_sample_info)
#>                    x      y total_counts
#> 16.92x9.015   16.920  9.015        18790
#> 16.945x11.075 16.945 11.075        36990
#> 16.97x10.118  16.970 10.118        12471
#> 16.939x12.132 16.939 12.132        22703
#> 16.949x13.055 16.949 13.055        18641
#> 16.942x15.088 16.942 15.088        23408

Filter out pratically unobserved genes

Rep11_MOB_0 <- Rep11_MOB_0[rowSums(Rep11_MOB_0) >= 3, ]

Get total_counts for every spot

Rep11_MOB_0 <- Rep11_MOB_0[, row.names(MOB_sample_info)]
MOB_sample_info$total_counts <- colSums(Rep11_MOB_0)
head(MOB_sample_info)
#>                    x      y total_counts
#> 16.92x9.015   16.920  9.015        18790
#> 16.945x11.075 16.945 11.075        36990
#> 16.97x10.118  16.970 10.118        12471
#> 16.939x12.132 16.939 12.132        22703
#> 16.949x13.055 16.949 13.055        18641
#> 16.942x15.088 16.942 15.088        23408

Get coordinates from `MOB_sample_info`

X <- MOB_sample_info[, c("x", "y")]
head(X)
#>                    x      y
#> 16.92x9.015   16.920  9.015
#> 16.945x11.075 16.945 11.075
#> 16.97x10.118  16.970 10.118
#> 16.939x12.132 16.939 12.132
#> 16.949x13.055 16.949 13.055
#> 16.942x15.088 16.942 15.088

`stabilize`

The SpatialDE method assumes normally distributed data, so we stabilize the variance of the negative binomial distributed counts data using Anscombe’s approximation. The stabilize() function takes as input a data.frame of expression values with samples in columns and genes in rows. Thus, in this case, we have to transpose the data.

norm_expr <- stabilize(Rep11_MOB_0)
#> + /github/home/.cache/R/basilisk/1.19.0/0/bin/conda create --yes --prefix /github/home/.cache/R/basilisk/1.19.0/spatialDE/1.13.0/env 'python=3.10.14' --quiet -c conda-forge --override-channels
#> + /github/home/.cache/R/basilisk/1.19.0/0/bin/conda install --yes --prefix /github/home/.cache/R/basilisk/1.19.0/spatialDE/1.13.0/env 'python=3.10.14' -c conda-forge --override-channels
#> + /github/home/.cache/R/basilisk/1.19.0/0/bin/conda install --yes --prefix /github/home/.cache/R/basilisk/1.19.0/spatialDE/1.13.0/env -c conda-forge 'python=3.10.14' 'numpy=1.23.5' 'scipy=1.9.3' 'patsy=0.5.3' 'pandas=1.5.2' --override-channels
norm_expr[1:5, 1:5]
#>         16.92x9.015 16.945x11.075 16.97x10.118 16.939x12.132 16.949x13.055
#> Nrf1       1.227749     0.8810934    0.8810934     1.2277491     0.8810934
#> Zbtb5      1.227749     0.8810934    1.2277491     0.8810934     0.8810934
#> Ccnl1      1.227749     1.6889027    1.2277491     1.2277491     0.8810934
#> Lrrfip1    1.484676     1.4846765    0.8810934     0.8810934     1.6889027
#> Bbs1       1.227749     1.4846765    0.8810934     1.8584110     0.8810934

`regress_out`

Next, we account for differences in library size between the samples by regressing out the effect of the total counts for each gene using linear regression.

resid_expr <- regress_out(norm_expr, sample_info = MOB_sample_info)
resid_expr[1:5, 1:5]
#>         16.92x9.015 16.945x11.075 16.97x10.118 16.939x12.132 16.949x13.055
#> Nrf1    -0.75226761    -1.2352000   -1.0164479    -0.7903289    -1.0973214
#> Zbtb5    0.09242373    -0.3323719    0.1397144    -0.2760560    -0.2533134
#> Ccnl1   -2.77597164    -2.5903783   -2.6092013    -2.8529340    -3.1193883
#> Lrrfip1 -1.92331333    -2.1578718   -2.3849405    -2.5924072    -1.7163300
#> Bbs1    -1.12186064    -1.0266476   -1.3706460    -0.5363646    -1.4666155

`run`

To reduce running time, the SpatialDE test is run on a subset of 1000 genes. Running it on the complete data set takes about 10 minutes.

# For this example, run spatialDE on the first 1000 genes
sample_resid_expr <- head(resid_expr, 1000)

results <- spatialDE::run(sample_resid_expr, coordinates = X)
head(results[order(results$qval), ])
#>           FSV M     g        l max_delta    max_ll max_mu_hat max_s2_t_hat
#> 752 0.5869020 4 Fabp7 1.135190 0.6872704 -154.7691   5.036316    3.6039400
#> 671 0.5276876 4   Apc 1.135190 0.8739619 -146.9456  -3.922771    2.1552117
#> 884 0.3651030 4 Kcnh3 1.907609 1.6225593 -177.5658  -8.066532    3.7081686
#> 668 0.3837007 4  Pcp4 1.135190 1.5683362 -127.1354 -11.866414   16.9252387
#> 722 0.3345161 4  Penk 1.135190 1.9424986 -163.0520  -9.491798   10.3640716
#> 725 0.3931224 4  Frzb 1.135190 1.5073475 -217.1284   1.180284    0.3109513
#>     model   n       s2_FSV s2_logdelta        time      BIC max_ll_null
#> 752    SE 260 0.0004068815 0.007999172 0.001467228 331.7809   -196.7353
#> 671    SE 260 0.0004218510 0.007939872 0.001475334 316.1339   -178.4892
#> 884    SE 260 0.0006305732 0.013314547 0.001681328 377.3743   -200.6982
#> 668    SE 260 0.0007326393 0.014985240 0.001700640 276.5136   -144.3654
#> 722    SE 260 0.0006993170 0.015755817 0.001932621 348.3468   -179.1260
#> 725    SE 260 0.0009077964 0.018308366 0.001434803 456.4996   -233.0538
#>          LLR         pval         qval
#> 752 41.96623 9.286349e-11 9.286349e-08
#> 671 31.54359 1.950129e-08 9.750644e-06
#> 884 23.13242 1.512193e-06 5.040644e-04
#> 668 17.22998 3.311683e-05 8.279207e-03
#> 722 16.07398 6.091542e-05 1.098165e-02
#> 725 15.92536 6.588988e-05 1.098165e-02

`model_search`

Finally, we can classify the DE genes to interpetable DE classes using the model_search function. We apply the model search on filtered DE results, using a threshold of 0.05 for the Q-values.

de_results <- results[results$qval < 0.05, ]

ms_results <- model_search(
    sample_resid_expr,
    coordinates = X,
    de_results = de_results
)

# To show ms_results sorted on qvalue, uncomment the following line
# head(ms_results[order(ms_results$qval), ])

head(ms_results)
#>        BIC        FSV LLR M              g        l max_delta    max_ll
#> 0 273.4206 0.06585344 NaN 4 X2900097C17Rik 5.386796 14.481719 -125.5890
#> 1 267.1570 0.10520888 NaN 4           Pcp4 9.052138  8.896112 -122.4571
#> 2 333.8619 0.10540242 NaN 4         Rbfox3 9.052138  8.877856 -155.8096
#> 3 370.2613 0.09234754 NaN 4           Sez6 9.052138 10.280760 -174.0093
#> 4 378.8001 0.08317351 NaN 4          Gng13 9.052138 11.530100 -178.2787
#> 5 394.8754 0.10227380 NaN 4           Gng4 9.052138  9.181433 -186.3163
#>   max_mu_hat max_s2_t_hat model   n       s2_FSV s2_logdelta        time
#> 0  -5.162205     1.808970   PER 260 0.0001697748  0.03696936 0.004212379
#> 1 -11.838117    14.729972   PER 260 0.0004604459  0.04570537 0.001943588
#> 2  -9.007761     8.556802   PER 260 0.0004796162  0.04746842 0.001922369
#> 3 -10.734952    10.511043   PER 260 0.0005243492  0.06432634 0.001939774
#> 4   3.970898     1.300596   PER 260 0.0004296904  0.06273417 0.001969099
#> 5 -12.350677    15.549625   PER 260 0.0005103723  0.05301686 0.001929045
#>    PER_prob      SE_prob  linear_prob         pval        qval max_ll_null
#> 0 0.8657858 1.342142e-01 3.671512e-10 3.776266e-04 0.026973327   -139.1608
#> 1 0.9999136 8.638788e-05 2.434950e-17 3.311683e-05 0.008279207   -144.3654
#> 2 1.0000000 4.526865e-08 1.223837e-18 1.105005e-04 0.013812568   -179.2132
#> 3 0.9997843 2.156997e-04 2.311672e-12 5.444193e-04 0.036294619   -190.1870
#> 4 0.7364092 2.635908e-01 3.356690e-12 9.052709e-05 0.012932442   -194.1171
#> 5 0.9999908 9.156503e-06 9.984359e-15 2.953494e-04 0.023714017   -205.2165

`spatial_patterns`

Furthermore, we can group spatially variable genes (SVGs) into spatial patterns using automatic expression histology (AEH).

sp <- spatial_patterns(
    sample_resid_expr,
    coordinates = X,
    de_results = de_results,
    n_patterns = 4L, length = 1.5
)
sp$pattern_results
#>                  g pattern membership
#> 595 X2900097C17Rik       2          1
#> 660         Eef1a1       0          1
#> 667         Pmepa1       1          1
#> 668           Pcp4       3          1
#> 671            Apc       2          1
#> 696         Rbfox3       3          1
#> 722           Penk       3          1
#> 725           Frzb       1          1
#> 752          Fabp7       1          1
#> 791        Gm13889       3          1
#> 808           Sez6       3          1
#> 829          Gng13       1          1
#> 833           Gng4       3          1
#> 884          Kcnh3       3          1
#> 906         Camk2b       3          1

Plots

Visualizing one of the most significant genes.

gene <- "Pcp4"

ggplot(data = MOB_sample_info, aes(x = x, y = y, color = norm_expr[gene, ])) +
    geom_point(size = 7) +
    ggtitle(gene) +
    scale_color_viridis_c() +
    labs(color = gene)

Plot Spatial Patterns of Multiple Genes

As an alternative to the previous figure, we can plot multiple genes using the normalized expression values.

ordered_de_results <- de_results[order(de_results$qval), ]

multiGenePlots(norm_expr,
    coordinates = X,
    ordered_de_results[1:6, ]$g,
    point_size = 4,
    viridis_option = "D",
    dark_theme = FALSE
)

Plot Fraction Spatial Variance vs Q-value

FSV_sig(results, ms_results)
#> Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
#> increasing max.overlaps

SpatialExperiment integration

The SpatialDE workflow can also be executed with a SpatialExperiment object as input.

library(SpatialExperiment)

# For SpatialExperiment object, we neeed to transpose the counts matrix in order
# to have genes on rows and spot on columns. 
# For this example, run spatialDE on the first 1000 genes

partial_counts <- head(Rep11_MOB_0, 1000)

spe <- SpatialExperiment(
  assays = list(counts = partial_counts),
  spatialData = DataFrame(MOB_sample_info[, c("x", "y")]),
  spatialCoordsNames = c("x", "y")
)

out <- spatialDE(spe, assay_type = "counts", verbose = FALSE)
head(out[order(out$qval), ])
#>           FSV M     g        l max_delta    max_ll max_mu_hat max_s2_t_hat
#> 759 0.5853287 4 Fabp7 1.135190 0.6917423 -157.8641   4.805770    3.2961235
#> 684 0.5215592 4   Apc 1.135190 0.8957046 -152.4253  -2.852122    1.1971964
#> 886 0.3425473 4 Kcnh3 1.907609 1.7908397 -201.1959  -7.162902    2.9230555
#> 681 0.3671680 4  Pcp4 1.135190 1.6829213 -124.8416  -9.209852   10.0688118
#> 735 0.3820350 4  Frzb 1.135190 1.5794319 -239.8355   1.156500    0.3240445
#> 832 0.2378540 4 Gng13 1.907609 2.9897888 -204.1019   3.648425    0.7506023
#>     model   n       s2_FSV s2_logdelta        time      BIC max_ll_null
#> 759    SE 260 0.0004113112 0.008071972 0.001441717 337.9709   -199.4754
#> 684    SE 260 0.0004271087 0.008023534 0.001456261 327.0933   -183.5046
#> 886    SE 260 0.0006874927 0.015203449 0.001703501 424.6346   -221.4201
#> 681    SE 260 0.0008102845 0.017044171 0.001708746 271.9259   -140.0619
#> 735    SE 260 0.0009344367 0.019163063 0.001443863 501.9136   -255.0426
#> 832    SE 260 0.0008862383 0.027973295 0.001982689 430.4466   -218.4267
#>          LLR         pval         qval
#> 759 41.61134 1.113458e-10 1.113458e-07
#> 684 31.07932 2.476965e-08 1.238482e-05
#> 886 20.22416 6.887758e-06 2.295919e-03
#> 681 15.22028 9.567033e-05 1.926777e-02
#> 735 15.20713 9.633886e-05 1.926777e-02
#> 832 14.32476 1.538286e-04 2.563810e-02

Plot Spatial Patterns of Multiple Genes (using SpatialExperiment object)

We can plot spatial patterns of multiples genes using the spe object.

spe_results <- out[out$qval < 0.05, ]

ordered_spe_results <- spe_results[order(spe_results$qval), ]

multiGenePlots(spe,
    assay_type = "counts",
    ordered_spe_results[1:6, ]$g,
    point_size = 4,
    viridis_option = "D",
    dark_theme = FALSE
)

Classify spatially variable genes with `model_search` and `spatial_patterns`

msearch <- modelSearch(spe,
    de_results = out, qval_thresh = 0.05,
    verbose = FALSE
)

head(msearch)
#>        BIC        FSV      LLR M              g        l max_delta    max_ll
#> 0 287.6679 0.06406448      NaN 4 X2900097C17Rik 5.386796 14.914618 -132.7126
#> 1 263.5043 0.09590705      NaN 4           Pcp4 9.052138  9.860377 -120.6308
#> 2 370.8671 0.09681127      NaN 4         Rbfox3 9.052138  9.758510 -174.3122
#> 3 428.9359 0.08063297      NaN 4          Gng13 9.052138 11.926341 -203.3466
#> 4 401.7737 0.14867725      NaN 4          Kcnh3 9.052138  5.989364 -189.7655
#> 5 479.4804 0.36943862 14.01272 4         Pmepa1 1.135190  1.666576 -228.6188
#>   max_mu_hat max_s2_t_hat model   n       s2_FSV s2_logdelta        time
#> 0 -3.6572048    0.8870009   PER 260 0.0001665651  0.03805707 0.007651329
#> 1 -9.1839788    8.0150340   PER 260 0.0004344069  0.05008671 0.001957417
#> 2 -7.7847610    5.8291532   PER 260 0.0004545203  0.05160913 0.001939058
#> 3  3.6145343    1.0492566   PER 260 0.0004240747  0.06523479 0.001945972
#> 4 -7.1054452    7.8745493   PER 260 0.0006681386  0.03907065 0.001951933
#> 5  0.1238686    0.1463362    SE 260 0.0007025967  0.01471845 0.001713753
#>     PER_prob      SE_prob  linear_prob         pval        qval max_ll_null
#> 0 0.55766485 4.423351e-01 5.814904e-10 3.104656e-04 0.032188921   -145.8346
#> 1 0.99977999 2.200120e-04 3.452128e-15 9.567033e-05 0.019267772   -140.0619
#> 2 0.99999990 1.031935e-07 1.553124e-16 3.218892e-04 0.032188921   -195.2940
#> 3 0.81916872 1.808313e-01 1.701216e-11 1.538286e-04 0.025638100   -218.4267
#> 4 1.00000000 1.179426e-10 8.336815e-26 6.887758e-06 0.002295919   -221.4201
#> 5 0.03133507 9.686649e-01 1.700936e-10 1.815779e-04 0.025717859   -242.6315

spatterns <- spatialPatterns(spe,
    de_results = de_results, qval_thresh = 0.05,
    n_patterns = 4L, length = 1.5, verbose = FALSE
)

spatterns$pattern_results
#>                  g pattern membership
#> 595 X2900097C17Rik       2          1
#> 660         Eef1a1       3          1
#> 667         Pmepa1       0          1
#> 668           Pcp4       3          1
#> 671            Apc       0          1
#> 696         Rbfox3       3          1
#> 722           Penk       3          1
#> 725           Frzb       0          1
#> 752          Fabp7       0          1
#> 791        Gm13889       3          1
#> 808           Sez6       3          1
#> 829          Gng13       0          1
#> 833           Gng4       3          1
#> 884          Kcnh3       3          1
#> 906         Camk2b       3          1

`sessionInfo`

Session info

#> [1] "2024-11-30 04:51:58 UTC"
#> R version 4.4.2 (2024-10-31)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.1 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] SpatialExperiment_1.17.0    SingleCellExperiment_1.29.1
#>  [3] SummarizedExperiment_1.37.0 Biobase_2.67.0             
#>  [5] GenomicRanges_1.59.1        GenomeInfoDb_1.43.2        
#>  [7] IRanges_2.41.1              S4Vectors_0.45.2           
#>  [9] BiocGenerics_0.53.3         generics_0.1.3             
#> [11] MatrixGenerics_1.19.0       matrixStats_1.4.1          
#> [13] ggplot2_3.5.1               spatialDE_1.13.0           
#> [15] BiocStyle_2.35.0           
#> 
#> loaded via a namespace (and not attached):
#>  [1] gtable_0.3.6            dir.expiry_1.15.0       rjson_0.2.23           
#>  [4] xfun_0.49               bslib_0.8.0             ggrepel_0.9.6          
#>  [7] lattice_0.22-6          vctrs_0.6.5             tools_4.4.2            
#> [10] parallel_4.4.2          tibble_3.2.1            fansi_1.0.6            
#> [13] pkgconfig_2.0.3         Matrix_1.7-1            checkmate_2.3.2        
#> [16] lifecycle_1.0.4         GenomeInfoDbData_1.2.13 farver_2.1.2           
#> [19] compiler_4.4.2          munsell_0.5.1           htmltools_0.5.8.1      
#> [22] sys_3.4.3               buildtools_1.0.0        sass_0.4.9             
#> [25] yaml_2.3.10             pillar_1.9.0            crayon_1.5.3           
#> [28] jquerylib_0.1.4         cachem_1.1.0            DelayedArray_0.33.2    
#> [31] magick_2.8.5            abind_1.4-8             basilisk_1.19.0        
#> [34] tidyselect_1.2.1        digest_0.6.37           dplyr_1.1.4            
#> [37] labeling_0.4.3          maketools_1.3.1         fastmap_1.2.0          
#> [40] grid_4.4.2              colorspace_2.1-1        cli_3.6.3              
#> [43] SparseArray_1.7.2       magrittr_2.0.3          S4Arrays_1.7.1         
#> [46] utf8_1.2.4              withr_3.0.2             backports_1.5.0        
#> [49] filelock_1.0.3          scales_1.3.0            UCSC.utils_1.3.0       
#> [52] rmarkdown_2.29          XVector_0.47.0          httr_1.4.7             
#> [55] gridExtra_2.3           reticulate_1.40.0       png_0.1-8              
#> [58] evaluate_1.0.1          knitr_1.49              basilisk.utils_1.19.0  
#> [61] viridisLite_0.4.2       rlang_1.1.4             Rcpp_1.0.13-1          
#> [64] glue_1.8.0              BiocManager_1.30.25     jsonlite_1.8.9         
#> [67] R6_2.5.1                zlibbioc_1.52.0

Introduction to spatialDE