vignettes/Solutions.Rmd
Solutions.Rmd
Exercise 1.1
# Label
colData(spatial_data)$macro_cluster = reducedDim(spatial_data, "UMAP")[,"UMAP1"] > -2.5
# Verify
scater::plotUMAP(spatial_data, colour_by = "macro_cluster", point_size = 0.2)
ggspavis::plotVisium(
spatial_data,
annotate = "macro_cluster",
highlight = "in_tissue"
)
Exercise 1.2
spe_joint <- do.call(cbind, spatial_data_list)
ggspavis::plotSpots(spe_joint, annotate = sprintf("%s", "clust_M0_lam0.2_k50_res0.7"), pal = pal) +
facet_wrap(~sample_id) +
theme(legend.position = "none") +
labs(title = "BANKSY clusters")
spe_joint$has_changed = !spe_joint$clust_M0_lam0.2_k50_res0.7 == spe_joint$clust_M0_lam0.2_k50_res0.7_smooth
plotSpotQC(
spe_joint,
plot_type = "spot",
annotate = "has_changed",
) +
facet_wrap(~sample_id)
Excercise 1.3
res_spatialLIBD = split(data.frame(res$mat), colData(spatial_data_gene_name)$sample_id )
lapply(res_spatialLIBD, function(x) plotCorrelationMatrix(as.matrix(x[,-10])))
Excercise 1.4
res_spatialLIBD = split(data.frame(res$mat), colData(spatial_data_gene_name)$spatialLIBD )
lapply(res_spatialLIBD, function(x) plotCorrelationMatrix(as.matrix(x[,-10])))
Excercise 1.5
# 1. Microglia + Neurons
is_microglia_neuron <- mat_df$Microglia > 0.1 &
mat_df$Neurons > 0.1 &
(mat_df$Microglia + mat_df$Neurons) > 0.4
spatial_data$is_microglia_neuron <- is_microglia_neuron
ggspavis::plotSpots(spatial_data, annotate = "is_microglia_neuron") +
facet_wrap(~sample_id) +
scale_color_manual(values = c("TRUE" = "red", "FALSE" = "grey")) +
theme(legend.position = "none") +
labs(title = "Microglia + Neurons")
# 2. Astrocytes + Stem cells
# note the space in the column name — use backticks
is_astrocyte_stem <- mat_df$Astrocytes > 0.1 &
mat_df$`Stem cells` > 0.1 &
(mat_df$Astrocytes + mat_df$`Stem cells`) > 0.4
spatial_data$is_astrocyte_stem <- is_astrocyte_stem
ggspavis::plotSpots(spatial_data, annotate = "is_astrocyte_stem") +
facet_wrap(~sample_id) +
scale_color_manual(values = c("TRUE" = "blue", "FALSE" = "grey")) +
theme(legend.position = "none") +
labs(title = "Astrocytes + Stem cells")
Excercise 1.6 :::
# Get reference
library(cellNexus)
library(HDF5Array)
tmp_file_path = tempfile()
DelayedArray::setAutoBlockSize(size=1e9)
brain_reference =
# Query metadata across 30M cells
get_metadata() |>
# Filter your data of interest
dplyr::filter(tissue=="dorsolateral prefrontal cortex", disease == "normal") |>
# Collect pseudobulk as SummarizedExperiment
get_pseudobulk() |>
# Normalise for Spotlight
scuttle::logNormCounts() |>
# Save for fast reading
HDF5Array::saveHDF5SummarizedExperiment(tmp_file_path, replace = TRUE, verbose = TRUE)
library(HDF5Array)
brain_reference = HDF5Array::loadHDF5SummarizedExperiment(tmp_file_path)
my_metadata = colData(brain_reference)
knitr::kable(head(my_metadata), format = "html")
These are the cell types included in our reference, and the number of pseudobulk samples we have for each cell type.
table(brain_reference$cell_types)
These are the number of samples we have for each of the three data sets.
table(brain_reference$dataset_id)
The collection_id
can be used to gather information on
the cell database. e.g. https://cellxgene.cziscience.com/collections/
table(brain_reference$collection_id)
Exercise 2.1
# Get top variable genes
genes <- !grepl(pattern = "^Rp[l|s]|Mt", x = rownames(spatial_data))
hvg =
spatial_data |>
scran::modelGeneVar(subset.row = genes, block = spatial_data$sample_id) |>
scran::getTopHVGs(n = 1000)
# Calculate PCA
spatial_data |>
scuttle::logNormCounts() |>
scater::runPCA(subset_row = hvg) |>
# Calculate UMAP
scater::runUMAP(dimred = "PCA") |>
# Filter one sample
filter(in_tissue, sample_id=="151673") |>
# Gate based on tissue morphology
tidySpatialExperiment::gate_spatial(alpha = 0.1) |>
# Plot
scater::plotUMAP(colour_by = ".gate")
Exercise 2.2
rowData(spatial_data) |>
as_tibble() |>
filter( gene_name == "PECAM1")
gene = "ENSG00000261371"
spatial_data |>
# Join the feature
join_features(gene, shape="wide") |>
# Calculate the quantile
mutate(my_quantile = quantile(ENSG00000261371, 0.75)) |>
# Label the pixels
mutate(PECAM1_positive = ENSG00000261371 > my_quantile) |>
# Plot
ggspavis::plotSpots(annotate = "PECAM1_positive") +
facet_wrap(~sample_id)
Excercise 2.3
library(tidySummarizedExperiment)
library(tidybulk)
differential_analysis =
spatial_data |>
mutate(
dead =
# Stringent threshold
subsets_mito_percent > 20 |
sum < 700 |
detected < 500
) |>
aggregate_cells(c(sample_id, spatialLIBD, dead)) |>
keep_abundant(factor_of_interest = c(dead)) |>
nest(data = - spatialLIBD) |>
# filter regions having both alive and dead cells
filter( map_int(data, ~ .x |> distinct(sample_id, dead) |> nrow() ) == 6 ) |>
# DE analyses
mutate(data = map(
data,
test_differential_abundance,
~ dead + sample_id,
method = "edgeR_quasi_likelihood",
test_above_log2_fold_change = log(2)
))
differential_analysis |>
mutate(data = map(data, pivot_transcript)) |>
unnest(data) |>
filter(FDR<0.05)
Excercise 2.4
rownames(spatial_data_filtered) = rowData(spatial_data_filtered)$gene_name
marker_genes_of_amyloid_plaques = c("APP", "PSEN1", "PSEN2", "CLU", "APOE", "CD68", "ITGAM", "AIF1")
spatial_data_filtered |>
# Join the features
join_features(marker_genes_of_amyloid_plaques, shape = "wide") |>
# Rescaling
mutate(across(any_of(marker_genes_of_amyloid_plaques), scales::rescale)) |>
# Summarising signature
mutate(amyloid_plaques_signature = APP + PSEN1 + PSEN2 + CLU + APOE + CD68 + ITGAM + AIF1) |>
# Plotting
ggspavis::plotSpots(
annotate = "amyloid_plaques_signature"
) +
facet_wrap(~sample_id)
Exercise 3.2
mgs <- scran::scoreMarkers(
tx_spe_sample_1,
groups = tx_spe_sample_1$clusters,
# Omit mitochondrial genes and keep all the genes in spatial
subset.row =
grep("NegControl.+|BLANK.+", rownames(tx_spe_sample_1), value=TRUE, invert=TRUE)
)
# Select the most informative markers
mgs_df <- lapply(names(mgs), function(i) {
x <- mgs[[i]]
# Filter and keep relevant marker genes, those with AUC > 0.8
x <- x[x$mean.AUC > 0.8, ]
# Sort the genes from highest to lowest weight
x <- x[order(x$mean.AUC, decreasing = TRUE), ]
# Add gene and cluster id to the dataframe
x$gene <- rownames(x)
x$cluster <- i
data.frame(x)
})
head(mgs_df[[1]])
tx_spe_sample_1 |>
plotReducedDim("UMAP", colour_by="Gjc3")
Exercise 3.3
library(Banksy)
# scale the counts, without log transformation
tx_spe_sample_1 = tx_spe_sample_1 |> logNormCounts(log=FALSE, name = "normcounts")
Highly-variable genes
The Banksy documentation, suggest the use of Seurat
for
the detection of highly variable genes.
library(Seurat)
"NegControl.+|BLANK.+"
tx_spe_sample_1_seu =
as.Seurat(tx_spe_sample_1, data = NULL)
genes <- !grepl(pattern = "NegControl.+|BLANK.+", x = rownames(tx_spe_sample_1_seu))
tx_spe_sample_1_seu = tx_spe_sample_1_seu[genes,]
tx_spe_sample_1_seu = tx_spe_sample_1_seu |>
NormalizeData(scale.factor = 5000, normalization.method = 'RC')
# Compute HVGs
hvgs <- VariableFeatures(FindVariableFeatures(tx_spe_sample_1_seu, nfeatures = 2000))
rm(tx_spe_sample_1_seu)
head(hvgs)
We now split the data by sample, to compute the neighbourhood matrices.
# Convert to list
tx_spe_sample_1_banksy = tx_spe_sample_1[hvgs,]
# Compute the component neighborhood matrices
tx_spe_sample_1_banksy = tx_spe_sample_1_banksy |>
computeBanksy(
assay_name = "normcounts"
)
Here, we perform PCA using the BANKSY algorithm on the joint dataset. The group argument specifies how to treat different samples, ensuring that features are scaled separately per sample group to account for variations among them.
tx_spe_sample_1_banksy <- runBanksyPCA( # Run PCA on the Banskly matrix
tx_spe_sample_1_banksy,
lambda = 0.2, # spatial weighting parameter. Larger values (e.g. 0.8) incorporate more spatial neighborhood
seed = 42
)
Once the dimensional reduction is complete, we cluster the spots
across all samples and use connectClusters
to visually
compare these new BANKSY clusters against manual annotations.
tx_spe_sample_1_banksy <- clusterBanksy( # clustering on the principal components computed on the BANKSY matrix
tx_spe_sample_1_banksy,
lambda = 0.2, # spatial weighting parameter. Larger values (e.g. 0.8) incorporate more spatial neighborhood
resolution = 0.7, # numeric vector specifying resolution used for clustering (louvain / leiden).
seed = 42
)
As an optional step, we smooth the cluster labels for each sample independently, which can enhance the visual coherence of clusters, especially in heterogeneous biological samples.
From SpiceMix paper Chidester et al., 2023
tx_spe_sample_1_banksy_smoothed = tx_spe_sample_1_banksy |>
smoothLabels(
cluster_names = "clust_M0_lam0.2_k50_res0.7",
k = 6L,
verbose = FALSE
)
The raw and smoothed cluster labels are stored in the
colData
slot of each SingleCellExperiment
or
SpatialExperiment
object.
Visualising Clusters and Annotations on Spatial Coordinates: We utilise the ggspavis package to visually map BANKSY clusters and manual annotations onto the spatial coordinates of the dataset, providing a comprehensive visual overview of spatial and clustering data relationships.
# Use scater:::.get_palette('tableau10medium')
ggspavis::plotSpots(
tx_spe_sample_1_banksy,
annotate = "clust_M0_lam0.2_k50_res0.7"
) +
guides(color = "none") +
labs(title = "BANKSY clusters")
ggspavis::plotSpots(
tx_spe_sample_1_banksy_smoothed,
annotate = "clust_M0_lam0.2_k50_res0.7_smooth"
) +
guides(color = "none") +
labs(title = "BANKSY clusters smoothed")
# ggspavis::plotSpots(
# tx_spe_sample_1_banksy,
# annotate = "clusters"
# ) +
# guides(color = "none") +
# labs(title = "cell clustering")
ggspavis::plotSpots(
tx_spe_sample_1_banksy,
annotate = "region"
) +
scale_color_manual(values = colorRampPalette( brewer.pal(9,"Set1") )(150) ) +
guides(color = "none") +
labs(title = "ground truth")
<mangiola.s at wehi.edu.au>↩︎