Preprocessing Data (Following OSTA Analysis Pipeline)

In this markdown, analysis and quality control steps are performed following chapters 8 and 9 of the book ‘Orchestrating Spatially-Resolved Transcriptomics Analysis with Bioconductor’ https://lmweber.org/OSTA-book/. Data is fetched and plotted, then QC metrics are calculated and data is filtered based on QC thresholds for sample 151507 only. After QC operations, 33538 genes and 4136 spots remain in the sample.

Loading Libraries and Fetching Data

Data is sourced from spatialLIBD (http://spatial.libd.org/spatialLIBD/), a project containing LIBD human dorsolateral pre-frontal cortex (DLPFC) spatial transcriptomics data from the 10x Genomics Visium Platform (https://10xgenomics.com/products/spatial-gene-expression). The data contains 12 lowres samples each with a scale factor of 0.0450045. For the purposes of this project, only sample 151507 is visualized and manipulated to start with, so the SpatialExperiment object is filtered to only include sample 151507. In sample 151507, 33538 genes and 4662 spots are captured, each with a unique barcode.

library(spatialLIBD)
library(ggspavis)
library(scater)
# fetch data: 33538 genes, 47681 spots
spe <- fetch_data(type='spe')

# filter to 4226 spots pertaining to sample 151507
sample_id <- unique(spe$sample_id)[1]
spe_sub <- spe[, spe$sample_id == sample_id]

# data frame of 4226 rows holding colData
sample_df <- as.data.frame(colData(spe_sub), optional = TRUE)

Plot Image of Sample

Sample 151507 is visualized by transforming the image into a rasterGrob object and calling ggplot.

# get histology image
img <- SpatialExperiment::imgRaster(spe_sub)

## Transform to a rasterGrob object
grob <- grid::rasterGrob(img, width = grid::unit(1, "npc"), height = grid::unit(1, "npc"))

## Make a plot using geom_spatial
p <- ggplot2::ggplot(
    sample_df,
    ggplot2::aes(
        x = pxl_col_in_fullres * SpatialExperiment::scaleFactors(spe_sub),
        y = pxl_row_in_fullres * SpatialExperiment::scaleFactors(spe_sub),
    )
) +
    geom_spatial(
        data = tibble::tibble(grob = list(grob)),
        ggplot2::aes(grob = grob),
        x = 0.5,
        y = 0.5
    )

## Show the plot
print(p)

Quality Control

We can identify LQ spots by:
1. Library size 2. Number of expressed genes 3. Proportion of mitochondrial reads
4. Number of cells per spot

Note: all spots are already over tissue.

Identify mitochrondrial genes and add per spot QC metrics

# identify mitochondrial genes
is_mito <- grepl("(^MT-)|(^mt-)", rowData(spe_sub)$gene_name)

# add per spot QC metrics, including mitochondrial genes as a feature subset
spe_sub <- addPerCellQC(spe_sub, subsets = list(mito = is_mito))

Set a lower threshold on library size

Library size = sum of UMI (unique molecular identifier) counts per spot, remove low library size

# continuous variable containing total number of genes at each spot
hist(colData(spe_sub)$sum, breaks = 20)

# set filtering threshold for library size at 400
plotQC(spe_sub, type = "scatter", 
       metric_x = "cell_count", metric_y = "sum", 
       threshold_y = 600)

# check number of spots below threshold
qc_lib_size <- colData(spe_sub)$sum < 600
table(qc_lib_size)
## qc_lib_size
## FALSE  TRUE 
##  4182    44
# look for spatial pattern in discarded spots
colData(spe_sub)$qc_lib_size <- qc_lib_size

# plot spots
threshold_row <- colData(spe_sub)[colData(spe_sub)$qc_lib_size,][,'array_row']
threshold_col <- colData(spe_sub)[colData(spe_sub)$qc_lib_size,][,'array_col']
ggplot(data.frame(threshold_row, threshold_col), aes(x=threshold_row, y=threshold_col)) + geom_point()

Set a lower threshold on number of expressed features

Discard spots with a low number of expressed genes per spot

# continuous variable containing total number of genes at each spot
hist(colData(spe_sub)$detected, breaks = 20)

# set filtering threshold for library size at 500
plotQC(spe_sub, type = "scatter", 
       metric_x = "cell_count", metric_y = "detected", 
       threshold_y = 400)

# check number of spots below threshold
qc_detected <- colData(spe_sub)$detected < 400
table(qc_lib_size)
## qc_lib_size
## FALSE  TRUE 
##  4182    44
# look for spatial pattern in discarded spots
colData(spe_sub)$qc_detected <- qc_detected

# plot spots
threshold_row <- colData(spe_sub)[colData(spe_sub)$qc_detected,][,'array_row']
threshold_col <- colData(spe_sub)[colData(spe_sub)$qc_detected,][,'array_col']
ggplot(data.frame(threshold_row, threshold_col), aes(x=threshold_row, y=threshold_col)) + geom_point()

Set upper threshold on the proportion of mitochondrial reads

A high proportion of mitochondrial reads can indicate cell damage

# histogram of mitochondrial read proportions
hist(colData(spe_sub)$subsets_mito_percent, breaks = 20)

# plot mitochondrial read proportion vs. number of cells per spot
plotQC(spe_sub, type = "scatter", 
       metric_x = "cell_count", metric_y = "subsets_mito_percent", 
       threshold_y = 28)

# select QC threshold for mitochondrial read proportion
qc_mito <- colData(spe_sub)$subsets_mito_percent > 28
table(qc_mito)
## qc_mito
## FALSE  TRUE 
##  4197    29
# look for spatial pattern in discarded spots
colData(spe_sub)$qc_mito <- qc_mito

# plot spots
threshold_row <- colData(spe_sub)[colData(spe_sub)$qc_mito,][,'array_row']
threshold_col <- colData(spe_sub)[colData(spe_sub)$qc_mito,][,'array_col']
ggplot(data.frame(threshold_row, threshold_col), aes(x=threshold_row, y=threshold_col)) + geom_point()

Remove outlier spots for the number of cells per spot

Spots with high cell counts have low numbers of expressed genes (i.e. experiments failed)

# histogram of number of cells per spot
hist(colData(spe_sub)$cell_count, breaks = 20)

# distribution of cells per spot
tbl_cells_per_spot <- table(colData(spe_sub)$cell_count)

# plot number of expressed genes vs. number of cells per spot
plotQC(spe_sub, type = "scatter", 
       metric_x = "cell_count", metric_y = "detected", 
       threshold_x = 12)

# select QC threshold for number of cells per spot
qc_cell_count <- colData(spe_sub)$cell_count > 12
table(qc_cell_count)
## qc_cell_count
## FALSE  TRUE 
##  4206    20
colData(spe_sub)$qc_cell_count <- qc_cell_count

# plot spots
threshold_row <- colData(spe_sub)[colData(spe_sub)$qc_cell_count,][,'array_row']
threshold_col <- colData(spe_sub)[colData(spe_sub)$qc_cell_count,][,'array_col']
ggplot(data.frame(threshold_row, threshold_col), aes(x=threshold_row, y=threshold_col)) + geom_point()

Remove Low-Quality Spots aggregated from the previous QC operations

# number of discarded spots for each metric
apply(cbind(qc_lib_size, qc_detected, qc_mito, qc_cell_count), 2, sum)
##   qc_lib_size   qc_detected       qc_mito qc_cell_count 
##            44            29            29            20
# combined set of discarded spots
discard <- qc_lib_size | qc_detected | qc_mito | qc_cell_count
table(discard)
## discard
## FALSE  TRUE 
##  4136    90
# store in object
colData(spe_sub)$discard <- discard

# check spatial pattern of combined set of discarded spots
# plot spots
threshold_row <- colData(spe_sub)[colData(spe_sub)$discard,][,'array_row']
threshold_col <- colData(spe_sub)[colData(spe_sub)$discard,][,'array_col']
ggplot(data.frame(threshold_row, threshold_col), aes(x=threshold_row, y=threshold_col)) + geom_point()

# remove combined set of low-quality spots
spe_sub <- spe_sub[, !colData(spe_sub)$discard]
dim(spe_sub)
## [1] 33538  4136