langevitour examples (ABACBS 2022)

These are the widgets shown in my ABACBS 2022 poster demonstrating the langevitour package.

scRNA-Seq

Points are cells. Axes are varimax rotated principal components.

library(langevitour)
library(tidyverse)
library(Seurat)
library(scDblFinder)

set.seed(1)

# Download data from Gene Expression Omnibus (GEO) ====

dir.create("data", showWarnings=FALSE)

# Download from GEO can be slow.
options(timeout=3600)

if (!file.exists("data/GSE96583_RAW.tar") || 
    file.info("data/GSE96583_RAW.tar")$size != 76195840) {
    download.file(
        "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE96583&format=file",
        "data/GSE96583_RAW.tar")
}

if (!file.exists("data/GSE96583_batch2.total.tsne.df.tsv.gz") ||
    file.info("data/GSE96583_batch2.total.tsne.df.tsv.gz")$size != 756342) {
    download.file(
        "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE96nnn/GSE96583/suppl/GSE96583_batch2.total.tsne.df.tsv.gz",
        "data/GSE96583_batch2.total.tsne.df.tsv.gz")
}

if (!file.exists("data/GSE96583_batch2.genes.tsv.gz") ||
    file.info("data/GSE96583_batch2.genes.tsv.gz")$size != 277054) {
    download.file(
        "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE96nnn/GSE96583/suppl/GSE96583_batch2.genes.tsv.gz",
        "data/GSE96583_batch2.genes.tsv.gz")
}


untar("data/GSE96583_RAW.tar", exdir="data")


# Load the data ====

cells <- read.table("data/GSE96583_batch2.total.tsne.df.tsv.gz", sep="\t")

counts <- cbind(
    ReadMtx("data/GSM2560248_2.1.mtx.gz", "data/GSM2560248_barcodes.tsv.gz", "data/GSE96583_batch2.genes.tsv.gz"),
    ReadMtx("data/GSM2560249_2.2.mtx.gz", "data/GSM2560249_barcodes.tsv.gz", "data/GSE96583_batch2.genes.tsv.gz"))

colnames(counts) <- rownames(cells)


# Seurat processing ====

obj <- CreateSeuratObject(counts, meta.data=cells)

obj$percentRibo <- PercentageFeatureSet(obj, pattern = "^(RPL|RPS)")

obj <- NormalizeData(obj)
obj <- FindVariableFeatures(obj, selection.method = "vst", nfeatures = 2000)
obj <- ScaleData(obj)
obj <- RunPCA(obj, "RNA")
obj <- RunUMAP(obj, dims=1:10)


## Varimax rotation ====

nDim <- 10

loadings <- obj$pca@feature.loadings[,1:nDim]
scores <- obj$pca@cell.embeddings[,1:nDim]

# Find varimax rotation (without Kaiser normalization)
rotation <- varimax(loadings, normalize=FALSE)$rotmat

# Tidy the rotation: flip components to have positive skew
flips <- ifelse(colSums((loadings %*% rotation) ** 3) < 0, -1, 1)
rotation <- sweep(rotation, 2, flips, '*')

# Tidy the rotation: order by variance of scores
rotation <- rotation[, rev(order(colSums((scores %*% rotation)**2)))]

colnames(rotation) <- paste0("C",1:nDim)

vmLoadings <- loadings %*% rotation
vmScores <- scores %*% rotation


## Denoising ====

denoisedScores <- knnDenoise(vmScores, k=30, steps=2)


## Doublet detection ====
#
# Some doublets have already been identified based on genetic differences between individuals.
#
# Attempt to identify further doublets where the two cells came from the same indiviudal. 
#
# Detection is done on the unstimulated and stimulated cells independently since there will be no cells that are a mixture of unstimulated and stimulated.
#

labelDoublets <- function(obj) {
    reco <- recoverDoublets(
        obj$pca@cell.embeddings, 
        doublets=obj$multiplets == "doublet", 
        samples=table(obj$ind),    # Relative proportions of cells from each sample, here individual donors
        transposed=TRUE)

    case_when(
        reco$known ~ "known",
        reco$predicted ~ "predicted",
        TRUE ~ "singlet")
}

doubletLabel <-
    split(seq_len(ncol(obj)), obj$stim) |> 
    map(~labelDoublets(obj[,.])) |> 
    unsplit(obj$stim)


## Simplified labels ====

typeLabel <- case_when(
        doubletLabel != "singlet" ~ "Doublet",
        obj$cell == "B cells" ~ "B cell",
        obj$cell == "CD4 T cells" ~ "CD4 T cell",
        obj$cell == "CD8 T cells" ~ "CD8 T cell",
        obj$cell == "NK cells" ~ "NK cell",
        obj$cell == "CD14+ Monocytes" ~ "Monocyte",
        obj$cell == "FCGR3A+ Monocytes" ~ "Monocyte",
        TRUE ~ "Other"
    ) |> factor(c(
        "Doublet",
        "CD4 T cell",
        "CD8 T cell",
        "NK cell",
        "B cell",
        "Monocyte",
        "Other"
    ))


## Write out results ====
#
# Only 5,000 cells are kept out of 29,065, so langevitour can run at a very smooth framerate.
#

keep <- sample.int(ncol(obj), 5000)

cellsOut <- tibble(
    barcode = gsub("-1","",colnames(obj)[keep]),
    nCount = obj$nCount_RNA[keep],
    nFeature = obj$nFeature_RNA[keep],
    percentRibo = obj$percentRibo[keep],
    individual = as.integer(factor(obj$ind))[keep],
    stim = obj$stim[keep],
    type = typeLabel[keep],
    doublet = doubletLabel[keep],
    umapX = as.vector(obj$umap@cell.embeddings[keep,1]),
    umapY = as.vector(obj$umap@cell.embeddings[keep,2]))

saveRDS(file="out.rds", object=list(
    cells = cellsOut,
    vmLoadings = vmLoadings,
    vmScores = vmScores[keep,],
    denoisedScores = denoisedScores[keep,],
    stdev = Stdev(obj)))

## 2022-11-19 This code requires the development version of langevitour
# remotes::install_github("pfh/langevitour")

library(tidyverse)
library(crosstalk)
library(DT)
library(langevitour)

out <- readRDS("out.rds")

cells <- out$cells
shared <- SharedData$new(cells)

stimLabel <- factor(ifelse(cells$stim=="stim","S","U"), c("U","S")) 
crossLabel <- fct_cross(stimLabel, cells$type, sep=" ")

h <- c(0,2,1,3,4,5,0)/6*360
c <- c(1,1,1,1,1,1,0)*100
colsCrossLabel <- hcl(h=rep(h,each=2), c=rep(c,each=2), l=rep(c(40,70),7), fixup=TRUE)
colsTypeLabel <- hcl(h=h, c=c, l=50, fixup=TRUE)

langevitour(
    out$vmScores, 
    crossLabel, 
    levelColors=colsCrossLabel, 
    state=list(labelInactive=list("U Doublet","S Doublet")),
    link=shared,
    width=900, height=700)

# A table widget, using the SharedData object
datatable(
        shared,
        rownames=FALSE, width="100%", 
        class='compact cell-border hover', extensions='Buttons',
        options=list(dom='Bfrtip',buttons=c('copy','csv','excel'))) |>
    formatRound("percentRibo", 0) |>
    formatRound(c("umapX","umapY"), 2)

If you select some cells in the langevitour widget, only selected cells will be shown in the table. You can also use the table to select cells, which will then be highlighted in the langevitour widget. It is also possible to have linked brushing with plotly widgets, but there is currently a performance problem, so I have made a separate demonstration of this.

→ Go to plotly example

Bulk RNA-Seq biplot

Points are genes. Axes are samples. Extra axes have been defined for average gene expression, the difference between upper and lower molars, and time.

samples <- read_csv("teeth.csv")
counts <- read_csv("teeth-read-counts.csv") |>
    column_to_rownames("gene") |>
    as.matrix()

colnames(counts) <- paste0(
    rep(c("L","U"),8),
    rep(1:8,each=2))
    
library(edgeR)

keep <- rowMeans(counts) >= 50
dge <- DGEList(counts[keep,]) |> calcNormFactors()
lcpm <- cpm(dge, log=TRUE)

rnaseqShared = SharedData$new(tibble(
    Gene=rownames(lcpm)))

extraAxes <- cbind(
    Average = rep(1/16,16),
    "Upper vs Lower" = rep(c(-1,1)/8,8),
    Time = samples$day-mean(samples$day))
    
axisColors <- hcl(l=rep(c(40,60),8), c=100, h=rep(seq(0,180,length.out=8),each=2), fixup=TRUE)

langevitour(
    lcpm, 
    name=rownames(lcpm), 
    extraAxes=extraAxes, 
    axisColors=axisColors, 
    scale=15, pointSize=2,
    link=rnaseqShared,
    state=list(heat=-2,guide=2,guideType="pca",labelInactive=list("Average")),
    width=900, height=700)

datatable(
        rnaseqShared,
        rownames=FALSE, width="50%", 
        class='compact cell-border hover', extensions='Buttons',
        options=list(dom='Bfrtip',buttons=c('copy','csv','excel')))