diff --git a/.github/workflows/basic_checks.yaml b/.github/workflows/basic_checks.yaml deleted file mode 100644 index fd95370..0000000 --- a/.github/workflows/basic_checks.yaml +++ /dev/null @@ -1,168 +0,0 @@ -name: Check, build, and push image - -on: [push, pull_request] - -env: - cache-version: v1 - -jobs: - r-build-and-check: - runs-on: ubuntu-latest - container: bioconductor/bioconductor_docker:devel - env: - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - steps: - - uses: actions/checkout@v2 - - - name: Query dependencies and update old packages - run: | - BiocManager::install(ask=FALSE) - saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) - shell: Rscript {0} - - - name: Cache R packages - if: runner.os != 'Windows' - uses: actions/cache@v1 - with: - path: /usr/local/lib/R/site-library - key: ${{ env.cache-version }}-${{ runner.os }}-r-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ env.cache-version }}-${{ runner.os }}-r- - - # This lets us augment with additional dependencies - - uses: r-lib/actions/setup-r@v2 - with: - r-version: release - - - uses: r-lib/actions/setup-r-dependencies@v2 - with: - extra-packages: rcmdcheck remotes reticulate BiocManager - - - uses: actions/setup-python@v4 - with: - python-version: "3.x" - - - name: setup r-reticulate venv - shell: Rscript {0} - run: | - python_packages <- - c("numpy") - - library(reticulate) - sessionInfo() - - - uses: r-lib/actions/check-r-package@v2 - -# - name: Install system dependencies -# if: runner.os == 'Linux' -# env: -# RHUB_PLATFORM: linux-x86_64-ubuntu-gcc -# run: | -# python -h -# Rscript -e "library(reticulate);sessionInfo();R.home()" -# Rscript -e "remotes::install_github('r-hub/sysreqs')" -# sysreqs=$(Rscript -e "cat(sysreqs::sysreq_commands('DESCRIPTION'))") -# cat 'DESCRIPTION' -# echo $sysreqs -# sudo -s eval "$sysreqs" - -# - name: Install dependencies -# run: | -# BiocManager::repositories() -# remotes::install_deps(dependencies = TRUE, repos = BiocManager::repositories()) -# remotes::install_cran("rcmdcheck") -# shell: Rscript {0} - - - name: Check - env: - _R_CHECK_CRAN_INCOMING_REMOTE_: false - run: rcmdcheck::rcmdcheck(args = c("--no-manual"), error_on = "warning", check_dir = "check") - shell: Rscript {0} - - - - name: Build pkgdown - run: | - PATH=$PATH:$HOME/bin/ Rscript -e 'pkgdown::build_site(".")' - - # deploy needs rsync? Seems so. - - name: Install deploy dependencies - run: | - apt-get update - apt-get -y install rsync - - - name: Deploy 🚀 - uses: JamesIves/github-pages-deploy-action@releases/v4 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - BRANCH: gh-pages # The branch the action should deploy to. - FOLDER: docs # The folder the action should deploy. - docker-build-and-push: - #needs: r-build-and-check - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - # This is used to complete the identity challenge - # with sigstore/fulcio when running outside of PRs. - id-token: write - - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - - name: Set Environment Variables - run: | - REPO_LOWER="$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]')" - REGISTRY=ghcr.io - echo "BUILD_DATE=$(date +'%Y-%m-%d %H:%M:%S')" >> $GITHUB_ENV - echo "GIT_SHA=$(echo ${{ github.sha }} | cut -c1-7)" >> $GITHUB_ENV - echo "REGISTRY=${REGISTRY}" >> $GITHUB_ENV - echo "IMAGE=${REGISTRY}/${REPO_LOWER}" >> $GITHUB_ENV - - - name: Show environment - run: | - env - - - # Install the cosign tool except on PR - # https://github.com/sigstore/cosign-installer - - name: Install cosign - if: github.event_name != 'pull_request' - uses: sigstore/cosign-installer@1e95c1de343b5b0c23352d6417ee3e48d5bcd422 - with: - cosign-release: 'v1.4.0' - - - # Workaround: https://github.com/docker/build-push-action/issues/461 - - name: Setup Docker buildx - uses: docker/setup-buildx-action@79abd3f86f79a9d68a23c75a09a9a85889262adf - - # Login against a Docker registry except on PR - # https://github.com/docker/login-action - - name: Log into registry ${{ env.REGISTRY }} - if: github.event_name != 'pull_request' - uses: docker/login-action@28218f9b04b4f3f62068d7b6ce6ca5b26e35336c - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - # Extract metadata (tags, labels) for Docker - # https://github.com/docker/metadata-action - - name: Extract Docker metadata - id: meta - uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38 - with: - images: ${{ env.IMAGE }} - - # Build and push Docker image with Buildx (don't push on PR) - # https://github.com/docker/build-push-action - - name: Build and push Docker image - id: build-and-push - uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc - with: - context: . - push: ${{ github.event_name != 'pull_request' }} - tags: | - ${{ env.IMAGE }}:latest - ${{ env.IMAGE }}:${{ env.GIT_SHA }} diff --git a/.github/workflows/rworkflows.yml b/.github/workflows/rworkflows.yml new file mode 100644 index 0000000..94d43eb --- /dev/null +++ b/.github/workflows/rworkflows.yml @@ -0,0 +1,57 @@ +name: rworkflows +'on': + push: + branches: + - master + - main + - devel + - RELEASE_** + pull_request: + branches: + - master + - main + - devel + - RELEASE_** +jobs: + rworkflows: + permissions: write-all + runs-on: ${{ matrix.config.os }} + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + container: ${{ matrix.config.cont }} + strategy: + fail-fast: ${{ false }} + matrix: + config: + - os: ubuntu-latest + bioc: devel + r: auto + cont: ghcr.io/bioconductor/bioconductor_docker:devel + rspm: ~ + - os: macOS-latest + bioc: release + r: auto + cont: ~ + rspm: ~ + - os: windows-latest + bioc: release + r: auto + cont: ~ + rspm: ~ + steps: + - uses: neurogenomics/rworkflows@master + with: + run_bioccheck: ${{ false }} + run_rcmdcheck: ${{ true }} + as_cran: ${{ true }} + run_vignettes: ${{ true }} + has_testthat: ${{ true }} + run_covr: ${{ true }} + run_pkgdown: ${{ true }} + has_runit: ${{ false }} + has_latex: ${{ false }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run_docker: ${{ false }} + DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }} + runner_os: ${{ runner.os }} + cache_version: cache-v1 + docker_registry: ghcr.io diff --git a/DESCRIPTION b/DESCRIPTION index b2e3017..fdab2cd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: tidyomicsWorkshop Title: Tidy genomic and transcriptomic analyses -Version: 0.16.3 +Version: 0.17.0 Authors@R: c( person(given = "Stefano", @@ -14,8 +14,7 @@ Authors@R: email = "michaelisaiahlove@gmail.com", comment = c(ORCID = "0000-0001-8401-0545")) ) -Description: Workflow for BioC conference showing tidy genomic - and transcriptomic analyses. +Description: Workshop material for tidyomics, including transcriptomics and genomics. License: MIT + file LICENSE Encoding: UTF-8 LazyData: true @@ -28,10 +27,16 @@ Depends: Imports: SummarizedExperiment, SingleCellExperiment, + SeuratObject, + Seurat, tidySummarizedExperiment, tidySingleCellExperiment, tidygate, tidyseurat, + SpatialExperiment, + tidySpatialExperiment, + plyranges, + nullranges, stats, utils, tibble, @@ -42,23 +47,30 @@ Imports: tidyr, purrr, forcats, - ggrepel, - plotly, - igraph, - broom, devtools, rlang, magrittr, R.utils, +Suggests: + knitr, + rmarkdown, + pkgdown, + STexampleData, dittoSeq, tidybulk, glue, scater, - batchelor -Suggests: - knitr, - rmarkdown, - pkgdown + scran, + batchelor, + ggrepel, + plotly, + igraph, + broom, + ggspavis +Remotes: + satijalab/seurat-wrappers, + stemangiola/tidySingleCellExperiment, + william-hutchison/tidySpatialExperiment Biarch: true URL: https://tidy-biology.github.io/TidyGenomicsTranscriptomicsWorkshop BugReports: https://github.com/tidy-biology/TidyGenomicsTranscriptomicsWorkshop/issues/new/choose diff --git a/NAMESPACE b/NAMESPACE index 6ae9268..01f6a4c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,2 +1,6 @@ # Generated by roxygen2: do not edit by hand +import(SeuratObject) +importFrom(SingleCellExperiment,SingleCellExperiment) +importFrom(SummarizedExperiment,SummarizedExperiment) +importFrom(ggplot2,theme) diff --git a/R/data_transcriptomics.R b/R/data_transcriptomics.R index b971a49..d733cdc 100644 --- a/R/data_transcriptomics.R +++ b/R/data_transcriptomics.R @@ -1,75 +1,161 @@ #' seurat_obj #' -#' A Seurat dataset of single cell RNA sequencing data -#' +#' A Seurat dataset containing single-cell RNA sequencing data. +#' +#' This Seurat object contains pre-processed single-cell RNA-seq data that can be used +#' for downstream analysis such as clustering, differential expression, and visualization. #' -#' @format A Seurat object. +#' @format A `Seurat` object. #' @usage data(seurat_obj) +#' @import SeuratObject "seurat_obj" #' gate_seurat_obj #' -#' Coordinates for a gate interactively drawn using tidygate -#' +#' Coordinates for a gate interactively drawn using `tidygate` on a Seurat object. +#' +#' These x, y coordinates represent a manually defined gate used to subset specific populations +#' from a Seurat object. #' -#' @format A list containing x,y coordinates for one gate +#' @format A list containing x, y coordinates for one gate. #' @usage data(gate_seurat_obj) "gate_seurat_obj" #' seurat_obj_UMAP3 #' -#' A Seurat dataset of single cell RNA sequencing data with 3 UMAP dimesions +#' A Seurat dataset of single-cell RNA sequencing data with 3 UMAP dimensions. #' +#' This Seurat object contains three-dimensional UMAP embeddings for visualizing cell populations +#' in a reduced-dimensional space. #' -#' @format A Seurat object. +#' @format A `Seurat` object. #' @usage data(seurat_obj_UMAP3) +#' @import SeuratObject "seurat_obj_UMAP3" #' pseudo_bulk #' -#' A SummarizedExperiment object +#' A `SummarizedExperiment` object containing pseudo-bulk RNA sequencing data. #' -#' @description -#' This object was saved only because leading to a strange failure of the github action, -#' while working perfectly in the local environment -#' +#' This object was created for benchmarking purposes, representing a pseudo-bulk aggregation +#' of single-cell data, saved due to a specific issue with GitHub actions while functioning +#' correctly in local environments. #' -#' @format A SummarizedExperiment object. +#' @format A `SummarizedExperiment` object. #' @usage data(pseudo_bulk) +#' @importFrom SummarizedExperiment SummarizedExperiment "pseudo_bulk" #' sce_obj #' -#' A sce dataset of single cell RNA sequencing data +#' A `SingleCellExperiment` dataset containing single-cell RNA sequencing data. #' +#' This SingleCellExperiment (SCE) object contains pre-processed single-cell RNA-seq data +#' ready for downstream analysis, such as clustering or differential expression analysis. #' -#' @format A sce object. +#' @format A `SingleCellExperiment` object. #' @usage data(sce_obj) +#' @importFrom SingleCellExperiment SingleCellExperiment "sce_obj" #' gate_sce_obj #' -#' Coordinates for a gate interactively drawn using tidygate -#' +#' Coordinates for a gate interactively drawn using `tidygate` on a SingleCellExperiment object. +#' +#' These x, y coordinates represent a manually defined gate used to subset specific populations +#' from a SingleCellExperiment object. #' -#' @format A list containing x,y coordinates for one gate +#' @format A list containing x, y coordinates for one gate. #' @usage data(gate_sce_obj) "gate_sce_obj" #' sce_obj_UMAP3 #' -#' A sce dataset of single cell RNA sequencing data with 3 UMAP dimesions +#' A `SingleCellExperiment` dataset of single-cell RNA sequencing data with 3 UMAP dimensions. #' +#' This SCE object contains three-dimensional UMAP embeddings, which allow visualization +#' of cell populations in reduced-dimensional space. #' -#' @format A sce object. +#' @format A `SingleCellExperiment` object. #' @usage data(sce_obj_UMAP3) +#' @importFrom SingleCellExperiment SingleCellExperiment "sce_obj_UMAP3" #' theme_multipanel #' -#' A file holding a theme object. -#' +#' A ggplot2 theme object designed for creating multipanel plots. +#' +#' This theme can be applied to ggplot2-based visualizations to standardize the look and feel +#' of multipanel figures, ensuring consistency across plots. #' -#' @format theme +#' @format A `ggplot2` theme. #' @usage data(theme_multipanel) +#' @importFrom ggplot2 theme "theme_multipanel" + +#' tidygate_env_gates +#' +#' A set of gates interactively drawn for spatial data using the `tidygate` package. +#' +#' This object contains gates that were drawn interactively on spatial data based on tissue morphology, +#' highlighting areas of interest for future reproducible analysis. These gates can be loaded and applied +#' to similar data for consistent and automated gating. +#' +#' @details +#' Gates were saved programmatically for future use. The object was created using the `gate` function +#' in `tidygate` and saved as an RDS file for reproducibility. This can be particularly useful in spatial +#' transcriptomics data analysis where regions of interest are manually gated. +#' +#' @return A list of x and y coordinates representing the interactive gates. +#' +#' @format A list containing the x and y coordinates for gates. +#' +#' @usage data(tidygate_env_gates) +#' +#' @examples +#' # Load the gates +#' data(tidygate_env_gates) +#' +#' # Apply the gates to new data for reproducible analysis +#' # spatial_data <- applyGates(spatial_data, gates = tidygate_env_gates) +#' +#' @source Created interactively using the `tidygate` package +"tidygate_env_gates" + +#' pbmc_h3k4me3_hg38 +#' +#' H3K4me3 peaks for human PBMCs lifted over to hg38 genome assembly +#' +#' This object contains H3K4me3 peaks for human peripheral blood mononuclear cells (PBMCs) mapped to +#' the hg38 genome assembly. The peaks were initially from hg19 and were lifted over to hg38 using +#' a UCSC chain file. The peaks contain several columns of information including signal value, +#' q-value, and peak locations, which can be used for visualisation and analysis. +#' +#' @details +#' The H3K4me3 peaks were lifted over from the hg19 to hg38 genome assembly using the `liftOver` +#' function from the `rtracklayer` package. The `seqlevels` and `seqinfo` were aligned with the +#' `SingleCellExperiment` object `sce_sub`. The Ensembl-style chromosome naming convention (NCBI) +#' was applied for consistency. This dataset contains selected columns such as `signalValue`, +#' `qValue`, and `peak`, making it ready for downstream analysis or plotting. +#' +#' @return A GRanges object containing the H3K4me3 peaks for PBMCs in the hg38 genome assembly. +#' +#' @format A `GRanges` object with columns: +#' \describe{ +#' \item{signalValue}{The signal value for each peak.} +#' \item{qValue}{The q-value for the significance of each peak.} +#' \item{peak}{The peak location in the genome.} +#' } +#' +#' @usage data(pbmc_h3k4me3_hg38) +#' +#' @examples +#' # Load the dataset +#' data(pbmc_h3k4me3_hg38) +#' +#' # Plot q-values of the H3K4me3 peaks +#' plot(pbmc_h3k4me3_hg38$qValue, type="l", ylab="q-value", main="H3K4me3 peaks") +#' abline(v=5000, lty=2) +#' +#' @source H3K4me3 peak data was lifted from hg19 to hg38 using UCSC liftOver chain files. +"pbmc_h3k4me3_hg38" \ No newline at end of file diff --git a/data/gate_sce_obj.rda b/data/gate_sce_obj.rda index 9686766..5b7520f 100755 Binary files a/data/gate_sce_obj.rda and b/data/gate_sce_obj.rda differ diff --git a/data/gate_seurat_obj.rda b/data/gate_seurat_obj.rda index cc38b05..7b5d920 100644 Binary files a/data/gate_seurat_obj.rda and b/data/gate_seurat_obj.rda differ diff --git a/data/pbmc_h3k4me3_hg38.rda b/data/pbmc_h3k4me3_hg38.rda new file mode 100644 index 0000000..5f7cac8 Binary files /dev/null and b/data/pbmc_h3k4me3_hg38.rda differ diff --git a/data/tidygate_env_gates.rda b/data/tidygate_env_gates.rda new file mode 100644 index 0000000..ec7f7bc Binary files /dev/null and b/data/tidygate_env_gates.rda differ diff --git a/inst/images/tidySPE_gate.png b/inst/images/tidySPE_gate.png new file mode 100644 index 0000000..9346822 Binary files /dev/null and b/inst/images/tidySPE_gate.png differ diff --git a/inst/images/tidyomics.png b/inst/images/tidyomics.png new file mode 100755 index 0000000..3b38d69 Binary files /dev/null and b/inst/images/tidyomics.png differ diff --git a/man/gate_sce_obj.Rd b/man/gate_sce_obj.Rd index 628249c..1bd026f 100644 --- a/man/gate_sce_obj.Rd +++ b/man/gate_sce_obj.Rd @@ -5,12 +5,16 @@ \alias{gate_sce_obj} \title{gate_sce_obj} \format{ -A list containing x,y coordinates for one gate +A list containing x, y coordinates for one gate. } \usage{ data(gate_sce_obj) } \description{ -Coordinates for a gate interactively drawn using tidygate +Coordinates for a gate interactively drawn using \code{tidygate} on a SingleCellExperiment object. +} +\details{ +These x, y coordinates represent a manually defined gate used to subset specific populations +from a SingleCellExperiment object. } \keyword{datasets} diff --git a/man/gate_seurat_obj.Rd b/man/gate_seurat_obj.Rd index 3e61a58..9f6072c 100644 --- a/man/gate_seurat_obj.Rd +++ b/man/gate_seurat_obj.Rd @@ -5,12 +5,16 @@ \alias{gate_seurat_obj} \title{gate_seurat_obj} \format{ -A list containing x,y coordinates for one gate +A list containing x, y coordinates for one gate. } \usage{ data(gate_seurat_obj) } \description{ -Coordinates for a gate interactively drawn using tidygate +Coordinates for a gate interactively drawn using \code{tidygate} on a Seurat object. +} +\details{ +These x, y coordinates represent a manually defined gate used to subset specific populations +from a Seurat object. } \keyword{datasets} diff --git a/man/pbmc_h3k4me3_hg38.Rd b/man/pbmc_h3k4me3_hg38.Rd new file mode 100644 index 0000000..1aa506e --- /dev/null +++ b/man/pbmc_h3k4me3_hg38.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_transcriptomics.R +\docType{data} +\name{pbmc_h3k4me3_hg38} +\alias{pbmc_h3k4me3_hg38} +\title{pbmc_h3k4me3_hg38} +\format{ +A \code{GRanges} object with columns: +\describe{ +\item{signalValue}{The signal value for each peak.} +\item{qValue}{The q-value for the significance of each peak.} +\item{peak}{The peak location in the genome.} +} +} +\source{ +H3K4me3 peak data was lifted from hg19 to hg38 using UCSC liftOver chain files. +} +\usage{ +data(pbmc_h3k4me3_hg38) +} +\value{ +A GRanges object containing the H3K4me3 peaks for PBMCs in the hg38 genome assembly. +} +\description{ +H3K4me3 peaks for human PBMCs lifted over to hg38 genome assembly +} +\details{ +This object contains H3K4me3 peaks for human peripheral blood mononuclear cells (PBMCs) mapped to +the hg38 genome assembly. The peaks were initially from hg19 and were lifted over to hg38 using +a UCSC chain file. The peaks contain several columns of information including signal value, +q-value, and peak locations, which can be used for visualisation and analysis. + +The H3K4me3 peaks were lifted over from the hg19 to hg38 genome assembly using the \code{liftOver} +function from the \code{rtracklayer} package. The \code{seqlevels} and \code{seqinfo} were aligned with the +\code{SingleCellExperiment} object \code{sce_sub}. The Ensembl-style chromosome naming convention (NCBI) +was applied for consistency. This dataset contains selected columns such as \code{signalValue}, +\code{qValue}, and \code{peak}, making it ready for downstream analysis or plotting. +} +\examples{ +# Load the dataset +data(pbmc_h3k4me3_hg38) + +# Plot q-values of the H3K4me3 peaks +plot(pbmc_h3k4me3_hg38$qValue, type="l", ylab="q-value", main="H3K4me3 peaks") +abline(v=5000, lty=2) + +} +\keyword{datasets} diff --git a/man/pseudo_bulk.Rd b/man/pseudo_bulk.Rd index 23ac750..55da963 100644 --- a/man/pseudo_bulk.Rd +++ b/man/pseudo_bulk.Rd @@ -5,16 +5,17 @@ \alias{pseudo_bulk} \title{pseudo_bulk} \format{ -A SummarizedExperiment object. +A \code{SummarizedExperiment} object. } \usage{ data(pseudo_bulk) } \description{ -This object was saved only because leading to a strange failure of the github action, -while working perfectly in the local environment +A \code{SummarizedExperiment} object containing pseudo-bulk RNA sequencing data. } \details{ -A SummarizedExperiment object +This object was created for benchmarking purposes, representing a pseudo-bulk aggregation +of single-cell data, saved due to a specific issue with GitHub actions while functioning +correctly in local environments. } \keyword{datasets} diff --git a/man/sce_obj.Rd b/man/sce_obj.Rd index 59af59b..6f5b379 100644 --- a/man/sce_obj.Rd +++ b/man/sce_obj.Rd @@ -5,12 +5,16 @@ \alias{sce_obj} \title{sce_obj} \format{ -A sce object. +A \code{SingleCellExperiment} object. } \usage{ data(sce_obj) } \description{ -A sce dataset of single cell RNA sequencing data +A \code{SingleCellExperiment} dataset containing single-cell RNA sequencing data. +} +\details{ +This SingleCellExperiment (SCE) object contains pre-processed single-cell RNA-seq data +ready for downstream analysis, such as clustering or differential expression analysis. } \keyword{datasets} diff --git a/man/sce_obj_UMAP3.Rd b/man/sce_obj_UMAP3.Rd index 8255916..512af82 100644 --- a/man/sce_obj_UMAP3.Rd +++ b/man/sce_obj_UMAP3.Rd @@ -5,12 +5,16 @@ \alias{sce_obj_UMAP3} \title{sce_obj_UMAP3} \format{ -A sce object. +A \code{SingleCellExperiment} object. } \usage{ data(sce_obj_UMAP3) } \description{ -A sce dataset of single cell RNA sequencing data with 3 UMAP dimesions +A \code{SingleCellExperiment} dataset of single-cell RNA sequencing data with 3 UMAP dimensions. +} +\details{ +This SCE object contains three-dimensional UMAP embeddings, which allow visualization +of cell populations in reduced-dimensional space. } \keyword{datasets} diff --git a/man/seurat_obj.Rd b/man/seurat_obj.Rd index 96b033d..9e01cd1 100644 --- a/man/seurat_obj.Rd +++ b/man/seurat_obj.Rd @@ -5,12 +5,16 @@ \alias{seurat_obj} \title{seurat_obj} \format{ -A Seurat object. +A \code{Seurat} object. } \usage{ data(seurat_obj) } \description{ -A Seurat dataset of single cell RNA sequencing data +A Seurat dataset containing single-cell RNA sequencing data. +} +\details{ +This Seurat object contains pre-processed single-cell RNA-seq data that can be used +for downstream analysis such as clustering, differential expression, and visualization. } \keyword{datasets} diff --git a/man/seurat_obj_UMAP3.Rd b/man/seurat_obj_UMAP3.Rd index 00a82ba..8495ac0 100644 --- a/man/seurat_obj_UMAP3.Rd +++ b/man/seurat_obj_UMAP3.Rd @@ -5,12 +5,16 @@ \alias{seurat_obj_UMAP3} \title{seurat_obj_UMAP3} \format{ -A Seurat object. +A \code{Seurat} object. } \usage{ data(seurat_obj_UMAP3) } \description{ -A Seurat dataset of single cell RNA sequencing data with 3 UMAP dimesions +A Seurat dataset of single-cell RNA sequencing data with 3 UMAP dimensions. +} +\details{ +This Seurat object contains three-dimensional UMAP embeddings for visualizing cell populations +in a reduced-dimensional space. } \keyword{datasets} diff --git a/man/theme_multipanel.Rd b/man/theme_multipanel.Rd index e6efec4..3fda9ee 100644 --- a/man/theme_multipanel.Rd +++ b/man/theme_multipanel.Rd @@ -5,12 +5,16 @@ \alias{theme_multipanel} \title{theme_multipanel} \format{ -theme +A \code{ggplot2} theme. } \usage{ data(theme_multipanel) } \description{ -A file holding a theme object. +A ggplot2 theme object designed for creating multipanel plots. +} +\details{ +This theme can be applied to ggplot2-based visualizations to standardize the look and feel +of multipanel figures, ensuring consistency across plots. } \keyword{datasets} diff --git a/man/tidygate_env_gates.Rd b/man/tidygate_env_gates.Rd new file mode 100644 index 0000000..544db6f --- /dev/null +++ b/man/tidygate_env_gates.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_transcriptomics.R +\docType{data} +\name{tidygate_env_gates} +\alias{tidygate_env_gates} +\title{tidygate_env_gates} +\format{ +A list containing the x and y coordinates for gates. +} +\source{ +Created interactively using the \code{tidygate} package +} +\usage{ +data(tidygate_env_gates) +} +\value{ +A list of x and y coordinates representing the interactive gates. +} +\description{ +A set of gates interactively drawn for spatial data using the \code{tidygate} package. +} +\details{ +This object contains gates that were drawn interactively on spatial data based on tissue morphology, +highlighting areas of interest for future reproducible analysis. These gates can be loaded and applied +to similar data for consistent and automated gating. + +Gates were saved programmatically for future use. The object was created using the \code{gate} function +in \code{tidygate} and saved as an RDS file for reproducibility. This can be particularly useful in spatial +transcriptomics data analysis where regions of interest are manually gated. +} +\examples{ +# Load the gates +data(tidygate_env_gates) + +# Apply the gates to new data for reproducible analysis +# spatial_data <- applyGates(spatial_data, gates = tidygate_env_gates) + +} +\keyword{datasets} diff --git a/vignettes/genomics_transcirptomics_integration.Rmd b/vignettes/genomics_transcirptomics_integration.Rmd new file mode 100644 index 0000000..b867029 --- /dev/null +++ b/vignettes/genomics_transcirptomics_integration.Rmd @@ -0,0 +1,403 @@ +--- +title: "Tidy genomic analyses in integration with single-cell transcriptomics" +author: + - Michael Love, UNC-Chapel Hill^[] +output: rmarkdown::html_vignette +bibliography: "`r file.path(system.file(package='tidyomicsWorkshop', 'vignettes'), 'tidyomics.bib')`" +vignette: > + %\VignetteIndexEntry{Tidy genomic and transcriptomic single-cell analyses} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, cache = FALSE) +``` + + +# Genomic and transcriptomic data integration + +In the single-cell vignette, we have examined expression of genes across our cells, but we +haven't considered the genomic location of the genes. In this section +we will operate on genomic location information using the *plyranges* +[@Lee2019] package, and tie this to the single cell transcriptomics +data we have been examining. + +*plyranges* provides tidyverse-style functionality to genomic ranges +(GRanges objects [@Lawrence2013]) analogous to how +*tidySingleCellExperiment* provides functionality for +SingleCellExperiment objects. For an example of a workflow using +*plyranges* as part of a bulk RNA-seq analysis, see @Lee2020. + +:::: {.note} +In some pipelines (e.g. using *tximeta* [@Love2020] to import bulk or +single-cell quantification data) our SingleCellExperiment or +SummarizedExperiment would already have genomic range information +attached to the rows of the object. That is, we would have information +about the starts and ends of the genes, their strand information, even +the lengths of the chromosomes and the genome build, etc. +:::: + +Here we load single-cell data in SingleCellExperiment object format. This data is peripheral blood mononuclear cells (PBMCs) from metastatic breast cancer patients. + + +```{r} +library(SingleCellExperiment) +library(ggplot2) +library(plotly) +library(dplyr) +library(colorspace) +library(dittoSeq) +library(tidySingleCellExperiment) + +# load single cell RNA sequencing data +sce_obj <- tidyomicsWorkshop::sce_obj + +# take a look +sce_obj +``` + +Before we start our integrative analysis, we will first take some +steps to add hg38 range information on our genes, matching by the gene +symbol provided on the rows of `sce_obj`. We add genes from the +*ensembldb* package [@Rainer2019] (to see how to add a particular +version of Ensembl, see the package vignette). + +```{r message=FALSE} +# what our rownames look like: gene symbols +sce_obj |> rownames() |> head() + +# we recommend Ensembl or GENCODE gene annotations +edb <- EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86 # hg38 +g <- ensembldb::genes(edb) + +#head(genome(g)) +``` + +We can examine the first three genes and their metadata: + +```{r message=FALSE} +library(plyranges) +g |> dplyr::slice(1:3) +``` + +Our first task is to subset `g`, the human genes, to the ones in our +SingleCellExperiment. While Ensembl IDs are unique, gene symbols are +not, so we will have to also remove any duplicate gene symbols +(recommend working with Ensembl IDs as feature identifiers when +possible). + +We subset the columns of `g` to just the `symbol` column, using the +`select()` function. *plyranges* enables us to use familiar tidy verbs +on GRanges objects, e.g. to `filter` rows or to `select` columns of +metadata attached to the ranges. + +```{r} +g <- g |> + + select(symbol) + +g |> dplyr::slice(1:3) +``` + +Now we filter the genes of our SingleCellExperiment to only those +present as rownames of `sce_obj`, and remove duplicate IDs. We then +sort by genomic location, and use the gene symbols as names of the +ranges. + +```{r} +gene_names <- sce_obj |> rownames() + +g <- g |> + + filter(symbol %in% gene_names) |> + filter(!duplicated(symbol)) |> + sort() # genomic position sorting + +names(g) <- g$symbol +``` + +Now we can add our gene information to the rows of the +`sce_obj`. + +:::: {.note} +Again, these following steps would not be necessary using a +pipeline that imports quantification data to ranged objects, but it +isn't too hard to add manually. If adding ranges manually, make note +of the provenance of where you downloaded the information (Ensembl, +GENCODE or otherwise), and assign a genome build if you plan on +sharing the final object (e.g. with `genome(x) <- "..."`). +:::: + +```{r} +all(names(g) %in% rownames(sce_obj)) +sce_sub <- sce_obj[ names(g), ] # put SCE in order of `g` +rowRanges(sce_sub) <- g # assign ranges `g` to the SCE +sce_sub |> rowRanges() # peek at the ranges +``` + +Now let's do something interesting with the gene ranges: let's see if +genes near peaks of active chromatin marks (H3K4me3 measured with +ChIP-seq) in another experiment involving PBMC have a difference in +their expression level compared to other genes. + +There are many sources of epigenetic tracks, ENCODE, Roadmap, +etc. Here we will use some ENCODE [@ENCODE2012] data available in +*AnnotationHub* on Bioconductor. + +:::: {.note} +We had to do a bit of book-keeping first. These peaks were in +the hg19 genome build, so we have ran the following commands (here +un-evaluated) to 'lift" the peaks to the hg38 genome build, and +provide proper sequence information. +:::: + +```{r eval=FALSE} +### un-evaluated code chunk ### +# AnnotationHub contains many useful genome tracks +library(AnnotationHub) +ah <- AnnotationHub() +# can be queried with keywords +query(ah, c("Peripheral_Blood","h3k4me3")) +# downloading a particular resource +peaks <- ah[["AH44823"]] +library(rtracklayer) +# lifting hg19 to hg38 +# Download https://hgdownload.soe.ucsc.edu/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz +new_peaks <- unlist( + liftOver( + peaks, + chain = import.chain("~/Downloads/hg19ToHg38.over.chain") + ) +) +# Ensembl-style chrom names +seqlevelsStyle(new_peaks) <- "NCBI" +# bring over any missing chroms +seqlevels(new_peaks) <- seqlevels(sce_sub) +# bring over chrom lengths +seqinfo(new_peaks) <- seqinfo(sce_sub) +# subset to a few columns +pbmc_h3k4me3_hg38 <- new_peaks |> + select(signalValue, qValue, peak) +# save for reloading below +save(pbmc_h3k4me3_hg38, file="data/pbmc_h3k4me3_hg38.rda", compress="xz") +``` + +Loading those peaks. + +```{r} +# loading those H3K4me3 peaks for PBMCs +peaks <- tidyomicsWorkshop::pbmc_h3k4me3_hg38 +plot(peaks$qValue, type="l", ylab="q-value", main="H3K4me3 peaks") +abline(v=5000, lty=2) +``` + +We will arbitrarily chose to take the top 5,000 peaks by p-value. How +to chose the number of epigenetic peaks and genes to consider in a +multi-omics analysis is a more involved question, and is worth +considering the impact of such a choice for enrichment analysis. + +```{r} +peaks <- peaks |> + + dplyr::slice(1:5000) |> # ordered already by p-value (most to least sig) + sort() # genomic position sorting +``` + +It is easy to compute the distance of the genes in our +SingleCellExperiment to the nearest H3K4me3 peak, using *plyranges* +convenience functions. + +See the +[plyranges package website](https://sa-lee.github.io/plyranges/reference/index.html) +for a full listing of such functions. + +```{r} +dists <- + rowRanges(sce_sub) |> + anchor_5p() |> # anchor 5 prime end + mutate(width=1) |> # shrink range to 1bp + add_nearest_distance(peaks) +``` + +```{r} +hist(log10(dists$distance + 1), breaks=40, + main="gene-to-peak distance", xlab="log10 distance") +``` + +Let's put the genes into 4 different bins by their distance to a +H3K4me3 peak: 1) overlapping [distance of 0], 2) 1bp-10kb, +3) 10kb-100kb, or 4) farther than 100kb. + +```{r} +library(tibble) +bins <- c(0,1,1e4,1e5,Inf) + +dists <- dists |> + + select(symbol, distance, .drop_ranges = TRUE) |> # remove chr/pos/strand etc. + as_tibble() |> + mutate(distance_bin = cut(distance, bins, include.lowest = TRUE)) |> + rename(feature = symbol) # this will help us add information to the SCE +``` + +We can see how many genes are in which category: + +```{r} +dists |> + + ggplot(aes(distance_bin)) + + geom_bar() +``` + +We will now take the SingleCellExperiment data and compress it down to +a SummarizedExperiment using the `aggregate_cells` function from +*tidySingleCellExperiment*. After this function, every column of the +SummarizedExperiment is a cell type. + +We immediately pipe the SummarizedExperiment into a `left_join` where +we add the distance-to-peak information, and then into a `nest` +command where we create a nested (tidy)SummarizedExperiment object, +where we have grouped by the distance bin that the genes fall into. + + +```{r message=FALSE} +library(tidySummarizedExperiment) +library(purrr) +nested <- sce_sub |> + + aggregate_cells(cell_type) |> + left_join(dists, by="feature") |> + nest(se = -distance_bin) + +nested +``` + +We can now operate on the SummarizedExperiment objects that are within +`nested`. For example, we can extract the column means of the counts +(cell-type-specific means of counts over genes). + +We save this as a new tibble called `smry` as it contains summary +information. + +```{r} +smry <- nested |> + + mutate(summary = map(se, \(se) { + tibble(count_mean = colMeans(assay(se)), + cell_type = se$cell_type) + }) + ) |> + select(-se) |> + unnest(summary) |> + mutate(cell = substr(cell_type, 1, 13)) # abbreviate cell_type for plot +``` + +As a final plot, we can look at the mean count over the genes in a +particular distance bin (x-axis), across the cell types (y-axis). The +different lines show how the genes' proximity to H3K4me3 peaks affects +the average count. + +```{r} +library(forcats) +smry |> + + mutate(cell = fct_reorder(cell, count_mean, .fun=median)) |> + ggplot(aes(count_mean, cell, color=distance_bin, group=distance_bin)) + + geom_point() + + geom_line(orientation="y") + + xlab("mean of count over genes in bin") + + ylab("cell type") +``` + +We finish this section with a re-cap of the steps we took: + +1. calculated distances from genes to H3K4me3 peaks +2. added distance information onto the SingleCellExperiment +3. condensed the dataset via pseudobulking +4. computed the mean count (over genes) within 4 bins of genes +5. visualized these mean counts over cell types + +Let's consider a number of issues with this first-pass analysis and +visualization: + +1. we arbitrarily thresholded the peaks at the beginning of the + analysis to take the top 5,000 +2. we just computed the mean count, not taking into account total + reads per cell type (sequencing depth per cell x number of cells) +3. we are comparing cell-type-specific expression to aggregate + ChIP-seq peaks in PBMC +4. the two experiments, while both PBMC, come from different projects, + different labs, and one is from cancer patients, while the other is + from the ENCODE project +5. we only plot the mean count over genes, perhaps it would be better + to show more information about the distribution + +:::: {.note} +As an alternative to (1), we could have counted the number of peaks +that were near each genes, or even computed on the metadata of the +peaks (signal value, etc.). An example follows of how we could have +counted overlaps instead (how many peaks within 20kb). +:::: + +```{r} +overlaps <- rowRanges(sce_sub) |> + + anchor_5p() |> + mutate(width=1) %>% + mutate(num_overlaps = count_overlaps(., peaks, maxgap=2e4)) + +table(overlaps$num_overlaps) +``` + +:::: {.note} +Another question that often arises is whether the distances or +overlaps we observe are more or less than we would expect if there +were no relationship between the positions of genes and peaks. +To answer questions like these, we have developed the *nullranges* +package, which allows for either bootstrapping of ranges throughout +the genome [@Mu2023], or selection of a covariate-matched background +set [@Davis2023], to compute probabilities under the null hypothesis. +A brief example follows of bootstrapping some of the peaks in a window +of chr1:20Mb-30Mb (see *nullranges* website for more comprehensive +examples). +:::: + +```{r} +library(nullranges) + +# make a small range for demonstration of bootstrapping +seg <- data.frame(seqnames=1, start=20e6, width=10e6) |> + as_granges() + +# generate a genome segmentation from this one range +seg <- oneRegionSegment(seg, seqlength=seqlengths(peaks)[[1]]) +seg + +# bootstrap peaks within this segment (just for demo) +peaks |> + filter_by_overlaps(seg[2]) |> + bootRanges(blockLength=1e6, seg=seg, R=10) + +# these bootstrapped ranges could then be used for computing +# generic test statistics, with the `iter` column used for +# building a null distribution +``` + +**Session Information** + +```{r} +sessionInfo() +``` + +**References** + +```{css echo=FALSE} +.note { + margin: 30px; + padding: 1em; + background: #FFF8F0; + border: 1px solid #EFE8E0; + border-radius: 10px; +} +``` diff --git a/vignettes/pseudobulk_transcriptomics.Rmd b/vignettes/pseudobulk_transcriptomics.Rmd index 27c9b62..aa39f43 100644 --- a/vignettes/pseudobulk_transcriptomics.Rmd +++ b/vignettes/pseudobulk_transcriptomics.Rmd @@ -2,7 +2,7 @@ title: "Tidy Transcriptomics for pseudobulk Sequencing Analyses" author: - Maria Doyle, Peter MacCallum Cancer Centre^[] - - Stefano Mangiola, Walter and Eliza Hall Institute^[] + - Stefano Mangiola, South Australian immunoGENomics Cancer Institute^[], Walter and Eliza Hall Institute^[] output: rmarkdown::html_vignette bibliography: "`r file.path(system.file(package='tidyomicsWorkshop', 'vignettes'), 'tidytranscriptomics.bib')`" vignette: > @@ -184,19 +184,19 @@ pseudo_bulk_nested = pseudo_bulk_nested |> # Identify top significant genes - mutate(top_genes = map_chr( + mutate(top_genes = map( grouped_summarized_experiment, ~ .x |> pivot_transcript() |> arrange(pvalue) |> - head(1) |> - pull(.feature) + dplyr::slice(1:10) |> + pull(.feature) )) |> # Filter top gene mutate(grouped_summarized_experiment = map2( grouped_summarized_experiment, top_genes, - ~ filter(.x, .feature == .y) + ~ .x |> filter(.feature %in% .y) )) pseudo_bulk_nested diff --git a/vignettes/single_cell_bioconductor_transcriptomics.Rmd b/vignettes/single_cell_bioconductor_transcriptomics.Rmd index 1ed150f..63eb126 100644 --- a/vignettes/single_cell_bioconductor_transcriptomics.Rmd +++ b/vignettes/single_cell_bioconductor_transcriptomics.Rmd @@ -2,7 +2,7 @@ title: "Tidy Transcriptomics for Single-cell RNA Sequencing Analyses with Bioconductor" author: - Maria Doyle, Peter MacCallum Cancer Centre^[] - - Stefano Mangiola, Walter and Eliza Hall Institute^[] + - Stefano Mangiola, South Australian immunoGENomics Cancer Institute^[], Walter and Eliza Hall Institute^[] output: rmarkdown::html_vignette bibliography: "`r file.path(system.file(package='tidyomicsWorkshop', 'vignettes'), 'tidytranscriptomics.bib')`" vignette: > @@ -17,7 +17,8 @@ knitr::opts_chunk$set(echo = TRUE) ## Instructors -*Dr. Stefano Mangiola* is currently a Postdoctoral researcher in the laboratory of Prof. Tony Papenfuss at the Walter and Eliza Hall Institute in Melbourne, Australia. His background spans from biotechnology to bioinformatics and biostatistics. His research focuses on prostate and breast tumour microenvironment, the development of statistical models for the analysis of RNA sequencing data, and data analysis and visualisation interfaces. +*Dr. Stefano Mangiola* is currently (2024) a group leader in Computational Cancer Immunology at SAiGENCI +(Adelaide) with co-appointment at WEHI Melbourne, Australia. His background spans from biotechnology to bioinformatics and biostatistics. His research focuses on prostate and breast tumour microenvironment, the development of statistical models for the analysis of RNA sequencing data, and data analysis and visualisation interfaces. ## Workshop goals and objectives @@ -59,7 +60,6 @@ frameborder="0"> # Load packages library(SingleCellExperiment) library(ggplot2) -library(plotly) library(dplyr) library(colorspace) library(dittoSeq) @@ -332,6 +332,7 @@ single_cell_object |> We'll demonstrate creating a 3D plot using some data that has 3 UMAP dimensions. This is a fantastic way to visualise both reduced dimensions and metadata in the same representation. ```{r umap plot 2, message = FALSE, warning = FALSE} +library(tidySingleCellExperiment) pbmc <- tidyomicsWorkshop::sce_obj_UMAP3 pbmc |> @@ -342,7 +343,7 @@ pbmc |> color = ~cell_type, colors = dittoSeq::dittoColors() ) %>% - add_markers(size = I(1)) + plotly::add_markers(size = I(1)) ``` ## Exercises @@ -452,7 +453,7 @@ sce_obj |> nest(sce = -cell_type) |> # Filter for sample with more than 30 cells - mutate(sce = map(sce, ~ .x |> add_count(sample) |> filter(n > 50))) |> + mutate(sce = map(sce, ~ .x |> add_count(sample) |> dplyr::filter(n > 50))) |> filter(map_int(sce, ncol) > 0) |> # Select significant genes @@ -486,7 +487,6 @@ sce_obj |> - Answer this question avoiding to save temporary variables, and using the function add_count to count the cells (before nesting), and then filter - Answer this question avoiding to save temporary variables, and using the function map_int to count the cells (after nesting), and the filter - **Session Information** ```{r} diff --git a/vignettes/single_cell_seurat_transcriptomics.Rmd b/vignettes/single_cell_seurat_transcriptomics.Rmd index 0e85aef..fc5b058 100644 --- a/vignettes/single_cell_seurat_transcriptomics.Rmd +++ b/vignettes/single_cell_seurat_transcriptomics.Rmd @@ -2,7 +2,7 @@ title: "Tidy Transcriptomics for Single-cell RNA Sequencing Analyses with Seurat" author: - Maria Doyle, Peter MacCallum Cancer Centre^[] - - Stefano Mangiola, Walter and Eliza Hall Institute^[] + - Stefano Mangiola, South Australian immunoGENomics Cancer Institute^[], Walter and Eliza Hall Institute^[] output: rmarkdown::html_vignette bibliography: "`r file.path(system.file(package='tidyomicsWorkshop', 'vignettes'), 'tidytranscriptomics.bib')`" vignette: > diff --git a/vignettes/solutions_transcriptomics.Rmd b/vignettes/solutions_transcriptomics.Rmd index 0dbcc2c..a1693d0 100644 --- a/vignettes/solutions_transcriptomics.Rmd +++ b/vignettes/solutions_transcriptomics.Rmd @@ -39,7 +39,7 @@ seurat_obj |> mutate(gamma_delta = signature_score > 0.7) |> dplyr::count(gamma_delta) |> - summarise(proportion = n/sum(n)) + mutate(proportion = n/sum(n)) ``` ## Question 2 diff --git a/vignettes/spatial_bioconductor_transcriptomics.Rmd b/vignettes/spatial_bioconductor_transcriptomics.Rmd new file mode 100644 index 0000000..c700567 --- /dev/null +++ b/vignettes/spatial_bioconductor_transcriptomics.Rmd @@ -0,0 +1,687 @@ +--- +title: "Tidy spatial analyses" +author: + - Stefano Mangiola, South Australian immunoGENomics Cancer Institute^[], Walter and Eliza Hall Institute^[] +output: rmarkdown::html_vignette +# bibliography: "`r file.path(system.file(package='tidyomicsWorkshop', 'vignettes'), 'tidyomics.bib')`" +vignette: > + %\VignetteIndexEntry{Tidy spatial analyses} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + markdown: + wrap: 72 +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, cache = FALSE) +library(here) +``` + + + +# Session 2: Tidying spatial data + +A good introduction of `tidyomics` can be found here + +[tidyomicsWorkshopBioc2023](https://github.com/tidyomics/tidyomicsWorkshopBioc2023) +[tidy transcriptomic manifesto](https://tidyomics.github.io/tidyomicsBlog/post/2021-07-07-tidy-transcriptomics-manifesto/) + +`tidyomics` is an interoperable software ecosystem that bridges Bioconductor and the tidyverse. `tidyomics` is installable with a single homonymous meta-package. This ecosystem includes three new packages: tidySummarizedExperiment, tidySingleCellExperiment, and tidySpatialExperiment, and five publicly available R packages: `plyranges`, `nullranges`, `tidyseurat`, `tidybulk`, `tidytof`. Importantly, `tidyomics` leaves the original data containers and methods unaltered, ensuring compatibility with existing software, maintainability and long-term Bioconductor support. + +`tidyomics` is presented in "The tidyomics ecosystem: Enhancing omic data analyses" [Hutchison and Keyes et al., 2024](https://www.biorxiv.org/content/10.1101/2023.09.10.557072v1) + +```{r, echo=FALSE, out.width="700px"} +knitr::include_graphics(here("inst/images/tidyomics.png")) +``` + +[Slides](https://docs.google.com/gview?url=https://raw.githubusercontent.com/tidytranscriptomics-workshops/LoveMangiola2022_tidytranscriptomics/master/inst/LoveMangiola2022_tidytranscriptomics.pdf) + + + +Let's load the libraries needed for this session + +```{r, message = FALSE} +library(SpatialExperiment) + +# Tidyverse +library(ggplot2) +library(plotly) +library(dplyr) +library(tidyr) +library(purrr) +library(glue) +library(stringr) + +# Plotting +library(colorspace) +library(dittoSeq) +library(ggspavis) + +# Analysis +library(scuttle) +library(scater) +library(scran) + +``` + +Similarly to **Section 2**, this section uses `spatialLIBD` and `ExperimentHub` packages to gather spatial transcriptomics data. + +doi: [10.1038/s41593-020-00787-0](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8095368/) + + +```{r, message = FALSE} +# From https://bioconductor.org/packages/devel/bioc/vignettes/Banksy/inst/doc/multi-sample.html +# library(ExperimentHub) + +library(STexampleData) +spatial_data <- Visium_mouseCoronal() +rownames(spatial_data) <- rowData(spatial_data)$gene_name +colData(spatial_data)$sum <- colSums(counts(spatial_data)) +colData(spatial_data)$subject = "UBR42" + +# Drop duplicates +spatial_data = spatial_data[!rownames(spatial_data) |> duplicated(), , drop=FALSE] + +# spatial_data <- +# ExperimentHub::ExperimentHub() |> +# fetch_data(type = "spe")( eh = _, type = "spe") +# +# # Clear the reductions +# reducedDims(spatial_data) = NULL + +# Display the object +spatial_data +``` + +### 1. tidySpatialExperiment package + +`tidySpatialExperiment` provides a bridge between the `SpatialExperiment` single-cell package and the tidyverse [@wickham2019welcome]. It creates an invisible layer that enables viewing the `SpatialExperiment` object as a tidyverse tibble, and provides `SpatialExperiment`-compatible `dplyr`, `tidyr`, `ggplot` +and `plotly` functions. + +If we load the `tidySpatialExperiment` package and then view the single cell data, it now displays as a tibble. + +```{r message = FALSE} +library(tidySpatialExperiment) + +spatial_data +``` + +#### Data interface, display + +If we want to revert to the standard SpatialExperiment view we can do that. + +```{r} +options("restore_SpatialExperiment_show" = TRUE) +spatial_data +``` + +If we want to revert back to tidy SpatialExperiment view we can. + +```{r} +options("restore_SpatialExperiment_show" = FALSE) +spatial_data +``` + +#### Original behaviour is preserved + +The tidy representation behaves exactly as a native `SpatialExperiment`. It can be interacted with using [SpatialExperiment commands](https://www.bioconductor.org/packages/release/bioc/vignettes/SpatialExperiment/inst/doc/SpatialExperiment.html) +such as `assays`. + +```{r} +assays(spatial_data) +``` + +### 2. Tidyverse commands + +We can also interact with our object as we do with any tidyverse tibble. We can use `tidyverse` commands, such as `filter`, `select` and `mutate` to explore the `tidySpatialExperiment` object. Some examples are shown below and more can be seen at the `tidySpatialExperiment` [website](https://stemangiola.github.io/tidySpatialExperiment/articles/introduction.html#tidyverse-commands-1). + +#### Filter + +We can use `filter` to choose rows, for example, to select our three samples we are going to work with. + + +```{r} +spatial_data = + spatial_data |> + filter(sample_id %in% c("sample01")) + +spatial_data +``` + +In comparison the base-R method recalls the variable multiple times + +```{r, eval=FALSE} +spatial_data = spatial_data[,spatial_data$sample_id %in% c("sample01")] +``` + +Or for example, to see just the rows for the cells in G1 cell-cycle stage. + +```{r} +spatial_data |> dplyr::filter(in_tissue == 1) +``` + +:::: {.note} +Note that **rows** in this context refers to rows of the abstraction, not **rows** of the SpatialExperiment which correspond to genes **tidySpatialExperiment** prioritizes cells as the units of observation in the abstraction, while the full dataset, including measurements of expression of all genes, is still available "in the background". +:::: + +#### Select + +We can use `select` to view columns, for example, to see the filename, total cellular RNA abundance and cell phase. + +If we use `select` we will also get any view-only columns returned, such as the UMAP columns generated during the preprocessing. + +```{r} +spatial_data |> select(.cell, sample_id, in_tissue) +``` + +#### Mutate + +We can use `mutate` to create a column. For example, we could create a new `Phase_l` column that contains a lower-case version of `Phase`. + +In this case, three columns that are view only (`sample_id`, `pxl_col_in_fullres`, `pxl_row_in_fullres`, `PC*`) will be always included in the tidy representation because they cannot be omitted from the data container (is opposed to metadata) + +```{r message=FALSE} +spatial_data |> + mutate(sample_id_upper = toupper(sample_id)) |> + select(.cell, sample_id, sample_id_upper) +``` + +We can use tidyverse commands to polish an annotation column. We will extract the sample, and group information from the file name column into separate columns. + +```{r message=FALSE} + +# Simulate file path +spatial_data = spatial_data |> mutate(file_path = glue("../data/spatial/{sample_id}/outs/raw_feature_bc_matrix/")) + + +# First take a look at the file column +spatial_data |> select(.cell, file_path) +``` + +#### Extract + +Extract specific identifiers from complex data paths, simplifying the dataset by isolating crucial metadata. This process allows for clearer identification of samples based on their file paths, improving data organization. + +```{r} +# Create column for sample +spatial_data <- spatial_data |> + # Extract sample ID from file path and display the updated data + tidyr::extract(file_path, "sample_id_from_file_path", "data/spatial/([a-zA-Z0-9_-]+)/outs.+", remove = FALSE) + +# Take a look +spatial_data |> select(.cell, sample_id_from_file_path, everything()) +``` + +#### Unite + +We could use tidyverse `unite` to combine columns, for example to create a new column for sample id combining the sample and subject id +(BCB) columns. + +```{r message=FALSE} +spatial_data <- spatial_data |> unite("sample_subject", sample_id, subject, remove = FALSE) + +# Take a look +spatial_data |> select(.cell, sample_id, sample_subject, subject) +``` + + +### 3. Advanced filtering/gating and pseudobulk + +`tidySpatialExperiment` provide a interactive advanced tool for gating region of interest for streamlined exploratory analyses. + +This capability is powered by `tidygate`. We show how you can visualise your data and manually drawing gates to select one or more regions of interest using an intuitive tidy grammar. From https://bioconductor.org/packages/devel/bioc/vignettes/tidySpatialExperiment/inst/doc/overview.html + +First let's visualise our data + +```{r} + +spatial_data |> + plotVisium(annotate = "sum", highlight = "in_tissue", + legend_position = "none") + +``` + +Let's draw an arbitrary gate interactively + +```{r, eval=FALSE} +spatial_data = + spatial_data |> + + # Filter one sample + filter(in_tissue == 1, sample_id=="sample01") |> + + # Gate based on tissue morphology + gate(colour = "sum", alpha = 0.8, size = 0.1) +``` + +We can save the gates programmately for future use + +```{r, eval=FALSE} +# Create the path +gates_file_path = + tempdir(check = FALSE) |> + paste0("reproducible_gates.rds") + +# Save +tidygate_env$gates |> + saveRDS(gates_file_path) +``` + +Let's load the gates + +```{r, eval=FALSE} +tidygate_env_gates = readRDS(gates_file_path) +``` + +```{r} +data(tidygate_env_gates) +``` + +We can now use the saved gate to reproduce our selection + +```{r} +# Recall the gate +spatial_data = + spatial_data |> + + # Filter one sample + filter(in_tissue == 1, sample_id=="sample01") |> + + # Gate based on tissue morphology + gate(colour = "in_tissue", alpha = 0.8, size = 0.1, + programmatic_gates = tidyomicsWorkshop::tidygate_env_gates + ) +``` + +This is recorded in the `.gate` column + +```{r, eval=FALSE} + +spatial_data |> select(.cell, .gated) +``` + +We can count how many pixels we selected with simple `tidyverse` grammar + +```{r, eval=FALSE} +spatial_data |> count(.gated) +``` + +We can visualise the gating + + +```{r, eval=FALSE} +spatial_data |> + + # Plot our gate + plotVisium( + annotate = ".gated", + highlight = "in_tissue", + legend_position = "none" + ) + +``` + +And filter, for further analyses + +```{r, eval=FALSE} +spatial_data |> + filter(.gated == "1") +``` + +#### Summarisation/aggregation + +The gated cells can then be divided into pseudobulks within a SummarizedExperiment object using tidySpatialExperiment’s aggregate_cells utility function. + +```{r , eval=FALSE} +spe_regions_aggregated <- + spatial_data |> + aggregate_cells(.gated) + +spe_regions_aggregated +``` + + +### 4. tidyfying your workflow + +We will take workflow used in **Session 2**, performed using mostly base R syntax and convert it to tidy R syntax. We will show you how the readability and modularity of your workflow will improve. + +#### Subset to keep only on-tissue spots. + +**Base R approach:** + +```{r, eval=FALSE} +spatial_data <- spatial_data[, colData(spatial_data)$in_tissue == 1] +``` + +**Tidyverse Approach:** + +```{r} +spatial_data <- + spatial_data |> + filter(in_tissue == 1) +``` + +**Specific Differences and Advantages:** + +The `tidyverse` `filter()` function clearly states the intent to filter the dataset, whereas the Base R approach uses subsetting which might not be immediately clear to someone unfamiliar with the syntax. + +The `tidyverse` approach inherently supports chaining further operations without manually checking dimensions, assuming that users trust the operation to behave as expected. + +#### Manipulating feature information + +:::: {.note} +For `SingleCellExperiment` there is no tidy API for manipulating feature wise data yet, on the contrary for `SummarizedExperiment`, because gene-centric the abstraction allow for direct gene information manipulation. Currently, `tidySingleCellExperiment` and `tidySpatialExperiment` do not prioritize the manipulation of features (genes). + +While these functions can employ genes for cell manipulation and visualisation, as demonstrated in `join_features()`, they lack tools for altering feature-related information. Instead, their primary focus is on cell information, which serves as the main observational unit in single-cell data. This contrasts with bulk RNA sequencing data, where features are more central. +:::: + +The tidy API for `SingleCellExperiment` has feature-manipulation API among our plans. See [tidyomics challenges](https://github.com/orgs/tidyomics/projects/1) + +**Base R approach:** + +```{r} +is_gene_mitochondrial <- grepl("(^MT-)|(^mt-)", rowData(spatial_data)$gene_name) +rowData(spatial_data)$gene_name[is_gene_mitochondrial] +``` + +#### Quality Control: + +Apply quality control measures to exclude cells based on mitochondrial content and read/gene count, a common indicator of cell health and viability. + +**Base R approach:** + +```{r, eval=FALSE} +spatial_data <- addPerCellQC(spatial_data, subsets = list(mito = is_gene_mitochondrial)) + +## Select expressed genes threshold +qc_mitochondrial_transcription <- colData(spatial_data)$subsets_mito_percent > 30 +colData(spatial_data)$qc_mitochondrial_transcription <- qc_mitochondrial_transcription + +``` + +**Tidyverse Approach:** + +```{r} + +spatial_data <- + spatial_data |> + + # Add QC + addPerCellQC(subsets = list(mito = is_gene_mitochondrial)) |> + + ## Add threshold in colData + mutate( + qc_mitochondrial_transcription = subsets_mito_percent > 30 + ) + +spatial_data + +``` + +**Specific Differences and Advantages:** + +`tidyverse` pipelines these operations without storing intermediate results, directly updating the dataset. Base R separates these steps, requiring manual tracking of variables and updating the dataset in multiple steps, increasing complexity and potential for errors. + +Direct Data Mutation: Tidyverse directly mutates the dataset within the pipeline, whereas Base R extracts, computes, and then reassigns values, which can be more verbose and less efficient in terms of workflow clarity and execution. + +#### Group-specific analyses + +**Base R approach:** + +```{r, eval=FALSE, fig.width=7, fig.height=8} +# get gene for subset +genes <- !grepl(pattern = "^Rp[l|s]|Mt", x = rownames(spatial_data)) + +# Convert to list +spatial_data_list <- lapply(unique(spatial_data$sample_id), function(x) spatial_data[, spatial_data$sample_id == x]) + +# Detect sample-specific hughly-variable genes +marker_genes = + lapply( spatial_data_list, + function(x){ + dec = scran::modelGeneVar(x, subset.row = genes) + scran::getTopHVGs(dec, n = 1000) + } + ) + +head(unique(unlist(marker_genes))) + +``` + +**Tidyverse Approach: group_split** + +```{r, fig.width=7, fig.height=8} +# get gene for subset +genes <- !grepl(pattern = "^Rp[l|s]|Mt", x = rownames(spatial_data)) + +marker_genes = + spatial_data |> + logNormCounts() |> + group_split(sample_id) |> + map(~ + .x |> + scran::modelGeneVar(subset.row = genes) |> + scran::getTopHVGs(n = 1000) + ) |> + purrr::reduce(union) + +marker_genes |> head() +``` + +**Tidyverse Approach: nest** + +```{r, fig.width=7, fig.height=8} + +spatial_data |> + logNormCounts() |> + nest(sample_data = -sample_id) |> + mutate(marker_genes = map(sample_data, ~ + .x |> + scran::modelGeneVar(subset.row = genes) |> + scran::getTopHVGs(n = 1000) + )) + +``` + + + +**Specific Differences and Advantages:** + +`tidyverse` neatly handles grouping and plotting within a single chain, using `nest()` or `group_split()` and `map()` for compartmentalized operations, which organizes the workflow into a coherent sequence. + +tidyverse's `map()` is a powerful functional language tool, which can return arbitrary types, such as `map_int`, `map_char`, `map_lgl`.It is integrated into the data manipulation workflow, making it part of the data pipeline. + +#### Multi-parameter filtering + +**Base R approach:** + +```{r, eval=FALSE} +## # Mitochondrial transcription +qc_mitochondrial_transcription <- colData(spatial_data)$subsets_mito_percent > 30 +colData(spatial_data)$qc_mitochondrial_transcription <- qc_mitochondrial_transcription + +# ## Select library size threshold +qc_total_counts <- colData(spatial_data)$sum < 700 +colData(spatial_data)$qc_total_counts <- qc_total_counts + +# ## Select expressed genes threshold +qc_detected_genes <- colData(spatial_data)$detected < 500 +colData(spatial_data)$qc_detected_genes <- qc_detected_genes + +# ## Find combination to filter +colData(spatial_data)$discard <- qc_total_counts | qc_detected_genes | qc_mitochondrial_transcription + +# # Filter +spatial_data = spatial_data[,!colData(spatial_data)$discard ] +``` + +**Tidyverse Approach:** + +```{r} + +spatial_data_filtered = + spatial_data |> + mutate( + discard = + subsets_mito_percent > 30 | + sum < 700 | + detected < 500 + ) |> + filter(!discard) +``` + +**Specific Differences and Advantages:** + +**Tidyverse:** The code directly applies multiple filtering conditions within a single filter() function, making it highly readable and concise. The conditions are clearly laid out, and the operation directly modifies the spatial_data dataframe. This approach is more intuitive for managing complex filters as it condenses them into a singular functional expression. + +**Base R:** The approach first calculates each condition and stores them within the colData of the dataset. These conditions are then combined to create a logical vector that flags rows to discard. Finally, it subsets the data by removing rows that meet any of the discard conditions. This method is more verbose and requires manually handling intermediate logical vectors, which can introduce errors and complexity in tracking multiple data transformations. + +**Why tidyverse might be better in this context:** + +**Coding efficiency:** `tidyverse` chains operations, reducing the need for intermediate variables and making the code cleaner and less error-prone. + +**Readability:** The filter conditions are all in one place, which simplifies understanding what the code does at a glance, especially for users familiar with the tidyverse syntax. + +**Maintainability:** Fewer and self-explanatory lines of code and no need for intermediate steps make the code easier to maintain and modify, especially when conditions change or additional filters are needed. + + +### 5. Visualisation + +Here, we will show how to use ad-hoc spatial visualisation, as well as `ggplot` to explore spatial data we will show how `tidySpatialExperiment` allowed to alternate between tidyverse visualisation, and any visualisation compatible with `SpatialExperiment`. + +#### Ad-hoc visualisation: Plotting the regions + +Let’s visualise the RNA output. + +```{r} +spatial_data_filtered |> + # Plot our gate + plotVisium( + annotate = "sum", + highlight = "in_tissue", + legend_position = "none" + ) +``` + +#### Custom visualisation: Plotting the regions + + + +```{r} +spatial_data |> + ggplot(aes(array_row, array_col)) + + geom_point(aes(color = sum)) + + facet_wrap(~sample_id) + + scale_color_distiller(palette = "Spectral") + + theme(legend.position = "none") +``` + +#### Custom visualisation: Plotting RNA output + +Now, let's observe what is the difference in total transcriptional cell output across regions. We can appreciate that different regions of these Visium slide is characterised by significantly different total RNA output. For example, the region one has a low R&D output, on the contrary regions to an L3, characterised by a high RNA output. + +We could conclude that when we use thresholding to filter "low-quality" pixels we have to be careful about possible biological and spatial effects. + +```{r, fig.width=7, fig.height=4} + +spatial_data_filtered |> + ggplot(aes(sum, color = .gated)) + + geom_density() + + facet_wrap(~sample_id) + + scale_x_log10() + + theme_bw() + +``` + +We provide another example of how the use of tidy. Spatial experiment makes custom visualisation, very easy and intuitive, leveraging `ggplot` functionalities. We will observe the relationship between mitochondrial transcription percentage, and total gene counts. We expect this relationship to be inverse as cells with higher mitochondrial transcription percentage tent to have a more limited transcriptional gene pool (e.g. for dieying or damaged cells). + +```{r, fig.width=7, fig.height=8} + +spatial_data_filtered |> + ggplot(aes(subsets_mito_percent, sum)) + + geom_point(aes(color = .gated), size=0.2) + + stat_ellipse(aes(group = .gated), alpha = 0.3) + + scale_y_log10() + + theme_bw() + +``` + + + +As you can appreciate, the relationship between the RNA output and the mitochondrial abundance per pixel it's quite consistent. + +:::: {.note} +**Excercise** + +To to practice the use of `tidyomics` on spatial data, we propose a few exercises that connect manipulation, calculations and visualisation. These exercises are just meant to be simple use cases that exploit tidy R streamlined language. + + +We assume that the cells we filtered as non-alive or damaged, characterised by being reached uniquely for mitochondrial, genes, and genes, linked to up ptosis. it is good practice to check these assumption. This exercise aims to estimate what genes are differentially expressed between filtered and unfiltered cells. Then visualise the results + +Use `tidyomic`s/`tidyverse` tools to label dead cells and perform differential expression within each region. Some of the comments you can use are: `mutate`, `nest`, `aggregate_cells`. +:::: + +**Solution** + +```{r, eval=FALSE} +library(tidySummarizedExperiment) +library(tidybulk) +library(spatialLIBD) + +libd_data <- fetch_data(type = "spe") + +differential_analysis = + libd_data |> + mutate( + dead = + + # Stringent threshold + subsets_mito_percent > 20 | + sum < 700 | + detected < 500 + ) |> + aggregate_cells(c(sample_id, .gated, dead)) |> + keep_abundant(factor_of_interest = c(dead)) |> + nest(data = - .gated) |> + + # filter regions having both alive and dead cells + filter( map_int(data, ~ .x |> distinct(sample_id, dead) |> nrow() ) == 6 ) |> + mutate(data = map( + data, + test_differential_abundance, + ~ dead + sample_id, + method = "edgeR_quasi_likelihood", + test_above_log2_fold_change = log(2) + )) + +differential_analysis |> + mutate(data = map(data, pivot_transcript)) |> + unnest(data) |> + filter(FDR<0.05) + +# tidybulk::test_differential_abundance(~ dead + sample_id + (1 | .gated), method = "glmmseq_lme4") +``` + +**Session Information** + +```{r} +sessionInfo() +``` + +**References** + +```{css echo=FALSE} +.note { + margin: 30px; + padding: 1em; + background: #FFF8F0; + border: 1px solid #EFE8E0; + border-radius: 10px; +} +``` \ No newline at end of file diff --git a/vignettes/supplementary_transcriptomics.Rmd b/vignettes/supplementary_transcriptomics.Rmd index ca33e17..16a9cf5 100644 --- a/vignettes/supplementary_transcriptomics.Rmd +++ b/vignettes/supplementary_transcriptomics.Rmd @@ -32,6 +32,7 @@ seurat_obj |> join_features( features = c("CD3D", "TRDC", "TRGC1", "TRGC2", "CD8A", "CD8B" ), shape = "wide" + ) |> mutate(signature_score = @@ -39,10 +40,9 @@ seurat_obj |> scales::rescale(CD8A + CD8B, to=c(0,1)) ) |> - mutate(gate = gate_int( - UMAP_1, UMAP_2, - .size = 0.1, - .color =signature_score + mutate(gate = gate(x= UMAP_1, y=UMAP_2, + size = 0.1, + colour = signature_score )) ``` @@ -62,11 +62,10 @@ seurat_obj |> scales::rescale(CD8A + CD8B, to=c(0,1)) ) |> - mutate(gate = gate_int( - UMAP_1, UMAP_2, - .size = 0.1, - .color =signature_score, - gate_list = tidyomicsWorkshop::gate_seurat_obj + mutate(gate = gate(x= UMAP_1, y=UMAP_2, + size = 0.1, + colour = signature_score, + programmatic_gates = tidyomicsWorkshop::gate_seurat_obj )) ``` @@ -88,7 +87,17 @@ seurat_obj_gamma_delta <- scales::rescale(CD8A + CD8B, to=c(0,1)) ) |> - mutate(gate = gate_int(UMAP_1, UMAP_2, gate_list = tidyomicsWorkshop::gate_seurat_obj)) |> + mutate(gate = gate(UMAP_1, UMAP_2, programmatic_gates = tidyomicsWorkshop::gate_seurat_obj)) |> filter(gate == 1) ``` + +## CuratedAtlasQuery + +We will explore the tidy interface for the CELLxGENE harmonised data exploration and download. Please install `CuratedAtlasQuery` + +```{r, eval=FALSE} +job::job({ + remotes::install_github("stemangiola/CuratedAtlasQueryR") + }) +```