diff --git a/apis/r/DESCRIPTION b/apis/r/DESCRIPTION index 7542859fcc..a970f03dab 100644 --- a/apis/r/DESCRIPTION +++ b/apis/r/DESCRIPTION @@ -57,6 +57,7 @@ LinkingTo: Rcpp, RcppInt64 Suggests: + anndataR, BPCells, datasets, iterators, diff --git a/apis/r/NAMESPACE b/apis/r/NAMESPACE index 03016d5f6e..9f918a0b0d 100644 --- a/apis/r/NAMESPACE +++ b/apis/r/NAMESPACE @@ -21,6 +21,7 @@ S3method(names,MappingBase) S3method(r_type_from_arrow_type,DataType) S3method(r_type_from_arrow_type,Field) S3method(r_type_from_arrow_type,Schema) +S3method(write_soma,AbstractAnnData) S3method(write_soma,Assay) S3method(write_soma,Assay5) S3method(write_soma,DataFrame) diff --git a/apis/r/R/SOMAExperimentAxisQuery.R b/apis/r/R/SOMAExperimentAxisQuery.R index 8d1992f974..2158ab2317 100644 --- a/apis/r/R/SOMAExperimentAxisQuery.R +++ b/apis/r/R/SOMAExperimentAxisQuery.R @@ -1330,6 +1330,248 @@ SOMAExperimentAxisQuery <- R6::R6Class( # Validate and return methods::validObject(sce) return(sce) + }, + + #' Export to AnnData + #' + #' Export query results as an `AnnData` object. This method is + #' a convenience wrapper around `anndataR::AnnData()`. + #' + #' @param X_layers A named character vector specifying the X layers + #' to export. If `NULL`, exports all available layers. + #' @param obs_index Name of the cell-level metadata column to use as + #' cell names in the exported object. If `NULL`, defaults to + #' `paste0("obs", soma_joinid)`. + #' @param var_index Name of the feature-level metadata column to use as + #' feature names in the exported object. If `NULL`, defaults to + #' `paste0("var", soma_joinid)`. + #' @param obs_column_names Names of cell-level metadata columns to export. + #' If `NULL`, defaults to all columns. If `FALSE`, no metadata is exported. + #' @param var_column_names Names of feature-level metadata columns to export. + #' If `NULL`, defaults to all columns. If `FALSE`, no metadata is exported. + #' @param obsm_layers Names of observation matrices to export. + #' If `NULL`, defaults to all available matrices. If `FALSE`, none are exported. + #' @param varm_layers Names of variable matrices to export. + #' If `NULL`, defaults to all available matrices. If `FALSE`, none are exported. + #' @param obsp_layers Names of observation pairwise matrices to export. + #' If `NULL`, defaults to all available matrices. If `FALSE`, none are exported. + #' @param varp_layers Names of variable pairwise matrices to export. + #' If `NULL`, defaults to all available matrices. If `FALSE`, none are exported. + #' @param drop_levels Remove unused factor levels from categorical columns. + #' @param ... Additional arguments passed to `anndataR::AnnData()`. + #' + #' @return An `AnnData` object containing the query results. + #' + #' @export + to_anndata = function( + X_layers = NULL, + obs_index = NULL, + var_index = NULL, + obs_column_names = NULL, + var_column_names = NULL, + obsm_layers = NULL, + varm_layers = NULL, + obsp_layers = NULL, + varp_layers = NULL, + drop_levels = FALSE, + ... + ) { + check_package("anndataR") + stopifnot( + "'X_layers' must be a character vector" = is_character_or_null(X_layers), + "'obs_index' must be a single character value" = is.null(obs_index) || + (is_scalar_character(obs_index) && !is.na(obs_index)), + "'var_index' must be a single character value" = is.null(var_index) || + (is_scalar_character(var_index) && !is.na(var_index)), + "'obs_column_names' must be a character vector" = is.null(obs_column_names) || + is.character(obs_column_names) || + is_scalar_logical(obs_column_names), + "'var_column_names' must be a character vector" = is.null(var_column_names) || + is.character(var_column_names) || + is_scalar_logical(var_column_names), + "'obsm_layers' must be a character vector" = is.null(obsm_layers) || + is.character(obsm_layers) || + is_scalar_logical(obsm_layers), + "'varm_layers' must be a character vector" = is.null(varm_layers) || + is.character(varm_layers) || + is_scalar_logical(varm_layers), + "'obsp_layers' must be a character vector" = is.null(obsp_layers) || + is.character(obsp_layers) || + is_scalar_logical(obsp_layers), + "'varp_layers' must be a character vector" = is.null(varp_layers) || + is.character(varp_layers) || + is_scalar_logical(varp_layers), + "'drop_levels' must be TRUE or FALSE" = isTRUE(drop_levels) || + isFALSE(drop_levels) + ) + + # Load in colData (obs) + obs <- private$.load_df( + "obs", + index = obs_index, + column_names = obs_column_names, + drop_levels = drop_levels + ) + + # Load in rowData (var) + var <- private$.load_df( + "var", + index = var_index, + column_names = var_column_names, + drop_levels = drop_levels + ) + + # Check the layers + X_layers <- pad_names(X_layers %||% self$ms$X$names()) + assert_subset(x = X_layers, y = self$ms$X$names(), type = "X_layer") + + # Read in the X matrix (main data matrix) + X <- if (length(X_layers) > 0) { + # Use the first layer as the main X matrix + layer_name <- X_layers[1] + mat <- Matrix::t(self$to_sparse_matrix( + collection = "X", + layer_name = layer_name + )) + dimnames(mat) <- list(row.names(obs), row.names(var)) + mat + } else { + NULL + } + + # Read in the remaining layers + layers <- if (length(X_layers) > 1) { + layers_list <- lapply( + X = X_layers[-1], # Skip first layer as it's used for X + FUN = function(layer) { + mat <- Matrix::t(self$to_sparse_matrix( + collection = "X", + layer_name = layer + )) + dimnames(mat) <- list(row.names(obs), row.names(var)) + return(mat) + } + ) + names(layers_list) <- names(X_layers)[-1] + layers_list + } else { + NULL + } + + # Load obsm matrices (observation embeddings/reduced dimensions) + obsm <- if (!isFALSE(obsm_layers)) { + ms_obsm <- tryCatch(expr = self$ms$obsm$names(), error = function(e) NULL) + if (!is.null(ms_obsm)) { + obsm_layers <- obsm_layers %||% ms_obsm + obsm_layers <- pad_names(obsm_layers) + assert_subset(x = obsm_layers, y = ms_obsm, type = "observation matrix") + + obsm_list <- lapply( + X = obsm_layers, + FUN = function(layer) { + mat <- private$.load_m_axis(layer = layer, m_axis = "obsm") + dimnames(mat) <- list(row.names(obs), NULL) + return(mat) + } + ) + names(obsm_list) <- names(obsm_layers) + obsm_list + } else { + NULL + } + } else { + NULL + } + + # Load varm matrices (variable embeddings/feature loadings) + varm <- if (!isFALSE(varm_layers)) { + ms_varm <- tryCatch(expr = self$ms$varm$names(), error = function(e) NULL) + if (!is.null(ms_varm)) { + varm_layers <- varm_layers %||% ms_varm + varm_layers <- pad_names(varm_layers) + assert_subset(x = varm_layers, y = ms_varm, type = "variable matrix") + + varm_list <- lapply( + X = varm_layers, + FUN = function(layer) { + mat <- private$.load_m_axis(layer = layer, m_axis = "varm") + dimnames(mat) <- list(row.names(var), NULL) + return(mat) + } + ) + names(varm_list) <- names(varm_layers) + varm_list + } else { + NULL + } + } else { + NULL + } + + # Load obsp matrices (observation pairwise relationships) + obsp <- if (!isFALSE(obsp_layers)) { + ms_obsp <- tryCatch(expr = self$ms$obsp$names(), error = function(e) NULL) + if (!is.null(ms_obsp)) { + obsp_layers <- obsp_layers %||% ms_obsp + obsp_layers <- pad_names(obsp_layers) + assert_subset(x = obsp_layers, y = ms_obsp, type = "observation pairwise matrix") + + obsp_list <- lapply( + X = obsp_layers, + FUN = function(layer) { + mat <- private$.load_p_axis(layer = layer, p_axis = "obsp", repr = "C") + dimnames(mat) <- list(row.names(obs), row.names(obs)) + return(mat) + } + ) + names(obsp_list) <- names(obsp_layers) + obsp_list + } else { + NULL + } + } else { + NULL + } + + # Load varp matrices (variable pairwise relationships) + varp <- if (!isFALSE(varp_layers)) { + ms_varp <- tryCatch(expr = self$ms$varp$names(), error = function(e) NULL) + if (!is.null(ms_varp)) { + varp_layers <- varp_layers %||% ms_varp + varp_layers <- pad_names(varp_layers) + assert_subset(x = varp_layers, y = ms_varp, type = "variable pairwise matrix") + + varp_list <- lapply( + X = varp_layers, + FUN = function(layer) { + mat <- private$.load_p_axis(layer = layer, p_axis = "varp", repr = "C") + dimnames(mat) <- list(row.names(var), row.names(var)) + return(mat) + } + ) + names(varp_list) <- names(varp_layers) + varp_list + } else { + NULL + } + } else { + NULL + } + + # Create the AnnData object + adata <- anndataR::AnnData( + X = X, + obs = obs, + var = var, + layers = layers, + obsm = obsm, + varm = varm, + obsp = obsp, + varp = varp, + ... + ) + + return(adata) } ), active = list( diff --git a/apis/r/R/write_anndata.R b/apis/r/R/write_anndata.R new file mode 100644 index 0000000000..4942805db0 --- /dev/null +++ b/apis/r/R/write_anndata.R @@ -0,0 +1,352 @@ +#' Write an AnnData object to a SOMA +#' +#' Convert an AnnData object to its SOMA counterpart and save it. +#' +#' @param x An `AnnData` object from the `anndataR` package. +#' @param uri URI for the resulting SOMA object. +#' @inheritParams write_soma_objects +#' +#' @return The URI to the resulting [`SOMAExperiment`] generated from +#' the data contained in `x`. +#' +#' @section Writing AnnData objects: +#' AnnData objects are written out as [`SOMAExperiment`] objects: +#' \itemize{ +#' \item `obs` is written out as a [`SOMADataFrame`] called "obs" at +#' the experiment level. +#' \item `var` is written out as a [`SOMADataFrame`] called "var" within +#' the measurement. +#' \item `X` matrix is written out as a [`SOMASparseNDArray`] called +#' "X" within the measurement's X group. +#' \item `layers` are written out as [`SOMASparseNDArray`] objects within +#' the measurement's X group. +#' \item `obsm` matrices are written out as [`SOMASparseNDArray`] objects +#' within the measurement's obsm group. +#' \item `varm` matrices are written out as [`SOMASparseNDArray`] objects +#' within the measurement's varm group. +#' \item `obsp` matrices are written out as [`SOMASparseNDArray`] objects +#' within the measurement's obsp group. +#' \item `varp` matrices are written out as [`SOMASparseNDArray`] objects +#' within the measurement's varp group. +#' } +#' Expression matrices are transposed (cells as rows) prior to writing. +#' +#' @method write_soma AbstractAnnData +#' @export +#' +#' @examplesIf requireNamespace("withr", quietly = TRUE) && requireNamespace("anndataR", quietly = TRUE) +#' \donttest{ +#' uri <- withr::local_tempfile(pattern = "anndata") +#' +#' # Create a simple AnnData object +#' adata <- anndataR::AnnData( +#' X = matrix(1:15, nrow = 3), +#' obs = data.frame(cell_type = c("A", "B", "C")), +#' var = data.frame(gene_name = paste0("gene_", 1:5)) +#' ) +#' +#' uri <- write_soma(adata, uri) +#' +#' (exp <- SOMAExperimentOpen(uri)) +#' exp$obs +#' (ms <- exp$ms$get("RNA")) +#' ms$var +#' ms$X$names() +#' +#' exp$close() +#' } +#' +write_soma.AbstractAnnData <- function( + x, + uri, + ms_name = "RNA", + ..., + ingest_mode = "write", + platform_config = NULL, + tiledbsoma_ctx = NULL +) { + check_package("anndataR") + stopifnot( + "'uri' must be a single character value" = is.null(uri) || + is_scalar_character(uri), + "'ms_name' must be a single character value" = is_scalar_character(ms_name) && + nzchar(ms_name), + "'x' must be an AbstractAnnData object" = inherits(x, "AbstractAnnData") + ) + ingest_mode <- match.arg(arg = ingest_mode, choices = c("write", "resume")) + + # Create the experiment + experiment <- SOMAExperimentCreate( + uri = uri, + ingest_mode = ingest_mode, + platform_config = platform_config, + tiledbsoma_ctx = tiledbsoma_ctx + ) + on.exit(experiment$close(), add = TRUE, after = FALSE) + + # Write cell-level metadata (obs) + spdl::info("Adding obs") + obs_df <- x$obs + if (is.null(rownames(obs_df))) { + rownames(obs_df) <- paste0("obs", seq_len(nrow(obs_df))) + } + obs_df <- .df_index(obs_df, axis = "obs") + obs_df[[attr(obs_df, "index")]] <- rownames(x$obs) + + write_soma( + x = obs_df, + uri = "obs", + soma_parent = experiment, + key = "obs", + ingest_mode = ingest_mode, + platform_config = platform_config, + tiledbsoma_ctx = tiledbsoma_ctx + ) + + # Create measurements collection + spdl::info("Creating measurements collection") + expms <- SOMACollectionCreate( + file_path(experiment$uri, "ms"), + ingest_mode = ingest_mode, + platform_config = platform_config, + tiledbsoma_ctx = tiledbsoma_ctx + ) + withCallingHandlers( + expr = .register_soma_object(expms, soma_parent = experiment, key = "ms"), + existingKeyWarning = .maybe_muffle + ) + + # Create measurement + ms_uri <- .check_soma_uri(uri = ms_name, soma_parent = expms) + ms <- SOMAMeasurementCreate( + uri = ms_uri, + ingest_mode = ingest_mode, + platform_config = platform_config, + tiledbsoma_ctx = tiledbsoma_ctx + ) + on.exit(ms$close(), add = TRUE, after = FALSE) + + # Write feature-level metadata (var) + spdl::info("Adding var") + var_df <- x$var + if (is.null(rownames(var_df))) { + rownames(var_df) <- paste0("var", seq_len(nrow(var_df))) + } + var_df <- .df_index(var_df, axis = "var") + var_df[[attr(var_df, "index")]] <- rownames(x$var) + + write_soma( + x = var_df, + uri = "var", + soma_parent = ms, + key = "var", + ingest_mode = ingest_mode, + platform_config = platform_config, + tiledbsoma_ctx = tiledbsoma_ctx + ) + + # Create X collection + X <- if (!"X" %in% ms$names()) { + SOMACollectionCreate( + file_path(ms$uri, "X"), + ingest_mode = ingest_mode, + platform_config = platform_config, + tiledbsoma_ctx = tiledbsoma_ctx + ) + } else { + SOMACollectionOpen(file_path(ms$uri, "X"), mode = "WRITE") + } + withCallingHandlers( + .register_soma_object(X, soma_parent = ms, key = "X"), + existingKeyWarning = .maybe_muffle + ) + on.exit(X$close(), add = TRUE, after = FALSE) + + # Write main X matrix + if (!is.null(x$X)) { + spdl::info("Adding X matrix") + write_soma( + x = x$X, + uri = "X", + soma_parent = X, + sparse = TRUE, + transpose = TRUE, # AnnData uses genes x cells, SOMA uses cells x genes + key = "X", + ingest_mode = ingest_mode, + platform_config = platform_config, + tiledbsoma_ctx = tiledbsoma_ctx + ) + } + + # Write layers + if (length(x$layers_keys()) > 0) { + spdl::info("Adding layers") + for (layer_name in x$layers_keys()) { + spdl::info("Adding layer {}", layer_name) + layer_data <- x$layers[[layer_name]] + write_soma( + x = layer_data, + uri = layer_name, + soma_parent = X, + sparse = TRUE, + transpose = TRUE, # AnnData uses genes x cells, SOMA uses cells x genes + key = layer_name, + ingest_mode = ingest_mode, + platform_config = platform_config, + tiledbsoma_ctx = tiledbsoma_ctx + ) + } + } + + # Write obsm matrices + if (length(x$obsm_keys()) > 0) { + spdl::info("Adding obsm matrices") + obsm <- if (!"obsm" %in% ms$names()) { + SOMACollectionCreate( + file_path(ms$uri, "obsm"), + ingest_mode = ingest_mode, + platform_config = platform_config, + tiledbsoma_ctx = tiledbsoma_ctx + ) + } else { + SOMACollectionOpen(file_path(ms$uri, "obsm"), mode = "WRITE") + } + withCallingHandlers( + .register_soma_object(obsm, soma_parent = ms, key = "obsm"), + existingKeyWarning = .maybe_muffle + ) + on.exit(obsm$close(), add = TRUE, after = FALSE) + + for (obsm_name in x$obsm_keys()) { + spdl::info("Adding obsm matrix {}", obsm_name) + obsm_data <- x$obsm[[obsm_name]] + write_soma( + x = obsm_data, + uri = obsm_name, + soma_parent = obsm, + sparse = TRUE, + transpose = FALSE, # obsm should keep original orientation + key = obsm_name, + ingest_mode = ingest_mode, + platform_config = platform_config, + tiledbsoma_ctx = tiledbsoma_ctx + ) + } + } + + # Write varm matrices + if (length(x$varm_keys()) > 0) { + spdl::info("Adding varm matrices") + varm <- if (!"varm" %in% ms$names()) { + SOMACollectionCreate( + file_path(ms$uri, "varm"), + ingest_mode = ingest_mode, + platform_config = platform_config, + tiledbsoma_ctx = tiledbsoma_ctx + ) + } else { + SOMACollectionOpen(file_path(ms$uri, "varm"), mode = "WRITE") + } + withCallingHandlers( + .register_soma_object(varm, soma_parent = ms, key = "varm"), + existingKeyWarning = .maybe_muffle + ) + on.exit(varm$close(), add = TRUE, after = FALSE) + + for (varm_name in x$varm_keys()) { + spdl::info("Adding varm matrix {}", varm_name) + varm_data <- x$varm[[varm_name]] + write_soma( + x = varm_data, + uri = varm_name, + soma_parent = varm, + sparse = TRUE, + transpose = FALSE, # varm should keep original orientation + key = varm_name, + ingest_mode = ingest_mode, + platform_config = platform_config, + tiledbsoma_ctx = tiledbsoma_ctx + ) + } + } + + # Write obsp matrices + if (length(x$obsp_keys()) > 0) { + spdl::info("Adding obsp matrices") + obsp <- if (!"obsp" %in% ms$names()) { + SOMACollectionCreate( + file_path(ms$uri, "obsp"), + ingest_mode = ingest_mode, + platform_config = platform_config, + tiledbsoma_ctx = tiledbsoma_ctx + ) + } else { + SOMACollectionOpen(file_path(ms$uri, "obsp"), mode = "WRITE") + } + withCallingHandlers( + .register_soma_object(obsp, soma_parent = ms, key = "obsp"), + existingKeyWarning = .maybe_muffle + ) + on.exit(obsp$close(), add = TRUE, after = FALSE) + + for (obsp_name in x$obsp_keys()) { + spdl::info("Adding obsp matrix {}", obsp_name) + obsp_data <- x$obsp[[obsp_name]] + write_soma( + x = obsp_data, + uri = obsp_name, + soma_parent = obsp, + sparse = TRUE, + transpose = FALSE, # obsp should keep original orientation + key = obsp_name, + ingest_mode = ingest_mode, + platform_config = platform_config, + tiledbsoma_ctx = tiledbsoma_ctx + ) + } + } + + # Write varp matrices + if (length(x$varp_keys()) > 0) { + spdl::info("Adding varp matrices") + varp <- if (!"varp" %in% ms$names()) { + SOMACollectionCreate( + file_path(ms$uri, "varp"), + ingest_mode = ingest_mode, + platform_config = platform_config, + tiledbsoma_ctx = tiledbsoma_ctx + ) + } else { + SOMACollectionOpen(file_path(ms$uri, "varp"), mode = "WRITE") + } + withCallingHandlers( + .register_soma_object(varp, soma_parent = ms, key = "varp"), + existingKeyWarning = .maybe_muffle + ) + on.exit(varp$close(), add = TRUE, after = FALSE) + + for (varp_name in x$varp_keys()) { + spdl::info("Adding varp matrix {}", varp_name) + varp_data <- x$varp[[varp_name]] + write_soma( + x = varp_data, + uri = varp_name, + soma_parent = varp, + sparse = TRUE, + transpose = FALSE, # varp should keep original orientation + key = varp_name, + ingest_mode = ingest_mode, + platform_config = platform_config, + tiledbsoma_ctx = tiledbsoma_ctx + ) + } + } + + withCallingHandlers( + .register_soma_object(ms, soma_parent = expms, key = ms_name), + existingKeyWarning = .maybe_muffle + ) + + return(experiment$uri) +} + diff --git a/apis/r/tests/testthat/test-17-write-anndata.R b/apis/r/tests/testthat/test-17-write-anndata.R new file mode 100644 index 0000000000..e66513ce2c --- /dev/null +++ b/apis/r/tests/testthat/test-17-write-anndata.R @@ -0,0 +1,47 @@ +test_that("to_anndata with obs_index parameter", { + skip_if(!extended_tests()) + skip_if_not_installed("anndataR") + + # Create test data with string cell names + # AnnData expects cells x genes format + X <- matrix(rpois(50, 3), nrow = 5, ncol = 10, + dimnames = list(paste0("cell", 1:5), paste0("gene", 1:10))) + + obs <- data.frame( + cell_id = paste0("cell_", 1:5), + cell_type = c("A", "B", "C", "A", "B"), + n_genes = c(100, 150, 120, 80, 110), + row.names = rownames(X) + ) + + var <- data.frame( + gene_name = colnames(X), + highly_variable = c(rep(TRUE, 5), rep(FALSE, 5)), + row.names = colnames(X) + ) + + # Create and write AnnData object + adata <- anndataR::AnnData( + X = Matrix::Matrix(X, sparse = TRUE), + obs = obs, + var = var + ) + + uri <- tempfile(pattern = "to-anndata-obs-index-test") + experiment_uri <- write_soma(adata, uri) + exp <- SOMAExperimentOpen(experiment_uri) + + # Create a query object + query <- SOMAExperimentAxisQuery$new(exp, "RNA") + + # Test with obs_index parameter + expect_no_condition(adata_result <- query$to_anndata(obs_index = "cell_id")) + expect_s3_class(adata_result, "AbstractAnnData") + + # Verify that the obs index was used + obs_result <- as.data.frame(adata_result$obs) + expect_true(all(paste0("cell_", 1:5) %in% rownames(obs_result))) + + # Clean up + exp$close() +})