diff --git a/Project.toml b/Project.toml index 6b08173..f3e6fc3 100644 --- a/Project.toml +++ b/Project.toml @@ -3,13 +3,29 @@ uuid = "94e1309d-ccf4-42de-905f-515f1d7b1cae" authors = ["Dilum Aluthge", "contributors"] version = "2.0.0" +[deps] +FeatureTransforms = "8fd68953-04b8-4117-ac19-158bf6de9782" +InlineStrings = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" +OMOPCommonDataModel = "ba65db9e-6590-4054-ab8a-101ed9124986" +PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" + [weakdeps] +DBInterface = "a10d1c49-ce27-4219-8d33-6db1a4562965" +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" DrWatson = "634d3b9d-ee7a-5ddf-bec9-22491ea816e1" +DuckDB = "d2f5444f-75bc-4fdf-ac35-56f514c445e1" +Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" [extensions] HealthBaseDrWatsonExt = "DrWatson" +HealthBaseOMOPCDMExt = ["DataFrames", "OMOPCommonDataModel", "InlineStrings", "Serialization", "Dates", "FeatureTransforms", "DBInterface", "DuckDB"] [compat] +Dates = "1.10" +PrettyTables = "2.4.0" +Tables = "1.12.1" julia = "1.10" [extras] diff --git a/assets/version_info b/assets/version_info new file mode 100644 index 0000000..62b6504 Binary files /dev/null and b/assets/version_info differ diff --git a/docs/Project.toml b/docs/Project.toml index 2b39e2a..75bbdbb 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,21 @@ [deps] +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" DocumenterTools = "35a29f4d-8980-5a13-9543-d66fff28ecb8" +DuckDB = "d2f5444f-75bc-4fdf-ac35-56f514c445e1" +FeatureTransforms = "8fd68953-04b8-4117-ac19-158bf6de9782" HealthBase = "94e1309d-ccf4-42de-905f-515f1d7b1cae" LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589" +OMOPCommonDataModel = "ba65db9e-6590-4054-ab8a-101ed9124986" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" + +[compat] +Documenter = "1" +DocumenterTools = "0.1.10" 
+HealthBase = "1, 2" +LiveServer = "1" +julia = "1.10" +DuckDB = "1" +FeatureTransforms = "0.4.0" +OMOPCommonDataModel = "0.1" +Tables = "1.12.1" diff --git a/docs/make.jl b/docs/make.jl index b98964a..c398233 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,11 +1,21 @@ using HealthBase using Documenter +using Tables +using DataFrames +using OMOPCommonDataModel +using FeatureTransforms +using DuckDB -DocMeta.setdocmeta!(HealthBase, :DocTestSetup, :(using HealthBase); recursive = true) +DocMeta.setdocmeta!(HealthBase, :DocTestSetup, :(using HealthBase, Tables); recursive = true) makedocs(; - modules = [HealthBase], - authors = "Jacob S. Zelko, Dilum Aluthge and contributors", +modules = [ + HealthBase, + isdefined(Base, :get_extension) ? + Base.get_extension(HealthBase, :HealthBaseOMOPCDMExt) : HealthBase.HealthBaseOMOPCDMExt + ], + checkdocs = :none, + authors = "Jacob S. Zelko, Dilum Aluthge and contributors", repo = "https://github.com/JuliaHealth/HealthBase.jl/blob/{commit}{path}#{line}", sitename = "HealthBase.jl", format = Documenter.HTML(; @@ -15,7 +25,18 @@ makedocs(; ), pages = [ "Home" => "index.md", - "Workflow Guides" => ["observational_template_workflow.md"], + "Quickstart" => "quickstart.md", + + "Workflow Guides" => [ + "Observational Template Workflow" => "observational_template_workflow.md", + "OMOP CDM Workflow" => "OMOPCDMWorkflow.md", + ], + + "HealthTable System" => [ + "HealthTable: General Tables.jl Interface" => "HealthTableGeneral.md", + "HealthTable: OMOP CDM Support" => "HealthTableOMOPCDM.md", + "HealthTable: Preprocessing Functions" => "HealthTablePreprocessing.md", + ], "API" => "api.md", ], # TODO: Update and configure doctests before next release diff --git a/docs/src/HealthTableGeneral.md b/docs/src/HealthTableGeneral.md new file mode 100644 index 0000000..5ff3c3f --- /dev/null +++ b/docs/src/HealthTableGeneral.md @@ -0,0 +1,27 @@ +# HealthTable: Tables.jl Interface (General) + +## The `HealthTable` Struct + +The core of the 
interface is the `HealthTable` struct. + +```@docs +HealthBase.HealthTable +``` + +## `Tables.jl` API Implementation + +The `HealthTable` wrapper types will implement key `Tables.jl` methods: + +`HealthTable` implements the `Tables.jl` interface to ensure compatibility with the Julia data ecosystem: + +```@docs +Tables.istable(::Type{<:HealthBase.HealthTable}) +Tables.rowaccess(::Type{<:HealthBase.HealthTable}) +Tables.rows(::HealthBase.HealthTable) +Tables.columnaccess(::Type{<:HealthBase.HealthTable}) +Tables.columns(::HealthBase.HealthTable) +Tables.schema(::HealthBase.HealthTable) +Tables.materializer(::Type{<:HealthBase.HealthTable}) +``` + +Source: https://tables.juliadata.org/stable/implementing-the-interface/ diff --git a/docs/src/HealthTableOMOPCDM.md b/docs/src/HealthTableOMOPCDM.md new file mode 100644 index 0000000..57d144c --- /dev/null +++ b/docs/src/HealthTableOMOPCDM.md @@ -0,0 +1,35 @@ +# OMOP CDM Support for HealthTable + +## Core Goals & Features + +The `HealthTable` interface in `HealthBase.jl` is designed to make working with OMOP CDM data in Julia easy, robust, and compatible with the `Tables.jl` ecosystem. The key features include: + +- **Schema-Aware Validation**: Instead of just wrapping your data, `HealthTable` actively validates it against the official OMOP CDM specification using `OMOPCommonDataModel.jl`. This includes: + - **Column Type Enforcement**: Verifies that column types in the input `DataFrame` match the official OMOP schema (e.g., `person_id` is `Int64`, `condition_start_date` is `Date`). + - **Clear Error Reporting**: If mismatches exist, the constructor returns detailed messages about all invalid columns or can emit warnings if type enforcement is disabled. + - **Metadata Attachment**: Attaches OMOP metadata (like `cdmDatatype`, `standardConcept`, etc.) directly to each validated column. 
+ +- **Preprocessing Utilities**: Built-in tools for data preparation include: + - `one_hot_encode`: One-hot encodes categorical variables using `FeatureTransforms.jl`. + - `apply_vocabulary_compression`: Groups rare categorical values under a shared `"Other"` label. + - `map_concepts`: Maps concept IDs to human-readable concept names using a DuckDB-backed `concept` table. + - `map_concepts!`: An in-place variant of concept mapping that modifies the existing table. + +- **Tables.jl Compatibility**: The `HealthTable` type implements the full `Tables.jl` interface so it can be used with any downstream package in the Julia data ecosystem. + +- **JuliaHealth Integration**: Designed to interoperate seamlessly with current and future JuliaHealth tools and projects. + +- **Extensible Foundation**: The core architecture is extensible; future support could include streaming, direct DuckDB views, or remote OMOP datasets. + + +## `Tables.jl` Interface Sketch + +The `HealthTable` type is the main interface for working with OMOP CDM tables. You construct it by passing in a `DataFrame` and optionally specifying a CDM version. The constructor will validate the schema and attach metadata. The resulting object: + +- Is a wrapper over the validated DataFrame (`ht.source`), +- Provides schema-aware access to data, +- Can be used anywhere a `Tables.jl`-compatible table is expected. + +This eliminates the need for a separate wrapping step: the constructor itself ensures conformance and returns a ready-to-use tabular object. + +In future extensions, similar wrappers could be created for other data sources, such as database queries or streaming sources. These types would implement the same `Tables.jl` interface to support composable workflows. 
\ No newline at end of file diff --git a/docs/src/HealthTablePreprocessing.md b/docs/src/HealthTablePreprocessing.md new file mode 100644 index 0000000..ca631f4 --- /dev/null +++ b/docs/src/HealthTablePreprocessing.md @@ -0,0 +1,37 @@ +# HealthTable: Preprocessing Functions + +This page documents the preprocessing and transformation functions available for `HealthTable` objects when working with OMOP CDM data. These functions are provided by the OMOP CDM extension and enable data preparation workflows for machine learning and analysis. + +## One-Hot Encoding + +Transform categorical variables into binary indicator columns suitable for machine learning algorithms. + +```@docs +HealthBase.one_hot_encode +``` + +## Vocabulary Compression + +Reduce the dimensionality of categorical variables by grouping infrequent levels under a common label. + +```@docs +HealthBase.apply_vocabulary_compression +``` + +## Concept Translation + +### Concept Mapping (Immutable) + +Map OMOP concept IDs to human-readable concept names using the OMOP vocabulary tables, returning a new `HealthTable`. + +```@docs +HealthBase.map_concepts +``` + +### Concept Mapping (In-Place) + +In-place version of concept mapping that modifies the original `HealthTable` directly for memory efficiency. + +```@docs +HealthBase.map_concepts! +``` diff --git a/docs/src/OMOPCDMWorkflow.md b/docs/src/OMOPCDMWorkflow.md new file mode 100644 index 0000000..be13ae2 --- /dev/null +++ b/docs/src/OMOPCDMWorkflow.md @@ -0,0 +1,76 @@ +# OMOP CDM Workflow with HealthTable + +## Typical Workflow + +The envisioned process for working with OMOP CDM data using the `HealthBase.jl` components typically follows these steps: + +1. **Data Loading** + Raw data is loaded into a suitable tabular structure, most commonly a `DataFrame`. + +2. **Validation and Wrapping with `HealthTable`** + The raw `DataFrame` is then wrapped using `HealthBase.HealthTable`. 
This constructor takes the `DataFrame` and uses the requested OMOP CDM version (the `omop_cdm_version` keyword, e.g., "v5.4.1") to validate its structure and column types against the OMOP CDM schema. + + - It checks if the column types are compatible with the expected OMOP CDM types (from `OMOPCommonDataModel.jl`). + - If `disable_type_enforcement = false` (the default), it throws an `ArgumentError` describing the mismatches; if `true`, the same report is emitted as a warning instead. No automatic type conversion is performed. + - It attaches metadata to columns indicating their OMOP CDM types. + - The result is a `HealthTable` instance that wraps the validated `DataFrame` and exposes the `Tables.jl` interface. + +3. **Interacting via `Tables.jl`** + Once wrapped, the `HealthTable` instance can be seamlessly used with any `Tables.jl`-compatible tools and standard `Tables.jl` functions. + +4. **Applying Preprocessing Utilities** + After wrapping, you can apply preprocessing steps essential for analysis or modeling. These include: + + - One-hot encoding + - Handling of high-cardinality categorical variables + - Concept mapping utilities + + These utilities usually return a modified `HealthTable` or a materialized `DataFrame` ready for downstream use. + +## Example Usage + +```julia +using DataFrames, OMOPCommonDataModel, InlineStrings, Serialization, Dates, FeatureTransforms, DBInterface, DuckDB +using HealthBase, Tables + +# Assume 'condition_occurrence_df' is a DataFrame loaded from a CSV/database +condition_occurrence_df = DataFrame( + condition_occurrence_id = [1, 2, 3], + person_id = [101, 102, 101], + condition_concept_id = [201826, 433736, 317009], + condition_start_date = [Date(2010,1,1), Date(2012,5,10), Date(2011,3,15)] + # ... other fields +) + +# Validate and wrap the DataFrame with HealthTable +ht_conditions = HealthTable(condition_occurrence_df; omop_cdm_version="v5.4.1") + +# 1. Schema Inspection +sch = Tables.schema(ht_conditions) +println("Schema Names: ", sch.names) +println("Schema Types: ", sch.types) +# This should output the names and types from the validated DataFrame + +# 2. 
Iteration (Rows) +for row in Tables.rows(ht_conditions) + # 'row' is a Tables.jl-compatible row; its fields are accessed by OMOP column name + println("Person ID: $(row.person_id), Condition: $(row.condition_concept_id)") +end + +# 3. Integration with other packages (example: MLJ.jl) +# 4. Materialization +# DataFrame(ht_conditions) +``` + +## Preprocessing and Utilities + +Preprocessing utilities can operate on `HealthTable` objects (or their materialized versions), leveraging the `Tables.jl` interface and schema awareness derived via `Tables.schema`. + +Examples include: + +- `one_hot_encode(ht::HealthTable; cols, drop_original=true, return_features_only=false)` +- `apply_vocabulary_compression(ht::HealthTable; cols, min_freq=10, other_label="Other", drop_original=false)` +- `map_concepts(ht::HealthTable, cols, conn::DuckDB.DB; new_cols=nothing, suffix="_mapped", drop_original=false, concept_table="concept", schema="main")` +- `map_concepts!(ht::HealthTable, cols, conn::DuckDB.DB; ...)` *(in-place version, same keyword arguments)* + +These functions follow the principle of user-triggered, optional transformations configurable via keyword arguments. diff --git a/docs/src/api.md b/docs/src/api.md index df5138f..b5f4b98 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -9,4 +9,12 @@ CurrentModule = HealthBase ```@autodocs Modules = [HealthBase] +Filter = t -> !(t in [HealthBase.HealthTable, + Base.getproperty(Tables, :columns), + Base.getproperty(Tables, :rows), + Base.getproperty(Tables, :schema), + Base.getproperty(Tables, :istable), + Base.getproperty(Tables, :rowaccess), + Base.getproperty(Tables, :columnaccess), + Base.getproperty(Tables, :materializer)]) ``` diff --git a/docs/src/quickstart.md b/docs/src/quickstart.md new file mode 100644 index 0000000..ebe547d --- /dev/null +++ b/docs/src/quickstart.md @@ -0,0 +1,158 @@ +# Quickstart + +Welcome to the **Quickstart** guide for [`HealthBase.jl`](https://github.com/JuliaHealth/HealthBase.jl)! 
+This guide walks you through setting up your Julia environment, creating example OMOP CDM data, validating it, and applying preprocessing steps using the `HealthTable` system. + +## Getting Started + +### Launch Julia and Enter Your Project Environment + +To get started: + +1. Open your terminal. +2. Navigate to your project folder (where `Project.toml` is located): + +```sh +cd path/to/your/project +``` + +3. Start Julia with the project activated: + +```sh +julia --project=. +``` + +4. (Optional for docs) For working on documentation: + +```sh +julia --project=docs +``` + +## 1. Load Packages + +Before loading `HealthBase`, you must first load some **trigger packages**. +These packages enable HealthBase's extensions, which power important features like type validation and concept mapping. + +> ⚠️ **Important:** Load the following packages **before** `using HealthBase`. +> Otherwise, some functions may not be available due to missing extensions. + +```julia +# First, load the trigger packages +using DataFrames, OMOPCommonDataModel, InlineStrings, Serialization, Dates, FeatureTransforms, DBInterface, DuckDB + +# Then, load HealthBase +using HealthBase +``` + +## 2. Create Example DataFrames + +We'll create two `DataFrame`s: + +- `good_df` - a minimal, valid slice of the OMOP _person_ table. +- `wrong_df` - intentionally invalid (wrong types & extra column) so you can see the constructor’s validation in action. 
+ +```julia +good_df = DataFrame( + person_id = 1:6, + gender_concept_id = [8507, 8507, 8532, 8532, 8507, 8532], + year_of_birth = [1980, 1995, 1990, 1975, 1988, 2001], + race_concept_id = [8527, 8515, 8527, 8516, 8527, 8516] +) + +# Invalid DataFrame to test validation +wrong_df = DataFrame( + person_id = ["1", "2"], # Should be Int64 + gender_concept_id = [8507, 8532], + year_of_birth = [1990, 1985], + race_concept_id = [8527, 8516], + extra_col = [true, false], # Extra column not in the OMOP schema +) + +ht = HealthTable(good_df; omop_cdm_version="v5.4.1") + +# OMOP CDM version metadata +metadata(ht.source, "omop_cdm_version") + +# Will give column-specific metadata +colmetadata(ht.source, :gender_concept_id) + +# This will throw an error (strict enforcement) +ht = HealthTable(wrong_df; omop_cdm_version="v5.4.1", disable_type_enforcement = false) + +# If you want to *load anyway* and just receive warnings, disable type enforcement: +ht_relaxed = HealthTable(wrong_df; omop_cdm_version="v5.4.1", disable_type_enforcement = true) +``` + +## 3. Preprocessing Pipeline + +Now, we'll apply a series of transformations to clean and prepare the data. + +### Mapping Concepts + +Convert concept codes (e.g., gender ID) into readable or binary columns using a DuckDB connection. + +```julia +conn = DBInterface.connect(DuckDB.DB, "synthea_1M_3YR.duckdb") + +# Single column, auto-suffixed column name (gender_concept_id_mapped) +ht_mapped = map_concepts(ht, :gender_concept_id, conn; schema = "dbt_synthea_dev") + +# Multiple columns, custom new column names +ht_mapped2 = map_concepts(ht, [:gender_concept_id, :race_concept_id], conn; new_cols = ["gender", "race"], schema = "dbt_synthea_dev", drop_original=true) + +# In-place variant +map_concepts!(ht, [:gender_concept_id], conn; schema = "dbt_synthea_dev") +``` + +### Manual Concept Mapping (Without DB) + +Sometimes, you may want to map concept IDs using a custom dictionary instead of querying the database. 
+ +```julia +# Define custom mapping manually +custom_map = Dict(8507 => "Male", 8532 => "Female") + +# Option 1: Add a new column using `Base.map` +ht.source.gender_label = map(x -> get(custom_map, x, "Unknown"), ht.source.gender_concept_id) + +# Option 2: Use `Base.map!` with a new destination vector +gender_labels = Vector{String}(undef, length(ht.source.gender_concept_id)) +map!(x -> get(custom_map, x, "Unknown"), gender_labels, ht.source.gender_concept_id) +ht.source.gender_label = gender_labels +``` + +### Compress sparse categories + +Group rare values into an "Other" category so they don’t overwhelm your model. + +```julia +ht_compressed = apply_vocabulary_compression(ht_mapped; cols = [:race_concept_id], min_freq = 2, other_label = "Other") +``` + +### One-hot encode categorical columns + +Convert categorical codes into binary indicator columns (true/false). + +```julia +ht_ohe = one_hot_encode(ht_compressed; cols=[:gender_concept_id, :race_concept_id]) +``` + +### For Developers: Interactive Use in the REPL + +When working interactively in the REPL during development: + +- Always load the **trigger packages first** +- Then load `HealthBase` +- Only after that, use extension functions like `one_hot_encode`, `map_concepts`, etc. + +```julia +# Correct load order for extensions to work: +using DataFrames, OMOPCommonDataModel, InlineStrings, Serialization, Dates, FeatureTransforms, DBInterface, DuckDB +using HealthBase + +# Now this will work: +# ht_ohe = one_hot_encode(ht; cols=[:gender_concept_id]) +``` + +Happy experimenting with `HealthBase.jl`! 🎉 +Feel free to explore more advanced workflows in the other guide sections. 
diff --git a/ext/HealthBaseOMOPCDMExt.jl b/ext/HealthBaseOMOPCDMExt.jl new file mode 100644 index 0000000..c961302 --- /dev/null +++ b/ext/HealthBaseOMOPCDMExt.jl @@ -0,0 +1,452 @@ +module HealthBaseOMOPCDMExt + +using HealthBase +using DataFrames +using OMOPCommonDataModel +using Serialization +using InlineStrings +using Dates +import FeatureTransforms: + OneHotEncoding, apply_append +using DuckDB +using DBInterface: execute + +# NOTE: In the future, replace this with OMOP CDM version info directly from OMOPCommonDataModel.jl dependencies. +const OMOPCDM_VERSIONS = deserialize(joinpath(@__DIR__, "..", "assets", "version_info")) + +# Mapping OMOP CDM datatypes to Julia types +const DATATYPE_MAP = Dict( + "integer" => Int64, "Integer" => Int64, "bigint" => Int64, + "float" => Float64, + "date" => Date, "datetime" => DateTime, + "varchar(1)" => String, "varchar(2)" => String, "varchar(3)" => String, + "varchar(9)" => String, "varchar(10)" => String, "varchar(20)" => String, + "varchar(25)" => String, "varchar(50)" => String, "varchar(80)" => String, + "varchar(250)" => String, "varchar(255)" => String, "varchar(1000)" => String, + "varchar(2000)" => String, "varchar(MAX)" => String +) + +function __init__() + @info "OMOP CDM extension for HealthBase has been loaded!" +end + +""" + HealthTable(df::DataFrame; omop_cdm_version=nothing, disable_type_enforcement=false, collect_errors=true) + +Constructs a `HealthTable` for an OMOP CDM dataset by validating the given `DataFrame`. + +This constructor validates the `DataFrame` against the OMOP CDM schema for the specified version (if not provided, takes default version "v5.4.0"). +It ensures that: +- all column names exist in the OMOP schema, +- each column's data type matches the expected type from the schema, +- appropriate metadata is attached to each column based on the OMOP CDM field definition. 
+ +If any mismatches are found, a comprehensive error (or warning, depending on settings) will be +raised to help users correct their data. Once validated, the table is wrapped into a schema-aware +`HealthTable`, suitable for downstream use. + +## Arguments +- `df::DataFrame`: The `DataFrame` to wrap. It should contain columns corresponding to an OMOP CDM table. + +## Keyword Arguments +- `omop_cdm_version::Union{Nothing,String}=nothing`: Optional. Pass a specific version or leave `nothing` to auto-detect from the DataFrame metadata (falls back to "v5.4.0"). +- `disable_type_enforcement::Bool=false`: If `true`, type mismatches will emit a single comprehensive warning instead of throwing an error. +- `collect_errors::Bool=true`: If `false`, the constructor will throw an error immediately upon finding the first column with a type mismatch. If `true` (the default), it will collect all errors and report them in a single message. + +## Returns +- `HealthTable`: A new `HealthTable` instance with validated data and attached metadata. + +## Examples + +1. Loading a DataFrame from scratch: +```julia +using DataFrames, OMOPCommonDataModel, InlineStrings, Serialization, Dates, FeatureTransforms, DBInterface, DuckDB +using HealthBase + +person_df = DataFrame( + person_id = 1:6, + gender_concept_id = [8507, 8507, 8532, 8532, 8507, 8532], + year_of_birth = [1980, 1995, 1990, 1975, 1988, 2001], + race_concept_id = [8527, 8515, 8527, 8516, 8527, 8516] +) +ht = HealthTable(person_df; omop_cdm_version="v5.4.0") +``` + +2. Loading a DataFrame from a database query: +```julia +using DBInterface, DuckDB, DataFrames, HealthBase +# db = DuckDB.DB("synthea.duckdb") # Example database file +# person_df = DBInterface.execute(db, "SELECT * FROM person") |> DataFrame +# ht = HealthTable(person_df; omop_cdm_version="v5.4.0") +``` + +3. 
Accessing column metadata: +```julia +# After constructing ht as above: +colnames = names(ht.source) +coltypes = eltype.(eachcol(ht.source)) +# OMOP metadata can be accessed from ht or its source columns if attached +``` + +4. Quick-fail/warning for bad data: +You can control how strictly HealthTable enforces schema validation: + +```julia +# Fail immediately on first mismatch +ht = HealthTable(df; collect_errors = false) + +# Collect all mismatches and throw a combined error +ht = HealthTable(df; collect_errors = true) + +# Only warn on mismatches; allows proceeding (use with caution) +ht = HealthTable(df; disable_type_enforcement = true) +``` +Use disable_type_enforcement=true if you're exploring or cleaning data but for modeling or analysis, validated types are strongly recommended. +""" +function HealthBase.HealthTable( + df::DataFrame; + omop_cdm_version::String="v5.4.0", + disable_type_enforcement=false, + collect_errors=true +) + if !haskey(OMOPCDM_VERSIONS, omop_cdm_version) + throw(ArgumentError("OMOP CDM version '$(omop_cdm_version)' is not supported. Available versions: $(keys(OMOPCDM_VERSIONS))")) + end + + omop_fields = OMOPCDM_VERSIONS[omop_cdm_version][:fields] + @assert !isempty(omop_fields) "OMOP CDM version $(omop_cdm_version) has no registered fields." 
+ failed_columns = Vector{NamedTuple{(:colname, :type, :expected), Tuple{String, Any, Any}}}() + extra_columns = String[] + + for col in names(df) + col_symbol = Symbol(col) + + if !haskey(omop_fields, col_symbol) + push!(extra_columns, col) + continue + end + + fieldinfo = omop_fields[col_symbol] + actual_type = eltype(df[!, col_symbol]) + + if !haskey(fieldinfo, :cdmDatatype) + if !collect_errors + throw(ArgumentError("Column '$(col)' is missing :cdmDatatype information in the schema.")) + end + push!(failed_columns, (colname=col, type=actual_type, expected="")) + else + expected_string = fieldinfo[:cdmDatatype] + + if !haskey(DATATYPE_MAP, expected_string) + push!(failed_columns, (colname=col, type=actual_type, expected="Unrecognized OMOP datatype: $(expected_string)")) + else + expected_type = DATATYPE_MAP[expected_string] + + if !(actual_type <: Union{expected_type, Missing}) + if !collect_errors + throw(ArgumentError("Column '$(col)' has type $(actual_type), but expected a subtype of $(expected_type).")) + end + push!(failed_columns, (colname=col, type=actual_type, expected=expected_type)) + end + end + + for (key, val) in fieldinfo + if !ismissing(val) + colmetadata!(df, col_symbol, String(key), string(val)) + end + end + end + end + + validation_msgs = String[] + + if !isempty(failed_columns) + error_details = join(["Column '$(err.colname)': has type $(err.type), expected $(err.expected)" for err in failed_columns], "\n") + push!(validation_msgs, "OMOP CDM type validation failed for the following columns:\n" * error_details) + end + + if !isempty(validation_msgs) + full_message = join(validation_msgs, "\n\n") * "\n" + if disable_type_enforcement + @warn full_message * "\nType enforcement is disabled. Unexpected behavior may occur." 
+ else + throw(ArgumentError(full_message)) + end + end + + DataFrames.metadata!(df, "omop_cdm_version", omop_cdm_version) + + return HealthBase.HealthTable{typeof(df)}(df) +end + +""" + one_hot_encode(ht::HealthTable; cols, drop_original=true, return_features_only=false) + +One-hot encode the categorical columns in `ht` using **FeatureTransforms.jl**. + +For every requested column the function appends Boolean indicator columns — one per +unique (non-missing) level. New columns are named `col_value`, e.g. `gender_concept_id_8507`. + +Boolean source columns are detected and skipped automatically with a warning. + +# Arguments +- `ht::HealthTable`: Table to transform (schema-aware). + +# Keyword Arguments +- `cols::Vector{Symbol}`: Categorical columns to encode. +- `drop_original::Bool=true`: Drop the source columns after encoding. +- `return_features_only::Bool=false`: If `true` return a **DataFrame** containing only the + encoded data; if `false` wrap the result in a `HealthTable` with + `disable_type_enforcement=true` (because the output is no longer standard OMOP CDM). + +# Returns +- `DataFrame` or `HealthTable` depending on `return_features_only`. + +# Example +```julia +ht_ohe = one_hot_encode(ht; cols = [:gender_concept_id, :race_concept_id]) +X = one_hot_encode(ht; cols = [:gender_concept_id], return_features_only = true) # ML features +``` +""" +function HealthBase.one_hot_encode( + ht::HealthTable; + cols::Vector{Symbol}, + drop_original::Bool = true, + return_features_only::Bool = false +) + df = copy(ht.source) + missing = setdiff(cols, Symbol.(names(df))) + @assert isempty(missing) "Columns $(missing) not found." + + for col in cols + if eltype(df[!, col]) <: Bool + @warn "Column $col is already Boolean; skipping one-hot." 
+ continue + end + + cats = unique(skipmissing(df[!, col])) + enc = OneHotEncoding(cats) + header = Symbol.(string(col, "_", c) for c in cats) + df = apply_append(df, enc; cols=[col], header=header) + end + + drop_original && select!(df, Not(cols)) + + return return_features_only ? df : HealthBase.HealthTable{typeof(df)}(df) +end + +""" + map_concepts(ht::HealthTable, col::Symbol, new_col::String, conn::DuckDB.DB; drop_original::Bool = false, concept_table::String = "concept", schema::String = "main") + +Map concept IDs in a column to their corresponding concept names using the OMOP `concept` table. Only direct mappings using concept IDs are supported. + + +# Arguments +- `ht::HealthTable`: Input OMOP data table. +- `cols::Union{Symbol, Vector{Symbol}}`: Column(s) containing concept IDs. +- `conn::DuckDB.DB`: Database connection for concept lookup. + +# Keyword Arguments +- `new_cols`: Name(s) for output columns. If not provided, uses `col * suffix`. +- `suffix::String="_mapped"`: Suffix for default new column names. +- `drop_original::Bool=false`: Drop source column(s) after mapping. +- `concept_table::String="concept"`: Table name for concepts. +- `schema::String="main"`: Schema containing the concept table. + +# Returns +- A new `HealthTable` with the concept names added in `new_col`. 
+ +# Example +```julia +conn = DBInterface.connect(DuckDB.DB, "path/to/db/.duckdb") + +# Map gender_concept_id to concept_name +ht_mapped = map_concepts(ht, :gender_concept_id, "gender_name", conn; schema = "dbt_synthea_dev") +``` +""" +function HealthBase.map_concepts( + ht::HealthTable, + cols::Union{Symbol, Vector{Symbol}}, + conn::DuckDB.DB; + new_cols::Union{Nothing, String, Vector{String}} = nothing, + drop_original::Bool = false, + suffix::String = "_mapped", + concept_table::String = "concept", + schema::String = "main" +) + df = copy(ht.source) + _map_concepts!(df, cols, conn; new_cols, drop_original, suffix, concept_table, schema) + + return HealthBase.HealthTable{typeof(df)}(df) +end + +""" + map_concepts!(ht::HealthTable, cols, conn; ...) + +In-place version of `map_concepts`. Maps concept IDs to human-readable names using the OMOP `concept` table. + +# Arguments +- `ht::HealthTable`: The table to update. +- `cols`: Single column or list of columns with concept IDs. +- `conn::DuckDB.DB`: Connection to the OMOP database. + +# Keyword Arguments +- `new_cols`: Optional new column names. Defaults to `col * "_mapped"`. +- `suffix`: Suffix used when `new_cols` is not provided. +- `drop_original`: Whether to drop the original columns. +- `concept_table`, `schema`: Source table and schema. + +# Returns +- The mutated `HealthTable`. 
+ +# Example +```julia +conn = DBInterface.connect(DuckDB.DB, "path/to/db/.duckdb") + +# Map gender_concept_id to concept_name in-place +map_concepts!(ht, :gender_concept_id, conn; new_cols="gender_name", schema="dbt_synthea_dev") +``` +""" +function HealthBase.map_concepts!( + ht::HealthTable, + cols::Union{Symbol, Vector{Symbol}}, + conn::DuckDB.DB; + new_cols::Union{Nothing, String, Vector{String}} = nothing, + drop_original::Bool = false, + suffix::String = "_mapped", + concept_table::String = "concept", + schema::String = "main" +) + _map_concepts!( + ht.source, + cols, + conn; + new_cols = new_cols, + drop_original = drop_original, + suffix = suffix, + concept_table = concept_table, + schema = schema + ) + return ht +end + +""" + _map_concepts!(df, cols, conn; ...) + +Low-level internal helper to map concept IDs to names directly on a `DataFrame`. + +# Arguments +- `df::DataFrame`: Target DataFrame. +- `cols`: Single or multiple columns with concept IDs. +- `conn::DuckDB.DB`: Database connection. + +# Keyword Arguments +- `new_cols`: New column names or `nothing` (defaults to col * `suffix`). +- `drop_original`: Drop source columns after mapping. +- `suffix`: Suffix for auto-generated column names. +- `concept_table`, `schema`: OMOP source location. + +# Notes +- This is called internally by `map_concepts` and `map_concepts!`. +""" +function _map_concepts!( + df::DataFrame, + cols::Union{Symbol, Vector{Symbol}}, + conn::DuckDB.DB; + new_cols::Union{Nothing, String, Vector{String}} = nothing, + drop_original::Bool = false, + suffix::String = "_mapped", + concept_table::String = "concept", + schema::String = "main" +) + cols = isa(cols, Symbol) ? [cols] : cols + + if isnothing(new_cols) + new_cols = [string(col, suffix) for col in cols] + elseif isa(new_cols, String) + new_cols = [new_cols] + end + + @assert length(cols) == length(new_cols) "Length of `cols` and `new_cols` must match." 
+ + for (col, new_col) in zip(cols, new_cols) + @assert col in propertynames(df) "Column '$col' not found in table." + + ids = unique(skipmissing(df[!, col])) + if isempty(ids) + @warn "No concept_ids found in column $col; skipping." + continue + end + + id_list_str = join(string.(ids), ", ") + query = """ + SELECT concept_id, concept_name + FROM $schema.$concept_table + WHERE concept_id IN ($id_list_str) + """ + + result_df = DataFrame(execute(conn, query)) + if isempty(result_df) + @warn "Concept mapping for $col returned empty result. Check table, schema, and values." + continue + end + + mapping = Dict((cid => cname) for (cid, cname) in zip(result_df.concept_id, result_df.concept_name)) + df[!, new_col] = map(x -> get(mapping, x, missing), df[!, col]) + + if drop_original + select!(df, Not(col)) + end + end +end + +""" + apply_vocabulary_compression(ht::HealthTable; cols, min_freq=10, other_label="Other") + +Group infrequent categorical levels under a single *other* label. + +# Arguments +- `ht::HealthTable`: Input data table. + +# Keyword Arguments +- `cols::Vector{Symbol}`: Columns to compress. +- `min_freq::Int=10`: Minimum frequency for a value to remain unchanged. +- `other_label::String="Other"`: Label used to replace infrequent values. +- `drop_original::Bool=false`: Whether to drop original columns after compression. + +# Returns +- `HealthTable`: Table with compressed categorical levels. + +# Examples +```julia +ht_small = apply_vocabulary_compression(ht; cols=[:condition_source_value], min_freq=5) +``` +""" +function HealthBase.apply_vocabulary_compression( + ht::HealthTable; + cols::Vector{Symbol}, + min_freq::Integer = 10, + other_label::AbstractString = "Other", + drop_original::Bool = false, +) + df = copy(ht.source) + + for col in cols + @assert col in propertynames(df) "Column '$(col)' not found in table." 
"""
    apply_vocabulary_compression(ht::HealthTable; cols, min_freq=10,
                                 other_label="Other", drop_original=false)

Group infrequent categorical levels under a single *other* label.

For each column in `cols` a new column named `<col>_compressed` is added. Values
that occur fewer than `min_freq` times are replaced by `other_label`; all other
values are converted with `string`. `missing` entries stay `missing`.

# Arguments
- `ht::HealthTable`: Input data table. Its source is copied, not modified.

# Keyword Arguments
- `cols::Vector{Symbol}`: Columns to compress.
- `min_freq::Integer=10`: Minimum frequency for a value to remain unchanged.
- `other_label::AbstractString="Other"`: Label used to replace infrequent values.
- `drop_original::Bool=false`: Whether to drop the original columns afterwards.

# Returns
- `HealthTable`: New table with the compressed columns added.

# Throws
- `AssertionError` if a column in `cols` is not present in the table.

# Examples
```julia
ht_small = apply_vocabulary_compression(ht; cols=[:condition_source_value], min_freq=5)
```
"""
function HealthBase.apply_vocabulary_compression(
    ht::HealthTable;
    cols::Vector{Symbol},
    min_freq::Integer = 10,
    other_label::AbstractString = "Other",
    drop_original::Bool = false,
)
    df = copy(ht.source)
    # A concrete `String` keeps the element type of the new columns concrete.
    label = String(other_label)

    for col in cols
        # Explicit throw rather than `@assert` (assertions may be disabled); the
        # exception type is unchanged for callers that match on it.
        col in propertynames(df) ||
            throw(AssertionError("Column '$(col)' not found in table."))

        counts = combine(groupby(df, col), nrow => :freq)
        # A `Set` gives O(1) membership and compares with `isequal`, so `missing`
        # in the data cannot turn the membership test into a non-Boolean.
        rare = Set(counts[counts.freq .< min_freq, col])

        # Always create the destination column, even when nothing is rare: the
        # output schema stays predictable and `drop_original=true` can never
        # remove a column without leaving its replacement behind.
        df[!, Symbol(col, "_compressed")] = map(df[!, col]) do value
            if ismissing(value)
                missing
            elseif value in rare
                label
            else
                string(value)
            end
        end
    end

    if drop_original
        select!(df, Not(cols))
    end

    return HealthBase.HealthTable{typeof(df)}(df)
end
export HealthTable

"""
    HealthTable{T}

Wrapper around a table-like `source` that exposes it through the Tables.jl interface.

The struct itself simply stores `source`. NOTE(review): the OMOP CDM validation and the
`omop_cdm_version` keyword shown in the example appear to be provided by the
constructor added in the `HealthBaseOMOPCDMExt` extension — confirm there.

# Fields
- `source::T`: The underlying data source (typically a `DataFrame`).

# Examples
```julia
person_df = DataFrame(
    person_id=1:3,
    gender_concept_id=[8507, 8532, 8507],
    year_of_birth=[1990, 1985, 2000]
)
ht = HealthTable(person_df; omop_cdm_version="v5.4.1")
Tables.schema(ht)   # column names and types
DataFrame(ht)       # materialize as a DataFrame
```
"""
@kwdef struct HealthTable{T}
    source::T
end

"""
    Tables.istable(::Type{<:HealthTable})

Tables.jl trait: every `HealthTable` type is a table. Always returns `true`.
"""
function Tables.istable(::Type{T}) where {T<:HealthTable}
    return true
end

"""
    Tables.rowaccess(::Type{<:HealthTable})

Tables.jl trait: `Tables.rows` may be called on a `HealthTable`. Always returns `true`.
"""
function Tables.rowaccess(::Type{T}) where {T<:HealthTable}
    return true
end

"""
    Tables.rows(ht::HealthTable)

Return the row iterator of the wrapped `source`.
"""
function Tables.rows(ht::HealthTable)
    return Tables.rows(ht.source)
end

"""
    Tables.columnaccess(::Type{<:HealthTable})

Tables.jl trait: `Tables.columns` may be called on a `HealthTable`. Always returns
`true`.
"""
function Tables.columnaccess(::Type{T}) where {T<:HealthTable}
    return true
end

"""
    Tables.columns(ht::HealthTable)

Return the column-accessible view of the wrapped `source`.
"""
function Tables.columns(ht::HealthTable)
    return Tables.columns(ht.source)
end

"""
    Tables.schema(ht::HealthTable)

Return the schema (column names and types) reported by the wrapped `source`.
"""
function Tables.schema(ht::HealthTable)
    return Tables.schema(ht.source)
end

"""
    Tables.materializer(::Type{<:HealthTable})

Tables.jl hook naming the sink used to materialize a `HealthTable`: `DataFrame`.
"""
function Tables.materializer(::Type{T}) where {T<:HealthTable}
    return DataFrame
end
# Method-less generic functions: this file only declares and exports the names so
# that `HealthBase.<name>` exists even when no implementation is loaded.
# NOTE(review): the `HealthTable` methods are presumably all supplied by the
# HealthBaseOMOPCDMExt package extension — confirm against the extension source.

# One-hot encoding of selected columns of a `HealthTable` (see the extension for
# the supported keywords).
function one_hot_encode end

# Groups infrequent categorical levels of selected columns under one label.
function apply_vocabulary_compression end

# Maps concept-ID columns to concept names; presumably the non-mutating
# counterpart of `map_concepts!` — TODO confirm.
function map_concepts end

# In-place variant: maps concept-ID columns to concept names on the wrapped table.
function map_concepts! end

export one_hot_encode
export apply_vocabulary_compression
export map_concepts
export map_concepts!
using PrettyTables
using DataFrames
using Tables

# Emptiness check for the wrapped source. Data frames keep the exact `nrow` test;
# any other source is asked through the Tables.jl row interface.
_source_isempty(df::AbstractDataFrame) = nrow(df) == 0
_source_isempty(src) = isempty(Tables.rows(src))

# Whether the source carries an "omop_cdm_version" metadata entry. Sources that are
# not data frames are treated as having none.
_has_omop_cdm_version(df::AbstractDataFrame) = haskey(metadata(df), "omop_cdm_version")
_has_omop_cdm_version(::Any) = false

"""
    Base.show(io::IO, ht::HealthTable)

Pretty-print a `HealthTable` to any IO stream (REPL, file, etc.).

- If the underlying table has no rows, prints a short "HealthTable is empty" message.
- Otherwise prints the table using **PrettyTables.jl** with left-aligned columns.
- When the source is a data frame with an `"omop_cdm_version"` metadata entry, prints
  the OMOP CDM version beneath the table.

Works for any wrapped source: only data frames are queried with `nrow`/`metadata`, so
displaying a `HealthTable` built on another Tables.jl source no longer raises a
`MethodError`.

This method is purely for display; it returns `nothing`.
"""
function Base.show(io::IO, ht::HealthTable)
    src = ht.source

    if _source_isempty(src)
        pretty_table(io, ["HealthTable is empty"]; header = [""])
    else
        pretty_table(io, src; alignment = :l)
    end

    if _has_omop_cdm_version(src)
        println(io, "\nOMOP CDM version: ", metadata(src, "omop_cdm_version"))
    end

    return nothing
end
@testset "HealthTable Interface" begin
    df = DataFrame(
        person_id = 1:5,
        gender_concept_id = [8507, 8532, 8507, 8532, 8507],
        year_of_birth = [1990, 1985, 2000, 1975, 1988]
    )

    @testset "Constructor" begin
        # Positional and keyword construction must both wrap the very same object.
        for ht in (HealthBase.HealthTable(df), HealthBase.HealthTable(source=df))
            @test ht isa HealthBase.HealthTable
            @test ht.source === df
        end
    end

    @testset "Tables.jl Interface" begin
        ht = HealthBase.HealthTable(df)

        # Each trait must hold for the unparameterised type and for a concrete one.
        for trait in (Tables.istable, Tables.rowaccess, Tables.columnaccess)
            @test trait(HealthBase.HealthTable) == true
            @test trait(typeof(ht)) == true
        end

        # Schema is delegated to the source.
        expected_schema = Tables.schema(df)
        actual_schema = Tables.schema(ht)
        @test actual_schema.names == expected_schema.names
        @test actual_schema.types == expected_schema.types

        # Row access is delegated to the source.
        ht_rows = collect(Tables.rows(ht))
        df_rows = collect(Tables.rows(df))
        @test length(ht_rows) == length(df_rows)
        @test ht_rows[1].person_id == df_rows[1].person_id

        # Column access is delegated to the source.
        @test Tables.columnnames(Tables.columns(ht)) ==
              Tables.columnnames(Tables.columns(df))

        # Materialisation goes through DataFrame.
        @test Tables.materializer(HealthBase.HealthTable) == DataFrame
        @test DataFrame(ht) == df
    end

    @testset "Different data types" begin
        # A vector of named tuples is a valid Tables.jl source too.
        people = [(person_id=1, name="Alice"), (person_id=2, name="Bob")]
        ht_people = HealthBase.HealthTable(people)
        @test Tables.istable(typeof(ht_people))
        @test length(collect(Tables.rows(ht_people))) == 2

        # A DataFrame with columns but no rows.
        no_rows = DataFrame(person_id = Int[], name = String[])
        ht_no_rows = HealthBase.HealthTable(no_rows)
        @test Tables.istable(typeof(ht_no_rows))
        @test length(collect(Tables.rows(ht_no_rows))) == 0
    end
end
@testset "HealthBaseOMOPCDMExt" begin
    # Check if extension is loaded properly
    ext = Base.get_extension(HealthBase, :HealthBaseOMOPCDMExt)
    if isnothing(ext)
        @warn "HealthBaseOMOPCDMExt extension is not loaded. Skipping tests."
        return
    end

    # This DataFrame is compliant with the OMOP CDM v5.4.1 PERSON table schema.
    person_df_good = DataFrame(
        person_id=1,
        gender_concept_id=8507,
        year_of_birth=1990,
        month_of_birth=1,
        day_of_birth=1,
        birth_datetime=DateTime(1990, 1, 1),
        race_concept_id=0,
        ethnicity_concept_id=0
    )

    # This DataFrame has an incorrect type for the `year_of_birth` column.
    person_df_bad = DataFrame(
        person_id=1,
        gender_concept_id=8507,
        year_of_birth="1990",  # Incorrect: Should be an Int
        month_of_birth=1,
        day_of_birth=1,
        birth_datetime=DateTime(1990, 1, 1),
        race_concept_id=0,
        ethnicity_concept_id=0
    )

    ht = HealthBase.HealthTable(person_df_good; omop_cdm_version="v5.4.1")

    @testset "Constructor and Type Validation" begin
        @testset "Valid DataFrame" begin
            @test ht isa HealthBase.HealthTable
            @test metadata(ht.source, "omop_cdm_version") == "v5.4.1"

            # Test with default version
            ht_default = HealthBase.HealthTable(person_df_good)
            @test metadata(ht_default.source, "omop_cdm_version") == "v5.4.0"
        end

        @testset "Invalid DataFrame Type Check" begin
            @test_throws ArgumentError HealthBase.HealthTable(person_df_bad; omop_cdm_version="v5.4.1")
        end

        @testset "Unsupported OMOP CDM Version" begin
            @test_throws ArgumentError HealthBase.HealthTable(person_df_good; omop_cdm_version="v999.0")
        end

        @testset "Type Enforcement Options" begin
            # Test with type enforcement disabled (should warn, not error)
            @test_logs (:warn, r"Type enforcement is disabled") HealthBase.HealthTable(person_df_bad; omop_cdm_version="v5.4.1", disable_type_enforcement=true)

            # Test with collect_errors=false (should fail on first error)
            @test_throws ArgumentError HealthBase.HealthTable(person_df_bad; omop_cdm_version="v5.4.1", collect_errors=false)
        end

        @testset "Extra Columns" begin
            df_extra = copy(person_df_good)
            df_extra[!, :extra_column] = ["extra_value"]

            ht_extra = HealthBase.HealthTable(df_extra; omop_cdm_version="v5.4.1")
            @test "extra_column" in names(ht_extra.source)
        end

        @testset "Schema Validation Edge Cases" begin
            # Test multiple validation errors collected
            df_multiple_errors = DataFrame(
                person_id = "invalid_string",        # Wrong type
                gender_concept_id = "another_string" # Wrong type
            )
            @test_throws ArgumentError HealthBase.HealthTable(df_multiple_errors; omop_cdm_version="v5.4.1", collect_errors=true)
        end
    end

    @testset "Version detection from metadata" begin
        df_meta = DataFrame(person_id=1:3,
                            gender_concept_id=[8507,8532,8507],
                            year_of_birth=[1990,1985,2000],
                            race_concept_id=[8527,8516,8527])
        ht_meta = HealthBase.HealthTable(df_meta; omop_cdm_version="v5.4.1")
        @test metadata(ht_meta.source, "omop_cdm_version") == "v5.4.1"
    end

    @testset "Preprocessing Functions" begin
        df = DataFrame(
            person_id = 1:4,
            gender_concept_id = [8507, 8507, 8532, 8532],
            condition_source_value = ["Diabetes", "Hypertension", "Diabetes", "RareCondition"],
            bool_column = [true, false, true, false]
        )
        ht = HealthBase.HealthTable(df; omop_cdm_version="v5.4.1")

        @testset "one_hot_encode function" begin
            # Test basic functionality
            result = HealthBase.one_hot_encode(ht; cols=[:gender_concept_id], return_features_only=true)
            expected_cols = ["gender_concept_id_8507", "gender_concept_id_8532"]
            @test all(col in string.(names(result)) for col in expected_cols)

            # Test with HealthTable return
            result_ht = HealthBase.one_hot_encode(ht; cols=[:gender_concept_id], return_features_only=false)
            @test result_ht isa HealthBase.HealthTable

            # Test with Boolean column (should warn and skip)
            @test_logs (:warn, r"Column bool_column is already Boolean") HealthBase.one_hot_encode(ht; cols=[:bool_column], return_features_only=true)

            # Test with missing column
            @test_throws AssertionError HealthBase.one_hot_encode(ht; cols=[:nonexistent_column], return_features_only=true)
        end

        @testset "apply_vocabulary_compression function" begin
            # Test basic functionality
            compressed = HealthBase.apply_vocabulary_compression(ht; cols=[:condition_source_value], min_freq=2)
            @test "condition_source_value_compressed" in names(compressed.source)
            compressed_vals = unique(compressed.source.condition_source_value_compressed)
            @test "Other" in compressed_vals

            # Test with custom other_label
            compressed_custom = HealthBase.apply_vocabulary_compression(ht; cols=[:condition_source_value], min_freq=2, other_label="RARE")
            @test "RARE" in unique(compressed_custom.source.condition_source_value_compressed)

            # Test with missing column
            @test_throws AssertionError HealthBase.apply_vocabulary_compression(ht; cols=[:nonexistent_column], min_freq=2)
        end

        @testset "map_concepts function (mocked)" begin
            # Create a simple in-memory DuckDB for testing
            db = DuckDB.DB()

            # Create a mock concept table
            DBInterface.execute(db, """
                CREATE TABLE concept (
                    concept_id INTEGER,
                    concept_name VARCHAR
                )
            """)

            DBInterface.execute(db, """
                INSERT INTO concept VALUES
                (8507, 'Male'),
                (8532, 'Female')
            """)

            # Test map_concepts (returns new HealthTable)
            ht_mapped = HealthBase.map_concepts(ht, :gender_concept_id, db; new_cols="gender_name")
            @test "gender_name" in names(ht_mapped.source)
            @test ht_mapped.source.gender_name[1] == "Male"

            # Test map_concepts! (modifies in place)
            ht_copy = HealthBase.HealthTable(copy(df); omop_cdm_version="v5.4.1")
            HealthBase.map_concepts!(ht_copy, :gender_concept_id, db; new_cols="gender_name_inplace")
            @test "gender_name_inplace" in names(ht_copy.source)

            # Test error cases
            @test_throws AssertionError HealthBase.map_concepts(ht, :nonexistent_column, db)

            # Close the database
            DuckDB.close(db)
        end
    end

    @testset "Edge Cases and Error Handling" begin
        @testset "HealthTable Constructor Error Paths" begin
            @test_throws ArgumentError HealthBase.HealthTable(person_df_good; omop_cdm_version="v999.0")

            # Test with disable_type_enforcement=true for warning path
            @test_logs (:warn, r"Type enforcement is disabled") HealthBase.HealthTable(person_df_bad; disable_type_enforcement=true)
        end

        @testset "Internal Schema Validation Coverage" begin
            # Get the extension to access internal constants
            ext = Base.get_extension(HealthBase, :HealthBaseOMOPCDMExt)

            if !isnothing(ext)
                # Access the OMOPCDM_VERSIONS constant from the extension
                omop_versions = getfield(ext, :OMOPCDM_VERSIONS)

                if haskey(omop_versions, "v5.4.1")
                    # Keep the untouched entry itself: this test swaps a corrupted
                    # schema into extension-global state, and every later test
                    # depends on getting the exact original back.
                    original_version = omop_versions["v5.4.1"]
                    original_fields = original_version[:fields]

                    try
                        # Scenario 1: `person_id` without a `cdmDatatype` entry.
                        corrupted_fields = copy(original_fields)
                        if haskey(corrupted_fields, :person_id)
                            corrupted_fields[:person_id] = Dict{Symbol, Any}(
                                k => v for (k, v) in corrupted_fields[:person_id]
                                if k != :cdmDatatype
                            )

                            # Temporarily replace the schema
                            omop_versions["v5.4.1"] = Dict{Symbol, Any}(:fields => corrupted_fields)

                            # Test the missing cdmDatatype error path
                            df_test = DataFrame(person_id=1)
                            @test_throws ArgumentError HealthBase.HealthTable(df_test; omop_cdm_version="v5.4.1", collect_errors=false)

                            # Test the missing cdmDatatype with collect_errors=true
                            @test_throws ArgumentError HealthBase.HealthTable(df_test; omop_cdm_version="v5.4.1", collect_errors=true)
                        end

                        # Scenario 2: `person_id` with an unrecognized datatype.
                        corrupted_fields_2 = copy(original_fields)
                        if haskey(corrupted_fields_2, :person_id)
                            corrupted_person_field_2 = copy(corrupted_fields_2[:person_id])
                            corrupted_person_field_2[:cdmDatatype] = "INVALID_DATATYPE_XYZ"
                            corrupted_fields_2[:person_id] = corrupted_person_field_2

                            # Temporarily replace the schema
                            omop_versions["v5.4.1"] = Dict{Symbol, Any}(:fields => corrupted_fields_2)

                            # Test the unrecognized datatype error path
                            df_test2 = DataFrame(person_id=1)
                            @test_throws ArgumentError HealthBase.HealthTable(df_test2; omop_cdm_version="v5.4.1", collect_errors=true)
                        end
                    finally
                        # Restore the original entry (with all of its keys) even if
                        # something above raised.
                        omop_versions["v5.4.1"] = original_version
                    end
                end
            end
        end

        @testset "map_concepts Edge Cases" begin
            # Set up test database with concept table
            db = DuckDB.DB()
            DuckDB.execute(db, "CREATE TABLE concept (concept_id INTEGER, concept_name VARCHAR)")
            DuckDB.execute(db, "INSERT INTO concept VALUES (8507, 'Male')")

            df_empty = DataFrame(empty_col=[missing, missing])
            ht_empty = HealthBase.HealthTable(df_empty; omop_cdm_version="v5.4.1")

            @test_logs (:warn, r"No concept_ids found") HealthBase.map_concepts!(ht_empty, :empty_col, db; new_cols="mapped_empty")

            df_nonexistent = DataFrame(nonexistent_ids=[99999])
            ht_nonexistent = HealthBase.HealthTable(df_nonexistent; omop_cdm_version="v5.4.1")

            # When mapping fails, the column is NOT added (the function continues/skips)
            HealthBase.map_concepts!(ht_nonexistent, :nonexistent_ids, db; new_cols="mapped_nonexistent")
            @test !("mapped_nonexistent" in names(ht_nonexistent.source))  # Column should NOT be added when mapping fails

            # Test drop_original=true for map_concepts!
            df_drop = DataFrame(concept_col=[8507])
            ht_drop = HealthBase.HealthTable(df_drop; omop_cdm_version="v5.4.1")
            HealthBase.map_concepts!(ht_drop, :concept_col, db; new_cols="mapped_col", drop_original=true)
            @test !("concept_col" in names(ht_drop.source))  # Original column should be dropped
            @test "mapped_col" in names(ht_drop.source)

            DuckDB.close(db)
        end

        @testset "apply_vocabulary_compression drop_original" begin
            df_compress = DataFrame(
                col1=["A", "A", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K"],
                col2=["X", "X", "X", "Y", "Z", "Z", "Z", "Z", "Z", "Z", "Z", "Z", "Z"]
            )
            ht_compress = HealthBase.HealthTable(df_compress; omop_cdm_version="v5.4.1")

            # Apply compression with drop_original=true
            ht_result = HealthBase.apply_vocabulary_compression(ht_compress; cols=[:col1, :col2], min_freq=3, drop_original=true)

            # Original columns should be dropped
            @test !("col1" in names(ht_result.source))
            @test !("col2" in names(ht_result.source))
            # Compressed columns should exist
            @test "col1_compressed" in names(ht_result.source)
            @test "col2_compressed" in names(ht_result.source)
        end
    end
end
@testset "HealthTable Show Methods" begin
    # Shared fixture: a small person table wrapped without any OMOP options.
    df = DataFrame(
        person_id = 1:3,
        gender_concept_id = [8507, 8532, 8507],
        year_of_birth = [1990, 1985, 2000]
    )
    ht = HealthBase.HealthTable(df)

    # Capture what `show(io, x)` writes for `x`.
    rendered(x) = sprint(show, x)

    @testset "Basic show functionality" begin
        # `show` is display-only and must return nothing.
        @test show(IOBuffer(), ht) === nothing

        # Every column name has to appear in the printed table.
        text = rendered(ht)
        @test occursin("person_id", text)
        @test occursin("gender_concept_id", text)
        @test occursin("year_of_birth", text)
    end

    @testset "Empty HealthTable show" begin
        no_rows = HealthBase.HealthTable(DataFrame(person_id = Int[], gender_concept_id = Int[]))
        @test occursin("HealthTable is empty", rendered(no_rows))
    end

    @testset "Show with OMOP CDM metadata" begin
        # The version keyword only exists when the OMOP extension is loaded.
        if isnothing(Base.get_extension(HealthBase, :HealthBaseOMOPCDMExt))
            @warn "HealthBaseOMOPCDMExt not available, skipping OMOP metadata test"
        else
            ht_omop = HealthBase.HealthTable(df; omop_cdm_version="v5.4.1")
            @test occursin("OMOP CDM version: v5.4.1", rendered(ht_omop))
        end
    end

    @testset "Show with regular metadata" begin
        # Metadata attached by hand must be reported just the same.
        tagged = copy(df)
        DataFrames.metadata!(tagged, "omop_cdm_version", "v5.4.0")
        @test occursin("OMOP CDM version: v5.4.0", rendered(HealthBase.HealthTable(tagged)))
    end
end