Skip to content

Commit fae3026

Browse files
committed
added other preprocessing functions
1 parent 688d47c commit fae3026

File tree

4 files changed

+173
-44
lines changed

4 files changed

+173
-44
lines changed

Project.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ authors = ["Dilum Aluthge", "contributors"]
44
version = "2.0.0"
55

66
[deps]
7+
InlineStrings = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
8+
OMOPCommonDataModel = "ba65db9e-6590-4054-ab8a-101ed9124986"
79
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
810

911
[weakdeps]
@@ -12,15 +14,15 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
1214
DrWatson = "634d3b9d-ee7a-5ddf-bec9-22491ea816e1"
1315
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
1416
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
15-
InlineStrings = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
16-
OMOPCommonDataModel = "ba65db9e-6590-4054-ab8a-101ed9124986"
1717

1818
[extensions]
1919
HealthBaseDrWatsonExt = "DrWatson"
2020
HealthBaseOMOPCDMExt = ["DataFrames", "OMOPCommonDataModel", "InlineStrings", "Serialization", "Statistics", "Dates"]
2121

2222
[compat]
2323
Dates = "1.11.0"
24+
InlineStrings = "1.4.4"
25+
OMOPCommonDataModel = "0.1.4"
2426
Tables = "1.12.1"
2527
julia = "1.10"
2628

ext/HealthBaseOMOPCDMExt.jl

Lines changed: 87 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,11 @@ coltypes = eltype.(eachcol(ht.source))
8787
4. Quick-fail/warning for bad data:
8888
# TODO: Will finalize the implementation soon and then add an example
8989
"""
90-
function HealthBase.HealthTable(df::DataFrame; omop_cdm_version::String="v5.4.0", disable_type_enforcement=false, collect_errors=true)
90+
function HealthBase.HealthTable(
91+
df::DataFrame; omop_cdm_version::String="v5.4.0",
92+
disable_type_enforcement=false,
93+
collect_errors=true
94+
)
9195
if !haskey(OMOPCDM_VERSIONS, omop_cdm_version)
9296
throw(ArgumentError("OMOP CDM version '$(omop_cdm_version)' is not supported. Available versions: $(keys(OMOPCDM_VERSIONS))"))
9397
end
@@ -142,7 +146,11 @@ function HealthBase.HealthTable(df::DataFrame; omop_cdm_version::String="v5.4.0"
142146
end
143147

144148
# TODO: Add Documentation
145-
function HealthBase.one_hot_encode(ht::HealthTable; cols::Vector{Symbol}, drop_original::Bool=true)
149+
function HealthBase.one_hot_encode(
150+
ht::HealthTable;
151+
cols::Vector{Symbol},
152+
drop_original::Bool=true
153+
)
146154
df = copy(ht.source)
147155
for col in cols
148156
unique_vals = unique(skipmissing(df[!, col]))
@@ -158,21 +166,87 @@ function HealthBase.one_hot_encode(ht::HealthTable; cols::Vector{Symbol}, drop_o
158166
end
159167

160168
# TODO: Add Documentation
161-
function HealthBase.impute_missing(ht::HealthTable; cols::Vector{Symbol}, strategy::Symbol=:mean)
169+
function HealthBase.impute_missing(
170+
ht::HealthTable;
171+
cols::Union{Vector{Symbol}, Vector{Pair{Symbol,Symbol}}},
172+
strategy::Symbol=:mean,
173+
)
162174
df = copy(ht.source)
163-
for col in cols
164-
if strategy == :mean
165-
non_missing_vals = skipmissing(df[!, col])
166-
if isempty(non_missing_vals)
167-
throw(ArgumentError("Column '$col' has only missing values."))
175+
176+
strat_pairs = cols isa Vector{Symbol} ? [c => strategy for c in cols] : cols
177+
178+
for (col, strat) in strat_pairs
179+
@assert col in propertynames(df) "Column '$(col)' not found in table."
180+
vals = df[!, col]
181+
nonmiss = collect(skipmissing(vals))
182+
if isempty(nonmiss)
183+
throw(ArgumentError("Column '$(col)' has only missing values – cannot impute."))
184+
end
185+
186+
replacement = begin
187+
if strat == :mean
188+
mean(nonmiss)
189+
elseif strat == :median
190+
median(nonmiss)
191+
elseif strat == :mode
192+
mode_val = nothing
193+
counts = Dict{Any,Int}()
194+
for v in nonmiss
195+
counts[v] = get(counts,v,0)+1
196+
if mode_val === nothing || counts[v] > counts[mode_val]
197+
mode_val = v
198+
end
199+
end
200+
mode_val
201+
else
202+
throw(ArgumentError("Unsupported imputation strategy '$(strat)'. Supported: :mean, :median, :mode."))
168203
end
169-
mean_val = mean(non_missing_vals)
170-
df[!, col] = coalesce.(df[!, col], mean_val)
171-
else
172-
throw(ArgumentError("Unsupported imputation strategy: $strategy"))
173204
end
205+
df[!, col] = coalesce.(vals, replacement)
174206
end
175-
return HealthBase.HealthTable(df; omop_cdm_version=ht.omop_cdm_version)
207+
208+
return HealthBase.HealthTable(source=df, omop_cdm_version=ht.omop_cdm_version)
209+
end
210+
211+
# TODO: Add Documentation
212+
function HealthBase.apply_vocabulary_compression(
213+
ht::HealthTable;
214+
cols::Vector{Symbol},
215+
min_freq::Integer=10,
216+
other_label::AbstractString="Other"
217+
)
218+
df = copy(ht.source)
219+
for col in cols
220+
@assert col in propertynames(df) "Column '$(col)' not found in table."
221+
counts = combine(groupby(df, col), nrow => :freq)
222+
to_compress = counts[counts.freq .< min_freq, col]
223+
if !isempty(to_compress)
224+
mask = in(to_compress).(df[!, col])
225+
df[mask, col] .= other_label
226+
end
227+
end
228+
return HealthBase.HealthTable(source=df, omop_cdm_version=ht.omop_cdm_version)
229+
end
230+
231+
# TODO: Add Documentation
232+
function HealthBase.map_concepts(
233+
ht::HealthTable;
234+
col::Symbol,
235+
mapping::AbstractDict,
236+
new_col::Union{Symbol,Nothing}=nothing,
237+
drop_original::Bool=false,
238+
)
239+
@assert col in propertynames(ht.source) "Column '$(col)' not found in table."
240+
df = copy(ht.source)
241+
242+
target_col = isnothing(new_col) ? col : new_col
243+
df[!, target_col] = get.(Ref(mapping), df[!, col], df[!, col])
244+
245+
if drop_original && !isnothing(new_col)
246+
select!(df, Not(col))
247+
end
248+
249+
return HealthBase.HealthTable(source=df, omop_cdm_version=ht.omop_cdm_version)
176250
end
177251

178252
# TODO: Add Documentation
@@ -194,6 +268,4 @@ function HealthBase.normalize_column(ht::HealthTable; cols::Vector{Symbol}, meth
194268
return HealthBase.HealthTable(df; omop_cdm_version=ht.omop_cdm_version)
195269
end
196270

197-
# TODO: Add Other Preprocessing Utilities
198-
199271
end

src/omopcdm_stub.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
function one_hot_encode end
22
function normalize_column end
33
function impute_missing end
4+
function apply_vocabulary_compression end
5+
function map_concepts end
46

57
export one_hot_encode
68
export normalize_column
79
export impute_missing
10+
export apply_vocabulary_compression
11+
export map_concepts

test/omopcdmext.jl

Lines changed: 78 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using Statistics
2+
using DataFrames
23

34
@testset "HealthBaseOMOPCDMExt" begin
45
# This DataFrame is compliant with the OMOP CDM v5.4.0 PERSON table schema.
@@ -57,48 +58,98 @@ using Statistics
5758
@test df2 == person_df_good
5859
end
5960

60-
@testset "Preprocessing Utilities" begin
61+
@testset "Preprocessing Utilities (single example DataFrame)" begin
6162
df = DataFrame(
62-
id = 1:4,
63-
cat = ["a", "b", "a", "c"],
64-
num1 = [1.0, 2.5, missing, 4.0],
65-
num2 = [10.0, missing, 30.0, 40.0]
63+
person_id = 1:10,
64+
condition_source_value = [
65+
"Hypertension", "Diabetes", "Asthma", "Asthma", "Hypertension",
66+
"Fibromyalgia", "Hyperlipidemia", "RareDisease1", "RareDisease2", "RareDisease3"
67+
],
68+
condition_concept_id = [
69+
316866, # Hypertension
70+
201826, # Diabetes
71+
317009, # Asthma (mild)
72+
317010, # Asthma (severe)
73+
316866, # Hypertension
74+
317707, # Fibromyalgia
75+
4329058, # Hyperlipidemia
76+
1234567, 2345678, 3456789 # Rare
77+
],
78+
systolic_bp = [140.0, 130.0, 110.0, missing, 150.0, 120.0, missing, 135.0, 128.0, 145.0],
79+
diastolic_bp = [90.0, 85.0, 70.0, missing, 95.0, 80.0, missing, 88.0, 82.0, 92.0]
6680
)
67-
base_ht = HealthTable(df)
81+
base_ht = HealthTable(source=df; omop_cdm_version="v5.4.1")
6882

6983
@testset "one_hot_encode - drop_original=true" begin
70-
ht_oh = one_hot_encode(base_ht; cols=[:cat], drop_original=true)
71-
@test "cat" ∉ names(ht_oh.source)
72-
expected_cols = Set([:cat_a, :cat_b, :cat_c])
84+
ht_oh = one_hot_encode(base_ht; cols=[:condition_source_value], drop_original=true)
85+
@test :condition_source_value ∉ names(ht_oh.source)
86+
expected_cols = Set(Symbol.("condition_source_value_" .* ["Hypertension", "Diabetes", "Asthma", "Fibromyalgia", "Hyperlipidemia"]))
7387
@test expected_cols ⊆ Set(Symbol.(names(ht_oh.source)))
74-
@test ht_oh.source.cat_a == [true, false, true, false]
75-
@test ht_oh.source.cat_b == [false, true, false, false]
76-
@test ht_oh.source.cat_c == [false, false, false, true]
88+
@test ht_oh.source.condition_source_value_Hypertension == (df.condition_source_value .== "Hypertension")
89+
@test ht_oh.source.condition_source_value_Diabetes == (df.condition_source_value .== "Diabetes")
7790
end
7891

7992
@testset "one_hot_encode - drop_original=false" begin
80-
ht_oh = one_hot_encode(base_ht; cols=[:cat], drop_original=false)
81-
@test "cat" in names(ht_oh.source)
82-
expected_cols = Set([:cat_a, :cat_b, :cat_c])
93+
ht_oh = one_hot_encode(base_ht; cols=[:condition_source_value], drop_original=false)
94+
@test :condition_source_value in Symbol.(names(ht_oh.source))
95+
expected_cols = Set(Symbol.("condition_source_value_" .* ["Hypertension", "Diabetes", "Asthma", "Fibromyalgia", "Hyperlipidemia"]))
8396
@test expected_cols ⊆ Set(Symbol.(names(ht_oh.source)))
8497
end
8598

8699
@testset "impute_missing - mean" begin
87-
ht_imp = impute_missing(base_ht; cols=[:num1, :num2], strategy=:mean)
88-
@test all(!ismissing, ht_imp.source.num1)
89-
@test all(!ismissing, ht_imp.source.num2)
90-
@test ht_imp.source.num1[3] ≈ mean(skipmissing(base_ht.source.num1))
91-
@test ht_imp.source.num2[2] ≈ mean(skipmissing(base_ht.source.num2))
100+
ht_imp = impute_missing(base_ht; cols=[:systolic_bp, :diastolic_bp], strategy=:mean)
101+
@test all(!ismissing, ht_imp.source.systolic_bp)
102+
@test all(!ismissing, ht_imp.source.diastolic_bp)
103+
@test ht_imp.source.systolic_bp[4] ≈ mean(skipmissing(base_ht.source.systolic_bp))
104+
@test ht_imp.source.diastolic_bp[4] ≈ mean(skipmissing(base_ht.source.diastolic_bp))
92105
end
93106

94107
@testset "normalize_column - standard" begin
95-
ht_imp = impute_missing(base_ht; cols=[:num1, :num2])
96-
ht_norm = normalize_column(ht_imp; cols=[:num1, :num2])
97-
@test isapprox(mean(ht_norm.source.num1), 0.0; atol=1e-8)
98-
@test isapprox(std(ht_norm.source.num1), 1.0; atol=1e-8)
99-
@test isapprox(mean(ht_norm.source.num2), 0.0; atol=1e-8)
100-
@test isapprox(std(ht_norm.source.num2), 1.0; atol=1e-8)
101-
@test all(x -> x isa Float64, vec(Matrix(ht_norm.source[:, [:num1, :num2]])))
108+
ht_imp = impute_missing(base_ht; cols=[:systolic_bp, :diastolic_bp])
109+
ht_norm = normalize_column(ht_imp; cols=[:systolic_bp, :diastolic_bp])
110+
@test isapprox(mean(ht_norm.source.systolic_bp), 0.0; atol=1e-8)
111+
@test isapprox(std(ht_norm.source.systolic_bp), 1.0; atol=1e-8)
112+
@test isapprox(mean(ht_norm.source.diastolic_bp), 0.0; atol=1e-8)
113+
@test isapprox(std(ht_norm.source.diastolic_bp), 1.0; atol=1e-8)
114+
@test all(x -> x isa Float64, vec(Matrix(ht_norm.source[:, [:systolic_bp, :diastolic_bp]])))
115+
end
116+
117+
@testset "impute_missing - median" begin
118+
df_mid = DataFrame(num = [1.0, missing, 3.0, missing])
119+
ht_mid = HealthTable(df_mid)
120+
ht_imp = impute_missing(ht_mid; cols=[:num], strategy=:median)
121+
@test all(!ismissing, ht_imp.source.num)
122+
@test ht_imp.source.num[2] == median([1.0, 3.0])
123+
end
124+
125+
@testset "impute_missing - mixed strategies" begin
126+
ht_mix = impute_missing(base_ht; cols=[:systolic_bp => :mean, :diastolic_bp => :median])
127+
@test ht_mix.source.systolic_bp[4] ≈ mean(skipmissing(base_ht.source.systolic_bp))
128+
@test ht_mix.source.diastolic_bp[4] == median(skipmissing(base_ht.source.diastolic_bp))
129+
end
130+
131+
@testset "apply_vocabulary_compression" begin
132+
ht_comp = apply_vocabulary_compression(base_ht; cols=[:condition_source_value], min_freq=2, other_label="Other")
133+
@test ht_comp.source.condition_source_value == [
134+
"Hypertension", "Other", "Asthma", "Asthma", "Hypertension", "Other", "Other", "Other", "Other", "Other"
135+
]
136+
end
137+
138+
@testset "map_concepts" begin
139+
mapping = Dict(
140+
316866 => "Hypertension",
141+
201826 => "Diabetes",
142+
317009 => "Asthma",
143+
317010 => "Asthma"
144+
)
145+
ht_m1 = map_concepts(base_ht; col=:condition_concept_id, mapping=mapping, new_col=:condition_group)
146+
@test ht_m1.source.condition_group == [
147+
"Hypertension", "Diabetes", "Asthma", "Asthma", "Hypertension", 317707, 4329058, 1234567, 2345678, 3456789
148+
]
149+
@test :condition_concept_id in Symbol.(names(ht_m1.source))
150+
ht_m2 = map_concepts(base_ht; col=:condition_concept_id, mapping=mapping, new_col=:condition_group, drop_original=true)
151+
@test :condition_concept_id ∉ names(ht_m2.source)
152+
@test ht_m2.source.condition_group[1] == "Hypertension"
102153
end
103154
end
104155

0 commit comments

Comments
 (0)