|
1 | 1 | using Statistics |
| 2 | +using DataFrames |
2 | 3 |
|
3 | 4 | @testset "HealthBaseOMOPCDMExt" begin |
4 | 5 | # This DataFrame is compliant with the OMOP CDM v5.4.0 PERSON table schema. |
@@ -57,48 +58,98 @@ using Statistics |
57 | 58 | @test df2 == person_df_good |
58 | 59 | end |
59 | 60 |
|
60 | | - @testset "Preprocessing Utilities" begin |
| 61 | + @testset "Preprocessing Utilities (single example DataFrame)" begin |
61 | 62 | df = DataFrame( |
62 | | - id = 1:4, |
63 | | - cat = ["a", "b", "a", "c"], |
64 | | - num1 = [1.0, 2.5, missing, 4.0], |
65 | | - num2 = [10.0, missing, 30.0, 40.0] |
| 63 | + person_id = 1:10, |
| 64 | + condition_source_value = [ |
| 65 | + "Hypertension", "Diabetes", "Asthma", "Asthma", "Hypertension", |
| 66 | + "Fibromyalgia", "Hyperlipidemia", "RareDisease1", "RareDisease2", "RareDisease3" |
| 67 | + ], |
| 68 | + condition_concept_id = [ |
| 69 | + 316866, # Hypertension |
| 70 | + 201826, # Diabetes |
| 71 | + 317009, # Asthma (mild) |
| 72 | + 317010, # Asthma (severe) |
| 73 | + 316866, # Hypertension |
| 74 | + 317707, # Fibromyalgia |
| 75 | + 4329058, # Hyperlipidemia |
| 76 | + 1234567, 2345678, 3456789 # Rare |
| 77 | + ], |
| 78 | + systolic_bp = [140.0, 130.0, 110.0, missing, 150.0, 120.0, missing, 135.0, 128.0, 145.0], |
| 79 | + diastolic_bp = [90.0, 85.0, 70.0, missing, 95.0, 80.0, missing, 88.0, 82.0, 92.0] |
66 | 80 | ) |
67 | | - base_ht = HealthTable(df) |
| 81 | + base_ht = HealthTable(source=df; omop_cdm_version="v5.4.1") |
68 | 82 |
|
69 | 83 | @testset "one_hot_encode - drop_original=true" begin |
70 | | - ht_oh = one_hot_encode(base_ht; cols=[:cat], drop_original=true) |
71 | | - @test "cat" ∉ names(ht_oh.source) |
72 | | - expected_cols = Set([:cat_a, :cat_b, :cat_c]) |
| 84 | + ht_oh = one_hot_encode(base_ht; cols=[:condition_source_value], drop_original=true) |
| 85 | + @test "condition_source_value" ∉ names(ht_oh.source) # names(::DataFrame) yields Strings; a Symbol operand would make this check pass unconditionally |
| 86 | + expected_cols = Set(Symbol.("condition_source_value_" .* ["Hypertension", "Diabetes", "Asthma", "Fibromyalgia", "Hyperlipidemia"])) |
73 | 87 | @test expected_cols ⊆ Set(Symbol.(names(ht_oh.source))) |
74 | | - @test ht_oh.source.cat_a == [true, false, true, false] |
75 | | - @test ht_oh.source.cat_b == [false, true, false, false] |
76 | | - @test ht_oh.source.cat_c == [false, false, false, true] |
| 88 | + @test ht_oh.source.condition_source_value_Hypertension == (df.condition_source_value .== "Hypertension") |
| 89 | + @test ht_oh.source.condition_source_value_Diabetes == (df.condition_source_value .== "Diabetes") |
77 | 90 | end |
78 | 91 |
|
79 | 92 | @testset "one_hot_encode - drop_original=false" begin |
80 | | - ht_oh = one_hot_encode(base_ht; cols=[:cat], drop_original=false) |
81 | | - @test "cat" in names(ht_oh.source) |
82 | | - expected_cols = Set([:cat_a, :cat_b, :cat_c]) |
| 93 | + ht_oh = one_hot_encode(base_ht; cols=[:condition_source_value], drop_original=false) |
| 94 | + @test :condition_source_value in Symbol.(names(ht_oh.source)) |
| 95 | + expected_cols = Set(Symbol.("condition_source_value_" .* ["Hypertension", "Diabetes", "Asthma", "Fibromyalgia", "Hyperlipidemia"])) |
83 | 96 | @test expected_cols ⊆ Set(Symbol.(names(ht_oh.source))) |
84 | 97 | end |
85 | 98 |
|
86 | 99 | @testset "impute_missing - mean" begin |
87 | | - ht_imp = impute_missing(base_ht; cols=[:num1, :num2], strategy=:mean) |
88 | | - @test all(!ismissing, ht_imp.source.num1) |
89 | | - @test all(!ismissing, ht_imp.source.num2) |
90 | | - @test ht_imp.source.num1[3] ≈ mean(skipmissing(base_ht.source.num1)) |
91 | | - @test ht_imp.source.num2[2] ≈ mean(skipmissing(base_ht.source.num2)) |
| 100 | + ht_imp = impute_missing(base_ht; cols=[:systolic_bp, :diastolic_bp], strategy=:mean) |
| 101 | + @test all(!ismissing, ht_imp.source.systolic_bp) |
| 102 | + @test all(!ismissing, ht_imp.source.diastolic_bp) |
| 103 | + @test ht_imp.source.systolic_bp[4] ≈ mean(skipmissing(base_ht.source.systolic_bp)) |
| 104 | + @test ht_imp.source.diastolic_bp[4] ≈ mean(skipmissing(base_ht.source.diastolic_bp)) |
92 | 105 | end |
93 | 106 |
|
94 | 107 | @testset "normalize_column - standard" begin |
95 | | - ht_imp = impute_missing(base_ht; cols=[:num1, :num2]) |
96 | | - ht_norm = normalize_column(ht_imp; cols=[:num1, :num2]) |
97 | | - @test isapprox(mean(ht_norm.source.num1), 0.0; atol=1e-8) |
98 | | - @test isapprox(std(ht_norm.source.num1), 1.0; atol=1e-8) |
99 | | - @test isapprox(mean(ht_norm.source.num2), 0.0; atol=1e-8) |
100 | | - @test isapprox(std(ht_norm.source.num2), 1.0; atol=1e-8) |
101 | | - @test all(x -> x isa Float64, vec(Matrix(ht_norm.source[:, [:num1, :num2]]))) |
| 108 | + ht_imp = impute_missing(base_ht; cols=[:systolic_bp, :diastolic_bp]) |
| 109 | + ht_norm = normalize_column(ht_imp; cols=[:systolic_bp, :diastolic_bp]) |
| 110 | + @test isapprox(mean(ht_norm.source.systolic_bp), 0.0; atol=1e-8) |
| 111 | + @test isapprox(std(ht_norm.source.systolic_bp), 1.0; atol=1e-8) |
| 112 | + @test isapprox(mean(ht_norm.source.diastolic_bp), 0.0; atol=1e-8) |
| 113 | + @test isapprox(std(ht_norm.source.diastolic_bp), 1.0; atol=1e-8) |
| 114 | + @test all(x -> x isa Float64, vec(Matrix(ht_norm.source[:, [:systolic_bp, :diastolic_bp]]))) |
| 115 | + end |
| 116 | + |
| 117 | + @testset "impute_missing - median" begin |
| 118 | + df_mid = DataFrame(num = [1.0, missing, 3.0, missing]) |
| 119 | + ht_mid = HealthTable(df_mid) |
| 120 | + ht_imp = impute_missing(ht_mid; cols=[:num], strategy=:median) |
| 121 | + @test all(!ismissing, ht_imp.source.num) |
| 122 | + @test ht_imp.source.num[2] == median([1.0, 3.0]) |
| 123 | + end |
| 124 | + |
| 125 | + @testset "impute_missing - mixed strategies" begin |
| 126 | + ht_mix = impute_missing(base_ht; cols=[:systolic_bp => :mean, :diastolic_bp => :median]) |
| 127 | + @test ht_mix.source.systolic_bp[4] ≈ mean(skipmissing(base_ht.source.systolic_bp)) |
| 128 | + @test ht_mix.source.diastolic_bp[4] == median(skipmissing(base_ht.source.diastolic_bp)) |
| 129 | + end |
| 130 | + |
| 131 | + @testset "apply_vocabulary_compression" begin |
| 132 | + ht_comp = apply_vocabulary_compression(base_ht; cols=[:condition_source_value], min_freq=2, other_label="Other") |
| 133 | + @test ht_comp.source.condition_source_value == [ |
| 134 | + "Hypertension", "Other", "Asthma", "Asthma", "Hypertension", "Other", "Other", "Other", "Other", "Other" |
| 135 | + ] |
| 136 | + end |
| 137 | + |
| 138 | + @testset "map_concepts" begin |
| 139 | + mapping = Dict( |
| 140 | + 316866 => "Hypertension", |
| 141 | + 201826 => "Diabetes", |
| 142 | + 317009 => "Asthma", |
| 143 | + 317010 => "Asthma" |
| 144 | + ) |
| 145 | + ht_m1 = map_concepts(base_ht; col=:condition_concept_id, mapping=mapping, new_col=:condition_group) |
| 146 | + @test ht_m1.source.condition_group == [ |
| 147 | + "Hypertension", "Diabetes", "Asthma", "Asthma", "Hypertension", 317707, 4329058, 1234567, 2345678, 3456789 |
| 148 | + ] # ids absent from `mapping` are expected to appear unchanged in the new column |
| 149 | + @test :condition_concept_id in Symbol.(names(ht_m1.source)) |
| 150 | + ht_m2 = map_concepts(base_ht; col=:condition_concept_id, mapping=mapping, new_col=:condition_group, drop_original=true) |
| 151 | + @test "condition_concept_id" ∉ names(ht_m2.source) # names(::DataFrame) yields Strings; a Symbol operand would make this check pass unconditionally |
| 152 | + @test ht_m2.source.condition_group[1] == "Hypertension" |
102 | 153 | end |
103 | 154 | end |
104 | 155 |
|
|
0 commit comments