Commit 3b93328

feat: add clusterization E step

1 parent fecb2e1

File tree

10 files changed: +809 −14 lines


examples/readme_example/example_ml.py

Lines changed: 471 additions & 0 deletions
Large diffs are not rendered by default.
Six image files added (example plots; 706 KB, 757 KB, 628 KB, 153 KB, 153 KB, 161 KB). Previews not shown.

mpest/em/methods/likelihood_method.py

Lines changed: 187 additions & 14 deletions
@@ -3,19 +3,19 @@
 from functools import partial
 
 import numpy as np
+from scipy.stats import FitError, norm, weibull_min
 
 from mpest.core.distribution import Distribution
 from mpest.core.mixture_distribution import MixtureDistribution
 from mpest.core.problem import Problem, Result
 from mpest.em.methods.abstract_steps import AExpectation, AMaximization
-from mpest.exceptions import EStepError
-from mpest.models import AModel, AModelDifferentiable
+from mpest.exceptions import EStepError, SampleError
+from mpest.models import AModel, AModelDifferentiable, WeibullModelExp
 from mpest.optimizers import AOptimizerJacobian, TOptimizer
 from mpest.utils import ResultWithError
 
 EResult = tuple[Problem, np.ndarray] | ResultWithError[MixtureDistribution]
 
-
 class BayesEStep(AExpectation[EResult]):
     """
     Class which represents Bayesian method for calculating matrix for M step in likelihood method
@@ -69,17 +69,190 @@ def step(self, problem: Problem) -> EResult:
         return new_problem, h
 
 
-# class ML(AExpectation[EResult]):
-#     """
-#     Class which represents ML method for calculating matrix for M step in likelihood method
-#     """
-#
-#     def step(self, problem: Problem) -> EResult:
-#         """
-#         A function that performs E step
-#
-#         :param problem: Object of class Problem, which contains samples and mixture.
-#         """
+class ClusteringEStep(AExpectation[EResult]):
+    """
+    E step that uses clustering methods to recalculate mixture parameters.
+    Supported methods: DBSCAN (dbscan), Agglomerative (agglo), KMeans (kmeans).
+    Use accurate_init for the most accurate parameter values for each individual
+    component (recommended for mixtures of several different distributions).
+    """
+
+    MIN_SAMPLES = 2
+    MIN_PROB = 1e-100
+    MIN_COMPONENT_SIZE = 10
+
+    def __init__(self, models: list[AModel], clusterizer, eps: float = 0.3, accurate_init: bool = False) -> None:
+        self._n_components = len(models)
+        self._models = models
+        self._initialized = False
+        self._current_mixture = MixtureDistribution([])
+        self._eps = eps
+        self._accurate_init_flag = accurate_init
+        self._clusterizer = clusterizer
+
+    @staticmethod
+    def _estimate_weibull_params(data: np.ndarray) -> list[float]:
+        """Robust Weibull parameter estimation using MLE; falls back to a heuristic on failure"""
+        try:
+            params = weibull_min.fit(data, floc=0)  # returns (shape, loc, scale) with loc fixed at 0
+            return [float(params[0]), float(params[2])]
+        except (ValueError, TypeError, FitError):
+            return [0.5, float(np.mean(data))]
+
+    def _find_best_cluster_for_model(self, clusters: dict[int, np.ndarray],
+                                     model: AModel) -> tuple[int | None, list[float] | None, float]:
+        best_k, best_params, best_score = None, None, -np.inf
+        for k, X_k in clusters.items():
+            if len(X_k) < self.MIN_SAMPLES:
+                continue
+            try:
+                if isinstance(model, WeibullModelExp):
+                    params = self._estimate_weibull_params(X_k)
+                    params_arr = np.clip(params, [0.1, 0.1], [2.0, 1000.0])
+                    params = [float(params_arr[0]), float(params_arr[1])]
+                    # scale must be passed by keyword: the second positional argument of pdf is loc
+                    score = np.sum(np.log(weibull_min.pdf(X_k, params[0], scale=params[1])))
+                else:
+                    mean = np.mean(X_k)
+                    std = np.clip(np.std(X_k), 0.1, 100.0)
+                    params = [mean, std]
+                    score = np.sum(np.log(norm.pdf(X_k, mean, std)))
+                if score > best_score:
+                    best_score = score
+                    best_k = k
+                    best_params = params
+            except ValueError:
+                continue
+        return best_k, best_params, best_score
+
+    def _accurate_init(self, X: np.ndarray, labels: np.ndarray) -> tuple[list[tuple[AModel, list[float]]], list[float]]:
+        clusters = {k: X[labels == k] for k in range(self._n_components)}
+        distributions: list[tuple[AModel, list[float]]] = []
+        weights: list[float] = []
+        for model in self._models:
+            best_k, best_params, _ = self._find_best_cluster_for_model(clusters, model)
+
+            if best_k is None or best_params is None:
+                # no cluster fits this model: bootstrap a small resample and use default estimates
+                X_k = np.random.choice(X, size=self.MIN_COMPONENT_SIZE, replace=True)
+                weight = 1.0 / self._n_components
+                if isinstance(model, WeibullModelExp):
+                    best_params = self._estimate_weibull_params(X_k)
+                else:
+                    best_params = [np.mean(X_k), np.std(X_k)]
+            else:
+                weight = len(clusters[best_k]) / len(X)
+                clusters.pop(best_k)  # each cluster is assigned to at most one model
+
+            distributions.append((model, best_params))
+            weights.append(float(weight))
+
+        return distributions, weights
+
+    def _fast_init(self, X: np.ndarray, labels: np.ndarray) -> tuple[list[tuple[AModel, list[float]]], list[float]]:
+        distributions: list[tuple[AModel, list[float]]] = []
+        weights: list[float] = []
+        for k in range(self._n_components):
+            X_k = X[labels == k]
+            weight = len(X_k) / len(X)
+
+            if len(X_k) == 0:
+                X_k = np.random.choice(X, size=self.MIN_COMPONENT_SIZE, replace=True)
+                weight = 1.0 / self._n_components
+
+            model = self._models[k]
+            if isinstance(model, WeibullModelExp):
+                params = self._estimate_weibull_params(X_k)
+                params = [float(p) for p in np.clip(params, [0.1, 0.1], [2.0, 1000.0])]
+            else:
+                mean = np.mean(X_k)
+                std = np.clip(np.std(X_k), 0.1, 100.0)
+                params = [mean, std]
+
+            distributions.append((model, params))
+            weights.append(float(weight))
+        return distributions, weights
+
+    def _initialize_distributions(self, X: np.ndarray, labels: np.ndarray) -> MixtureDistribution:
+        """Initialization with distribution-aware parameter estimation"""
+        if self._accurate_init_flag:
+            distributions, weights = self._accurate_init(X, labels)
+        else:
+            distributions, weights = self._fast_init(X, labels)
+
+        total_weight = sum(weights)
+        normalized_weights: list[float | None] | None = [w / total_weight for w in weights]
+        self._current_mixture = MixtureDistribution.from_distributions(
+            [Distribution.from_params(dist[0].__class__, dist[1]) for dist in distributions],
+            normalized_weights,
+        )
+        self._initialized = True
+        return self._current_mixture
+
+    def _clusterize(self, X: np.ndarray, clusterizer) -> np.ndarray:
+        if hasattr(clusterizer, "n_clusters") and self._n_components != clusterizer.n_clusters:
+            raise EStepError("Count of components and clusters doesn't match.")
+        X = X.reshape(-1, 1)
+        labels = clusterizer.fit_predict(X)
+        if -1 in labels:
+            # DBSCAN marks noise points with -1; assign them to random components
+            labels[labels == -1] = np.random.choice(range(self._n_components), np.sum(labels == -1))
+        return labels
+
+    def step(self, problem: Problem) -> EResult:
+        """E step with improved numerical stability"""
+        samples = problem.samples
+        if not self._initialized:
+            try:
+                labels = self._clusterize(samples, self._clusterizer)
+                mixture_dist = self._initialize_distributions(samples, labels)
+            except EStepError as e:
+                return ResultWithError(problem.distributions, e)
+        else:
+            mixture_dist = problem.distributions
+
+        p_xij = []
+        active_samples = []
+
+        for x in samples:
+            p = np.zeros(len(mixture_dist.distributions))
+            for i, d in enumerate(mixture_dist.distributions):
+                try:
+                    pdf_val = d.model.pdf(x, d.params)
+                    p[i] = max(pdf_val, self.MIN_PROB)
+                except ValueError:
+                    p[i] = self.MIN_PROB
+
+            if np.any(p > self.MIN_PROB):
+                p_xij.append(p)
+                active_samples.append(x)
+
+        if not active_samples:
+            error = SampleError("None of the elements in the sample is correct for this mixture")
+            return ResultWithError(mixture_dist, error)
+
+        m = len(p_xij)
+        k = len(mixture_dist.distributions)
+        h = np.zeros([k, m], dtype=float)
+        curr_w = np.array([d.prior_probability or (1.0 / k) for d in mixture_dist.distributions])
+        curr_w /= curr_w.sum()
+
+        for i, p in enumerate(p_xij):
+            wp = curr_w * p
+            swp = np.sum(wp)
+
+            if swp < self.MIN_PROB:
+                # degenerate case: all densities underflow, fall back to the current weights
+                h[:, i] = curr_w / np.sum(curr_w)
+            else:
+                h[:, i] = wp / swp
+
+        new_problem = Problem(np.array(active_samples), mixture_dist)
+        return new_problem, h
 
 
 class LikelihoodMStep(AMaximization[EResult]):
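
For readers skimming the diff: the responsibility matrix h built at the end of step() is the standard EM E step. With component weights w_j (prior_probability, uniform if unset) and component densities p_j, each column is

h_{ji} = \frac{w_j \, p_j(x_i)}{\sum_{l=1}^{k} w_l \, p_l(x_i)},

with densities floored at MIN_PROB for numerical stability.

A minimal usage sketch (not part of this commit's diff; it mirrors the API exercised in tests/test_ml_estep/test_ml.py below, and the synthetic sample is a hypothetical stand-in for real data):

import numpy as np
from sklearn.cluster import KMeans

from mpest import Distribution, MixtureDistribution, Problem
from mpest.em.methods.likelihood_method import ClusteringEStep
from mpest.models import GaussianModel, WeibullModelExp
from mpest.utils import ResultWithError

models = [WeibullModelExp(), GaussianModel()]
e_step = ClusteringEStep(models, KMeans(n_clusters=len(models)))

samples = np.abs(np.random.default_rng(42).normal(5.0, 2.0, 500))  # hypothetical positive-valued data
mixture = MixtureDistribution.from_distributions(
    [
        Distribution.from_params(WeibullModelExp, [1.0, 1.0]),
        Distribution.from_params(GaussianModel, [0.0, 1.0]),
    ],
    [0.5, 0.5],
)
result = e_step.step(Problem(samples, mixture))

if isinstance(result, ResultWithError):
    print("E step failed")
else:
    new_problem, h = result  # h is the k-by-m responsibility matrix
    print(h.shape)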

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ pre-commit = "^4.1.0"
 ruff = "^0.9.6"
 mypy = "==1.13.0"
 sphinx = "^8.2.0"
+hypothesis = "^6.98.0"
 
 [tool.poetry.group.experiments.dependencies]
 tqdm = "^4.67.1"
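
The new hypothesis dev dependency supports the property-based test suite added below.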

tests/test_ml_estep/test_ml.py

Lines changed: 150 additions & 0 deletions
@@ -0,0 +1,150 @@
+import numpy as np
+import pytest
+from hypothesis import assume, given
+from hypothesis import strategies as st
+from sklearn.cluster import KMeans
+
+from mpest import Distribution, MixtureDistribution, Problem
+from mpest.em.methods.likelihood_method import ClusteringEStep
+from mpest.models import GaussianModel, WeibullModelExp
+from mpest.utils import ResultWithError
+
+WEIBULL_PARAMS_COUNT = 2
+MIN_COMPONENT_SIZE = 10
+
+
+def valid_weibull_data():
+    return st.lists(
+        st.floats(min_value=0.1, max_value=100, allow_nan=False, allow_infinity=False),
+        min_size=10, max_size=1000
+    ).map(np.array)
+
+
+def valid_gaussian_data():
+    return st.lists(
+        st.floats(min_value=-100, max_value=100, allow_nan=False, allow_infinity=False),
+        min_size=10, max_size=1000
+    ).map(np.array)
+
+
+def mixed_data():
+    return st.one_of(valid_weibull_data(), valid_gaussian_data())
+
+
+def mixture_problems():
+    return st.builds(
+        lambda samples: Problem(
+            samples,
+            MixtureDistribution.from_distributions([
+                Distribution.from_params(WeibullModelExp, [1.0, 1.0]),
+                Distribution.from_params(GaussianModel, [0.0, 1.0]),
+            ], [0.5, 0.5])
+        ),
+        mixed_data()
+    )
+
+
+class TestClusteringEStepInitialization:
+    @given(st.lists(st.sampled_from([WeibullModelExp(), GaussianModel()])),
+           st.integers(min_value=0, max_value=1000))
+    def test_initialization(self, models, labels_seed):
+        assume(len(models) > 0)
+        np.random.seed(labels_seed)
+        labels = np.random.randint(0, len(models), size=100)
+        ml = ClusteringEStep(models, labels)
+        assert ml._n_components == len(models)
+        assert len(ml._models) == len(models)
+        assert not ml._initialized
+        assert ml._current_mixture.distributions == []
+
+
+class TestWeibullParamEstimation:
+
+    @given(valid_weibull_data())
+    def test_weibull_param_estimation(self, data):
+        models = [WeibullModelExp(), GaussianModel()]
+        ml = ClusteringEStep(models, np.zeros(len(data), dtype=int))
+        params = ml._estimate_weibull_params(data)
+        assert len(params) == WEIBULL_PARAMS_COUNT
+        assert params[0] > 0
+        assert params[1] > 0
+
+    @given(st.lists(st.floats(min_value=0, max_value=0, allow_nan=False), min_size=1))
+    def test_weibull_param_estimation_with_bad_data(self, data):
+        models = [WeibullModelExp(), GaussianModel()]
+        ml = ClusteringEStep(models, np.zeros(len(data), dtype=int))
+        params = ml._estimate_weibull_params(np.array(data))
+        assert params[0] > 0
+        assert isinstance(params[1], float)
+
+
+class TestDistributionInitialization:
+    @given(mixed_data(), st.booleans())
+    def test_initialization(self, data, accurate_init):
+        assume(len(data) >= MIN_COMPONENT_SIZE)
+        models = [WeibullModelExp(), GaussianModel()]
+        labels = np.random.randint(0, len(models), size=len(data))
+        ml = ClusteringEStep(models, labels, accurate_init=accurate_init)
+        mixture = ml._initialize_distributions(data, labels)
+        assert len(mixture.distributions) == len(models)
+        for dist in mixture.distributions:
+            assert dist.params is not None
+            if isinstance(dist.model, WeibullModelExp):
+                assert dist.params[0] > 0
+                assert dist.params[1] > 0
+            else:
+                assert dist.params[1] > 0
+
+
+class TestEStep:
+    @given(mixture_problems())
+    def test_e_step(self, problem):
+        models = [WeibullModelExp(), GaussianModel()]
+        clusterizer = KMeans(n_clusters=len(models))
+        ml = ClusteringEStep(models, clusterizer)
+        result = ml.step(problem)
+
+        if isinstance(result, ResultWithError):
+            pytest.fail("Unexpected error in E-step")
+        else:
+            new_problem, h = result
+            assert len(new_problem.samples) <= len(problem.samples)
+            assert h.shape[0] == len(models)
+            if len(new_problem.samples) > 0:
+                assert h.shape[1] == len(new_problem.samples)
+                for col in h.T:
+                    assert pytest.approx(1.0, abs=1e-6) == sum(col)
+
+    @given(st.lists(st.floats(min_value=0, max_value=1, allow_nan=False)),
+           st.integers(min_value=1, max_value=10))
+    def test_e_step_with_empty_cluster(self, data, n_components):
+        data = np.array(data)
+        assume(len(data) >= n_components)
+        models = [WeibullModelExp() if i % 2 else GaussianModel() for i in range(n_components)]
+        clusterizer = KMeans(n_clusters=len(models))
+        ml = ClusteringEStep(models, clusterizer)
+
+        initial_mixture = MixtureDistribution.from_distributions(
+            [Distribution.from_params(model.__class__, [1.0, 1.0]) for model in models]
+        )
+        problem = Problem(data, initial_mixture)
+
+        result = ml.step(problem)
+        if isinstance(result, ResultWithError):
+            pytest.fail("Unexpected error in E-step")
+        else:
+            new_problem, h = result
+            assert h.shape == (n_components, len(new_problem.samples))
+
+
+class TestEdgeCases:
+    @given(mixed_data())
+    def test_single_component(self, data):
+        assume(len(data) >= MIN_COMPONENT_SIZE)
+        models = [WeibullModelExp()]
+        labels = np.zeros(len(data), dtype=int)
+        ml = ClusteringEStep(models, labels)
+        mixture = ml._initialize_distributions(data, labels)
+        assert len(mixture.distributions) == 1
+        assert mixture.distributions[0].params[0] > 0
+        assert mixture.distributions[0].params[1] > 0
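
Assuming a standard pytest setup (nothing required beyond the hypothesis and scikit-learn dependencies above), the new suite can be run with:

python -m pytest tests/test_ml_estep/test_ml.py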
