|
 from functools import partial
 
 import numpy as np
+from scipy.stats import FitError, norm, weibull_min
 
 from mpest.core.distribution import Distribution
 from mpest.core.mixture_distribution import MixtureDistribution
 from mpest.core.problem import Problem, Result
 from mpest.em.methods.abstract_steps import AExpectation, AMaximization
-from mpest.exceptions import EStepError
-from mpest.models import AModel, AModelDifferentiable
+from mpest.exceptions import EStepError, SampleError
+from mpest.models import AModel, AModelDifferentiable, WeibullModelExp
 from mpest.optimizers import AOptimizerJacobian, TOptimizer
 from mpest.utils import ResultWithError
 
 EResult = tuple[Problem, np.ndarray] | ResultWithError[MixtureDistribution]
-
 class BayesEStep(AExpectation[EResult]):
     """
     Class which represents Bayesian method for calculating matrix for M step in likelihood method
@@ -69,17 +69,190 @@ def step(self, problem: Problem) -> EResult:
         return new_problem, h
 
 
-# class ML(AExpectation[EResult]):
-#     """
-#     Class which represents ML method for calculating matrix for M step in likelihood method
-#     """
-#
-#     def step(self, problem: Problem) -> EResult:
-#         """
-#         A function that performs E step
-#
-#         :param problem: Object of class Problem, which contains samples and mixture.
-#         """
+class ClusteringEStep(AExpectation[EResult]):
+    """
+    E step that uses clustering to (re)initialize the mixture parameters.
+    Supported clusterizers: DBSCAN (dbscan), Agglomerative (agglo), KMeans (kmeans).
+    Use accurate_init for more accurate parameter estimates for each individual
+    component (recommended for mixtures of several different distribution families).
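+
+    Example (a sketch, not part of the library API; assumes a scikit-learn
+    clusterizer exposing ``fit_predict`` and default-constructible models)::
+
+        from sklearn.cluster import KMeans
+
+        e_step = ClusteringEStep(
+            models=[WeibullModelExp(), WeibullModelExp()],
+            clusterizer=KMeans(n_clusters=2),
+        )
+        result = e_step.step(problem)  # (Problem, ndarray) or ResultWithError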
+    """
+
+    MIN_SAMPLES = 2  # smallest cluster considered when matching clusters to models
+    MIN_PROB = 1e-100  # probability floor to avoid log(0) and division by zero
+    MIN_COMPONENT_SIZE = 10  # bootstrap sample size for empty or unmatched components
+
+    def __init__(self, models: list[AModel], clusterizer, eps: float = 0.3, accurate_init: bool = False) -> None:
+        self._n_components = len(models)
+        self._models = models
+        self._initialized = False
+        self._current_mixture = MixtureDistribution([])
+        self._eps = eps
+        self._accurate_init_flag = accurate_init
+        self._clusterizer = clusterizer
+
+    @staticmethod
+    def _estimate_weibull_params(data: np.ndarray) -> list[float]:
+        """Robust Weibull parameter estimation using MLE with the location fixed at zero."""
+        try:
+            # weibull_min.fit returns (shape, loc, scale); loc is pinned to 0
+            params = weibull_min.fit(data, floc=0)
+            return [float(params[0]), float(params[2])]
+        except (ValueError, TypeError, FitError):
+            # fallback: heavy-tailed shape guess with the sample mean as scale
+            return [0.5, float(np.mean(data))]
+
+    def _find_best_cluster_for_model(
+        self, clusters: dict[int, np.ndarray], model: AModel
+    ) -> tuple[int | None, list[float] | None, float]:
+        """Find the cluster whose data maximizes the log-likelihood under the given model."""
+        best_k, best_params, best_score = None, None, -np.inf
+        for k, X_k in clusters.items():
+            if len(X_k) < self.MIN_SAMPLES:
+                continue
+            try:
+                if isinstance(model, WeibullModelExp):
+                    params = self._estimate_weibull_params(X_k)
+                    params_arr = np.clip(params, [0.1, 0.1], [2.0, 1000.0])
+                    params = [float(params_arr[0]), float(params_arr[1])]
+                    # params is [shape, scale]; pass scale by keyword so it is
+                    # not consumed as the loc argument of weibull_min.pdf
+                    score = np.sum(np.log(weibull_min.pdf(X_k, params[0], scale=params[1])))
+                else:
+                    mean = np.mean(X_k)
+                    std = np.clip(np.std(X_k), 0.1, 100.0)
+                    params = [mean, std]
+                    score = np.sum(np.log(norm.pdf(X_k, mean, std)))
+                if score > best_score:
+                    best_score = score
+                    best_k = k
+                    best_params = params
+            except ValueError:
+                continue
+        return best_k, best_params, best_score
+
+    def _accurate_init(self, X: np.ndarray, labels: np.ndarray) -> tuple[list[tuple[AModel, list[float]]], list[float]]:
+        """Greedily assign each model the best-fitting remaining cluster (each cluster is used at most once)."""
+        clusters = {k: X[labels == k] for k in range(self._n_components)}
+        distributions: list[tuple[AModel, list[float]]] = []
+        weights: list[float] = []
+        for model in self._models:
+            best_k, best_params, _ = self._find_best_cluster_for_model(clusters, model)
+
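+            # No remaining cluster fits this model: fall back to estimating
+            # its parameters from a bootstrap resample of the whole dataset.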
+            if best_k is None or best_params is None:
+                X_k = np.random.choice(X, size=self.MIN_COMPONENT_SIZE, replace=True)
+                weight = 1.0 / self._n_components
+                if isinstance(model, WeibullModelExp):
+                    best_params = self._estimate_weibull_params(X_k)
+                else:
+                    best_params = [float(np.mean(X_k)), float(np.std(X_k))]
+            else:
+                weight = len(clusters[best_k]) / len(X)
+                clusters.pop(best_k)
+
+            distributions.append((model, best_params))
+            weights.append(float(weight))
+
+        return distributions, weights
+
+    def _fast_init(self, X: np.ndarray, labels: np.ndarray) -> tuple[list[tuple[AModel, list[float]]], list[float]]:
+        """Estimate the k-th model's parameters directly from the k-th cluster."""
+        distributions: list[tuple[AModel, list[float]]] = []
+        weights: list[float] = []
+        for k in range(self._n_components):
+            X_k = X[labels == k]
+            weight = len(X_k) / len(X)
+
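+            # Empty cluster: fall back to a bootstrap resample of the whole
+            # dataset with a uniform component weight.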
+            if len(X_k) == 0:
+                X_k = np.random.choice(X, size=self.MIN_COMPONENT_SIZE, replace=True)
+                weight = 1.0 / self._n_components
+
+            model = self._models[k]
+            if isinstance(model, WeibullModelExp):
+                params = self._estimate_weibull_params(X_k)
+                params = [float(p) for p in np.clip(params, [0.1, 0.1], [2.0, 1000.0])]
+            else:
+                mean = np.mean(X_k)
+                std = np.clip(np.std(X_k), 0.1, 100.0)
+                params = [mean, std]
+
+            distributions.append((model, params))
+            weights.append(float(weight))
+        return distributions, weights
+
+    def _initialize_distributions(self, X: np.ndarray, labels: np.ndarray) -> MixtureDistribution:
+        """Build the initial mixture with distribution-aware parameter estimation."""
+        if self._accurate_init_flag:
+            distributions, weights = self._accurate_init(X, labels)
+        else:
+            distributions, weights = self._fast_init(X, labels)
+
+        total_weight = sum(weights)
+        normalized_weights: list[float | None] | None = [w / total_weight for w in weights]
+        self._current_mixture = MixtureDistribution.from_distributions(
+            [Distribution.from_params(model.__class__, params) for model, params in distributions],
+            normalized_weights,
+        )
+        self._initialized = True
+        return self._current_mixture
+
+    def _clusterize(self, X: np.ndarray, clusterizer) -> np.ndarray:
+        if hasattr(clusterizer, "n_clusters") and self._n_components != clusterizer.n_clusters:
+            raise EStepError("Count of components and clusters doesn't match.")
+        X = X.reshape(-1, 1)
+        labels = clusterizer.fit_predict(X)
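+        # DBSCAN marks noise points with the label -1; reassign them to
+        # random components so every sample belongs to some cluster.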
+        if -1 in labels:
+            labels[labels == -1] = np.random.choice(self._n_components, np.sum(labels == -1))
+        return labels
+
+    def step(self, problem: Problem) -> EResult:
+        """
+        A function that performs the E step with improved numerical stability.
+
+        :param problem: Object of class Problem, which contains samples and mixture.
+        """
+        samples = problem.samples
+        if not self._initialized:
+            try:
+                labels = self._clusterize(samples, self._clusterizer)
+                mixture_dist = self._initialize_distributions(samples, labels)
+            except EStepError as e:
+                return ResultWithError(problem.distributions, e)
+        else:
+            mixture_dist = problem.distributions
+
+        p_xij = []
+        active_samples = []
+
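+        # Evaluate every component density at every sample, flooring values at
+        # MIN_PROB; samples where no component exceeds the floor are dropped.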
+        for x in samples:
+            p = np.zeros(len(mixture_dist.distributions))
+            for i, d in enumerate(mixture_dist.distributions):
+                try:
+                    pdf_val = d.model.pdf(x, d.params)
+                    p[i] = max(pdf_val, self.MIN_PROB)
+                except ValueError:
+                    p[i] = self.MIN_PROB
+
+            if np.any(p > self.MIN_PROB):
+                p_xij.append(p)
+                active_samples.append(x)
+
+        if not active_samples:
+            error = SampleError("None of the sample elements is valid for this mixture.")
+            return ResultWithError(mixture_dist, error)
+
+        m = len(p_xij)
+        k = len(mixture_dist.distributions)
+        h = np.zeros([k, m], dtype=float)
+        curr_w = np.array([d.prior_probability or (1.0 / k) for d in mixture_dist.distributions])
+        curr_w /= curr_w.sum()
+
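+        # Responsibilities: h[j, i] = w_j * p_j(x_i) / sum_l(w_l * p_l(x_i))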
+        for i, p in enumerate(p_xij):
+            wp = curr_w * p
+            swp = np.sum(wp)
+
+            if swp < self.MIN_PROB:
+                # all densities underflowed: fall back to the current weights,
+                # which were already normalized above
+                h[:, i] = curr_w
+            else:
+                h[:, i] = wp / swp
+
+        new_problem = Problem(np.array(active_samples), mixture_dist)
+        return new_problem, h
 
 
 class LikelihoodMStep(AMaximization[EResult]):