From 9b58297d956a360fa4c0c57c931369281189c8e2 Mon Sep 17 00:00:00 2001 From: dmt <> Date: Thu, 10 Oct 2019 17:36:54 +0200 Subject: [PATCH] Copy and Paste ckmeans algorithms. --- cml/domain/data_source.py | 141 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/cml/domain/data_source.py b/cml/domain/data_source.py index a1717a6..460c36a 100644 --- a/cml/domain/data_source.py +++ b/cml/domain/data_source.py @@ -100,3 +100,144 @@ class LearnblockIdentifier: already_seen.add(value_pair) kw = {args[0]: value_pair[0], args[1]: value_pair[1]} yield block.get_values(**kw) + + def _get_sigma_zeta_relatives(self, block, **kw): + relatives = block.get_values(**kw) + time_column = relatives.get_column_values("T") + density = self.density_estimator.train(time_column).density() + self._remove_time_dense_relatives(relatives, density) + clusters = self._cluster_sigma_zeta_realtives(relatives, density) + for time_values in clusters: + yield relatives.new_block_from(time_values) + + def _remove_time_dense_relatives(self, block, density): + max_dens = max(density) + for index, dens in enumerate(density): + if dens > max_dens*(self.settings.sigma_zeta_cutoff/100): + block.drop_row(index) + + def _cluster_sigma_zeta_realtives(self, cutted_block, density): + # TOOD (dmt): Don't rely on data series from pandas, 'cause ckmeans + # needs primitives data types. + + time_column = list(cutted_block.get_column_values("T")) + _, maxs = self._relative_extrema(density) + return ckmeans(time_column, len(maxs)) + + +# TODO (dmt): Refactor Ckmeans algorithm! +# Resource: https://journal.r-project.org/archive/2011-2/RJournal_2011-2_Wang+Song.pdf +def ssq(j, i, sum_x, sum_x_sq): + if j > 0: + muji = (sum_x[i] - sum_x[j-1]) / (i - j + 1) + sji = sum_x_sq[i] - sum_x_sq[j-1] - (i - j + 1) * muji ** 2 + else: + sji = sum_x_sq[i] - sum_x[i] ** 2 / (i+1) + + return 0 if sji < 0 else sji + + +def fill_row_k(imin, imax, k, S, J, sum_x, sum_x_sq, N): + if imin > imax: return + + i = (imin+imax) // 2 + S[k][i] = S[k-1][i-1] + J[k][i] = i + + jlow = k + + if imin > k: + jlow = int(max(jlow, J[k][imin-1])) + jlow = int(max(jlow, J[k-1][i])) + + jhigh = i-1 + if imax < N-1: + jhigh = int(min(jhigh, J[k][imax+1])) + + for j in range(jhigh, jlow-1, -1): + sji = ssq(j, i, sum_x, sum_x_sq) + + if sji + S[k-1][jlow-1] >= S[k][i]: break + + # Examine the lower bound of the cluster border + # compute s(jlow, i) + sjlowi = ssq(jlow, i, sum_x, sum_x_sq) + + SSQ_jlow = sjlowi + S[k-1][jlow-1] + + if SSQ_jlow < S[k][i]: + S[k][i] = SSQ_jlow + J[k][i] = jlow + + jlow += 1 + + SSQ_j = sji + S[k-1][j-1] + if SSQ_j < S[k][i]: + S[k][i] = SSQ_j + J[k][i] = j + + fill_row_k(imin, i-1, k, S, J, sum_x, sum_x_sq, N) + fill_row_k(i+1, imax, k, S, J, sum_x, sum_x_sq, N) + + +def fill_dp_matrix(data, S, J, K, N): + import numpy as np + sum_x = np.zeros(N, dtype=np.float_) + sum_x_sq = np.zeros(N, dtype=np.float_) + + # median. used to shift the values of x to improve numerical stability + shift = data[N//2] + + for i in range(N): + if i == 0: + sum_x[0] = data[0] - shift + sum_x_sq[0] = (data[0] - shift) ** 2 + else: + sum_x[i] = sum_x[i-1] + data[i] - shift + sum_x_sq[i] = sum_x_sq[i-1] + (data[i] - shift) ** 2 + + S[0][i] = ssq(0, i, sum_x, sum_x_sq) + J[0][i] = 0 + + for k in range(1, K): + if k < K-1: + imin = max(1, k) + else: + imin = N-1 + + fill_row_k(imin, N-1, k, S, J, sum_x, sum_x_sq, N) + + +def ckmeans(data, n_clusters): + import numpy as np + if n_clusters <= 0: + raise ValueError("Cannot classify into 0 or less clusters") + if n_clusters > len(data): + raise ValueError("Cannot generate more classes than there are data values") + + # if there's only one value, return it; there's no sensible way to split + # it. This means that len(ckmeans([data], 2)) may not == 2. Is that OK? + unique = len(set(data)) + if unique == 1: + return [data] + + data.sort() + n = len(data) + + S = np.zeros((n_clusters, n), dtype=np.float_) + + J = np.zeros((n_clusters, n), dtype=np.uint64) + + fill_dp_matrix(data, S, J, n_clusters, n) + + clusters = [] + cluster_right = n-1 + + for cluster in range(n_clusters-1, -1, -1): + cluster_left = int(J[cluster][cluster_right]) + clusters.append(data[cluster_left:cluster_right+1]) + + if cluster > 0: + cluster_right = cluster_left - 1 + + return list(reversed(clusters)) -- GitLab