From 9b58297d956a360fa4c0c57c931369281189c8e2 Mon Sep 17 00:00:00 2001
From: dmt <>
Date: Thu, 10 Oct 2019 17:36:54 +0200
Subject: [PATCH] Copy and Paste ckmeans algorithms.

---
 cml/domain/data_source.py | 160 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 160 insertions(+)

diff --git a/cml/domain/data_source.py b/cml/domain/data_source.py
index a1717a6..460c36a 100644
--- a/cml/domain/data_source.py
+++ b/cml/domain/data_source.py
@@ -100,3 +100,163 @@ class LearnblockIdentifier:
                 already_seen.add(value_pair)
                 kw = {args[0]: value_pair[0], args[1]: value_pair[1]}
                 yield block.get_values(**kw)
+
+    def _get_sigma_zeta_relatives(self, block, **kw):
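+        # Estimate the density of the time column, drop rows from overly
+        # dense time regions, then yield one sub-block per time cluster.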
+        relatives = block.get_values(**kw)
+        time_column = relatives.get_column_values("T")
+        density = self.density_estimator.train(time_column).density()
+        self._remove_time_dense_relatives(relatives, density)
+        clusters = self._cluster_sigma_zeta_relatives(relatives, density)
+        for time_values in clusters:
+            yield relatives.new_block_from(time_values)
+
+    def _remove_time_dense_relatives(self, block, density):
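+        # Drop every row whose estimated time density exceeds
+        # sigma_zeta_cutoff percent of the maximum density.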
+        max_dens = max(density)
+        for index, dens in enumerate(density):
+            if dens > max_dens*(self.settings.sigma_zeta_cutoff/100):
+                block.drop_row(index)
+
+    def _cluster_sigma_zeta_relatives(self, trimmed_block, density):
+        # TODO (dmt): Don't rely on data series from pandas, because ckmeans
+        # needs primitive data types.
+
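+        # Cluster the remaining time values with ckmeans, using as many
+        # clusters as there are relative maxima in the density estimate.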
+        time_column = list(trimmed_block.get_column_values("T"))
+        _, maxs = self._relative_extrema(density)
+        return ckmeans(time_column, len(maxs))
+
+
+# TODO (dmt): Refactor Ckmeans algorithm!
+# Resource: https://journal.r-project.org/archive/2011-2/RJournal_2011-2_Wang+Song.pdf
+def ssq(j, i, sum_x, sum_x_sq):
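+    # Within-cluster sum of squared deviations of elements j..i (inclusive),
+    # computed in O(1) from the prefix sums; clamped at 0 against round-off.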
+    if j > 0:
+        muji = (sum_x[i] - sum_x[j-1]) / (i - j + 1)
+        sji = sum_x_sq[i] - sum_x_sq[j-1] - (i - j + 1) * muji ** 2
+    else:
+        sji = sum_x_sq[i] - sum_x[i] ** 2 / (i+1)
+
+    return 0 if sji < 0 else sji
+
+
+def fill_row_k(imin, imax, k, S, J, sum_x, sum_x_sq, N):
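+    # Divide-and-conquer fill of DP row k: S[k][i] is the minimal total
+    # within-cluster sum of squares for data[0..i] split into k+1 clusters,
+    # and J[k][i] is the start index of the last cluster in that split.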
+    if imin > imax: return
+
+    i = (imin+imax) // 2
+    S[k][i] = S[k-1][i-1]
+    J[k][i] = i
+
+    jlow = k
+
+    if imin > k:
+        jlow = int(max(jlow, J[k][imin-1]))
+    jlow = int(max(jlow, J[k-1][i]))
+
+    jhigh = i-1
+    if imax < N-1:
+        jhigh = int(min(jhigh, J[k][imax+1]))
+
+    for j in range(jhigh, jlow-1, -1):
+        sji = ssq(j, i, sum_x, sum_x_sq)
+
+        if sji + S[k-1][jlow-1] >= S[k][i]: break
+
+        # Examine the lower bound of the cluster border
+        # compute s(jlow, i)
+        sjlowi = ssq(jlow, i, sum_x, sum_x_sq)
+
+        SSQ_jlow = sjlowi + S[k-1][jlow-1]
+
+        if SSQ_jlow < S[k][i]:
+            S[k][i] = SSQ_jlow
+            J[k][i] = jlow
+
+        jlow += 1
+
+        SSQ_j = sji + S[k-1][j-1]
+        if SSQ_j < S[k][i]:
+            S[k][i] = SSQ_j
+            J[k][i] = j
+
+    fill_row_k(imin, i-1, k, S, J, sum_x, sum_x_sq, N)
+    fill_row_k(i+1, imax, k, S, J, sum_x, sum_x_sq, N)
+
+
+def fill_dp_matrix(data, S, J, K, N):
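+    # Build median-shifted prefix sums, initialise row 0 (a single cluster),
+    # then fill the remaining rows via fill_row_k.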
+    import numpy as np
+    sum_x = np.zeros(N, dtype=np.float64)
+    sum_x_sq = np.zeros(N, dtype=np.float64)
+
+    # Median of the sorted data, used to shift the values of x and improve numerical stability.
+    shift = data[N//2]
+
+    for i in range(N):
+        if i == 0:
+            sum_x[0] = data[0] - shift
+            sum_x_sq[0] = (data[0] - shift) ** 2
+        else:
+            sum_x[i] = sum_x[i-1] + data[i] - shift
+            sum_x_sq[i] = sum_x_sq[i-1] + (data[i] - shift) ** 2
+
+        S[0][i] = ssq(0, i, sum_x, sum_x_sq)
+        J[0][i] = 0
+
+    for k in range(1, K):
+        if k < K-1:
+            imin = max(1, k)
+        else:
+            imin = N-1
+
+        fill_row_k(imin, N-1, k, S, J, sum_x, sum_x_sq, N)
+
+
+def ckmeans(data, n_clusters):
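+    # Optimal univariate k-means (Ckmeans.1d.dp, Wang & Song 2011): sort the
+    # data in place and partition it into n_clusters contiguous groups that
+    # minimise the total within-cluster sum of squares, backtracking via J.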
+    import numpy as np
+    if n_clusters <= 0:
+        raise ValueError("Cannot classify into 0 or less clusters")
+    if n_clusters > len(data):
+        raise ValueError("Cannot generate more classes than there are data values")
+
+    # If there is only one unique value there is no sensible way to split it,
+    # so return all data as one cluster; len(ckmeans(data, 2)) may then be 1.
+    unique = len(set(data))
+    if unique == 1:
+        return [data]
+
+    data.sort()
+    n = len(data)
+
+    S = np.zeros((n_clusters, n), dtype=np.float64)
+
+    J = np.zeros((n_clusters, n), dtype=np.uint64)
+
+    fill_dp_matrix(data, S, J, n_clusters, n)
+
+    clusters = []
+    cluster_right = n-1
+
+    for cluster in range(n_clusters-1, -1, -1):
+        cluster_left = int(J[cluster][cluster_right])
+        clusters.append(data[cluster_left:cluster_right+1])
+
+        if cluster > 0:
+            cluster_right = cluster_left - 1
+
+    return list(reversed(clusters))
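+# Illustrative example (assumes a plain Python list as input): calling
+# ckmeans([1, 1, 1, 9, 9, 9, 5], 3) sorts the list in place and returns
+# [[1, 1, 1], [5], [9, 9, 9]].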
-- 
GitLab