Skip to content
Snippets Groups Projects
Commit 9b58297d authored by dmt's avatar dmt
Browse files

Copy and Paste ckmeans algorithms.

parent b872dea6
No related branches found
No related tags found
No related merge requests found
...@@ -100,3 +100,144 @@ class LearnblockIdentifier: ...@@ -100,3 +100,144 @@ class LearnblockIdentifier:
already_seen.add(value_pair) already_seen.add(value_pair)
kw = {args[0]: value_pair[0], args[1]: value_pair[1]} kw = {args[0]: value_pair[0], args[1]: value_pair[1]}
yield block.get_values(**kw) yield block.get_values(**kw)
def _get_sigma_zeta_relatives(self, block, **kw):
relatives = block.get_values(**kw)
time_column = relatives.get_column_values("T")
density = self.density_estimator.train(time_column).density()
self._remove_time_dense_relatives(relatives, density)
clusters = self._cluster_sigma_zeta_realtives(relatives, density)
for time_values in clusters:
yield relatives.new_block_from(time_values)
def _remove_time_dense_relatives(self, block, density):
max_dens = max(density)
for index, dens in enumerate(density):
if dens > max_dens*(self.settings.sigma_zeta_cutoff/100):
block.drop_row(index)
def _cluster_sigma_zeta_realtives(self, cutted_block, density):
# TOOD (dmt): Don't rely on data series from pandas, 'cause ckmeans
# needs primitives data types.
time_column = list(cutted_block.get_column_values("T"))
_, maxs = self._relative_extrema(density)
return ckmeans(time_column, len(maxs))
# TODO (dmt): Refactor Ckmeans algorithm!
# Resource: https://journal.r-project.org/archive/2011-2/RJournal_2011-2_Wang+Song.pdf
def ssq(j, i, sum_x, sum_x_sq):
if j > 0:
muji = (sum_x[i] - sum_x[j-1]) / (i - j + 1)
sji = sum_x_sq[i] - sum_x_sq[j-1] - (i - j + 1) * muji ** 2
else:
sji = sum_x_sq[i] - sum_x[i] ** 2 / (i+1)
return 0 if sji < 0 else sji
def fill_row_k(imin, imax, k, S, J, sum_x, sum_x_sq, N):
if imin > imax: return
i = (imin+imax) // 2
S[k][i] = S[k-1][i-1]
J[k][i] = i
jlow = k
if imin > k:
jlow = int(max(jlow, J[k][imin-1]))
jlow = int(max(jlow, J[k-1][i]))
jhigh = i-1
if imax < N-1:
jhigh = int(min(jhigh, J[k][imax+1]))
for j in range(jhigh, jlow-1, -1):
sji = ssq(j, i, sum_x, sum_x_sq)
if sji + S[k-1][jlow-1] >= S[k][i]: break
# Examine the lower bound of the cluster border
# compute s(jlow, i)
sjlowi = ssq(jlow, i, sum_x, sum_x_sq)
SSQ_jlow = sjlowi + S[k-1][jlow-1]
if SSQ_jlow < S[k][i]:
S[k][i] = SSQ_jlow
J[k][i] = jlow
jlow += 1
SSQ_j = sji + S[k-1][j-1]
if SSQ_j < S[k][i]:
S[k][i] = SSQ_j
J[k][i] = j
fill_row_k(imin, i-1, k, S, J, sum_x, sum_x_sq, N)
fill_row_k(i+1, imax, k, S, J, sum_x, sum_x_sq, N)
def fill_dp_matrix(data, S, J, K, N):
import numpy as np
sum_x = np.zeros(N, dtype=np.float_)
sum_x_sq = np.zeros(N, dtype=np.float_)
# median. used to shift the values of x to improve numerical stability
shift = data[N//2]
for i in range(N):
if i == 0:
sum_x[0] = data[0] - shift
sum_x_sq[0] = (data[0] - shift) ** 2
else:
sum_x[i] = sum_x[i-1] + data[i] - shift
sum_x_sq[i] = sum_x_sq[i-1] + (data[i] - shift) ** 2
S[0][i] = ssq(0, i, sum_x, sum_x_sq)
J[0][i] = 0
for k in range(1, K):
if k < K-1:
imin = max(1, k)
else:
imin = N-1
fill_row_k(imin, N-1, k, S, J, sum_x, sum_x_sq, N)
def ckmeans(data, n_clusters):
import numpy as np
if n_clusters <= 0:
raise ValueError("Cannot classify into 0 or less clusters")
if n_clusters > len(data):
raise ValueError("Cannot generate more classes than there are data values")
# if there's only one value, return it; there's no sensible way to split
# it. This means that len(ckmeans([data], 2)) may not == 2. Is that OK?
unique = len(set(data))
if unique == 1:
return [data]
data.sort()
n = len(data)
S = np.zeros((n_clusters, n), dtype=np.float_)
J = np.zeros((n_clusters, n), dtype=np.uint64)
fill_dp_matrix(data, S, J, n_clusters, n)
clusters = []
cluster_right = n-1
for cluster in range(n_clusters-1, -1, -1):
cluster_left = int(J[cluster][cluster_right])
clusters.append(data[cluster_left:cluster_right+1])
if cluster > 0:
cluster_right = cluster_left - 1
return list(reversed(clusters))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment