Skip to content
Snippets Groups Projects
Commit 9bb65612 authored by dmt's avatar dmt
Browse files

Refactor data source.

parent 5f9d463b
No related branches found
No related tags found
No related merge requests found
...@@ -61,9 +61,14 @@ class DataSource: ...@@ -61,9 +61,14 @@ class DataSource:
@property @property
@log_learnblock_processing @log_learnblock_processing
def learnblocks(self): def learnblocks(self):
if self.settings.block_size > len(self):
raise ValueError("Block size cannot be larger then the size"
"the data source.")
for block in self: for block in self:
learnblock = self.__learnblock_identifier.identify(block) learnblock = self.__learnblock_identifier.identify(block)
if learnblock: if learnblock:
learnblock.origin = "source"
self._flip_source_halde_flags(learnblock.indexes) self._flip_source_halde_flags(learnblock.indexes)
yield learnblock yield learnblock
...@@ -78,7 +83,11 @@ class DataSource: ...@@ -78,7 +83,11 @@ class DataSource:
halde_runs = -1 halde_runs = -1
for i in cycle(range(0, len(self))): for i in cycle(range(0, len(self))):
if counter == self.block_size: if halde_runs >= self.settings.stack_iterations:
# manually stop generator
return
if counter == self.settings.block_size:
old_index = i old_index = i
counter = 0 counter = 0
yield self.__source.get_block_via_index(block_indexes) yield self.__source.get_block_via_index(block_indexes)
...@@ -108,15 +117,33 @@ class DataSource: ...@@ -108,15 +117,33 @@ class DataSource:
def __len__(self): def __len__(self):
return self.__source.length return self.__source.length
def get_block(self, indices=None): def get_block(self, indices=None, columns=None):
return self.__source.get_block_via_index(indices) return self.__source.get_block_via_index(indices, columns=columns)
def time_sigma_relatives(self, block): def time_sigma_relatives(self, block):
return next(iter(self.__learnblock_identifier._identify_relatives( return next(iter(self.__learnblock_identifier._identify_relatives(
block, "T", "Sigma"))) block, "T", "Sigma")))
def estimate_density(self, data): def estimate_density(self, data):
return self.__learnblock_identifier.density_estimator(data).density() kernel_density_estimator = self.__learnblock_identifier.\
density_estimator.train(data)
return kernel_density_estimator.density()
def remove_time_dense_relatives(self, block, density):
self.__learnblock_identifier._remove_time_dense_relatives(
block, density)
def cluster(self, block, density):
return self.__learnblock_identifier._cluster_sigma_zeta_relatives(
block, density
)
def new_learnblock(self, values, columns, index, origin):
return self.__source.new_block(values, columns, index, origin)
def get_time_values(self, indices):
return self.__source.get_block_via_index(indices, columns="T")\
.as_numpy_array()
class Preprocessor: class Preprocessor:
...@@ -165,15 +192,20 @@ class Preprocessor: ...@@ -165,15 +192,20 @@ class Preprocessor:
class LearnblockIdentifier: class LearnblockIdentifier:
def __init__(self, settings, density_estimator, relative_extrema): def __init__(self, settings, density_estimator, relative_extrema):
self.settings = settings self.settings = settings
self.column_pairs = (("T", "Z"), ("T", "Sigma"), ("Sigma", "Z"))
self.density_estimator = density_estimator self.density_estimator = density_estimator
self._relative_extrema = relative_extrema self._relative_extrema = relative_extrema
@classmethod
def _column_pairs(cls):
yield ("T", "Z")
yield ("T", "Sigma")
yield ("Sigma", "Z")
def identify(self, block): def identify(self, block):
biggest_learn_block = None biggest_learn_block = None
biggest_block_size = 0 biggest_block_size = 0
for pair in self.column_pairs: for pair in self._column_pairs():
for possible_learnblock in self._identify_relatives(block, *pair): for possible_learnblock in self._identify_relatives(block, *pair):
if self._is_learn_block(possible_learnblock.length): if self._is_learn_block(possible_learnblock.length):
if possible_learnblock.length > biggest_block_size: if possible_learnblock.length > biggest_block_size:
...@@ -209,7 +241,7 @@ class LearnblockIdentifier: ...@@ -209,7 +241,7 @@ class LearnblockIdentifier:
time_column = relatives.get_column_values("T") time_column = relatives.get_column_values("T")
density = self.density_estimator.train(time_column).density() density = self.density_estimator.train(time_column).density()
self._remove_time_dense_relatives(relatives, density) self._remove_time_dense_relatives(relatives, density)
clusters = self._cluster_sigma_zeta_realtives(relatives, density) clusters = self._cluster_sigma_zeta_relatives(relatives, density)
for time_values in clusters: for time_values in clusters:
yield relatives.new_block_from(time_values) yield relatives.new_block_from(time_values)
...@@ -219,7 +251,7 @@ class LearnblockIdentifier: ...@@ -219,7 +251,7 @@ class LearnblockIdentifier:
if dens > max_dens*(self.settings.sigma_zeta_cutoff/100): if dens > max_dens*(self.settings.sigma_zeta_cutoff/100):
block.drop_row(index) block.drop_row(index)
def _cluster_sigma_zeta_realtives(self, cutted_block, density): def _cluster_sigma_zeta_relatives(self, cutted_block, density):
# TODO (dmt): Don't rely on data series from pandas, 'cause ckmeans # TODO (dmt): Don't rely on data series from pandas, 'cause ckmeans
# needs primitive data types. # needs primitive data types.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment