diff --git a/cml/domain/data_source.py b/cml/domain/data_source.py
index f853b43992a30e0ee96c436ad09209933f041dca..06baa1af3b6263124bb82d887c85c434a048c445 100644
--- a/cml/domain/data_source.py
+++ b/cml/domain/data_source.py
@@ -61,9 +61,14 @@ class DataSource:
     @property
     @log_learnblock_processing
     def learnblocks(self):
+        if self.settings.block_size > len(self):
+            raise ValueError("Block size cannot be larger than the size "
+                             "of the data source.")
+
         for block in self:
             learnblock = self.__learnblock_identifier.identify(block)
             if learnblock:
+                learnblock.origin = "source"
                 self._flip_source_halde_flags(learnblock.indexes)
                 yield learnblock
 
@@ -78,7 +83,11 @@ class DataSource:
         halde_runs = -1
 
         for i in cycle(range(0, len(self))):
-            if counter == self.block_size:
+            if halde_runs >= self.settings.stack_iterations:
+                # manually stop generator
+                return
+
+            if counter == self.settings.block_size:
                 old_index = i
                 counter = 0
                 yield self.__source.get_block_via_index(block_indexes)
@@ -108,15 +117,33 @@ class DataSource:
     def __len__(self):
         return self.__source.length
 
-    def get_block(self, indices=None):
-        return self.__source.get_block_via_index(indices)
+    def get_block(self, indices=None, columns=None):
+        return self.__source.get_block_via_index(indices, columns=columns)
 
     def time_sigma_relatives(self, block):
         return next(iter(self.__learnblock_identifier._identify_relatives(
             block, "T", "Sigma")))
 
     def estimate_density(self, data):
-        return self.__learnblock_identifier.density_estimator(data).density()
+        kernel_density_estimator = self.__learnblock_identifier.\
+            density_estimator.train(data)
+        return kernel_density_estimator.density()
+
+    def remove_time_dense_relatives(self, block, density):
+        self.__learnblock_identifier._remove_time_dense_relatives(
+            block, density)
+
+    def cluster(self, block, density):
+        return self.__learnblock_identifier._cluster_sigma_zeta_relatives(
+            block, density
+        )
+
+    def new_learnblock(self, values, columns, index, origin):
+        return self.__source.new_block(values, columns, index, origin)
+
+    def get_time_values(self, indices):
+        return self.__source.get_block_via_index(indices, columns="T")\
+            .as_numpy_array()
 
 
 class Preprocessor:
@@ -165,15 +192,20 @@ class Preprocessor:
 class LearnblockIdentifier:
     def __init__(self, settings, density_estimator, relative_extrema):
         self.settings = settings
-        self.column_pairs = (("T", "Z"), ("T", "Sigma"), ("Sigma", "Z"))
         self.density_estimator = density_estimator
         self._relative_extrema = relative_extrema
 
+    @classmethod
+    def _column_pairs(cls):
+        yield ("T", "Z")
+        yield ("T", "Sigma")
+        yield ("Sigma", "Z")
+
     def identify(self, block):
         biggest_learn_block = None
         biggest_block_size = 0
-        for pair in self.column_pairs:
+        for pair in self._column_pairs():
             for possible_learnblock in self._identify_relatives(block, *pair):
                 if self._is_learn_block(possible_learnblock.length):
                     if possible_learnblock.length > biggest_block_size:
@@ -209,7 +241,7 @@ class LearnblockIdentifier:
         time_column = relatives.get_column_values("T")
         density = self.density_estimator.train(time_column).density()
         self._remove_time_dense_relatives(relatives, density)
-        clusters = self._cluster_sigma_zeta_realtives(relatives, density)
+        clusters = self._cluster_sigma_zeta_relatives(relatives, density)
         for time_values in clusters:
             yield relatives.new_block_from(time_values)
 
@@ -219,7 +251,7 @@ class LearnblockIdentifier:
             if dens > max_dens*(self.settings.sigma_zeta_cutoff/100):
                 block.drop_row(index)
 
-    def _cluster_sigma_zeta_realtives(self, cutted_block, density):
+    def _cluster_sigma_zeta_relatives(self, cutted_block, density):
         # TOOD (dmt): Don't rely on data series from pandas, 'cause ckmeans
         # needs primitives data types.
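
Note: the block generator reworked above guards against a block size larger than the source and stops the cycle()-driven generator once the configured number of stack iterations has been reached. The following standalone sketch shows the same pattern in isolation; it is not project code, and the names yield_blocks, data, block_size, and max_passes are illustrative stand-ins for the diff's settings.block_size and settings.stack_iterations.

from itertools import cycle

def yield_blocks(data, block_size, max_passes):
    # Guard mirroring the new check in DataSource.learnblocks.
    if block_size > len(data):
        raise ValueError("Block size cannot be larger than the size "
                         "of the data source.")
    passes = 0
    block = []
    for i in cycle(range(len(data))):
        if passes >= max_passes:
            # manually stop the otherwise endless generator
            return
        block.append(data[i])
        if len(block) == block_size:
            yield block
            block = []
        if i == len(data) - 1:
            passes += 1

# Usage:
#   list(yield_blocks(list("abcdef"), block_size=2, max_passes=1))
#   -> [['a', 'b'], ['c', 'd'], ['e', 'f']]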