class LearnblockIdentifier:
    """Search a block for candidate learnblocks, one column pair at a time.

    For every entry of ``column_pairs`` the block is asked for its duplicated
    value pairs; each distinct pair is turned into a sub-selection of the
    block, and only selections longer than the configured minimum are kept.

    NOTE(review): ``block`` is assumed to be a project-defined table-like
    object offering ``get_duplicated_pairs`` and ``get_values`` -- confirm
    against the caller.
    """

    def __init__(self, settings):
        """Store the settings and the fixed column pairs to examine.

        Args:
            settings: Object exposing a ``learn_block_minimum`` attribute.
        """
        self.settings = settings
        self.column_pairs = (("T", "Z"), ("T", "Sigma"), ("Sigma", "Z"))

    def identify(self, block):
        """Lazily yield every candidate that qualifies as a learnblock.

        Args:
            block: Object passed through to ``_identify_relatives``; the
                candidates it produces must expose a ``length`` attribute.

        Yields:
            Candidates whose ``length`` passes ``_is_learn_block``, in
            ``column_pairs`` order.
        """
        for columns in self.column_pairs:
            candidates = self._identify_relatives(block, *columns)
            for candidate in candidates:
                if not self._is_learn_block(candidate.length):
                    continue
                yield candidate

    def _is_learn_block(self, block_length):
        """Return whether ``block_length`` strictly exceeds the minimum."""
        minimum = self.settings.learn_block_minimum
        return block_length > minimum

    def _identify_relatives(self, block, *args):
        """Yield one selection of ``block`` per distinct duplicated pair.

        Args:
            block: Object providing ``get_duplicated_pairs(a, b)`` and
                ``get_values(**filters)``.
            *args: The first two entries are used, both as arguments to
                ``get_duplicated_pairs`` and as keyword names for
                ``get_values``.

        Yields:
            The result of ``block.get_values`` for each value pair, skipping
            pairs that were already emitted.
        """
        # TODO (dmt): Implement density and kmeans!
        first_column, second_column = args[0], args[1]
        emitted = set()
        for values in block.get_duplicated_pairs(first_column, second_column):
            # The same pair can be reported several times; emit it only once.
            if values in emitted:
                continue
            emitted.add(values)
            filters = {first_column: values[0], second_column: values[1]}
            yield block.get_values(**filters)