# Recovered from a whitespace-collapsed git patch against
# cml/domain/data_source.py
# (commit f342a4f6b65116f69e3be99eea561a95e4b18c4a).
class LearnblockIdentifier:
    """Identify blocks that satisfy the criteria for being a learning block.

    A candidate is obtained by selecting, for one pair of columns, all rows
    of a block that share the same pair of values.  A candidate is reported
    when its ``length`` is strictly greater than
    ``settings.learn_block_minimum``.
    """

    def __init__(self, settings):
        """Store the settings and the column pairs that are examined.

        Args:
            settings: Object exposing a ``learn_block_minimum`` attribute
                that is compared against candidate lengths.
        """
        self.settings = settings
        # Each pair is passed to ``_identify_relatives`` as two column names.
        self.column_pairs = (("T", "Z"), ("T", "Sigma"), ("Sigma", "Z"))

    def identify(self, block):
        """Lazily yield every candidate of ``block`` that is long enough.

        Args:
            block: Object providing ``get_duplicated_pairs(col_a, col_b)``
                and ``get_values(**column_values)``; the objects returned by
                ``get_values`` must expose a ``length`` attribute.

        Yields:
            The objects returned by ``block.get_values`` whose ``length``
            passes ``_is_learn_block``, in ``column_pairs`` order.
        """
        for pair in self.column_pairs:
            for candidate in self._identify_relatives(block, *pair):
                if self._is_learn_block(candidate.length):
                    yield candidate

    def _is_learn_block(self, block_length):
        """Return whether ``block_length`` exceeds the configured minimum."""
        # NOTE(review): the comparison is strict, so a candidate whose length
        # equals ``learn_block_minimum`` is rejected -- confirm this is the
        # intended meaning of "minimum".
        return block_length > self.settings.learn_block_minimum

    def _identify_relatives(self, block, *args):
        """Yield one candidate per distinct duplicated value pair.

        Args:
            block: Object providing ``get_duplicated_pairs`` and
                ``get_values`` (see ``identify``).
            *args: Column names; the first two are used, any further ones
                are ignored.

        Yields:
            ``block.get_values(**{first: value_a, second: value_b})`` for
            each distinct value pair reported by
            ``block.get_duplicated_pairs(first, second)``.
        """
        # TODO (dmt): Implement density and kmeans!
        first_column, second_column = args[0], args[1]
        already_seen = set()
        pairs = block.get_duplicated_pairs(first_column, second_column)
        for value_pair in pairs:
            # The same value pair may be reported more than once; query the
            # block only the first time it shows up.
            if value_pair in already_seen:
                continue
            already_seen.add(value_pair)
            column_values = {
                first_column: value_pair[0],
                second_column: value_pair[1],
            }
            yield block.get_values(**column_values)