From 0319d03de04c3c09a0b43134c7c084fefbdae491 Mon Sep 17 00:00:00 2001
From: dmt <>
Date: Tue, 1 Oct 2019 20:53:37 +0200
Subject: [PATCH] Define the DataSource, Preprocessor and LearnblockIdentifier.

---
 cml/domain/data_source.py | 59 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 cml/domain/data_source.py

diff --git a/cml/domain/data_source.py b/cml/domain/data_source.py
new file mode 100644
index 0000000..c438572
--- /dev/null
+++ b/cml/domain/data_source.py
@@ -0,0 +1,59 @@
+from os.path import commonprefix
+
+
+__all__ = (  # names exported by `from <module> import *`
+    "DataSource",
+    "Preprocessor",
+    "LearnblockIdentifier"
+)
+
+
+class DataSource:
+    """Hold a source together with its learnblock identifier."""
+    def __init__(self, source, learnblock_identifier):
+        self.source, self.learnblock_identifier = source, learnblock_identifier
+
+
+class Preprocessor:
+    """Clean tables in place; `clean` runs the steps enabled in settings."""
+    TARGET_COLUMN = "Z"
+    TIME_COLUMN = "T"
+
+    def __init__(self, settings):
+        self.settings = settings
+
+    def clean(self, table):
+        self._drop_irrelevant_columns(table)
+        if self.settings.set_targets:
+            self._overwrite_target_column(table)
+        if self.settings.sort_time_stamp:
+            self._sort_according_time_stamp(table)
+        if self.settings.cut_time_stamp:
+            self._remove_common_time_stamp_prefix(table)
+
+    def _drop_irrelevant_columns(self, table):
+        # TODO (dmt): Don't drop T, Z and Sigma columns!
+        # Drop highest index first so a removal cannot shift a pending index.
+        indices = map(table.get_column_index_by_name, table.get_columns())
+        unused = [i for i in indices if i not in self.settings.set_features]
+        for index in sorted(unused, reverse=True):
+            table.drop_column_by_index(index)
+
+    def _overwrite_target_column(self, table):
+        table.set_column_value(self.TARGET_COLUMN, self.settings.set_targets)
+
+    def _sort_according_time_stamp(self, table):
+        table.sort(self.TIME_COLUMN)
+
+    def _remove_common_time_stamp_prefix(self, table):
+        # TODO (dmt): Check if timestamp column is of type string!
+        time_column = table.get_column_values_as_list(self.TIME_COLUMN)
+        prefix_length = len(commonprefix(time_column))
+        # Slice the prefix off; str.lstrip() would strip a character set.
+        cleaned = [s[prefix_length:] for s in time_column]
+        table.set_column_values(cleaned)  # NOTE(review): column name missing?
+
+
+class LearnblockIdentifier:  # so far only stores the settings it is given
+    def __init__(self, settings):
+        self.settings = settings
-- 
GitLab