From 77e00eb7837fd937fae7516549f2e6bc48020c85 Mon Sep 17 00:00:00 2001
From: dmt <>
Date: Tue, 12 Nov 2019 18:34:15 +0100
Subject: [PATCH] Get overlapping data samples through joins.

---
Review notes (ignored by git am):
 - get_overlapping no longer prints the join length to stdout.
 - The "on" argument is honoured; None keeps the previous "T" key.
 - The redundant column-drop loop was removed; the final column
   selection already yields exactly Z, Z_other and the key column.
 - Blob ids on the index line are those of the original revision.

 cml/ports/source_adapters.py | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/cml/ports/source_adapters.py b/cml/ports/source_adapters.py
index 369b90e..473c66c 100644
--- a/cml/ports/source_adapters.py
+++ b/cml/ports/source_adapters.py
@@ -63,6 +63,11 @@ class PandasBlock:
     def __len__(self):
         return self.__data_block.shape[0]
 
+    @property
+    def data_block(self):
+        """Return the data frame wrapped by this block."""
+        return self.__data_block
+
     def has_nan(self):
         return self.__data_block.isna().any()[0]
 
@@ -188,6 +193,39 @@ class PandasBlock:
 
         self.__data_block.drop(remove_columns, axis=1, inplace=True)
 
+    def get_overlapping(self, block, on=None):
+        """Return the samples this block and another share on a key.
+
+        Both blocks are inner-joined on the key column, so only rows
+        whose key value occurs in both blocks are kept.
+
+        Args:
+            block: The other PandasBlock to intersect with.
+            on: Name of the key column; "T" is used when None.
+
+        Returns:
+            A new PandasBlock holding the columns "Z", "Z_other" and
+            the key column, built with this block's purpose and origin.
+        """
+        key = "T" if on is None else on
+
+        # The block with fewer rows is the left side of the join (self
+        # on ties); lsuffix renames its "Z" column to "Z_other".
+        if len(block) < len(self):
+            left, right = block, self
+        else:
+            left, right = self, block
+
+        joined = left.data_block.join(
+            right.data_block.set_index(key),
+            on=key,
+            lsuffix="_other",
+            how="inner")
+
+        # TODO (dmt): Make purpose and origin a tuple.
+        return PandasBlock(
+            joined[["Z", "Z_other", key]], self.purpose, self.origin)
+
 
 class PandasAdapter:
     def __init__(self, data_frame):
@@ -199,7 +237,7 @@ class PandasAdapter:
         return block
 
     @classmethod
-    def read_csv_data(cls, path):
+    def read_csv(cls, path):
         data_frame = pd.read_csv(path, index_col=False)
         return PandasAdapter(data_frame)
 
-- 
GitLab