diff --git a/cml/ports/source_adapters.py b/cml/ports/source_adapters.py
index 369b90e053f1204257344204adf0405b4086d8a8..473c66c684d85b21df14ea43e4b0ce7cf9b6f184 100644
--- a/cml/ports/source_adapters.py
+++ b/cml/ports/source_adapters.py
@@ -63,6 +63,11 @@ class PandasBlock:
     def __len__(self):
         return self.__data_block.shape[0]
 
+    @property
+    def data_block(self):
+        """Return the underlying data block itself, not a copy."""
+        return self.__data_block
+
     def has_nan(self):
         return self.__data_block.isna().any()[0]
 
@@ -188,6 +193,37 @@ class PandasBlock:
 
         self.__data_block.drop(remove_columns, axis=1, inplace=True)
 
+    def get_overlapping(self, block, on=None):
+        """Return a block with the rows whose key occurs in both blocks.
+
+        The shorter block is the left side of the inner join (self on
+        a tie), so its "Z" column is the one renamed to "Z_other".
+
+        Args:
+            block: The other PandasBlock to intersect with.
+            on: Name of the key column to join on; defaults to "T".
+
+        Returns:
+            A new PandasBlock holding the columns "Z", "Z_other" and
+            the key column, created with this block's purpose and origin.
+        """
+        key = "T" if on is None else on
+        if len(block) < len(self):
+            left, right = block, self
+        else:
+            left, right = self, block
+
+        joined = left.data_block.join(
+            right.data_block.set_index(key),
+            on=key,
+            lsuffix="_other",
+            how="inner")
+
+        # Selecting the wanted columns discards every other one.
+        joined = joined[["Z", "Z_other", key]]
+        # TODO (dmt): Make purpose and origin a tuple.
+        return PandasBlock(joined, self.purpose, self.origin)
+
 
 class PandasAdapter:
     def __init__(self, data_frame):
@@ -199,7 +235,7 @@ class PandasAdapter:
         return block
 
     @classmethod
-    def read_csv_data(cls, path):
+    def read_csv(cls, path):
         data_frame = pd.read_csv(path, index_col=False)
         return PandasAdapter(data_frame)
 