diff --git a/cml/ports/source_adapters.py b/cml/ports/source_adapters.py index c9493377f153d8f1130bf416447a991c3483c2dc..c25dce8f0fabd62b0e90d1b68e7add22b54fa911 100644 --- a/cml/ports/source_adapters.py +++ b/cml/ports/source_adapters.py @@ -41,11 +41,12 @@ class Adapter(ABC): class PandasBlock: _LAST_THREE_COLUMNS = 3 - def __init__(self, data_block, relatives=None): + def __init__(self, data_block, relatives=None, purpose=None, origin=None): self.__data_block = data_block self.relatives = relatives + self.purpose = purpose self.n_cluster = None - self.purpose = None + self.origin = origin def __str__(self): return str(self.__data_block) @@ -56,11 +57,15 @@ class PandasBlock: self.__data_block.columns[:self.rows-self._LAST_THREE_COLUMNS]]) def __getitem__(self, item): - return self.__data_block.iloc[item][:self.rows-self._LAST_THREE_COLUMNS] + return self.__data_block.iloc[ + item][:len(self.columns)-self._LAST_THREE_COLUMNS] def __len__(self): return self.__data_block.shape[0] + def has_nan(self): + return self.__data_block.isna().any()[0] + @property def labeled(self): return not self.__data_block.Z.nunique() == 1 @@ -76,23 +81,36 @@ class PandasBlock: def as_numpy_array(self): return self.__data_block[ self.__data_block.columns[ - :self.rows - self._LAST_THREE_COLUMNS]].values + :len(self.columns) - self._LAST_THREE_COLUMNS]].values def set_labels(self, labels): data_frame = self.__data_block.copy() data_frame["Z"] = labels - return PandasBlock(data_frame, self.relatives) - - def overlapping_rows(self, block): - overlapping = self.__data_block[self.__data_block.isin(block)].dropna( - axis=0, how="all" - ) - return self.new_block_from_rows_index(overlapping.indexes) + return PandasBlock(data_frame, + self.relatives, + self.purpose, + self.origin) + + def overlapping_rows(self, block, subset=None): + big_df = pd.concat([self.__data_block, block.__data_block], sort=False) + overlapping_data_frame = big_df[big_df.duplicated( + subset=subset, keep=False)].drop_duplicates(keep="first") + return PandasBlock(overlapping_data_frame, + purpose=self.purpose, + origin=self.origin) def fusion(self, block): - return self.new_block_from_rows_index( - self.indexes+block.indexes - ) + features = list(set(self.columns).intersection(block.columns)) + features.remove("Z") + features.remove("Sigma") + features.remove("T") + features.append("T") + features.append("Sigma") + features.append("Z") + + df = pd.concat([self.__data_block, block.__data_block], sort=False) + df = df[features] + return PandasBlock(df, purpose=self.purpose, origin=self.origin) @property def min_timestamp(self): @@ -104,21 +122,22 @@ class PandasBlock: @property def learn_rows(self): + # TODO (dmt): Fix this Bug, you get rows and not columns! return self.__data_block.shape[1] - 3 @property def rows(self): - return self.__data_block.shape[1] + return self.__data_block.shape[0] def new_block_from_rows_index(self, indices: List[int]): data_from = self.__data_block.loc[indices] - return PandasBlock(data_from) + return PandasBlock(data_from, purpose=self.purpose, origin=self.origin) def new_block_from(self, column_values): data_from = self.__data_block.loc[self.__data_block["T"].isin( column_values)] - return PandasBlock(data_from) + return PandasBlock(data_from, purpose=self.purpose, origin=self.origin) def get_duplicated_pairs(self, *args): bool_series = self.__data_block.duplicated(subset=[args[0], args[1]]) @@ -128,7 +147,7 @@ class PandasBlock: @property def indexes(self): - return tuple(self.__data_block.index) + return list(self.__data_block.index) def get_values(self, **kwargs): t, z, sigma = kwargs.get("T"), kwargs.get("Z"), kwargs.get("Sigma") @@ -151,7 +170,7 @@ class PandasBlock: # TODO (dmt): Write proper error handling. raise Exception() - return PandasBlock(data_frame) + return PandasBlock(data_frame, purpose=self.purpose, origin=self.origin) @property def length(self): @@ -161,7 +180,7 @@ class PandasBlock: self.__data_block.drop(index, inplace=True) def get_column_values(self, column_name): - return self.__data_block[column_name] + return list(self.__data_block[column_name]) def get_column_name_by_index(self, index): column_names = self.__data_block.column @@ -184,6 +203,11 @@ class PandasAdapter: def __init__(self, data_frame): self.__data_frame = data_frame + def new_block(self, values, columns, index, origin): + data_frame = pd.DataFrame(data=values, columns=columns, index=index) + block = PandasBlock(data_frame, origin=origin) + return block + @classmethod def read_csv_data(cls, path): data_frame = pd.read_csv(path, index_col=False) @@ -193,11 +217,13 @@ class PandasAdapter: def length(self): return self.__data_frame.shape[0] - def get_block_via_index(self, indexes): - return PandasBlock(self.__data_frame.iloc[indexes]) + def get_block_via_index(self, indexes, columns=None): + if columns: + return PandasBlock(self.__data_frame.iloc[list(indexes)][columns]) + return PandasBlock(self.__data_frame.iloc[list(indexes)]) - def get_block(self, start, end=None, step=None): - return PandasBlock(self.__data_frame[start:end:step]) + def get_block(self, start, end=None, step=None, columns: List[str] = None): + return PandasBlock(self.__data_frame[start:end:step][columns]) def get_column_values(self, column_name): return self.__data_frame[column_name]