Dataset

`Dataset`

Bases: dict

A class representing a dataset.

This class extends the built-in dict class and provides additional functionality for working with datasets.

Attributes:

Name	Type	Description
`data_splitter`		An optional data splitter object used to split the data into train and test sets.
`target_column`		The name of the target column in the data.
`name`		The name of the dataset.
`_is_data_splitted`		A flag indicating whether the data has been split.
`data`		The input data for the dataset.
`_X`		The feature matrix X.
`_y`		The target variable array.
`splits`		A dictionary containing the splits of the dataset.

Methods:

Name	Description
`X`	Returns the feature matrix X.
`y`	Returns the target variable array.
`columns`	Returns the list of column names.
`shape`	Returns the shape of the feature matrix X.
`_split_data`	Splits the data into train and test sets.
`_run_checks`	Runs checks on the splits to ensure data integrity.
`load_split`	Loads a specific split of the dataset.
`load_train_test`	Loads the training and testing data splits from the dataset.
`create_from_pipeline`	Creates a dataset from a data loading function and optional data pipeline.
`create_from_splits`	Creates a dataset from splits.

Source code in model_forge/data/dataset.py

class Dataset(dict):
    """
    A dict-backed container exposing named splits of a tabular dataset.

    Every key of the underlying dict is a split name mapped to a boolean row
    mask with one entry per row of `data`. Indexing the dataset with a split
    name (``dataset["train"]``) returns the ``(X, y)`` pair for the rows the
    mask selects. The split ``"ALL"``, which selects every row, always exists.

    Attributes:
        data_splitter: Stored as given; not consulted by the code in this class.
        target_column: The name of the target column in the data.
        name: The name of the dataset.
        splits_columns: Optional names of indicator columns in `data`; each one
            defines a split made of the rows where the column equals 1.
        _is_data_splitted: A flag indicating whether the split masks were built.
        data: The full input DataFrame (features, target and indicator columns).
        splits: A dictionary mapping split names to boolean row masks.

    Methods:
        X: Returns the feature matrix of the "ALL" split.
        y: Returns the target values of the "ALL" split.
        columns: Returns the list of feature column names.
        shape: Returns the shape of the feature matrix X.
        _split_data: Builds the boolean row mask of every split.
        _run_checks: Runs checks on the splits to ensure data integrity.
        load_split: Loads a specific split of the dataset.
        create_from_pipeline: Creates a dataset from a data loading function and optional data pipeline.
        create_from_splits: Creates a dataset from already separated splits.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        data_splitter=None,
        target_column: str = "y",
        name: str = "dataset",
        splits_columns: Optional[list] = None,
    ) -> None:
        """
        Initialize a Dataset object.

        Args:
            data (pd.DataFrame): The input data for the dataset.
            data_splitter (optional): Stored on the instance; not used when building the splits.
            target_column (str): The name of the target column in the data.
            name (str): The name of the dataset.
            splits_columns (list, optional): Names of indicator columns in `data`.
                Each one becomes a split containing the rows where the column equals 1.

        Returns:
            None
        """

        self.data_splitter = data_splitter
        self.target_column = target_column
        self.splits_columns = splits_columns
        self.name = name
        self._is_data_splitted = False
        self.data = data

        self._split_data()
        # The dict storage receives a copy of the masks, so later changes to
        # `self.splits` must be mirrored explicitly to stay visible via `self[...]`.
        super().__init__(self.splits)

    @property
    def X(self) -> pd.DataFrame:
        """
        Returns the feature matrix X.

        Returns:
            pd.DataFrame: Every row of `data` without the target column.
        """
        return self["ALL"][0]

    @property
    def y(self) -> pd.Series:
        """
        Returns the target variable.

        Returns:
            pd.Series: The target column for every row of `data`.
        """
        return self["ALL"][1]

    @property
    def columns(self):
        """
        Returns a list of feature column names in the dataset.

        Returns:
            list: The column names of the feature matrix X.
        """
        # `self.splits` holds boolean masks, not (X, y) pairs, so the names are
        # read from the feature matrix itself.
        return self.X.columns.tolist()

    @property
    def shape(self):
        """
        Returns the shape of the dataset.

        Returns:
            tuple: The shape of the feature matrix X.
        """
        return self.X.shape

    def _split_data(self) -> None:
        """
        Build the boolean row mask of every split.

        The "ALL" split selects every row. For each name in `splits_columns`, a
        split of the same name selects the rows where that column equals 1.

        Returns:
            None
        """
        self.splits = {"ALL": [True] * len(self.data)}
        if self.splits_columns is not None:
            for column in self.splits_columns:
                self.splits[column] = (self.data[column] == 1).tolist()

        self._is_data_splitted = True
        self._run_checks()

    def __getitem__(self, key: Any) -> Any:
        """
        Retrieve the (X, y) pair of a split.

        Args:
            key (Any): The name of the split.

        Returns:
            tuple: `(X, y)` where X is `data` without the target column and y is
                the target column, both restricted to the rows of the split.

        Raises:
            KeyError: If no split is stored under `key`.
        """
        mask = super().__getitem__(key)
        # NOTE(review): only the target column is dropped, so any indicator
        # columns named in `splits_columns` remain part of X.
        return (
            self.data.drop(columns=self.target_column)[mask],
            self.data[self.target_column][mask],
        )

    def _run_checks(self) -> None:
        """
        Run checks on the splits of the dataset.

        Raises:
            AssertionError: If any of the splits is None, not a list, or empty.
        """
        for split_name, indexes in self.splits.items():
            assert indexes is not None, f"Split '{split_name}' is None"
            assert isinstance(indexes, list), f"Split '{split_name}' is not a list"
            assert len(indexes) != 0, f"Split '{split_name}' is empty"

    def __getattr__(self, attr_name: str) -> Any:
        """
        Resolve `X_<split>` / `y_<split>` shortcuts for existing splits.

        Python calls this hook only after normal attribute lookup has failed.

        Args:
            attr_name (str): The name of the attribute to retrieve.

        Returns:
            Any: The feature matrix (`X_<split>`) or target (`y_<split>`) of the split.

        Raises:
            AttributeError: If `attr_name` is not an `X_`/`y_` shortcut of a known split.
        """
        prefix, separator, split_name = attr_name.partition("_")
        if separator and prefix in ("X", "y") and split_name in self.keys():
            features, target = self[split_name]
            return features if prefix == "X" else target

        # `dict` defines no `__getattr__` to delegate to, so every other name is
        # reported as missing directly.
        raise AttributeError(f"Attribute '{attr_name}' not found")

    def __iter__(self):
        """
        Start an iteration over `(split_name, (X, y))` pairs.

        The iterator state lives on the instance and the dataset returns itself,
        so two simultaneous iterations over one dataset share their position.
        """
        self._iter_keys = iter(self.keys())
        return self

    def __next__(self):
        """Return the next `(split_name, (X, y))` pair of the current iteration."""
        key = next(self._iter_keys)
        features, target = self[key]
        return key, (features, target)

    def load_split(
        self,
        split: str,
        return_X_y: bool = False,
        sample_n_rows: Optional[int] = None,
        random_state: int = 36,
    ) -> Union[tuple[pd.DataFrame, pd.Series], pd.DataFrame]:
        """
        Load a specific split of the dataset.

        Args:
            split (str): The name of the split to load.
            return_X_y (bool, optional): Whether to return X and y separately. Defaults to False.
            sample_n_rows (int, optional): Number of rows to sample from the split. Defaults to None.
            random_state (int, optional): Random state for sampling rows. Defaults to 36.

        Returns:
            Union[tuple[pd.DataFrame, pd.Series], pd.DataFrame]: The loaded split of the dataset.
                If return_X_y is True, returns a tuple of X and y.
                If return_X_y is False, returns a DataFrame with X and y as columns.

        Raises:
            ValueError: If `split` is not a known split name.
        """

        if not self._is_data_splitted:
            self._split_data()
        if split not in self.splits:
            raise ValueError(
                f"Invalid Split: You requested split '{split}'. Valid splits are: {*list(self.splits.keys()),} "
            )
        X, y = self[split]
        if sample_n_rows is not None:
            X = X.sample(sample_n_rows, random_state=random_state)
            # Keep the target aligned with the sampled rows by index label.
            y = y.loc[X.index]

        if return_X_y:
            return X, y
        return X.assign(**{self.target_column: y})

    @classmethod
    def create_from_pipeline(
        cls,
        data_loading_function: Callable[[], pd.DataFrame],
        data_pipeline=None,
        data_splitter=None,
        target_column="y",
        name: str = "dataset",
        splits_columns=None,
    ):
        """
        Create a dataset from a data loading function and optional data pipeline.

        Args:
            cls: The class of the dataset.
            data_loading_function: A function that loads the data and returns a pandas DataFrame.
            data_pipeline: An optional object whose `apply(data)` result replaces the loaded data.
            data_splitter: Forwarded unchanged to the constructor.
            target_column: The name of the target column in the dataset.
            name: The name of the dataset.
            splits_columns: Forwarded unchanged to the constructor.

        Returns:
            An instance of the dataset class.

        """
        data = data_loading_function()
        if data_pipeline:
            data = data_pipeline.apply(data)
        return cls(
            data=data,
            data_splitter=data_splitter,
            target_column=target_column,
            name=name,
            splits_columns=splits_columns,
        )

    @classmethod
    def create_from_splits(
        cls,
        splits: dict[str, tuple[pd.DataFrame, np.array]],
        name: str = "dataset",
        target_column: str = "y",
    ):
        """
        Create a dataset from splits.

        The splits are stacked into one DataFrame (with a fresh 0..n-1 index) and
        each split is registered as the boolean mask of the rows it contributed.

        Args:
            cls (class): The class of the dataset.
            splits (dict[str, tuple[pd.DataFrame, np.array]]): A dictionary containing the splits of the dataset.
                Each split is represented as a pair of a pandas DataFrame (X) and its target values (y).
            name (str, optional): The name of the dataset. Defaults to "dataset".
            target_column (str, optional): The name of the target column. Defaults to "y".

        Returns:
            dataset (cls): The created dataset.

        Raises:
            AssertionError: If a split's X already contains `target_column`.
        """
        frames = []
        for split_name, (X, y) in splits.items():
            assert (
                target_column not in X.columns
            ), f"Split {split_name} already has a target column ({target_column}), please drop or rename"
            # Store the target under the requested name, not a hard-coded "y".
            frames.append(X.assign(**{target_column: y}))
        full_data = pd.concat(frames, ignore_index=True)

        dataset = cls(
            data=full_data, data_splitter=None, target_column=target_column, name=name
        )

        # Rows were concatenated in iteration order, so each split owns one
        # contiguous block of positions in `full_data`.
        total_rows = len(full_data)
        start = 0
        for split_name, frame in zip(splits, frames):
            stop = start + len(frame)
            mask = (
                [False] * start + [True] * (stop - start) + [False] * (total_rows - stop)
            )
            dataset.splits[split_name] = mask
            # Mirror the mask into the dict storage that `__getitem__` reads.
            dataset[split_name] = mask
            start = stop

        dataset._is_data_splitted = True
        dataset._run_checks()
        return dataset

`X: pd.DataFrame` `property`

Returns the feature matrix X.

Returns:

Type	Description
`DataFrame`	The feature matrix X.

`columns` `property`

Returns a list of column names in the dataset.

Returns:

Name	Type	Description
`list`		A list of column names.

`shape` `property`

Returns the shape of the dataset.

Returns:

Name	Type	Description
`tuple`		A tuple representing the shape of the dataset.

`y: np.array` `property`

Returns the target variable array.

Returns:

Type	Description
`array`	The target variable array.

`__getattr__(attr_name)`

Retrieves the attribute specified by `attr_name`.

Parameters:

Name	Type	Description	Default
`attr_name`	`str`	The name of the attribute to retrieve.	required

Returns:

Name	Type	Description
`Any`	`Any`	The value of the attribute.

Raises:

Type	Description
`AttributeError`	If the attribute specified by `attr_name` is not found.

Source code in model_forge/data/dataset.py

def __getattr__(self, attr_name: str) -> Any:
    """
    Resolve `X_<split>` / `y_<split>` shortcuts for existing splits.

    Python calls this hook only after normal attribute lookup has failed.

    Args:
        attr_name (str): The name of the attribute to retrieve.

    Returns:
        Any: The first element of `self[<split>]` for `X_<split>`, the second
            element for `y_<split>`.

    Raises:
        AttributeError: If `attr_name` is not an `X_`/`y_` shortcut of a known split.
    """
    prefix, separator, split_name = attr_name.partition("_")
    if separator and prefix in ("X", "y") and split_name in self.keys():
        features, target = self[split_name]
        return features if prefix == "X" else target

    # `dict` defines no `__getattr__` to delegate to, so every other name is
    # reported as missing directly.
    raise AttributeError(f"Attribute '{attr_name}' not found")

`__getitem__(key)`

Retrieve an item from the dataset.

Parameters:

Name	Type	Description	Default
`key`	`Any`	The key used to retrieve the item.	required

Returns:

Name	Type	Description
`Any`	`Any`	The item corresponding to the given key.

Source code in model_forge/data/dataset.py

def __getitem__(self, key: Any) -> Any:
    """
    Look up a split and return its features and target.

    Args:
        key (Any): The key under which the split's row selector is stored.

    Returns:
        Any: A 2-tuple: `self.data` without the target column, and the target
            column itself, each indexed by the selector stored under `key`.
    """
    row_selector = super().__getitem__(key)
    target_name = self.target_column
    features = self.data.drop(columns=target_name)
    target = self.data[target_name]
    return features[row_selector], target[row_selector]

`__init__(data, data_splitter=None, target_column='y', name='dataset', splits_columns=None)`

Initialize a Dataset object.

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	The input data for the dataset.	required
`data_splitter`	`optional`	An optional data splitter object used to split the data into train and test sets.	`None`
`target_column`	`str`	The name of the target column in the data.	`'y'`
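`name`	`str`	The name of the dataset.	`'dataset'`
`splits_columns`	`list`	Optional names of indicator columns in the data; each one defines a split made of the rows where the column equals 1.	`None`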

Returns:

Type	Description
`None`	None

Source code in model_forge/data/dataset.py

def __init__(
    self,
    data: pd.DataFrame,
    data_splitter=None,
    target_column: str = "y",
    name: str = "dataset",
    splits_columns: list = None,
) -> None:
    """
    Set up a Dataset from a DataFrame.

    Args:
        data (pd.DataFrame): The input data for the dataset.
        data_splitter (optional): Kept on the instance as `data_splitter`.
        target_column (str): The name of the target column in the data.
        name (str): The name of the dataset.
        splits_columns (list, optional): Kept on the instance as `splits_columns`.

    Returns:
        None
    """
    # Plain bookkeeping: none of these assignments depends on another.
    self.name = name
    self.data = data
    self.target_column = target_column
    self.data_splitter = data_splitter
    self.splits_columns = splits_columns
    self._is_data_splitted = False

    # The splits must exist before they can seed the underlying dict.
    self._split_data()
    super().__init__(self.splits)

`create_from_pipeline(data_loading_function, data_pipeline=None, data_splitter=None, target_column='y', name='dataset', splits_columns=None)` `classmethod`

Create a dataset from a data loading function and optional data pipeline.

Parameters:

Name	Type	Description	Default
`cls`		The class of the dataset.	required
`data_loading_function`	`Callable[[], DataFrame]`	A function that loads the data and returns a pandas DataFrame.	required
`data_pipeline`		An optional data pipeline to apply to the loaded data.	`None`
`data_splitter`		An optional data splitter to split the data into train and test sets.	`None`
`target_column`		The name of the target column in the dataset.	`'y'`
`name`	`str`	The name of the dataset.	`'dataset'`
`splits_columns`		Optional split indicator column names, forwarded to the constructor.	`None`

Returns:

Type	Description
	An instance of the dataset class.

Source code in model_forge/data/dataset.py

@classmethod
def create_from_pipeline(
    cls,
    data_loading_function: Callable[[], pd.DataFrame],
    data_pipeline=None,
    data_splitter=None,
    target_column="y",
    name: str = "dataset",
    splits_columns=None,
):
    """
    Build a dataset by loading data and optionally running it through a pipeline.

    Args:
        cls: The class of the dataset.
        data_loading_function: A zero-argument callable that returns the data.
        data_pipeline: An optional object; when truthy, its `apply(data)` result
            replaces the loaded data.
        data_splitter: Forwarded unchanged to the constructor.
        target_column: The name of the target column in the dataset.
        name: The name of the dataset.
        splits_columns: Forwarded unchanged to the constructor.

    Returns:
        An instance of the dataset class.

    """
    loaded = data_loading_function()
    prepared = data_pipeline.apply(loaded) if data_pipeline else loaded

    constructor_kwargs = {
        "data": prepared,
        "data_splitter": data_splitter,
        "target_column": target_column,
        "name": name,
        "splits_columns": splits_columns,
    }
    return cls(**constructor_kwargs)

`create_from_splits(splits, name='dataset', target_column='y')` `classmethod`

Create a dataset from splits.

Parameters:

Name	Type	Description	Default
`cls`	`class`	The class of the dataset.	required
`splits`	`dict[str, tuple[DataFrame, array]]`	A dictionary containing the splits of the dataset. Each split is represented as a tuple of a pandas DataFrame (X) and a numpy array (y).	required
`name`	`str`	The name of the dataset. Defaults to "dataset".	`'dataset'`
`target_column`	`str`	The name of the target column. Defaults to "y".	`'y'`

Returns:

Name	Type	Description
`dataset`	`cls`	The created dataset.

Source code in model_forge/data/dataset.py

@classmethod
def create_from_splits(
    cls,
    splits: dict[str, tuple[pd.DataFrame, np.array]],
    name: str = "dataset",
    target_column: str = "y",
):
    """
    Create a dataset from splits.

    The splits are stacked into one DataFrame (with a fresh 0..n-1 index) and
    each split is registered as the boolean mask of the rows it contributed.

    Args:
        cls (class): The class of the dataset.
        splits (dict[str, tuple[pd.DataFrame, np.array]]): A dictionary containing the splits of the dataset.
            Each split is represented as a pair of a pandas DataFrame (X) and its target values (y).
        name (str, optional): The name of the dataset. Defaults to "dataset".
        target_column (str, optional): The name of the target column. Defaults to "y".

    Returns:
        dataset (cls): The created dataset.

    Raises:
        AssertionError: If a split's X already contains `target_column`.
    """
    frames = []
    for split_name, (X, y) in splits.items():
        assert (
            target_column not in X.columns
        ), f"Split {split_name} already has a target column ({target_column}), please drop or rename"
        # Store the target under the requested name, not a hard-coded "y".
        frames.append(X.assign(**{target_column: y}))
    full_data = pd.concat(frames, ignore_index=True)

    dataset = cls(
        data=full_data, data_splitter=None, target_column=target_column, name=name
    )

    # Rows were concatenated in iteration order, so each split owns one
    # contiguous block of positions in `full_data`.
    total_rows = len(full_data)
    start = 0
    for split_name, frame in zip(splits, frames):
        stop = start + len(frame)
        mask = (
            [False] * start + [True] * (stop - start) + [False] * (total_rows - stop)
        )
        dataset.splits[split_name] = mask
        # Mirror the mask into the dict storage that item lookup reads.
        dataset[split_name] = mask
        start = stop

    dataset._is_data_splitted = True
    dataset._run_checks()
    return dataset

`load_split(split, return_X_y=False, sample_n_rows=None, random_state=36)`

Load a specific split of the dataset.

Parameters:

Name	Type	Description	Default
`split`	`str`	The name of the split to load.	required
`return_X_y`	`bool`	Whether to return X and y separately. Defaults to False.	`False`
`sample_n_rows`	`int`	Number of rows to sample from the split. Defaults to None.	`None`
`random_state`	`int`	Random state for sampling rows. Defaults to 36.	`36`

Returns:

Type	Description
`Union[tuple[DataFrame, array], DataFrame]`	The loaded split of the dataset. If return_X_y is True, returns a tuple of X and y. If return_X_y is False, returns a DataFrame with X and y as columns.

Source code in model_forge/data/dataset.py

def load_split(
    self,
    split: str,
    return_X_y: bool = False,
    sample_n_rows: Optional[int] = None,
    random_state: int = 36,
) -> Union[tuple[pd.DataFrame, np.array], pd.DataFrame]:
    """
    Fetch one named split, optionally down-sampled.

    Args:
        split (str): The name of the split to load.
        return_X_y (bool, optional): Return X and y separately instead of one frame. Defaults to False.
        sample_n_rows (int, optional): When given, the number of rows sampled from the split. Defaults to None.
        random_state (int, optional): Seed passed to the row sampling. Defaults to 36.

    Returns:
        Union[tuple[pd.DataFrame, np.array], pd.DataFrame]: The loaded split of the dataset.
            If return_X_y is True, returns a tuple of X and y.
            If return_X_y is False, returns X with y attached under the target column name.

    Raises:
        ValueError: If `split` is not a key of `self.splits`.
    """
    # Build the splits lazily if that has not happened yet.
    if not self._is_data_splitted:
        self._split_data()

    known_splits = tuple(self.splits.keys())
    if split not in known_splits:
        raise ValueError(
            f"Invalid Split: You requested split '{split}'. Valid splits are: {known_splits} "
        )

    features, target = self[split]
    if sample_n_rows is not None:
        features = features.sample(sample_n_rows, random_state=random_state)
        # Re-select the target by the sampled rows' index to keep both aligned.
        target = target[features.index]

    if not return_X_y:
        return features.assign(**{self.target_column: target})
    return features, target

Dataset

Dataset

X: pd.DataFrame property

columns property

shape property

y: np.array property

__getattr__(attr_name)

__getitem__(key)

__init__(data, data_splitter=None, target_column='y', name='dataset', splits_columns=None)

create_from_pipeline(data_loading_function, data_pipeline=None, data_splitter=None, target_column='y', name='dataset', splits_columns=None) classmethod

create_from_splits(splits, name='dataset', target_column='y') classmethod

load_split(split, return_X_y=False, sample_n_rows=None, random_state=36)

`Dataset`

`X: pd.DataFrame` `property`

`columns` `property`

`shape` `property`

`y: np.array` `property`

`__getattr__(attr_name)`

`__getitem__(key)`

`__init__(data, data_splitter=None, target_column='y', name='dataset', splits_columns=None)`

`create_from_pipeline(data_loading_function, data_pipeline=None, data_splitter=None, target_column='y', name='dataset', splits_columns=None)` `classmethod`

`create_from_splits(splits, name='dataset', target_column='y')` `classmethod`

`load_split(split, return_X_y=False, sample_n_rows=None, random_state=36)`