# Source code for deepr.prepros.base

"""Abstract Base Class for preprocessing"""

from abc import ABC, abstractmethod
import logging
import functools
from typing import Callable, Type, Optional
import inspect

import tensorflow as tf

from deepr.utils.tables import TableContext


LOGGER = logging.getLogger(__name__)


class Prepro(ABC):
    """Abstract base for composable dataset preprocessing steps.

    A `Prepro` is one building block of a preprocessing pipeline: it
    defines a function that takes a tf.data.Dataset and returns a
    tf.data.Dataset.

    The usual way to use a :class:`~Prepro` is to call it on a Dataset.
    For example:

    >>> from deepr import readers
    >>> from deepr.prepros import Map
    >>> def gen():
    ...     for i in range(3):
    ...         yield {"a": i}
    >>> raw_dataset = tf.data.Dataset.from_generator(gen, {"a": tf.int32}, {"a": tf.TensorShape([])})
    >>> list(readers.from_dataset(raw_dataset))
    [{'a': 0}, {'a': 1}, {'a': 2}]
    >>> prepro_fn = Map(lambda x: {'a': x['a'] + 1})
    >>> dataset = prepro_fn(raw_dataset)
    >>> list(readers.from_dataset(dataset))
    [{'a': 1}, {'a': 2}, {'a': 3}]

    Some pipelines need to behave differently depending on the mode
    (TRAIN, EVAL, PREDICT), so an optional `mode` argument can be given
    when calling the preprocessor:

    >>> def map_func(element, mode=None):
    ...     if mode == tf.estimator.ModeKeys.PREDICT:
    ...         return {'a': 0}
    ...     else:
    ...         return element
    >>> prepro_fn = Map(map_func)
    >>> list(readers.from_dataset(raw_dataset))
    [{'a': 0}, {'a': 1}, {'a': 2}]
    >>> dataset = prepro_fn(raw_dataset, mode=tf.estimator.ModeKeys.TRAIN)
    >>> list(readers.from_dataset(dataset))
    [{'a': 0}, {'a': 1}, {'a': 2}]
    >>> dataset = prepro_fn(raw_dataset, mode=tf.estimator.ModeKeys.PREDICT)
    >>> list(readers.from_dataset(dataset))
    [{'a': 0}, {'a': 1}, {'a': 2}]

    TODO: Actually mode in map_func is not taken into account

    :class:`~Map`, :class:`~Filter`, :class:`~Shuffle` and :class:`~Repeat`
    expose a special attribute `modes` that restricts the modes in which
    the preprocessing is applied. For example:

    >>> def map_func(element, mode=None):
    ...     return {'a': 0}
    >>> prepro_fn = Map(map_func, modes=[tf.estimator.ModeKeys.PREDICT])
    >>> dataset = prepro_fn(raw_dataset, tf.estimator.ModeKeys.TRAIN)
    >>> list(readers.from_dataset(dataset))
    [{'a': 0}, {'a': 1}, {'a': 2}]
    >>> dataset = prepro_fn(dataset, tf.estimator.ModeKeys.PREDICT)
    >>> list(readers.from_dataset(dataset))
    [{'a': 0}, {'a': 0}, {'a': 0}]

    To write a new :class:`~Prepro` subclass, override the `apply` method
    of the base :class:`~Prepro` class::

        def apply(self, dataset: tf.data.Dataset, mode: str = None) -> tf.data.Dataset:
            return dataset

    For simple cases, the `prepro` decorator is the quickest way to
    define a custom preprocessor (see its documentation).
    """

    def __repr__(self) -> str:
        # The representation is the bare name of the concrete class.
        return self.__class__.__name__

    def __call__(self, dataset: tf.data.Dataset, mode: str = None) -> tf.data.Dataset:
        """Run :meth:`apply`, opening a ``TableContext`` if none is active."""
        if not TableContext.is_active():
            # No context is active yet: run `apply` inside a fresh one.
            with TableContext():
                return self.apply(dataset, mode=mode)
        # A context is already active: reuse it instead of nesting another.
        return self.apply(dataset, mode=mode)

    @abstractmethod
    def apply(self, dataset: tf.data.Dataset, mode: str = None) -> tf.data.Dataset:
        """Transform `dataset` (optionally depending on `mode`); subclasses must override."""
        raise NotImplementedError


class PreproFn(Prepro):
    """Adapter turning a ``(dataset, mode) -> dataset`` callable into a :class:`~Prepro`."""

    def __init__(self, prepro_fn: Callable[[tf.data.Dataset, Optional[str]], tf.data.Dataset]):
        # Keep the callable as-is; it is only invoked from `apply`.
        self.prepro_fn = prepro_fn

    def apply(self, dataset: tf.data.Dataset, mode: str = None) -> tf.data.Dataset:
        """Delegate to the wrapped callable, passing `dataset` and `mode` positionally."""
        wrapped = self.prepro_fn
        return wrapped(dataset, mode)


def prepro(fn: Callable) -> Type[Prepro]:
    """Decorator that creates a :class:`~Prepro` class from a function.

    For example, the following snippet defines a subclass of
    :class:`~Prepro` whose `apply` offsets each element of the dataset
    by `offset`:

    >>> from deepr import readers
    >>> from deepr.prepros import prepro
    >>> @prepro
    ... def AddOffset(dataset, mode, offset):
    ...     return dataset.map(lambda element: element + offset)
    >>> raw_dataset = tf.data.Dataset.from_tensor_slices([0, 1, 2])
    >>> prepro_fn = AddOffset(offset=1)
    >>> dataset = prepro_fn(raw_dataset)
    >>> list(readers.from_dataset(dataset))
    [1, 2, 3]

    The class created by the decorator is roughly equivalent to

    .. code-block:: python

        class AddOffset(Prepro):

            def __init__(self, offset):
                Prepro.__init__(self)
                self.offset = offset

            def apply(self, dataset, mode: str = None):
                return dataset.map(lambda element: element + self.offset)

    You can also add a 'mode' argument to your preprocessor like so

    >>> @prepro
    ... def AddOffsetInTrain(dataset, mode, offset):
    ...     if mode == tf.estimator.ModeKeys.TRAIN:
    ...         return dataset.map(lambda element: element + offset)
    ...     else:
    ...         return dataset
    >>> prepro_fn = AddOffsetInTrain(offset=1)
    >>> dataset = prepro_fn(raw_dataset, tf.estimator.ModeKeys.TRAIN)
    >>> list(readers.from_dataset(dataset))
    [1, 2, 3]
    >>> dataset = prepro_fn(raw_dataset, tf.estimator.ModeKeys.PREDICT)
    >>> list(readers.from_dataset(dataset))
    [0, 1, 2]
    >>> dataset = prepro_fn(raw_dataset)
    >>> list(readers.from_dataset(dataset))
    [0, 1, 2]

    Note that 'dataset' and 'mode' need to be the first arguments
    of the function IN THIS ORDER.

    Parameters
    ----------
    fn : Callable
        Function whose first parameter is named 'dataset' and whose
        second parameter, if it declares one named 'mode', is 'mode'.
        Every other parameter becomes a constructor argument of the
        generated class.

    Returns
    -------
    Type[Prepro]
        A new :class:`~Prepro` subclass named after `fn`, carrying its
        module and docstring. The constructor stores its arguments and
        `apply` forwards them to `fn` after `dataset` (and `mode`).

    Raises
    ------
    TypeError
        If `fn` has no parameter, if its first parameter is not
        'dataset', or if it has a 'mode' parameter that is not second.
        The generated constructor also raises TypeError when its
        arguments do not match the remaining parameters of `fn`.
    """
    # pylint: disable=protected-access,invalid-name
    parameters = inspect.signature(fn).parameters
    names = list(parameters)
    has_mode = "mode" in parameters

    # Check parameters. The `not names` guard makes a function declared
    # without any parameter fail with the same TypeError as any other
    # signature violation, instead of an IndexError on `names[0]`.
    if not names or names[0] != "dataset":
        raise TypeError(f"'dataset' should be the first parameter of {fn.__name__}")
    # 'dataset' is first and 'mode' is present, so `names[1]` exists here.
    if has_mode and names[1] != "mode":
        raise TypeError(f"'mode' should be the second parameter of {fn.__name__}")

    # Signature of the generated constructor: the parameters of `fn` minus
    # the ones supplied at `apply` time.
    signature = inspect.Signature([param for key, param in parameters.items() if key not in {"dataset", "mode"}])

    @functools.wraps(fn)
    def _init(self, *args, **kwargs):
        Prepro.__init__(self)
        # Fail at construction (TypeError) rather than at `apply` time when
        # the arguments cannot be bound to the remaining parameters.
        signature.bind(*args, **kwargs)
        self._args = args
        self._kwargs = kwargs

    if has_mode:

        def _apply(self, dataset, mode: str = None):
            return fn(dataset, mode, *self._args, **self._kwargs)

    else:

        def _apply(self, dataset, mode: str = None):
            # pylint: disable=unused-argument
            # `fn` does not declare 'mode', so it is accepted but not forwarded.
            return fn(dataset, *self._args, **self._kwargs)

    attributes = {"__module__": fn.__module__, "__doc__": fn.__doc__, "__init__": _init, "apply": _apply}
    return type(fn.__name__, (Prepro,), attributes)