"""Field."""
from collections import namedtuple
from typing import Any, Tuple, Union
import numpy as np
import tensorflow as tf
class Field:
    """Convenient way to define fields for features.

    Attributes
    ----------
    default : Any
        Default value of the field for padding
    dtype : tf.DType
        Tensorflow type of the field (automatically inferred if string)
    name : str
        Name of the field
    sequence : bool
        If True, the field represents a sequence.

        Used for ``tf.Example`` message serialization : if ``sequence``
        is ``True``, the field will be stored in the ``feature_list``
        entry of a ``tf.train.SequenceExample``.

        Automatically set if not given : ``True`` if ``shape`` has
        exactly two dimensions and at least one of them is ``None``,
        ``False`` otherwise.
    shape : Tuple
        Shape of the field

    Notes
    -----
    ``__hash__`` is based on ``name`` only and ``__eq__`` is not
    overridden: two distinct ``Field`` objects sharing a name hash alike
    but do not compare equal.
    """

    def __init__(self, name: str, shape: Tuple, dtype, default: Any = None, sequence: Union[bool, None] = None):
        """Build a field.

        Parameters
        ----------
        name : str
            Name of the field
        shape : Tuple
            Shape of the field, ``None`` marks a dimension that is not set
        dtype
            Any type accepted by ``TensorType`` (resolved to a tf.DType)
        default : Any, optional
            Padding value. If ``None``, the default of the resolved
            ``TensorType`` is used.
        sequence : bool, optional
            Whether the field is a sequence. If ``None``, inferred from
            ``shape`` (see class docstring).

        Raises
        ------
        ValueError
            If the field is a sequence but ``shape`` is empty.
        """
        tensor_type = TensorType(dtype)
        self.name = name
        self.shape = tuple(shape)
        self.dtype = tensor_type.tf
        self.default = default if default is not None else tensor_type.default
        if sequence is not None:
            self.sequence = sequence
        else:
            # Use the normalized tuple so that any iterable ``shape`` is accepted.
            self.sequence = len(self.shape) == 2 and any(dim is None for dim in self.shape)
        if self.sequence and not self.shape:
            msg = f"sequence=True but shape={self.shape}: expected at least one dimension."
            raise ValueError(msg)

    def __repr__(self):
        """Return ``ClassName(name, shape, dtype, default)``."""
        return f"{self.__class__.__name__}({self.name}, {self.shape}, {self.dtype}, {self.default})"

    def __str__(self):
        """Return the name of the field."""
        return self.name

    def __hash__(self):
        """Return the hash of the field's name."""
        return hash(self.name)

    @property
    def feature_specs(self):
        """Return feature specs for parsing Example messages.

        Raises
        ------
        ValueError
            If the field is not featurizable (see ``is_featurizable``).
        """
        if not self.is_featurizable():
            raise ValueError(f"{self} is not featurizable, no feature specs.")
        # For a sequence, the first dimension is left out of the spec shape.
        spec_shape = self.shape[1:] if self.sequence else self.shape
        if any(dim is None for dim in spec_shape):
            return tf.io.VarLenFeature(dtype=self.dtype)
        if self.sequence:
            return tf.io.FixedLenSequenceFeature(shape=spec_shape, dtype=self.dtype)
        return tf.io.FixedLenFeature(shape=spec_shape, dtype=self.dtype)

    @property
    def batch_shape(self):
        """Return ``shape`` with a leading ``None`` dimension prepended."""
        return (None,) + self.shape

    def is_sparse(self) -> bool:
        """Return True if the field is featurizable and parsed as ``tf.io.VarLenFeature``."""
        if self.is_featurizable():
            return isinstance(self.feature_specs, tf.io.VarLenFeature)
        return False

    def is_featurizable(self) -> bool:
        """Return True if the field can be stored in an Example message.

        A sequence is not featurizable if it has more than 2 dimensions
        and one of the non-first dimensions is ``None``. A non-sequence
        is not featurizable if it has more than 1 dimension and any
        dimension is ``None``.
        """
        if self.sequence:
            return not (len(self.shape) > 2 and any(dim is None for dim in self.shape[1:]))
        return not (len(self.shape) > 1 and any(dim is None for dim in self.shape))

    def startswith(self, prefix: str):
        """Return True if the field's name starts with ``prefix``."""
        return self.name.startswith(prefix)

    def as_placeholder(self, batch: bool = False) -> tf.Tensor:
        """Return a ``tf.placeholder`` named after the field.

        Parameters
        ----------
        batch : bool, optional
            If True, use ``batch_shape`` (extra leading ``None``
            dimension) instead of ``shape``.
        """
        shape = self.batch_shape if batch else self.shape
        return tf.placeholder(dtype=self.dtype, shape=shape, name=self.name)

    def to_feature(self, value: np.ndarray) -> Union[tf.train.Feature, tf.train.FeatureList]:
        """Convert value to tf.train.Feature or tf.train.FeatureList.

        When the tensor (or, for a sequence, each item of the sequence)
        has more than one dimension, uses ``np.ravel`` to flatten it
        into a list of values. Note that because ``tf.Example`` uses
        row-major to parse list of values, we make sure to use the same
        order with NumPy.

        For that reason, if any of the flattened dimensions is not set
        (i.e. is ``None``), a ``ValueError`` is raised.

        Parameters
        ----------
        value : np.ndarray
            Tensor values

        Returns
        -------
        tf.train.FeatureList
            If ``sequence`` is ``True``
        tf.train.Feature
            If ``sequence`` is ``False``

        Raises
        ------
        ValueError
            If ``sequence`` and ``shape`` is empty.
            If ``sequence``, ``len(shape) > 2`` and one of the non-first
            dimensions is not set (i.e. is ``None``).
            If not ``sequence``, ``len(shape) > 1`` and any of the
            dimensions is not set (i.e. is ``None``).
        TypeError
            If ``dtype`` is not one of int32, int64, float32, float64
            or string.
        """

        def _to_feature(val):
            """Return tf.train.Feature wrapping the flat list of values ``val``."""
            if self.dtype is tf.int32 or self.dtype is tf.int64:
                return tf.train.Feature(int64_list=tf.train.Int64List(value=val))
            if self.dtype is tf.float32 or self.dtype is tf.float64:
                return tf.train.Feature(float_list=tf.train.FloatList(value=val))
            if self.dtype is tf.string:
                return tf.train.Feature(bytes_list=tf.train.BytesList(value=val))
            raise TypeError(f"Unable to convert field {self} to feature: unsupported dtype {self.dtype}.")

        ndim = len(self.shape)

        if self.sequence:
            # One tf.train.Feature per item along the first dimension.
            if ndim == 0:
                msg = f"sequence=True but shape={self.shape}: expected at least one dimension."
                raise ValueError(msg)
            if ndim == 1:
                # Items are scalars: wrap each one in a single-element list.
                return tf.train.FeatureList(feature=[_to_feature([val]) for val in value])
            if ndim == 2:
                return tf.train.FeatureList(feature=[_to_feature(val) for val in value])
            if any(dim is None for dim in self.shape[1:]):
                msg = f"Unable to convert field {self} to feature. If ndim > 2, dimensions must be static."
                raise ValueError(msg)
            return tf.train.FeatureList(feature=[_to_feature(np.ravel(val, order="C")) for val in value])

        if ndim == 0:
            # Scalar: wrap in a single-element list.
            return _to_feature([value])
        if ndim == 1:
            return _to_feature(value)
        if any(dim is None for dim in self.shape):
            msg = f"Unable to convert field {self} to feature. If ndim > 2, dimensions must be static."
            raise ValueError(msg)
        return _to_feature(np.ravel(value, order="C"))
# Record tying together the TensorFlow, NumPy and Python representations of a
# type, the default value used for it, and its string alias.
_TensorType = namedtuple("TensorType", ("tf", "np", "py", "default", "string"))

# Lookup table scanned in order by ``TensorType``: when several rows share a
# Python type (e.g. ``int``), the first row wins.
_TENSOR_TYPES = [
    _TensorType(tf.int32, np.int32, int, -1, "int32"),
    _TensorType(tf.int64, np.int64, int, -1, "int64"),
    _TensorType(tf.float32, np.float32, float, 0.0, "float32"),
    _TensorType(tf.float64, np.float64, float, 0.0, "float64"),
    # ``np.bool`` was a deprecated alias of the builtin ``bool`` and was
    # removed in NumPy 1.24; ``np.bool_`` is the NumPy boolean scalar type.
    # The builtin ``bool`` is still matched through the ``py`` column.
    _TensorType(tf.bool, np.bool_, bool, True, "bool"),
    _TensorType(tf.string, np.dtype("S"), bytes, b"", "bytes"),
    _TensorType(tf.string, np.dtype("S"), str, b"", "string"),
]
def TensorType(dtype):
    """Look up the TensorType entry matching a Python, TensorFlow or NumPy type.

    Entries of ``_TENSOR_TYPES`` are scanned in order; the first entry
    whose ``tf``, ``np`` or ``py`` member is ``dtype`` (identity), or
    whose ``string`` member equals ``dtype``, is returned.

    Raises
    ------
    ValueError
        If no entry matches ``dtype``.
    """
    # pylint: disable=invalid-name
    for candidate in _TENSOR_TYPES:
        if (
            dtype is candidate.tf
            or dtype is candidate.np
            or dtype is candidate.py
            or dtype == candidate.string
        ):
            return candidate
    raise ValueError(f"TensorType not found `{dtype}`")