Source code for holoviews.core.data

from __future__ import absolute_import

try:
    import itertools.izip as zip
except ImportError:
    pass

import numpy as np
import param

from ..dimension import redim
from ..util import dimension_range, basestring
from .interface import Interface, iloc, ndloc
from .array import ArrayInterface
from .dictionary import DictInterface
from .grid import GridInterface
from .multipath import MultiInterface         # noqa (API import)
from .image import ImageInterface             # noqa (API import)

datatypes = ['array', 'dictionary', 'grid']

try:
    import pandas as pd # noqa (Availability import)
    from .pandas import PandasInterface
    datatypes = ['array', 'dataframe', 'dictionary', 'grid', 'ndelement']
    DFColumns = PandasInterface
except ImportError:
    pass
except Exception as e:
    param.main.warning('Pandas interface failed to import with '
                       'following error: %s' % e)

try:
    import iris # noqa (Availability import)
    from .iris import CubeInterface # noqa (Conditional API import)
    datatypes.append('cube')
except ImportError:
    pass
except Exception as e:
    param.main.warning('Iris interface failed to import with '
                       'following error: %s' % e)

try:
    import xarray # noqa (Availability import)
    from .xarray import XArrayInterface # noqa (Conditional API import)
    datatypes.append('xarray')
except ImportError:
    pass

try:
    from .dask import DaskInterface   # noqa (Conditional API import)
    datatypes.append('dask')
except ImportError:
    pass

from ..dimension import Dimension, process_dimensions
from ..element import Element
from ..ndmapping import OrderedDict
from ..spaces import HoloMap, DynamicMap
from .. import util


[docs]class DataConversion(object):
    """
    DataConversion is a very simple container object which can be
    given an existing Dataset Element and provides methods to convert
    the Dataset into most other Element types.
    """

    def __init__(self, element):
        self._element = element

    def __call__(self, new_type, kdims=None, vdims=None, groupby=None,
                 sort=False, **kwargs):
        """
        Generic conversion method for Dataset based Element
        types. Supply the Dataset Element type to convert to and
        optionally the key dimensions (kdims), value dimensions
        (vdims) and the dimensions.  to group over. Converted Columns
        can be automatically sorted via the sort option and kwargs can
        be passed through.
        """
        if 'mdims' in kwargs:
            if groupby:
                raise ValueError('Cannot supply both mdims and groupby')
            else:
                self._element.warning("'mdims' keyword has been renamed "
                                      "to 'groupby'; the name mdims is "
                                      "deprecated and will be removed "
                                      "after version 1.7.")
                groupby = kwargs.pop('mdims')

        element_params = new_type.params()
        kdim_param = element_params['kdims']
        vdim_param = element_params['vdims']
        if isinstance(kdim_param.bounds[1], int):
            ndim = min([kdim_param.bounds[1], len(kdim_param.default)])
        else:
            ndim = None
        nvdim = vdim_param.bounds[1] if isinstance(vdim_param.bounds[1], int) else None
        if kdims is None:
            kd_filter = groupby or []
            if not isinstance(kd_filter, list):
                kd_filter = [groupby]
            kdims = [kd for kd in self._element.kdims if kd not in kd_filter][:ndim]
        elif kdims and not isinstance(kdims, list): kdims = [kdims]
        if vdims is None:
            vdims = [d for d in self._element.vdims if d not in kdims][:nvdim]
        if vdims and not isinstance(vdims, list): vdims = [vdims]

        # Checks Element type supports dimensionality
        type_name = new_type.__name__
        for dim_type, dims in (('kdims', kdims), ('vdims', vdims)):
            min_d, max_d = new_type.params(dim_type).bounds
            if ((min_d is not None and len(dims) < min_d) or
                (max_d is not None and len(dims) > max_d)):
                raise ValueError("%s %s must be between length %s and %s." %
                                 (type_name, dim_type, min_d, max_d))

        if groupby is None:
            groupby = [d for d in self._element.kdims if d not in kdims+vdims]
        elif groupby and not isinstance(groupby, list):
            groupby = [groupby]

        if self._element.interface.gridded:
            dropped_kdims = [kd for kd in self._element.kdims if kd not in groupby+kdims]
            if dropped_kdims:
                selected = self._element.reindex(groupby+kdims, vdims)
            else:
                selected = self._element
        else:
            selected = self._element.reindex(groupby+kdims, vdims)
        params = {'kdims': [selected.get_dimension(kd, strict=True) for kd in kdims],
                  'vdims': [selected.get_dimension(vd, strict=True) for vd in vdims],
                  'label': selected.label}
        if selected.group != selected.params()['group'].default:
            params['group'] = selected.group
        params.update(kwargs)
        if len(kdims) == selected.ndims or not groupby:
            element = new_type(selected, **params)
            return element.sort() if sort else element
        group = selected.groupby(groupby, container_type=HoloMap,
                                 group_type=new_type, **params)
        if sort:
            return group.map(lambda x: x.sort(), [new_type])
        else:
            return group



[docs]class Dataset(Element):
    """
    Dataset provides a general baseclass for Element types that
    contain structured data and supports a range of data formats.

    The Dataset class supports various methods offering a consistent way
    of working with the stored data regardless of the storage format
    used. These operations include indexing, selection and various ways
    of aggregating or collapsing the data with a supplied function.
    """

    datatype = param.List(datatypes,
        doc=""" A priority list of the data types to be used for storage
        on the .data attribute. If the input supplied to the element
        constructor cannot be put into the requested format, the next
        format listed will be used until a suitable format is found (or
        the data fails to be understood).""")

    group = param.String(default='Dataset', constant=True)

    # In the 1D case the interfaces should not automatically add x-values
    # to supplied data
    _auto_indexable_1d = True

    # Define a class used to transform Datasets into other Element types
    _conversion_interface = DataConversion

    _vdim_reductions = {}
    _kdim_reductions = {}

    def __init__(self, data, kdims=None, vdims=None, **kwargs):
        if isinstance(data, Element):
            pvals = util.get_param_values(data)
            kwargs.update([(l, pvals[l]) for l in ['group', 'label']
                           if l in pvals and l not in kwargs])
        kwargs.update(process_dimensions(kdims, vdims))
        kdims, vdims = kwargs.get('kdims'), kwargs.get('vdims')

        initialized = Interface.initialize(type(self), data, kdims, vdims,
                                           datatype=kwargs.get('datatype'))
        (data, self.interface, dims, extra_kws) = initialized
        validate_vdims = kwargs.pop('_validate_vdims', True)
        super(Dataset, self).__init__(data, **dict(kwargs, **dict(dims, **extra_kws)))
        self.interface.validate(self, validate_vdims)

        self.redim = redim(self, mode='dataset')


[docs]    def closest(self, coords=[], **kwargs):
        """
        Given a single coordinate or multiple coordinates as
        a tuple or list of tuples or keyword arguments matching
        the dimension closest will find the closest actual x/y
        coordinates. Different Element types should implement this
        appropriately depending on the space they represent, if the
        Element does not support snapping raise NotImplementedError.
        """
        if self.ndims > 1:
            raise NotImplementedError("Closest method currently only "
                                      "implemented for 1D Elements")

        if kwargs:
            if len(kwargs) > 1:
                raise NotImplementedError("Closest method currently only "
                                          "supports 1D indexes")
            samples = list(kwargs.values())[0]
            coords = samples if isinstance(samples, list) else [samples]

        xs = self.dimension_values(0)
        if xs.dtype.kind in 'SO':
            raise NotImplementedError("Closest only supported for numeric types")
        idxs = [np.argmin(np.abs(xs-coord)) for coord in coords]
        return [xs[idx] for idx in idxs]


[docs]    def sort(self, by=[], reverse=False):
        """
        Sorts the data by the values along the supplied dimensions.
        """
        if not by: by = self.kdims
        if not isinstance(by, list): by = [by]

        sorted_columns = self.interface.sort(self, by, reverse)
        return self.clone(sorted_columns)


[docs]    def range(self, dim, data_range=True):
        """
        Computes the range of values along a supplied dimension, taking
        into account the range and soft_range defined on the Dimension
        object.
        """
        dim = self.get_dimension(dim)
        if dim is None:
            return (None, None)
        elif all(v is not None and np.isfinite(v) for v in dim.range):
            return dim.range
        elif dim in self.dimensions() and data_range and len(self):
            lower, upper = self.interface.range(self, dim)
        else:
            lower, upper = (np.NaN, np.NaN)
        return dimension_range(lower, upper, dim)


[docs]    def add_dimension(self, dimension, dim_pos, dim_val, vdim=False, **kwargs):
        """
        Create a new object with an additional key dimensions.  Requires
        the dimension name or object, the desired position in the key
        dimensions and a key value scalar or sequence of the same length
        as the existing keys.
        """
        if isinstance(dimension, (util.basestring, tuple)):
            dimension = Dimension(dimension)

        if dimension.name in self.kdims:
            raise Exception('{dim} dimension already defined'.format(dim=dimension.name))

        if vdim:
            dims = self.vdims[:]
            dims.insert(dim_pos, dimension)
            dimensions = dict(vdims=dims)
            dim_pos += self.ndims
        else:
            dims = self.kdims[:]
            dims.insert(dim_pos, dimension)
            dimensions = dict(kdims=dims)

        data = self.interface.add_dimension(self, dimension, dim_pos, dim_val, vdim)
        return self.clone(data, **dimensions)


[docs]    def select(self, selection_specs=None, **selection):
        """
        Allows selecting data by the slices, sets and scalar values
        along a particular dimension. The indices should be supplied as
        keywords mapping between the selected dimension and
        value. Additionally selection_specs (taking the form of a list
        of type.group.label strings, types or functions) may be
        supplied, which will ensure the selection is only applied if the
        specs match the selected object.
        """
        selection = {dim: sel for dim, sel in selection.items()
                     if dim in self.dimensions()+['selection_mask']}
        if (selection_specs and not any(self.matches(sp) for sp in selection_specs)
            or not selection):
            return self

        data = self.interface.select(self, **selection)

        if np.isscalar(data):
            return data
        else:
            return self.clone(data)


[docs]    def reindex(self, kdims=None, vdims=None):
        """
        Create a new object with a re-ordered set of dimensions.  Allows
        converting key dimensions to value dimensions and vice versa.
        """
        if kdims is None:
            key_dims = [d for d in self.kdims if not vdims or d not in vdims]
        else:
            key_dims = [self.get_dimension(k, strict=True) for k in kdims]

        new_type = None
        if vdims is None:
            val_dims = [d for d in self.vdims if not kdims or d not in kdims]
        else:
            val_dims = [self.get_dimension(v, strict=True) for v in vdims]
            new_type = self._vdim_reductions.get(len(val_dims), type(self))

        data = self.interface.reindex(self, key_dims, val_dims)
        return self.clone(data, kdims=key_dims, vdims=val_dims,
                          new_type=new_type)


    def __getitem__(self, slices):
        """
        Allows slicing and selecting values in the Dataset object.
        Supports multiple indexing modes:

           (1) Slicing and indexing along the values of each dimension
               in the columns object using either scalars, slices or
               sets of values.
           (2) Supplying the name of a dimension as the first argument
               will return the values along that dimension as a numpy
               array.
           (3) Slicing of all key dimensions and selecting a single
               value dimension by name.
           (4) A boolean array index matching the length of the Dataset
               object.
        """
        slices = util.process_ellipses(self, slices, vdim_selection=True)
        if isinstance(slices, np.ndarray) and slices.dtype.kind == 'b':
            if not len(slices) == len(self):
                raise IndexError("Boolean index must match length of sliced object")
            return self.clone(self.select(selection_mask=slices))
        elif slices in [(), Ellipsis]:
            return self
        if not isinstance(slices, tuple): slices = (slices,)
        value_select = None
        if len(slices) == 1 and slices[0] in self.dimensions():
            return self.dimension_values(slices[0])
        elif len(slices) == self.ndims+1 and slices[self.ndims] in self.dimensions():
            selection = dict(zip(self.dimensions('key', label=True), slices))
            value_select = slices[self.ndims]
        elif len(slices) == self.ndims+1 and isinstance(slices[self.ndims],
                                                        (Dimension,str)):
            raise Exception("%r is not an available value dimension" % slices[self.ndims])
        else:
            selection = dict(zip(self.dimensions(label=True), slices))
        data = self.select(**selection)
        if value_select:
            if data.shape[0] == 1:
                return data[value_select][0]
            else:
                return data.reindex(vdims=[value_select])
        return data


[docs]    def sample(self, samples=[], closest=True, **kwargs):
        """
        Allows sampling of Dataset as an iterator of coordinates
        matching the key dimensions, returning a new object containing
        just the selected samples. Alternatively may supply kwargs
        to sample a coordinate on an object. By default it will attempt
        to snap to the nearest coordinate if the Element supports it,
        snapping may be disabled with the closest argument.
        """
        if kwargs and samples:
            raise Exception('Supply explicit list of samples or kwargs, not both.')
        elif kwargs:
            sample = [slice(None) for _ in range(self.ndims)]
            for dim, val in kwargs.items():
                sample[self.get_dimension_index(dim)] = val
            samples = [tuple(sample)]

        # Note: Special handling sampling of gridded 2D data as Curve
        # may be replaced with more general handling
        # see https://github.com/ioam/holoviews/issues/1173
        from ...element import Table, Curve
        if len(samples) == 1:
            sel = {kd.name: s for kd, s in zip(self.kdims, samples[0])}
            dims = [kd for kd, v in sel.items() if not np.isscalar(v)]
            selection = self.select(**sel)

            # If a 1D cross-section of 2D space return Curve
            if self.interface.gridded and self.ndims == 2 and len(dims) == 1:
                new_type = Curve
                kdims = [self.get_dimension(kd) for kd in dims]
            else:
                new_type = Table
                kdims = self.kdims

            if np.isscalar(selection):
                selection = [samples[0]+(selection,)]
            else:
                selection = tuple(selection.columns(kdims+self.vdims).values())

            datatype = list(util.unique_iterator(self.datatype+['dataframe', 'dict']))
            return self.clone(selection, kdims=kdims, new_type=new_type,
                              datatype=datatype)

        lens = set(len(util.wrap_tuple(s)) for s in samples)
        if len(lens) > 1:
            raise IndexError('Sample coordinates must all be of the same length.')

        if closest:
            try:
                samples = self.closest(samples)
            except NotImplementedError:
                pass
        samples = [util.wrap_tuple(s) for s in samples]
        return self.clone(self.interface.sample(self, samples), new_type=Table)


[docs]    def reduce(self, dimensions=[], function=None, spreadfn=None, **reduce_map):
        """
        Allows reducing the values along one or more key dimension with
        the supplied function. The dimensions may be supplied as a list
        and a function to apply or a mapping between the dimensions and
        functions to apply along each dimension.
        """
        if any(dim in self.vdims for dim in dimensions):
            raise Exception("Reduce cannot be applied to value dimensions")
        function, dims = self._reduce_map(dimensions, function, reduce_map)
        dims = [d for d in self.kdims if d not in dims]
        return self.aggregate(dims, function, spreadfn)


[docs]    def aggregate(self, dimensions=None, function=None, spreadfn=None, **kwargs):
        """
        Aggregates over the supplied key dimensions with the defined
        function.
        """
        if function is None:
            raise ValueError("The aggregate method requires a function to be specified")
        if dimensions is None: dimensions = self.kdims
        elif not isinstance(dimensions, list): dimensions = [dimensions]
        kdims = [self.get_dimension(d, strict=True) for d in dimensions]
        if not len(self):
            if spreadfn:
                spread_name = spreadfn.__name__
                vdims = [d for vd in self.vdims for d in [vd, vd('_'.join([vd.name, spread_name]))]]
            else:
                vdims = self.vdims
            return self.clone([], kdims=kdims, vdims=vdims)

        aggregated = self.interface.aggregate(self, kdims, function, **kwargs)
        aggregated = self.interface.unpack_scalar(self, aggregated)

        ndims = len(dimensions)
        min_d, max_d = self.params('kdims').bounds
        generic_type = (min_d is not None and ndims < min_d) or (max_d is not None and ndims > max_d)

        vdims = self.vdims
        if spreadfn:
            error = self.interface.aggregate(self, dimensions, spreadfn)
            spread_name = spreadfn.__name__
            ndims = len(vdims)
            error = self.clone(error, kdims=kdims, new_type=Dataset)
            combined = self.clone(aggregated, kdims=kdims, new_type=Dataset)
            for i, d in enumerate(vdims):
                dim = d('_'.join([d.name, spread_name]))
                dvals = error.dimension_values(d, flat=False)
                combined = combined.add_dimension(dim, ndims+i, dvals, True)
            return combined.clone(new_type=Dataset if generic_type else type(self))

        if np.isscalar(aggregated):
            return aggregated
        else:
            try:
                # Should be checking the dimensions declared on the element are compatible
                return self.clone(aggregated, kdims=kdims, vdims=vdims)
            except:
                datatype = self.params('datatype').default
                return self.clone(aggregated, kdims=kdims, vdims=vdims,
                                  new_type=Dataset if generic_type else None,
                                  datatype=datatype)


[docs]    def groupby(self, dimensions=[], container_type=HoloMap, group_type=None,
                dynamic=False, **kwargs):
        """Return the results of a groupby operation over the specified
        dimensions as an object of type container_type (expected to be
        dictionary-like).

        Keys vary over the columns (dimensions) and the corresponding
        values are collections of group_type (e.g an Element, list, tuple)
        constructed with kwargs (if supplied).

        If dynamic is requested container_type is automatically set to
        a DynamicMap, allowing dynamic exploration of large
        datasets. If the data does not represent a full cartesian grid
        of the requested dimensions some Elements will be empty.
        """
        if not isinstance(dimensions, list): dimensions = [dimensions]
        if not len(dimensions): dimensions = self.dimensions('key', True)
        if group_type is None: group_type = type(self)

        dimensions = [self.get_dimension(d, strict=True) for d in dimensions]
        dim_names = [d.name for d in dimensions]

        if dynamic:
            group_dims = [d.name for d in self.kdims if d not in dimensions]
            kdims = [self.get_dimension(d) for d in group_dims]
            group_kwargs = dict(util.get_param_values(self), kdims=kdims)
            group_kwargs.update(kwargs)
            drop_dim = len(kdims) != len(group_kwargs['kdims'])
            def load_subset(*args):
                constraint = dict(zip(dim_names, args))
                group = self.select(**constraint)
                if np.isscalar(group):
                    return group_type(([group],), group=self.group,
                                      label=self.label, vdims=self.vdims)
                data = group.reindex(group_dims)
                if drop_dim and self.interface.gridded:
                    data = data.columns()
                return group_type(data, **group_kwargs)
            dynamic_dims = [d(values=list(self.interface.values(self, d.name, False)))
                            for d in dimensions]
            return DynamicMap(load_subset, kdims=dynamic_dims)

        return self.interface.groupby(self, dim_names, container_type,
                                      group_type, **kwargs)

    def __len__(self):
        """
        Returns the number of rows in the Dataset object.
        """
        return self.interface.length(self)

    def __nonzero__(self):
        return self.interface.nonzero(self)

    __bool__ = __nonzero__

    @property
    def shape(self):
        "Returns the shape of the data."
        return self.interface.shape(self)


[docs]    def dimension_values(self, dim, expanded=True, flat=True):
        """
        Returns the values along a particular dimension. If unique
        values are requested will return only unique values.
        """
        dim = self.get_dimension(dim, strict=True)
        return self.interface.values(self, dim, expanded, flat)


[docs]    def get_dimension_type(self, dim):
        """
        Returns the specified Dimension type if specified or
        if the dimension_values types are consistent otherwise
        None is returned.
        """
        dim_obj = self.get_dimension(dim)
        if dim_obj and dim_obj.type is not None:
            return dim_obj.type
        return self.interface.dimension_type(self, dim_obj)


[docs]    def dframe(self, dimensions=None):
        """
        Returns the data in the form of a DataFrame. Supplying a list
        of dimensions filters the dataframe. If the data is already
        a DataFrame a copy is returned.
        """
        if dimensions:
            dimensions = [self.get_dimension(d, strict=True).name for d in dimensions]
        return self.interface.dframe(self, dimensions)


    def columns(self, dimensions=None):
        if dimensions is None:
            dimensions = self.dimensions()
        else:
            dimensions = [self.get_dimension(d, strict=True) for d in dimensions]
        return OrderedDict([(d.name, self.dimension_values(d)) for d in dimensions])


    @property
    def to(self):
        """
        Property to create a conversion interface with methods to
        convert to other Element types.
        """
        return self._conversion_interface(self)


    @property
    def iloc(self):
        """
        Returns an iloc object providing a convenient interface to
        slice and index into the Dataset using row and column indices.
        Allow selection by integer index, slice and list of integer
        indices and boolean arrays.

        Examples:

        * Index the first row and column:

            dataset.iloc[0, 0]

        * Select rows 1 and 2 with a slice:

            dataset.iloc[1:3, :]

        * Select with a list of integer coordinates:

            dataset.iloc[[0, 2, 3]]
        """
        return iloc(self)


    @property
    def ndloc(self):
        """
        Returns an ndloc object providing nd-array like indexing for
        gridded datasets. Follows NumPy array indexing conventions,
        allowing for indexing, slicing and selecting a list of indices
        on multi-dimensional arrays using integer indices. The order
        of array indices is inverted relative to the Dataset key
        dimensions, e.g. an Image with key dimensions 'x' and 'y' can
        be indexed with ``image.ndloc[iy, ix]``, where ``iy`` and
        ``ix`` are integer indices along the y and x dimensions.

        Examples:

        * Index value in 2D array:

            dataset.ndloc[3, 1]

        * Slice along y-axis of 2D array:

            dataset.ndloc[2:5, :]

        * Vectorized (non-orthogonal) indexing along x- and y-axes:

            dataset.ndloc[[1, 2, 3], [0, 2, 3]]
        """
        return ndloc(self)


# Aliases for pickle backward compatibility
Columns      = Dataset
ArrayColumns = ArrayInterface
DictColumns  = DictInterface
GridColumns  = GridInterface