
DummyDataset

The main class for creating and managing dataset metadata specifications.

DummyDataset is composed of multiple mixins that provide different functionality:

  • Core - Basic dataset operations (dimensions, coordinates, variables)
  • HistoryMixin - Operation tracking and replay
  • ProvenanceMixin - Track what changed in operations
  • CFComplianceMixin - CF convention support
  • CFStandardsMixin - CF standard names and vocabulary
  • IOMixin - Serialization and format conversion
  • ValidationMixin - Dataset structure validation
  • DataGenerationMixin - Generate realistic random data
  • FileTrackerMixin - Track source files in multi-file datasets

Class Reference

Bases: HistoryMixin, ProvenanceMixin, CFComplianceMixin, CFStandardsMixin, IOMixin, ValidationMixin, DataGenerationMixin, FileTrackerMixin

A dummy xarray-like dataset for building metadata specifications.

This class allows you to define the structure of a dataset including dimensions, coordinates, variables, and global attributes before creating the actual xarray.Dataset with real data.
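
A minimal usage sketch based only on the methods documented on this page (data generation and conversion to a real xarray.Dataset are provided by the mixins and are not shown here):

>>> ds = DummyDataset()
>>> ds.add_dim("time", 10)
>>> ds.add_dim("lat", 64)
>>> ds.add_coord("time", dims=["time"], attrs={"units": "days since 2000-01-01"})
>>> ds.add_variable("temperature", dims=["time", "lat"], attrs={"units": "K"})
>>> ds.set_global_attrs(title="My Dataset", institution="DKRZ")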

Source code in src/dummyxarray/core.py
class DummyDataset(
    HistoryMixin,
    ProvenanceMixin,
    CFComplianceMixin,
    CFStandardsMixin,
    IOMixin,
    ValidationMixin,
    DataGenerationMixin,
    FileTrackerMixin,
):
    """
    A dummy xarray-like dataset for building metadata specifications.

    This class allows you to define the structure of a dataset including
    dimensions, coordinates, variables, and global attributes before
    creating the actual xarray.Dataset with real data.
    """

    def __init__(self, _record_history=True):
        """
        Initialize an empty DummyDataset.

        Parameters
        ----------
        _record_history : bool, optional
            Whether to record operation history (default: True)
        """
        self.dims = {}  # dim_name → size
        self.coords = {}  # coord_name → DummyArray
        self.variables = {}  # var_name  → DummyArray
        self.attrs = {}  # global attributes

        # Operation history tracking
        self._history = [] if _record_history else None
        if _record_history:
            self._record_operation("__init__", {})

    def __repr__(self):
        """Return a string representation similar to xarray.Dataset."""
        lines = ["<dummyxarray.DummyDataset>"]

        # Dimensions
        if self.dims:
            lines.append("Dimensions:")
            dim_strs = [f"  {name}: {size}" for name, size in self.dims.items()]
            lines.extend(dim_strs)
        else:
            lines.append("Dimensions: ()")

        # Coordinates
        if self.coords:
            lines.append("Coordinates:")
            for name, arr in self.coords.items():
                dims_str = f"({', '.join(arr.dims)})" if arr.dims else "()"
                has_data = "✓" if arr.data is not None else "✗"
                dtype_str = f"{arr.data.dtype}" if arr.data is not None else "?"
                lines.append(f"  {has_data} {name:20s} {dims_str:20s} {dtype_str}")

        # Data variables
        if self.variables:
            lines.append("Data variables:")
            for name, arr in self.variables.items():
                dims_str = f"({', '.join(arr.dims)})" if arr.dims else "()"
                has_data = "✓" if arr.data is not None else "✗"
                dtype_str = f"{arr.data.dtype}" if arr.data is not None else "?"
                lines.append(f"  {has_data} {name:20s} {dims_str:20s} {dtype_str}")

        # Global attributes
        if self.attrs:
            lines.append("Attributes:")
            for key, value in self.attrs.items():
                value_str = str(value)
                if len(value_str) > 50:
                    value_str = value_str[:47] + "..."
                lines.append(f"    {key}: {value_str}")

        return "\n".join(lines)

    def __getattr__(self, name):
        """
        Allow attribute-style access to coordinates and variables.

        This enables xarray-style access like `ds.time` instead of `ds.coords['time']`.
        Coordinates take precedence over variables if both exist with the same name.

        Parameters
        ----------
        name : str
            Name of the coordinate or variable to access

        Returns
        -------
        DummyArray
            The coordinate or variable array

        Raises
        ------
        AttributeError
            If the name is not found in coords or variables
        """
        # Check coordinates first (like xarray does)
        if name in self.coords:
            return self.coords[name]
        # Then check variables
        if name in self.variables:
            return self.variables[name]
        # If not found, raise AttributeError
        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")

    def __setattr__(self, name, value):
        """
        Handle attribute assignment.

        Special handling for internal attributes (dims, coords, variables, attrs).
        For other names, this could be extended to allow setting coords/variables.
        """
        # Internal attributes that should be set normally
        # Allow private attributes (starting with _) for mixins
        if name in ("dims", "coords", "variables", "attrs", "_history") or name.startswith("_"):
            object.__setattr__(self, name, value)
        else:
            # For now, raise an error to avoid confusion
            # Could be extended to allow ds.time = DummyArray(...) in the future
            raise AttributeError(
                f"Cannot set attribute '{name}' directly. "
                f"Use ds.coords['{name}'] or ds.variables['{name}'] instead."
            )

    def __dir__(self):
        """
        Customize dir() output to include coordinates and variables.

        This makes tab-completion work in IPython/Jupyter.
        """
        # Get default attributes
        default_attrs = set(object.__dir__(self))
        # Add coordinate and variable names
        return sorted(default_attrs | set(self.coords.keys()) | set(self.variables.keys()))

    # ------------------------------------------------------------
    # Core API
    # ------------------------------------------------------------

    def set_global_attrs(self, **kwargs):
        """
        Set or update global dataset attributes.

        Parameters
        ----------
        **kwargs
            Attributes to set

        Examples
        --------
        >>> ds = DummyDataset()
        >>> ds.set_global_attrs(title="My Dataset", institution="DKRZ")
        """
        self.attrs.update(kwargs)

    def assign_attrs(self, **kwargs):
        """
        Assign new global attributes to this dataset (xarray-compatible API).

        Parameters
        ----------
        **kwargs
            Attributes to assign

        Returns
        -------
        self
            Returns self for method chaining

        Examples
        --------
        >>> ds = DummyDataset()
        >>> ds.assign_attrs(title="My Dataset", institution="DKRZ")
        """
        # Capture provenance
        provenance = {"modified": {}}
        for key, value in kwargs.items():
            old_value = self.attrs.get(key)
            provenance["modified"][key] = {"before": old_value, "after": value}

        self._record_operation("assign_attrs", kwargs, provenance)
        self.attrs.update(kwargs)
        return self

    def add_dim(self, name, size):
        """
        Add a dimension with a specific size.

        Parameters
        ----------
        name : str
            Dimension name
        size : int
            Dimension size

        Examples
        --------
        >>> ds = DummyDataset()
        >>> ds.add_dim("time", 10)
        >>> ds.add_dim("lat", 64)
        """
        # Capture provenance
        if name in self.dims:
            provenance = {"modified": {name: {"before": self.dims[name], "after": size}}}
        else:
            provenance = {"added": [name]}

        self._record_operation("add_dim", {"name": name, "size": size}, provenance)
        self.dims[name] = size

    def add_coord(self, name, dims=None, attrs=None, data=None, encoding=None):
        """
        Add a coordinate variable.

        Parameters
        ----------
        name : str
            Coordinate name
        dims : list of str, optional
            Dimension names
        attrs : dict, optional
            Metadata attributes
        data : array-like, optional
            Coordinate data
        encoding : dict, optional
            Encoding parameters
        """
        # Record operation (don't store actual data)
        args = {"name": name}
        if dims is not None:
            args["dims"] = dims
        if attrs:
            args["attrs"] = attrs
        if data is not None:
            args["data"] = "<data>"
        if encoding:
            args["encoding"] = encoding

        # Capture provenance
        provenance = {}
        if name in self.coords:
            # Coordinate already exists - track what changed
            old_coord = self.coords[name]
            changes = {}
            if dims != old_coord.dims:
                changes["dims"] = {"before": old_coord.dims, "after": dims}
            if attrs and attrs != old_coord.attrs:
                changes["attrs"] = {"before": old_coord.attrs.copy(), "after": attrs}
            if changes:
                provenance["modified"] = {name: changes}
        else:
            provenance["added"] = [name]

        self._record_operation("add_coord", args, provenance)

        arr = DummyArray(dims, attrs, data, encoding, _record_history=False)
        self._infer_and_register_dims(arr)
        self.coords[name] = arr

    def add_variable(self, name, dims=None, attrs=None, data=None, encoding=None):
        """
        Add a data variable.

        Parameters
        ----------
        name : str
            Variable name
        dims : list of str, optional
            Dimension names
        attrs : dict, optional
            Metadata attributes
        data : array-like, optional
            Variable data
        encoding : dict, optional
            Encoding parameters
        """
        # Record operation (don't store actual data)
        args = {"name": name}
        if dims is not None:
            args["dims"] = dims
        if attrs:
            args["attrs"] = attrs
        if data is not None:
            args["data"] = "<data>"
        if encoding:
            args["encoding"] = encoding

        # Capture provenance
        provenance = {}
        if name in self.variables:
            # Variable already exists - track what changed
            old_var = self.variables[name]
            changes = {}
            if dims != old_var.dims:
                changes["dims"] = {"before": old_var.dims, "after": dims}
            if attrs and attrs != old_var.attrs:
                changes["attrs"] = {"before": old_var.attrs.copy(), "after": attrs}
            if changes:
                provenance["modified"] = {name: changes}
        else:
            provenance["added"] = [name]

        self._record_operation("add_variable", args, provenance)

        arr = DummyArray(dims, attrs, data, encoding, _record_history=False)
        self._infer_and_register_dims(arr)
        self.variables[name] = arr

    def rename_dims(self, dims_dict=None, **dims):
        """
        Rename dimensions (xarray-compatible API).

        Parameters
        ----------
        dims_dict : dict-like, optional
            Dictionary whose keys are current dimension names and whose
            values are the desired names.
        **dims : optional
            Keyword form of dims_dict.
            One of dims_dict or dims must be provided.

        Returns
        -------
        DummyDataset
            Returns self for method chaining

        Raises
        ------
        KeyError
            If a dimension doesn't exist
        ValueError
            If a new name already exists

        Examples
        --------
        >>> ds = DummyDataset()
        >>> ds.add_dim("time", 10)
        >>> ds.add_dim("lat", 64)
        >>> ds.rename_dims({"time": "t", "lat": "latitude"})
        >>> # Or using keyword arguments:
        >>> ds.rename_dims(time="t", lat="latitude")
        """
        # Merge dims_dict and **dims
        name_dict = {}
        if dims_dict is not None:
            name_dict.update(dims_dict)
        name_dict.update(dims)

        if not name_dict:
            raise ValueError("Either dims_dict or keyword arguments must be provided")

        # Validate all renames first
        for old_name, new_name in name_dict.items():
            if old_name not in self.dims:
                raise KeyError(f"Dimension '{old_name}' does not exist")
            if new_name in self.dims and new_name != old_name:
                raise ValueError(f"Dimension '{new_name}' already exists")

        # Capture provenance
        provenance = {
            "renamed": name_dict.copy(),
            "removed": list(name_dict.keys()),
            "added": list(name_dict.values()),
        }

        self._record_operation("rename_dims", {"dims_dict": name_dict}, provenance)

        # Perform all renames
        for old_name, new_name in name_dict.items():
            if old_name != new_name:
                self.dims[new_name] = self.dims.pop(old_name)

                # Update dimension references in coords and variables
                for coord in self.coords.values():
                    if coord.dims:
                        coord.dims = [new_name if d == old_name else d for d in coord.dims]
                for var in self.variables.values():
                    if var.dims:
                        var.dims = [new_name if d == old_name else d for d in var.dims]

        return self

    def rename_vars(self, name_dict=None, **names):
        """
        Rename variables (xarray-compatible API).

        Parameters
        ----------
        name_dict : dict-like, optional
            Dictionary whose keys are current variable names and whose
            values are the desired names.
        **names : optional
            Keyword form of name_dict.
            One of name_dict or names must be provided.

        Returns
        -------
        DummyDataset
            Returns self for method chaining

        Raises
        ------
        KeyError
            If a variable doesn't exist
        ValueError
            If a new name already exists

        Examples
        --------
        >>> ds = DummyDataset()
        >>> ds.add_dim("time", 10)
        >>> ds.add_variable("temperature", dims=["time"])
        >>> ds.rename_vars({"temperature": "temp"})
        >>> # Or using keyword arguments:
        >>> ds.rename_vars(temperature="temp")
        """
        # Merge name_dict and **names
        rename_dict = {}
        if name_dict is not None:
            rename_dict.update(name_dict)
        rename_dict.update(names)

        if not rename_dict:
            raise ValueError("Either name_dict or keyword arguments must be provided")

        # Validate all renames first
        for old_name, new_name in rename_dict.items():
            if old_name not in self.variables:
                raise KeyError(f"Variable '{old_name}' does not exist")
            if new_name in self.variables and new_name != old_name:
                raise ValueError(f"Variable '{new_name}' already exists")

        # Capture provenance
        provenance = {
            "renamed": rename_dict.copy(),
            "removed": list(rename_dict.keys()),
            "added": list(rename_dict.values()),
        }

        self._record_operation("rename_vars", {"name_dict": rename_dict}, provenance)

        # Perform all renames
        for old_name, new_name in rename_dict.items():
            if old_name != new_name:
                self.variables[new_name] = self.variables.pop(old_name)

        return self

    def rename(self, name_dict=None, **names):
        """
        Rename variables, coordinates, and dimensions (xarray-compatible API).

        This method can rename any combination of variables, coordinates, and dimensions.

        Parameters
        ----------
        name_dict : dict-like, optional
            Dictionary whose keys are current names (variables, coordinates, or dimensions)
            and whose values are the desired names.
        **names : optional
            Keyword form of name_dict.
            One of name_dict or names must be provided.

        Returns
        -------
        DummyDataset
            Returns self for method chaining

        Raises
        ------
        KeyError
            If a name doesn't exist
        ValueError
            If a new name already exists

        Examples
        --------
        >>> ds = DummyDataset()
        >>> ds.add_dim("time", 10)
        >>> ds.add_coord("time", dims=["time"])
        >>> ds.add_variable("temperature", dims=["time"])
        >>> # Rename multiple items at once
        >>> ds.rename({"time": "t", "temperature": "temp"})
        >>> # Or using keyword arguments:
        >>> ds.rename(time="t", temperature="temp")
        """
        # Merge name_dict and **names
        rename_dict = {}
        if name_dict is not None:
            rename_dict.update(name_dict)
        rename_dict.update(names)

        if not rename_dict:
            raise ValueError("Either name_dict or keyword arguments must be provided")

        # Categorize renames
        dim_renames = {}
        coord_renames = {}
        var_renames = {}

        for old_name, new_name in rename_dict.items():
            if old_name in self.dims:
                dim_renames[old_name] = new_name
            if old_name in self.coords:
                coord_renames[old_name] = new_name
            if old_name in self.variables:
                var_renames[old_name] = new_name

            # Check if name exists anywhere
            if (
                old_name not in self.dims
                and old_name not in self.coords
                and old_name not in self.variables
            ):
                raise KeyError(
                    f"'{old_name}' does not exist in dimensions, coordinates, or variables"
                )

        # Capture provenance
        provenance = {
            "renamed": rename_dict.copy(),
            "removed": list(rename_dict.keys()),
            "added": list(rename_dict.values()),
        }

        self._record_operation("rename", {"name_dict": rename_dict}, provenance)

        # Perform renames in order: dimensions first (affects coords/vars), then coords, then vars
        if dim_renames:
            for old_name, new_name in dim_renames.items():
                if old_name != new_name and old_name in self.dims:
                    self.dims[new_name] = self.dims.pop(old_name)
                    # Update dimension references
                    for coord in self.coords.values():
                        if coord.dims:
                            coord.dims = [new_name if d == old_name else d for d in coord.dims]
                    for var in self.variables.values():
                        if var.dims:
                            var.dims = [new_name if d == old_name else d for d in var.dims]

        if coord_renames:
            for old_name, new_name in coord_renames.items():
                if old_name != new_name and old_name in self.coords:
                    self.coords[new_name] = self.coords.pop(old_name)

        if var_renames:
            for old_name, new_name in var_renames.items():
                if old_name != new_name and old_name in self.variables:
                    self.variables[new_name] = self.variables.pop(old_name)

        return self

    @classmethod
    def open_mfdataset(cls, paths, concat_dim="time", combine="nested", **kwargs):
        """Open multiple files as a single DummyDataset with file tracking.

        This class method reads metadata from multiple NetCDF files and combines them
        into a single DummyDataset, tracking which files contribute to which
        coordinate ranges along the concatenation dimension.

        Parameters
        ----------
        paths : str or list of str
            Either a glob pattern (e.g., "data/*.nc") or a list of file paths
        concat_dim : str, optional
            The dimension along which to concatenate files (default: "time")
        combine : str, optional
            How to combine datasets. Currently supports "nested" (default)
        **kwargs : optional
            Additional keyword arguments (reserved for future use)

        Returns
        -------
        DummyDataset
            A DummyDataset with metadata from all files and file tracking enabled

        Examples
        --------
        >>> ds = DummyDataset.open_mfdataset("data/*.nc", concat_dim="time")
        >>> files = ds.get_source_files(time=slice(0, 10))
        >>> print(files)
        ['data/file1.nc', 'data/file2.nc']

        See Also
        --------
        enable_file_tracking : Enable file tracking on an existing dataset
        get_source_files : Query which files contain specific coordinate ranges
        """
        from .mfdataset import open_mfdataset

        return open_mfdataset(paths, concat_dim=concat_dim, combine=combine, **kwargs)

    def groupby_time(
        self,
        freq: str,
        dim: str = "time",
        normalize_units: bool = True,
    ) -> List["DummyDataset"]:
        """Group dataset by time frequency using metadata only.

        This method splits a multi-file dataset into time-based groups without loading
        any data arrays. Each group is a new DummyDataset with adjusted metadata.

        Parameters
        ----------
        freq : str
            Grouping frequency using pandas-style strings:
            - Years: '1Y', '5Y', '10Y'
            - Months: '1M', '3M', '6M'
            - Days: '1D', '7D', '30D'
            - Hours: '1H', '6H', '12H'
        dim : str, default "time"
            Time dimension to group by
        normalize_units : bool, default True
            Update time units to reference each group's start datetime

        Returns
        -------
        list of DummyDataset
            One dataset per time group, each with:
            - Updated time:units attribute (if normalize_units=True)
            - Filtered file sources for that time period
            - Adjusted dimension sizes
            - Preserved frequency attribute

        Raises
        ------
        ValueError
            If time coordinate has no frequency attribute (open with open_mfdataset)
        ValueError
            If time coordinate has no units attribute
        ValueError
            If dimension does not exist

        Examples
        --------
        >>> # Open 100 years of hourly data
        >>> ds = DummyDataset.open_mfdataset("hourly_*.nc", concat_dim="time")
        >>> print(ds.coords['time'].attrs['frequency'])
        '1H'
        >>> print(ds.dims['time'])
        876000

        >>> # Group into decades
        >>> decades = ds.groupby_time('10Y')
        >>> print(len(decades))
        10

        >>> # Each decade has normalized units
        >>> decade_0 = decades[0]
        >>> print(decade_0.coords['time'].attrs['units'])
        'hours since 2000-01-01 00:00:00'
        >>> print(decade_0.dims['time'])
        87600

        >>> # Query files for specific decade
        >>> files = decade_0.get_source_files()
        >>> print(files)
        ['hourly_2000.nc', 'hourly_2001.nc', ..., 'hourly_2009.nc']

        See Also
        --------
        open_mfdataset : Open multiple files with automatic frequency inference
        get_source_files : Query which files contain specific coordinate ranges
        """
        from .mfdataset import groupby_time_impl

        return groupby_time_impl(self, freq, dim, normalize_units)

    # STAC-related methods
    def to_stac_item(
        self,
        id: str,
        geometry: Optional[Dict[str, Any]] = None,
        properties: Optional[Dict[str, Any]] = None,
        assets: Optional[Dict[str, Any]] = None,
        collection_id: Optional[str] = None,
        **kwargs,
    ):
        """
        Convert this DummyDataset to a STAC Item.

        Parameters
        ----------
        id : str
            Unique identifier for the STAC Item
        geometry : dict, optional
            GeoJSON geometry dict (required if not in dataset.attrs)
        properties : dict, optional
            Additional properties for the STAC Item
        assets : dict, optional
            Dictionary of pystac.Asset objects
        collection_id : str, optional
            ID of the parent collection
        **kwargs
            Additional arguments passed to pystac.Item

        Returns
        -------
        pystac.Item
            The generated STAC Item

        Examples
        --------
        >>> ds = DummyDataset()
        >>> ds.add_dim("lat", 10)
        >>> ds.add_dim("lon", 20)
        >>> item = ds.to_stac_item(
        ...     id="climate-data-2020",
        ...     geometry={"type": "Polygon", "coordinates": [...]},
        ...     properties={"datetime": "2020-01-01T00:00:00Z"}
        ... )
        """
        from .stac import dataset_to_stac_item

        return dataset_to_stac_item(
            self,
            id=id,
            geometry=geometry,
            properties=properties,
            assets=assets,
            collection_id=collection_id,
            **kwargs,
        )

    def to_stac_collection(
        self,
        id: str,
        description: Optional[str] = None,
        license: Optional[str] = None,
        extent: Optional[Any] = None,
        **kwargs,
    ):
        """
        Convert this DummyDataset to a STAC Collection.

        Parameters
        ----------
        id : str
            Unique identifier for the STAC Collection
        description : str, optional
            Description of the collection
        license : str, optional
            License for the collection
        extent : pystac.Extent, optional
            Spatial and temporal extent
        **kwargs
            Additional arguments passed to pystac.Collection

        Returns
        -------
        pystac.Collection
            The generated STAC Collection

        Examples
        --------
        >>> ds = DummyDataset()
        >>> collection = ds.to_stac_collection(
        ...     id="climate-collection",
        ...     description="Climate model output collection",
        ...     license="MIT"
        ... )
        """
        from .stac import dataset_to_stac_collection

        return dataset_to_stac_collection(
            self, id=id, description=description, license=license, extent=extent, **kwargs
        )

    def add_spatial_extent(self, lat_bounds: tuple, lon_bounds: tuple):
        """
        Add spatial extent information to the dataset.

        Parameters
        ----------
        lat_bounds : tuple
            (min_lat, max_lat) latitude bounds
        lon_bounds : tuple
            (min_lon, max_lon) longitude bounds

        Examples
        --------
        >>> ds = DummyDataset()
        >>> ds.add_spatial_extent(lat_bounds=(-90, 90), lon_bounds=(-180, 180))
        """
        min_lat, max_lat = lat_bounds
        min_lon, max_lon = lon_bounds

        self.attrs["geospatial_bounds"] = {
            "type": "Polygon",
            "coordinates": [
                [
                    [min_lon, min_lat],
                    [max_lon, min_lat],
                    [max_lon, max_lat],
                    [min_lon, max_lat],
                    [min_lon, min_lat],
                ]
            ],
        }
        self.attrs["geospatial_lat_min"] = min_lat
        self.attrs["geospatial_lat_max"] = max_lat
        self.attrs["geospatial_lon_min"] = min_lon
        self.attrs["geospatial_lon_max"] = max_lon

    def infer_temporal_extent(self):
        """
        Infer temporal extent from time coordinate.

        Returns
        -------
        tuple
            (start_time, end_time) as datetime objects or None if not found

        Examples
        --------
        >>> ds = DummyDataset()
        >>> ds.add_coord("time", ["time"], attrs={"units": "days since 2000-01-01"})
        >>> start, end = ds.infer_temporal_extent()
        """
        if "time" in self.coords:
            time_coord = self.coords["time"]
            if hasattr(time_coord, "data") and time_coord.data is not None:
                try:
                    # Try to get time values
                    time_values = time_coord.data
                    if len(time_values) > 0:
                        # This is a simplified approach - in practice you'd need
                        # to handle different time encodings and units
                        from datetime import datetime, timedelta

                        import numpy as np

                        # Assume days since some epoch for now
                        if "units" in time_coord.attrs:
                            units = time_coord.attrs["units"]
                            if "since" in units:
                                epoch_str = units.split("since")[1].strip()
                                try:
                                    epoch = datetime.fromisoformat(epoch_str.replace("Z", "+00:00"))
                                    start_delta = timedelta(days=float(np.min(time_values)))
                                    end_delta = timedelta(days=float(np.max(time_values)))
                                    start_time = epoch + start_delta
                                    end_time = epoch + end_delta

                                    self.attrs["time_coverage_start"] = start_time.isoformat()
                                    self.attrs["time_coverage_end"] = end_time.isoformat()
                                    return start_time, end_time
                                except (ValueError, TypeError):
                                    pass
                except Exception:
                    pass
        return None, None

    def validate_spatial_metadata(self):
        """
        Validate spatial metadata in the dataset.

        Returns
        -------
        dict
            Validation results with any issues found

        Examples
        --------
        >>> ds = DummyDataset()
        >>> ds.add_spatial_extent(lat_bounds=(-90, 90), lon_bounds=(-180, 180))
        >>> validation = ds.validate_spatial_metadata()
        >>> print(validation['valid'])
        True
        """
        issues = []

        # Check geospatial_bounds
        if "geospatial_bounds" in self.attrs:
            bounds = self.attrs["geospatial_bounds"]
            if not isinstance(bounds, dict):
                issues.append("geospatial_bounds must be a dictionary")
            elif "type" not in bounds or "coordinates" not in bounds:
                issues.append("geospatial_bounds missing required 'type' or 'coordinates'")
            elif bounds["type"] != "Polygon":
                issues.append("geospatial_bounds type must be 'Polygon'")
        else:
            # Try to infer from coordinates
            lat_found = any(name in self.coords for name in ["lat", "latitude", "Latitude"])
            lon_found = any(name in self.coords for name in ["lon", "longitude", "Longitude"])
            if not (lat_found and lon_found):
                issues.append(
                    "No spatial information found - need geospatial_bounds or lat/lon coordinates"
                )

        return {"valid": len(issues) == 0, "issues": issues}

    @classmethod
    def from_stac_item(cls, item):
        """
        Create a DummyDataset from a STAC Item.

        Parameters
        ----------
        item : pystac.Item
            The STAC Item to convert

        Returns
        -------
        DummyDataset
            The generated DummyDataset

        Examples
        --------
        >>> import pystac
        >>> item = pystac.Item.from_file("data.json")
        >>> ds = DummyDataset.from_stac_item(item)
        """
        from .stac import stac_item_to_dataset

        return stac_item_to_dataset(item)

    @classmethod
    def from_stac_collection(cls, collection, item_id=None):
        """
        Create a DummyDataset from a STAC Collection.

        Parameters
        ----------
        collection : pystac.Collection
            The STAC Collection to convert
        item_id : str, optional
            Specific item ID to extract from collection

        Returns
        -------
        DummyDataset or list of DummyDataset
            The generated DummyDataset(s)

        Examples
        --------
        >>> import pystac
        >>> collection = pystac.Collection.from_file("collection.json")
        >>> ds = DummyDataset.from_stac_collection(collection, item_id="climate-data-2020")
        """
        from .stac import stac_collection_to_dataset

        return stac_collection_to_dataset(collection, item_id)

    @classmethod
    def create_stac_collection(cls, datasets, collection_id, description=None, license=None):
        """
        Create a STAC Collection from multiple datasets.

        Parameters
        ----------
        datasets : list of DummyDataset
            List of datasets to include in the collection
        collection_id : str
            Unique identifier for the STAC Collection
        description : str, optional
            Description of the collection
        license : str, optional
            License for the collection

        Returns
        -------
        pystac.Collection
            The generated STAC Collection

        Examples
        --------
        >>> temperature_ds = DummyDataset()
        >>> precipitation_ds = DummyDataset()
        >>> wind_ds = DummyDataset()
        >>> collection = DummyDataset.create_stac_collection(
        ...     [temperature_ds, precipitation_ds, wind_ds],
        ...     collection_id="climate-2020"
        ... )
        """
        from .stac import create_stac_collection_from_datasets

        return create_stac_collection_from_datasets(
            datasets, collection_id, description=description, license=license
        )

Functions

__init__

__init__(_record_history=True)

Initialize an empty DummyDataset.

Parameters:

_record_history : bool, optional
    Whether to record operation history (default: True)
Source code in src/dummyxarray/core.py
def __init__(self, _record_history=True):
    """
    Initialize an empty DummyDataset.

    Parameters
    ----------
    _record_history : bool, optional
        Whether to record operation history (default: True)
    """
    self.dims = {}  # dim_name → size
    self.coords = {}  # coord_name → DummyArray
    self.variables = {}  # var_name  → DummyArray
    self.attrs = {}  # global attributes

    # Operation history tracking
    self._history = [] if _record_history else None
    if _record_history:
        self._record_operation("__init__", {})

__repr__

__repr__()

Return a string representation similar to xarray.Dataset.

Source code in src/dummyxarray/core.py
def __repr__(self):
    """Return a string representation similar to xarray.Dataset."""
    lines = ["<dummyxarray.DummyDataset>"]

    # Dimensions
    if self.dims:
        lines.append("Dimensions:")
        dim_strs = [f"  {name}: {size}" for name, size in self.dims.items()]
        lines.extend(dim_strs)
    else:
        lines.append("Dimensions: ()")

    # Coordinates
    if self.coords:
        lines.append("Coordinates:")
        for name, arr in self.coords.items():
            dims_str = f"({', '.join(arr.dims)})" if arr.dims else "()"
            has_data = "✓" if arr.data is not None else "✗"
            dtype_str = f"{arr.data.dtype}" if arr.data is not None else "?"
            lines.append(f"  {has_data} {name:20s} {dims_str:20s} {dtype_str}")

    # Data variables
    if self.variables:
        lines.append("Data variables:")
        for name, arr in self.variables.items():
            dims_str = f"({', '.join(arr.dims)})" if arr.dims else "()"
            has_data = "✓" if arr.data is not None else "✗"
            dtype_str = f"{arr.data.dtype}" if arr.data is not None else "?"
            lines.append(f"  {has_data} {name:20s} {dims_str:20s} {dtype_str}")

    # Global attributes
    if self.attrs:
        lines.append("Attributes:")
        for key, value in self.attrs.items():
            value_str = str(value)
            if len(value_str) > 50:
                value_str = value_str[:47] + "..."
            lines.append(f"    {key}: {value_str}")

    return "\n".join(lines)

__getattr__

__getattr__(name)

Allow attribute-style access to coordinates and variables.

This enables xarray-style access like ds.time instead of ds.coords['time']. Coordinates take precedence over variables if both exist with the same name.

Parameters:

name : str
    Name of the coordinate or variable to access (required)

Returns:

DummyArray
    The coordinate or variable array

Raises:

AttributeError
    If the name is not found in coords or variables
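
A short illustrative sketch (not part of the original docstring), following directly from the lookup order in the source below:

>>> ds = DummyDataset()
>>> ds.add_dim("time", 10)
>>> ds.add_coord("time", dims=["time"])
>>> ds.add_variable("temperature", dims=["time"])
>>> ds.time is ds.coords["time"]
True
>>> ds.temperature is ds.variables["temperature"]
True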

Source code in src/dummyxarray/core.py
def __getattr__(self, name):
    """
    Allow attribute-style access to coordinates and variables.

    This enables xarray-style access like `ds.time` instead of `ds.coords['time']`.
    Coordinates take precedence over variables if both exist with the same name.

    Parameters
    ----------
    name : str
        Name of the coordinate or variable to access

    Returns
    -------
    DummyArray
        The coordinate or variable array

    Raises
    ------
    AttributeError
        If the name is not found in coords or variables
    """
    # Check coordinates first (like xarray does)
    if name in self.coords:
        return self.coords[name]
    # Then check variables
    if name in self.variables:
        return self.variables[name]
    # If not found, raise AttributeError
    raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")

__setattr__

__setattr__(name, value)

Handle attribute assignment.

Special handling for internal attributes (dims, coords, variables, attrs). For other names, this could be extended to allow setting coords/variables.
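
An illustrative sketch (not part of the original docstring) of the guard this implements; the error message is taken from the source below:

>>> ds = DummyDataset()
>>> ds.time = [0, 1, 2]
Traceback (most recent call last):
    ...
AttributeError: Cannot set attribute 'time' directly. Use ds.coords['time'] or ds.variables['time'] instead.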

Source code in src/dummyxarray/core.py
def __setattr__(self, name, value):
    """
    Handle attribute assignment.

    Special handling for internal attributes (dims, coords, variables, attrs).
    For other names, this could be extended to allow setting coords/variables.
    """
    # Internal attributes that should be set normally
    # Allow private attributes (starting with _) for mixins
    if name in ("dims", "coords", "variables", "attrs", "_history") or name.startswith("_"):
        object.__setattr__(self, name, value)
    else:
        # For now, raise an error to avoid confusion
        # Could be extended to allow ds.time = DummyArray(...) in the future
        raise AttributeError(
            f"Cannot set attribute '{name}' directly. "
            f"Use ds.coords['{name}'] or ds.variables['{name}'] instead."
        )

__dir__

__dir__()

Customize dir() output to include coordinates and variables.

This makes tab-completion work in IPython/Jupyter.
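
An illustrative sketch (not part of the original docstring): coordinate and variable names show up as completion candidates:

>>> ds = DummyDataset()
>>> ds.add_dim("time", 10)
>>> ds.add_coord("time", dims=["time"])
>>> ds.add_variable("temperature", dims=["time"])
>>> "temperature" in dir(ds)
True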

Source code in src/dummyxarray/core.py
def __dir__(self):
    """
    Customize dir() output to include coordinates and variables.

    This makes tab-completion work in IPython/Jupyter.
    """
    # Get default attributes
    default_attrs = set(object.__dir__(self))
    # Add coordinate and variable names
    return sorted(default_attrs | set(self.coords.keys()) | set(self.variables.keys()))

set_global_attrs

set_global_attrs(**kwargs)

Set or update global dataset attributes.

Parameters:

**kwargs
    Attributes to set

Examples:

>>> ds = DummyDataset()
>>> ds.set_global_attrs(title="My Dataset", institution="DKRZ")
Source code in src/dummyxarray/core.py
def set_global_attrs(self, **kwargs):
    """
    Set or update global dataset attributes.

    Parameters
    ----------
    **kwargs
        Attributes to set

    Examples
    --------
    >>> ds = DummyDataset()
    >>> ds.set_global_attrs(title="My Dataset", institution="DKRZ")
    """
    self.attrs.update(kwargs)

assign_attrs

assign_attrs(**kwargs)

Assign new global attributes to this dataset (xarray-compatible API).

Parameters:

**kwargs
    Attributes to assign

Returns:

self
    Returns self for method chaining

Examples:

>>> ds = DummyDataset()
>>> ds.assign_attrs(title="My Dataset", institution="DKRZ")
Source code in src/dummyxarray/core.py
def assign_attrs(self, **kwargs):
    """
    Assign new global attributes to this dataset (xarray-compatible API).

    Parameters
    ----------
    **kwargs
        Attributes to assign

    Returns
    -------
    self
        Returns self for method chaining

    Examples
    --------
    >>> ds = DummyDataset()
    >>> ds.assign_attrs(title="My Dataset", institution="DKRZ")
    """
    # Capture provenance
    provenance = {"modified": {}}
    for key, value in kwargs.items():
        old_value = self.attrs.get(key)
        provenance["modified"][key] = {"before": old_value, "after": value}

    self._record_operation("assign_attrs", kwargs, provenance)
    self.attrs.update(kwargs)
    return self

add_dim

add_dim(name, size)

Add a dimension with a specific size.

Parameters:

name : str
    Dimension name (required)
size : int
    Dimension size (required)

Examples:

>>> ds = DummyDataset()
>>> ds.add_dim("time", 10)
>>> ds.add_dim("lat", 64)
Source code in src/dummyxarray/core.py
def add_dim(self, name, size):
    """
    Add a dimension with a specific size.

    Parameters
    ----------
    name : str
        Dimension name
    size : int
        Dimension size

    Examples
    --------
    >>> ds = DummyDataset()
    >>> ds.add_dim("time", 10)
    >>> ds.add_dim("lat", 64)
    """
    # Capture provenance
    if name in self.dims:
        provenance = {"modified": {name: {"before": self.dims[name], "after": size}}}
    else:
        provenance = {"added": [name]}

    self._record_operation("add_dim", {"name": name, "size": size}, provenance)
    self.dims[name] = size

add_coord

add_coord(
    name, dims=None, attrs=None, data=None, encoding=None
)

Add a coordinate variable.

Parameters:

name : str
    Coordinate name (required)
dims : list of str, optional
    Dimension names (default: None)
attrs : dict, optional
    Metadata attributes (default: None)
data : array-like, optional
    Coordinate data (default: None)
encoding : dict, optional
    Encoding parameters (default: None)
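
An illustrative example (not part of the original docstring), mirroring the usage shown for other methods on this page:

>>> ds = DummyDataset()
>>> ds.add_dim("time", 10)
>>> ds.add_coord("time", dims=["time"], attrs={"units": "days since 2000-01-01"})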
Source code in src/dummyxarray/core.py
def add_coord(self, name, dims=None, attrs=None, data=None, encoding=None):
    """
    Add a coordinate variable.

    Parameters
    ----------
    name : str
        Coordinate name
    dims : list of str, optional
        Dimension names
    attrs : dict, optional
        Metadata attributes
    data : array-like, optional
        Coordinate data
    encoding : dict, optional
        Encoding parameters
    """
    # Record operation (don't store actual data)
    args = {"name": name}
    if dims is not None:
        args["dims"] = dims
    if attrs:
        args["attrs"] = attrs
    if data is not None:
        args["data"] = "<data>"
    if encoding:
        args["encoding"] = encoding

    # Capture provenance
    provenance = {}
    if name in self.coords:
        # Coordinate already exists - track what changed
        old_coord = self.coords[name]
        changes = {}
        if dims != old_coord.dims:
            changes["dims"] = {"before": old_coord.dims, "after": dims}
        if attrs and attrs != old_coord.attrs:
            changes["attrs"] = {"before": old_coord.attrs.copy(), "after": attrs}
        if changes:
            provenance["modified"] = {name: changes}
    else:
        provenance["added"] = [name]

    self._record_operation("add_coord", args, provenance)

    arr = DummyArray(dims, attrs, data, encoding, _record_history=False)
    self._infer_and_register_dims(arr)
    self.coords[name] = arr

add_variable

add_variable(
    name, dims=None, attrs=None, data=None, encoding=None
)

Add a data variable.

Parameters:

name : str
    Variable name (required)
dims : list of str, optional
    Dimension names (default: None)
attrs : dict, optional
    Metadata attributes (default: None)
data : array-like, optional
    Variable data (default: None)
encoding : dict, optional
    Encoding parameters (default: None)
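
An illustrative example (not part of the original docstring); the attribute values are placeholders:

>>> ds = DummyDataset()
>>> ds.add_dim("time", 10)
>>> ds.add_dim("lat", 64)
>>> ds.add_variable("temperature", dims=["time", "lat"], attrs={"units": "K"})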
Source code in src/dummyxarray/core.py
def add_variable(self, name, dims=None, attrs=None, data=None, encoding=None):
    """
    Add a data variable.

    Parameters
    ----------
    name : str
        Variable name
    dims : list of str, optional
        Dimension names
    attrs : dict, optional
        Metadata attributes
    data : array-like, optional
        Variable data
    encoding : dict, optional
        Encoding parameters
    """
    # Record operation (don't store actual data)
    args = {"name": name}
    if dims is not None:
        args["dims"] = dims
    if attrs:
        args["attrs"] = attrs
    if data is not None:
        args["data"] = "<data>"
    if encoding:
        args["encoding"] = encoding

    # Capture provenance
    provenance = {}
    if name in self.variables:
        # Variable already exists - track what changed
        old_var = self.variables[name]
        changes = {}
        if dims != old_var.dims:
            changes["dims"] = {"before": old_var.dims, "after": dims}
        if attrs and attrs != old_var.attrs:
            changes["attrs"] = {"before": old_var.attrs.copy(), "after": attrs}
        if changes:
            provenance["modified"] = {name: changes}
    else:
        provenance["added"] = [name]

    self._record_operation("add_variable", args, provenance)

    arr = DummyArray(dims, attrs, data, encoding, _record_history=False)
    self._infer_and_register_dims(arr)
    self.variables[name] = arr

rename_dims

rename_dims(dims_dict=None, **dims)

Rename dimensions (xarray-compatible API).

Parameters:

dims_dict : dict-like, optional
    Dictionary whose keys are current dimension names and whose values are the desired names (default: None)
**dims : optional
    Keyword form of dims_dict. One of dims_dict or dims must be provided.

Returns:

DummyDataset
    Returns self for method chaining

Raises:

KeyError
    If a dimension doesn't exist
ValueError
    If a new name already exists

Examples:

>>> ds = DummyDataset()
>>> ds.add_dim("time", 10)
>>> ds.add_dim("lat", 64)
>>> ds.rename_dims({"time": "t", "lat": "latitude"})
>>> # Or using keyword arguments:
>>> ds.rename_dims(time="t", lat="latitude")
Source code in src/dummyxarray/core.py
def rename_dims(self, dims_dict=None, **dims):
    """
    Rename dimensions (xarray-compatible API).

    Parameters
    ----------
    dims_dict : dict-like, optional
        Dictionary whose keys are current dimension names and whose
        values are the desired names.
    **dims : optional
        Keyword form of dims_dict.
        One of dims_dict or dims must be provided.

    Returns
    -------
    DummyDataset
        Returns self for method chaining

    Raises
    ------
    KeyError
        If a dimension doesn't exist
    ValueError
        If a new name already exists

    Examples
    --------
    >>> ds = DummyDataset()
    >>> ds.add_dim("time", 10)
    >>> ds.add_dim("lat", 64)
    >>> ds.rename_dims({"time": "t", "lat": "latitude"})
    >>> # Or using keyword arguments:
    >>> ds.rename_dims(time="t", lat="latitude")
    """
    # Merge dims_dict and **dims
    name_dict = {}
    if dims_dict is not None:
        name_dict.update(dims_dict)
    name_dict.update(dims)

    if not name_dict:
        raise ValueError("Either dims_dict or keyword arguments must be provided")

    # Validate all renames first
    for old_name, new_name in name_dict.items():
        if old_name not in self.dims:
            raise KeyError(f"Dimension '{old_name}' does not exist")
        if new_name in self.dims and new_name != old_name:
            raise ValueError(f"Dimension '{new_name}' already exists")

    # Capture provenance
    provenance = {
        "renamed": name_dict.copy(),
        "removed": list(name_dict.keys()),
        "added": list(name_dict.values()),
    }

    self._record_operation("rename_dims", {"dims_dict": name_dict}, provenance)

    # Perform all renames
    for old_name, new_name in name_dict.items():
        if old_name != new_name:
            self.dims[new_name] = self.dims.pop(old_name)

            # Update dimension references in coords and variables
            for coord in self.coords.values():
                if coord.dims:
                    coord.dims = [new_name if d == old_name else d for d in coord.dims]
            for var in self.variables.values():
                if var.dims:
                    var.dims = [new_name if d == old_name else d for d in var.dims]

    return self

rename_vars

rename_vars(name_dict=None, **names)

Rename variables (xarray-compatible API).

Parameters:

name_dict : dict-like, optional
    Dictionary whose keys are current variable names and whose values are the desired names (default: None)
**names : optional
    Keyword form of name_dict. One of name_dict or names must be provided.

Returns:

DummyDataset
    Returns self for method chaining

Raises:

KeyError
    If a variable doesn't exist
ValueError
    If a new name already exists

Examples:

>>> ds = DummyDataset()
>>> ds.add_dim("time", 10)
>>> ds.add_variable("temperature", dims=["time"])
>>> ds.rename_vars({"temperature": "temp"})
>>> # Or using keyword arguments:
>>> ds.rename_vars(temperature="temp")
Source code in src/dummyxarray/core.py
def rename_vars(self, name_dict=None, **names):
    """
    Rename variables (xarray-compatible API).

    Parameters
    ----------
    name_dict : dict-like, optional
        Dictionary whose keys are current variable names and whose
        values are the desired names.
    **names : optional
        Keyword form of name_dict.
        One of name_dict or names must be provided.

    Returns
    -------
    DummyDataset
        Returns self for method chaining

    Raises
    ------
    KeyError
        If a variable doesn't exist
    ValueError
        If a new name already exists

    Examples
    --------
    >>> ds = DummyDataset()
    >>> ds.add_dim("time", 10)
    >>> ds.add_variable("temperature", dims=["time"])
    >>> ds.rename_vars({"temperature": "temp"})
    >>> # Or using keyword arguments:
    >>> ds.rename_vars(temperature="temp")
    """
    # Merge name_dict and **names
    rename_dict = {}
    if name_dict is not None:
        rename_dict.update(name_dict)
    rename_dict.update(names)

    if not rename_dict:
        raise ValueError("Either name_dict or keyword arguments must be provided")

    # Validate all renames first
    for old_name, new_name in rename_dict.items():
        if old_name not in self.variables:
            raise KeyError(f"Variable '{old_name}' does not exist")
        if new_name in self.variables and new_name != old_name:
            raise ValueError(f"Variable '{new_name}' already exists")

    # Capture provenance
    provenance = {
        "renamed": rename_dict.copy(),
        "removed": list(rename_dict.keys()),
        "added": list(rename_dict.values()),
    }

    self._record_operation("rename_vars", {"name_dict": rename_dict}, provenance)

    # Perform all renames
    for old_name, new_name in rename_dict.items():
        if old_name != new_name:
            self.variables[new_name] = self.variables.pop(old_name)

    return self
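
Because `rename_vars` returns `self`, it composes with the other builder methods documented on this page. The sketch below is illustrative only; it assumes `rename_dims` (shown earlier) accepts a mapping in the same way:

>>> ds = DummyDataset()
>>> ds.add_dim("time", 10)
>>> ds.add_variable("temperature", dims=["time"])
>>> # Chained calls: each rename returns the same dataset
>>> ds.rename_vars(temperature="temp").rename_dims({"time": "t"})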

rename

rename(name_dict=None, **names)

Rename variables, coordinates, and dimensions (xarray-compatible API).

This method can rename any combination of variables, coordinates, and dimensions.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `name_dict` | dict-like | Dictionary whose keys are current names (variables, coordinates, or dimensions) and whose values are the desired names. | `None` |
| `**names` | optional | Keyword form of `name_dict`. One of `name_dict` or `names` must be provided. | `{}` |

Returns:

| Type | Description |
| --- | --- |
| `DummyDataset` | Returns self for method chaining |

Raises:

| Type | Description |
| --- | --- |
| `KeyError` | If a name doesn't exist |
| `ValueError` | If a new name already exists |

Examples:

>>> ds = DummyDataset()
>>> ds.add_dim("time", 10)
>>> ds.add_coord("time", dims=["time"])
>>> ds.add_variable("temperature", dims=["time"])
>>> # Rename multiple items at once
>>> ds.rename({"time": "t", "temperature": "temp"})
>>> # Or using keyword arguments:
>>> ds.rename(time="t", temperature="temp")
Source code in src/dummyxarray/core.py
def rename(self, name_dict=None, **names):
    """
    Rename variables, coordinates, and dimensions (xarray-compatible API).

    This method can rename any combination of variables, coordinates, and dimensions.

    Parameters
    ----------
    name_dict : dict-like, optional
        Dictionary whose keys are current names (variables, coordinates, or dimensions)
        and whose values are the desired names.
    **names : optional
        Keyword form of name_dict.
        One of name_dict or names must be provided.

    Returns
    -------
    DummyDataset
        Returns self for method chaining

    Raises
    ------
    KeyError
        If a name doesn't exist
    ValueError
        If a new name already exists

    Examples
    --------
    >>> ds = DummyDataset()
    >>> ds.add_dim("time", 10)
    >>> ds.add_coord("time", dims=["time"])
    >>> ds.add_variable("temperature", dims=["time"])
    >>> # Rename multiple items at once
    >>> ds.rename({"time": "t", "temperature": "temp"})
    >>> # Or using keyword arguments:
    >>> ds.rename(time="t", temperature="temp")
    """
    # Merge name_dict and **names
    rename_dict = {}
    if name_dict is not None:
        rename_dict.update(name_dict)
    rename_dict.update(names)

    if not rename_dict:
        raise ValueError("Either name_dict or keyword arguments must be provided")

    # Categorize renames
    dim_renames = {}
    coord_renames = {}
    var_renames = {}

    for old_name, new_name in rename_dict.items():
        if old_name in self.dims:
            dim_renames[old_name] = new_name
        if old_name in self.coords:
            coord_renames[old_name] = new_name
        if old_name in self.variables:
            var_renames[old_name] = new_name

        # Check if name exists anywhere
        if (
            old_name not in self.dims
            and old_name not in self.coords
            and old_name not in self.variables
        ):
            raise KeyError(
                f"'{old_name}' does not exist in dimensions, coordinates, or variables"
            )

    # Capture provenance
    provenance = {
        "renamed": rename_dict.copy(),
        "removed": list(rename_dict.keys()),
        "added": list(rename_dict.values()),
    }

    self._record_operation("rename", {"name_dict": rename_dict}, provenance)

    # Perform renames in order: dimensions first (affects coords/vars), then coords, then vars
    if dim_renames:
        for old_name, new_name in dim_renames.items():
            if old_name != new_name and old_name in self.dims:
                self.dims[new_name] = self.dims.pop(old_name)
                # Update dimension references
                for coord in self.coords.values():
                    if coord.dims:
                        coord.dims = [new_name if d == old_name else d for d in coord.dims]
                for var in self.variables.values():
                    if var.dims:
                        var.dims = [new_name if d == old_name else d for d in var.dims]

    if coord_renames:
        for old_name, new_name in coord_renames.items():
            if old_name != new_name and old_name in self.coords:
                self.coords[new_name] = self.coords.pop(old_name)

    if var_renames:
        for old_name, new_name in var_renames.items():
            if old_name != new_name and old_name in self.variables:
                self.variables[new_name] = self.variables.pop(old_name)

    return self
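
As described above, `rename` applies dimension renames first so that the `dims` references inside coordinates and variables stay consistent. The short sketch below illustrates that behaviour; it assumes the variable's `dims` are stored as a list, as in the source above:

>>> ds = DummyDataset()
>>> ds.add_dim("time", 10)
>>> ds.add_coord("time", dims=["time"])
>>> ds.add_variable("temperature", dims=["time"])
>>> ds.rename(time="t", temperature="temp")
>>> # The dimension, the coordinate, and every dims reference now point at "t"
>>> ds.variables["temp"].dims
['t']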

open_mfdataset classmethod

open_mfdataset(
    paths, concat_dim="time", combine="nested", **kwargs
)

Open multiple files as a single DummyDataset with file tracking.

This class method reads metadata from multiple NetCDF files and combines them into a single DummyDataset, tracking which files contribute to which coordinate ranges along the concatenation dimension.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `paths` | str or list of str | Either a glob pattern (e.g., `"data/*.nc"`) or a list of file paths | required |
| `concat_dim` | str | The dimension along which to concatenate files (default: "time") | `'time'` |
| `combine` | str | How to combine datasets. Currently supports "nested" (default) | `'nested'` |
| `**kwargs` | optional | Additional keyword arguments (reserved for future use) | `{}` |

Returns:

| Type | Description |
| --- | --- |
| `DummyDataset` | A DummyDataset with metadata from all files and file tracking enabled |

Examples:

>>> ds = DummyDataset.open_mfdataset("data/*.nc", concat_dim="time")
>>> files = ds.get_source_files(time=slice(0, 10))
>>> print(files)
['data/file1.nc', 'data/file2.nc']
See Also

enable_file_tracking : Enable file tracking on an existing dataset
get_source_files : Query which files contain specific coordinate ranges

Source code in src/dummyxarray/core.py
@classmethod
def open_mfdataset(cls, paths, concat_dim="time", combine="nested", **kwargs):
    """Open multiple files as a single DummyDataset with file tracking.

    This class method reads metadata from multiple NetCDF files and combines them
    into a single DummyDataset, tracking which files contribute to which
    coordinate ranges along the concatenation dimension.

    Parameters
    ----------
    paths : str or list of str
        Either a glob pattern (e.g., "data/*.nc") or a list of file paths
    concat_dim : str, optional
        The dimension along which to concatenate files (default: "time")
    combine : str, optional
        How to combine datasets. Currently supports "nested" (default)
    **kwargs : optional
        Additional keyword arguments (reserved for future use)

    Returns
    -------
    DummyDataset
        A DummyDataset with metadata from all files and file tracking enabled

    Examples
    --------
    >>> ds = DummyDataset.open_mfdataset("data/*.nc", concat_dim="time")
    >>> files = ds.get_source_files(time=slice(0, 10))
    >>> print(files)
    ['data/file1.nc', 'data/file2.nc']

    See Also
    --------
    enable_file_tracking : Enable file tracking on an existing dataset
    get_source_files : Query which files contain specific coordinate ranges
    """
    from .mfdataset import open_mfdataset

    return open_mfdataset(paths, concat_dim=concat_dim, combine=combine, **kwargs)
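
`paths` also accepts an explicit list of files, which is convenient when the inputs are not covered by a single glob pattern. A minimal sketch (the file names are illustrative, not real data):

>>> ds = DummyDataset.open_mfdataset(
...     ["data/jan.nc", "data/feb.nc"], concat_dim="time"
... )
>>> # Ask which of the two files cover the first ten time steps
>>> files = ds.get_source_files(time=slice(0, 10))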

groupby_time

groupby_time(
    freq: str,
    dim: str = "time",
    normalize_units: bool = True,
) -> List[DummyDataset]

Group dataset by time frequency using metadata only.

This method splits a multi-file dataset into time-based groups without loading any data arrays. Each group is a new DummyDataset with adjusted metadata.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `freq` | str | Grouping frequency using pandas-style strings: years ('1Y', '5Y', '10Y'), months ('1M', '3M', '6M'), days ('1D', '7D', '30D'), or hours ('1H', '6H', '12H') | required |
| `dim` | str | Time dimension to group by | `"time"` |
| `normalize_units` | bool | Update time units to reference each group's start datetime | `True` |

Returns:

| Type | Description |
| --- | --- |
| list of `DummyDataset` | One dataset per time group, each with an updated time:units attribute (if normalize_units=True), filtered file sources for that time period, adjusted dimension sizes, and a preserved frequency attribute |

Raises:

| Type | Description |
| --- | --- |
| `ValueError` | If the time coordinate has no frequency attribute (open with open_mfdataset) |
| `ValueError` | If the time coordinate has no units attribute |
| `ValueError` | If the dimension does not exist |

Examples:

>>> # Open 100 years of hourly data
>>> ds = DummyDataset.open_mfdataset("hourly_*.nc", concat_dim="time")
>>> print(ds.coords['time'].attrs['frequency'])
'1H'
>>> print(ds.dims['time'])
876000
>>> # Group into decades
>>> decades = ds.groupby_time('10Y')
>>> print(len(decades))
10
>>> # Each decade has normalized units
>>> decade_0 = decades[0]
>>> print(decade_0.coords['time'].attrs['units'])
'hours since 2000-01-01 00:00:00'
>>> print(decade_0.dims['time'])
87600
>>> # Query files for specific decade
>>> files = decade_0.get_source_files()
>>> print(files)
['hourly_2000.nc', 'hourly_2001.nc', ..., 'hourly_2009.nc']
See Also

open_mfdataset : Open multiple files with automatic frequency inference
get_source_files : Query which files contain specific coordinate ranges

Source code in src/dummyxarray/core.py
def groupby_time(
    self,
    freq: str,
    dim: str = "time",
    normalize_units: bool = True,
) -> List["DummyDataset"]:
    """Group dataset by time frequency using metadata only.

    This method splits a multi-file dataset into time-based groups without loading
    any data arrays. Each group is a new DummyDataset with adjusted metadata.

    Parameters
    ----------
    freq : str
        Grouping frequency using pandas-style strings:
        - Years: '1Y', '5Y', '10Y'
        - Months: '1M', '3M', '6M'
        - Days: '1D', '7D', '30D'
        - Hours: '1H', '6H', '12H'
    dim : str, default "time"
        Time dimension to group by
    normalize_units : bool, default True
        Update time units to reference each group's start datetime

    Returns
    -------
    list of DummyDataset
        One dataset per time group, each with:
        - Updated time:units attribute (if normalize_units=True)
        - Filtered file sources for that time period
        - Adjusted dimension sizes
        - Preserved frequency attribute

    Raises
    ------
    ValueError
        If time coordinate has no frequency attribute (open with open_mfdataset)
    ValueError
        If time coordinate has no units attribute
    ValueError
        If dimension does not exist

    Examples
    --------
    >>> # Open 100 years of hourly data
    >>> ds = DummyDataset.open_mfdataset("hourly_*.nc", concat_dim="time")
    >>> print(ds.coords['time'].attrs['frequency'])
    '1H'
    >>> print(ds.dims['time'])
    876000

    >>> # Group into decades
    >>> decades = ds.groupby_time('10Y')
    >>> print(len(decades))
    10

    >>> # Each decade has normalized units
    >>> decade_0 = decades[0]
    >>> print(decade_0.coords['time'].attrs['units'])
    'hours since 2000-01-01 00:00:00'
    >>> print(decade_0.dims['time'])
    87600

    >>> # Query files for specific decade
    >>> files = decade_0.get_source_files()
    >>> print(files)
    ['hourly_2000.nc', 'hourly_2001.nc', ..., 'hourly_2009.nc']

    See Also
    --------
    open_mfdataset : Open multiple files with automatic frequency inference
    get_source_files : Query which files contain specific coordinate ranges
    """
    from .mfdataset import groupby_time_impl

    return groupby_time_impl(self, freq, dim, normalize_units)
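
Because grouping operates on metadata only, it is cheap to iterate over the resulting datasets and query each one, for example to plan per-period processing. A minimal sketch (file names and sizes depend on the inputs):

>>> ds = DummyDataset.open_mfdataset("hourly_*.nc", concat_dim="time")
>>> for group in ds.groupby_time("1Y"):
...     # each group is a DummyDataset with an adjusted time length and its own sources
...     print(group.dims["time"], group.get_source_files())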

to_stac_item

to_stac_item(
    id: str,
    geometry: Optional[Dict[str, Any]] = None,
    properties: Optional[Dict[str, Any]] = None,
    assets: Optional[Dict[str, Any]] = None,
    collection_id: Optional[str] = None,
    **kwargs
)

Convert this DummyDataset to a STAC Item.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `id` | str | Unique identifier for the STAC Item | required |
| `geometry` | dict | GeoJSON geometry dict (required if not in dataset.attrs) | `None` |
| `properties` | dict | Additional properties for the STAC Item | `None` |
| `assets` | dict | Dictionary of pystac.Asset objects | `None` |
| `collection_id` | str | ID of the parent collection | `None` |
| `**kwargs` |  | Additional arguments passed to pystac.Item | `{}` |

Returns:

| Type | Description |
| --- | --- |
| `Item` | The generated STAC Item |

Examples:

>>> ds = DummyDataset()
>>> ds.add_dim("lat", 10)
>>> ds.add_dim("lon", 20)
>>> item = ds.to_stac_item(
...     id="climate-data-2020",
...     geometry={"type": "Polygon", "coordinates": [...]},
...     properties={"datetime": "2020-01-01T00:00:00Z"}
... )
Source code in src/dummyxarray/core.py
def to_stac_item(
    self,
    id: str,
    geometry: Optional[Dict[str, Any]] = None,
    properties: Optional[Dict[str, Any]] = None,
    assets: Optional[Dict[str, Any]] = None,
    collection_id: Optional[str] = None,
    **kwargs,
):
    """
    Convert this DummyDataset to a STAC Item.

    Parameters
    ----------
    id : str
        Unique identifier for the STAC Item
    geometry : dict, optional
        GeoJSON geometry dict (required if not in dataset.attrs)
    properties : dict, optional
        Additional properties for the STAC Item
    assets : dict, optional
        Dictionary of pystac.Asset objects
    collection_id : str, optional
        ID of the parent collection
    **kwargs
        Additional arguments passed to pystac.Item

    Returns
    -------
    pystac.Item
        The generated STAC Item

    Examples
    --------
    >>> ds = DummyDataset()
    >>> ds.add_dim("lat", 10)
    >>> ds.add_dim("lon", 20)
    >>> item = ds.to_stac_item(
    ...     id="climate-data-2020",
    ...     geometry={"type": "Polygon", "coordinates": [...]},
    ...     properties={"datetime": "2020-01-01T00:00:00Z"}
    ... )
    """
    from .stac import dataset_to_stac_item

    return dataset_to_stac_item(
        self,
        id=id,
        geometry=geometry,
        properties=properties,
        assets=assets,
        collection_id=collection_id,
        **kwargs,
    )
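
To attach downloadable assets to the Item, pass pystac.Asset objects via `assets`, as the parameter table above describes. A sketch assuming pystac is installed; the asset key "data" and the media type string are illustrative choices, not requirements of this API:

>>> import pystac
>>> ds = DummyDataset()
>>> ds.add_dim("lat", 10)
>>> ds.add_dim("lon", 20)
>>> item = ds.to_stac_item(
...     id="climate-data-2020",
...     geometry={"type": "Point", "coordinates": [0.0, 0.0]},
...     properties={"datetime": "2020-01-01T00:00:00Z"},
...     assets={"data": pystac.Asset(href="data.nc", media_type="application/x-netcdf")},
... )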

to_stac_collection

to_stac_collection(
    id: str,
    description: Optional[str] = None,
    license: Optional[str] = None,
    extent: Optional[Any] = None,
    **kwargs
)

Convert this DummyDataset to a STAC Collection.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `id` | str | Unique identifier for the STAC Collection | required |
| `description` | str | Description of the collection | `None` |
| `license` | str | License for the collection | `None` |
| `extent` | Extent | Spatial and temporal extent | `None` |
| `**kwargs` |  | Additional arguments passed to pystac.Collection | `{}` |

Returns:

| Type | Description |
| --- | --- |
| `Collection` | The generated STAC Collection |

Examples:

>>> ds = DummyDataset()
>>> collection = ds.to_stac_collection(
...     id="climate-collection",
...     description="Climate model output collection",
...     license="MIT"
... )
Source code in src/dummyxarray/core.py
def to_stac_collection(
    self,
    id: str,
    description: Optional[str] = None,
    license: Optional[str] = None,
    extent: Optional[Any] = None,
    **kwargs,
):
    """
    Convert this DummyDataset to a STAC Collection.

    Parameters
    ----------
    id : str
        Unique identifier for the STAC Collection
    description : str, optional
        Description of the collection
    license : str, optional
        License for the collection
    extent : pystac.Extent, optional
        Spatial and temporal extent
    **kwargs
        Additional arguments passed to pystac.Collection

    Returns
    -------
    pystac.Collection
        The generated STAC Collection

    Examples
    --------
    >>> ds = DummyDataset()
    >>> collection = ds.to_stac_collection(
    ...     id="climate-collection",
    ...     description="Climate model output collection",
    ...     license="MIT"
    ... )
    """
    from .stac import dataset_to_stac_collection

    return dataset_to_stac_collection(
        self, id=id, description=description, license=license, extent=extent, **kwargs
    )

add_spatial_extent

add_spatial_extent(lat_bounds: tuple, lon_bounds: tuple)

Add spatial extent information to the dataset.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `lat_bounds` | tuple | (min_lat, max_lat) latitude bounds | required |
| `lon_bounds` | tuple | (min_lon, max_lon) longitude bounds | required |

Examples:

>>> ds = DummyDataset()
>>> ds.add_spatial_extent(lat_bounds=(-90, 90), lon_bounds=(-180, 180))
Source code in src/dummyxarray/core.py
def add_spatial_extent(self, lat_bounds: tuple, lon_bounds: tuple):
    """
    Add spatial extent information to the dataset.

    Parameters
    ----------
    lat_bounds : tuple
        (min_lat, max_lat) latitude bounds
    lon_bounds : tuple
        (min_lon, max_lon) longitude bounds

    Examples
    --------
    >>> ds = DummyDataset()
    >>> ds.add_spatial_extent(lat_bounds=(-90, 90), lon_bounds=(-180, 180))
    """
    min_lat, max_lat = lat_bounds
    min_lon, max_lon = lon_bounds

    self.attrs["geospatial_bounds"] = {
        "type": "Polygon",
        "coordinates": [
            [
                [min_lon, min_lat],
                [max_lon, min_lat],
                [max_lon, max_lat],
                [min_lon, max_lat],
                [min_lon, min_lat],
            ]
        ],
    }
    self.attrs["geospatial_lat_min"] = min_lat
    self.attrs["geospatial_lat_max"] = max_lat
    self.attrs["geospatial_lon_min"] = min_lon
    self.attrs["geospatial_lon_max"] = max_lon

infer_temporal_extent

infer_temporal_extent()

Infer temporal extent from time coordinate.

Returns:

| Type | Description |
| --- | --- |
| tuple | (start_time, end_time) as datetime objects or None if not found |

Examples:

>>> ds = DummyDataset()
>>> ds.add_coord("time", ["time"], attrs={"units": "days since 2000-01-01"})
>>> start, end = ds.infer_temporal_extent()
Source code in src/dummyxarray/core.py
def infer_temporal_extent(self):
    """
    Infer temporal extent from time coordinate.

    Returns
    -------
    tuple
        (start_time, end_time) as datetime objects or None if not found

    Examples
    --------
    >>> ds = DummyDataset()
    >>> ds.add_coord("time", ["time"], attrs={"units": "days since 2000-01-01"})
    >>> start, end = ds.infer_temporal_extent()
    """
    if "time" in self.coords:
        time_coord = self.coords["time"]
        if hasattr(time_coord, "data") and time_coord.data is not None:
            try:
                # Try to get time values
                time_values = time_coord.data
                if len(time_values) > 0:
                    # This is a simplified approach - in practice you'd need
                    # to handle different time encodings and units
                    from datetime import datetime, timedelta

                    import numpy as np

                    # Assume days since some epoch for now
                    if "units" in time_coord.attrs:
                        units = time_coord.attrs["units"]
                        if "since" in units:
                            epoch_str = units.split("since")[1].strip()
                            try:
                                epoch = datetime.fromisoformat(epoch_str.replace("Z", "+00:00"))
                                start_delta = timedelta(days=float(np.min(time_values)))
                                end_delta = timedelta(days=float(np.max(time_values)))
                                start_time = epoch + start_delta
                                end_time = epoch + end_delta

                                self.attrs["time_coverage_start"] = start_time.isoformat()
                                self.attrs["time_coverage_end"] = end_time.isoformat()
                                return start_time, end_time
                            except (ValueError, TypeError):
                                pass
            except Exception:
                pass
    return None, None
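
Inference only succeeds when the time coordinate carries numeric values and a "units ... since ..." attribute; otherwise (None, None) is returned. The sketch below assumes coordinate values can be attached by assigning to coords["time"].data directly, which is an illustration rather than the only supported way to populate data:

>>> import numpy as np
>>> ds = DummyDataset()
>>> ds.add_dim("time", 2)
>>> ds.add_coord("time", ["time"], attrs={"units": "days since 2000-01-01"})
>>> ds.coords["time"].data = np.array([0, 364])  # offsets from the epoch, in days
>>> start, end = ds.infer_temporal_extent()
>>> start.isoformat(), end.isoformat()
('2000-01-01T00:00:00', '2000-12-30T00:00:00')

On success the method also records time_coverage_start and time_coverage_end in attrs, as the source above shows.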

validate_spatial_metadata

validate_spatial_metadata()

Validate spatial metadata in the dataset.

Returns:

| Type | Description |
| --- | --- |
| dict | Validation results with any issues found |

Examples:

>>> ds = DummyDataset()
>>> ds.add_spatial_extent(lat_bounds=(-90, 90), lon_bounds=(-180, 180))
>>> validation = ds.validate_spatial_metadata()
>>> print(validation['valid'])
True
Source code in src/dummyxarray/core.py
def validate_spatial_metadata(self):
    """
    Validate spatial metadata in the dataset.

    Returns
    -------
    dict
        Validation results with any issues found

    Examples
    --------
    >>> ds = DummyDataset()
    >>> ds.add_spatial_extent(lat_bounds=(-90, 90), lon_bounds=(-180, 180))
    >>> validation = ds.validate_spatial_metadata()
    >>> print(validation['valid'])
    True
    """
    issues = []

    # Check geospatial_bounds
    if "geospatial_bounds" in self.attrs:
        bounds = self.attrs["geospatial_bounds"]
        if not isinstance(bounds, dict):
            issues.append("geospatial_bounds must be a dictionary")
        elif "type" not in bounds or "coordinates" not in bounds:
            issues.append("geospatial_bounds missing required 'type' or 'coordinates'")
        elif bounds["type"] != "Polygon":
            issues.append("geospatial_bounds type must be 'Polygon'")
    else:
        # Try to infer from coordinates
        lat_found = any(name in self.coords for name in ["lat", "latitude", "Latitude"])
        lon_found = any(name in self.coords for name in ["lon", "longitude", "Longitude"])
        if not (lat_found and lon_found):
            issues.append(
                "No spatial information found - need geospatial_bounds or lat/lon coordinates"
            )

    return {"valid": len(issues) == 0, "issues": issues}

from_stac_item classmethod

from_stac_item(item)

Create a DummyDataset from a STAC Item.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `item` | Item | The STAC Item to convert | required |

Returns:

| Type | Description |
| --- | --- |
| `DummyDataset` | The generated DummyDataset |

Examples:

>>> import pystac
>>> item = pystac.Item.from_file("data.json")
>>> ds = DummyDataset.from_stac_item(item)
Source code in src/dummyxarray/core.py
@classmethod
def from_stac_item(cls, item):
    """
    Create a DummyDataset from a STAC Item.

    Parameters
    ----------
    item : pystac.Item
        The STAC Item to convert

    Returns
    -------
    DummyDataset
        The generated DummyDataset

    Examples
    --------
    >>> import pystac
    >>> item = pystac.Item.from_file("data.json")
    >>> ds = DummyDataset.from_stac_item(item)
    """
    from .stac import stac_item_to_dataset

    return stac_item_to_dataset(item)
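
from_stac_item pairs naturally with to_stac_item for round-tripping metadata through STAC. The sketch below is illustrative; how much of the original structure survives the round trip depends on what the STAC conversion encodes:

>>> ds = DummyDataset()
>>> ds.add_dim("lat", 10)
>>> ds.add_dim("lon", 20)
>>> item = ds.to_stac_item(
...     id="roundtrip",
...     geometry={"type": "Point", "coordinates": [0.0, 0.0]},
...     properties={"datetime": "2020-01-01T00:00:00Z"},
... )
>>> restored = DummyDataset.from_stac_item(item)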

from_stac_collection classmethod

from_stac_collection(collection, item_id=None)

Create a DummyDataset from a STAC Collection.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `collection` | Collection | The STAC Collection to convert | required |
| `item_id` | str | Specific item ID to extract from collection | `None` |

Returns:

| Type | Description |
| --- | --- |
| `DummyDataset` or list of `DummyDataset` | The generated DummyDataset(s) |

Examples:

>>> import pystac
>>> collection = pystac.Collection.from_file("collection.json")
>>> ds = DummyDataset.from_stac_collection(collection, item_id="climate-data-2020")
Source code in src/dummyxarray/core.py
@classmethod
def from_stac_collection(cls, collection, item_id=None):
    """
    Create a DummyDataset from a STAC Collection.

    Parameters
    ----------
    collection : pystac.Collection
        The STAC Collection to convert
    item_id : str, optional
        Specific item ID to extract from collection

    Returns
    -------
    DummyDataset or list of DummyDataset
        The generated DummyDataset(s)

    Examples
    --------
    >>> import pystac
    >>> collection = pystac.Collection.from_file("collection.json")
    >>> ds = DummyDataset.from_stac_collection(collection, item_id="climate-data-2020")
    """
    from .stac import stac_collection_to_dataset

    return stac_collection_to_dataset(collection, item_id)

create_stac_collection classmethod

create_stac_collection(
    datasets, collection_id, description=None, license=None
)

Create a STAC Collection from multiple datasets.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `datasets` | list of DummyDataset | List of datasets to include in the collection | required |
| `collection_id` | str | Unique identifier for the STAC Collection | required |
| `description` | str | Description of the collection | `None` |
| `license` | str | License for the collection | `None` |

Returns:

| Type | Description |
| --- | --- |
| `Collection` | The generated STAC Collection |

Examples:

>>> temperature_ds = DummyDataset()
>>> precipitation_ds = DummyDataset()
>>> wind_ds = DummyDataset()
>>> collection = DummyDataset.create_stac_collection(
...     [temperature_ds, precipitation_ds, wind_ds],
...     collection_id="climate-2020"
... )
Source code in src/dummyxarray/core.py
@classmethod
def create_stac_collection(cls, datasets, collection_id, description=None, license=None):
    """
    Create a STAC Collection from multiple datasets.

    Parameters
    ----------
    datasets : list of DummyDataset
        List of datasets to include in the collection
    collection_id : str
        Unique identifier for the STAC Collection
    description : str, optional
        Description of the collection
    license : str, optional
        License for the collection

    Returns
    -------
    pystac.Collection
        The generated STAC Collection

    Examples
    --------
    >>> temperature_ds = DummyDataset()
    >>> precipitation_ds = DummyDataset()
    >>> wind_ds = DummyDataset()
    >>> collection = DummyDataset.create_stac_collection(
    ...     [temperature_ds, precipitation_ds, wind_ds],
    ...     collection_id="climate-2020"
    ... )
    """
    from .stac import create_stac_collection_from_datasets

    return create_stac_collection_from_datasets(
        datasets, collection_id, description=description, license=license
    )
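
Since the documented return type is a pystac Collection, the result can be serialized with the standard pystac API. A sketch assuming pystac is installed; the collection metadata values are illustrative:

>>> collection = DummyDataset.create_stac_collection(
...     [DummyDataset(), DummyDataset()],
...     collection_id="climate-2020",
...     description="Climate model output collection",
...     license="CC-BY-4.0",
... )
>>> stac_dict = collection.to_dict()  # plain dict, ready for json.dump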