Geospatial Workflows¶
Overview¶
This guide provides end-to-end examples of geospatial workflows using DummyXarray's STAC support, demonstrating real-world use cases from data creation to catalog publishing.
Workflow 1: Climate Data Publishing¶
Complete Climate Dataset Workflow¶
This example shows how to create, validate, and publish climate model output as STAC catalogs.
from dummyxarray import DummyDataset
import numpy as np

# Step 1: Create dataset with spatial/temporal structure
ds = DummyDataset()

# Define dimensions
ds.add_dim('time', 365)
ds.add_dim('lat', 180)
ds.add_dim('lon', 360)

# Add time coordinate
ds.add_coord('time', ['time'],
             data=np.arange(365),
             attrs={
                 'units': 'days since 2020-01-01',
                 'calendar': 'gregorian',
                 'standard_name': 'time',
                 'long_name': 'Time'
             })

# Add spatial coordinates
ds.add_coord('lat', ['lat'],
             data=np.linspace(-89.5, 89.5, 180),
             attrs={
                 'units': 'degrees_north',
                 'standard_name': 'latitude',
                 'long_name': 'Latitude'
             })
ds.add_coord('lon', ['lon'],
             data=np.linspace(-179.5, 179.5, 360),
             attrs={
                 'units': 'degrees_east',
                 'standard_name': 'longitude',
                 'long_name': 'Longitude'
             })

# Step 2: Add climate variables
ds.add_variable('temperature', ['time', 'lat', 'lon'],
                attrs={
                    'units': 'K',
                    'standard_name': 'air_temperature',
                    'long_name': 'Near-Surface Air Temperature'
                })
ds.add_variable('precipitation', ['time', 'lat', 'lon'],
                attrs={
                    'units': 'kg m-2 s-1',
                    'standard_name': 'precipitation_flux',
                    'long_name': 'Precipitation Flux'
                })

# Step 3: Add global metadata
ds.attrs.update({
    'title': 'Global Climate Model Output 2020',
    'institution': 'Climate Research Institute',
    'source': 'CESM2 Climate Model',
    'references': 'https://doi.org/10.xxxx/climate-2020',
    'comment': 'Daily global climate data at 1-degree resolution',
    'Conventions': 'CF-1.8',
    'history': f'Created on {np.datetime64("now")}'
})

# Step 4: Infer and validate spatial/temporal metadata
start, end = ds.infer_temporal_extent()
print(f"Temporal coverage: {start} to {end}")

validation = ds.validate_spatial_metadata()
if validation['valid']:
    print("✓ Spatial metadata validated")
else:
    print("✗ Spatial validation issues:")
    for issue in validation['issues']:
        print(f"  - {issue}")

# Step 5: Create STAC Item
item = ds.to_stac_item(
    id='climate-global-2020',
    properties={
        'model': 'CESM2',
        'experiment': 'historical',
        'frequency': 'daily',
        'realm': 'atmos',
        'institution_id': 'CRI'
    }
)

# Step 6: Save STAC catalog
ds.save_stac_item('climate_2020.json', id='climate-global-2020')
print("✓ STAC Item saved to climate_2020.json")

# Step 7: Create collection for multiple years
datasets = []
for year in range(2020, 2023):
    yearly_ds = DummyDataset()
    # ... configure yearly_ds ...
    yearly_ds.attrs['title'] = f'Climate Data {year}'
    datasets.append(yearly_ds)

collection = DummyDataset.create_stac_collection(
    datasets,
    collection_id='climate-2020-2022',
    description='Global climate data 2020-2022',
    license='CC-BY-4.0'
)
print(f"✓ Collection created with {len(list(collection.get_items()))} items")
Workflow 2: Regional Satellite Data¶
Processing Regional Satellite Imagery¶
This example demonstrates handling regional satellite data with specific geographic coverage.
from dummyxarray import DummyDataset
import numpy as np

# Step 1: Define regional extent (North America)
ds = DummyDataset()

# Regional dimensions (0.25-degree resolution)
ds.add_dim('time', 1)
ds.add_dim('lat', 180)  # 25°N to 70°N
ds.add_dim('lon', 480)  # 170°W to 50°W

# Step 2: Add regional coordinates
ds.add_coord('lat', ['lat'],
             data=np.linspace(25, 70, 180),
             attrs={
                 'units': 'degrees_north',
                 'standard_name': 'latitude'
             })
ds.add_coord('lon', ['lon'],
             data=np.linspace(-170, -50, 480),
             attrs={
                 'units': 'degrees_east',
                 'standard_name': 'longitude'
             })

# Step 3: Add satellite bands
ds.add_variable('red', ['time', 'lat', 'lon'],
                attrs={
                    'long_name': 'Red Band Reflectance',
                    'units': '1',
                    'wavelength': '0.665 micrometers'
                })
ds.add_variable('nir', ['time', 'lat', 'lon'],
                attrs={
                    'long_name': 'Near-Infrared Band Reflectance',
                    'units': '1',
                    'wavelength': '0.865 micrometers'
                })

# Step 4: Add spatial extent
ds.add_spatial_extent(
    lat_bounds=(25, 70),
    lon_bounds=(-170, -50)
)

# Step 5: Add satellite-specific metadata
ds.attrs.update({
    'title': 'Landsat 8 North America',
    'platform': 'Landsat-8',
    'instrument': 'OLI',
    'processing_level': 'L2',
    'cloud_cover': 5.2,
    'time_coverage_start': '2020-06-15T18:30:00Z',
    'time_coverage_end': '2020-06-15T18:30:00Z'
})

# Step 6: Create STAC Item with EO properties
item = ds.to_stac_item(
    id='landsat8-na-20200615',
    properties={
        'eo:cloud_cover': 5.2,
        'platform': 'landsat-8',
        'instruments': ['oli'],
        'gsd': 30  # Ground sample distance in meters
    }
)

# Step 7: Save with satellite-specific metadata
ds.save_stac_item(
    'landsat8_na.json',
    id='landsat8-na-20200615',
    properties={
        'eo:bands': [
            {'name': 'red', 'common_name': 'red'},
            {'name': 'nir', 'common_name': 'nir'}
        ]
    }
)
print("✓ Regional satellite STAC Item created")
Workflow 3: Multi-Resolution Dataset Collection¶
Creating Collections with Multiple Resolutions¶
This example shows how to organize datasets at different spatial resolutions.
from dummyxarray import DummyDataset
import numpy as np

def create_resolution_dataset(resolution_deg, name):
    """Create a dataset at a specific resolution."""
    ds = DummyDataset()

    # Calculate dimensions based on resolution
    nlat = int(180 / resolution_deg)
    nlon = int(360 / resolution_deg)

    ds.add_dim('time', 12)
    ds.add_dim('lat', nlat)
    ds.add_dim('lon', nlon)

    # Add coordinates
    ds.add_coord('lat', ['lat'],
                 data=np.linspace(-90 + resolution_deg / 2,
                                  90 - resolution_deg / 2, nlat))
    ds.add_coord('lon', ['lon'],
                 data=np.linspace(-180 + resolution_deg / 2,
                                  180 - resolution_deg / 2, nlon))

    # Add variable
    ds.add_variable('temperature', ['time', 'lat', 'lon'],
                    attrs={'units': 'K', 'standard_name': 'air_temperature'})

    # Add metadata
    ds.attrs.update({
        'title': f'{name} - {resolution_deg}° resolution',
        'spatial_resolution': f'{resolution_deg} degrees',
        'grid_type': 'regular_lat_lon'
    })

    # Add time coordinate and infer temporal extent
    ds.add_coord('time', ['time'],
                 data=np.arange(12),
                 attrs={'units': 'months since 2020-01-01'})
    ds.infer_temporal_extent()

    return ds

# Create datasets at different resolutions
coarse_ds = create_resolution_dataset(2.0, 'Coarse Resolution')
medium_ds = create_resolution_dataset(1.0, 'Medium Resolution')
fine_ds = create_resolution_dataset(0.5, 'Fine Resolution')

# Create multi-resolution collection
collection = DummyDataset.create_stac_collection(
    [coarse_ds, medium_ds, fine_ds],
    collection_id='multi-resolution-temperature',
    description='Temperature data at multiple spatial resolutions',
    license='CC-BY-4.0'
)

print("✓ Multi-resolution collection created")
print("  Resolutions: 2.0°, 1.0°, 0.5°")
Workflow 4: Time Series with Spatial Coverage¶
Monthly Time Series Dataset¶
This example demonstrates creating a time series of spatial datasets.
from dummyxarray import DummyDataset
import numpy as np
from datetime import datetime, timedelta

def create_monthly_dataset(year, month):
    """Create a dataset for a specific month."""
    ds = DummyDataset()

    # Spatial dimensions
    ds.add_dim('lat', 180)
    ds.add_dim('lon', 360)

    # Add spatial coordinates
    ds.add_coord('lat', ['lat'], data=np.linspace(-89.5, 89.5, 180))
    ds.add_coord('lon', ['lon'], data=np.linspace(-179.5, 179.5, 360))

    # Add monthly data
    ds.add_variable('sst', ['lat', 'lon'],
                    attrs={
                        'units': 'degree_Celsius',
                        'standard_name': 'sea_surface_temperature',
                        'long_name': 'Sea Surface Temperature'
                    })

    # Add temporal metadata
    start_date = datetime(year, month, 1)
    if month == 12:
        end_date = datetime(year + 1, 1, 1) - timedelta(days=1)
    else:
        end_date = datetime(year, month + 1, 1) - timedelta(days=1)

    ds.attrs.update({
        'title': f'SST {year}-{month:02d}',
        'time_coverage_start': start_date.isoformat() + 'Z',
        'time_coverage_end': end_date.isoformat() + 'Z',
        'temporal_resolution': 'monthly'
    })

    # Add spatial extent
    ds.add_spatial_extent(lat_bounds=(-90, 90), lon_bounds=(-180, 180))

    return ds

# Create monthly datasets for 2020
monthly_datasets = []
for month in range(1, 13):
    ds = create_monthly_dataset(2020, month)
    monthly_datasets.append(ds)

# Create time series collection
collection = DummyDataset.create_stac_collection(
    monthly_datasets,
    collection_id='sst-monthly-2020',
    description='Monthly sea surface temperature for 2020',
    license='CC-BY-4.0'
)

print(f"✓ Time series collection created with {len(monthly_datasets)} months")
Workflow 5: Data Validation Pipeline¶
Complete Validation and Quality Control¶
This example shows a comprehensive validation pipeline before publishing.
from dummyxarray import DummyDataset
import numpy as np

def validate_dataset_for_stac(ds, dataset_id):
    """Comprehensive validation before STAC export."""
    print(f"\n=== Validating {dataset_id} ===")
    issues = []

    # 1. Check spatial metadata
    spatial_validation = ds.validate_spatial_metadata()
    if not spatial_validation['valid']:
        issues.extend(spatial_validation['issues'])
        print("✗ Spatial validation failed:")
        for issue in spatial_validation['issues']:
            print(f"  - {issue}")
    else:
        print(f"✓ Spatial metadata valid: {', '.join(spatial_validation['found'])}")

    # 2. Check temporal metadata
    if 'time_coverage_start' not in ds.attrs:
        issues.append("Missing time_coverage_start")
        print("✗ Missing temporal coverage start")
    else:
        print(f"✓ Temporal coverage: {ds.attrs['time_coverage_start']}")

    # 3. Check required global attributes
    required_attrs = ['title', 'description']
    for attr in required_attrs:
        if attr not in ds.attrs:
            issues.append(f"Missing required attribute: {attr}")
            print(f"✗ Missing {attr}")
        else:
            print(f"✓ Has {attr}")

    # 4. Check coordinate attributes
    if 'lat' in ds.coords:
        lat_coord = ds.coords['lat']
        if 'units' not in lat_coord.attrs:
            issues.append("Latitude missing units")
            print("✗ Latitude missing units")
        else:
            print(f"✓ Latitude units: {lat_coord.attrs['units']}")

    # 5. Check variable metadata
    for var_name, var in ds.variables.items():
        if 'units' not in var.attrs:
            issues.append(f"Variable {var_name} missing units")
            print(f"✗ Variable {var_name} missing units")
        else:
            print(f"✓ Variable {var_name} has units: {var.attrs['units']}")

    # Summary
    if not issues:
        print("\n✓✓✓ All validation checks passed!")
        return True
    else:
        print(f"\n✗✗✗ Found {len(issues)} issues")
        return False

# Example usage
ds = DummyDataset()
ds.add_dim('lat', 180)
ds.add_dim('lon', 360)
ds.add_coord('lat', ['lat'],
             data=np.linspace(-89.5, 89.5, 180),
             attrs={'units': 'degrees_north', 'standard_name': 'latitude'})
ds.add_coord('lon', ['lon'],
             data=np.linspace(-179.5, 179.5, 360),
             attrs={'units': 'degrees_east', 'standard_name': 'longitude'})
ds.add_variable('temperature', ['lat', 'lon'],
                attrs={'units': 'K', 'standard_name': 'air_temperature'})
ds.attrs.update({
    'title': 'Global Temperature',
    'description': 'Global temperature analysis',
    'time_coverage_start': '2020-01-01T00:00:00Z'
})
ds.add_spatial_extent(lat_bounds=(-90, 90), lon_bounds=(-180, 180))

# Validate before export
if validate_dataset_for_stac(ds, 'temperature-global'):
    item = ds.to_stac_item(id='temperature-global')
    ds.save_stac_item('validated_temperature.json', id='temperature-global')
    print("\n✓ STAC Item created and saved")
else:
    print("\n✗ Fix validation issues before creating STAC Item")
Workflow 6: Integration with Existing Data¶
Converting Existing Metadata to STAC¶
This example shows how to convert existing dataset metadata to STAC format.
from dummyxarray import DummyDataset
import numpy as np

# Simulate loading existing dataset metadata
existing_metadata = {
    'dataset_name': 'ERA5 Reanalysis',
    'variables': ['temperature', 'pressure', 'humidity'],
    'lat_min': -90, 'lat_max': 90,
    'lon_min': -180, 'lon_max': 180,
    'time_start': '2020-01-01',
    'time_end': '2020-12-31',
    'resolution': '0.25 degrees',
    'source': 'ECMWF ERA5',
    'license': 'Copernicus License'
}

# Convert to DummyDataset
ds = DummyDataset()

# Add dimensions based on resolution
resolution = 0.25
nlat = int(180 / resolution)
nlon = int(360 / resolution)

ds.add_dim('time', 365)
ds.add_dim('lat', nlat)
ds.add_dim('lon', nlon)

# Add coordinates
ds.add_coord('lat', ['lat'],
             data=np.linspace(existing_metadata['lat_min'],
                              existing_metadata['lat_max'], nlat))
ds.add_coord('lon', ['lon'],
             data=np.linspace(existing_metadata['lon_min'],
                              existing_metadata['lon_max'], nlon))

# Add variables from metadata
for var_name in existing_metadata['variables']:
    ds.add_variable(var_name, ['time', 'lat', 'lon'])

# Convert metadata to STAC-compatible format
ds.attrs.update({
    'title': existing_metadata['dataset_name'],
    'description': f"{existing_metadata['dataset_name']} at {existing_metadata['resolution']}",
    'source': existing_metadata['source'],
    'license': existing_metadata['license'],
    'time_coverage_start': existing_metadata['time_start'] + 'T00:00:00Z',
    'time_coverage_end': existing_metadata['time_end'] + 'T23:59:59Z',
    'spatial_resolution': existing_metadata['resolution']
})

# Add spatial extent
ds.add_spatial_extent(
    lat_bounds=(existing_metadata['lat_min'], existing_metadata['lat_max']),
    lon_bounds=(existing_metadata['lon_min'], existing_metadata['lon_max'])
)

# Create STAC Item
item = ds.to_stac_item(
    id='era5-reanalysis-2020',
    properties={
        'provider': 'ECMWF',
        'product': 'ERA5',
        'resolution': existing_metadata['resolution']
    }
)

print(f"✓ Converted {existing_metadata['dataset_name']} to STAC")
print(f"  Variables: {', '.join(existing_metadata['variables'])}")
print(f"  Coverage: {existing_metadata['time_start']} to {existing_metadata['time_end']}")
Best Practices Summary¶
1. Always Validate Before Publishing¶
validation = ds.validate_spatial_metadata()
if validation['valid']:
    item = ds.to_stac_item(id='validated-item')
2. Include Comprehensive Metadata¶
ds.attrs.update({
    'title': 'Descriptive Title',
    'description': 'Detailed description',
    'institution': 'Data Provider',
    'source': 'Data Source',
    'references': 'DOI or URL',
    'license': 'CC-BY-4.0'
})
3. Use Standard Coordinate Attributes¶
ds.add_coord('lat', ['lat'],
             data=lat_data,
             attrs={
                 'units': 'degrees_north',
                 'standard_name': 'latitude',
                 'axis': 'Y'
             })
4. Document Temporal Coverage¶
ds.attrs.update({
    'time_coverage_start': '2020-01-01T00:00:00Z',
    'time_coverage_end': '2020-12-31T23:59:59Z',
    'temporal_resolution': 'daily'
})
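Rather than hand-writing these strings, they can be generated from datetime objects, which avoids formatting mistakes:

from datetime import datetime, timezone

start = datetime(2020, 1, 1, tzinfo=timezone.utc)
# isoformat() emits '+00:00'; the examples above use the 'Z' suffix
print(start.isoformat().replace('+00:00', 'Z'))  # 2020-01-01T00:00:00Z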
5. Organize Collections Logically¶
# Group by time period
collection = DummyDataset.create_stac_collection(
    yearly_datasets,
    collection_id='climate-2020-2025'
)

# Group by variable
collection = DummyDataset.create_stac_collection(
    [temp_ds, precip_ds, wind_ds],
    collection_id='atmospheric-variables'
)
See Also¶
- STAC Catalogs - STAC catalog API reference
- Spatial Metadata - Spatial metadata handling
- CF Compliance - CF conventions
- Intake Catalogs - Alternative catalog format