"""util.py

Just a helper to generate some data for training and inference.
"""
import pandas as pd
import dask.array as da
import xarray as xr
def generate_X_y(
    samples: int = 1000, features: int = 10, classes: int = 2
) -> tuple[da.core.Array, da.core.Array]:
    """Generate random features and labels.

    Parameters
    ----------
    samples : int, optional
        Number of samples, by default 1000
    features : int, optional
        Number of features, by default 10
    classes : int, optional
        Number of classes, by default 2

    Returns
    -------
    tuple[da.core.Array, da.core.Array]
        X and y, with shape (samples, features) and (samples,)
    """
    # Feature matrix: one row per sample, one column per feature.
    X = da.random.random((samples, features))
    # Labels: random integers drawn with low=0 and high=classes.
    y = da.random.randint(0, classes, samples)

    return X, y
def generate_3d_dataset(lat: int = 40, lon: int = 60, time: int = 12) -> xr.Dataset:
    """Generate a 3-dimensional xarray.Dataset with lat, lon and time dimensions.

    The dataset holds a single variable named ``"test"`` filled with random
    values. lat/lon chunks are ``int(dim.size / 10)`` (but never smaller
    than 1), and time is a single chunk (-1).

    Parameters
    ----------
    lat : int, optional
        Size of lat dimension, by default 40
    lon : int, optional
        Size of lon dimension, by default 60
    time : int, optional
        Size of time dimension, by default 12

    Returns
    -------
    xr.Dataset
        Dask-backed dataset.
    """
    # Clamp to 1 so that dimensions smaller than 10 do not produce a
    # zero-sized chunk, which dask cannot use to split a non-empty axis.
    lat_chunk = max(1, int(lat / 10))
    lon_chunk = max(1, int(lon / 10))

    # Integer index coordinates for the two spatial dimensions.
    lat_coords = da.arange(lat)
    lon_coords = da.arange(lon)

    # Daily timestamps starting on 2021-01-01, `time` entries in total.
    start = pd.Timestamp(year=2021, month=1, day=1)
    time_coords = pd.date_range(start, start + pd.Timedelta(days=time - 1), freq="1D")

    data = da.random.random(
        (lat_coords.size, lon_coords.size, time_coords.size),
        chunks=(lat_chunk, lon_chunk, -1),
    )
    ds = xr.DataArray(
        data,
        coords=[lat_coords, lon_coords, time_coords],
        dims=["lat", "lon", "time"],
        name="test",
    ).to_dataset()

    return ds