Forcing HMETS with the extended CANOPEX dataset

Here we use ravenpy to launch the HMETS hydrological model and analyze the output. We also prepare and gather data directly from the CANOPEX dataset made available freely for all users.

import warnings

from numba.core.errors import NumbaDeprecationWarning

warnings.simplefilter("ignore", category=NumbaDeprecationWarning)
# Cookie-cutter template necessary to provide the tools, packages and paths for the project. All notebooks
# need this template (or a slightly adjusted one depending on the required packages)
import datetime as dt
import tempfile
from pathlib import Path

import pandas as pd
import spotpy
import xarray as xr

from ravenpy.config import commands as rc
from ravenpy.config.emulators import HMETS
from ravenpy.utilities.calibration import SpotSetup
from ravenpy.utilities.testdata import get_file

# Make a temporary folder
tmp = Path(tempfile.mkdtemp())
# DATA MAIN SOURCE - DAP link to CANOPEX dataset.
CANOPEX_DAP = "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/birdhouse/ets/Watersheds_5797_cfcompliant.nc"
ds = xr.open_dataset(CANOPEX_DAP)

# Explore the dataset:
display(ds)
# We could explore the dataset and find a watershed of interest, but for now, let's pick one at random
# from the dataset:
watershedID = 5600

# And show what it includes:
ts = ds.isel({"watershed": watershedID})
# Let's write the file to disk to make it more efficient to retrieve:
fname = tmp / "CANOPEX_extracted.nc"
ts.to_netcdf(fname)
ds.close()
ts.close()
# With this info, we can gather some properties from the CANOPEX database. This same database is used for
# regionalization, so let's query it there where more information is available:
tmp = pd.read_csv(get_file("regionalisation_data/gauged_catchment_properties.csv"))

basin_area = float(tmp["area"][watershedID])
basin_latitude = float(tmp["latitude"][watershedID])
basin_longitude = float(tmp["longitude"][watershedID])
basin_elevation = float(tmp["elevation"][watershedID])
basin_name = ds.watershed.data

print("Basin name: ", basin_name)
print("Latitude: ", basin_latitude, " °N")
print("Area: ", basin_area, " km^2")

Now, we might have the model and data, but we don’t have model parameters! We need to calibrate. This next snippets show how to configure the model and the calibration.

# We will also calibrate on only a subset of the years for now to keep the computations faster in this notebook.
start_calib = dt.datetime(1998, 1, 1)
end_calib = dt.datetime(1999, 12, 31)

# General parameters depending on the data source. We can find them by exploring the CANOPEX dataset in the
# cells above.
data_type = ["TEMP_MAX", "TEMP_MIN", "PRECIP"]

alt_names = {
    "TEMP_MIN": "tasmin",
    "TEMP_MAX": "tasmax",
    "PRECIP": "pr",
}

hru = {}
hru = dict(
    area=basin_area,
    elevation=basin_elevation,
    latitude=basin_latitude,
    longitude=basin_longitude,
    hru_type="land",
)

data_kwds = {
    "ALL": {
        "elevation": hru["elevation"],
        "latitude": hru["latitude"],
        "longitude": hru["longitude"],
    }
}
# Set the evaluation metrics to be calculated by Raven
eval_metrics = ("NASH_SUTCLIFFE",)

model_config = HMETS(
    ObservationData=[
        rc.ObservationData.from_nc(fname, alt_names="discharge", station_idx=1)
    ],
    Gauge=[
        rc.Gauge.from_nc(
            fname,
            station_idx=1,
            data_type=data_type,  # Note that this is the list of all the variables
            alt_names=alt_names,  # Note that all variables here are mapped to their names in the netcdf file.
            data_kwds=data_kwds,
        )
    ],
    HRUs=[hru],
    StartDate=start_calib,
    EndDate=end_calib,
    RunName="CANOPEX_test",
    EvaluationMetrics=eval_metrics,
    RainSnowFraction="RAINSNOW_DINGMAN",
    SuppressOutput=True,
)
# The model parameters bounds can either be set independently or we can use the defaults.
low_params = (
    0.3,
    0.01,
    0.5,
    0.15,
    0.0,
    0.0,
    -2.0,
    0.01,
    0.0,
    0.01,
    0.005,
    -5.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.00001,
    0.0,
    0.00001,
    0.0,
    0.0,
)
high_params = (
    20.0,
    5.0,
    13.0,
    1.5,
    20.0,
    20.0,
    3.0,
    0.2,
    0.1,
    0.3,
    0.1,
    2.0,
    5.0,
    1.0,
    3.0,
    1.0,
    0.02,
    0.1,
    0.01,
    0.5,
    2.0,
)

# Setup the spotpy optimizer
spot_setup = SpotSetup(
    config=model_config,
    low=low_params,
    high=high_params,
)

Finally, we can run the optimizer:

# We'll definitely want to adjust the random seed and number of model evaluations:
model_evaluations = (
    50  # This is to keep computing time fast for the demo, increase as necessary
)

# Setup the spotpy sampler with the method, the setup configuration, a run name and other options. Please refer to
# the spotpy documentation for more options. We recommend sticking to this format for efficiency of most applications.
sampler = spotpy.algorithms.dds(
    spot_setup,
    dbname="CANOPEX_test",
    dbformat="ram",
    save_sim=False,
)

# Launch the actual optimization. Multiple trials can be launched, where the entire process is repeated and
# the best overall value from all trials is returned.
sampler.sample(model_evaluations, trials=1)
# Get the model diagnostics
diag = spot_setup.diagnostics

# Print the NSE and the parameter set in 2 different ways:
print("Nash-Sutcliffe value is: " + str(diag["DIAG_NASH_SUTCLIFFE"]))

# Get all the values of each iteration
results = sampler.getdata()

# Get the raw resutlts directly in an array
params = spotpy.analyser.get_best_parameterset(results)[0]
params

At this stage, we have calibrated the model on the observations for the desired dates. Now, let’s run the model on a longer time period and look at the hydrograph

from ravenpy import Emulator

conf = model_config.set_params(params)
conf.suppress_output = False
out = Emulator(conf).run()

The hydrograph and storage outputs are netCDF files storing the time series. These files are opened by default using xarray, which provides convenient and powerful time series analysis and plotting tools.

q = out.hydrograph.q_sim
# You can also get statistics from the data directly.
print("Max: ", q.max().values)
print("Mean: ", q.mean().values)
print(
    "Monthly means: ",
    q.groupby("time.month").mean(dim="time").values,
)
# Plot the simulated hydrograph
from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters()
q.plot()