Source code for ravenpy.extractors.forecasts

import datetime as dt
import logging
import os
import re
import warnings
from pathlib import Path
from typing import Any, List, Tuple, Union
from urllib.parse import urljoin

import pandas as pd
import xarray as xr
from pandas import DatetimeIndex, Series, Timestamp
from xarray import Dataset

from . import gis_import_error_message

# Re-raise a failed fiona import with the package's GIS-specific message,
# formatted with this module's name, while chaining the original error.
try:
    import fiona
except (ImportError, ModuleNotFoundError) as e:
    msg = gis_import_error_message.format(Path(__file__).stem)
    raise ImportError(msg) from e

LOGGER = logging.getLogger("PYWPS")

# Can be set at runtime with `$ env RAVENPY_THREDDS_URL=https://xx.yy.zz/thredds/ ...`.
THREDDS_URL = os.environ.get(
if not THREDDS_URL.endswith("/"):

def get_hindcast_day(
    region_coll: fiona.Collection,
    date: dt.datetime,
    climate_model: str = "GEPS",
) -> xr.Dataset:
    """Generate a forecast dataset that can be used to run raven.

    Data comes from the CASPAR archive and must be aggregated such that each
    file contains forecast data for a single day, but for all forecast
    timesteps and all members.

    Parameters
    ----------
    region_coll : fiona.Collection
        The region vectors.
    date : dt.datetime
        The forecast date required.
    climate_model : str
        Type of climate model. "GEPS" by default; eventually could be GEPS,
        GDPS, REPS or RDPS.

    Returns
    -------
    xr.Dataset
        The forecast dataset, as returned by `get_subsetted_forecast`.
    """
    # Get the dataset and the forecast timesteps as a function of the climate
    # model and date.
    ds, times = get_CASPAR_dataset(climate_model, date)

    return get_subsetted_forecast(region_coll, ds, times, is_caspar=True)
def get_CASPAR_dataset(
    climate_model: str,
    date: dt.datetime,
    thredds: str = THREDDS_URL,
    directory: str = "dodsC/birdhouse/disk2/caspar/daily/",
) -> Tuple[
    xr.Dataset, List[Union[Union[DatetimeIndex, Series, Timestamp, Timestamp], Any]]
]:
    """Return CASPAR dataset.

    Parameters
    ----------
    climate_model : str
        Type of climate model, for now only "GEPS" is supported.
    date : dt.datetime
        The date of the forecast.
    thredds : str
        The thredds server url. Defaults to the module-level ``THREDDS_URL``.
    directory : str
        The directory on the thredds server where the data is stored.
        Default: "dodsC/birdhouse/disk2/caspar/daily/"

    Returns
    -------
    xr.Dataset
        The forecast dataset.
    list
        The forecast times, at 6-hour intervals starting from the first time
        step of the dataset.

    Raises
    ------
    NotImplementedError
        If `climate_model` is not "GEPS".
    AttributeError
        If the "pr" or "tas" variable is missing from the dataset.
    """
    # `endswith` also copes with an empty string, where `thredds[-1]` would
    # raise an IndexError.
    if not thredds.endswith("/"):
        warnings.warn(
            "The thredds url should end with a slash. Appending it to the url.",
            stacklevel=2,
        )
        thredds = f"{thredds}/"

    # Without a trailing slash, urljoin would replace the last path segment of
    # the directory with the file name instead of appending to it.
    if not directory.endswith("/"):
        directory = f"{directory}/"

    if climate_model == "GEPS":
        d = date.strftime("%Y%m%d")
        file_location = urljoin(directory, f"GEPS_{d}.nc")
        file_url = urljoin(thredds, file_location)
        ds = xr.open_dataset(file_url)
        # Here we also extract the times at 6-hour intervals as Raven must have
        # constant timesteps and GEPS goes to 6 hours
        start = pd.to_datetime(ds.time[0].values)
        times = [start + dt.timedelta(hours=n) for n in range(0, 384, 6)]
    else:
        # Eventually: GDPS, RDPS and REPS
        raise NotImplementedError("Only the GEPS model is currently supported")

    # Checking that these exist.
    for f in ["pr", "tas"]:
        if f not in ds:
            # Release the remote dataset handle before reporting the problem.
            ds.close()
            raise AttributeError(f"'{f}' not present in dataset")

    return ds, times
def get_ECCC_dataset(
    climate_model: str,
    thredds: str = THREDDS_URL,
    directory: str = "dodsC/datasets/forecasts/eccc_geps/",
) -> Tuple[
    Dataset, List[Union[Union[DatetimeIndex, Series, Timestamp, Timestamp], Any]]
]:
    """Return latest GEPS forecast dataset.

    Parameters
    ----------
    climate_model : str
        Type of climate model, for now only "GEPS" is supported.
    thredds : str
        The thredds server url. Defaults to the module-level ``THREDDS_URL``.
    directory : str
        The directory on the thredds server where the data is stored.
        Default: "dodsC/datasets/forecasts/eccc_geps/"

    Returns
    -------
    xr.Dataset
        The forecast dataset.
    list
        The forecast times, at 6-hour intervals starting from the first time
        step of the dataset.

    Raises
    ------
    NotImplementedError
        If `climate_model` is not "GEPS".
    AttributeError
        If the "pr" or "tas" variable is missing from the dataset.
    """
    # `endswith` also copes with an empty string, where `thredds[-1]` would
    # raise an IndexError.
    if not thredds.endswith("/"):
        warnings.warn(
            "The thredds url should end with a slash. Appending it to the url.",
            stacklevel=2,
        )
        thredds = f"{thredds}/"

    # Without a trailing slash, urljoin would replace the last path segment of
    # the directory with the file name instead of appending to it.
    if not directory.endswith("/"):
        directory = f"{directory}/"

    if climate_model == "GEPS":
        # Eventually the file will find a permanent home, until then let's use the test folder.
        file_location = urljoin(directory, "GEPS_latest.ncml")
        file_url = urljoin(thredds, file_location)
        ds = xr.open_dataset(file_url)
        # Here we also extract the times at 6-hour intervals as Raven must have
        # constant timesteps and GEPS goes to 6 hours
        start = pd.to_datetime(ds.time[0].values)
        times = [start + dt.timedelta(hours=n) for n in range(0, 384, 6)]
    else:
        # Eventually: GDPS, RDPS and REPS
        raise NotImplementedError("Only the GEPS model is currently supported")

    # Checking that these exist. If the files are still processing, it is
    # possible that one or both are not available!
    for f in ["pr", "tas"]:
        if f not in ds:
            # Release the remote dataset handle before reporting the problem.
            ds.close()
            raise AttributeError(f"'{f}' not present in dataset")

    return ds, times
def get_recent_ECCC_forecast(
    region_coll: fiona.Collection, climate_model: str = "GEPS"
) -> xr.Dataset:
    """Build a Raven-ready forecast dataset from the most recent ECCC forecast.

    Data comes from the ECCC datamart and is collected daily. It is aggregated
    such that each file contains forecast data for a single day, but for all
    forecast timesteps and all members.

    Parameters
    ----------
    region_coll : fiona.Collection
        The region vectors.
    climate_model : str
        Type of climate model, for now only "GEPS" is supported (eventually
        could be GEPS, GDPS, REPS or RDPS).

    Returns
    -------
    xr.Dataset
        The forecast dataset.
    """
    latest, forecast_times = get_ECCC_dataset(climate_model)

    # The hindcasting tools expect the ensemble dimension to be called
    # "members" rather than "member".
    latest = latest.rename({"member": "members"})

    return get_subsetted_forecast(region_coll, latest, forecast_times, False)
def get_subsetted_forecast(
    region_coll: fiona.Collection,
    ds: xr.Dataset,
    times: Union[dt.datetime, xr.DataArray],
    is_caspar: bool,
) -> xr.Dataset:
    """Get Subsetted Forecast.

    This function takes a dataset, a region and the time sampling array and
    returns the subsetted values for the given region and times.

    Parameters
    ----------
    region_coll : fiona.Collection
        The region vectors.
    ds : xr.Dataset
        The dataset containing the raw, worldwide forecast data
    times : dt.datetime or xr.DataArray
        The array of times required to do the forecast.
    is_caspar : bool
        True if the data comes from Caspar, false otherwise.
        Used to define lat/lon on rotated grid.

    Returns
    -------
    xr.Dataset
        The forecast dataset, clipped to the first feature of the region and
        averaged over its two spatial dimensions.
    """
    # Extract the bounding box to subset the entire forecast grid to something
    # more manageable
    lon_min, lat_min, lon_max, lat_max = region_coll.bounds

    # Add a very simple lon wraparound if data suggests for it
    if ((ds.lon.min() >= 0) and (ds.lon.max() <= 360)) and (lon_max < 0):
        lon_min += 360
        lon_max += 360

    # Subset the data to the desired location (bounding box) and times
    ds = ds.where(
        (ds.lon <= lon_max)
        & (ds.lon >= lon_min)
        & (ds.lat <= lat_max)
        & (ds.lat >= lat_min),
        drop=True,
    ).sel(time=times)

    # Rioxarray requires CRS definitions for variables
    # Get CRS, e.g. 4326
    # NOTE(review): assumes region_coll.crs is a mapping whose "init" entry
    # looks like "epsg:4326" - confirm for the fiona version in use.
    crs = int(re.match(r"epsg:(\d+)", region_coll.crs["init"]).group(1))

    # Here the name of the variable could differ based on the Caspar file processing
    tas = ds.tas.rio.write_crs(crs)
    pr = ds.pr.rio.write_crs(crs)
    ds = xr.merge([tas, pr])

    # Caspar data uses the rotated-grid dimension names; other sources use
    # plain lon/lat. Everything after this is identical for both.
    x_dim, y_dim = ("rlon", "rlat") if is_caspar else ("lon", "lat")

    # Now apply the mask of the basin contour and average the values to get a
    # single time series
    ds.rio.set_spatial_dims(x_dim, y_dim)
    ds[x_dim] = ds[x_dim] - 360

    # clip the netcdf and average across space.
    shdf = [next(iter(region_coll))["geometry"]]
    forecast = ds.rio.clip(shdf, crs=crs)
    forecast = forecast.mean(dim={y_dim, x_dim}, keep_attrs=True)

    return forecast