πͺ Merge Datasets#
Often users want to get multiple files across several files (across time or forecast hours). Here is an example of how to get those data and merge them into a single xarray Dataset.
[2]:
from herbie import Herbie, FastHerbie
import xarray as xr
import pandas as pd
from itertools import chain
[3]:
# Create multiple Herbie objects for a range of dates
# Some data we want is in the RAP pressure grid file while other data is
# in the RAP native grid file.
dates = pd.date_range("2024-01-01", periods=3, freq="1H")
FH_prs = FastHerbie(dates, model="rap", product="awp130pgrb", fxx=[0])
FH_nat = FastHerbie(dates, model="rap", product="awp130bgrb", fxx=[0])
[28]:
FH_prs.file_exists, FH_nat.file_exists
[28]:
([ββHerbie RAP model awp130pgrb product initialized 2024-Jan-01 00:00 UTC F00 β source=local,
ββHerbie RAP model awp130pgrb product initialized 2024-Jan-01 01:00 UTC F00 β source=aws,
ββHerbie RAP model awp130pgrb product initialized 2024-Jan-01 02:00 UTC F00 β source=aws],
[ββHerbie RAP model awp130bgrb product initialized 2024-Jan-01 00:00 UTC F00 β source=aws,
ββHerbie RAP model awp130bgrb product initialized 2024-Jan-01 01:00 UTC F00 β source=aws,
ββHerbie RAP model awp130bgrb product initialized 2024-Jan-01 02:00 UTC F00 β source=aws])
[4]:
# Get xarray data for pressure level data
ds_prs = [
H.xarray("(?:TMP:2 m|GRD:10 m|DPT:2 m|GUST|TMP:1000 mb|TMP:500 mb)")
for H in FH_prs.file_exists
]
# flatten the list of lists into just a list of Datasets
ds_prs = list(chain(*ds_prs))
len(ds_prs)
[4]:
12
[5]:
# Get xarray data for native level data
ds_nat = [H.xarray("(?:SOIL|VGTYP|TOSIL)") for H in FH_nat.file_exists]
# flatten the list of lists into just a list of Datasets
ds_nat = list(chain(*ds_nat))
len(ds_nat)
[5]:
6
[15]:
def merge_datasets(ds_list):
"""Merge list of Datasets together.
Since cfgrib doesn't merge data in different "hypercubes", we will
do the merge ourselves.
Parameters
----------
ds_list : list
A list of xarray.Datasets, usually from the list of datasets
returned by cfgrib when data is on multiple levels.
"""
these = []
for ds in ds_list:
ds = ds.drop_vars("gribfile_projection")
expand_dims = []
for i in [
"heightAboveGround",
"time",
"step",
"isobaricInhPa",
"depthBelowLandLayer",
]:
if i in ds and i not in ds.dims:
expand_dims.append(i)
these.append(ds.expand_dims(expand_dims))
return xr.merge(these, compat="override")
[16]:
# Merge prs and nat data into one dataframe
# (NOTE: I'm not 100% convinced I did the merge correct. Why are there
# so many NaN values?)
ds = merge_datasets(ds_prs + ds_nat)
ds
[16]:
<xarray.Dataset> Dimensions: (time: 3, step: 1, heightAboveGround: 2, y: 337, x: 451, isobaricInhPa: 2, depthBelowLandLayer: 9) Coordinates: * time (time) datetime64[ns] 2024-01-01 ... 2024-01-01T02:0... * step (step) timedelta64[ns] 00:00:00 * heightAboveGround (heightAboveGround) float64 2.0 10.0 latitude (y, x) float64 16.28 16.31 16.34 ... 55.54 55.51 55.48 longitude (y, x) float64 233.9 234.0 234.1 ... 302.3 302.4 302.6 valid_time datetime64[ns] 2024-01-01 * isobaricInhPa (isobaricInhPa) float64 1e+03 500.0 surface float64 0.0 * depthBelowLandLayer (depthBelowLandLayer) float64 0.0 0.01 0.04 ... 1.6 3.0 Dimensions without coordinates: y, x Data variables: u10 (heightAboveGround, time, step, y, x) float32 nan ..... v10 (heightAboveGround, time, step, y, x) float32 nan ..... t2m (heightAboveGround, time, step, y, x) float32 297.1 ... d2m (heightAboveGround, time, step, y, x) float32 291.1 ... t (time, step, isobaricInhPa, y, x) float32 295.4 ... nan gust (time, step, y, x) float32 6.305 6.18 6.055 ... nan nan st (time, step, depthBelowLandLayer, y, x) float32 298.... soilw (time, step, depthBelowLandLayer, y, x) float32 1.0 ... gppbfas (time, step, y, x) float32 17.0 17.0 17.0 ... nan nan Attributes: GRIB_edition: 2 GRIB_centre: kwbc GRIB_centreDescription: US National Weather Service - NCEP GRIB_subCentre: 0 Conventions: CF-1.7 institution: US National Weather Service - NCEP model: rap product: awp130pgrb description: Rapid Refresh (RAP) from NOMADS and Big Data Pro... remote_grib: /home/blaylock/data/rap/20240101/rap.t00z.awp130... local_grib: /home/blaylock/data/rap/20240101/subset_6bef9e80... search: (?:TMP:2 m|GRD:10 m|DPT:2 m|GUST|TMP:1000 mb|TMP...
[21]:
# Get data at a single point
point = ds.sel(x=100, y=100).squeeze()
point
# (Note: to get data nearest a specific lat/lon point, you would have to
# do a nearest neighbor search on the latitude/longitude coordinates
# get the index, and then query the specific x/y location. Maybe a Ball
# Tree algorithm could be used for that.)
[21]:
<xarray.Dataset> Dimensions: (time: 3, heightAboveGround: 2, isobaricInhPa: 2, depthBelowLandLayer: 9) Coordinates: * time (time) datetime64[ns] 2024-01-01 ... 2024-01-01T02:0... step timedelta64[ns] 00:00:00 * heightAboveGround (heightAboveGround) float64 2.0 10.0 latitude float64 30.52 longitude float64 244.4 valid_time datetime64[ns] 2024-01-01 * isobaricInhPa (isobaricInhPa) float64 1e+03 500.0 surface float64 0.0 * depthBelowLandLayer (depthBelowLandLayer) float64 0.0 0.01 0.04 ... 1.6 3.0 Data variables: u10 (heightAboveGround, time) float32 nan nan ... nan nan v10 (heightAboveGround, time) float32 nan nan ... nan nan t2m (heightAboveGround, time) float32 287.2 nan ... nan nan d2m (heightAboveGround, time) float32 280.6 nan ... nan nan t (time, isobaricInhPa) float32 289.6 255.8 ... nan nan gust (time) float32 2.868 nan nan st (time, depthBelowLandLayer) float32 287.9 288.2 ... nan soilw (time, depthBelowLandLayer) float32 0.081 ... nan gppbfas (time) float32 7.0 nan nan Attributes: GRIB_edition: 2 GRIB_centre: kwbc GRIB_centreDescription: US National Weather Service - NCEP GRIB_subCentre: 0 Conventions: CF-1.7 institution: US National Weather Service - NCEP model: rap product: awp130pgrb description: Rapid Refresh (RAP) from NOMADS and Big Data Pro... remote_grib: /home/blaylock/data/rap/20240101/rap.t00z.awp130... local_grib: /home/blaylock/data/rap/20240101/subset_6bef9e80... search: (?:TMP:2 m|GRD:10 m|DPT:2 m|GUST|TMP:1000 mb|TMP...
[29]:
point.u10
[29]:
<xarray.DataArray 'u10' (heightAboveGround: 2, time: 3)> array([[ nan, nan, nan], [1.8895483, nan, nan]], dtype=float32) Coordinates: * time (time) datetime64[ns] 2024-01-01 ... 2024-01-01T02:00:00 step timedelta64[ns] 00:00:00 * heightAboveGround (heightAboveGround) float64 2.0 10.0 latitude float64 30.52 longitude float64 244.4 valid_time datetime64[ns] 2024-01-01 surface float64 0.0 Attributes: (12/37) GRIB_paramId: 165 GRIB_dataType: fc GRIB_numberOfPoints: 151987 GRIB_typeOfLevel: heightAboveGround GRIB_stepUnits: 1 GRIB_stepType: instant ... ... GRIB_stepRange: 0 GRIB_units: m s**-1 long_name: 10 metre U wind component units: m s**-1 standard_name: eastward_wind grid_mapping: gribfile_projection