{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 🪄 Merge Datasets\n", "\n", "Often users want to get multiple files across several files (across time or forecast hours). Here is an example of how to get those data and merge them into a single xarray Dataset.\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from herbie import Herbie, FastHerbie\n", "import xarray as xr\n", "import pandas as pd\n", "\n", "from itertools import chain" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Create multiple Herbie objects for a range of dates\n", "# Some data we want is in the RAP pressure grid file while other data is\n", "# in the RAP native grid file.\n", "\n", "dates = pd.date_range(\"2024-01-01\", periods=3, freq=\"1H\")\n", "\n", "FH_prs = FastHerbie(dates, model=\"rap\", product=\"awp130pgrb\", fxx=[0])\n", "FH_nat = FastHerbie(dates, model=\"rap\", product=\"awp130bgrb\", fxx=[0])" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "([\u001b[48;2;255;255;255m\u001b[38;2;136;33;27m▌\u001b[0m\u001b[38;2;12;53;118m\u001b[48;2;240;234;210m▌\u001b[38;2;0;0;0m\u001b[1mHerbie\u001b[0m RAP model \u001b[3mawp130pgrb\u001b[0m product initialized \u001b[38;2;41;130;13m2024-Jan-01 00:00 UTC\u001b[92m F00\u001b[0m ┊ \u001b[38;2;255;153;0m\u001b[3msource=local\u001b[0m,\n", " \u001b[48;2;255;255;255m\u001b[38;2;136;33;27m▌\u001b[0m\u001b[38;2;12;53;118m\u001b[48;2;240;234;210m▌\u001b[38;2;0;0;0m\u001b[1mHerbie\u001b[0m RAP model \u001b[3mawp130pgrb\u001b[0m product initialized \u001b[38;2;41;130;13m2024-Jan-01 01:00 UTC\u001b[92m F00\u001b[0m ┊ \u001b[38;2;255;153;0m\u001b[3msource=aws\u001b[0m,\n", " \u001b[48;2;255;255;255m\u001b[38;2;136;33;27m▌\u001b[0m\u001b[38;2;12;53;118m\u001b[48;2;240;234;210m▌\u001b[38;2;0;0;0m\u001b[1mHerbie\u001b[0m RAP model \u001b[3mawp130pgrb\u001b[0m product initialized \u001b[38;2;41;130;13m2024-Jan-01 02:00 UTC\u001b[92m F00\u001b[0m ┊ \u001b[38;2;255;153;0m\u001b[3msource=aws\u001b[0m],\n", " [\u001b[48;2;255;255;255m\u001b[38;2;136;33;27m▌\u001b[0m\u001b[38;2;12;53;118m\u001b[48;2;240;234;210m▌\u001b[38;2;0;0;0m\u001b[1mHerbie\u001b[0m RAP model \u001b[3mawp130bgrb\u001b[0m product initialized \u001b[38;2;41;130;13m2024-Jan-01 00:00 UTC\u001b[92m F00\u001b[0m ┊ \u001b[38;2;255;153;0m\u001b[3msource=aws\u001b[0m,\n", " \u001b[48;2;255;255;255m\u001b[38;2;136;33;27m▌\u001b[0m\u001b[38;2;12;53;118m\u001b[48;2;240;234;210m▌\u001b[38;2;0;0;0m\u001b[1mHerbie\u001b[0m RAP model \u001b[3mawp130bgrb\u001b[0m product initialized \u001b[38;2;41;130;13m2024-Jan-01 01:00 UTC\u001b[92m F00\u001b[0m ┊ \u001b[38;2;255;153;0m\u001b[3msource=aws\u001b[0m,\n", " \u001b[48;2;255;255;255m\u001b[38;2;136;33;27m▌\u001b[0m\u001b[38;2;12;53;118m\u001b[48;2;240;234;210m▌\u001b[38;2;0;0;0m\u001b[1mHerbie\u001b[0m RAP model \u001b[3mawp130bgrb\u001b[0m product initialized \u001b[38;2;41;130;13m2024-Jan-01 02:00 UTC\u001b[92m F00\u001b[0m ┊ \u001b[38;2;255;153;0m\u001b[3msource=aws\u001b[0m])" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "FH_prs.file_exists, FH_nat.file_exists" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "12" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get xarray data for pressure level data\n", "ds_prs = [\n", " H.xarray(\"(?:TMP:2 m|GRD:10 m|DPT:2 m|GUST|TMP:1000 mb|TMP:500 mb)\")\n", " for H in FH_prs.file_exists\n", "]\n", "\n", "# flatten the list of lists into just a list of Datasets\n", "ds_prs = list(chain(*ds_prs))\n", "len(ds_prs)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "6" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get xarray data for native level data\n", "ds_nat = [H.xarray(\"(?:SOIL|VGTYP|TOSIL)\") for H in FH_nat.file_exists]\n", "\n", "# flatten the list of lists into just a list of Datasets\n", "ds_nat = list(chain(*ds_nat))\n", "len(ds_nat)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "def merge_datasets(ds_list):\n", " \"\"\"Merge list of Datasets together.\n", "\n", " Since cfgrib doesn't merge data in different \"hypercubes\", we will\n", " do the merge ourselves.\n", "\n", " Parameters\n", " ----------\n", " ds_list : list\n", " A list of xarray.Datasets, usually from the list of datasets\n", " returned by cfgrib when data is on multiple levels.\n", " \"\"\"\n", " these = []\n", " for ds in ds_list:\n", " ds = ds.drop_vars(\"gribfile_projection\")\n", " expand_dims = []\n", " for i in [\n", " \"heightAboveGround\",\n", " \"time\",\n", " \"step\",\n", " \"isobaricInhPa\",\n", " \"depthBelowLandLayer\",\n", " ]:\n", " if i in ds and i not in ds.dims:\n", " expand_dims.append(i)\n", " these.append(ds.expand_dims(expand_dims))\n", " return xr.merge(these, compat=\"override\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<xarray.Dataset>\n",
"Dimensions: (time: 3, step: 1, heightAboveGround: 2, y: 337,\n",
" x: 451, isobaricInhPa: 2, depthBelowLandLayer: 9)\n",
"Coordinates:\n",
" * time (time) datetime64[ns] 2024-01-01 ... 2024-01-01T02:0...\n",
" * step (step) timedelta64[ns] 00:00:00\n",
" * heightAboveGround (heightAboveGround) float64 2.0 10.0\n",
" latitude (y, x) float64 16.28 16.31 16.34 ... 55.54 55.51 55.48\n",
" longitude (y, x) float64 233.9 234.0 234.1 ... 302.3 302.4 302.6\n",
" valid_time datetime64[ns] 2024-01-01\n",
" * isobaricInhPa (isobaricInhPa) float64 1e+03 500.0\n",
" surface float64 0.0\n",
" * depthBelowLandLayer (depthBelowLandLayer) float64 0.0 0.01 0.04 ... 1.6 3.0\n",
"Dimensions without coordinates: y, x\n",
"Data variables:\n",
" u10 (heightAboveGround, time, step, y, x) float32 nan .....\n",
" v10 (heightAboveGround, time, step, y, x) float32 nan .....\n",
" t2m (heightAboveGround, time, step, y, x) float32 297.1 ...\n",
" d2m (heightAboveGround, time, step, y, x) float32 291.1 ...\n",
" t (time, step, isobaricInhPa, y, x) float32 295.4 ... nan\n",
" gust (time, step, y, x) float32 6.305 6.18 6.055 ... nan nan\n",
" st (time, step, depthBelowLandLayer, y, x) float32 298....\n",
" soilw (time, step, depthBelowLandLayer, y, x) float32 1.0 ...\n",
" gppbfas (time, step, y, x) float32 17.0 17.0 17.0 ... nan nan\n",
"Attributes:\n",
" GRIB_edition: 2\n",
" GRIB_centre: kwbc\n",
" GRIB_centreDescription: US National Weather Service - NCEP\n",
" GRIB_subCentre: 0\n",
" Conventions: CF-1.7\n",
" institution: US National Weather Service - NCEP\n",
" model: rap\n",
" product: awp130pgrb\n",
" description: Rapid Refresh (RAP) from NOMADS and Big Data Pro...\n",
" remote_grib: /home/blaylock/data/rap/20240101/rap.t00z.awp130...\n",
" local_grib: /home/blaylock/data/rap/20240101/subset_6bef9e80...\n",
" search: (?:TMP:2 m|GRD:10 m|DPT:2 m|GUST|TMP:1000 mb|TMP...<xarray.Dataset>\n",
"Dimensions: (time: 3, heightAboveGround: 2, isobaricInhPa: 2,\n",
" depthBelowLandLayer: 9)\n",
"Coordinates:\n",
" * time (time) datetime64[ns] 2024-01-01 ... 2024-01-01T02:0...\n",
" step timedelta64[ns] 00:00:00\n",
" * heightAboveGround (heightAboveGround) float64 2.0 10.0\n",
" latitude float64 30.52\n",
" longitude float64 244.4\n",
" valid_time datetime64[ns] 2024-01-01\n",
" * isobaricInhPa (isobaricInhPa) float64 1e+03 500.0\n",
" surface float64 0.0\n",
" * depthBelowLandLayer (depthBelowLandLayer) float64 0.0 0.01 0.04 ... 1.6 3.0\n",
"Data variables:\n",
" u10 (heightAboveGround, time) float32 nan nan ... nan nan\n",
" v10 (heightAboveGround, time) float32 nan nan ... nan nan\n",
" t2m (heightAboveGround, time) float32 287.2 nan ... nan nan\n",
" d2m (heightAboveGround, time) float32 280.6 nan ... nan nan\n",
" t (time, isobaricInhPa) float32 289.6 255.8 ... nan nan\n",
" gust (time) float32 2.868 nan nan\n",
" st (time, depthBelowLandLayer) float32 287.9 288.2 ... nan\n",
" soilw (time, depthBelowLandLayer) float32 0.081 ... nan\n",
" gppbfas (time) float32 7.0 nan nan\n",
"Attributes:\n",
" GRIB_edition: 2\n",
" GRIB_centre: kwbc\n",
" GRIB_centreDescription: US National Weather Service - NCEP\n",
" GRIB_subCentre: 0\n",
" Conventions: CF-1.7\n",
" institution: US National Weather Service - NCEP\n",
" model: rap\n",
" product: awp130pgrb\n",
" description: Rapid Refresh (RAP) from NOMADS and Big Data Pro...\n",
" remote_grib: /home/blaylock/data/rap/20240101/rap.t00z.awp130...\n",
" local_grib: /home/blaylock/data/rap/20240101/subset_6bef9e80...\n",
" search: (?:TMP:2 m|GRD:10 m|DPT:2 m|GUST|TMP:1000 mb|TMP...<xarray.DataArray 'u10' (heightAboveGround: 2, time: 3)>\n",
"array([[ nan, nan, nan],\n",
" [1.8895483, nan, nan]], dtype=float32)\n",
"Coordinates:\n",
" * time (time) datetime64[ns] 2024-01-01 ... 2024-01-01T02:00:00\n",
" step timedelta64[ns] 00:00:00\n",
" * heightAboveGround (heightAboveGround) float64 2.0 10.0\n",
" latitude float64 30.52\n",
" longitude float64 244.4\n",
" valid_time datetime64[ns] 2024-01-01\n",
" surface float64 0.0\n",
"Attributes: (12/37)\n",
" GRIB_paramId: 165\n",
" GRIB_dataType: fc\n",
" GRIB_numberOfPoints: 151987\n",
" GRIB_typeOfLevel: heightAboveGround\n",
" GRIB_stepUnits: 1\n",
" GRIB_stepType: instant\n",
" ... ...\n",
" GRIB_stepRange: 0\n",
" GRIB_units: m s**-1\n",
" long_name: 10 metre U wind component\n",
" units: m s**-1\n",
" standard_name: eastward_wind\n",
" grid_mapping: gribfile_projection