From f04e510b8dda2229db77c1e3212691976af9b4e6 Mon Sep 17 00:00:00 2001 From: finnwimberly Date: Wed, 20 Aug 2025 21:42:50 +0000 Subject: [PATCH 1/2] latest version --- contributor_folders/finn/data_readin.ipynb | 3 --- 1 file changed, 3 deletions(-) diff --git a/contributor_folders/finn/data_readin.ipynb b/contributor_folders/finn/data_readin.ipynb index 2ea91de..8921724 100644 --- a/contributor_folders/finn/data_readin.ipynb +++ b/contributor_folders/finn/data_readin.ipynb @@ -25,9 +25,6 @@ "source": [ "### helper functions to normalize coords\n", "\n", - "# --------------------- helpers (unchanged from previous) --------------------- #\n", - "# Note: These helpers are kept as they perform general-purpose coordinate and variable handling.\n", - "\n", "def _select_variable(ds: xr.Dataset, var: Union[str, Dict[str, str]]) -> str:\n", " \"\"\"\n", " Pick a variable name from a Dataset.\n", From b19f50003b0e6e9a36bc64c8cd03e14ed9e61b7f Mon Sep 17 00:00:00 2001 From: finnwimberly Date: Thu, 21 Aug 2025 18:04:46 +0000 Subject: [PATCH 2/2] big push --- .gitignore | 2 + .../aidan/__pycache__/dataset.cpython-312.pyc | Bin 0 -> 1947 bytes contributor_folders/aidan/dataset.py | 35 + contributor_folders/aidan/datasets.ipynb | 134 + .../aidan/explore_langchain.ipynb | 763 ++ contributor_folders/finn/data_readin.ipynb | 7503 ++++++++++++++++- 6 files changed, 8314 insertions(+), 123 deletions(-) create mode 100644 .gitignore create mode 100644 contributor_folders/aidan/__pycache__/dataset.cpython-312.pyc create mode 100644 contributor_folders/aidan/dataset.py create mode 100644 contributor_folders/aidan/datasets.ipynb create mode 100644 contributor_folders/aidan/explore_langchain.ipynb diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..51fc844 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.env +**/.ipynb_checkpoints/* diff --git a/contributor_folders/aidan/__pycache__/dataset.cpython-312.pyc b/contributor_folders/aidan/__pycache__/dataset.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6342db6e04e34eb13488e1ee4da450dc969f3f3 GIT binary patch literal 1947 zcmbtV&5I*N6tC*8B%Mq?CLc4b?m9A#2n}LHP%q1ZtDqxJ_TUGaOHq@qF}?Ihtm;|K z#mkH^dz;((2Z*rvANVhL%p5|2c<|&+nD*ky_o|a68zL^)(63+rURCG!e($}?FO5cx z!1ei^AICp9g#3;#%d50-u73vS5eY~@BNEdgjh&%G@!g5sm<<^v4tbvh?l}pVWIwsZ zzG2CtB|yt7XxX9_K)n^TV$mv~wH4H}=oX;$6|`#62B6Iqv}Vy3pxZ0x*0k++{y;*$ zW4wElEKCy-8TL`AGoPC3U7_Vs8pvo=kS^>Z19JTmygwq63~4}y&Zk7Wk_GfOpw@HH zo4P(T%}-=JNtKB1rl(1uO-*M)WjqUGX*`(()*C!78ISWs#FBGU<2+7-(+K@K=U<(Q zXc2LBrc8@-ktAs*kh13dF}X%^e-HZOG?xAQ>4T|A`sw)VH{RqEmEPxp$V4FY_&60R z;87I!M`@C&@c1-K6+cO%Kq}oAVF253t(Mtqlc}n}N3CJt=sKA1$b7qdc5mKnpC76A zGQ3%!w;=pq0$HitC!#ScTFEb)#>W%L?ullVI81mXGE+Ed0*?4*fuNHn1HYuUkP-4! z4KSv35~U(jEd;jFSjHXndQ0Phqj$kTYK_j>;3;!HD7N70%}G9U7nI)_jilDbn?M>T zsfrDQ495e7L$=QDCSqusa4&^#w$F~{wZ{2j!3o3vlau$EP@y=Eq-j7;CIL1F_LKk$FCs%UsHR4mpkH@)#&+lJ-WBT=9C5*Bj(ZSUZ;s~xlci#YrjGBL>XM%} z)dvfzdf=DW>a4ImEt!p_KENm%tnyOKR`>FQhu>bY7Yd>n-c0mEh^!M`A9GoP(J#6f zjJZ1J@@9Je-RTj2| z27Yy20Mx(8-HxnnZkx7kq_x(61s9`v9&U|ahSJpmJIg`9ZJTv^rh>K?xxbr6ksRUaWxU0C^RIQabT^IgtH4E}!5LiEwb{9R`(b>= zZWeXL@W8K|ayFfW$(PE-J=iWVhf_H1vT&sEzpOawtZr5^+ul||`}6jvo|e&gw~HhC zj(QzFK;hOL%%3i$^f$8qE7{ALLpzsWke&RSiG5g4&Aw!;+mb@ gqjc}GniF{BUYT|;!koY(-=_4|_tn1uzh%?<2c}V#Jpcdz literal 0 HcmV?d00001 diff --git a/contributor_folders/aidan/dataset.py b/contributor_folders/aidan/dataset.py new file mode 100644 index 0000000..2160352 --- /dev/null +++ b/contributor_folders/aidan/dataset.py @@ -0,0 +1,35 @@ +from typing import Optional, List +from pydantic import BaseModel + +class TemporalBounds(BaseModel): + start_time: str + end_time: str + +class SpatialBounds(BaseModel): + min_lat: float + min_lon: float + max_lat: float + max_lon: float + +class Access(BaseModel): + platform: str + path: str + +class Variable(BaseModel): + standard_name: str + description: str + +class Variables(BaseModel): + variables: List[Variable] + + +class Dataset(BaseModel): + name: str + description: str + temporal_bounds: TemporalBounds + spatial_bounds: SpatialBounds + variables: Variables + access: Access + +class DatasetCollection(BaseModel): + datasets: List[Dataset] \ No newline at end of file diff --git a/contributor_folders/aidan/datasets.ipynb b/contributor_folders/aidan/datasets.ipynb new file mode 100644 index 0000000..5df2250 --- /dev/null +++ b/contributor_folders/aidan/datasets.ipynb @@ -0,0 +1,134 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "f571ec6c-d60e-4049-a198-25b4dad1b7bb", + "metadata": {}, + "outputs": [], + "source": [ + "from dataset import Dataset, SpatialBounds, TemporalBounds, Variables, Access, Variable, DatasetCollection\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e1f9d631-4ba9-435c-bddf-97962db3ccce", + "metadata": {}, + "outputs": [], + "source": [ + "d = Dataset(\n", + " name=\"Multi-Scale Ultra High Resolution (MUR) Sea Surface Temperature (SST)\",\n", + " description=\"A global, gap-free, gridded, daily 1 km Sea Surface Temperature (SST) dataset created by merging multiple Level-2 satellite SST datasets. Those input datasets include the NASA Advanced Microwave Scanning Radiometer-EOS (AMSR-E), the JAXA Advanced Microwave Scanning Radiometer 2 (AMSR-2) on GCOM-W1, the Moderate Resolution Imaging Spectroradiometers (MODIS) on the NASA Aqua and Terra platforms, the US Navy microwave WindSat radiometer, the Advanced Very High Resolution Radiometer (AVHRR) on several NOAA satellites, and in situ SST observations from the NOAA iQuam project. Data are available from 2002 to present in Zarr format. The original source of the MUR data is the NASA JPL Physical Oceanography DAAC.\",\n", + " spatial_bounds=SpatialBounds(\n", + " min_lat=1.0,\n", + " min_lon=1.0,\n", + " max_lat=1.0,\n", + " max_lon=1.0\n", + " ),\n", + " temporal_bounds=TemporalBounds(\n", + " start_time=\"1234\",\n", + " end_time=\"4567\"\n", + " ),\n", + " variables=Variables(\n", + " variables=[Variable(standard_name=\"water temp\", description=\"how hot da water\")]\n", + " ),\n", + " access=Access(\n", + " platform=\"aws\",\n", + " path=\"s3://path_to_file.zarr\"\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0440e690-0fc9-48af-a10a-41283d7bc009", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'name': 'Multi-Scale Ultra High Resolution (MUR) Sea Surface Temperature (SST)',\n", + " 'description': 'A global, gap-free, gridded, daily 1 km Sea Surface Temperature (SST) dataset created by merging multiple Level-2 satellite SST datasets. Those input datasets include the NASA Advanced Microwave Scanning Radiometer-EOS (AMSR-E), the JAXA Advanced Microwave Scanning Radiometer 2 (AMSR-2) on GCOM-W1, the Moderate Resolution Imaging Spectroradiometers (MODIS) on the NASA Aqua and Terra platforms, the US Navy microwave WindSat radiometer, the Advanced Very High Resolution Radiometer (AVHRR) on several NOAA satellites, and in situ SST observations from the NOAA iQuam project. Data are available from 2002 to present in Zarr format. The original source of the MUR data is the NASA JPL Physical Oceanography DAAC.',\n", + " 'temporal_bounds': {'start_time': '1234', 'end_time': '4567'},\n", + " 'spatial_bounds': {'min_lat': 1.0,\n", + " 'min_lon': 1.0,\n", + " 'max_lat': 1.0,\n", + " 'max_lon': 1.0},\n", + " 'variables': {'variables': [{'standard_name': 'water temp',\n", + " 'description': 'how hot da water'}]},\n", + " 'access': {'platform': 'aws', 'path': 's3://path_to_file.zarr'}}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "json.loads(d.json())" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ed735e0c-28b6-41e0-a36e-e551627b1af5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'datasets': [{'name': 'Multi-Scale Ultra High Resolution (MUR) Sea Surface Temperature (SST)',\n", + " 'description': 'A global, gap-free, gridded, daily 1 km Sea Surface Temperature (SST) dataset created by merging multiple Level-2 satellite SST datasets. Those input datasets include the NASA Advanced Microwave Scanning Radiometer-EOS (AMSR-E), the JAXA Advanced Microwave Scanning Radiometer 2 (AMSR-2) on GCOM-W1, the Moderate Resolution Imaging Spectroradiometers (MODIS) on the NASA Aqua and Terra platforms, the US Navy microwave WindSat radiometer, the Advanced Very High Resolution Radiometer (AVHRR) on several NOAA satellites, and in situ SST observations from the NOAA iQuam project. Data are available from 2002 to present in Zarr format. The original source of the MUR data is the NASA JPL Physical Oceanography DAAC.',\n", + " 'temporal_bounds': {'start_time': '1234', 'end_time': '4567'},\n", + " 'spatial_bounds': {'min_lat': 1.0,\n", + " 'min_lon': 1.0,\n", + " 'max_lat': 1.0,\n", + " 'max_lon': 1.0},\n", + " 'variables': {'variables': [{'standard_name': 'water temp',\n", + " 'description': 'how hot da water'}]},\n", + " 'access': {'platform': 'aws', 'path': 's3://path_to_file.zarr'}}]}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "json.loads(DatasetCollection(datasets=[d]).json())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cca20b5c-7b2d-4b2d-844c-011d301c2ba8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (Pixi)", + "language": "python", + "name": "pixi-kernel-python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/contributor_folders/aidan/explore_langchain.ipynb b/contributor_folders/aidan/explore_langchain.ipynb new file mode 100644 index 0000000..879b811 --- /dev/null +++ b/contributor_folders/aidan/explore_langchain.ipynb @@ -0,0 +1,763 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "id": "8d62f199-5830-460c-b996-e4c1c6b374d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_openai import ChatOpenAI\n", + "import os \n", + "from dotenv import load_dotenv\n", + "load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7b3011e2-2ad1-4cb6-b5ef-19feb9f5022f", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.prompts import ChatPromptTemplate\n", + "from langchain_core.output_parsers import StrOutputParser" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9c930d1b-8fac-4d3e-b545-3b4930143128", + "metadata": {}, + "outputs": [], + "source": [ + "HF_TOKEN = os.environ[\"HF_TOKEN\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "aa7de930-95fe-4e73-a89a-6c6738b725af", + "metadata": {}, + "outputs": [], + "source": [ + "llm = ChatOpenAI(\n", + " base_url=\"https://router.huggingface.co/v1\",\n", + " api_key=HF_TOKEN,\n", + " model=\"openai/gpt-oss-20b:fireworks-ai\" \n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "41ea9225-21aa-4026-8393-7109f5401df2", + "metadata": {}, + "outputs": [], + "source": [ + "prompt = ChatPromptTemplate.from_messages([\n", + " (\"system\", \"You shall answer my questions\"),\n", + " (\"human\", \"{question}\")\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0bf8c583-f053-41d9-bfe4-eddfcd00f528", + "metadata": {}, + "outputs": [], + "source": [ + "chain_no_context = prompt | llm | StrOutputParser()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e706ae18-ac5e-4669-aac1-2f6bb98337bf", + "metadata": {}, + "outputs": [], + "source": [ + "resp = chain_no_context.invoke({\"question\": \"hey buddy\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "d3caa552-8b70-4d44-b31b-209c730f51f5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Hey there! How can I help you today?'" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "resp" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c48c0b5a-95e0-45ed-bca7-033baecac369", + "metadata": {}, + "outputs": [], + "source": [ + "from arraylake import Client" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "0001cc2c-3fd2-4665-9f7f-7dfabf86fb44", + "metadata": {}, + "outputs": [], + "source": [ + "import xarray as xr" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "86001908-aa14-44b5-9f56-06e7bbe12970", + "metadata": {}, + "outputs": [], + "source": [ + "def read_arraylake_dataset(repo_name: str, group_name: str, variable: str = \"all\", branch: str = \"main\") -> xr.Dataset:\n", + " \n", + " client = Client()\n", + " \n", + " repo = client.get_repo(repo_name)\n", + " \n", + " session = repo.readonly_session(branch=branch)\n", + " \n", + " ds = xr.open_zarr(session.store, group=group_name, consolidated=False)\n", + "\n", + " if variable != \"all\":\n", + " ds = ds[variable]\n", + " \n", + " return ds\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3acde5d4-3299-4a30-8624-2f40af33bf17", + "metadata": {}, + "outputs": [], + "source": [ + "read_arraylake_dataset(\n", + " \"earthmover-public/era5-surface-aws\",\n", + " \"spatial\",\n", + " \"sst\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "00fff333-8015-4776-92bc-88700297235a", + "metadata": {}, + "outputs": [], + "source": [ + "def read_dataset(\n", + " source: str,\n", + " **kwargs\n", + "):\n", + " if source == \"arraylake\":\n", + " return read_arraylake_dataset(**kwargs)\n", + " else:\n", + " # this is where we will have functions for \n", + " # aws, gcp, etc.\n", + " raise NotImplementedError()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "36d07847-96f0-40f7-965d-0894afd7b837", + "metadata": {}, + "outputs": [], + "source": [ + "ds = read_dataset(\n", + " \"arraylake\",\n", + " repo_name=\"earthmover-public/era5-surface-aws\",\n", + " group_name=\"spatial\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "cc7498ef-3aea-4747-ba81-f5aaf83575de", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Frozen({'blh': Size: 2TB\n", + "dask.array\n", + "Attributes: (12/14)\n", + " long_name: Boundary layer height\n", + " short_name: blh\n", + " units: m\n", + " original_format: WMO GRIB 1 with ECMWF...\n", + " ecmwf_local_table: 128\n", + " ecmwf_parameter: 159\n", + " ... ...\n", + " grid_specification: 0.25 degree x 0.25 de...\n", + " rda_dataset: ds633.0\n", + " rda_dataset_url: https:/rda.ucar.edu/d...\n", + " rda_dataset_doi: DOI: 10.5065/BH6N-5N20\n", + " rda_dataset_group: ERA5 atmospheric surf...\n", + " QuantizeGranularBitGroomNumberOfSignificantDigits: 7, 'd2': Size: 2TB\n", + "dask.array\n", + "Attributes: (12/14)\n", + " long_name: 2 metre dewpoint temp...\n", + " short_name: 2d\n", + " units: K\n", + " original_format: WMO GRIB 1 with ECMWF...\n", + " ecmwf_local_table: 128\n", + " ecmwf_parameter: 168\n", + " ... ...\n", + " grid_specification: 0.25 degree x 0.25 de...\n", + " rda_dataset: ds633.0\n", + " rda_dataset_url: https:/rda.ucar.edu/d...\n", + " rda_dataset_doi: DOI: 10.5065/BH6N-5N20\n", + " rda_dataset_group: ERA5 atmospheric surf...\n", + " QuantizeGranularBitGroomNumberOfSignificantDigits: 7, 'cape': Size: 2TB\n", + "dask.array\n", + "Attributes: (12/14)\n", + " long_name: Convective available ...\n", + " short_name: cape\n", + " units: J kg**-1\n", + " original_format: WMO GRIB 1 with ECMWF...\n", + " ecmwf_local_table: 128\n", + " ecmwf_parameter: 59\n", + " ... ...\n", + " grid_specification: 0.25 degree x 0.25 de...\n", + " rda_dataset: ds633.0\n", + " rda_dataset_url: https:/rda.ucar.edu/d...\n", + " rda_dataset_doi: DOI: 10.5065/BH6N-5N20\n", + " rda_dataset_group: ERA5 atmospheric surf...\n", + " QuantizeGranularBitGroomNumberOfSignificantDigits: 7, 'mslp': Size: 2TB\n", + "dask.array\n", + "Attributes: (12/14)\n", + " long_name: Mean sea level pressure\n", + " short_name: msl\n", + " units: Pa\n", + " original_format: WMO GRIB 1 with ECMWF...\n", + " ecmwf_local_table: 128\n", + " ecmwf_parameter: 151\n", + " ... ...\n", + " grid_specification: 0.25 degree x 0.25 de...\n", + " rda_dataset: ds633.0\n", + " rda_dataset_url: https:/rda.ucar.edu/d...\n", + " rda_dataset_doi: DOI: 10.5065/BH6N-5N20\n", + " rda_dataset_group: ERA5 atmospheric surf...\n", + " QuantizeGranularBitGroomNumberOfSignificantDigits: 7, 'sd': Size: 2TB\n", + "dask.array\n", + "Attributes: (12/14)\n", + " long_name: Snow depth\n", + " short_name: sd\n", + " units: m of water equivalent\n", + " original_format: WMO GRIB 1 with ECMWF...\n", + " ecmwf_local_table: 128\n", + " ecmwf_parameter: 141\n", + " ... ...\n", + " grid_specification: 0.25 degree x 0.25 de...\n", + " rda_dataset: ds633.0\n", + " rda_dataset_url: https:/rda.ucar.edu/d...\n", + " rda_dataset_doi: DOI: 10.5065/BH6N-5N20\n", + " rda_dataset_group: ERA5 atmospheric surf...\n", + " QuantizeGranularBitGroomNumberOfSignificantDigits: 7, 'swvl1': Size: 2TB\n", + "dask.array\n", + "Attributes: (12/14)\n", + " long_name: Volumetric soil water...\n", + " short_name: swvl1\n", + " units: m**3 m**-3\n", + " original_format: WMO GRIB 1 with ECMWF...\n", + " ecmwf_local_table: 128\n", + " ecmwf_parameter: 39\n", + " ... ...\n", + " grid_specification: 0.25 degree x 0.25 de...\n", + " rda_dataset: ds633.0\n", + " rda_dataset_url: https:/rda.ucar.edu/d...\n", + " rda_dataset_doi: DOI: 10.5065/BH6N-5N20\n", + " rda_dataset_group: ERA5 atmospheric surf...\n", + " QuantizeGranularBitGroomNumberOfSignificantDigits: 7, 'sst': Size: 2TB\n", + "dask.array\n", + "Attributes: (12/14)\n", + " long_name: Sea surface temperature\n", + " short_name: sstk\n", + " units: K\n", + " original_format: WMO GRIB 1 with ECMWF...\n", + " ecmwf_local_table: 128\n", + " ecmwf_parameter: 34\n", + " ... ...\n", + " grid_specification: 0.25 degree x 0.25 de...\n", + " rda_dataset: ds633.0\n", + " rda_dataset_url: https:/rda.ucar.edu/d...\n", + " rda_dataset_doi: DOI: 10.5065/BH6N-5N20\n", + " rda_dataset_group: ERA5 atmospheric surf...\n", + " QuantizeGranularBitGroomNumberOfSignificantDigits: 7, 't2': Size: 2TB\n", + "dask.array\n", + "Attributes: (12/14)\n", + " long_name: 2 metre temperature\n", + " short_name: 2t\n", + " units: K\n", + " original_format: WMO GRIB 1 with ECMWF...\n", + " ecmwf_local_table: 128\n", + " ecmwf_parameter: 167\n", + " ... ...\n", + " grid_specification: 0.25 degree x 0.25 de...\n", + " rda_dataset: ds633.0\n", + " rda_dataset_url: https:/rda.ucar.edu/d...\n", + " rda_dataset_doi: DOI: 10.5065/BH6N-5N20\n", + " rda_dataset_group: ERA5 atmospheric surf...\n", + " QuantizeGranularBitGroomNumberOfSignificantDigits: 7, 'tcc': Size: 2TB\n", + "dask.array\n", + "Attributes: (12/14)\n", + " long_name: Total cloud cover\n", + " short_name: tcc\n", + " units: (0-1)\n", + " original_format: WMO GRIB 1 with ECMWF...\n", + " ecmwf_local_table: 128\n", + " ecmwf_parameter: 164\n", + " ... ...\n", + " grid_specification: 0.25 degree x 0.25 de...\n", + " rda_dataset: ds633.0\n", + " rda_dataset_url: https:/rda.ucar.edu/d...\n", + " rda_dataset_doi: DOI: 10.5065/BH6N-5N20\n", + " rda_dataset_group: ERA5 atmospheric surf...\n", + " QuantizeGranularBitGroomNumberOfSignificantDigits: 7, 'tcw': Size: 2TB\n", + "dask.array\n", + "Attributes: (12/14)\n", + " long_name: Total column water\n", + " short_name: tcw\n", + " units: kg m**-2\n", + " original_format: WMO GRIB 1 with ECMWF...\n", + " ecmwf_local_table: 128\n", + " ecmwf_parameter: 136\n", + " ... ...\n", + " grid_specification: 0.25 degree x 0.25 de...\n", + " rda_dataset: ds633.0\n", + " rda_dataset_url: https:/rda.ucar.edu/d...\n", + " rda_dataset_doi: DOI: 10.5065/BH6N-5N20\n", + " rda_dataset_group: ERA5 atmospheric surf...\n", + " QuantizeGranularBitGroomNumberOfSignificantDigits: 7, 'skt': Size: 2TB\n", + "dask.array\n", + "Attributes: (12/14)\n", + " long_name: Skin temperature\n", + " short_name: skt\n", + " units: K\n", + " original_format: WMO GRIB 1 with ECMWF...\n", + " ecmwf_local_table: 128\n", + " ecmwf_parameter: 235\n", + " ... ...\n", + " grid_specification: 0.25 degree x 0.25 de...\n", + " rda_dataset: ds633.0\n", + " rda_dataset_url: https:/rda.ucar.edu/d...\n", + " rda_dataset_doi: DOI: 10.5065/BH6N-5N20\n", + " rda_dataset_group: ERA5 atmospheric surf...\n", + " QuantizeGranularBitGroomNumberOfSignificantDigits: 7, 'stl1': Size: 2TB\n", + "dask.array\n", + "Attributes: (12/14)\n", + " long_name: Soil temperature level 1\n", + " short_name: stl1\n", + " units: K\n", + " original_format: WMO GRIB 1 with ECMWF...\n", + " ecmwf_local_table: 128\n", + " ecmwf_parameter: 139\n", + " ... ...\n", + " grid_specification: 0.25 degree x 0.25 de...\n", + " rda_dataset: ds633.0\n", + " rda_dataset_url: https:/rda.ucar.edu/d...\n", + " rda_dataset_doi: DOI: 10.5065/BH6N-5N20\n", + " rda_dataset_group: ERA5 atmospheric surf...\n", + " QuantizeGranularBitGroomNumberOfSignificantDigits: 7, 'tcwv': Size: 2TB\n", + "dask.array\n", + "Attributes: (12/14)\n", + " long_name: Total column water va...\n", + " short_name: tcwv\n", + " units: kg m**-2\n", + " original_format: WMO GRIB 1 with ECMWF...\n", + " ecmwf_local_table: 128\n", + " ecmwf_parameter: 137\n", + " ... ...\n", + " grid_specification: 0.25 degree x 0.25 de...\n", + " rda_dataset: ds633.0\n", + " rda_dataset_url: https:/rda.ucar.edu/d...\n", + " rda_dataset_doi: DOI: 10.5065/BH6N-5N20\n", + " rda_dataset_group: ERA5 atmospheric surf...\n", + " QuantizeGranularBitGroomNumberOfSignificantDigits: 7, 'u10': Size: 2TB\n", + "dask.array\n", + "Attributes: (12/14)\n", + " long_name: 10 metre U wind compo...\n", + " short_name: 10u\n", + " units: m s**-1\n", + " original_format: WMO GRIB 1 with ECMWF...\n", + " ecmwf_local_table: 128\n", + " ecmwf_parameter: 165\n", + " ... ...\n", + " grid_specification: 0.25 degree x 0.25 de...\n", + " rda_dataset: ds633.0\n", + " rda_dataset_url: https:/rda.ucar.edu/d...\n", + " rda_dataset_doi: DOI: 10.5065/BH6N-5N20\n", + " rda_dataset_group: ERA5 atmospheric surf...\n", + " QuantizeGranularBitGroomNumberOfSignificantDigits: 7, 'u100': Size: 2TB\n", + "dask.array\n", + "Attributes: (12/14)\n", + " long_name: 100 metre U wind comp...\n", + " short_name: 100u\n", + " units: m s**-1\n", + " original_format: WMO GRIB 1 with ECMWF...\n", + " ecmwf_local_table: 228\n", + " ecmwf_parameter: 246\n", + " ... ...\n", + " grid_specification: 0.25 degree x 0.25 de...\n", + " rda_dataset: ds633.0\n", + " rda_dataset_url: https:/rda.ucar.edu/d...\n", + " rda_dataset_doi: DOI: 10.5065/BH6N-5N20\n", + " rda_dataset_group: ERA5 atmospheric surf...\n", + " QuantizeGranularBitGroomNumberOfSignificantDigits: 7, 'sp': Size: 2TB\n", + "dask.array\n", + "Attributes: (12/14)\n", + " long_name: Surface pressure\n", + " short_name: sp\n", + " units: Pa\n", + " original_format: WMO GRIB 1 with ECMWF...\n", + " ecmwf_local_table: 128\n", + " ecmwf_parameter: 134\n", + " ... ...\n", + " grid_specification: 0.25 degree x 0.25 de...\n", + " rda_dataset: ds633.0\n", + " rda_dataset_url: https:/rda.ucar.edu/d...\n", + " rda_dataset_doi: DOI: 10.5065/BH6N-5N20\n", + " rda_dataset_group: ERA5 atmospheric surf...\n", + " QuantizeGranularBitGroomNumberOfSignificantDigits: 7, 'v100': Size: 2TB\n", + "dask.array\n", + "Attributes: (12/14)\n", + " long_name: 100 metre V wind comp...\n", + " short_name: 100v\n", + " units: m s**-1\n", + " original_format: WMO GRIB 1 with ECMWF...\n", + " ecmwf_local_table: 228\n", + " ecmwf_parameter: 247\n", + " ... ...\n", + " grid_specification: 0.25 degree x 0.25 de...\n", + " rda_dataset: ds633.0\n", + " rda_dataset_url: https:/rda.ucar.edu/d...\n", + " rda_dataset_doi: DOI: 10.5065/BH6N-5N20\n", + " rda_dataset_group: ERA5 atmospheric surf...\n", + " QuantizeGranularBitGroomNumberOfSignificantDigits: 7, 'v10': Size: 2TB\n", + "dask.array\n", + "Attributes: (12/14)\n", + " long_name: 10 metre V wind compo...\n", + " short_name: 10v\n", + " units: m s**-1\n", + " original_format: WMO GRIB 1 with ECMWF...\n", + " ecmwf_local_table: 128\n", + " ecmwf_parameter: 166\n", + " ... ...\n", + " grid_specification: 0.25 degree x 0.25 de...\n", + " rda_dataset: ds633.0\n", + " rda_dataset_url: https:/rda.ucar.edu/d...\n", + " rda_dataset_doi: DOI: 10.5065/BH6N-5N20\n", + " rda_dataset_group: ERA5 atmospheric surf...\n", + " QuantizeGranularBitGroomNumberOfSignificantDigits: 7, 'latitude': Size: 6kB\n", + "array([ 90. , 89.75, 89.5 , ..., -89.5 , -89.75, -90. ], shape=(721,))\n", + "Attributes:\n", + " long_name: latitude\n", + " short_name: lat\n", + " units: degrees_north\n", + " axis: Y, 'longitude': Size: 12kB\n", + "array([0.0000e+00, 2.5000e-01, 5.0000e-01, ..., 3.5925e+02, 3.5950e+02,\n", + " 3.5975e+02], shape=(1440,))\n", + "Attributes:\n", + " long_name: longitude\n", + " short_name: lon\n", + " units: degrees_east\n", + " axis: X, 'time': Size: 4MB\n", + "array(['1975-01-01T00:00:00.000000000', '1975-01-01T01:00:00.000000000',\n", + " '1975-01-01T02:00:00.000000000', ..., '2024-12-31T21:00:00.000000000',\n", + " '2024-12-31T22:00:00.000000000', '2024-12-31T23:00:00.000000000'],\n", + " shape=(438312,), dtype='datetime64[ns]')\n", + "Attributes:\n", + " long_name: time\n", + " axis: T})" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds.variables" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8f366135-9a9a-4adc-95ca-3d5e6590519d", + "metadata": {}, + "outputs": [], + "source": [ + "from pydantic import BaseModel, Field, confloat\n", + "from typing import Literal" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8d500b2f-180f-42b3-a62d-cbbafafd770d", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.tools import StructuredTool" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "aa1f29a7-7396-46fb-a2c2-b9539a480d07", + "metadata": {}, + "outputs": [ + { + "ename": "SyntaxError", + "evalue": "invalid syntax (2646928006.py, line 1)", + "output_type": "error", + "traceback": [ + "\u001b[0;36m Cell \u001b[0;32mIn[5], line 1\u001b[0;36m\u001b[0m\n\u001b[0;31m = StructuredTool.from_function(\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" + ] + } + ], + "source": [ + " = StructuredTool.from_function(\n", + " extract_arraylake_data,\n", + " name=\"extract_arraylake_data\",\n", + " description=(),\n", + " args_schema=ERA5Params,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "838fa1a0-bed5-4e9d-a506-9a1681c7e3a5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22387f06-06a1-47e0-af07-a7ddcda4ce86", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d0846fd-45a4-426a-8abc-d82b188f162a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6cc9ae6-474c-4aa2-9321-89cb21703e09", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "eff31dba-873f-4bb2-a04c-6152b43e886c", + "metadata": {}, + "outputs": [], + "source": [ + "from dataset import Dataset, SpatialBounds, TemporalBounds, Variables, Access, Variable, DatasetCollection" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3ae17d9a-e3fe-4cc3-a841-db068cd64a2f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dataset.Dataset" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3938df60-0c4d-4c5e-a1d7-a6353f97f4b0", + "metadata": {}, + "outputs": [], + "source": [ + "d = Dataset(\n", + " name=\"Multi-Scale Ultra High Resolution (MUR) Sea Surface Temperature (SST)\",\n", + " description=\"A global, gap-free, gridded, daily 1 km Sea Surface Temperature (SST) dataset created by merging multiple Level-2 satellite SST datasets. Those input datasets include the NASA Advanced Microwave Scanning Radiometer-EOS (AMSR-E), the JAXA Advanced Microwave Scanning Radiometer 2 (AMSR-2) on GCOM-W1, the Moderate Resolution Imaging Spectroradiometers (MODIS) on the NASA Aqua and Terra platforms, the US Navy microwave WindSat radiometer, the Advanced Very High Resolution Radiometer (AVHRR) on several NOAA satellites, and in situ SST observations from the NOAA iQuam project. Data are available from 2002 to present in Zarr format. The original source of the MUR data is the NASA JPL Physical Oceanography DAAC.\",\n", + " spatial_bounds=SpatialBounds(\n", + " min_lat=1.0,\n", + " min_lon=1.0,\n", + " max_lat=1.0,\n", + " max_lon=1.0\n", + " ),\n", + " temporal_bounds=TemporalBounds(\n", + " start_time=\"1234\",\n", + " end_time=\"4567\"\n", + " ),\n", + " variables=Variables(\n", + " variables=[Variable(standard_name=\"water temp\", description=\"how hot da water\")]\n", + " ),\n", + " access=Access(\n", + " platform=\"aws\",\n", + " path=\"s3://path_to_file.zarr\"\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e66084c4-30a9-48c5-aee2-13395a551a1a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'datasets': [{'name': 'Multi-Scale Ultra High Resolution (MUR) Sea Surface Temperature (SST)',\n", + " 'description': 'A global, gap-free, gridded, daily 1 km Sea Surface Temperature (SST) dataset created by merging multiple Level-2 satellite SST datasets. Those input datasets include the NASA Advanced Microwave Scanning Radiometer-EOS (AMSR-E), the JAXA Advanced Microwave Scanning Radiometer 2 (AMSR-2) on GCOM-W1, the Moderate Resolution Imaging Spectroradiometers (MODIS) on the NASA Aqua and Terra platforms, the US Navy microwave WindSat radiometer, the Advanced Very High Resolution Radiometer (AVHRR) on several NOAA satellites, and in situ SST observations from the NOAA iQuam project. Data are available from 2002 to present in Zarr format. The original source of the MUR data is the NASA JPL Physical Oceanography DAAC.',\n", + " 'temporal_bounds': {'start_time': '1234', 'end_time': '4567'},\n", + " 'spatial_bounds': {'min_lat': 1.0,\n", + " 'min_lon': 1.0,\n", + " 'max_lat': 1.0,\n", + " 'max_lon': 1.0},\n", + " 'variables': {'variables': [{'standard_name': 'water temp',\n", + " 'description': 'how hot da water'}]},\n", + " 'access': {'platform': 'aws', 'path': 's3://path_to_file.zarr'}}]}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "json.loads(DatasetCollection(datasets=[d]).json())" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7f9a2a3b-66c5-4ef8-a1b6-55c112c749ac", + "metadata": {}, + "outputs": [], + "source": [ + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "5153dd47-88e0-4031-8dc3-f75726253606", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'name': 'Multi-Scale Ultra High Resolution (MUR) Sea Surface Temperature (SST)',\n", + " 'description': 'A global, gap-free, gridded, daily 1 km Sea Surface Temperature (SST) dataset created by merging multiple Level-2 satellite SST datasets. Those input datasets include the NASA Advanced Microwave Scanning Radiometer-EOS (AMSR-E), the JAXA Advanced Microwave Scanning Radiometer 2 (AMSR-2) on GCOM-W1, the Moderate Resolution Imaging Spectroradiometers (MODIS) on the NASA Aqua and Terra platforms, the US Navy microwave WindSat radiometer, the Advanced Very High Resolution Radiometer (AVHRR) on several NOAA satellites, and in situ SST observations from the NOAA iQuam project. Data are available from 2002 to present in Zarr format. The original source of the MUR data is the NASA JPL Physical Oceanography DAAC.',\n", + " 'temporal_bounds': {'start_time': '1234', 'end_time': '4567'},\n", + " 'spatial_bounds': {'min_lat': 1.0,\n", + " 'min_lon': 1.0,\n", + " 'max_lat': 1.0,\n", + " 'max_lon': 1.0},\n", + " 'variables': {'variables': [{'standard_name': 'water temp',\n", + " 'description': 'how hot da water'}]},\n", + " 'access': {'platform': 'aws', 'path': 's3://path_to_file.zarr'}}" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "json.loads(d.json())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "942b1ddf-8202-4481-88c4-4d0e9120da97", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ad44cc1-f429-4bf6-9c57-ca6784c42838", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/contributor_folders/finn/data_readin.ipynb b/contributor_folders/finn/data_readin.ipynb index 8921724..f6972b9 100644 --- a/contributor_folders/finn/data_readin.ipynb +++ b/contributor_folders/finn/data_readin.ipynb @@ -8,12 +8,9 @@ "outputs": [], "source": [ "from __future__ import annotations\n", - "from typing import Tuple\n", - "\n", + "from typing import Optional, Union, Tuple, Dict, Any\n", "import xarray as xr\n", - "import numpy as np\n", - "import cf_xarray\n", - "import s3fs, gcsfs, fsspec, zarr" + "import numpy as np" ] }, { @@ -57,15 +54,22 @@ " raise KeyError(f\"Could not locate variable from hints {var}. Variables: {list(ds.data_vars)}\")\n", "\n", "\n", - "def _normalize_coord_names(ds: xr.Dataset) -> xr.Dataset:\n", + "def _get_coord_names(ds: xr.Dataset) -> Tuple[str, str]:\n", " \"\"\"\n", - " Standardize coordinate names to 'latitude', 'longitude', 'time'.\n", + " Get the longitude and latitude coordinate names from the dataset.\n", + " Supports both long ('longitude', 'latitude') and short ('lon', 'lat') names.\n", + " \n", + " Returns\n", + " -------\n", + " tuple of (lon_name, lat_name)\n", " \"\"\"\n", - " rename_map = {}\n", - " for alias, standard in {\"lat\": \"latitude\", \"lon\": \"longitude\"}.items():\n", - " if alias in ds.coords and standard not in ds.coords:\n", - " rename_map[alias] = standard\n", - " return ds.rename(rename_map) if rename_map else ds\n", + " lon_name = next((name for name in ['longitude', 'lon'] if name in ds.coords), None)\n", + " lat_name = next((name for name in ['latitude', 'lat'] if name in ds.coords), None)\n", + " \n", + " if not lon_name or not lat_name:\n", + " raise ValueError(f\"Could not find longitude/latitude coordinates. Found: {list(ds.coords)}\")\n", + " \n", + " return lon_name, lat_name\n", "\n", "\n", "def _infer_target_lon_frame(lon_min: float, lon_max: float) -> str:\n", @@ -78,11 +82,11 @@ "def _coerce_longitudes(ds: xr.Dataset, target_frame: str, assume_frame: Optional[str] = None) -> xr.Dataset:\n", " \"\"\"\n", " Coerce dataset longitudes to a target frame ('0-360' or '-180-180').\n", + " Works with either 'longitude' or 'lon' coordinate names.\n", " \"\"\"\n", - " if \"longitude\" not in ds.coords:\n", - " return ds\n", - "\n", - " lon = ds[\"longitude\"].values\n", + " lon_name, _ = _get_coord_names(ds)\n", + " \n", + " lon = ds[lon_name].values\n", " if assume_frame:\n", " current = assume_frame\n", " else:\n", @@ -96,159 +100,7412 @@ " else: # target is -180-180\n", " lon_new = ((lon + 180) % 360) - 180\n", " \n", - " ds = ds.assign_coords(longitude=lon_new)\n", - " return ds.sortby(\"longitude\")\n", + " ds = ds.assign_coords({lon_name: lon_new})\n", + " return ds.sortby(lon_name)\n", "\n", "\n", "def _ensure_lat_monotonic(ds: xr.Dataset) -> xr.Dataset:\n", " \"\"\"\n", " Ensures the latitude coordinate is monotonically increasing.\n", + " Works with either 'latitude' or 'lat' coordinate names.\n", " \"\"\"\n", - " if \"latitude\" in ds.coords and ds[\"latitude\"].ndim == 1 and ds[\"latitude\"].values[0] > ds[\"latitude\"].values[-1]:\n", - " return ds.sortby(\"latitude\")\n", + " _, lat_name = _get_coord_names(ds)\n", + " \n", + " if ds[lat_name].ndim == 1 and ds[lat_name].values[0] > ds[lat_name].values[-1]:\n", + " return ds.sortby(lat_name)\n", " return ds\n", "\n", "\n", "def _slice_longitude(ds: xr.Dataset, lon_min: float, lon_max: float) -> xr.Dataset:\n", " \"\"\"\n", " Slice longitude robustly, handling wrap-around for ranges like 350E to 10E.\n", + " Works with either 'longitude' or 'lon' coordinate names.\n", " \"\"\"\n", + " lon_name, _ = _get_coord_names(ds)\n", + " \n", " if lon_min <= lon_max:\n", - " return ds.sel(longitude=slice(lon_min, lon_max))\n", + " return ds.sel(**{lon_name: slice(lon_min, lon_max)})\n", " \n", - " lon = ds[\"longitude\"]\n", - " part1 = ds.sel(longitude=slice(lon_min, float(lon.max())))\n", - " part2 = ds.sel(longitude=slice(float(lon.min()), lon_max))\n", - " return xr.concat([part1, part2], dim=\"longitude\")" + " lon = ds[lon_name]\n", + " part1 = ds.sel(**{lon_name: slice(lon_min, float(lon.max()))})\n", + " part2 = ds.sel(**{lon_name: slice(float(lon.min()), lon_max)})\n", + " return xr.concat([part1, part2], dim=lon_name)" ] }, { "cell_type": "code", "execution_count": 3, - "id": "d5ebf53d-efcf-4498-aac2-86dba953ab22", + "id": "6391de80-3409-4398-82cc-5a880af149d3", "metadata": {}, "outputs": [], "source": [ - "def load_aws_dataset(\n", - " s3_path: str,\n", - " variable_of_interest: Union[str, Dict[str, str]],\n", - " region_of_interest: Optional[Dict[str, float]] = None,\n", - " time_of_interest: Optional[Union[slice, Tuple[str, str]]] = None,\n", + "def load_climate_data(\n", + " cloud_path: str,\n", + " variable: Union[str, Dict[str, str]],\n", + " lon_range: Optional[Tuple[float, float]] = None,\n", + " lat_range: Optional[Tuple[float, float]] = None,\n", " *,\n", - " group: Optional[str] = None,\n", - " consolidated: Optional[bool] = None,\n", - " chunks: Optional[Dict] = None,\n", - " assume_lon: Optional[str] = None, # \"0-360\" or \"-180-180\" if you know...\n", - " return_dataset: bool = False,\n", - " save_to: Optional[Union[str, pathlib.Path]] = None,\n", - ") -> Union[xr.DataArray, xr.Dataset]:\n", + " time_range: Optional[Tuple[str, str]] = None,\n", + " resample_to: Optional[str] = None,\n", + " chunks: Optional[Dict[str, int]] = None,\n", + "):\n", " \"\"\"\n", - " Load and subset a Zarr dataset from a public AWS S3 bucket.\n", - "\n", + " Load climate data from cloud storage (S3 or GCS) with consistent processing.\n", + " \n", " Parameters\n", " ----------\n", - " s3_path:\n", - " The full S3 path to the Zarr store (e.g., \"s3://era5-pds/zarr/...\").\n", - " variable_of_interest:\n", - " - Name of the variable in the dataset (e.g., \"sst\", \"tos\", \"t2m\"), OR\n", - " - A mapping of CF/long-name hints to try, e.g.:\n", - " {\"standard_name\": \"sea_surface_temperature\"}\n", - " region_of_interest:\n", - " Dict with geographic bounds: {\"lat_min\": -90, \"lat_max\": 90, \"lon_min\": 0, \"lon_max\": 360}.\n", - " Longitudes may be 0–360 or −180–180. Function will reconcile.\n", - " time_of_interest:\n", - " Either a Python slice (e.g., slice(\"1990-01-01\",\"2000-12-31\")) or a 2-tuple of ISO strings.\n", - " group:\n", - " Zarr group within the store (e.g., \"spatial\" for ERA5).\n", - " consolidated:\n", - " Whether the Zarr store is consolidated. If None, attempts sensible defaults.\n", - " chunks:\n", - " Dask chunking dict, e.g., {\"time\": 2400}.\n", - " assume_lon:\n", - " If set, forces interpretation of dataset longitudes as \"0-360\" or \"-180-180\".\n", - " return_dataset:\n", - " If True, return the full Dataset. Otherwise return the selected DataArray.\n", - " save_to:\n", - " Optional path to save the subset as NetCDF.\n", - "\n", + " cloud_path : str\n", + " Full URL to the Zarr store (e.g., \"s3://...\" or \"gs://...\")\n", + " variable : str or dict\n", + " Variable name or CF-style selector (e.g., {\"standard_name\": \"air_temperature\"})\n", + " lon_range : tuple of float\n", + " (min_longitude, max_longitude) in dataset's native frame\n", + " lat_range : tuple of float\n", + " (min_latitude, max_latitude)\n", + " time_range : tuple of str, optional\n", + " (start_date, end_date) as ISO strings\n", + " convert_kelvin_to_celsius : bool, default True\n", + " If True, convert temperature data from Kelvin to Celsius\n", + " resample_to : str, optional\n", + " If provided, resample time dimension (e.g., \"MS\" for month start)\n", + " chunks : dict, optional\n", + " Dask chunks specification (e.g., {\"time\": 1024})\n", + " \n", " Returns\n", " -------\n", - " xr.DataArray or xr.Dataset\n", - " The subsetted data.\n", + " xr.Dataset\n", + " Processed dataset with consistent dimensions\n", " \"\"\"\n", - " # normalize input params\n", - " if isinstance(time_of_interest, tuple):\n", - " time_of_interest = slice(time_of_interest[0], time_of_interest[1])\n", - "\n", - " region = region_of_interest or {}\n", - " lat_min = region.get(\"lat_min\", None)\n", - " lat_max = region.get(\"lat_max\", None)\n", - " lon_min = region.get(\"lon_min\", None)\n", - " lon_max = region.get(\"lon_max\", None)\n", - "\n", - " # config aws\n", - " storage_options = {\"anon\": True}\n", - " if consolidated is None:\n", - " consolidated = False if (group is not None and \"era5\" in s3_path.lower()) else True\n", - "\n", - " # open dataset\n", + " \n", + " # Open dataset\n", " ds = xr.open_dataset(\n", - " s3_path,\n", + " cloud_path,\n", " engine=\"zarr\",\n", " chunks=chunks,\n", - " consolidated=consolidated,\n", - " backend_kwargs={\n", - " \"storage_options\": storage_options,\n", - " **({\"group\": group} if group else {}),\n", - " },\n", " )\n", + " \n", + " # Get coordinate names\n", + " lon_name, lat_name = _get_coord_names(ds)\n", + " \n", + " # Subset space and time\n", + " region = {}\n", + " if lon_range is not None and lat_range is not None:\n", + " region.update({\n", + " lon_name: slice(*lon_range),\n", + " lat_name: slice(*lat_range)\n", + " })\n", + " if time_range is not None:\n", + " region[\"time\"] = slice(*time_range)\n", + " \n", + " # Only apply selection if we have regions to subset\n", + " if region:\n", + " ds = ds.sel(**region)\n", + " \n", + " # Handle longitude frame and monotonic latitude\n", + " # Handle longitude frame and monotonic latitude\n", + " if lon_range is not None:\n", + " target_frame = _infer_target_lon_frame(*lon_range)\n", + " ds = _coerce_longitudes(ds, target_frame)\n", + " ds = _ensure_lat_monotonic(ds)\n", + " \n", + " # Optional time resampling\n", + " if resample_to:\n", + " ds = ds.resample(time=resample_to).mean()\n", + " \n", + " # Ensure consistent dimension order\n", + " # Get available dimensions\n", + " dims = list(ds.dims)\n", + " # Core dims we want first (if they exist)\n", + " core_dims = [\"time\", \"latitude\", \"longitude\"]\n", + " # Filter out core dims that actually exist\n", + " core_dims = [d for d in core_dims if d in dims]\n", + " # Add any remaining dims at the end\n", + " other_dims = [d for d in dims if d not in core_dims]\n", + " # Combine for final ordering\n", + " final_dims = core_dims + other_dims\n", + " \n", + " ds = ds.transpose(*final_dims)\n", "\n", - " # select variable (cf-aware if possible)\n", - " var = _select_variable(ds, variable_of_interest)\n", - "\n", - " # normalize coordinate names\n", - " ds = _normalize_coord_names(ds)\n", - "\n", - " # fix lon to desired slicing frame, if needed \n", - " if (lon_min is not None) and (lon_max is not None):\n", - " ds = _coerce_longitudes(ds, target_frame=_infer_target_lon_frame(lon_min, lon_max), assume_frame=assume_lon)\n", - "\n", - " # wnsure latitude selection works if lat is descending (ERA5 style)\n", - " if (lat_min is not None) and (lat_max is not None):\n", - " ds = _ensure_lat_monotonic(ds)\n", - "\n", - " # apply coord selections\n", - " sel = ds\n", - " if time_of_interest is not None and (\"time\" in sel.dims or \"time\" in sel.coords):\n", - " sel = sel.sel(time=time_of_interest)\n", - " if (lat_min is not None) and (lat_max is not None) and \"latitude\" in sel.coords:\n", - " sel = sel.sel(latitude=slice(min(lat_min, lat_max), max(lat_min, lat_max)))\n", - " if (lon_min is not None) and (lon_max is not None) and \"longitude\" in sel.coords:\n", - " sel = _slice_longitude(sel, lon_min, lon_max)\n", - "\n", - " # return subset da/ds\n", - " out = sel if return_dataset else sel[var]\n", - " if save_to is not None:\n", - " save_path = pathlib.Path(save_to).expanduser().resolve()\n", - " save_path.parent.mkdir(parents=True, exist_ok=True)\n", - " out.to_netcdf(save_path)\n", - " return out" + " if variable:\n", + " var = _select_variable(ds, variable)\n", + " ds = ds[var]\n", + " \n", + " return ds" ] }, { "cell_type": "code", - "execution_count": null, - "id": "3b9e2d45-2f97-40d3-9596-634bf031320a", + "execution_count": 4, + "id": "a4811e9a-5912-4094-90ed-9d6c478c4a7b", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "era5_data = load_climate_data(\n", + " cloud_path=\"gs://weatherbench2/datasets/era5/1959-2023_01_10-6h-240x121_equiangular_with_poles_conservative.zarr\",\n", + " variable=None,\n", + " lon_range=(0, 90), \n", + " lat_range=(-20, 60), \n", + " time_range=(\"2020-01-01\", \"2020-12-31\"),\n", + " resample_to=\"MS\", \n", + " chunks={\"time\": 1024})" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e1d1e0db-a5b1-40a0-b43c-91a46fc95c0f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 38MB\n",
+       "Dimensions:                                           (time: 12, longitude: 61,\n",
+       "                                                       latitude: 54, level: 13)\n",
+       "Coordinates:\n",
+       "  * latitude                                          (latitude) float64 432B ...\n",
+       "  * level                                             (level) int64 104B 50 ....\n",
+       "  * longitude                                         (longitude) float64 488B ...\n",
+       "  * time                                              (time) datetime64[ns] 96B ...\n",
+       "Data variables: (12/62)\n",
+       "    10m_u_component_of_wind                           (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    10m_v_component_of_wind                           (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    10m_wind_speed                                    (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    2m_dewpoint_temperature                           (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    2m_temperature                                    (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    above_ground                                      (time, latitude, longitude, level) float32 2MB dask.array<chunksize=(12, 54, 61, 13), meta=np.ndarray>\n",
+       "    ...                                                ...\n",
+       "    slope_of_sub_gridscale_orography                  (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    soil_type                                         (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    standard_deviation_of_filtered_subgrid_orography  (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    standard_deviation_of_orography                   (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    type_of_high_vegetation                           (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>\n",
+       "    type_of_low_vegetation                            (time, latitude, longitude) float32 158kB dask.array<chunksize=(12, 54, 61), meta=np.ndarray>
" + ], + "text/plain": [ + " Size: 38MB\n", + "Dimensions: (time: 12, longitude: 61,\n", + " latitude: 54, level: 13)\n", + "Coordinates:\n", + " * latitude (latitude) float64 432B ...\n", + " * level (level) int64 104B 50 ....\n", + " * longitude (longitude) float64 488B ...\n", + " * time (time) datetime64[ns] 96B ...\n", + "Data variables: (12/62)\n", + " 10m_u_component_of_wind (time, latitude, longitude) float32 158kB dask.array\n", + " 10m_v_component_of_wind (time, latitude, longitude) float32 158kB dask.array\n", + " 10m_wind_speed (time, latitude, longitude) float32 158kB dask.array\n", + " 2m_dewpoint_temperature (time, latitude, longitude) float32 158kB dask.array\n", + " 2m_temperature (time, latitude, longitude) float32 158kB dask.array\n", + " above_ground (time, latitude, longitude, level) float32 2MB dask.array\n", + " ... ...\n", + " slope_of_sub_gridscale_orography (time, latitude, longitude) float32 158kB dask.array\n", + " soil_type (time, latitude, longitude) float32 158kB dask.array\n", + " standard_deviation_of_filtered_subgrid_orography (time, latitude, longitude) float32 158kB dask.array\n", + " standard_deviation_of_orography (time, latitude, longitude) float32 158kB dask.array\n", + " type_of_high_vegetation (time, latitude, longitude) float32 158kB dask.array\n", + " type_of_low_vegetation (time, latitude, longitude) float32 158kB dask.array" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "era5_data" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9b05cb01-a62f-4570-bf06-305d7e887b0a", + "metadata": {}, + "outputs": [], + "source": [ + "chl_data = load_climate_data(\n", + " cloud_path=\"gcs://nmfs_odp_nwfsc/CB/mind_the_chl_gap/IO.zarr\",\n", + " variable=None,\n", + " time_range=(\"2020-01-01\", \"2020-03-31\"),\n", + " resample_to=\"MS\",)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4d0835c4-fe09-472a-a0a5-055bac4a86ae", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 14MB\n",
+       "Dimensions:                       (time: 3, lat: 177, lon: 241)\n",
+       "Coordinates:\n",
+       "  * lat                           (lat) float32 708B -12.0 -11.75 ... 31.75 32.0\n",
+       "  * lon                           (lon) float32 964B 42.0 42.25 ... 101.8 102.0\n",
+       "  * time                          (time) datetime64[ns] 24B 2020-01-01 ... 20...\n",
+       "Data variables: (12/27)\n",
+       "    CHL                           (time, lat, lon) float32 512kB nan nan ... nan\n",
+       "    CHL_cmes-cloud                (time, lat, lon) float64 1MB 2.0 2.0 ... 2.0\n",
+       "    CHL_cmes-gapfree              (time, lat, lon) float32 512kB nan nan ... nan\n",
+       "    CHL_cmes-level3               (time, lat, lon) float32 512kB nan nan ... nan\n",
+       "    CHL_cmes_flags-gapfree        (time, lat, lon) float32 512kB nan nan ... nan\n",
+       "    CHL_cmes_flags-level3         (time, lat, lon) float32 512kB nan nan ... nan\n",
+       "    ...                            ...\n",
+       "    v_wind                        (time, lat, lon) float32 512kB -3.804 ... 0...\n",
+       "    vg_curr                       (time, lat, lon) float32 512kB -0.1609 ... nan\n",
+       "    wind_dir                      (time, lat, lon) float32 512kB -65.01 ... 6...\n",
+       "    wind_speed                    (time, lat, lon) float32 512kB 4.731 ... 0....\n",
+       "    CHL_cmes-land                 (time, lat, lon) uint8 128kB 2 2 2 2 ... 2 2 2\n",
+       "    topo                          (time, lat, lon) float64 1MB -2.658e+03 ......\n",
+       "Attributes: (12/92)\n",
+       "    Conventions:                     CF-1.8, ACDD-1.3\n",
+       "    DPM_reference:                   GC-UD-ACRI-PUG\n",
+       "    IODD_reference:                  GC-UD-ACRI-PUG\n",
+       "    acknowledgement:                 The Licensees will ensure that original ...\n",
+       "    citation:                        The Licensees will ensure that original ...\n",
+       "    cmems_product_id:                OCEANCOLOUR_GLO_BGC_L3_MY_009_103\n",
+       "    ...                              ...\n",
+       "    time_coverage_end:               2024-04-18T02:58:23Z\n",
+       "    time_coverage_resolution:        P1D\n",
+       "    time_coverage_start:             2024-04-16T21:12:05Z\n",
+       "    title:                           cmems_obs-oc_glo_bgc-plankton_my_l3-mult...\n",
+       "    westernmost_longitude:           -180.0\n",
+       "    westernmost_valid_longitude:     -180.0
" + ], + "text/plain": [ + " Size: 14MB\n", + "Dimensions: (time: 3, lat: 177, lon: 241)\n", + "Coordinates:\n", + " * lat (lat) float32 708B -12.0 -11.75 ... 31.75 32.0\n", + " * lon (lon) float32 964B 42.0 42.25 ... 101.8 102.0\n", + " * time (time) datetime64[ns] 24B 2020-01-01 ... 20...\n", + "Data variables: (12/27)\n", + " CHL (time, lat, lon) float32 512kB nan nan ... nan\n", + " CHL_cmes-cloud (time, lat, lon) float64 1MB 2.0 2.0 ... 2.0\n", + " CHL_cmes-gapfree (time, lat, lon) float32 512kB nan nan ... nan\n", + " CHL_cmes-level3 (time, lat, lon) float32 512kB nan nan ... nan\n", + " CHL_cmes_flags-gapfree (time, lat, lon) float32 512kB nan nan ... nan\n", + " CHL_cmes_flags-level3 (time, lat, lon) float32 512kB nan nan ... nan\n", + " ... ...\n", + " v_wind (time, lat, lon) float32 512kB -3.804 ... 0...\n", + " vg_curr (time, lat, lon) float32 512kB -0.1609 ... nan\n", + " wind_dir (time, lat, lon) float32 512kB -65.01 ... 6...\n", + " wind_speed (time, lat, lon) float32 512kB 4.731 ... 0....\n", + " CHL_cmes-land (time, lat, lon) uint8 128kB 2 2 2 2 ... 2 2 2\n", + " topo (time, lat, lon) float64 1MB -2.658e+03 ......\n", + "Attributes: (12/92)\n", + " Conventions: CF-1.8, ACDD-1.3\n", + " DPM_reference: GC-UD-ACRI-PUG\n", + " IODD_reference: GC-UD-ACRI-PUG\n", + " acknowledgement: The Licensees will ensure that original ...\n", + " citation: The Licensees will ensure that original ...\n", + " cmems_product_id: OCEANCOLOUR_GLO_BGC_L3_MY_009_103\n", + " ... ...\n", + " time_coverage_end: 2024-04-18T02:58:23Z\n", + " time_coverage_resolution: P1D\n", + " time_coverage_start: 2024-04-16T21:12:05Z\n", + " title: cmems_obs-oc_glo_bgc-plankton_my_l3-mult...\n", + " westernmost_longitude: -180.0\n", + " westernmost_valid_longitude: -180.0" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chl_data" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python (Pixi)", "language": "python", - "name": "python3" + "name": "pixi-kernel-python3" }, "language_info": { "codemirror_mode": {