Accessing a nested HDF5 file over HTTP via kerchunk

Hi there,

I’m trying to figure out if there’s a way to access some HDF5 files stored on what seems to be an HTTP server directly with xarray. The dataset is at chabud-team/chabud-extra at main, and looks to be stored via Git LFS. If I’m not mistaken, the URL to the dataset is https://huggingface.co/datasets/chabud-team/chabud-extra/raw/main/california_2.hdf5.

First thing I tried was xarray.open_dataset directly:

import xarray as xr

xr.open_dataset(
    filename_or_obj="https://huggingface.co/datasets/chabud-team/chabud-extra/raw/main/california_2.hdf5",
    engine="h5netcdf",  # Requires h5netcdf and h5pyd to be installed
)

which gives a JSONDecodeError like so:

---------------------------------------------------------------------------
JSONDecodeError                           Traceback (most recent call last)
Cell In[5], line 1
----> 1 xr.open_dataset(
      2     filename_or_obj="https://huggingface.co/datasets/chabud-team/chabud-extra/resolve/main/california_2.hdf5",
      3     engine="h5netcdf",  # Requires h5netcdf and h5pyd to be installed
      4 )

File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/api.py:525, in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, backend_kwargs, **kwargs)
    513 decoders = _resolve_decoders_kwargs(
    514     decode_cf,
    515     open_backend_dataset_parameters=backend.open_dataset_parameters,
   (...)
    521     decode_coords=decode_coords,
    522 )
    524 overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
--> 525 backend_ds = backend.open_dataset(
    526     filename_or_obj,
    527     drop_variables=drop_variables,
    528     **decoders,
    529     **kwargs,
    530 )
    531 ds = _dataset_from_backend_dataset(
    532     backend_ds,
    533     filename_or_obj,
   (...)
    541     **kwargs,
    542 )
    543 return ds

File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/h5netcdf_.py:413, in H5netcdfBackendEntrypoint.open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, format, group, lock, invalid_netcdf, phony_dims, decode_vlen_strings)
    394 def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporting **kwargs
    395     self,
    396     filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
   (...)
    410     decode_vlen_strings=True,
    411 ) -> Dataset:
    412     filename_or_obj = _normalize_path(filename_or_obj)
--> 413     store = H5NetCDFStore.open(
    414         filename_or_obj,
    415         format=format,
    416         group=group,
    417         lock=lock,
    418         invalid_netcdf=invalid_netcdf,
    419         phony_dims=phony_dims,
    420         decode_vlen_strings=decode_vlen_strings,
    421     )
    423     store_entrypoint = StoreBackendEntrypoint()
    425     ds = store_entrypoint.open_dataset(
    426         store,
    427         mask_and_scale=mask_and_scale,
   (...)
    433         decode_timedelta=decode_timedelta,
    434     )

File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/h5netcdf_.py:176, in H5NetCDFStore.open(cls, filename, mode, format, group, lock, autoclose, invalid_netcdf, phony_dims, decode_vlen_strings)
    173         lock = combine_locks([HDF5_LOCK, get_write_lock(filename)])
    175 manager = CachingFileManager(h5netcdf.File, filename, mode=mode, kwargs=kwargs)
--> 176 return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose)

File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/h5netcdf_.py:127, in H5NetCDFStore.__init__(self, manager, group, mode, lock, autoclose)
    124 self.format = None
    125 # todo: utilizing find_root_and_group seems a bit clunky
    126 #  making filename available on h5netcdf.Group seems better
--> 127 self._filename = find_root_and_group(self.ds)[0].filename
    128 self.is_remote = is_remote_uri(self._filename)
    129 self.lock = ensure_lock(lock)

File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/h5netcdf_.py:187, in H5NetCDFStore.ds(self)
    185 @property
    186 def ds(self):
--> 187     return self._acquire()

File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/h5netcdf_.py:179, in H5NetCDFStore._acquire(self, needs_lock)
    178 def _acquire(self, needs_lock=True):
--> 179     with self._manager.acquire_context(needs_lock) as root:
    180         ds = _nc4_require_group(
    181             root, self._group, self._mode, create_group=_h5netcdf_create_group
    182         )
    183     return ds

File ~/mambaforge/envs/chabud/lib/python3.11/contextlib.py:137, in _GeneratorContextManager.__enter__(self)
    135 del self.args, self.kwds, self.func
    136 try:
--> 137     return next(self.gen)
    138 except StopIteration:
    139     raise RuntimeError("generator didn't yield") from None

File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/file_manager.py:198, in CachingFileManager.acquire_context(self, needs_lock)
    195 @contextlib.contextmanager
    196 def acquire_context(self, needs_lock=True):
    197     """Context manager for acquiring a file."""
--> 198     file, cached = self._acquire_with_cache_info(needs_lock)
    199     try:
    200         yield file

File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/file_manager.py:216, in CachingFileManager._acquire_with_cache_info(self, needs_lock)
    214     kwargs = kwargs.copy()
    215     kwargs["mode"] = self._mode
--> 216 file = self._opener(*self._args, **kwargs)
    217 if self._mode == "w":
    218     # ensure file doesn't get overridden when opened again
    219     self._mode = "a"

File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/h5netcdf/core.py:1040, in File.__init__(self, path, mode, invalid_netcdf, phony_dims, **kwargs)
   1035     raise ImportError(
   1036         "No module named 'h5pyd'. h5pyd is required for "
   1037         "opening urls: {}".format(path)
   1038     )
   1039 try:
-> 1040     with h5pyd.File(path, "r", **kwargs) as f:  # noqa
   1041         pass
   1042     self._preexisting_file = True

File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/h5pyd/_hl/files.py:280, in File.__init__(self, domain, mode, endpoint, username, password, bucket, api_key, use_session, use_cache, logger, owner, linked_domain, retries, timeout, **kwds)
    278 while True:
    279     try:
--> 280         rsp = http_conn.GET(req, params=params)
    281         break
    282     except IOError:

File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/h5pyd/_hl/httpconn.py:493, in HttpConn.GET(self, req, format, params, headers, use_cache)
    491     if rsp.status_code == 200 and req == "/":
    492         self.log.info(f"got domain json: {len(rsp.text)} bytes")
--> 493         self._domain_json = json.loads(rsp.text)
    495 # when calling AWS Lambda thru API Gatway, the status_code
    496 # indicates the Lambda request was successful, but not necessarily
    497 # the requested HSDS action was.
    498 # Check here and raise IOError is needed.
    499 if (
    500     rsp.status_code == 200
    501     and content_type
    502     and content_type.startswith("application/json")
    503 ):

File ~/mambaforge/envs/chabud/lib/python3.11/json/__init__.py:346, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    341     s = s.decode(detect_encoding(s), 'surrogatepass')
    343 if (cls is None and object_hook is None and
    344         parse_int is None and parse_float is None and
    345         parse_constant is None and object_pairs_hook is None and not kw):
--> 346     return _default_decoder.decode(s)
    347 if cls is None:
    348     cls = JSONDecoder

File ~/mambaforge/envs/chabud/lib/python3.11/json/decoder.py:337, in JSONDecoder.decode(self, s, _w)
    332 def decode(self, s, _w=WHITESPACE.match):
    333     """Return the Python representation of ``s`` (a ``str`` instance
    334     containing a JSON document).
    335 
    336     """
--> 337     obj, end = self.raw_decode(s, idx=_w(s, 0).end())
    338     end = _w(s, end).end()
    339     if end != len(s):

File ~/mambaforge/envs/chabud/lib/python3.11/json/decoder.py:355, in JSONDecoder.raw_decode(self, s, idx)
    353     obj, end = self.scan_once(s, idx)
    354 except StopIteration as err:
--> 355     raise JSONDecodeError("Expecting value", s, err.value) from None
    356 return obj, end

JSONDecodeError: Expecting value: line 1 column 1 (char 0)
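
From the traceback, the URL string gets routed to h5pyd, which speaks the HSDS REST protocol and expects a JSON response from the server; an ordinary HTTP file server returns something other than JSON, hence the JSONDecodeError. Passing a file-like object instead of a URL string should keep the backend on plain h5py (a sketch I haven’t actually verified against this file):

import fsspec
import xarray as xr

url = "https://huggingface.co/datasets/chabud-team/chabud-extra/raw/main/california_2.hdf5"
# Sketch: a file-like object makes engine="h5netcdf" read through h5py/fsspec
# over HTTP instead of handing the URL string to h5pyd.
with fsspec.open(urlpath=url, mode="rb") as f:
    ds = xr.open_dataset(f, engine="h5netcdf")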

Next, I followed Kerchunk Basics — Kerchunk Cookbook and tried some code like this:

import fsspec
import kerchunk.hdf
import ujson
import xarray as xr

url = "https://huggingface.co/datasets/chabud-team/chabud-extra/raw/main/california_2.hdf5"
with fsspec.open(urlpath=url) as inf:
    h5chunks = kerchunk.hdf.SingleHdf5ToZarr(inf, url)
    h5chunks.translate()
    with open("single_file_kerchunk.json", "wb") as f:
        f.write(ujson.dumps(h5chunks.translate()).encode())

But that fails with an OSError:

---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
Cell In[9], line 2
      1 with fsspec.open(urlpath=url) as inf:
----> 2     h5chunks = kerchunk.hdf.SingleHdf5ToZarr(inf, url)
      3     h5chunks.translate()
      4     with open("single_file_kerchunk.json", "wb") as f:

File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/kerchunk/hdf.py:92, in SingleHdf5ToZarr.__init__(self, h5f, url, spec, inline_threshold, storage_options, error, vlen_encode)
     90     raise NotImplementedError
     91 self.vlen = vlen_encode
---> 92 self._h5f = h5py.File(self.input_file, mode="r")
     94 self.store = {}
     95 self._zroot = zarr.group(store=self.store, overwrite=True)

File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/h5py/_hl/files.py:567, in File.__init__(self, name, mode, driver, libver, userblock_size, swmr, rdcc_nslots, rdcc_nbytes, rdcc_w0, track_order, fs_strategy, fs_persist, fs_threshold, fs_page_size, page_buf_size, min_meta_keep, min_raw_keep, locking, alignment_threshold, alignment_interval, meta_block_size, **kwds)
    558     fapl = make_fapl(driver, libver, rdcc_nslots, rdcc_nbytes, rdcc_w0,
    559                      locking, page_buf_size, min_meta_keep, min_raw_keep,
    560                      alignment_threshold=alignment_threshold,
    561                      alignment_interval=alignment_interval,
    562                      meta_block_size=meta_block_size,
    563                      **kwds)
    564     fcpl = make_fcpl(track_order=track_order, fs_strategy=fs_strategy,
    565                      fs_persist=fs_persist, fs_threshold=fs_threshold,
    566                      fs_page_size=fs_page_size)
--> 567     fid = make_fid(name, mode, userblock_size, fapl, fcpl, swmr=swmr)
    569 if isinstance(libver, tuple):
    570     self._libver = libver

File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/h5py/_hl/files.py:231, in make_fid(name, mode, userblock_size, fapl, fcpl, swmr)
    229     if swmr and swmr_support:
    230         flags |= h5f.ACC_SWMR_READ
--> 231     fid = h5f.open(name, flags, fapl=fapl)
    232 elif mode == 'r+':
    233     fid = h5f.open(name, h5f.ACC_RDWR, fapl=fapl)

File h5py/_objects.pyx:54, in h5py._objects.with_phil.wrapper()

File h5py/_objects.pyx:55, in h5py._objects.with_phil.wrapper()

File h5py/h5f.pyx:106, in h5py.h5f.open()

OSError: Unable to synchronously open file (file signature not found)

I’m wondering if there’s some missing argument to fsspec.open that I need to pass. There shouldn’t be any authentication or token needed (clicking on the link works directly in the browser), so I’m not sure what else is needed.

It seems that the URL there is not actually the HDF5 file, but a Git LFS pointer file (plain text). It hints at how to fetch the real file, but I don’t know how to resolve it:

version https://git-lfs.github.com/spec/v1
oid sha256:0af569c8930348109b495a5f2768758a52a6deec85768fd70c0efd9370f84578
size 368152403
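
A quick way to confirm what a URL is actually serving is to peek at the first few bytes: a real HDF5 file starts with the signature b"\x89HDF\r\n\x1a\n", while an LFS pointer is plain text. A rough check:

import fsspec

url = "https://huggingface.co/datasets/chabud-team/chabud-extra/raw/main/california_2.hdf5"
with fsspec.open(urlpath=url, mode="rb") as f:
    # b'\x89HDF\r\n\x1a\n' for a real HDF5 file; b'version ' here, i.e. an LFS pointer
    print(f.read(8))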

Ah, that could explain it… By visiting the LFS data page I found this other link (https://huggingface.co/datasets/chabud-team/chabud-extra/resolve/main/california_2.hdf5), which should point to the HDF5 file directly. The code now seems to run (it can be a bit slow, since it’s streaming a 368 MB file):

import fsspec
import kerchunk.hdf
import ujson
import xarray as xr

url = "https://huggingface.co/datasets/chabud-team/chabud-extra/resolve/main/california_2.hdf5"
with fsspec.open(urlpath=url) as inf:
    h5chunks = kerchunk.hdf.SingleHdf5ToZarr(inf, url)
    # translate() scans the HDF5 metadata and returns the reference dict
    refs = h5chunks.translate()
    with open("single_file_kerchunk.json", "wb") as f:
        f.write(ujson.dumps(refs).encode())

This prints some warnings, possibly related to the nested group structure of the HDF5 file, which SingleHdf5ToZarr doesn’t seem to handle properly yet:

/home/user/mambaforge/envs/chabud/lib/python3.11/site-packages/kerchunk/hdf.py:436: UserWarning: The following excepion was caught and quashed while traversing HDF5
path 'e0f11e18-eed0-4eee-972e-6955c2024c19_71' contains a group
Traceback (most recent call last):
  File "/home/user/mambaforge/envs/chabud/lib/python3.11/site-packages/kerchunk/hdf.py", line 413, in _translator
    zgrp = self._zroot.create_group(h5obj.name)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/mambaforge/envs/chabud/lib/python3.11/site-packages/zarr/hierarchy.py", line 921, in create_group
    return self._write_op(self._create_group_nosync, name, overwrite=overwrite)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/mambaforge/envs/chabud/lib/python3.11/site-packages/zarr/hierarchy.py", line 895, in _write_op
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/home/user/mambaforge/envs/chabud/lib/python3.11/site-packages/zarr/hierarchy.py", line 927, in _create_group_nosync
    init_group(self._store, path=path, chunk_store=self._chunk_store,
  File "/home/user/mambaforge/envs/chabud/lib/python3.11/site-packages/zarr/storage.py", line 648, in init_group
    _init_group_metadata(store=store, overwrite=overwrite, path=path,
zarr.errors.ContainsGroupError: path 'e0f11e18-eed0-4eee-972e-6955c2024c19_71' contains a group

  warnings.warn(msg)
/home/user/mambaforge/envs/chabud/lib/python3.11/site-packages/kerchunk/hdf.py:436: UserWarning: The following excepion was caught and quashed while traversing HDF5
path 'e0f11e18-eed0-4eee-972e-6955c2024c19_72' contains a group
Traceback (most recent call last):
  File "/home/user/mambaforge/envs/chabud/lib/python3.11/site-packages/kerchunk/hdf.py", line 413, in _translator
    zgrp = self._zroot.create_group(h5obj.name)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/mambaforge/envs/chabud/lib/python3.11/site-packages/zarr/hierarchy.py", line 921, in create_group
    return self._write_op(self._create_group_nosync, name, overwrite=overwrite)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/mambaforge/envs/chabud/lib/python3.11/site-packages/zarr/hierarchy.py", line 895, in _write_op
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/home/user/mambaforge/envs/chabud/lib/python3.11/site-packages/zarr/hierarchy.py", line 927, in _create_group_nosync
    init_group(self._store, path=path, chunk_store=self._chunk_store,
  File "/home/user/mambaforge/envs/chabud/lib/python3.11/site-packages/zarr/storage.py", line 648, in init_group
    _init_group_metadata(store=store, overwrite=overwrite, path=path,
zarr.errors.ContainsGroupError: path 'e0f11e18-eed0-4eee-972e-6955c2024c19_72' contains a group

But it does produce a single_file_kerchunk.json file (truncated here for brevity):

{"version":1,"refs":{".zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_65\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_66\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_67\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_68\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_69\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_7\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_70\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_71\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_72\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_73\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_74\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_75\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_76\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_77\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_78\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_79\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_8\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_80\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_81\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_82\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_83\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_84\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_85\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_86\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_87\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_88\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_89\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_9\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_90\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_91\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_92\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_93\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_94\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_95\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_96\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_97\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_98\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_99\/.zgroup":"{\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/.zarray":"{\"chunks\":[1,64,128],\"compressor\":{\"id\":\"zlib\",\"level\":4},\"dtype\":\"|u1\",\"fill_value\":0,\"filters\":null,\"order\":\"C\",\"shape\":[1,512,512],\"zarr_format\":2}","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/.zattrs":"{\"_ARRAY_DIMENSIONS\":[\"phony_dim_0\",\"phony_dim_1\",\"phony_dim_2\"]}","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.0.0":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.0.1":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.0.2":"base64:eF7twQEN
AAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.0.3":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.1.0":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.1.1":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.1.2":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.1.3":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.2.0":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.2.1":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.2.2":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.2.3":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.3.0":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.3.1":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.3.2":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.3.3":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.4.0":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.4.1":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.4.2":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.4.3":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.5.0":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.5.1":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.5.2":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.5.3":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.6.0":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.6.1":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.6.2":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.6.3":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.7.0":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.7.1":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.7.2":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/mask\/0.7.3":"base64:eF7twQENAAAAwqD3T20ON6AAAAAAAAAAgHcDIAAAAQ==","e0f11e18-eed0-4eee-972e-6955c2024c19_64\/post_fire\/

Let me try to sort out the warnings and issues; should be solvable I think :smiley:
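
Once that’s done, the plan for reading the references back is the standard pattern from the Kerchunk Cookbook (a sketch, not yet verified for this file; since the HDF5 is nested, a group= argument pointing at one of the subgroups will probably be needed too):

import xarray as xr

# Sketch: open the kerchunk reference set as a Zarr store via fsspec's
# "reference" filesystem; remote chunk reads then go over HTTPS.
ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={
        "consolidated": False,
        "storage_options": {
            "fo": "single_file_kerchunk.json",
            "remote_protocol": "https",
        },
    },
)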

SingleHdf5ToZarr allows you to drop into pdb each time it hits an exception like this, if it helps. The warning means it basically skipped whatever part of the dataset was causing the problem.
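
That is, something like this, if I’m reading the signature in the traceback above correctly:

h5chunks = kerchunk.hdf.SingleHdf5ToZarr(inf, url, error="pdb")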

We have had some luck using Zarr from Huggingface. See this issue for details.


Thanks @rabernat, your tweet was exactly what inspired me to try this out :smiley: The dataset I’m working with is in HDF5 format though, and I’m trying to tackle 1) the nested HDF5 structure, which requires xarray-datatree to read, and 2) getting kerchunk to work so that we can read the nested HDF5 file using engine="zarr".
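
For point 1, I’m picturing something like this once I have a local copy of the file (a rough sketch, assuming open_datatree can recurse into the HDF5 groups through the h5netcdf engine; the phony dimensions may need extra handling):

import datatree

# Sketch: read the nested HDF5 group structure into a DataTree;
# extra kwargs such as phony_dims are passed through to xarray/h5netcdf.
dt = datatree.open_datatree("california_2.hdf5", engine="h5netcdf", phony_dims="access")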

Going beyond that, I’m hoping that there’s a way to use kvikIO to read those HDF5 files (via kerchunk/Zarr) directly into GPU memory (via NVIDIA GPU Direct Storage) as mentioned in Favorite way to go from netCDF (&xarray) to torch/TF/Jax et al - #8 by weiji14, but that’s getting ahead of myself a little bit. This would be a nice fit for [use case demonstration] Kvikio Direct-to-gpu -> xarray -> xbatcher -> ml model · Issue #87 · xarray-contrib/xbatcher · GitHub though!
