Hi there,
I’m trying to figure out if there’s a way to access some HDF5 files stored on what seems to be an HTTP server directly with xarray. The dataset is at chabud-team/chabud-extra at main and appears to be tracked with Git LFS. If I’m not mistaken, the URL to the dataset is https://huggingface.co/datasets/chabud-team/chabud-extra/raw/main/california_2.hdf5.
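One thing I’m unsure about is the /raw/ vs /resolve/ part of the URL: my understanding (an assumption on my part, not something I’ve verified) is that for LFS-tracked files /raw/ serves only the small Git LFS pointer text, while /resolve/ redirects to the actual file contents. A quick check along these lines should show the difference, assuming fsspec’s HTTP support (aiohttp) is installed:

import fsspec

# Assumption being tested: for LFS-tracked files, the /raw/ URL returns the
# Git LFS pointer text (which starts with b"version https://git-lfs.github.com/spec/v1"),
# while the /resolve/ URL redirects to the real file bytes.
base = "https://huggingface.co/datasets/chabud-team/chabud-extra"
for variant in ("raw", "resolve"):
    url = f"{base}/{variant}/main/california_2.hdf5"
    with fsspec.open(url, mode="rb") as f:
        print(variant, f.read(64))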
The first thing I tried was calling xarray.open_dataset directly:
import xarray as xr
xr.open_dataset(
    filename_or_obj="https://huggingface.co/datasets/chabud-team/chabud-extra/raw/main/california_2.hdf5",
    engine="h5netcdf",  # Requires h5netcdf and h5pyd to be installed
)
which gives a JSONDecodeError like so:
---------------------------------------------------------------------------
JSONDecodeError                           Traceback (most recent call last)
Cell In[5], line 1
----> 1 xr.open_dataset(
      2     filename_or_obj="https://huggingface.co/datasets/chabud-team/chabud-extra/resolve/main/california_2.hdf5",
      3     engine="h5netcdf",  # Requires h5netcdf and h5pyd to be installed
      4 )
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/api.py:525, in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, backend_kwargs, **kwargs)
    513 decoders = _resolve_decoders_kwargs(
    514     decode_cf,
    515     open_backend_dataset_parameters=backend.open_dataset_parameters,
   (...)
    521     decode_coords=decode_coords,
    522 )
    524 overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
--> 525 backend_ds = backend.open_dataset(
    526     filename_or_obj,
    527     drop_variables=drop_variables,
    528     **decoders,
    529     **kwargs,
    530 )
    531 ds = _dataset_from_backend_dataset(
    532     backend_ds,
    533     filename_or_obj,
   (...)
    541     **kwargs,
    542 )
    543 return ds
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/h5netcdf_.py:413, in H5netcdfBackendEntrypoint.open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, format, group, lock, invalid_netcdf, phony_dims, decode_vlen_strings)
    394 def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporting **kwargs
    395     self,
    396     filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
   (...)
    410     decode_vlen_strings=True,
    411 ) -> Dataset:
    412     filename_or_obj = _normalize_path(filename_or_obj)
--> 413     store = H5NetCDFStore.open(
    414         filename_or_obj,
    415         format=format,
    416         group=group,
    417         lock=lock,
    418         invalid_netcdf=invalid_netcdf,
    419         phony_dims=phony_dims,
    420         decode_vlen_strings=decode_vlen_strings,
    421     )
    423     store_entrypoint = StoreBackendEntrypoint()
    425     ds = store_entrypoint.open_dataset(
    426         store,
    427         mask_and_scale=mask_and_scale,
   (...)
    433         decode_timedelta=decode_timedelta,
    434     )
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/h5netcdf_.py:176, in H5NetCDFStore.open(cls, filename, mode, format, group, lock, autoclose, invalid_netcdf, phony_dims, decode_vlen_strings)
    173         lock = combine_locks([HDF5_LOCK, get_write_lock(filename)])
    175 manager = CachingFileManager(h5netcdf.File, filename, mode=mode, kwargs=kwargs)
--> 176 return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose)
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/h5netcdf_.py:127, in H5NetCDFStore.__init__(self, manager, group, mode, lock, autoclose)
    124 self.format = None
    125 # todo: utilizing find_root_and_group seems a bit clunky
    126 #  making filename available on h5netcdf.Group seems better
--> 127 self._filename = find_root_and_group(self.ds)[0].filename
    128 self.is_remote = is_remote_uri(self._filename)
    129 self.lock = ensure_lock(lock)
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/h5netcdf_.py:187, in H5NetCDFStore.ds(self)
    185 @property
    186 def ds(self):
--> 187     return self._acquire()
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/h5netcdf_.py:179, in H5NetCDFStore._acquire(self, needs_lock)
    178 def _acquire(self, needs_lock=True):
--> 179     with self._manager.acquire_context(needs_lock) as root:
    180         ds = _nc4_require_group(
    181             root, self._group, self._mode, create_group=_h5netcdf_create_group
    182         )
    183     return ds
File ~/mambaforge/envs/chabud/lib/python3.11/contextlib.py:137, in _GeneratorContextManager.__enter__(self)
    135 del self.args, self.kwds, self.func
    136 try:
--> 137     return next(self.gen)
    138 except StopIteration:
    139     raise RuntimeError("generator didn't yield") from None
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/file_manager.py:198, in CachingFileManager.acquire_context(self, needs_lock)
    195 @contextlib.contextmanager
    196 def acquire_context(self, needs_lock=True):
    197     """Context manager for acquiring a file."""
--> 198     file, cached = self._acquire_with_cache_info(needs_lock)
    199     try:
    200         yield file
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/file_manager.py:216, in CachingFileManager._acquire_with_cache_info(self, needs_lock)
    214     kwargs = kwargs.copy()
    215     kwargs["mode"] = self._mode
--> 216 file = self._opener(*self._args, **kwargs)
    217 if self._mode == "w":
    218     # ensure file doesn't get overridden when opened again
    219     self._mode = "a"
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/h5netcdf/core.py:1040, in File.__init__(self, path, mode, invalid_netcdf, phony_dims, **kwargs)
   1035     raise ImportError(
   1036         "No module named 'h5pyd'. h5pyd is required for "
   1037         "opening urls: {}".format(path)
   1038     )
   1039 try:
-> 1040     with h5pyd.File(path, "r", **kwargs) as f:  # noqa
   1041         pass
   1042     self._preexisting_file = True
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/h5pyd/_hl/files.py:280, in File.__init__(self, domain, mode, endpoint, username, password, bucket, api_key, use_session, use_cache, logger, owner, linked_domain, retries, timeout, **kwds)
    278 while True:
    279     try:
--> 280         rsp = http_conn.GET(req, params=params)
    281         break
    282     except IOError:
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/h5pyd/_hl/httpconn.py:493, in HttpConn.GET(self, req, format, params, headers, use_cache)
    491     if rsp.status_code == 200 and req == "/":
    492         self.log.info(f"got domain json: {len(rsp.text)} bytes")
--> 493         self._domain_json = json.loads(rsp.text)
    495 # when calling AWS Lambda thru API Gatway, the status_code
    496 # indicates the Lambda request was successful, but not necessarily
    497 # the requested HSDS action was.
    498 # Check here and raise IOError is needed.
    499 if (
    500     rsp.status_code == 200
    501     and content_type
    502     and content_type.startswith("application/json")
    503 ):
File ~/mambaforge/envs/chabud/lib/python3.11/json/__init__.py:346, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    341     s = s.decode(detect_encoding(s), 'surrogatepass')
    343 if (cls is None and object_hook is None and
    344         parse_int is None and parse_float is None and
    345         parse_constant is None and object_pairs_hook is None and not kw):
--> 346     return _default_decoder.decode(s)
    347 if cls is None:
    348     cls = JSONDecoder
File ~/mambaforge/envs/chabud/lib/python3.11/json/decoder.py:337, in JSONDecoder.decode(self, s, _w)
    332 def decode(self, s, _w=WHITESPACE.match):
    333     """Return the Python representation of ``s`` (a ``str`` instance
    334     containing a JSON document).
    335 
    336     """
--> 337     obj, end = self.raw_decode(s, idx=_w(s, 0).end())
    338     end = _w(s, end).end()
    339     if end != len(s):
File ~/mambaforge/envs/chabud/lib/python3.11/json/decoder.py:355, in JSONDecoder.raw_decode(self, s, idx)
    353     obj, end = self.scan_once(s, idx)
    354 except StopIteration as err:
--> 355     raise JSONDecodeError("Expecting value", s, err.value) from None
    356 return obj, end
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
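If I read the traceback right, h5netcdf treats any URL as an h5pyd target, and h5pyd is a client for HSDS (the HDF Scalable Data Service), which speaks a JSON REST API; pointing it at a plain static-file URL presumably makes it try to parse the ordinary HTTP response as JSON, hence the JSONDecodeError. Since the open_dataset signature in the traceback says filename_or_obj can also be a file-like object, here’s an untested sketch of opening the file through fsspec instead, so h5netcdf would go through h5py rather than h5pyd:

import fsspec
import xarray as xr

# Untested sketch: pass an open fsspec file object instead of a URL string,
# so the h5netcdf engine reads it via h5py rather than dispatching to h5pyd.
url = "https://huggingface.co/datasets/chabud-team/chabud-extra/resolve/main/california_2.hdf5"
with fsspec.open(url, mode="rb") as f:
    ds = xr.open_dataset(f, engine="h5netcdf")
    print(ds)

(Anything read lazily after the with-block exits would fail on a closed file, so any data needed later would have to be loaded inside the block.)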
Next, I tried following the Kerchunk Basics page of the Kerchunk Cookbook, with code like this:
import fsspec
import kerchunk.hdf
import ujson
import xarray as xr

url = "https://huggingface.co/datasets/chabud-team/chabud-extra/raw/main/california_2.hdf5"
with fsspec.open(urlpath=url) as inf:
    # Build kerchunk reference metadata for the single HDF5 file
    h5chunks = kerchunk.hdf.SingleHdf5ToZarr(inf, url)
    # Serialize the kerchunk references to JSON
    with open("single_file_kerchunk.json", "wb") as f:
        f.write(ujson.dumps(h5chunks.translate()).encode())
But that fails with an OSError:
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
Cell In[9], line 2
      1 with fsspec.open(urlpath=url) as inf:
----> 2     h5chunks = kerchunk.hdf.SingleHdf5ToZarr(inf, url)
      3     h5chunks.translate()
      4     with open("single_file_kerchunk.json", "wb") as f:
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/kerchunk/hdf.py:92, in SingleHdf5ToZarr.__init__(self, h5f, url, spec, inline_threshold, storage_options, error, vlen_encode)
     90     raise NotImplementedError
     91 self.vlen = vlen_encode
---> 92 self._h5f = h5py.File(self.input_file, mode="r")
     94 self.store = {}
     95 self._zroot = zarr.group(store=self.store, overwrite=True)
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/h5py/_hl/files.py:567, in File.__init__(self, name, mode, driver, libver, userblock_size, swmr, rdcc_nslots, rdcc_nbytes, rdcc_w0, track_order, fs_strategy, fs_persist, fs_threshold, fs_page_size, page_buf_size, min_meta_keep, min_raw_keep, locking, alignment_threshold, alignment_interval, meta_block_size, **kwds)
    558     fapl = make_fapl(driver, libver, rdcc_nslots, rdcc_nbytes, rdcc_w0,
    559                      locking, page_buf_size, min_meta_keep, min_raw_keep,
    560                      alignment_threshold=alignment_threshold,
    561                      alignment_interval=alignment_interval,
    562                      meta_block_size=meta_block_size,
    563                      **kwds)
    564     fcpl = make_fcpl(track_order=track_order, fs_strategy=fs_strategy,
    565                      fs_persist=fs_persist, fs_threshold=fs_threshold,
    566                      fs_page_size=fs_page_size)
--> 567     fid = make_fid(name, mode, userblock_size, fapl, fcpl, swmr=swmr)
    569 if isinstance(libver, tuple):
    570     self._libver = libver
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/h5py/_hl/files.py:231, in make_fid(name, mode, userblock_size, fapl, fcpl, swmr)
    229     if swmr and swmr_support:
    230         flags |= h5f.ACC_SWMR_READ
--> 231     fid = h5f.open(name, flags, fapl=fapl)
    232 elif mode == 'r+':
    233     fid = h5f.open(name, h5f.ACC_RDWR, fapl=fapl)
File h5py/_objects.pyx:54, in h5py._objects.with_phil.wrapper()
File h5py/_objects.pyx:55, in h5py._objects.with_phil.wrapper()
File h5py/h5f.pyx:106, in h5py.h5f.open()
OSError: Unable to synchronously open file (file signature not found)
I’m wondering if there’s some missing argument to fsspec.open that I need to pass. There shouldn’t be any authentication or token required (clicking on the link works directly in the browser), so I’m not sure what else is needed.
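For what it’s worth, the "file signature not found" message makes me suspect h5py isn’t seeing HDF5 bytes at all, which would tie back to the /raw/ vs /resolve/ question above. A minimal check of the first eight bytes against the HDF5 signature:

import fsspec

# A valid HDF5 file starts with the 8-byte signature b"\x89HDF\r\n\x1a\n";
# if the URL serves something else (e.g. a Git LFS pointer), this prints False.
url = "https://huggingface.co/datasets/chabud-team/chabud-extra/raw/main/california_2.hdf5"
with fsspec.open(url, mode="rb") as f:
    print(f.read(8) == b"\x89HDF\r\n\x1a\n")

If that prints False for the /raw/ URL, that would at least explain the missing signature.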