Hi there,
I’m trying to figure out if there’s a way to access some HDF5 files stored on what seems to be an HTTP server directly with xarray. The dataset lives in the chabud-team/chabud-extra repository on Hugging Face (main branch) and appears to be stored with Git LFS. If I’m not mistaken, the URL to the dataset is https://huggingface.co/datasets/chabud-team/chabud-extra/raw/main/california_2.hdf5.
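For context, I know I could just download a local copy with huggingface_hub and open that, but the whole point is to read the file remotely without downloading it first. A sketch of the workaround I’d like to avoid (untested, with the repo and filename taken from the URL above):

```python
# Download-first workaround (what I'm trying to avoid); untested sketch
from huggingface_hub import hf_hub_download

local_path = hf_hub_download(
    repo_id="chabud-team/chabud-extra",
    filename="california_2.hdf5",
    repo_type="dataset",  # it's a dataset repo, not a model repo
)
```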
The first thing I tried was calling `xarray.open_dataset` directly:
```python
import xarray as xr

xr.open_dataset(
    filename_or_obj="https://huggingface.co/datasets/chabud-team/chabud-extra/raw/main/california_2.hdf5",
    engine="h5netcdf",  # requires h5netcdf and h5pyd to be installed
)
```
which gives a `JSONDecodeError` like so:
```
---------------------------------------------------------------------------
JSONDecodeError Traceback (most recent call last)
Cell In[5], line 1
----> 1 xr.open_dataset(
2 filename_or_obj="https://huggingface.co/datasets/chabud-team/chabud-extra/resolve/main/california_2.hdf5",
3 engine="h5netcdf", # Requires h5netcdf and h5pyd to be installed
4 )
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/api.py:525, in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, backend_kwargs, **kwargs)
513 decoders = _resolve_decoders_kwargs(
514 decode_cf,
515 open_backend_dataset_parameters=backend.open_dataset_parameters,
(...)
521 decode_coords=decode_coords,
522 )
524 overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
--> 525 backend_ds = backend.open_dataset(
526 filename_or_obj,
527 drop_variables=drop_variables,
528 **decoders,
529 **kwargs,
530 )
531 ds = _dataset_from_backend_dataset(
532 backend_ds,
533 filename_or_obj,
(...)
541 **kwargs,
542 )
543 return ds
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/h5netcdf_.py:413, in H5netcdfBackendEntrypoint.open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, format, group, lock, invalid_netcdf, phony_dims, decode_vlen_strings)
394 def open_dataset( # type: ignore[override] # allow LSP violation, not supporting **kwargs
395 self,
396 filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
(...)
410 decode_vlen_strings=True,
411 ) -> Dataset:
412 filename_or_obj = _normalize_path(filename_or_obj)
--> 413 store = H5NetCDFStore.open(
414 filename_or_obj,
415 format=format,
416 group=group,
417 lock=lock,
418 invalid_netcdf=invalid_netcdf,
419 phony_dims=phony_dims,
420 decode_vlen_strings=decode_vlen_strings,
421 )
423 store_entrypoint = StoreBackendEntrypoint()
425 ds = store_entrypoint.open_dataset(
426 store,
427 mask_and_scale=mask_and_scale,
(...)
433 decode_timedelta=decode_timedelta,
434 )
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/h5netcdf_.py:176, in H5NetCDFStore.open(cls, filename, mode, format, group, lock, autoclose, invalid_netcdf, phony_dims, decode_vlen_strings)
173 lock = combine_locks([HDF5_LOCK, get_write_lock(filename)])
175 manager = CachingFileManager(h5netcdf.File, filename, mode=mode, kwargs=kwargs)
--> 176 return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose)
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/h5netcdf_.py:127, in H5NetCDFStore.__init__(self, manager, group, mode, lock, autoclose)
124 self.format = None
125 # todo: utilizing find_root_and_group seems a bit clunky
126 # making filename available on h5netcdf.Group seems better
--> 127 self._filename = find_root_and_group(self.ds)[0].filename
128 self.is_remote = is_remote_uri(self._filename)
129 self.lock = ensure_lock(lock)
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/h5netcdf_.py:187, in H5NetCDFStore.ds(self)
185 @property
186 def ds(self):
--> 187 return self._acquire()
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/h5netcdf_.py:179, in H5NetCDFStore._acquire(self, needs_lock)
178 def _acquire(self, needs_lock=True):
--> 179 with self._manager.acquire_context(needs_lock) as root:
180 ds = _nc4_require_group(
181 root, self._group, self._mode, create_group=_h5netcdf_create_group
182 )
183 return ds
File ~/mambaforge/envs/chabud/lib/python3.11/contextlib.py:137, in _GeneratorContextManager.__enter__(self)
135 del self.args, self.kwds, self.func
136 try:
--> 137 return next(self.gen)
138 except StopIteration:
139 raise RuntimeError("generator didn't yield") from None
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/file_manager.py:198, in CachingFileManager.acquire_context(self, needs_lock)
195 @contextlib.contextmanager
196 def acquire_context(self, needs_lock=True):
197 """Context manager for acquiring a file."""
--> 198 file, cached = self._acquire_with_cache_info(needs_lock)
199 try:
200 yield file
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/xarray/backends/file_manager.py:216, in CachingFileManager._acquire_with_cache_info(self, needs_lock)
214 kwargs = kwargs.copy()
215 kwargs["mode"] = self._mode
--> 216 file = self._opener(*self._args, **kwargs)
217 if self._mode == "w":
218 # ensure file doesn't get overridden when opened again
219 self._mode = "a"
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/h5netcdf/core.py:1040, in File.__init__(self, path, mode, invalid_netcdf, phony_dims, **kwargs)
1035 raise ImportError(
1036 "No module named 'h5pyd'. h5pyd is required for "
1037 "opening urls: {}".format(path)
1038 )
1039 try:
-> 1040 with h5pyd.File(path, "r", **kwargs) as f: # noqa
1041 pass
1042 self._preexisting_file = True
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/h5pyd/_hl/files.py:280, in File.__init__(self, domain, mode, endpoint, username, password, bucket, api_key, use_session, use_cache, logger, owner, linked_domain, retries, timeout, **kwds)
278 while True:
279 try:
--> 280 rsp = http_conn.GET(req, params=params)
281 break
282 except IOError:
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/h5pyd/_hl/httpconn.py:493, in HttpConn.GET(self, req, format, params, headers, use_cache)
491 if rsp.status_code == 200 and req == "/":
492 self.log.info(f"got domain json: {len(rsp.text)} bytes")
--> 493 self._domain_json = json.loads(rsp.text)
495 # when calling AWS Lambda thru API Gatway, the status_code
496 # indicates the Lambda request was successful, but not necessarily
497 # the requested HSDS action was.
498 # Check here and raise IOError is needed.
499 if (
500 rsp.status_code == 200
501 and content_type
502 and content_type.startswith("application/json")
503 ):
File ~/mambaforge/envs/chabud/lib/python3.11/json/__init__.py:346, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
341 s = s.decode(detect_encoding(s), 'surrogatepass')
343 if (cls is None and object_hook is None and
344 parse_int is None and parse_float is None and
345 parse_constant is None and object_pairs_hook is None and not kw):
--> 346 return _default_decoder.decode(s)
347 if cls is None:
348 cls = JSONDecoder
File ~/mambaforge/envs/chabud/lib/python3.11/json/decoder.py:337, in JSONDecoder.decode(self, s, _w)
332 def decode(self, s, _w=WHITESPACE.match):
333 """Return the Python representation of ``s`` (a ``str`` instance
334 containing a JSON document).
335
336 """
--> 337 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
338 end = _w(s, end).end()
339 if end != len(s):
File ~/mambaforge/envs/chabud/lib/python3.11/json/decoder.py:355, in JSONDecoder.raw_decode(self, s, idx)
353 obj, end = self.scan_once(s, idx)
354 except StopIteration as err:
--> 355 raise JSONDecodeError("Expecting value", s, err.value) from None
356 return obj, end
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
```
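If I’m reading the traceback right, when the h5netcdf engine gets a URL it hands the path off to h5pyd, which speaks the HSDS REST API and expects JSON responses rather than raw HDF5 bytes, so a plain file URL may simply be the wrong kind of input for it. My (possibly wrong) mental model of how h5pyd is meant to be called, with a hypothetical HSDS endpoint:

```python
import h5pyd

# h5pyd talks to an HSDS server, addressing data by "domain" rather than
# by file URL; the domain and endpoint below are hypothetical
f = h5pyd.File(
    "/shared/california_2.h5",                # HSDS domain path, not a file path
    mode="r",
    endpoint="http://hsds.example.com:5101",  # hypothetical HSDS endpoint
)
```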
Next, I tried following the Kerchunk Basics chapter of the Kerchunk Cookbook, with some code like this:
```python
import fsspec
import kerchunk.hdf
import ujson  # needed for serializing the references below
import xarray as xr

url = "https://huggingface.co/datasets/chabud-team/chabud-extra/raw/main/california_2.hdf5"

with fsspec.open(urlpath=url) as inf:
    h5chunks = kerchunk.hdf.SingleHdf5ToZarr(inf, url)
    # Translate the HDF5 metadata once and write it to a Kerchunk reference file
    with open("single_file_kerchunk.json", "wb") as f:
        f.write(ujson.dumps(h5chunks.translate()).encode())
```
But that fails with an `OSError`:
```
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
Cell In[9], line 2
1 with fsspec.open(urlpath=url) as inf:
----> 2 h5chunks = kerchunk.hdf.SingleHdf5ToZarr(inf, url)
3 h5chunks.translate()
4 with open("single_file_kerchunk.json", "wb") as f:
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/kerchunk/hdf.py:92, in SingleHdf5ToZarr.__init__(self, h5f, url, spec, inline_threshold, storage_options, error, vlen_encode)
90 raise NotImplementedError
91 self.vlen = vlen_encode
---> 92 self._h5f = h5py.File(self.input_file, mode="r")
94 self.store = {}
95 self._zroot = zarr.group(store=self.store, overwrite=True)
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/h5py/_hl/files.py:567, in File.__init__(self, name, mode, driver, libver, userblock_size, swmr, rdcc_nslots, rdcc_nbytes, rdcc_w0, track_order, fs_strategy, fs_persist, fs_threshold, fs_page_size, page_buf_size, min_meta_keep, min_raw_keep, locking, alignment_threshold, alignment_interval, meta_block_size, **kwds)
558 fapl = make_fapl(driver, libver, rdcc_nslots, rdcc_nbytes, rdcc_w0,
559 locking, page_buf_size, min_meta_keep, min_raw_keep,
560 alignment_threshold=alignment_threshold,
561 alignment_interval=alignment_interval,
562 meta_block_size=meta_block_size,
563 **kwds)
564 fcpl = make_fcpl(track_order=track_order, fs_strategy=fs_strategy,
565 fs_persist=fs_persist, fs_threshold=fs_threshold,
566 fs_page_size=fs_page_size)
--> 567 fid = make_fid(name, mode, userblock_size, fapl, fcpl, swmr=swmr)
569 if isinstance(libver, tuple):
570 self._libver = libver
File ~/mambaforge/envs/chabud/lib/python3.11/site-packages/h5py/_hl/files.py:231, in make_fid(name, mode, userblock_size, fapl, fcpl, swmr)
229 if swmr and swmr_support:
230 flags |= h5f.ACC_SWMR_READ
--> 231 fid = h5f.open(name, flags, fapl=fapl)
232 elif mode == 'r+':
233 fid = h5f.open(name, h5f.ACC_RDWR, fapl=fapl)
File h5py/_objects.pyx:54, in h5py._objects.with_phil.wrapper()
File h5py/_objects.pyx:55, in h5py._objects.with_phil.wrapper()
File h5py/h5f.pyx:106, in h5py.h5f.open()
OSError: Unable to synchronously open file (file signature not found)
```
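Reading that message, “file signature not found” makes me suspect the bytes coming back from that URL aren’t actually HDF5. A quick sanity check (a minimal sketch, assuming nothing beyond fsspec) would be to peek at the first 8 bytes, which for a valid HDF5 file should be the signature `b"\x89HDF\r\n\x1a\n"`:

```python
import fsspec

url = "https://huggingface.co/datasets/chabud-team/chabud-extra/raw/main/california_2.hdf5"

# A valid HDF5 file always starts with the 8-byte signature b"\x89HDF\r\n\x1a\n";
# anything else (e.g. a Git LFS pointer or an HTML page) would mean we're
# not getting the real file contents
with fsspec.open(url, mode="rb") as f:
    print(f.read(8))
```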
I’m wondering if there’s some missing argument to `fsspec.open` that I need to pass. There shouldn’t be any authentication or token needed (clicking the link works directly in the browser), so I’m not sure what else is missing.
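For reference, if the Kerchunk route can be made to work, the follow-up step from the Cookbook that I was aiming for looks roughly like this (untested, since the translation step above is what fails):

```python
import fsspec
import xarray as xr

# Open the dataset through the Kerchunk reference file generated above,
# using fsspec's "reference" filesystem and xarray's zarr engine
fs = fsspec.filesystem("reference", fo="single_file_kerchunk.json")
ds = xr.open_dataset(
    fs.get_mapper(""),
    engine="zarr",
    backend_kwargs={"consolidated": False},
)
```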