I am converting my half hourly data in hdf files to zarr using xarray. I read the file and then using to_zarr I keep appending in zarr store.Is there any way it can take any less time ,each file initialy was taking 10 seconds to append but now its increased to 30 seconds.Is there any particular way to convert data proeprly?
Below is the code I am using
import xarray as xr
import numpy as np
import os
import pandas as pd
from numcodecs import Blosc
import gc
import time
lat2d = np.load('latitude.npy')
lon2d = np.load('longitude.npy')
lat1d = lat2d[:,0]
lon1d = lon2d[0,:]
input_base_dir = r"D:\jaysheel"
output_dir = r"D:\zarrdata"
store = os.path.join(output_dir, "insat2.zarr")
compressor = Blosc(cname="zstd")
def extract_timestamp(filename):
return pd.to_datetime(filename.split('_')[1] + filename.split('_')[2],
format='%d%b%Y%H%M')
def process_file(fp):
ds = xr.open_dataset(fp, engine="h5netcdf")
for v in ds.data_vars:
if "time" not in ds[v].dims:
ds[v] = ds[v].expand_dims("time")
ds = ds.assign_coords(
lat2d=(("Y","X"), lat2d),
lon2d=(("Y","X"), lon2d),
lat=("Y", lat1d),
lon=("X", lon1d),
)
return ds
all_files = []
for year in [2021, 2022, 2023, 2024]:
d = os.path.join(input_base_dir, f"INSAT_{year}")
flist = sorted([os.path.join(d, f) for f in os.listdir(d) if f.endswith(".h5")],
key=lambda x: extract_timestamp(os.path.basename(x)))
all_files.extend(flist)
first = True
total_start = time.time()
for i, fp in enumerate(all_files, 1):
try:
file_start = time.time()
print(f"[{i}/{len(all_files)}] Processing: {os.path.basename(fp)}")
ds = process_file(fp)
enc = {}
for v in ds.data_vars:
if len(ds[v].dims) == 3:
ds[v] = ds[v].chunk({"time": 1, "Y": 512, "X": 512})
enc[v] = {"compressor": compressor}
if first:
ds.to_zarr(store, mode="w", consolidated=True, encoding=enc,zarr_format=2)
first = False
else:
ds.to_zarr(store, mode="a", append_dim="time", consolidated=True,zarr_format=2)
ds.close()
del ds
gc.collect()
file_end = time.time()
print(f"Ingested in {file_end - file_start:.2f} ")
except Exception as e:
print(f"Skipping file {fp} due to error: {e}")
continue
total_end = time.time()
print(f" Total ingestion time: {total_end - total_start:.2f} seconds")
Also script stops randomly in between without errors. I am on windows.Can it be an issue with hdf library? I have checked the data files ,all are fine none of em is corrupt.


