Hello!
I am trying to download a single variable for a small area in South America. It appears that, after the spatial subset, everything should fit in memory. However, once I actually trigger the load into memory, it takes a very long time (>40 min so far). I suspect the spatial subset is what is making everything so slow. Is there a better way of doing this? I am very new to working with S3 datasets, but I want to create a Jupyter notebook that generates every figure for a paper without anyone having to download the datasets first.
Here’s the code:
# %%
### Importing all the necessary packages ###
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import pandas as pd
import cartopy.crs as ccrs
import cartopy
from matplotlib.offsetbox import AnchoredText
import cartopy.feature as cfeature
import scipy.fft as sf
from scipy import signal
from scipy.stats import circmean
from scipy import optimize
from mpl_toolkits.axes_grid1 import make_axes_locatable
import time
import intake
# %%
# Start a local Dask cluster so the Zarr reads happen lazily and in parallel
from dask.distributed import Client
client = Client()
client
# %%
# Open the intake-esm catalog for the CESM1 Large Ensemble on AWS
catalog_url = 'https://ncar-cesm-lens.s3-us-west-2.amazonaws.com/catalogs/aws-cesm1-le.json'
col = intake.open_esm_datastore(catalog_url)
col
# %%
# Search the catalog for daily atmospheric total precipitation (PRECT)
col_subset = col.search(frequency=["daily"], component="atm", variable="PRECT",
                        experiment=["20C", "RCP85", "HIST"])
# %%
# Open the matching Zarr stores lazily (anonymous S3 access) into a dictionary of datasets
dsets = col_subset.to_dataset_dict(zarr_kwargs={"consolidated": True}, storage_options={"anon": True})
print(f"\nDataset dictionary keys:\n {dsets.keys()}")
# %%
# Pull out the PRECT DataArray for each experiment
ds_HIST = dsets['atm.HIST.daily']['PRECT']
ds_20C = dsets['atm.20C.daily']['PRECT']
ds_RCP85 = dsets['atm.RCP85.daily']['PRECT']
# %%
### Peru Domain ###
# CESM longitudes run 0-360, so convert from degrees west
min_lon = -83 + 360
min_lat = -18.0
max_lon = -67 + 360
max_lat = 0.0
# Spatially subset each experiment, then pull the data into memory
subset20C = ds_20C.sel(lat=slice(min_lat, max_lat), lon=slice(min_lon, max_lon)).load()
subsetRCP85 = ds_RCP85.sel(lat=slice(min_lat, max_lat), lon=slice(min_lon, max_lon)).load()
subsetHIST = ds_HIST.sel(lat=slice(min_lat, max_lat), lon=slice(min_lon, max_lon)).load()
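For reference, this is roughly how I have been checking the size and chunk layout of the subset before calling .load() (a minimal sketch reusing the lazy ds_20C defined above; I have not verified the exact chunk sizes for this catalog):
# %%
# Inspect the lazy subset before loading: .nbytes is the uncompressed size it
# would occupy in memory, and .chunks shows how the underlying store is chunked,
# which determines how much data actually has to be pulled from S3 for this region.
lazy20C = ds_20C.sel(lat=slice(min_lat, max_lat), lon=slice(min_lon, max_lon))
print(f"Subset size in memory: {lazy20C.nbytes / 1e9:.2f} GB")
print(f"Dask chunks: {lazy20C.chunks}")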
Thanks for any insight!