Access GES DISC NASA dataset using xarray and dask on a cluster


I am trying to access the MERRA-2 dataset through its OPeNDAP links with xarray.
The code below is based on a tutorial that @betolink sent me as an example.

The code runs well if parallel=False, but returns OSError: [Errno -70] NetCDF: DAP server error if I set parallel=True, whether or not I create the cluster.

@betolink suspected that the workers don't have the authentication credentials and suggested doing something like what is mentioned in @rsignell's issue.

That would involve adding client.register_worker_plugin(UploadFile('~/.netrc')) after creating the client. I tested that as well, but it returned the same error. In the code below I had to replace ~/.netrc with the full path because it was returning a file-not-found error.
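(As an aside, the file-not-found error with ~/.netrc is expected: Python's open() does not expand "~", so the plugin needs an absolute path. A tiny helper, hypothetical name resolve, avoids hard-coding it:)

```python
import os

def resolve(path):
    """Expand "~" and environment variables so open() can find the file."""
    return os.path.abspath(os.path.expanduser(os.path.expandvars(path)))

# "~/.netrc" now points at the real home-directory file
print(resolve("~/.netrc"))  # e.g. /home/username/.netrc
```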

It is important to say that parallel=True works fine on my local computer running Ubuntu via WSL.

Has anyone faced this problem before, or have any guesses on how to solve this issue?

# ----------------------------------
# Import Python modules
# ----------------------------------

import warnings


import xarray as xr
import matplotlib.pyplot as plt
from calendar import monthrange

create_cluster = True
parallel = True
upload_file = True

if create_cluster:
    # --------------------------------------
    # Creating 50 workers with 1 core and 40 GB each
    # --------------------------------------
    import os
    from dask_jobqueue import SLURMCluster
    from dask.distributed import Client
    from dask.distributed import WorkerPlugin

    class UploadFile(WorkerPlugin):
        """A WorkerPlugin to upload a local file to workers.

        Parameters
        ----------
        filepath: str
            A path to the file to upload

        Examples
        --------
        >>> client.register_worker_plugin(UploadFile(".env"))
        """

        def __init__(self, filepath):
            """Initialize the plugin by reading in the data from the given file."""
            self.filename = os.path.basename(filepath)
            self.dirname = os.path.dirname(filepath)
            with open(filepath, "rb") as f:
                self.data = f.read()

        async def setup(self, worker):
            # Recreate the directory and write the file on each worker
            if not os.path.exists(self.dirname):
                os.makedirs(self.dirname)
            with open(self.filename, "wb+") as f:
                f.write(self.data)
            return os.listdir()
    cluster = SLURMCluster(cores=1, memory="40GB")
    cluster.scale(50)  # request the 50 workers

    client = Client(cluster)  # Connect this local process to remote workers
    if upload_file:
        # NOTE: the full path is needed here; open() does not expand "~"
        client.register_worker_plugin(UploadFile('/path/to/.netrc'))  # full path to ~/.netrc

# ---------------------------------
# Read data
# ---------------------------------
# MERRA-2 collection (hourly)
collection_shortname = 'M2T1NXAER'
collection_longname  = 'tavg1_2d_aer_Nx'
collection_number = 'MERRA2_400'  
MERRA2_version = '5.12.4'
year = 2020
# Open dataset
# Read selected days in the same month and year
month = 1  # January
day_beg = 1
day_end = 31
# Note that collection_number is MERRA2_401 in a few cases, refer to "Records of MERRA-2 Data Reprocessing and Service Changes"
if year == 2020 and month == 9:
    collection_number = 'MERRA2_401'

url = '{}.{}/{}/{:0>2d}'.format(collection_shortname, MERRA2_version, year, month)
files_month = ['{}/{}.{}.{}{:0>2d}{:0>2d}.nc4'.format(url, collection_number, collection_longname, year, month, day)
               for day in range(day_beg, day_end + 1)]
# Get the number of files
len_files_month = len(files_month)

# Print
print("{} files to be opened:".format(len_files_month))
print("files_month", files_month)

# Read dataset URLs
ds = xr.open_mfdataset(files_month, parallel=parallel)
# View metadata (function like ncdump -c)
print(ds)

It might be related to the issue, but the code below is for a different GES DISC dataset served from a different server (goldsmr5), and it also breaks with parallel=True. The same code works fine with parallel=False.

import xarray as xr
import pandas as pd

# list daily timestamps from a starting date to an end date
dts = pd.date_range(start = "2013-01-01", end = "2022-12-01", freq = "D")

# base url for the files
base = ""
# prefix of the files
prefix = "MERRA2_400.inst3_3d_asm_Np"

# function to construct the url of the files
url = lambda d: f"{base}{d.year}/{d.month:02d}/{prefix}.{d.year}{d.month:02d}{d.day:02d}.nc4"

# list of urls for the files
fnames = [url(dti) for dti in dts]

# lets try to load the first 5 files
ds = xr.open_mfdataset(fnames[:5], parallel=True)
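For reference, assuming the last field of the filename is the zero-padded day of month (the timestamps are daily, so that seems to be what the empty placeholder in the f-string was meant to hold), a standalone version of the URL builder looks like this:

```python
from datetime import date

base = ""  # server base URL omitted, as above
prefix = "MERRA2_400.inst3_3d_asm_Np"

def make_url(d):
    # YYYY/MM directory, then PREFIX.YYYYMMDD.nc4 (day assumed for the last field)
    return f"{base}{d.year}/{d.month:02d}/{prefix}.{d.year}{d.month:02d}{d.day:02d}.nc4"

print(make_url(date(2013, 1, 1)))
# 2013/01/MERRA2_400.inst3_3d_asm_Np.20130101.nc4
```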

The error:

OSError: [Errno -51] NetCDF: Unknown file format: b''
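The b'' in that message suggests the library received an empty body (or something other than NetCDF, such as an HTML login page) rather than a data file, which again points at authentication on the workers. One hedged way to diagnose this is to fetch the first bytes of a file URL (e.g. with requests, which by default picks up ~/.netrc credentials when trust_env is enabled) and check them against the NetCDF magic numbers, with a hypothetical helper like this:

```python
def looks_like_netcdf(first_bytes: bytes) -> bool:
    """Check for the classic NetCDF ("CDF") or NetCDF-4/HDF5 magic numbers.

    An empty body or an HTML page (as an unauthenticated Earthdata
    redirect would produce) fails this check.
    """
    return first_bytes.startswith(b"CDF") or first_bytes.startswith(b"\x89HDF\r\n\x1a\n")

# examples:
print(looks_like_netcdf(b"CDF\x01..."))       # classic NetCDF-3 header -> True
print(looks_like_netcdf(b""))                 # empty body, like the error above -> False
print(looks_like_netcdf(b"<!DOCTYPE html>"))  # login/redirect page -> False
```

Running this check on a worker (via client.run, for instance) would show whether the DAP server is returning data or a login page to the workers specifically.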