Hi,
I am trying to access the MERRA-2 dataset through OPeNDAP links with xarray.
The code below is based on a tutorial that @betolink sent me as an example.
The code runs well if parallel=False, but it raises "OSError: [Errno -70] NetCDF: DAP server error" if I set parallel=True, regardless of whether I create the cluster or not.
@betolink suspected that the workers don't know the authentication credentials and suggested that I do something like what is mentioned in @rsignell's issue.
That would involve adding client.register_worker_plugin(UploadFile('~/.netrc')) after creating the client. I also tested that, but it returned the same error. In the code below I had to replace ~/.netrc with the full path because it was raising a file-not-found error.
It is important to note that parallel=True works fine on my local computer, which runs Ubuntu under WSL.
Has anyone faced this problem before, or does anyone have any ideas on how to solve this issue?
# ----------------------------------
# Import Python modules
# ----------------------------------
import warnings
warnings.filterwarnings("ignore")
import xarray as xr
import matplotlib.pyplot as plt
from calendar import monthrange
# Run-time switches for this script.
# create_cluster: when True, the `if create_cluster:` section below builds a
# SLURMCluster and connects a dask Client to it.
create_cluster = True
# parallel: passed unchanged as the `parallel` argument of xr.open_mfdataset.
parallel = True
# upload_file: when True, the UploadFile plugin is registered on the client.
# It is only consulted inside the `if create_cluster:` section.
upload_file = True
if create_cluster:
    # --------------------------------------
    # Creating 10 SLURM jobs, each with 1 core and 40 GB of memory
    # --------------------------------------
    import os
    from dask_jobqueue import SLURMCluster
    from dask.distributed import Client
    from dask.distributed import WorkerPlugin
    class UploadFile(WorkerPlugin):
        """A WorkerPlugin that writes a copy of a client-side file on each worker.

        The file is read once, when the plugin is constructed, and its bytes
        are kept on the instance. ``setup`` then writes those bytes to the
        same directory/file name on the worker, creating the directory if
        needed. The worker's current working directory is left untouched.

        Parameters
        ----------
        filepath: str
            Path of the file to upload. A leading ``~`` is expanded: on the
            constructing side when the file is read, and again inside
            ``setup`` when the destination is resolved, so it follows the
            home directory of whichever process runs ``setup``. A bare file
            name (no directory part) is written to the current working
            directory of the process running ``setup``.

        Attributes
        ----------
        filename: str
            Base name of ``filepath``.
        dirname: str
            Directory part of ``filepath``, stored unexpanded (may be ``""``).
        data: bytes
            Raw content of the file as read at construction time.

        Raises
        ------
        OSError
            From ``__init__`` if ``filepath`` cannot be opened or read.

        Examples
        --------
        >>> client.register_worker_plugin(UploadFile(".env"))
        >>> client.register_worker_plugin(UploadFile("~/.netrc"))
        """

        def __init__(self, filepath):
            """
            Initialize the plugin by reading in the data from the given file.
            """
            self.filename = os.path.basename(filepath)
            # Stored unexpanded on purpose: "~" is resolved again in setup(),
            # against the home directory of the process that runs it.
            self.dirname = os.path.dirname(filepath)
            with open(os.path.expanduser(filepath), "rb") as f:
                self.data = f.read()

        async def setup(self, worker):
            """Write the stored bytes to disk and return the directory listing.

            Parameters
            ----------
            worker
                The worker this plugin is being set up on (not used here).

            Returns
            -------
            list of str
                Entries of the directory the file was written to.
            """
            # An empty dirname (bare file name) means "current directory";
            # os.mkdir("") would raise, so fall back to os.curdir.
            target_dir = os.path.expanduser(self.dirname) or os.curdir
            # exist_ok avoids a check-then-create race when several workers
            # share a filesystem, and makedirs also creates missing parents.
            os.makedirs(target_dir, exist_ok=True)
            # Write through the full path instead of os.chdir(), which would
            # change the working directory of the whole worker process.
            with open(os.path.join(target_dir, self.filename), "wb") as f:
                f.write(self.data)
            return os.listdir(target_dir)
    
    # Describe the SLURM job to submit: cores=1 and memory="40GB" are the
    # resources handed to SLURMCluster (per-job values in dask-jobqueue --
    # confirm against its documentation).
    cluster = SLURMCluster(cores=1, memory="40GB")
    # Ask the cluster to scale to 10 such jobs.
    cluster.scale(jobs=10)
    client = Client(cluster)  # Connect this local process to remote workers
    if upload_file:
        # Register the plugin so the .netrc file at this absolute path is
        # written on the workers (presumably it holds the login credentials
        # needed for the OPeNDAP server -- verify).
        client.register_worker_plugin(UploadFile('/home/isimoesdesousa/.netrc'))
# ---------------------------------
# Read data
# ---------------------------------
# MERRA-2 collection (hourly)
collection_shortname = 'M2T1NXAER'
collection_longname = 'tavg1_2d_aer_Nx'
collection_number = 'MERRA2_400'
MERRA2_version = '5.12.4'
year = 2020

# Open dataset
# Read selected days in the same month and year
month = 1  # January
day_beg = 1
day_end = 31
# Clamp the last day to the real length of the month so that changing
# `month` (e.g. to February) never generates URLs for days that do not exist.
day_end = min(day_end, monthrange(year, month)[1])

# Note that collection_number is MERRA2_401 in a few cases, refer to
# "Records of MERRA-2 Data Reprocessing and Service Changes"
if year == 2020 and month == 9:
    collection_number = 'MERRA2_401'

# OPeNDAP URL of the directory for the selected year/month
# (month is zero-padded to two digits).
url = 'https://goldsmr4.gesdisc.eosdis.nasa.gov/opendap/MERRA2/{}.{}/{}/{:0>2d}'.format(
    collection_shortname, MERRA2_version, year, month)
# One file URL per selected day, day_beg..day_end inclusive.
files_month = [
    '{}/{}.{}.{}{:0>2d}{:0>2d}.nc4'.format(
        url, collection_number, collection_longname, year, month, day)
    for day in range(day_beg, day_end + 1)
]
# Get the number of files
len_files_month = len(files_month)
# Print
print("{} files to be opened:".format(len_files_month))
print("files_month", files_month)
# Read dataset URLs; `parallel` is the switch defined near the top of the script.
ds = xr.open_mfdataset(files_month, parallel=parallel)

# View metadata (function like ncdump -c). As a bare expression this only
# displays something in an interactive session/notebook.
ds