Source code for tools.combine_memberfiles

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 5 2023.

This module provides a function to combine PRO simulation output files from different members into one single file.

Authors:
  - Jari-Pekka Nousu

"""

import glob
import re
import os

from netCDF4 import Dataset


def _find_member_files(parentfolder):
    '''
    Locate one PRO netCDF file for every member folder below ``parentfolder``.

    Entries matching ``mb*`` that have no ``pro`` sub-directory are ignored: this is
    the case of the combined output folder (e.g. ``mb0001_mb0005``) written by a
    previous run of :func:`combine_memberfiles`, which matches the same pattern.

    :param parentfolder: Path to the folder hosting the member folders.
    :type parentfolder: str
    :returns: Two lists of equal length, in sorted member-folder order: the member
              folders and the netCDF file selected for each of them.
    :rtype: tuple(list, list)
    :raises FileNotFoundError: if a member folder has a ``pro`` sub-directory without
                               any netCDF file, or if no member file is found at all.
    '''
    folders = []
    files = []
    # glob.escape: the folder names are literal paths, not patterns
    pattern = os.path.join(glob.escape(parentfolder), 'mb*')
    # sorting to have members in order
    for folder in sorted(glob.glob(pattern)):
        if not os.path.isdir(os.path.join(folder, 'pro')):
            # not a member folder (typically the output folder of a previous run)
            continue
        # TODO: Optional input: name of nc file to select ?
        # currently only one netcdf file per member can be handled: if several are
        # present, the first one in alphabetical order is used.
        candidates = sorted(glob.glob(os.path.join(glob.escape(folder), 'pro', '*.nc')))
        if not candidates:
            raise FileNotFoundError(f'No netCDF file found in {os.path.join(folder, "pro")}')
        folders.append(folder)
        files.append(candidates[0])

    if not files:
        raise FileNotFoundError(f'No member output file (mb*/pro/*.nc) found in {parentfolder}')
    return folders, files


def combine_memberfiles(parentfolder, keep_open=False):
    '''
    Combining ensemble of SURFEX outputs into a single outputfile
    with additional dimension for ensemble members

    - creates output folder e.g. 'mb0001_mb0005'
    - adds new profile file

    Notes:

    - currently ensemble dimensions added for all variables (except ``time``),
      including constant outputs..
    - variables without any dimension are created in the output file but not filled
    - couple of work arounds to avoid AttributeErrors, fix better later
    - only tested for outputs for one point

    :param parentfolder: Path to the folder hosting output folders for each member.
                         The actual simulation output file should be in
                         ``parentfolder/mbXXXX/pro/XXXX.nc``.
    :type parentfolder: str
    :param keep_open: If true, the output dataset is returned open instead of being closed.
    :type keep_open: bool
    :returns: None or the output netCDF4 dataset when keep_open is set to True.
    :rtype: None or netCDF4.Dataset
    :raises FileNotFoundError: if no member output file can be found.
    '''
    # member folders and their pro files, in the same (sorted) order
    folders, files = _find_member_files(parentfolder)

    # defining outputfilename according to first and last member
    outname = os.path.basename(files[0])
    firstmember = os.path.basename(folders[0])
    lastmember = os.path.basename(folders[-1])
    outputfolder = os.path.join(parentfolder, f'{firstmember}_{lastmember}')
    outfilefull = os.path.join(outputfolder, outname)

    # opening first file to derive information on dimensions, variables etc.
    with Dataset(files[0], 'r') as data:
        # new repo for output if does not exist
        os.makedirs(outputfolder, exist_ok=True)

        # creating the new output netcdf file
        new_output = Dataset(outfilefull, 'w', format='NETCDF4_CLASSIC')
        try:
            # dimensions as in the example file
            for dimname, dimension in data.dimensions.items():
                new_output.createDimension(dimname, dimension.size)
            # plus additional dimension for ensemble members
            new_output.createDimension('members', len(files))

            list_of_variables = list(data.variables.keys())

            # creating variables for netcdf file
            for variable in list_of_variables:
                source = data[variable]
                if variable == 'time':
                    dimensions = source.dimensions
                else:
                    dimensions = source.dimensions + ('members',)
                # going around AttributeError for variables without _FillValue,
                # at least 'Projection_Type'... fix this better later
                fill_value = getattr(source, '_FillValue', -2147483647)
                output_var = new_output.createVariable(variable, source.dtype, dimensions,
                                                       fill_value=fill_value)
                units = getattr(source, 'units', None)
                if units is not None:
                    output_var.units = units
                output_var.long_name = getattr(source, 'long_name', '')

            # looping over the files to read data into right member dimension
            for member, file in enumerate(files):
                print('Filling ensemble member:', member)
                with Dataset(file, 'r') as member_data:
                    for key in list_of_variables:
                        values = member_data[key]
                        if not values.dimensions:
                            # variables without dimension are left unfilled
                            continue
                        if key == 'time':
                            # the only variable without 'members' dimension
                            new_output[key][:] = values[:]
                        else:
                            # every other variable has 'members' as last dimension,
                            # whatever its number of dimensions in the member file
                            new_output[key][..., member] = values[...]
        except BaseException:
            # do not leave a half-written file open when something goes wrong
            new_output.close()
            raise

    if keep_open:
        return new_output
    new_output.close()
    return None