Commit 3296962
Merge branch 'meteoswiss-dev' into metoffice
2 parents: 090183f + 072e5fd

15 files changed: +146 -20 lines

mwr_raw2l1/config/config_0-20000-0-06610_A.yaml

Lines changed: 4 additions & 0 deletions

@@ -19,6 +19,10 @@ station_latitude: 46.81
 station_longitude: 6.94
 station_altitude: 491.

+# liquid cloud check parameters
+liquid_cloud_check:
+  do_check: False
+  multiplying_factor: 0.075

 # instrument parameters (frequency-dependent parameters go are associated to channels sorted by increasing frequency)
 # ---------------------
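The added block is plain YAML and can be inspected on its own. A minimal sketch of reading it (assuming PyYAML is available; within the package the file is loaded through get_inst_config instead):

# Illustrative only: load the new liquid cloud check block and read its two parameters.
# Assumes PyYAML; mwr_raw2l1 itself reads the instrument config via get_inst_config.
import yaml

snippet = '''
liquid_cloud_check:
  do_check: False
  multiplying_factor: 0.075
'''

conf = yaml.safe_load(snippet)
lwcl = conf.get('liquid_cloud_check', {})
print(lwcl.get('do_check', False))     # False for this station by default
print(lwcl.get('multiplying_factor'))  # 0.075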

mwr_raw2l1/config/qc_config.yaml

Lines changed: 0 additions & 3 deletions

@@ -13,6 +13,3 @@ check_Tb_offset: False
 delta_azi_sun: 7 # minimum accepted absolute azimuth offset between instrument pointing and sun
 delta_ele_sun: 7 # minimum accepted absolute elevation offset between instrument pointing and sun
 Tb_threshold: [2.7, 330.0] # Threshold for min and max Tb
-
-
-# potential future extension for liquid_cloud_flag

mwr_raw2l1/main.py

Lines changed: 9 additions & 0 deletions

@@ -58,6 +58,15 @@ def run(inst_config_file, nc_format_config_file=None, qc_config_file=None, conca
     conf_inst = get_inst_config(inst_config_file)
     conf_nc = get_nc_format_config(nc_format_config_file)
     conf_qc = get_qc_config(qc_config_file)
+    try:
+        if conf_inst['lwcl_check'] and 'do_check' in conf_inst['lwcl_check']:
+            logger.info('Liquid cloud check activated for this instrument.')
+            conf_qc['lwcl_check'] = conf_inst['lwcl_check']['do_check']
+            conf_qc['lwcl_multiplying_factor'] = conf_inst['lwcl_check']['multiplying_factor']
+    except KeyError:
+        conf_qc['lwcl_check'] = False
+        conf_qc['lwcl_multiplying_factor'] = None
+        logger.info('No liquid cloud check configured in instrument config file.')

     reader = get_reader(conf_inst['reader'])
     meas_constructor = get_meas_constructor(conf_inst['meas_constructor'])
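A standalone toy run of the new branch (hypothetical conf_inst dicts, not the package's real config objects) shows the two outcomes:

# Hedged sketch of how conf_qc ends up when the instrument config does or does not carry the section.
def lwcl_to_qc(conf_inst, conf_qc):  # hypothetical helper, for illustration only
    try:
        if conf_inst['lwcl_check'] and 'do_check' in conf_inst['lwcl_check']:
            conf_qc['lwcl_check'] = conf_inst['lwcl_check']['do_check']
            conf_qc['lwcl_multiplying_factor'] = conf_inst['lwcl_check']['multiplying_factor']
    except KeyError:
        conf_qc['lwcl_check'] = False
        conf_qc['lwcl_multiplying_factor'] = None
    return conf_qc

print(lwcl_to_qc({'lwcl_check': {'do_check': True, 'multiplying_factor': 0.075}}, {}))
# {'lwcl_check': True, 'lwcl_multiplying_factor': 0.075}
print(lwcl_to_qc({}, {}))
# {'lwcl_check': False, 'lwcl_multiplying_factor': None}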

mwr_raw2l1/measurement/measurement.py

Lines changed: 6 additions & 1 deletion

@@ -5,7 +5,7 @@
 from mwr_raw2l1.log import logger
 from mwr_raw2l1.measurement.measurement_constructors import MeasurementConstructors
 from mwr_raw2l1.measurement.measurement_helpers import channels2receiver, get_receiver_vars, is_var_in_data
-from mwr_raw2l1.measurement.measurement_qc_helpers import check_rain, check_receiver_sanity, check_sun
+from mwr_raw2l1.measurement.measurement_qc_helpers import check_rain, check_receiver_sanity, check_sun, find_lwcl_from_mwr
 from mwr_raw2l1.utils.num_utils import setbit, timedelta2s, unsetbit


@@ -252,6 +252,11 @@ def apply_quality_control(self, conf_qc):
         qc_thresholds = qc_thresholds[:-1]
         self.data['qc_thresholds'] = qc_thresholds

+        # Compute the liquid cloud flag using MWRpy threshold method:
+        if conf_qc['lwcl_check']:
+            self.data = find_lwcl_from_mwr(self.data, multiplying_factor=conf_qc['lwcl_multiplying_factor'])
+            self.data['liquid_cloud_flag_status'] = xr.ones_like(self.data['liquid_cloud_flag'], dtype=np.int32)
+
     def _setbits_qc(self, bit_nb, channel, mask_fail, mask_applied=None):
         """set values for quality_flag and quality_flag status for executed checks"""
         if mask_applied is None:
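As the helper below is written, liquid_cloud_flag takes the values 0 (cloud-free), 1 (liquid cloud suspected) and 2 (not evaluated, i.e. off-zenith or NaN), while the status variable is simply set to 1 for every sample. A tiny sketch of that last step (toy array, not the real dataset):

import numpy as np
import xarray as xr

flag = xr.DataArray(np.array([0, 1, 2, 2], dtype=np.int32), dims='time', name='liquid_cloud_flag')
status = xr.ones_like(flag, dtype=np.int32)  # same dims and coords as the flag, filled with 1
print(status.values)  # [1 1 1 1]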

mwr_raw2l1/measurement/measurement_construct_helpers.py

Lines changed: 16 additions & 4 deletions

@@ -1,5 +1,6 @@
 import numpy as np
 import xarray as xr
+import pandas as pd

 from mwr_raw2l1.errors import DimensionError, MissingInputArgument, TimeMismatch
 from mwr_raw2l1.log import logger
@@ -109,7 +110,10 @@ def rpg_to_si(all_data):
         all_data['met']['windspeed'] = all_data['met']['windspeed'] / 3.6 # km/h -> m/s
     except KeyError: # KeyError will only occur if quantity not in data, what can well happen. Do nothing in this case
         pass
-
+    try:
+        all_data['irt']['IRT'] = all_data['irt']['IRT'] + 273.15 # °C -> K
+    except KeyError: # KeyError will only occur if quantity not in data, what can well happen. Do nothing in this case
+        pass
     return all_data


@@ -149,7 +153,7 @@ def make_dataset(data, dims, vars, vars_opt=None, multidim_vars=None, time_vecto
     if data is None or not data:
         if time_vector is None:
             raise MissingInputArgument('if data is empty or None the input argument time_vector must be specified')
-        data = {'time': time_vector} # start overwriting empty data variable
+        data = {'time': pd.to_datetime(time_vector)} # start overwriting empty data variable
         for dim in dims[1:]: # assume first dimension to be 'time'
             data[dim] = np.array([missing_val]) # other dimensions all one-element
     for var in all_vars:
@@ -174,7 +178,11 @@ def make_dataset(data, dims, vars, vars_opt=None, multidim_vars=None, time_vecto
             raise DimensionError(dims, var, nd)
         spec[var] = dict(dims=dims[0:nd], data=data[var])

-    return xr.Dataset.from_dict(spec)
+    ds = xr.Dataset.from_dict(spec)
+    # For some reason, this does not keep the formatting of the time coordinates so we overwrite it again
+    if not isinstance(ds['time'].data[0], np.datetime64):
+        ds['time'] = spec['time']['data'].values
+    return ds


 def to_single_dataset(data_dicts, *args, **kwargs):
@@ -221,6 +229,8 @@ def merge_aux_data(mwr_data, all_data, srcs_to_ignore=None):
         all_data[src] = all_data[src].rename(varname_map)

         # interp to same time grid (time grid from blb now stems from some interp) and merge into out
+        # Note that this does not do any extrapolation which leaves some values (e.g. IRT) to NaN
+        # in case of a file starting with a scan
         srcdat_interp = all_data[src].interp(time=out['time'], method='nearest') # nearest: flags stay integer
         out = out.merge(srcdat_interp, join='left')

@@ -264,7 +274,9 @@ def merge_brt_blb(all_data):
             logger.warning(
                 'Skipping {} of {} scanning observations due to identical timestamp with zenith obs for {}'.format(
                     len(duplicate_times), len(blb_ts.time), duplicate_times))
-        out = out.merge(blb_ts, join='outer', compat='override')
+        # remove duplicate times from BRT and merge
+        out = out.sel(time=~out.time.isin(duplicate_times))
+        out = out.merge(blb_ts, join='outer')
     else:
         out = scan_to_timeseries_from_aux(all_data['blb'], hkd=all_data['hkd'])
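The duplicate handling above removes the clashing timestamps from the BRT series before the scan series is merged with join='outer'. A toy sketch of that isin-based selection (dummy data, not the package's own datasets):

import numpy as np
import xarray as xr

time = np.array(['2024-01-01T00:00', '2024-01-01T00:01', '2024-01-01T00:02'], dtype='datetime64[ns]')
out = xr.Dataset({'Tb': ('time', [100.0, 101.0, 102.0])}, coords={'time': time})
duplicate_times = time[[1]]  # timestamps also present in the scan data

out = out.sel(time=~out.time.isin(duplicate_times))
print(out.time.values)  # the 00:01 sample is dropped, so the outer merge can no longer clash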

mwr_raw2l1/measurement/measurement_constructors.py

Lines changed: 4 additions & 1 deletion

@@ -60,7 +60,7 @@ def from_radiometrics(cls, readin_data, conf_inst=None):
         # dimensions and variable names for usage with make_dataset
         dims = {'mwr': ['time', 'frequency'],
                 'aux': ['time']}
-        vars = {'mwr': ['Tb', 'ele', 'azi', 'quality'],
+        vars = {'mwr': ['Tb', 'ele', 'azi', 'quality', 'T_amb'],
                 'aux': ['IRT', 'p', 'T', 'RH', 'rainflag', 'quality']}
         vars_opt = {'mwr': [],
                     'aux': []}
@@ -72,6 +72,9 @@ def from_radiometrics(cls, readin_data, conf_inst=None):
         all_data['mwr']['scanflag'] = ('time', flags_here)
         data = merge_aux_data(all_data['mwr'], all_data)

+        # adapt the dimensions for the T_amb variable as only 1 temperature is given for 2 receivers
+        data['T_amb'] = data['T_amb'].expand_dims(dim={'receiver_nb':2}, axis=1)
+
         data['mfr'] = 'radiometrics' # manufacturer (lowercase)

         return cls(data, conf_inst)
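Radiometrics files report a single ambient-load temperature while two receivers are expected, so expand_dims repeats it along a new receiver_nb axis. A minimal sketch with dummy values:

import numpy as np
import xarray as xr

T_amb = xr.DataArray(np.array([293.1, 293.2, 293.4]), dims='time')  # one value per time step, in K
T_amb_2rec = T_amb.expand_dims(dim={'receiver_nb': 2}, axis=1)

print(T_amb_2rec.shape)  # (3, 2): the same temperature is repeated for both receivers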

mwr_raw2l1/measurement/measurement_qc_helpers.py

Lines changed: 76 additions & 0 deletions

@@ -1,5 +1,6 @@
 import ephem
 import numpy as np
+import xarray as xr

 from mwr_raw2l1.errors import UnknownManufacturer
 from mwr_raw2l1.log import logger
@@ -26,6 +27,11 @@ def check_receiver_sanity(data, channel):
     masks_and_checks = [] # collect all output tuples from flag_check here
     masks_and_checks.append(flag_check(data, 'channel_quality_ok', 0, channel))
     masks_and_checks.append(flag_check(data, 'alarm', 1, channel=None))
+    masks_and_checks.append(flag_check(data, 'noisediode_ok_hum', 0, channel=None))
+    masks_and_checks.append(flag_check(data, 'noisediode_ok_temp', 0, channel=None))
+    masks_and_checks.append(flag_check(data, 'Tstab_ok_hum', 0, channel=None))
+    masks_and_checks.append(flag_check(data, 'Tstab_ok_temp', 0, channel=None))
+    masks_and_checks.append(flag_check(data, 'Tstab_ok_amb', 0, channel=None))
     # TODO: could add checks for noisediode_ok_hum, noisediode_ok_temp, Tstab_ok_hum, Tstab_ok_temp, Tstab_ok_amb
     check_applied_all = [m[1] for m in masks_and_checks]
     if any(check_applied_all):
@@ -164,3 +170,73 @@ def flag_check(data, varname, value, channel=None):
     else:
         logger.info("Cannot apply check for '{}' during quality control as variable does not exist".format(varname))
         return None, False
+
+def find_lwcl_from_mwr(data, multiplying_factor=0.075):
+    """
+    This is a copy of the MWRpy function to find liquid water cloud free periods using 31.4 GHz TB variability.
+    It uses water vapor channel as proxy for a humidity dependent threshold.
+
+    For now, it works only for HATPRO instrument as this includes some empirically derived parameters.
+
+    Refactored to work directly with xarray data instead of dict
+
+    Args:
+        data: dataset, commonly Measurement.data
+        multiplying_factor: factor to multiply the threshold with, empirically derived
+
+    Returns:
+        dataset with liquid cloud flag set
+    """
+    # Different frequencies for window and water vapor channels depending on instrument type
+    freq_win = np.where(
+        (np.isclose(data["frequency"].values, 31.4, atol=2))
+        | (np.isclose(data["frequency"].values, 190.8, atol=1))
+    )[0]
+    freq_win = np.array([freq_win[0]]) if len(freq_win) > 1 else freq_win
+    freq_wv = np.where(
+        (np.isclose(np.round(data["frequency"][:], 1), 22.2))
+        | (np.isclose(np.round(data["frequency"][:], 1), 183.9))
+    )[0]
+
+    if len(freq_win) == 1 and len(freq_wv) == 1:
+        tb = data["Tb"].isel(frequency=freq_win)
+        tb = tb.squeeze(dim='frequency', drop=True)
+        tb_zenith = tb.where(data["scanflag"] == 0, drop=True).where((data["ele"] > 89.0) & (data["ele"] < 91.0), drop=True)
+        mean_diff_t = np.nanmean(tb.time.diff(dim='time').dt.seconds)
+        number_of_samples = 180/mean_diff_t.round() if mean_diff_t < 1.8 else 180/mean_diff_t.round()
+        # tb_std = tb_df.rolling(
+        #     pd.tseries.frequencies.to_offset(offset), center=True, min_periods=50
+        # ).std()
+        tb_std = tb_zenith.rolling(
+            time=int(number_of_samples), center=True
+        ).std()
+        number_of_samples = 600/mean_diff_t.round() if mean_diff_t < 1.8 else 600/mean_diff_t.round()
+        # tb_mx = tb_std.rolling(
+        #     pd.tseries.frequencies.to_offset(offset), center=True, min_periods=100
+        # ).max()
+        tb_mx = tb_std.rolling(
+            time=int(number_of_samples), center=True
+        ).max()
+        # tb_wv = np.squeeze(ds["tb"][:, freq_wv])
+        tb_wv = data["Tb"].isel(frequency=freq_wv)
+        tb_wv = tb_wv.squeeze(dim='frequency', drop=True)
+        # In order to compute the ratio, we need to get rid of the frequency coordinates
+
+        tb_rat = tb_wv / tb
+        tb_rat = tb_rat.rolling(
+            time=int(number_of_samples), center=True
+        ).max()
+
+        threshold_rat = tb_rat * multiplying_factor
+
+
+        data['liquid_cloud_flag'] = xr.where(
+            tb_mx < threshold_rat,
+            0,
+            1
+        )
+        data['liquid_cloud_flag'] = xr.where((data["ele"] > 89.0) & (data["ele"] < 91.0), data['liquid_cloud_flag'], 2)
+        # also fill nans with 2
+        data['liquid_cloud_flag'] = data['liquid_cloud_flag'].fillna(2)
+
+    return data
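A standalone sketch of calling the new helper on a synthetic zenith-only dataset (dummy values and dimensions; in the package the call is made from Measurement.apply_quality_control):

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range('2024-01-01', periods=1000, freq='2s')
freq = np.array([22.24, 31.4])  # one water vapour and one window channel, as for a HATPRO
tb = 100 + np.random.randn(len(time), len(freq))  # synthetic brightness temperatures in K

data = xr.Dataset(
    {
        'Tb': (('time', 'frequency'), tb),
        'ele': ('time', np.full(len(time), 90.0)),   # zenith pointing only
        'scanflag': ('time', np.zeros(len(time))),   # 0 = no scan running
    },
    coords={'time': time, 'frequency': freq},
)

data = find_lwcl_from_mwr(data, multiplying_factor=0.075)
print(data['liquid_cloud_flag'].values[:10])  # 0 cloud-free, 1 liquid cloud suspected, 2 not evaluated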

mwr_raw2l1/measurement/scan_transform.py

Lines changed: 20 additions & 5 deletions

@@ -9,7 +9,7 @@
 from mwr_raw2l1.utils.num_utils import timedelta2s


-def scan_endtime_to_time(endtime, n_angles, time_per_angle=17):
+def scan_endtime_to_time(endtime, n_angles, time_per_angle=11, from_starttime=False):
     """
     RPG and Attex scan files only have one timestamp per scan. This function returns the approximate timestamp for the
     observation at each angle
@@ -20,6 +20,8 @@ def scan_endtime_to_time(endtime, n_angles, time_per_angle=17):
         n_angles: number of angles per scan.
         time_per_angle: total time for scanning one angle incl. integration time and the time for moving the mirror.
             Indicated in seconds. The default is 17.
+        from_starttime: if True, the timestamps will be calculated assuming the provided time is the start time of
+            the scan, otherwise from the end time. This arise from the change in timestamping operated in HATPRO instruments (TBC)

     Returns:
         time : :class:`numpy.ndarray` of :class:`datetime.datetime` objects of end times for each observed angle
@@ -33,11 +35,18 @@ def timedelta_method(seconds):
         # use ms as timedelta needs int. Will truncate to ms what should also avoid rounding errors in tests
         return np.timedelta64(int(seconds*1000), 'ms')

-    delta = [timedelta_method(n * time_per_angle) for n in reversed(range(n_angles))]
+    if from_starttime:
+        delta = [timedelta_method(n * time_per_angle) for n in range(n_angles)]
+    else:
+        delta = [timedelta_method(n * time_per_angle) for n in reversed(range(n_angles))]
     delta = np.array(delta)

     endtime = endtime.reshape(len(endtime), 1) # for letting numpy broadcast along dimension 1
-    time = endtime - delta # calculate time for each scan position (matrix)
+    if from_starttime:
+        time = endtime + delta # calculate time for each scan position (matrix)
+    else:
+        time = endtime - delta
+
     time = time.reshape((-1,)) # make one-dimenional vector out of time matrix

     return time
@@ -61,6 +70,7 @@ def scantime_from_aux(blb, hkd=None, brt=None):
     n_ele = len(blb['scan_ele'].values)

     endtime2time_params = dict(endtime=time_scan, n_angles=n_ele)
+    endtime2time_params['from_starttime'] = True # default assume time in blb is start time of scan
     if hkd is not None and 'BLscan_active' in hkd:
         time_scan_active = hkd.time[hkd.BLscan_active.values == 1].values
         time_zen_active = hkd.time[hkd.BLscan_active.values == 0].values
@@ -71,11 +81,16 @@ def scantime_from_aux(blb, hkd=None, brt=None):
             scan_duration = timedelta2s(time_last_scan_active[-1] - time_last_scan_active[0])
             endtime2time_params['time_per_angle'] = scan_duration / n_ele
         elif time_scan_active[0] > time_zen_active[0]: # sure to have full scan at beginning of hkd
-            scan_duration = timedelta2s(blb.time[0].values - time_scan_active[0])
+            scan_duration = timedelta2s(time_scan_active[-1] - time_scan_active[0])
             endtime2time_params['time_per_angle'] = scan_duration / n_ele
         else:
             logger.warning(
                 'Cannot infer scan duration as first scan might extend to previous period. Using default values')
+
+        if np.abs(timedelta2s(time_scan_active[0] - time_scan[0])) > 20 or np.abs(timedelta2s(time_scan_active[-1] - time_scan[-1])) < 20:
+            endtime2time_params['from_starttime'] = False
+            logger.info('Assuming that the time in BLB file is the endtime of the scan: TBC')
+
     elif brt is not None:
         # less accurate than hkd because things happen before scan starts (e.g. ambload obs).
         # Assume after last hkd measure it takes 2x time_per_angle before first scanobs ends.
@@ -86,7 +101,7 @@ def scantime_from_aux(blb, hkd=None, brt=None):
     else:
         logger.warning(
             'Cannot infer scan duration as first scan might extend to previous period. Using default values')
-
+
     return scan_endtime_to_time(**endtime2time_params)
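A worked example of the new from_starttime switch, using the same numpy timedelta arithmetic as scan_endtime_to_time (toy timestamp, 3 angles, 11 s per angle):

import numpy as np

stamp = np.array([np.datetime64('2024-01-01T12:00:00')])  # one timestamp per scan (toy value)
n_angles, time_per_angle = 3, 11

delta_end = np.array([np.timedelta64(n * time_per_angle, 's') for n in reversed(range(n_angles))])
delta_start = np.array([np.timedelta64(n * time_per_angle, 's') for n in range(n_angles)])

print((stamp.reshape(-1, 1) - delta_end).reshape(-1))
# stamp read as scan end time:   11:59:38, 11:59:49, 12:00:00
print((stamp.reshape(-1, 1) + delta_start).reshape(-1))
# stamp read as scan start time: 12:00:00, 12:00:11, 12:00:22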

mwr_raw2l1/readers/reader_radiometrics_helpers.py

Lines changed: 8 additions & 4 deletions

@@ -3,6 +3,7 @@
 from mwr_raw2l1.errors import CorruptRectype, EmptyLineError
 from mwr_raw2l1.readers.reader_helpers import get_time, simplify_header

+from mwr_raw2l1.log import logger

 def get_data(data_raw, header, no_mwr=False, **kwargs):
     """extract all known data from data_raw using header
@@ -17,13 +18,16 @@ def get_data(data_raw, header, no_mwr=False, **kwargs):
     """
     data = get_simple_vars(data_raw, header)
     try:
-        data['time'] = get_time(data_raw, header, 'date/time', '%m/%d/%y %H:%M:%S')
-    except ValueError:
         # Radiometrics changed its timestamps format with upgrade to VizMetPro.
         # The new format is '%Y/%m/%d %H:%M:%S' instead of '%m/%d/%y %H:%M:%S'.
-        # This is a workaround to support both formats but a better solution would be to
-        # add this pattern to the config file.
+        # TODO: This is a workaround to support both formats but a better solution would be to
+        # add this pattern to the config file (or get rid of older formats but we can not exclude
+        # that we integrate an old instrument once).
         data['time'] = get_time(data_raw, header, 'date/time', '%Y/%m/%d %H:%M:%S')
+    except ValueError:
+        logger.warning('Failed to parse time with new timestamp format %Y/%m/%d %H:%M:%S, trying with old version %m/%d/%y %H:%M:%S')
+        data['time'] = get_time(data_raw, header, 'date/time', '%m/%d/%y %H:%M:%S')
+
     if not no_mwr:
         data['Tb'], data['frequency'] = get_mwr(data_raw, header, **kwargs)
     return data
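The reordered try/except now parses the new VizMetPro format first and only falls back to the old pattern on a ValueError. A minimal sketch with datetime.strptime standing in for the package's get_time helper:

from datetime import datetime

def parse_radiometrics_time(s):  # hypothetical stand-in, for illustration only
    try:
        return datetime.strptime(s, '%Y/%m/%d %H:%M:%S')  # new VizMetPro format
    except ValueError:
        return datetime.strptime(s, '%m/%d/%y %H:%M:%S')  # pre-VizMetPro format kept as fallback

print(parse_radiometrics_time('2024/01/31 12:00:05'))  # new-style timestamp
print(parse_radiometrics_time('01/31/24 12:00:05'))    # old-style timestamp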

mwr_raw2l1/readers/reader_rpg_helpers.py

Lines changed: 3 additions & 2 deletions

@@ -4,6 +4,7 @@
 import datetime as dt

 import numpy as np
+import pandas as pd

 from mwr_raw2l1.errors import UnknownFlagValue, WrongInputFormat

@@ -18,8 +19,8 @@ def interpret_time(time_in):
         time_in = np.array([time_in])
         scalar_input = True

-    times = [dt.datetime.utcfromtimestamp(x + posix_offset) for x in time_in]
-    out = np.array(times)
+    times = [dt.datetime.fromtimestamp(x + posix_offset, dt.timezone.utc) for x in time_in]
+    out = pd.to_datetime(times)

     if scalar_input:
         out = out[0]
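dt.datetime.utcfromtimestamp() is deprecated since Python 3.12; the timezone-aware replacement keeps the values in UTC, and pd.to_datetime turns the list into a DatetimeIndex instead of an object array. A small sketch with dummy epochs (the reader adds posix_offset to the raw RPG times before this step):

import datetime as dt
import pandas as pd

raw = [1704067200.0, 1704067205.0]  # two sample POSIX times, 5 s apart

times = [dt.datetime.fromtimestamp(x, dt.timezone.utc) for x in raw]
out = pd.to_datetime(times)

print(out)     # DatetimeIndex(['2024-01-01 00:00:00+00:00', '2024-01-01 00:00:05+00:00'], dtype='datetime64[ns, UTC]', freq=None)
print(out.tz)  # UTC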
