add unicode roundtrip for FIF (mne-tools#12080)

drammock · web-flow · commit 23fa43c3df27 · 2023-10-06T02:32:02.000-04:00
diff --git a/doc/changes/devel.rst b/doc/changes/devel.rst
@@ -65,6 +65,7 @@ Bugs
 - Fix parsing of eye-link :class:`~mne.Annotations` when ``apply_offsets=False`` is provided to :func:`~mne.io.read_raw_eyelink` (:gh:`12003` by `Mathieu Scheltienne`_)
 - Correctly prune channel-specific :class:`~mne.Annotations` when creating :class:`~mne.Epochs` without the channel(s) included in the channel specific annotations (:gh:`12010` by `Mathieu Scheltienne`_)
 - Fix :func:`~mne.viz.plot_volume_source_estimates` with :class:`~mne.VolSourceEstimate` which include a list of vertices (:gh:`12025` by `Mathieu Scheltienne`_)
+- Add support for non-ASCII characters in Annotations, Evoked comments, etc. when saving to FIFF format (:gh:`12080` by `Daniel McCloy`_)
 - Correctly handle passing ``"eyegaze"`` or ``"pupil"`` to :meth:`mne.io.Raw.pick` (:gh:`12019` by `Scott Huberty`_)
 
 API changes
diff --git a/mne/_fiff/tag.py b/mne/_fiff/tag.py
@@ -4,7 +4,9 @@
 # License: BSD-3-Clause
 
 from functools import partial
+import html
 import struct
+import re
 
 import numpy as np
 from scipy.sparse import csc_matrix, csr_matrix
@@ -265,7 +267,13 @@ def _read_string(fid, tag, shape, rlims):
     """Read a string tag."""
     # Always decode to ISO 8859-1 / latin1 (FIFF standard).
     d = _frombuffer_rows(fid, tag.size, dtype=">c", shape=shape, rlims=rlims)
-    return str(d.tobytes().decode("latin1", "ignore"))
+    string = str(d.tobytes().decode("latin1", "ignore"))
+    # write_string() stores characters outside latin1 as *decimal* XML character
+    # references ("&#937;", "&#128579;") whose digit count varies with the code
+    # point, so match any run of decimal digits and convert only those
+    # references, leaving the rest of the text untouched.
+    string = re.sub(r"&#[0-9]+;", lambda ref: html.unescape(ref.group()), string)
+    return string
 
 
 def _read_complex_float(fid, tag, shape, rlims):
diff --git a/mne/_fiff/write.py b/mne/_fiff/write.py
@@ -128,7 +128,10 @@ def write_julian(fid, kind, data):
 
 def write_string(fid, kind, data):
     """Write a string tag."""
-    str_data = str(data).encode("latin1")
+    try:
+        str_data = str(data).encode("latin1")
+    except UnicodeEncodeError:
+        str_data = str(data).encode("latin1", errors="xmlcharrefreplace")
     data_size = len(str_data)  # therefore compute size here
     my_dtype = ">a"  # py2/3 compatible on writing -- don't ask me why
     if data_size > 0:
diff --git a/mne/tests/test_evoked.py b/mne/tests/test_evoked.py
@@ -263,6 +263,12 @@ def test_io_evoked(tmp_path):
     ave_complex = read_evokeds(fname_temp)[0]
     assert_allclose(ave.data, ave_complex.data.imag)
 
+    # test non-ascii comments (gh 11684)
+    aves1[0].comment = "🙃"
+    write_evokeds(tmp_path / "evoked-ave.fif", aves1, overwrite=True)
+    aves1_read = read_evokeds(tmp_path / "evoked-ave.fif")[0]
+    assert aves1_read.comment == aves1[0].comment
+
     # test warnings on bad filenames
     fname2 = tmp_path / "test-bad-name.fif"
     with pytest.warns(RuntimeWarning, match="-ave.fif"):