DRY logic to support files in arbitrary encoding

AliSoftware · AliSoftware · commit e6a05751f633 · 2022-10-14T12:57:22.000+02:00
Introduce `read_utf8_lines` as a wrapper around `File.readlines` that ensures that it: 1. Takes into account the encoding the file was saved in (by looking at its BOM if present) 2. Returns the lines as UTF-8 strings (after converting them to that encoding), especially to ensure that any subsequent matching using `RegExp.match?` doesn't throw an `Encoding::CompatibilityError` (as our `RegExp`s are UTF-8). This commit addresses the suggestion from #418 (comment)
diff --git a/lib/fastlane/plugin/wpmreleasetoolkit/helper/ios/ios_l10n_helper.rb b/lib/fastlane/plugin/wpmreleasetoolkit/helper/ios/ios_l10n_helper.rb
@@ -36,6 +36,28 @@ def self.strings_file_type(path:)
           end
         end
 
+        # Read all the lines of a file (just like `File.readlines` does),
+        # except that it also detects the encoding used by the file (using the BOM if present) when reading it,
+        # and then converts each line to UTF-8 before returning the resulting list of lines
+        #
+        # This is particularly useful if you need to then use a `RegExp` to match part of the lines you're iterating over,
+        # as the `RegExp` (which will typically be UTF-8) and the string you're matching with it have to use the same encoding
+        # (otherwise we would get an `Encoding::CompatibilityError`)
+        #
+        # @important If you are then using a `RegExp` to match the UTF-8 lines you iterate on,
+        # remember to use the `u` flag on it (`/…/u`) to make it UTF-8-aware too.
+        #
+        # @param [String] file The path to the file to read
+        # @return [Array<String>] the lines read from the file, each converted to the UTF-8 encoding
+        #
+        def self.read_utf8_lines(file)
+          # Be sure to guess file encoding using the Byte-Order-Mark, and fallback to UTF-8 if there's no BOM.
+          File.readlines(file, mode: 'rb:BOM|UTF-8').map do |line|
+            # Ensure the line is re-encoded to UTF-8 regardless of the encoding that was used in the input file
+            line.encode(Encoding::UTF_8)
+          end
+        end
+
         # Merge the content of multiple `.strings` files into a new `.strings` text file.
         #
         # @param [Hash<String, String>] paths The paths of the `.strings` files to merge together, associated with the prefix to prepend to each of their respective keys
@@ -68,11 +90,9 @@ def self.merge_strings(paths:, output_path:)
               all_keys_found += string_keys
 
               tmp_file.write("/* MARK: - #{File.basename(input_file)} */\n\n")
-              # Read line-by-line to reduce memory footprint during content copy; Be sure to guess file encoding using the Byte-Order-Mark.
-              File.readlines(input_file, mode: 'rb:BOM|UTF-8').each do |line|
+              # Read line-by-line to reduce memory footprint during content copy
+              read_utf8_lines(input_file).each do |line|
                 unless prefix.nil? || prefix.empty?
-                  # We need to ensure the line and RegExp are using the same encoding, so we transcode everything to UTF-8.
-                  line.encode!(Encoding::UTF_8)
                   # The `/u` modifier on the RegExps is to make them UTF-8
                   line.gsub!(/^(\s*")/u, "\\1#{prefix}") # Lines starting with a quote are considered to be start of a key; add prefix right after the quote
                   line.gsub!(/^(\s*)([A-Z0-9_]+)(\s*=\s*")/ui, "\\1\"#{prefix}\\2\"\\3") # Lines starting with an identifier followed by a '=' are considered to be an unquoted key (typical in InfoPlist.strings files for example)
diff --git a/lib/fastlane/plugin/wpmreleasetoolkit/helper/ios/ios_strings_file_validation_helper.rb b/lib/fastlane/plugin/wpmreleasetoolkit/helper/ios/ios_strings_file_validation_helper.rb
@@ -11,51 +11,51 @@ class StringsFileValidationHelper
 
         TRANSITIONS = {
           root: {
-            /\s/ => :root,
+            /\s/u => :root,
             '/' => :maybe_comment_start,
             '"' => :in_quoted_key
           },
           maybe_comment_start: {
             '/' => :in_line_comment,
-            /\*/ => :in_block_comment
+            /\*/u => :in_block_comment
           },
           in_line_comment: {
             "\n" => :root,
-            /./ => :in_line_comment
+            /./u => :in_line_comment
           },
           in_block_comment: {
             /\*/ => :maybe_block_comment_end,
-            /./m => :in_block_comment
+            /./mu => :in_block_comment
           },
           maybe_block_comment_end: {
             '/' => :root,
-            /./m => :in_block_comment
+            /./mu => :in_block_comment
           },
           in_quoted_key: {
             '"' => lambda do |state, _|
               state.found_key = state.buffer.string.dup
               state.buffer.string = ''
               :after_quoted_key_before_eq
             end,
-            /./ => lambda do |state, c|
+            /./u => lambda do |state, c|
               state.buffer.write(c)
               :in_quoted_key
             end
           },
           after_quoted_key_before_eq: {
-            /\s/ => :after_quoted_key_before_eq,
+            /\s/u => :after_quoted_key_before_eq,
             '=' => :after_quoted_key_and_eq
           },
           after_quoted_key_and_eq: {
-            /\s/ => :after_quoted_key_and_eq,
+            /\s/u => :after_quoted_key_and_eq,
             '"' => :in_quoted_value
           },
           in_quoted_value: {
             '"' => :after_quoted_value,
-            /./m => :in_quoted_value
+            /./mu => :in_quoted_value
           },
           after_quoted_value: {
-            /\s/ => :after_quoted_value,
+            /\s/u => :after_quoted_value,
             ';' => :root
           }
         }.freeze
@@ -70,8 +70,11 @@ def self.find_duplicated_keys(file:)
 
           state = State.new(context: :root, buffer: StringIO.new, in_escaped_ctx: false, found_key: nil)
 
-          File.readlines(file, mode: 'rb:BOM|UTF-8').each_with_index do |line, line_no|
-            line.encode('UTF-8').chars.each_with_index do |c, col_no|
+          # Using our `read_utf8_lines` helper instead of `File.readlines` ensures we can also read files that are
+          # encoded in UTF-16, yet process each of their lines as a UTF-8 string, so that `RegExp#match?` doesn't throw
+          # an `Encoding::CompatibilityError` exception. (Note how all our `RegExp`s in `TRANSITIONS` have the `u` flag)
+          Fastlane::Helper::Ios::L10nHelper.read_utf8_lines(file).each_with_index do |line, line_no|
+            line.chars.each_with_index do |c, col_no|
               # Handle escaped characters at a global level.
               # This is more straightforward than having to account for it in the `TRANSITIONS` table.
               if state.in_escaped_ctx || c == '\\'