Skip to content

Lexer returns invalid UTF-8 token value for valid UTF-8 source #4130

@zzak

Description

@zzak

We have this markup in a slim template:

| %)

After switching TargetRubyVersion from 3.3 to 4.0, we get this exception when running RuboCop.

full stacktrace:
invalid byte sequence in UTF-8
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/prism-1.9.0/lib/prism/translation/parser/lexer.rb:451:in `[]'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/prism-1.9.0/lib/prism/translation/parser/lexer.rb:451:in `block in to_a'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/prism-1.9.0/lib/prism/translation/parser/lexer.rb:449:in `each'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/prism-1.9.0/lib/prism/translation/parser/lexer.rb:449:in `with_index'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/prism-1.9.0/lib/prism/translation/parser/lexer.rb:449:in `to_a'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/prism-1.9.0/lib/prism/translation/parser.rb:326:in `build_tokens'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/prism-1.9.0/lib/prism/translation/parser.rb:147:in `tokenize'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-ast-1.49.1/lib/rubocop/ast/processed_source.rb:245:in `tokenize'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-ast-1.49.1/lib/rubocop/ast/processed_source.rb:240:in `parse'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-ast-1.49.1/lib/rubocop/ast/processed_source.rb:66:in `initialize'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-ast-1.49.1/lib/rubocop/ast/processed_source.rb:46:in `new'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-ast-1.49.1/lib/rubocop/ast/processed_source.rb:46:in `from_file'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/runner.rb:511:in `get_processed_source'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/runner.rb:281:in `do_inspection_loop'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/runner.rb:171:in `block in file_offenses'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/runner.rb:196:in `file_offense_cache'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/runner.rb:170:in `file_offenses'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/runner.rb:103:in `block in warm_cache'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/parallel-1.27.0/lib/parallel.rb:650:in `call_with_index'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/parallel-1.27.0/lib/parallel.rb:620:in `process_incoming_jobs'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/parallel-1.27.0/lib/parallel.rb:600:in `block in worker'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/parallel-1.27.0/lib/parallel.rb:591:in `fork'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/parallel-1.27.0/lib/parallel.rb:591:in `worker'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/parallel-1.27.0/lib/parallel.rb:582:in `block in create_workers'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/parallel-1.27.0/lib/parallel.rb:581:in `each'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/parallel-1.27.0/lib/parallel.rb:581:in `each_with_index'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/parallel-1.27.0/lib/parallel.rb:581:in `create_workers'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/parallel-1.27.0/lib/parallel.rb:520:in `work_in_processes'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/parallel-1.27.0/lib/parallel.rb:291:in `map'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/parallel-1.27.0/lib/parallel.rb:235:in `each'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/runner.rb:103:in `warm_cache'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/runner.rb:76:in `run'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/cli/command/execute_runner.rb:26:in `block in execute_runner'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/cli/command/execute_runner.rb:52:in `with_redirect'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/cli/command/execute_runner.rb:25:in `execute_runner'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/cli/command/execute_runner.rb:17:in `run'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/cli/command.rb:11:in `run'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/cli/environment.rb:18:in `run'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/cli.rb:130:in `run_command'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/cli.rb:137:in `execute_runners'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/cli.rb:54:in `block in run'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/cli.rb:89:in `profile_if_needed'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/lib/rubocop/cli.rb:45:in `run'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/gems/rubocop-1.86.0/exe/rubocop:15:in `<top (required)>'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/bin/rubocop:25:in `load'
/home/runner/work/project/project/vendor/bundle/ruby/3.3.0/bin/rubocop:25:in `<top (required)>'
/opt/hostedtoolcache/Ruby/3.3.4/x64/lib/ruby/gems/3.3.0/gems/bundler-2.5.11/lib/bundler/cli/exec.rb:58:in `load'
/opt/hostedtoolcache/Ruby/3.3.4/x64/lib/ruby/gems/3.3.0/gems/bundler-2.5.11/lib/bundler/cli/exec.rb:58:in `kernel_load'
/opt/hostedtoolcache/Ruby/3.3.4/x64/lib/ruby/gems/3.3.0/gems/bundler-2.5.11/lib/bundler/cli/exec.rb:23:in `run'
/opt/hostedtoolcache/Ruby/3.3.4/x64/lib/ruby/gems/3.3.0/gems/bundler-2.5.11/lib/bundler/cli.rb:455:in `exec'
/opt/hostedtoolcache/Ruby/3.3.4/x64/lib/ruby/gems/3.3.0/gems/bundler-2.5.11/lib/bundler/vendor/thor/lib/thor/command.rb:28:in `run'
/opt/hostedtoolcache/Ruby/3.3.4/x64/lib/ruby/gems/3.3.0/gems/bundler-2.5.11/lib/bundler/vendor/thor/lib/thor/invocation.rb:127:in `invoke_command'
/opt/hostedtoolcache/Ruby/3.3.4/x64/lib/ruby/gems/3.3.0/gems/bundler-2.5.11/lib/bundler/vendor/thor/lib/thor.rb:527:in `dispatch'
/opt/hostedtoolcache/Ruby/3.3.4/x64/lib/ruby/gems/3.3.0/gems/bundler-2.5.11/lib/bundler/cli.rb:35:in `dispatch'
/opt/hostedtoolcache/Ruby/3.3.4/x64/lib/ruby/gems/3.3.0/gems/bundler-2.5.11/lib/bundler/vendor/thor/lib/thor/base.rb:584:in `start'
/opt/hostedtoolcache/Ruby/3.3.4/x64/lib/ruby/gems/3.3.0/gems/bundler-2.5.11/lib/bundler/cli.rb:29:in `start'
/opt/hostedtoolcache/Ruby/3.3.4/x64/lib/ruby/gems/3.3.0/gems/bundler-2.5.11/exe/bundle:28:in `block in <top (required)>'
/opt/hostedtoolcache/Ruby/3.3.4/x64/lib/ruby/gems/3.3.0/gems/bundler-2.5.11/lib/bundler/friendly_errors.rb:117:in `with_friendly_errors'
/opt/hostedtoolcache/Ruby/3.3.4/x64/lib/ruby/gems/3.3.0/gems/bundler-2.5.11/exe/bundle:20:in `<top (required)>'
/opt/hostedtoolcache/Ruby/3.3.4/x64/bin/bundle:25:in `load'
/opt/hostedtoolcache/Ruby/3.3.4/x64/bin/bundle:25:in `<main>'

I guess we were still using the parser gem before changing the target version.

https://docs.rubocop.org/rubocop/latest/compatibility.html#parser-engines

Since RuboCop 1.75, parser_prism is used by default when TargetRubyVersion is 3.4 or higher.

I've tried to make a repro of it; this is the one I feel most confident about. I also tried smaller scripts using just Prism, but I figure folks here know what I mean.

require "rubocop"

source = "p\n  | %)\n"

[[3.3, :default], [4.0, :default]].each do |ruby_version, engine|
  begin
    processed = RuboCop::ProcessedSource.new(source, ruby_version,
    "bug.slim", parser_engine: engine)
    p [ruby_version, processed.parser_engine, :ok, processed.valid_syntax?,
    processed.tokens.size]
  rescue => e
    p [ruby_version, :raised, e.class, e.message]
  end
end
$ ruby /tmp/bugg.rb 
[3.3, :parser_whitequark, :ok, false, 3]
[4.0, :raised, ArgumentError, "invalid byte sequence in UTF-8"]

I also spent some tokens on a patch, but I have very little confidence in it, so take that for what you will:

diff --git a/src/prism.c b/src/prism.c
index a2e04ed10..ddfac2a5d 100644
--- a/src/prism.c
+++ b/src/prism.c
@@ -190,6 +190,15 @@ lex_mode_terminator(const uint8_t start) {
     }
 }

+/**
+ * Returns true if the delimiter can be added to the byte-oriented breakpoint
+ * list used by pm_strpbrk.
+ */
+static PRISM_INLINE bool
+lex_mode_byte_delimiter_p(const uint8_t delimiter) {
+    return delimiter != '\0' && delimiter < 0x80;
+}
+
 /**
  * Push a new lex state onto the stack. If we're still within the pre-allocated
  * space of the lex state stack, then we'll just use a new slot. Otherwise we'll
@@ -240,7 +249,7 @@ lex_mode_push_list(pm_parser_t *parser, bool interpolation, uint8_t delimiter) {

     // Now we'll add the terminator to the list of breakpoints. If the
     // terminator is not already a NULL byte, add it to the list.
-    if (terminator != '\0') {
+    if (lex_mode_byte_delimiter_p(terminator)) {
         breakpoints[index++] = terminator;
     }

@@ -292,7 +301,7 @@ lex_mode_push_regexp(pm_parser_t *parser, uint8_t incrementor, uint8_t terminato
     size_t index = 4;

     // First we'll add the terminator.
-    if (terminator != '\0') {
+    if (lex_mode_byte_delimiter_p(terminator)) {
         breakpoints[index++] = terminator;
     }

@@ -330,7 +339,7 @@ lex_mode_push_string(pm_parser_t *parser, bool interpolation, bool label_allowed

     // Now add in the terminator. If the terminator is not already a NULL byte,
     // then we'll add it.
-    if (terminator != '\0') {
+    if (lex_mode_byte_delimiter_p(terminator)) {
         breakpoints[index++] = terminator;
     }

@@ -9956,7 +9965,19 @@ pm_lex_percent_delimiter(pm_parser_t *parser) {
         return delimiter;
     }

-    return *parser->current.end++;
+    uint8_t delimiter = *parser->current.end;
+
+    if (delimiter >= 0x80) {
+        size_t width = parser->encoding_changed
+            ? parser->encoding->char_width(parser->current.end, parser->end - parser->current.end)
+            : pm_encoding_utf_8_char_width(parser->current.end, parser->end - parser->current.end);
+
+        parser->current.end += (width == 0 ? 1 : width);
+    } else {
+        parser->current.end++;
+    }
+
+    return delimiter;
 }

But with the patch the result is much better:

$ ruby -Ilib /tmp/bugg.rb
[3.3, :parser_whitequark, :ok, false, 3]
[4.0, :parser_prism, :ok, false, 5]

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions