also guard against invalid UTF-8 in input

2026-06-14 03:30:14 +00:00 · 2026-04-23 12:56:26 +02:00
parent 4c00114a63
commit 3368ae22ab
4 changed files with 39 additions and 5 deletions
@@ -46,10 +46,16 @@ module Import
      )

      def initialize(text)
-        @text = text.dup
+        # Normalize any input into a mutable string with a valid encoding: nil
+        # becomes "", and invalid byte sequences are dropped (no transcoding) so
+        # downstream regex/StringScanner operations cannot raise ArgumentError.
+        @text = text.to_s.dup
+        @text.scrub!("") unless @text.valid_encoding?
      end

      def parse
+        return N::Document.new(children: []) if @text.blank?
+
        preprocess
        blocks = parse_blocks
        N::Document.new(children: blocks)
@@ -35,8 +35,6 @@ module Import
    end

    def convert
-      return "" if @text.blank?
-
      ast = JiraWikiMarkup::Parser.new(@text).parse
      JiraWikiMarkup::Renderer.new(ast).render
    end
@@ -102,8 +102,6 @@ module Import
    end

    def collect_markup_mentions(text, mention_usernames)
-      return if text.blank?
-
      ast = JiraWikiMarkup::Parser.new(text).parse
      collect_mentions_from_node(ast, mention_usernames)
    end
@@ -51,6 +51,38 @@ RSpec.describe Import::JiraWikiMarkupConverter do

      it { is_expected.to eq("This is not {code} and not [a link]") }
    end
+
+    context "with invalid UTF-8 byte sequences in the input" do
+      it "drops a stray invalid byte and keeps the surrounding text" do
+        input = "Hello \xFF world".dup
+        expect(input.valid_encoding?).to be(false)
+        expect(described_class.new(input).convert).to eq("Hello  world")
+      end
+
+      it "drops a stray continuation byte" do
+        input = "abc \x80 def".dup
+        expect(input.valid_encoding?).to be(false)
+        expect(described_class.new(input).convert).to eq("abc  def")
+      end
+
+      it "drops a truncated multi-byte sequence" do
+        input = "pre \xC3 post".dup
+        expect(input.valid_encoding?).to be(false)
+        expect(described_class.new(input).convert).to eq("pre  post")
+      end
+
+      it "preserves valid multi-byte characters while dropping only the invalid byte" do
+        input = "héllo \xFF world".dup
+        expect(input.valid_encoding?).to be(false)
+        expect(described_class.new(input).convert).to eq("héllo  world")
+      end
+
+      it "still parses formatting around invalid bytes inside delimiters" do
+        input = "*bold\xFFtext*".dup
+        expect(input.valid_encoding?).to be(false)
+        expect(described_class.new(input).convert).to eq("**boldtext**")
+      end
+    end
  end

  describe "line ending normalization" do