also guard against invalid UTF-8 in input

This commit is contained in:
as-op
2026-04-23 12:56:26 +02:00
parent 4c00114a63
commit 3368ae22ab
4 changed files with 39 additions and 5 deletions
@@ -46,10 +46,16 @@ module Import
)
# Stores a sanitized private copy of +text+ for parsing.
#
# The input is coerced with +to_s+ (so nil becomes "") and duplicated, which
# leaves the caller's string untouched and yields an unfrozen buffer. Byte
# sequences that are invalid for the string's encoding are then dropped so
# downstream regex/StringScanner operations cannot raise ArgumentError on
# malformed input. The string's encoding itself is not changed.
#
# @param text [String, nil, #to_s] markup source to parse
def initialize(text)
  @text = text.to_s.dup
  # Only broken strings need scrubbing; valid input is stored as copied.
  @text.scrub!("") unless @text.valid_encoding?
end
def parse
return N::Document.new(children: []) if @text.blank?
preprocess
blocks = parse_blocks
N::Document.new(children: blocks)
@@ -35,8 +35,6 @@ module Import
end
# Converts the stored markup by parsing it into an AST with
# JiraWikiMarkup::Parser and rendering that AST with
# JiraWikiMarkup::Renderer.
#
# The raw text is deliberately not checked with blank? here: ActiveSupport's
# String#blank? performs a regex match, which raises ArgumentError when the
# string contains invalid byte sequences. The parser scrubs its input first
# and returns an empty document for blank text, so the check belongs there.
#
# @return the result of Renderer#render
#   NOTE(review): confirm the renderer yields "" for an empty document.
def convert
  ast = JiraWikiMarkup::Parser.new(@text).parse
  JiraWikiMarkup::Renderer.new(ast).render
end
@@ -102,8 +102,6 @@ module Import
end
# Parses +text+ with JiraWikiMarkup::Parser and hands the resulting AST,
# together with +mention_usernames+, to collect_mentions_from_node.
#
# The raw text is not checked with blank? first: ActiveSupport's
# String#blank? runs a regex match that raises ArgumentError on invalid byte
# sequences. The parser scrubs its input and returns an empty document for
# blank text, so blank or malformed input is handled there.
#
# @param text [String, nil] markup that may contain mentions
# @param mention_usernames collector passed through to
#   collect_mentions_from_node (presumably mutated in place; verify there)
# NOTE(review): for blank text this now returns whatever
#   collect_mentions_from_node returns for an empty document instead of nil;
#   confirm no caller depends on the return value.
def collect_markup_mentions(text, mention_usernames)
  ast = JiraWikiMarkup::Parser.new(text).parse
  collect_mentions_from_node(ast, mention_usernames)
end
@@ -51,6 +51,38 @@ RSpec.describe Import::JiraWikiMarkupConverter do
it { is_expected.to eq("This is not {code} and not [a link]") }
end
context "with invalid UTF-8 byte sequences in the input" do
  # Each row: example description, raw input containing an invalid byte
  # sequence, and the output expected from the converter.
  [
    ["drops a stray invalid byte and keeps the surrounding text", "Hello \xFF world", "Hello world"],
    ["drops a stray continuation byte", "abc \x80 def", "abc def"],
    ["drops a truncated multi-byte sequence", "pre \xC3 post", "pre post"],
    ["preserves valid multi-byte characters while dropping only the invalid byte", "héllo \xFF world", "héllo world"],
    ["still parses formatting around invalid bytes inside delimiters", "*bold\xFFtext*", "**boldtext**"]
  ].each do |description, raw, expected|
    it description do
      input = raw.dup
      # Sanity check: the fixture really is malformed before conversion.
      expect(input.valid_encoding?).to be(false)
      expect(described_class.new(input).convert).to eq(expected)
    end
  end
end
end
describe "line ending normalization" do