From 3368ae22ab72a63ba7ca9a8023fc3bef14878bcf Mon Sep 17 00:00:00 2001 From: as-op Date: Thu, 23 Apr 2026 12:56:26 +0200 Subject: [PATCH] also guard against invalid UTF-8 in input --- .../import/jira_wiki_markup/parser.rb | 8 ++++- .../import/jira_wiki_markup_converter.rb | 2 -- .../jira_fetch_and_import_projects_job.rb | 2 -- .../import/jira_wiki_markup_converter_spec.rb | 32 +++++++++++++++++++ 4 files changed, 39 insertions(+), 5 deletions(-) diff --git a/app/services/import/jira_wiki_markup/parser.rb b/app/services/import/jira_wiki_markup/parser.rb index e047e82a000..27e599862e0 100644 --- a/app/services/import/jira_wiki_markup/parser.rb +++ b/app/services/import/jira_wiki_markup/parser.rb @@ -46,10 +46,16 @@ module Import ) def initialize(text) - @text = text.dup + # Normalize any input into a safe, mutable UTF-8 string: nil becomes "", + # and invalid byte sequences are dropped so downstream regex/StringScanner + # operations cannot raise ArgumentError on malformed input. + @text = text.to_s.dup + @text.scrub!("") unless @text.valid_encoding? end def parse + return N::Document.new(children: []) if @text.blank? + preprocess blocks = parse_blocks N::Document.new(children: blocks) diff --git a/app/services/import/jira_wiki_markup_converter.rb b/app/services/import/jira_wiki_markup_converter.rb index aba715dc3f9..05d6ed49794 100644 --- a/app/services/import/jira_wiki_markup_converter.rb +++ b/app/services/import/jira_wiki_markup_converter.rb @@ -35,8 +35,6 @@ module Import end def convert - return "" if @text.blank? - ast = JiraWikiMarkup::Parser.new(@text).parse JiraWikiMarkup::Renderer.new(ast).render end diff --git a/app/workers/import/jira_fetch_and_import_projects_job.rb b/app/workers/import/jira_fetch_and_import_projects_job.rb index 7d68aaf8c27..15767340bfc 100644 --- a/app/workers/import/jira_fetch_and_import_projects_job.rb +++ b/app/workers/import/jira_fetch_and_import_projects_job.rb @@ -102,8 +102,6 @@ module Import end def collect_markup_mentions(text, mention_usernames) - return if text.blank? - ast = JiraWikiMarkup::Parser.new(text).parse collect_mentions_from_node(ast, mention_usernames) end diff --git a/spec/services/import/jira_wiki_markup_converter_spec.rb b/spec/services/import/jira_wiki_markup_converter_spec.rb index cc91c271e5b..a1c145ce342 100644 --- a/spec/services/import/jira_wiki_markup_converter_spec.rb +++ b/spec/services/import/jira_wiki_markup_converter_spec.rb @@ -51,6 +51,38 @@ RSpec.describe Import::JiraWikiMarkupConverter do it { is_expected.to eq("This is not {code} and not [a link]") } end + + context "with invalid UTF-8 byte sequences in the input" do + it "drops a stray invalid byte and keeps the surrounding text" do + input = "Hello \xFF world".dup + expect(input.valid_encoding?).to be(false) + expect(described_class.new(input).convert).to eq("Hello world") + end + + it "drops a stray continuation byte" do + input = "abc \x80 def".dup + expect(input.valid_encoding?).to be(false) + expect(described_class.new(input).convert).to eq("abc def") + end + + it "drops a truncated multi-byte sequence" do + input = "pre \xC3 post".dup + expect(input.valid_encoding?).to be(false) + expect(described_class.new(input).convert).to eq("pre post") + end + + it "preserves valid multi-byte characters while dropping only the invalid byte" do + input = "héllo \xFF world".dup + expect(input.valid_encoding?).to be(false) + expect(described_class.new(input).convert).to eq("héllo world") + end + + it "still parses formatting around invalid bytes inside delimiters" do + input = "*bold\xFFtext*".dup + expect(input.valid_encoding?).to be(false) + expect(described_class.new(input).convert).to eq("**boldtext**") + end + end end describe "line ending normalization" do