Be more cautious when parsing charset from file

This commit is contained in:
Oliver Günther
2026-05-29 10:30:07 +02:00
parent 5330745e69
commit a852d46cb6
2 changed files with 28 additions and 2 deletions
@@ -90,8 +90,11 @@ module OpenProject
end
# Splits a content-type string into its media type and charset.
#
# The string is cut on every ";" and each piece is stripped. The first piece
# is the media type; the remaining pieces are treated as parameters. The
# charset is taken from the first parameter that is exactly
# `charset=<token>`, where the token contains no whitespace or ";".
# A charset of "binary" is reported as no charset at all.
#
# @param type [String] e.g. "text/plain; charset=utf-8"
# @return [Array] two elements: the media type (nil when +type+ is empty)
#   and the charset (nil when absent, empty, malformed or "binary")
def extract_mime_and_charset(type)
  mime, *params = type.split(";").map(&:strip)

  # Match each parameter on its own so that unrelated or trailing parameters
  # can never leak into the captured charset value.
  charset = params.filter_map { |param| param[/\Acharset=([^\s;]+)\z/, 1] }.first
  charset = nil if charset == "binary"

  [mime, charset]
end
@@ -99,6 +99,29 @@ RSpec.describe OpenProject::FileCommandContentTypeDetector do
expect(Open3).to have_received(:capture2).with("file", "-b", "--mime", "--", "--help")
end
describe "charset parsing edge cases" do
  # Stubs Open3.capture2 to return +stdout+ with a status of 0, then runs
  # the detector and returns whatever #detect yields.
  def detect(stdout)
    allow(Open3).to receive(:capture2).and_return([stdout, 0])
    described_class.new("any").detect
  end

  it "extracts charset when followed by an extra unknown parameter" do
    result = detect("text/plain; charset=utf-8; taste=banana")

    expect(result).to eq(["text/plain", "utf-8"])
  end

  it "only captures the charset token, not trailing content" do
    result = detect("text/plain; charset=utf-8;extra")

    expect(result).to eq(["text/plain", "utf-8"])
  end

  it "returns nil charset when charset= has no value" do
    result = detect("text/plain; charset=")

    expect(result).to eq(["text/plain", nil])
  end

  it "ignores unrecognised parameters before charset" do
    result = detect("text/plain; taste=banana; charset=iso-8859-1")

    expect(result).to eq(["text/plain", "iso-8859-1"])
  end
end
describe "charset detection from real fixture files" do
let(:utf8_fixture) { Rails.root.join("spec/fixtures/encoding/utf-8.txt").to_s }
let(:iso8859_fixture) { Rails.root.join("spec/fixtures/encoding/iso-8859-1.txt").to_s }