From 6f63faeed101d0f3dabd5cc6b8c743802e20e09c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20G=C3=BCnther?= Date: Thu, 28 May 2026 13:23:22 +0200 Subject: [PATCH 1/3] Extract and use charset to properly encode attachments --- app/contracts/attachments/create_contract.rb | 1 + app/models/attachment.rb | 13 +++- config/constants/settings/definition.rb | 5 ++ config/locales/en.yml | 1 + ...260528120000_add_charset_to_attachments.rb | 35 ++++++++++ lib/api/helpers/attachment_renderer.rb | 3 +- lib/open_project/content_type_detector.rb | 31 ++++----- .../file_command_content_type_detector.rb | 22 +++++-- spec/fixtures/encoding/utf-8.txt | 1 + .../files/{testfile.txt => testfile-utf8.txt} | 0 .../content_type_detector_spec.rb | 2 +- ...file_command_content_type_detector_spec.rb | 43 +++++++++---- spec/models/attachment_spec.rb | 50 ++++++++++++++- .../attachment_resource_shared_examples.rb | 64 +++++++++++++++++-- 14 files changed, 228 insertions(+), 43 deletions(-) create mode 100644 db/migrate/20260528120000_add_charset_to_attachments.rb create mode 100644 spec/fixtures/encoding/utf-8.txt rename spec/fixtures/files/{testfile.txt => testfile-utf8.txt} (100%) diff --git a/app/contracts/attachments/create_contract.rb b/app/contracts/attachments/create_contract.rb index 9686efdac59..415364ad196 100644 --- a/app/contracts/attachments/create_contract.rb +++ b/app/contracts/attachments/create_contract.rb @@ -36,6 +36,7 @@ module Attachments attribute :digest attribute :description attribute :content_type + attribute :charset attribute :container attribute :container_type attribute :author diff --git a/app/models/attachment.rb b/app/models/attachment.rb index 0679e6caa55..a4552f9ada1 100644 --- a/app/models/attachment.rb +++ b/app/models/attachment.rb @@ -119,6 +119,17 @@ class Attachment < ApplicationRecord end end + # Returns the content type to use when serving the file to a browser. + # For text files, ensures a charset is always present so browsers don't + # fall back to ISO-8859-1. + # We use a configurable fallback (default utf-8) so that administrators + # can control content types for previously uploaded attachments + def serving_content_type + return content_type unless is_text? + + "#{content_type}; charset=#{charset.presence || Setting.attachment_default_charset}" + end + def visible?(user = User.current) allowed_or_author?(user) do container.attachments_visible?(user) @@ -227,7 +238,7 @@ class Attachment < ApplicationRecord end def set_content_type(file) - self.content_type = self.class.content_type_for(file.path) + self.content_type, self.charset = OpenProject::ContentTypeDetector.new(file.path).detect_with_charset end def set_digest(file) diff --git a/config/constants/settings/definition.rb b/config/constants/settings/definition.rb index 6b65694e6b9..03d4884db30 100644 --- a/config/constants/settings/definition.rb +++ b/config/constants/settings/definition.rb @@ -81,6 +81,11 @@ module Settings organization_name: { default: "My Organization" }, + attachment_default_charset: { + description: "Fallback charset used when serving text attachments whose encoding was not detected on upload", + format: :string, + default: "utf-8" + }, attachment_max_size: { default: 5120 }, diff --git a/config/locales/en.yml b/config/locales/en.yml index bc67b832d7e..5134e042d2f 100644 --- a/config/locales/en.yml +++ b/config/locales/en.yml @@ -1808,6 +1808,7 @@ en: attachment: attachment_content: "Attachment content" attachment_file_name: "Attachment file name" + charset: "Character set" content_type: "Content-type" downloads: "Downloads" file: "File" diff --git a/db/migrate/20260528120000_add_charset_to_attachments.rb b/db/migrate/20260528120000_add_charset_to_attachments.rb new file mode 100644 index 00000000000..f49a48a9b73 --- /dev/null +++ b/db/migrate/20260528120000_add_charset_to_attachments.rb @@ -0,0 +1,35 @@ +# frozen_string_literal: true + +#-- copyright +# OpenProject is an open source project management software. +# Copyright (C) the OpenProject GmbH +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License version 3. +# +# OpenProject is a fork of ChiliProject, which is a fork of Redmine. The copyright follows: +# Copyright (C) 2006-2013 Jean-Philippe Lang +# Copyright (C) 2010-2013 the ChiliProject Team +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +# +# See COPYRIGHT and LICENSE files for more details. +#++ + +class AddCharsetToAttachments < ActiveRecord::Migration[8.0] + def change + add_column :attachments, :charset, :text + end +end diff --git a/lib/api/helpers/attachment_renderer.rb b/lib/api/helpers/attachment_renderer.rb index 74a7f9e32b2..c086a6de814 100644 --- a/lib/api/helpers/attachment_renderer.rb +++ b/lib/api/helpers/attachment_renderer.rb @@ -103,7 +103,8 @@ module API if attachment.is_text? # Even if the text mime type might differ, always output plain text # so this doesn't get interpreted as e.g., a script or html file - "text/plain" + charset = attachment.charset.presence || Setting.attachment_default_charset + "text/plain; charset=#{charset}" elsif attachment.inlineable? attachment.content_type else diff --git a/lib/open_project/content_type_detector.rb b/lib/open_project/content_type_detector.rb index c0a57ef8e5f..df4be688a63 100644 --- a/lib/open_project/content_type_detector.rb +++ b/lib/open_project/content_type_detector.rb @@ -92,19 +92,19 @@ module OpenProject @filename = filename end - # Returns a String describing the file's content type - def detect - if blank_name? - SENSIBLE_DEFAULT - elsif empty_file? - EMPTY_TYPE - elsif calculated_type_matches.any? - calculated_type_matches.first - else - type_from_file_command || SENSIBLE_DEFAULT - end.to_s + # Returns [mime_type, charset_or_nil], running the file command once. + def detect_with_charset + return [SENSIBLE_DEFAULT, nil] if blank_name? + return [EMPTY_TYPE, nil] if empty_file? + + raw_mime, charset = FileCommandContentTypeDetector.new(@filename).detect + [resolve_mime(raw_mime), charset] end + # Detecting only the mime type is effectively the + # first argument of +detect_with_charset+ + def detect = detect_with_charset.first + private def empty_file? @@ -121,12 +121,9 @@ module OpenProject MIME::Types.type_for(@filename).map(&:content_type) end - def calculated_type_matches - possible_types.select { |content_type| content_type == type_from_file_command } - end - - def type_from_file_command - @type_from_file_command ||= FileCommandContentTypeDetector.new(@filename).detect + def resolve_mime(raw_mime) + matches = possible_types.select { |ct| ct == raw_mime } + (matches.first || raw_mime || SENSIBLE_DEFAULT).to_s end end end diff --git a/lib/open_project/file_command_content_type_detector.rb b/lib/open_project/file_command_content_type_detector.rb index 65b7676d424..f491e17dc0b 100644 --- a/lib/open_project/file_command_content_type_detector.rb +++ b/lib/open_project/file_command_content_type_detector.rb @@ -69,23 +69,31 @@ module OpenProject @filename = filename end + # Returns [mime_type, charset_or_nil], e.g.: + # ["text/plain", "utf-8"] + # ["image/png", nil] def detect - type_from_file_command + @detect ||= parse_file_command end private - def type_from_file_command + def parse_file_command # On BSDs, `file` doesn't give a result code of 1 if the file doesn't exist. type, status = Open3.capture2("file", "-b", "--mime", "--", @filename) + return [SENSIBLE_DEFAULT, nil] if type.nil? || status.to_i > 0 || type.match(/\(.*?\)/) - if type.nil? || status.to_i > 0 || type.match(/\(.*?\)/) - type = SENSIBLE_DEFAULT - end - type.split(/[:;\s]+/)[0] + extract_mime_and_charset(type.strip) rescue StandardError => e Rails.logger.info { "Failed to get mime type from #{@filename}: #{e} #{e.message}" } - SENSIBLE_DEFAULT + [SENSIBLE_DEFAULT, nil] + end + + def extract_mime_and_charset(type) + mime, charset_param = type.split(";", 2).map(&:strip) + charset = charset_param&.match(/\Acharset=(.+)\z/)&.[](1) + charset = nil if charset == "binary" + [mime, charset] end end end diff --git a/spec/fixtures/encoding/utf-8.txt b/spec/fixtures/encoding/utf-8.txt new file mode 100644 index 00000000000..7e2f8091944 --- /dev/null +++ b/spec/fixtures/encoding/utf-8.txt @@ -0,0 +1 @@ +UTF-8 encoded text with non-ASCII characters: äöü € diff --git a/spec/fixtures/files/testfile.txt b/spec/fixtures/files/testfile-utf8.txt similarity index 100% rename from spec/fixtures/files/testfile.txt rename to spec/fixtures/files/testfile-utf8.txt diff --git a/spec/lib/open_project/content_type_detector_spec.rb b/spec/lib/open_project/content_type_detector_spec.rb index a04243a6ed4..d67d15ec865 100644 --- a/spec/lib/open_project/content_type_detector_spec.rb +++ b/spec/lib/open_project/content_type_detector_spec.rb @@ -90,7 +90,7 @@ RSpec.describe OpenProject::ContentTypeDetector do File.open(@filename, "w+") do |file| file.puts "This is a text file." file.rewind - expect(OpenProject::ContentTypeDetector.new(file.path).detect).to eq("text/plain") + expect(described_class.new(file.path).detect).to start_with("text/plain") end FileUtils.rm @filename end diff --git a/spec/lib/open_project/file_command_content_type_detector_spec.rb b/spec/lib/open_project/file_command_content_type_detector_spec.rb index 0113173d426..4f444643932 100644 --- a/spec/lib/open_project/file_command_content_type_detector_spec.rb +++ b/spec/lib/open_project/file_command_content_type_detector_spec.rb @@ -59,35 +59,36 @@ require "spec_helper" RSpec.describe OpenProject::FileCommandContentTypeDetector do - it "returns a content type based on the content of the file" do + it "returns a [mime_type, charset] tuple for a text file" do tempfile = Tempfile.new("something") tempfile.write("This is a file.") tempfile.rewind - expect(described_class.new(tempfile.path).detect).to eq("text/plain") + mime, charset = described_class.new(tempfile.path).detect + expect(mime).to eq("text/plain") + expect(charset).to be_a(String) tempfile.close end - it "returns a sensible default when the file command is missing" do + it "returns [sensible_default, nil] when the file command is missing" do allow(Open3).to receive(:capture2).and_raise "o noes!" - filename = "/path/to/something" - expect(described_class.new(filename).detect).to eq("application/binary") + expect(described_class.new("/path/to/something").detect).to eq(["application/binary", nil]) end - it "returns a sensible default on the odd chance that run returns nil" do + it "returns [sensible_default, nil] on the odd chance that run returns nil" do allow(Open3).to receive(:capture2).and_return [nil, 0] - expect(described_class.new("windows").detect).to eq("application/binary") + expect(described_class.new("windows").detect).to eq(["application/binary", nil]) end - it "returns a sensible default when the file command returns an error code" do + it "returns [sensible_default, nil] when the file command returns an error code" do allow(Open3).to receive(:capture2).and_return ["text/plain", 1] - expect(described_class.new("windows").detect).to eq("application/binary") + expect(described_class.new("windows").detect).to eq(["application/binary", nil]) end - it "returns a sensible default when the file command returns a type with parentheses" do + it "returns [sensible_default, nil] when the file command returns a type with parentheses" do allow(Open3).to receive(:capture2).and_return ["text/plain (with something)", 0] - expect(described_class.new("windows").detect).to eq("application/binary") + expect(described_class.new("windows").detect).to eq(["application/binary", nil]) end it "uses end-of-input delimiter to prevent command injection" do @@ -97,4 +98,24 @@ RSpec.describe OpenProject::FileCommandContentTypeDetector do expect(Open3).to have_received(:capture2).with("file", "-b", "--mime", "--", "--help") end + + describe "charset detection from real fixture files" do + let(:utf8_fixture) { Rails.root.join("spec/fixtures/encoding/utf-8.txt").to_s } + let(:iso8859_fixture) { Rails.root.join("spec/fixtures/encoding/iso-8859-1.txt").to_s } + let(:png_fixture) { Rails.root.join("spec/fixtures/files/image.png").to_s } + + it "detects utf-8 charset for a UTF-8 encoded file" do + expect(described_class.new(utf8_fixture).detect).to eq(["text/plain", "utf-8"]) + end + + it "detects iso-8859-1 charset for an ISO-8859-1 encoded file" do + expect(described_class.new(iso8859_fixture).detect).to eq(["text/plain", "iso-8859-1"]) + end + + it "returns nil charset for non-text files" do + mime, charset = described_class.new(png_fixture).detect + expect(mime).to eq("image/png") + expect(charset).to be_nil + end + end end diff --git a/spec/models/attachment_spec.rb b/spec/models/attachment_spec.rb index 05b1f5f81e3..f1a7f81b1a3 100644 --- a/spec/models/attachment_spec.rb +++ b/spec/models/attachment_spec.rb @@ -252,7 +252,7 @@ RSpec.describe Attachment do let(:author) { create(:user) } let(:image_path) { Rails.root.join("spec/fixtures/files/image.png") } - let(:text_path) { Rails.root.join("spec/fixtures/files/testfile.txt") } + let(:text_path) { Rails.root.join("spec/fixtures/files/testfile-utf8.txt") } let(:binary_path) { Rails.root.join("spec/fixtures/files/textfile.txt.gz") } let(:image_attachment) { FogAttachment.new author:, file: File.open(image_path) } @@ -354,6 +354,54 @@ RSpec.describe Attachment do end end + describe "#serving_content_type" do + subject(:attachment) { described_class.new(content_type:, charset:) } + + let(:charset) { nil } + + context "when a text file has a detected utf-8 charset (new upload)" do + let(:content_type) { "text/plain" } + let(:charset) { "utf-8" } + + it "combines content_type and charset" do + expect(attachment.serving_content_type).to eq("text/plain; charset=utf-8") + end + end + + context "when a text file has a non-UTF-8 charset (e.g. ISO-8859-1)" do + let(:content_type) { "text/plain" } + let(:charset) { "iso-8859-1" } + + it "uses the stored charset" do + expect(attachment.serving_content_type).to eq("text/plain; charset=iso-8859-1") + end + end + + context "when a text file has no charset stored (legacy upload)" do + let(:content_type) { "text/plain" } + + it "falls back to Setting.attachment_default_charset so browsers do not default to ISO-8859-1" do + expect(attachment.serving_content_type).to eq("text/plain; charset=#{Setting.attachment_default_charset}") + end + end + + context "when another text subtype has no charset stored" do + let(:content_type) { "text/x-ruby" } + + it "falls back to Setting.attachment_default_charset" do + expect(attachment.serving_content_type).to eq("text/x-ruby; charset=#{Setting.attachment_default_charset}") + end + end + + context "when the file is not a text type" do + let(:content_type) { "image/png" } + + it "returns the content type unchanged" do + expect(attachment.serving_content_type).to eq("image/png") + end + end + end + describe "virus scan job on commit" do shared_let(:work_package) { create(:work_package) } let(:created_attachment) do diff --git a/spec/requests/api/v3/attachments/attachment_resource_shared_examples.rb b/spec/requests/api/v3/attachments/attachment_resource_shared_examples.rb index 5052404d970..0bb6b1e36ce 100644 --- a/spec/requests/api/v3/attachments/attachment_resource_shared_examples.rb +++ b/spec/requests/api/v3/attachments/attachment_resource_shared_examples.rb @@ -486,19 +486,75 @@ RSpec.shared_examples "an APIv3 attachment resource", content_type: :json, type: end end - context "for a local text file" do + context "for a local text file (no stored charset, uses configured default)" do it_behaves_like "for a local file" do - let(:expected_content_type) { "text/plain" } + let(:expected_content_type) { "text/plain; charset=#{Setting.attachment_default_charset}" } let(:mock_file) { FileHelpers.mock_uploaded_file name: "foobar.txt" } let(:content_disposition) { "inline; filename=foobar.txt" } + let(:attachment) do + att = create(:attachment, container:, file: mock_file, author: current_user) + att.file.store! + att.send :write_attribute, :file, mock_file.original_filename + att.send :write_attribute, :content_type, "text/plain" + att.send :write_attribute, :charset, nil + att.save! + att + end end end - context "for a local JS file" do + context "for a local JS file (normalised to text/plain, uses configured default charset)" do it_behaves_like "for a local file" do - let(:expected_content_type) { "text/plain" } + let(:expected_content_type) { "text/plain; charset=#{Setting.attachment_default_charset}" } let(:mock_file) { FileHelpers.mock_uploaded_file name: "foobar.js", content_type: "text/x-javascript" } let(:content_disposition) { "inline; filename=foobar.js" } + let(:attachment) do + att = create(:attachment, container:, file: mock_file, author: current_user) + att.file.store! + att.send :write_attribute, :file, mock_file.original_filename + att.send :write_attribute, :content_type, "text/x-javascript" + att.send :write_attribute, :charset, nil + att.save! + att + end + end + end + + context "for a local UTF-8 text file" do + it_behaves_like "for a local file" do + let(:expected_content_type) { "text/plain; charset=utf-8" } + let(:mock_file) { FileHelpers.mock_uploaded_file name: "foobar.txt" } + let(:content_disposition) { "inline; filename=foobar.txt" } + let(:attachment) do + att = create(:attachment, container:, file: mock_file, author: current_user) + att.file.store! + att.send :write_attribute, :file, mock_file.original_filename + att.send :write_attribute, :content_type, "text/plain" + att.send :write_attribute, :charset, "utf-8" + att.save! + att + end + end + end + + context "for a local ISO-8859-1 text file" do + it_behaves_like "for a local file" do + let(:expected_content_type) { "text/plain; charset=iso-8859-1" } + let(:mock_file) do + FileHelpers.mock_uploaded_file name: "iso.txt", + content: Rails.root.join("spec/fixtures/encoding/iso-8859-1.txt").binread, + binary: true + end + let(:content_disposition) { "inline; filename=iso.txt" } + let(:attachment) do + att = create(:attachment, container:, file: mock_file, author: current_user) + att.file.store! + att.send :write_attribute, :file, mock_file.original_filename + att.send :write_attribute, :content_type, "text/plain" + att.send :write_attribute, :charset, "iso-8859-1" + att.save! + att + end end end From 5330745e69adf82a195434f6a38e1e0ef302e59c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20G=C3=BCnther?= Date: Fri, 29 May 2026 10:25:20 +0200 Subject: [PATCH 2/3] Add the content type for external URLs --- app/models/attachment.rb | 23 +++++++++++++++++++---- app/uploaders/fog_file_uploader.rb | 22 +++++++++++++--------- lib/api/helpers/attachment_renderer.rb | 12 +----------- spec/models/attachment_spec.rb | 9 +++++++++ 4 files changed, 42 insertions(+), 24 deletions(-) diff --git a/app/models/attachment.rb b/app/models/attachment.rb index a4552f9ada1..4ba634a12bd 100644 --- a/app/models/attachment.rb +++ b/app/models/attachment.rb @@ -93,7 +93,9 @@ class Attachment < ApplicationRecord # specifically when using S3 for attachments. In the case of S3 the file name for the downloaded # file will still be correct as it's part of the URL before the query. def external_url_options(expires_in: nil) - { content_disposition: content_disposition(include_filename: false), expires_in: } + { content_disposition: content_disposition(include_filename: false), + content_type: served_content_type, + expires_in: } end def external_storage? @@ -119,11 +121,24 @@ class Attachment < ApplicationRecord end end + # Returns the Content-Type to use when serving this file inline in a browser. + # Text files are normalised to text/plain (prevents script execution) with an + # explicit charset. Non-inlineable files get application/octet-stream so the + # browser is forced to download them. + def served_content_type + if is_text? + "text/plain; charset=#{charset.presence || Setting.attachment_default_charset}" + elsif inlineable? + content_type + else + "application/octet-stream" + end + end + # Returns the content type to use when serving the file to a browser. # For text files, ensures a charset is always present so browsers don't - # fall back to ISO-8859-1. - # We use a configurable fallback (default utf-8) so that administrators - # can control content types for previously uploaded attachments + # fall back to ISO-8859-1. Preserves the real MIME subtype (e.g. text/x-ruby) + # unlike served_content_type which normalises to text/plain for security. def serving_content_type return content_type unless is_text? diff --git a/app/uploaders/fog_file_uploader.rb b/app/uploaders/fog_file_uploader.rb index 463073abcf6..05cb857bbb8 100644 --- a/app/uploaders/fog_file_uploader.rb +++ b/app/uploaders/fog_file_uploader.rb @@ -74,6 +74,7 @@ class FogFileUploader < CarrierWave::Uploader::Base # # @param options [Hash] Options hash. # @option options [String] :content_disposition Pass this content disposition to S3 so that it serves the file with it. + # @option options [String] :content_type Pass this content type to S3 so that it serves the file with it. # @option options [DateTime] :expires_at Date at which the link should expire (default: now + 5 minutes) # @option options [ActiveSupport::Duration] :expires_in Duration in which the link should expire. # @@ -82,6 +83,7 @@ class FogFileUploader < CarrierWave::Uploader::Base url_options = {} set_content_disposition!(url_options, options:) + set_content_type!(url_options, options:) set_expires_at!(url_options, options:) remote_file.url url_options @@ -103,15 +105,17 @@ class FogFileUploader < CarrierWave::Uploader::Base private def set_content_disposition!(url_options, options:) - if options[:content_disposition].present? - url_options[:query] = { - # Passing this option to S3 will make it serve the file with the - # respective content disposition. Without it no content disposition - # header is sent. This only works for S3 but we don't support - # anything else anyway (see carrierwave.rb). - "response-content-disposition" => options[:content_disposition] - } - end + return if options[:content_disposition].blank? + + (url_options[:query] ||= {})["response-content-disposition"] = options[:content_disposition] + end + + def set_content_type!(url_options, options:) + return if options[:content_type].blank? + + # Like the content disposition above, this makes S3 serve the file with the + # given Content-Type, overriding the stored object type. + (url_options[:query] ||= {})["response-content-type"] = options[:content_type] end def set_expires_at!(url_options, options:) diff --git a/lib/api/helpers/attachment_renderer.rb b/lib/api/helpers/attachment_renderer.rb index c086a6de814..3bc18c0fb01 100644 --- a/lib/api/helpers/attachment_renderer.rb +++ b/lib/api/helpers/attachment_renderer.rb @@ -100,17 +100,7 @@ module API end def attachment_content_type(attachment) - if attachment.is_text? - # Even if the text mime type might differ, always output plain text - # so this doesn't get interpreted as e.g., a script or html file - charset = attachment.charset.presence || Setting.attachment_default_charset - "text/plain; charset=#{charset}" - elsif attachment.inlineable? - attachment.content_type - else - # For security reasons, mark all non-inlinable files as an octet-stream first - "application/octet-stream" - end + attachment.served_content_type end def set_cache_headers diff --git a/spec/models/attachment_spec.rb b/spec/models/attachment_spec.rb index f1a7f81b1a3..be4ffcb0f27 100644 --- a/spec/models/attachment_spec.rb +++ b/spec/models/attachment_spec.rb @@ -320,6 +320,11 @@ RSpec.describe Attachment do it_behaves_like "it uses content disposition inline" do let(:attachment) { text_attachment } end + + it "includes response-content-type with text/plain and the detected charset in the S3 URL" do + url = text_attachment.external_url.to_s + expect(url).to include "response-content-type=text%2Fplain%3B%20charset%3D" + end end describe "for a video file" do @@ -351,6 +356,10 @@ RSpec.describe Attachment do expect(binary_attachment.content_disposition).to eq "attachment; filename=textfile.txt.gz" expect(binary_attachment.external_url.to_s).to include "response-content-disposition=attachment" end + + it "includes response-content-type application/octet-stream in the S3 URL" do + expect(binary_attachment.external_url.to_s).to include "response-content-type=application%2Foctet-stream" + end end end From a852d46cb6f17fd2453ee4690a7bba185a627b2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oliver=20G=C3=BCnther?= Date: Fri, 29 May 2026 10:30:07 +0200 Subject: [PATCH 3/3] Be more cautious when parsing charset from `file` --- .../file_command_content_type_detector.rb | 7 ++++-- ...file_command_content_type_detector_spec.rb | 23 +++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/lib/open_project/file_command_content_type_detector.rb b/lib/open_project/file_command_content_type_detector.rb index f491e17dc0b..f25ee999c97 100644 --- a/lib/open_project/file_command_content_type_detector.rb +++ b/lib/open_project/file_command_content_type_detector.rb @@ -90,8 +90,11 @@ module OpenProject end def extract_mime_and_charset(type) - mime, charset_param = type.split(";", 2).map(&:strip) - charset = charset_param&.match(/\Acharset=(.+)\z/)&.[](1) + parts = type.split(";").map(&:strip) + mime = parts.first + charset = parts.drop(1) + .filter_map { |p| p.match(/\Acharset=([^\s;]+)\z/)&.[](1) } + .first charset = nil if charset == "binary" [mime, charset] end diff --git a/spec/lib/open_project/file_command_content_type_detector_spec.rb b/spec/lib/open_project/file_command_content_type_detector_spec.rb index 4f444643932..a20c848b0ca 100644 --- a/spec/lib/open_project/file_command_content_type_detector_spec.rb +++ b/spec/lib/open_project/file_command_content_type_detector_spec.rb @@ -99,6 +99,29 @@ RSpec.describe OpenProject::FileCommandContentTypeDetector do expect(Open3).to have_received(:capture2).with("file", "-b", "--mime", "--", "--help") end + describe "charset parsing edge cases" do + def detect(raw_output) + allow(Open3).to receive(:capture2).and_return [raw_output, 0] + described_class.new("any").detect + end + + it "extracts charset when followed by an extra unknown parameter" do + expect(detect("text/plain; charset=utf-8; taste=banana")).to eq(["text/plain", "utf-8"]) + end + + it "only captures the charset token, not trailing content" do + expect(detect("text/plain; charset=utf-8;extra")).to eq(["text/plain", "utf-8"]) + end + + it "returns nil charset when charset= has no value" do + expect(detect("text/plain; charset=")).to eq(["text/plain", nil]) + end + + it "ignores unrecognised parameters before charset" do + expect(detect("text/plain; taste=banana; charset=iso-8859-1")).to eq(["text/plain", "iso-8859-1"]) + end + end + describe "charset detection from real fixture files" do let(:utf8_fixture) { Rails.root.join("spec/fixtures/encoding/utf-8.txt").to_s } let(:iso8859_fixture) { Rails.root.join("spec/fixtures/encoding/iso-8859-1.txt").to_s }