From 90ba46d406de348dc1ea57af39f064e93b095d42 Mon Sep 17 00:00:00 2001 From: Wieland Lindenthal Date: Thu, 15 Feb 2018 17:15:00 +0100 Subject: [PATCH] =?UTF-8?q?Gem=20name=20changed=20from=20=E2=80=9Atext=5Fe?= =?UTF-8?q?xtractor=E2=80=98=20to=20=E2=80=9Aplaintext=E2=80=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 +- Gemfile | 3 +- Gemfile.lock | 10 ++--- app/controllers/admin_controller.rb | 12 +++--- app/workers/extract_fulltext_job.rb | 2 +- .../{text_extractor.rb => plaintext.rb} | 4 +- config/plaintext.yml | 38 +++++++++++++++++++ .../table/queries/filter_spec.rb | 4 +- spec/workers/extract_fulltext_job_spec.rb | 4 +- 9 files changed, 58 insertions(+), 21 deletions(-) rename config/initializers/{text_extractor.rb => plaintext.rb} (91%) create mode 100644 config/plaintext.yml diff --git a/.gitignore b/.gitignore index 72dae3dd3da..f6af361c08b 100644 --- a/.gitignore +++ b/.gitignore @@ -90,5 +90,5 @@ npm-debug.log* /frontend/dist/ /frontend/tests/*.gif node_modules/ -text_extractor.yml +plaintext.yml structure.sql diff --git a/Gemfile b/Gemfile index 32eaadf23c8..95967a617c9 100644 --- a/Gemfile +++ b/Gemfile @@ -163,8 +163,7 @@ gem 'aws-sdk', '~> 2.10.1' gem 'openproject-token', '~> 1.0.1' -gem 'text-extractor', '0.1.0' - +gem 'plaintext', '0.1.0' group :test do gem 'rack-test', '~> 0.6.3' diff --git a/Gemfile.lock b/Gemfile.lock index 7a5c4d34187..a9a4e6b0478 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -383,6 +383,10 @@ GEM rake (>= 0.8.1) pdf-core (0.7.0) pg (0.21.0) + plaintext (0.1.0) + activesupport (> 2.2.1) + nokogiri (~> 1.8.1) + rubyzip (~> 1.2.1) powerpack (0.1.1) prawn (2.2.2) pdf-core (~> 0.7.0) @@ -559,10 +563,6 @@ GEM sys-filesystem (1.1.8) ffi test-prof (0.1.0) - text-extractor (0.1.0) - activesupport (> 2.2.1) - nokogiri (~> 1.8.1) - rubyzip (~> 1.2.1) thin (1.7.2) daemons (~> 1.0, >= 1.0.9) eventmachine (~> 1.0, >= 1.0.4) @@ -674,6 +674,7 @@ DEPENDENCIES parallel_tests (~> 2.14.0) passenger pg (~> 0.21.0) + plaintext (= 0.1.0) prawn (~> 2.2) prawn-table (~> 0.2.2) pry-byebug (~> 3.4.2) @@ -720,7 +721,6 @@ DEPENDENCIES syck (~> 1.3.0) sys-filesystem (~> 1.1.4) test-prof - text-extractor (= 0.1.0) thin (~> 1.7.2) timecop (~> 0.9.0) transactional_lock! diff --git a/app/controllers/admin_controller.rb b/app/controllers/admin_controller.rb index 5d2bf4c066f..747290d4d08 100644 --- a/app/controllers/admin_controller.rb +++ b/app/controllers/admin_controller.rb @@ -77,12 +77,12 @@ class AdminController < ApplicationController @checklist = [ [:text_default_administrator_account_changed, User.default_admin_account_changed?], [:text_file_repository_writable, repository_writable], - [:'extraction.available.pdftotext', TextExtractor::PdfHandler.available?], - [:'extraction.available.unrtf', TextExtractor::RtfHandler.available?], - [:'extraction.available.catdoc', TextExtractor::DocHandler.available?], - [:'extraction.available.xls2csv', TextExtractor::XlsHandler.available?], - [:'extraction.available.catppt', TextExtractor::PptHandler.available?], - [:'extraction.available.tesseract', TextExtractor::ImageHandler.available?] + [:'extraction.available.pdftotext', Plaintext::PdfHandler.available?], + [:'extraction.available.unrtf', Plaintext::RtfHandler.available?], + [:'extraction.available.catdoc', Plaintext::DocHandler.available?], + [:'extraction.available.xls2csv', Plaintext::XlsHandler.available?], + [:'extraction.available.catppt', Plaintext::PptHandler.available?], + [:'extraction.available.tesseract', Plaintext::ImageHandler.available?] ] @storage_information = OpenProject::Storage.mount_information diff --git a/app/workers/extract_fulltext_job.rb b/app/workers/extract_fulltext_job.rb index c299b8b4525..ae10a94d911 100644 --- a/app/workers/extract_fulltext_job.rb +++ b/app/workers/extract_fulltext_job.rb @@ -55,7 +55,7 @@ class ExtractFulltextJob < ApplicationJob begin if @attachment.readable? - resolver = TextExtractor::Resolver.new(@file, @attachment.content_type) + resolver = Plaintext::Resolver.new(@file, @attachment.content_type) @text = resolver.text end rescue => e diff --git a/config/initializers/text_extractor.rb b/config/initializers/plaintext.rb similarity index 91% rename from config/initializers/text_extractor.rb rename to config/initializers/plaintext.rb index b90f569fb41..a068fa9f7d1 100644 --- a/config/initializers/text_extractor.rb +++ b/config/initializers/plaintext.rb @@ -27,8 +27,8 @@ # See doc/COPYRIGHT.rdoc for more details. #++ -file_name = File.join([Rails.root.to_s, 'config', 'text_extractor.yml']) +file_name = File.join([Rails.root.to_s, 'config', 'plaintext.yml']) if File.file?(file_name) config_file = File.read(file_name) - TextExtractor::Configuration.load(config_file) + Plaintext::Configuration.load(config_file) end diff --git a/config/plaintext.yml b/config/plaintext.yml new file mode 100644 index 00000000000..a0219f896cd --- /dev/null +++ b/config/plaintext.yml @@ -0,0 +1,38 @@ +# Text extraction helper programs. +# +# commands should write the resulting plain text to STDOUT. Use __FILE__ as +# placeholder for the file path. The values below are the defaults. +# apt install poppler-utils +pdftotext: + - /usr/local/bin/pdftotext + - -enc + - UTF-8 + - __FILE__ + - '-' + +# apt install unrtf +unrtf: + - /usr/local/bin/unrtf + - --text + - __FILE__ + +tesseract: + - /usr/local/bin/tesseract + - __FILE__ + - stdout + +# apt install catdoc +catdoc: + - /usr/bin/textutil + - -convert + - txt + - -stdout + - __FILE__ +# xls2csv: +# - /usr/bin/xls2csv +# - -dutf-8 +# - __FILE__ +# catppt: +# - /usr/bin/catppt +# - -dutf-8 +# - __FILE__ diff --git a/spec/features/work_packages/table/queries/filter_spec.rb b/spec/features/work_packages/table/queries/filter_spec.rb index eec0f31208d..c7607f76927 100644 --- a/spec/features/work_packages/table/queries/filter_spec.rb +++ b/spec/features/work_packages/table/queries/filter_spec.rb @@ -263,10 +263,10 @@ describe 'filter work packages', js: true do allow(EnterpriseToken).to receive(:allows_to?).and_return(false) allow(EnterpriseToken).to receive(:allows_to?).with(:attachment_filters).and_return(true) - allow_any_instance_of(TextExtractor::Resolver).to receive(:text).and_return('I am the first text $1.99.') + allow_any_instance_of(Plaintext::Resolver).to receive(:text).and_return('I am the first text $1.99.') wp_with_attachment_a ExtractFulltextJob.new(attachment_a.id).perform - allow_any_instance_of(TextExtractor::Resolver).to receive(:text).and_return('I am the second text.') + allow_any_instance_of(Plaintext::Resolver).to receive(:text).and_return('I am the second text.') wp_with_attachment_b ExtractFulltextJob.new(attachment_b.id).perform wp_without_attachment diff --git a/spec/workers/extract_fulltext_job_spec.rb b/spec/workers/extract_fulltext_job_spec.rb index c84f4403880..7a4ca8653a1 100644 --- a/spec/workers/extract_fulltext_job_spec.rb +++ b/spec/workers/extract_fulltext_job_spec.rb @@ -38,7 +38,7 @@ describe ExtractFulltextJob, type: :job do context "with successful text extraction" do before do - allow_any_instance_of(TextExtractor::Resolver).to receive(:text).and_return(text) + allow_any_instance_of(Plaintext::Resolver).to receive(:text).and_return(text) end context 'attachment is readable' do @@ -91,7 +91,7 @@ describe ExtractFulltextJob, type: :job do let(:logger) { Rails.logger } before do - allow_any_instance_of(TextExtractor::Resolver).to receive(:text).and_raise(exception_message) + allow_any_instance_of(Plaintext::Resolver).to receive(:text).and_raise(exception_message) # This line is actually part of the test. `expect` call needs to go so far up here, as we want to verify that a message gets logged. expect(logger).to receive(:error).with(exception_message)