Kouhei Sutou
null+****@clear*****
Wed Feb 19 00:04:35 JST 2014
Kouhei Sutou 2014-02-19 00:04:35 +0900 (Wed, 19 Feb 2014) New Revision: d5fa6f18895af9998bab17b7307c841164564624 https://github.com/ranguba/ranguba/commit/d5fa6f18895af9998bab17b7307c841164564624 Message: Use chupa-text gem Modified files: Gemfile lib/ranguba/indexer.rb Modified: Gemfile (+4 -2) =================================================================== --- Gemfile 2014-02-19 00:03:56 +0900 (d134bcd) +++ Gemfile 2014-02-19 00:04:35 +0900 (5f0c17a) @@ -40,9 +40,11 @@ end # Use debugger # gem 'debugger', group: [:development, :test] -gem 'glib2' gem 'nokogiri' -gem 'chuparuby' +gem 'chupa-text' +gem 'chupa-text-decomposer-pdf' +gem 'chupa-text-decomposer-libreoffice' +gem 'chupa-text-decomposer-html' base_dir = File.join(File.dirname(__FILE__), "..") gem 'rroonga' Modified: lib/ranguba/indexer.rb (+21 -53) =================================================================== --- lib/ranguba/indexer.rb 2014-02-19 00:03:56 +0900 (806df5c) +++ lib/ranguba/indexer.rb 2014-02-19 00:04:35 +0900 (0d81141) @@ -3,7 +3,7 @@ require 'shellwords' require 'tmpdir' require 'fileutils' require 'time' -require 'chupatext' +require 'chupa-text' class Ranguba::Indexer attr_accessor :wget, :log_file, :url_prefix, :level, :accept, @@ -135,6 +135,9 @@ EOS end def prepare(args) + ChupaText::Decomposers.load + @extractor = ChupaText::Extractor.new + @extractor.apply_configuration(ChupaText::Configuration.default) if @log_file and @url_prefix raise OptionParser::InvalidOption, "--url-prefix and --from-log options are exclusive" end @@ -376,24 +379,19 @@ EOS def decompose_file_in_same_process(url, path, response) data = nil begin - input_data = Chupa::Data.new(path) - feeder = Chupa::Feeder.new - feeder.signal_connect("accepted") do |_feeder, _data| - data = _data + input_data = ChupaText::InputData.new(path) + @extractor.extract(input_data) do |extracted_data| + data = extracted_data end - feeder.feed(input_data) - rescue Chupa::Error => e + rescue ChupaText::EncryptedError + nil + 
rescue ChupaText::Error => e log(:error, "[error] #{e.class}: #{e.message}") log(:error, "[error] path: #{path}") - case e.code - when Chupa::DecomposerErrorCode::ENCRYPTED - return nil - else - raise - end else return nil if data.nil? - decomposed_file = DecomposedFile.new(@resolver, url, path, response, data) + decomposed_file = DecomposedFile.new(@resolver, url, path, response, + input_data, data) decomposed_file.attributes end end @@ -449,30 +447,25 @@ EOS end class DecomposedFile - include Loggable - - def initialize(resolver, url, path, response, data) + def initialize(resolver, url, path, response, input_data, data) @resolver = resolver @url = url @path = path @response = response - @metadata = data.metadata - @body = data.read || "" - if @body.encoding == Encoding::ASCII_8BIT - @body.force_encoding(@metadata.encoding || Encoding::UTF_8) - end + @input_data = input_data + @data = data end def attributes { key: @url, - title: @metadata.title, - body: @body, + title: @data.attributes.title, + body: @data.body, basename: @url.split(/\//).last, - type: normalize_type(@metadata.original_mime_type), - encoding: @response["charset"] || @metadata.original_encoding || "", + type: normalize_type(@input_data.mime_type), + encoding: @response["charset"] || @input_data.attributes.encoding.to_s, category: category_for_url(@url) || "", - author: @metadata.author || "", + author: @data.attributes.author || "", modified_at: modification_time, updated_at: @response["x-update-time"], } @@ -481,7 +474,7 @@ EOS private def modification_time modification_time = @response["last-modified"] - modification_time ||=****@metad*****_time + modification_time ||=****@data*****_time if modification_time begin modification_time = Time.parse(modification_time) @@ -500,30 +493,5 @@ EOS def normalize_type(source) @resolver.normalize_type(source) || "unknown" end - - def valid_encoding?(attributes) - url = attributes[:key] - invalid_encoding_attributes = attributes.reject do |key, value| - 
valid_utf8?(value) - end - invalid_encoding_keys = invalid_encoding_attributes.keys - if invalid_encoding_keys.blank? - true - else - message = "[#{invalid_encoding_keys.join(', ')}]" - log(:warn, "[encoding][invalid] key: #{url} - #{message}") - false - end - end - - def valid_utf8?(value) - return true unless value.respond_to?(:encode) - value = value.dup - value.force_encoding("UTF-8").valid_encoding? - end - - def log(level, message) - super(level, "[decompose]#{message}") - end end end -------------- next part -------------- HTMLの添付ファイルを保管しました... Télécharger