[Groonga-commit] ranguba/ranguba at d5fa6f1 [master] Use chupa-text gem

Back to archive index

Kouhei Sutou null+****@clear*****
Wed Feb 19 00:04:35 JST 2014


Kouhei Sutou	2014-02-19 00:04:35 +0900 (Wed, 19 Feb 2014)

  New Revision: d5fa6f18895af9998bab17b7307c841164564624
  https://github.com/ranguba/ranguba/commit/d5fa6f18895af9998bab17b7307c841164564624

  Message:
    Use chupa-text gem

  Modified files:
    Gemfile
    lib/ranguba/indexer.rb

  Modified: Gemfile (+4 -2)
===================================================================
--- Gemfile    2014-02-19 00:03:56 +0900 (d134bcd)
+++ Gemfile    2014-02-19 00:04:35 +0900 (5f0c17a)
@@ -40,9 +40,11 @@ end
 # Use debugger
 # gem 'debugger', group: [:development, :test]
 
-gem 'glib2'
 gem 'nokogiri'
-gem 'chuparuby'
+gem 'chupa-text'
+gem 'chupa-text-decomposer-pdf'
+gem 'chupa-text-decomposer-libreoffice'
+gem 'chupa-text-decomposer-html'
 
 base_dir = File.join(File.dirname(__FILE__), "..")
 gem 'rroonga'

  Modified: lib/ranguba/indexer.rb (+21 -53)
===================================================================
--- lib/ranguba/indexer.rb    2014-02-19 00:03:56 +0900 (806df5c)
+++ lib/ranguba/indexer.rb    2014-02-19 00:04:35 +0900 (0d81141)
@@ -3,7 +3,7 @@ require 'shellwords'
 require 'tmpdir'
 require 'fileutils'
 require 'time'
-require 'chupatext'
+require 'chupa-text'
 
 class Ranguba::Indexer
   attr_accessor :wget, :log_file, :url_prefix, :level, :accept,
@@ -135,6 +135,9 @@ EOS
   end
 
   def prepare(args)
+    ChupaText::Decomposers.load
+    @extractor = ChupaText::Extractor.new
+    @extractor.apply_configuration(ChupaText::Configuration.default)
     if @log_file and @url_prefix
       raise OptionParser::InvalidOption, "--url-prefix and --from-log options are exclusive"
     end
@@ -376,24 +379,19 @@ EOS
   def decompose_file_in_same_process(url, path, response)
     data = nil
     begin
-      input_data = Chupa::Data.new(path)
-      feeder = Chupa::Feeder.new
-      feeder.signal_connect("accepted") do |_feeder, _data|
-        data = _data
+      input_data = ChupaText::InputData.new(path)
+      @extractor.extract(input_data) do |extracted_data|
+        data = extracted_data
       end
-      feeder.feed(input_data)
-    rescue Chupa::Error => e
+    rescue ChupaText::EncryptedError
+      nil
+    rescue ChupaText::Error => e
       log(:error, "[error] #{e.class}: #{e.message}")
       log(:error, "[error] path: #{path}")
-      case e.code
-      when Chupa::DecomposerErrorCode::ENCRYPTED
-        return nil
-      else
-        raise
-      end
     else
       return nil if data.nil?
-      decomposed_file = DecomposedFile.new(@resolver, url, path, response, data)
+      decomposed_file = DecomposedFile.new(@resolver, url, path, response,
+                                           input_data, data)
       decomposed_file.attributes
     end
   end
@@ -449,30 +447,25 @@ EOS
   end
 
   class DecomposedFile
-    include Loggable
-
-    def initialize(resolver, url, path, response, data)
+    def initialize(resolver, url, path, response, input_data, data)
       @resolver = resolver
       @url = url
       @path = path
       @response = response
-      @metadata = data.metadata
-      @body = data.read || ""
-      if @body.encoding == Encoding::ASCII_8BIT
-        @body.force_encoding(@metadata.encoding || Encoding::UTF_8)
-      end
+      @input_data = input_data
+      @data = data
     end
 
     def attributes
       {
         key: @url,
-        title: @metadata.title,
-        body: @body,
+        title: @data.attributes.title,
+        body: @data.body,
         basename: @url.split(/\//).last,
-        type: normalize_type(@metadata.original_mime_type),
-        encoding: @response["charset"] || @metadata.original_encoding || "",
+        type: normalize_type(@input_data.mime_type),
+        encoding: @response["charset"] || @input_data.attributes.encoding.to_s,
         category: category_for_url(@url) || "",
-        author: @metadata.author || "",
+        author: @data.attributes.author || "",
         modified_at: modification_time,
         updated_at: @response["x-update-time"],
       }
@@ -481,7 +474,7 @@ EOS
     private
     def modification_time
       modification_time = @response["last-modified"]
-      modification_time ||= @metadata.modification_time
+      modification_time ||= @data.attributes.modified_time
       if modification_time
         begin
           modification_time = Time.parse(modification_time)
@@ -500,30 +493,5 @@ EOS
     def normalize_type(source)
       @resolver.normalize_type(source) || "unknown"
     end
-
-    def valid_encoding?(attributes)
-      url = attributes[:key]
-      invalid_encoding_attributes = attributes.reject do |key, value|
-        valid_utf8?(value)
-      end
-      invalid_encoding_keys = invalid_encoding_attributes.keys
-      if invalid_encoding_keys.blank?
-        true
-      else
-        message = "[#{invalid_encoding_keys.join(', ')}]"
-        log(:warn, "[encoding][invalid] key: #{url} - #{message}")
-        false
-      end
-    end
-
-    def valid_utf8?(value)
-      return true unless value.respond_to?(:encode)
-      value = value.dup
-      value.force_encoding("UTF-8").valid_encoding?
-    end
-
-    def log(level, message)
-      super(level, "[decompose]#{message}")
-    end
   end
 end
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Télécharger 



More information about the Groonga-commit mailing list
Back to archive index