# Origin: chupa-text, lib/chupa-text/extractor.rb
#   Author:   Kouhei Sutou
#   Date:     2019-02-28 12:43:23 +0900 (Thu, 28 Feb 2019)
#   Revision: 64482eab0f7143a98f074c8a200b07ba28746016
#   URL:      https://github.com/ranguba/chupa-text/commit/64482eab0f7143a98f074c8a200b07ba28746016
#   Message:  Add support for plain text encoding conversion
#   Modified files: lib/chupa-text/extractor.rb, test/test-extractor.rb
#
# In that revision the extractor passes data through this method before
# yielding it in two cases where no decomposer was found: when the target is
# text/plain, and when the target is any other text.
#
# The accompanying tests in test/test-extractor.rb extract a text/plain body
# of "こんにちは" supplied as UTF-8, CP932 and EUC-JP, and expect "こんにちは"
# in every case.
#
# NOTE: this method differs from the cited revision in three ways:
#   * a nil body returns `data` (the revision referenced an undefined `dat`);
#   * the nonexistent `String#encoding=` call is gone;
#   * encoding guesses are made on a copy, so the caller's body string is
#     never re-tagged and may be frozen.

# Returns data whose body is UTF-8 text, converting it when necessary.
#
# The body's declared encoding is only trusted when it is UTF-8, or when it
# is ASCII-8BIT and the bytes are pure ASCII. Otherwise the bytes are
# reinterpreted as each candidate encoding in turn (UTF-8, EUC-JP,
# Windows-31J) and the first one under which they are valid is used to
# transcode to UTF-8.
#
# @param data [#body] data object whose +body+ is a String or nil.
# @return [Object] +data+ itself when no conversion is needed or possible;
#   otherwise a new +TextData+ wrapping the UTF-8 body, with +data+ passed
#   as +source_data+.
def ensure_utf8_body_data(data)
  body = data.body
  return data if body.nil?

  case body.encoding
  when Encoding::UTF_8
    return data
  when Encoding::ASCII_8BIT
    return data if body.ascii_only?
  end

  # Order matters: the first candidate under which the bytes are valid wins.
  candidates = [
    Encoding::UTF_8,
    Encoding::EUC_JP,
    Encoding::Windows_31J,
  ]
  candidates.each do |candidate|
    # Guess on a copy so the caller's string keeps its original encoding tag
    # and a frozen body does not raise.
    guess = body.dup.force_encoding(candidate)
    next unless guess.valid_encoding?

    utf8_body = guess.encode(Encoding::UTF_8,
                             invalid: :replace,
                             undef: :replace)
    return TextData.new(utf8_body, source_data: data)
  end

  # No candidate matched: hand the data back untouched.
  data
end