Kouhei Sutou
null+****@clear*****
Sat Feb 25 00:53:13 JST 2017
Kouhei Sutou 2017-02-25 00:53:13 +0900 (Sat, 25 Feb 2017) New Revision: 28f69e1a41d79fb4796efd9191760af57743bd57 https://github.com/ranguba/chupa-text-decomposer-html/commit/28f69e1a41d79fb4796efd9191760af57743bd57 Message: Support not ASCII compatible case Modified files: lib/chupa-text/decomposers/html.rb test/test-html.rb Modified: lib/chupa-text/decomposers/html.rb (+4 -0) =================================================================== --- lib/chupa-text/decomposers/html.rb 2017-02-19 00:16:39 +0900 (ef3e96e) +++ lib/chupa-text/decomposers/html.rb 2017-02-25 00:53:13 +0900 (2f6e3e8) @@ -53,6 +53,10 @@ module ChupaText private def guess_encoding(text) + unless text.encoding.ascii_compatible? + return text.encoding.name + end + case text when /\A<\?xml.+?encoding=(['"])([a-zA-Z0-9_-]+)\1/ $2 Modified: test/test-html.rb (+27 -1) =================================================================== --- test/test-html.rb 2017-02-19 00:16:39 +0900 (8af4e77) +++ test/test-html.rb 2017-02-25 00:53:13 +0900 (4713b97) @@ -1,4 +1,4 @@ -# Copyright (C) 2013-2014 Kouhei Sutou <kou ＠ clear-code.com> +# Copyright (C) 2013-2017 Kouhei Sutou <kou ＠ clear-code.com> # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public @@ -181,6 +181,32 @@ class TestHTML < Test::Unit::TestCase HTML5 assert_equal([Encoding::EUC_JP], decompose(@data)) end + + sub_test_case("not ascii_compatible?") do + def test_iso_2022_jp + @data.body = <<-ISO_2022_JP_HTML.encode("ISO-2022-JP") +<html> + <head> + <title>タイトル</title> + </head> + <body>Hello</body> +</html> + ISO_2022_JP_HTML + assert_equal([Encoding::ISO_2022_JP], decompose(@data)) + end + + def test_utf32 + @data.body = <<-UTF_32_HTML.encode("UTF-32") +<html> + <head> + <title>タイトル</title> + </head> + <body>Hello</body> +</html> + UTF_32_HTML + assert_equal([Encoding::UTF_32], decompose(@data)) + end + end end sub_test_case("normalize") do -------------- next part 
-------------- HTMLの添付ファイルを保管しました...Télécharger