Kouhei Sutou 2019-02-28 14:43:15 +0900 (Thu, 28 Feb 2019) Revision: 6475299a1912f18d4d8280782637d49ba97965f2 https://github.com/ranguba/chupa-text/commit/6475299a1912f18d4d8280782637d49ba97965f2 Message: Add support BOM detection Modified files: lib/chupa-text/extractor.rb Modified: lib/chupa-text/extractor.rb (+56 -3) =================================================================== --- lib/chupa-text/extractor.rb 2019-02-28 12:43:23 +0900 (403e90a) +++ lib/chupa-text/extractor.rb 2019-02-28 14:43:15 +0900 (115f1f7) @@ -107,9 +107,33 @@ module ChupaText encoding = body.encoding case encoding when Encoding::UTF_8 - return data + bom_size, bom_encoding = detect_bom(body) + if bom_size + body_without_bom = body.byteslice(bom_size, + body.byteslice - bom_size) + return TextData.new(body_without_bom, source_data: data) + else + return data + end when Encoding::ASCII_8BIT return data if body.ascii_only? + else + utf8_body = body.encode(Encoding::UTF_8, + invalid: :replace, + undef: :replace, + replace: "") + return TextData.new(utf8_body, source_data: data) + end + + bom_size, bom_encoding = detect_bom(body) + if bom_encoding + body_without_bom = body.byteslice(bom_size, body.bytesize - bom_size) + utf8_body = body_without_bom.encode(Encoding::UTF_8, + bom_encoding, + invalid: :replace, + undef: :replace, + replace: "") + return TextData.new(utf8_body, source_data: data) end candidates = [ @@ -122,14 +146,43 @@ module ChupaText if body.valid_encoding? utf8_body = body.encode(Encoding::UTF_8, invalid: :replace, - undef: :replace) + undef: :replace, + replace: "") return TextData.new(utf8_body, source_data: data) end end - body.encoding = encoding + body.force_encoding(encoding) data end + UTF_8_BOM = "\xef\xbb\xbf".b + UTF_16BE_BOM = "\xfe\xff".b + UTF_16LE_BOM = "\xff\xfe".b + UTF_32BE_BOM = "\x00\x00\xfe\xff".b + UTF_32LE_BOM = "\xff\xfe\x00\x00".b + def detect_bom(text) + case text.byteslice(0, 4).b + when UTF_32BE_BOM + return 4, Encoding::UTF_32BE + when UTF_32LE_BOM + return 4, Encoding::UTF_32LE + end + + case text.byteslice(0, 3).b + when UTF_8_BOM + return 3, Encoding::UTF_8 + end + + case text.byteslice(0, 2).b + when UTF_16BE_BOM + return 2, Encoding::UTF_16BE + when UTF_16LE_BOM + return 2, Encoding::UTF_16LE + end + + nil + end + def find_decomposer(data) candidates = [] @decomposers.each do |decomposer| -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190228/28e36bba/attachment-0001.html>