susumu.yata
null+****@clear*****
Thu May 30 10:12:15 JST 2013
susumu.yata 2013-05-30 10:12:15 +0900 (Thu, 30 May 2013) New Revision: 9d38a9f5cc6233332483e30b8db46c54d83fff7c https://github.com/groonga/grnxx/commit/9d38a9f5cc6233332483e30b8db46c54d83fff7c Message: Update grnxx::Charset to support grnxx::Bytes. Modified files: lib/grnxx/charset.hpp lib/grnxx/charset/euc-jp.cpp lib/grnxx/charset/euc-jp.hpp lib/grnxx/charset/shift_jis.cpp lib/grnxx/charset/shift_jis.hpp lib/grnxx/charset/utf-8.cpp lib/grnxx/charset/utf-8.hpp lib/grnxx/map/scanner_impl.cpp Modified: lib/grnxx/charset.hpp (+12 -1) =================================================================== --- lib/grnxx/charset.hpp 2013-05-30 00:23:47 +0900 (019b8a7) +++ lib/grnxx/charset.hpp 2013-05-30 10:12:15 +0900 (d16e4f2) @@ -20,9 +20,12 @@ #include "grnxx/features.hpp" -#include "grnxx/slice.hpp" +#include "grnxx/bytes.hpp" #include "grnxx/types.hpp" +// TODO: To be removed in future. +#include "grnxx/slice.hpp" + namespace grnxx { class StringBuilder; @@ -49,6 +52,14 @@ class Charset { // Return the charset code. virtual CharsetCode code() const = 0; + // Return the first character of "bytes". This function may return an empty + // sequence if "bytes" is empty or an invalid sequence. + virtual Bytes get_char(const Bytes &bytes) const = 0; + // Return the size of the first character of "bytes". This function may + // return 0 if "bytes" is empty or an invalid sequence. + virtual size_t get_char_size(const Bytes &bytes) const = 0; + + // TODO: To be removed in future. // Return the first character of "slice". This function may return an empty // slice if "slice" is empty or an invalid sequence. 
virtual Slice get_char(const Slice &slice) const = 0; Modified: lib/grnxx/charset/euc-jp.cpp (+20 -12) =================================================================== --- lib/grnxx/charset/euc-jp.cpp 2013-05-30 00:23:47 +0900 (93c7d1a) +++ lib/grnxx/charset/euc-jp.cpp 2013-05-30 10:12:15 +0900 (f4371a2) @@ -29,44 +29,44 @@ CharsetCode EUC_JP::code() const { return CHARSET_EUC_JP; } -Slice EUC_JP::get_char(const Slice &slice) const { - return slice.prefix(get_char_size(slice)); +Bytes EUC_JP::get_char(const Bytes &bytes) const { + return bytes.prefix(get_char_size(bytes)); } -size_t EUC_JP::get_char_size(const Slice &slice) const { - if (!slice) { +size_t EUC_JP::get_char_size(const Bytes &bytes) const { + if (!bytes) { return 0; } // Reference: http://ja.wikipedia.org/wiki/EUC-JP - if (slice[0] & 0x80) { + if (bytes[0] & 0x80) { // A 3-byte character starts with 0x8F. - if (slice[0] == 0x8F) { + if (bytes[0] == 0x8F) { // Return 0 if the character is incomplete. - if (slice.size() < 3) { + if (bytes.size() < 3) { return 0; } // Return 0 if the 2nd byte is invalid. // In fact, only bytes in [A1, A8], [B0, ED], and [F3, FE] are valid. - if (static_cast<unsigned>(slice[1] - 0xA1) > (0xFE - 0xA1)) { + if (static_cast<unsigned>(bytes[1] - 0xA1) > (0xFE - 0xA1)) { return 0; } // Return 0 if the 3rd byte is invalid. - if (static_cast<unsigned>(slice[2] - 0xA1) > (0xFE - 0xA1)) { + if (static_cast<unsigned>(bytes[2] - 0xA1) > (0xFE - 0xA1)) { return 0; } return 3; } else { // Return 0 if the 1st byte is invalid. // In fact, only bytes in [A1, A8], [AD, AD], and [B0, FE] are valid. - if (static_cast<unsigned>(slice[0] - 0xA1) > (0xFE - 0xA1)) { + if (static_cast<unsigned>(bytes[0] - 0xA1) > (0xFE - 0xA1)) { return 0; } // Return 0 if the character is incomplete. - if (slice.size() < 2) { + if (bytes.size() < 2) { return 0; } // Return 0 if the 2nd byte is invalid. 
- if (static_cast<unsigned>(slice[1] - 0xA1) > (0xFE - 0xA1)) { + if (static_cast<unsigned>(bytes[1] - 0xA1) > (0xFE - 0xA1)) { return 0; } return 2; @@ -76,5 +76,13 @@ size_t EUC_JP::get_char_size(const Slice &slice) const { return 1; } +Slice EUC_JP::get_char(const Slice &slice) const { + return slice.prefix(get_char_size(slice)); +} + +size_t EUC_JP::get_char_size(const Slice &slice) const { + return get_char_size(Bytes(slice.ptr(), slice.size())); +} + } // namespace charset } // namespace grnxx Modified: lib/grnxx/charset/euc-jp.hpp (+7 -0) =================================================================== --- lib/grnxx/charset/euc-jp.hpp 2013-05-30 00:23:47 +0900 (57676aa) +++ lib/grnxx/charset/euc-jp.hpp 2013-05-30 10:12:15 +0900 (8c8fb03) @@ -20,7 +20,10 @@ #include "grnxx/features.hpp" +#include "grnxx/bytes.hpp" #include "grnxx/charset.hpp" + +// TODO: To be removed in future. #include "grnxx/slice.hpp" namespace grnxx { @@ -33,6 +36,10 @@ class EUC_JP : public Charset { CharsetCode code() const; + Bytes get_char(const Bytes &bytes) const; + size_t get_char_size(const Bytes &bytes) const; + + // TODO: To be removed in future. 
Slice get_char(const Slice &slice) const; size_t get_char_size(const Slice &slice) const; }; Modified: lib/grnxx/charset/shift_jis.cpp (+15 -7) =================================================================== --- lib/grnxx/charset/shift_jis.cpp 2013-05-30 00:23:47 +0900 (5501434) +++ lib/grnxx/charset/shift_jis.cpp 2013-05-30 10:12:15 +0900 (bd7a3a0) @@ -29,23 +29,23 @@ CharsetCode Shift_JIS::code() const { return CHARSET_SHIFT_JIS; } -Slice Shift_JIS::get_char(const Slice &slice) const { - return slice.prefix(get_char_size(slice)); +Bytes Shift_JIS::get_char(const Bytes &bytes) const { + return bytes.prefix(get_char_size(bytes)); } -size_t Shift_JIS::get_char_size(const Slice &slice) const { - if (!slice) { +size_t Shift_JIS::get_char_size(const Bytes &bytes) const { + if (!bytes) { return 0; } // The 1st byte of a multibyte character is in [81, 9F] or [E0, FC]. // Reference: http://www.st.rim.or.jp/~phinloda/cqa/cqa15.html#Q4 - if (static_cast<unsigned>((slice[0] ^ 0x20) - 0xA1) < 0x3C) { + if (static_cast<unsigned>((bytes[0] ^ 0x20) - 0xA1) < 0x3C) { // Return 0 if the character is incomplete. - if (slice.size() < 2) { + if (bytes.size() < 2) { return 0; } // Return 0 if the 2nd byte is invalid. 
- if (static_cast<unsigned>(slice[1] - 0x40) > (0xFC - 0x40)) { + if (static_cast<unsigned>(bytes[1] - 0x40) > (0xFC - 0x40)) { return 0; } return 2; @@ -54,5 +54,13 @@ size_t Shift_JIS::get_char_size(const Slice &slice) const { return 1; } +Slice Shift_JIS::get_char(const Slice &slice) const { + return slice.prefix(get_char_size(slice)); +} + +size_t Shift_JIS::get_char_size(const Slice &slice) const { + return get_char_size(Bytes(slice.ptr(), slice.size())); +} + } // namespace charset } // namespace grnxx Modified: lib/grnxx/charset/shift_jis.hpp (+7 -0) =================================================================== --- lib/grnxx/charset/shift_jis.hpp 2013-05-30 00:23:47 +0900 (c7f38a4) +++ lib/grnxx/charset/shift_jis.hpp 2013-05-30 10:12:15 +0900 (b3e2051) @@ -20,7 +20,10 @@ #include "grnxx/features.hpp" +#include "grnxx/bytes.hpp" #include "grnxx/charset.hpp" + +// TODO: To be removed in future. #include "grnxx/slice.hpp" namespace grnxx { @@ -33,6 +36,10 @@ class Shift_JIS : public Charset { CharsetCode code() const; + Bytes get_char(const Bytes &bytes) const; + size_t get_char_size(const Bytes &bytes) const; + + // TODO: To be removed in future. 
Slice get_char(const Slice &slice) const; size_t get_char_size(const Slice &slice) const; }; Modified: lib/grnxx/charset/utf-8.cpp (+18 -10) =================================================================== --- lib/grnxx/charset/utf-8.cpp 2013-05-30 00:23:47 +0900 (5ef4a83) +++ lib/grnxx/charset/utf-8.cpp 2013-05-30 10:12:15 +0900 (c6e0c13) @@ -31,40 +31,40 @@ CharsetCode UTF_8::code() const { return CHARSET_UTF_8; } -Slice UTF_8::get_char(const Slice &slice) const { - return slice.prefix(get_char_size(slice)); +Bytes UTF_8::get_char(const Bytes &bytes) const { + return bytes.prefix(get_char_size(bytes)); } -size_t UTF_8::get_char_size(const Slice &slice) const { - if (!slice) { +size_t UTF_8::get_char_size(const Bytes &bytes) const { + if (!bytes) { return 0; } - if (slice[0] & 0x80) { + if (bytes[0] & 0x80) { // A multibyte character can be 2, 3, or 4 bytes long. Also, the 2nd, // 3rd, and 4th byte must be 10xxxxxx, the most significant 2 bits must // be 10. const size_t char_size = - 31 - bit_scan_reverse(~(static_cast<uint32_t>(slice[0]) << 24)); + 31 - bit_scan_reverse(~(static_cast<uint32_t>(bytes[0]) << 24)); // Return 0 if the character is incomplete. - if (char_size > slice.size()) { + if (char_size > bytes.size()) { return 0; } switch (char_size) { case 4: { // Return 0 if the 4th byte is invalid. - if ((slice[3] & 0xC0) != 0x80) { + if ((bytes[3] & 0xC0) != 0x80) { return 0; } } case 3: { // Return 0 if the 3rd byte is invalid. - if ((slice[2] & 0xC0) != 0x80) { + if ((bytes[2] & 0xC0) != 0x80) { return 0; } } case 2: { // Return 0 if the 2nd byte is invalid. 
- if ((slice[1] & 0xC0) != 0x80) { + if ((bytes[1] & 0xC0) != 0x80) { return 0; } return char_size; @@ -79,5 +79,13 @@ size_t UTF_8::get_char_size(const Slice &slice) const { return 1; } +Slice UTF_8::get_char(const Slice &slice) const { + return slice.prefix(get_char_size(slice)); +} + +size_t UTF_8::get_char_size(const Slice &slice) const { + return get_char_size(Bytes(slice.ptr(), slice.size())); +} + } // namespace charset } // namespace grnxx Modified: lib/grnxx/charset/utf-8.hpp (+7 -0) =================================================================== --- lib/grnxx/charset/utf-8.hpp 2013-05-30 00:23:47 +0900 (3009165) +++ lib/grnxx/charset/utf-8.hpp 2013-05-30 10:12:15 +0900 (27a1c21) @@ -20,7 +20,10 @@ #include "grnxx/features.hpp" +#include "grnxx/bytes.hpp" #include "grnxx/charset.hpp" + +// TODO: To be removed in future. #include "grnxx/slice.hpp" namespace grnxx { @@ -33,6 +36,10 @@ class UTF_8 : public Charset { CharsetCode code() const; + Bytes get_char(const Bytes &bytes) const; + size_t get_char_size(const Bytes &bytes) const; + + // TODO: To be removed in future. Slice get_char(const Slice &slice) const; size_t get_char_size(const Slice &slice) const; }; Modified: lib/grnxx/map/scanner_impl.cpp (+1 -5) =================================================================== --- lib/grnxx/map/scanner_impl.cpp 2013-05-30 00:23:47 +0900 (c12d1f3) +++ lib/grnxx/map/scanner_impl.cpp 2013-05-30 10:12:15 +0900 (26714f5) @@ -25,9 +25,6 @@ #include "grnxx/logger.hpp" #include "grnxx/map.hpp" -// TODO: To be removed in future. -#include "grnxx/slice.hpp" - namespace grnxx { namespace map { @@ -62,8 +59,7 @@ bool ScannerImpl<T>::next() { } // Move to the next character. if (charset_) { - // TODO: Charset should support Bytes. - this->offset_ += charset_->get_char_size(Slice(rest.ptr(), rest.size())); + this->offset_ += charset_->get_char_size(rest); } else { ++this->offset_; } -------------- next part -------------- HTMLの添付ファイルを保管しました... Télécharger