[Groonga-commit] groonga/grnxx at 9d38a9f [master] Update grnxx::Charset to support grnxx::Bytes.

Back to archive index

susumu.yata null+****@clear*****
Thu May 30 10:12:15 JST 2013


susumu.yata	2013-05-30 10:12:15 +0900 (Thu, 30 May 2013)

  New Revision: 9d38a9f5cc6233332483e30b8db46c54d83fff7c
  https://github.com/groonga/grnxx/commit/9d38a9f5cc6233332483e30b8db46c54d83fff7c

  Message:
    Update grnxx::Charset to support grnxx::Bytes.

  Modified files:
    lib/grnxx/charset.hpp
    lib/grnxx/charset/euc-jp.cpp
    lib/grnxx/charset/euc-jp.hpp
    lib/grnxx/charset/shift_jis.cpp
    lib/grnxx/charset/shift_jis.hpp
    lib/grnxx/charset/utf-8.cpp
    lib/grnxx/charset/utf-8.hpp
    lib/grnxx/map/scanner_impl.cpp

  Modified: lib/grnxx/charset.hpp (+12 -1)
===================================================================
--- lib/grnxx/charset.hpp    2013-05-30 00:23:47 +0900 (019b8a7)
+++ lib/grnxx/charset.hpp    2013-05-30 10:12:15 +0900 (d16e4f2)
@@ -20,9 +20,12 @@
 
 #include "grnxx/features.hpp"
 
-#include "grnxx/slice.hpp"
+#include "grnxx/bytes.hpp"
 #include "grnxx/types.hpp"
 
+// TODO: To be removed in future.
+#include "grnxx/slice.hpp"
+
 namespace grnxx {
 
 class StringBuilder;
@@ -49,6 +52,14 @@ class Charset {
   // Return the charset code.
   virtual CharsetCode code() const = 0;
 
+  // Return the first character of "bytes". This function may return an empty
+  // sequence if "bytes" is empty or an invalid sequence.
+  virtual Bytes get_char(const Bytes &bytes) const = 0;
+  // Return the size of the first character of "bytes". This function may
+  // return 0 if "bytes" is empty or an invalid sequence.
+  virtual size_t get_char_size(const Bytes &bytes) const = 0;
+
+  // TODO: To be removed in future.
   // Return the first character of "slice". This function may return an empty
   // slice if "slice" is empty or an invalid sequence.
   virtual Slice get_char(const Slice &slice) const = 0;

  Modified: lib/grnxx/charset/euc-jp.cpp (+20 -12)
===================================================================
--- lib/grnxx/charset/euc-jp.cpp    2013-05-30 00:23:47 +0900 (93c7d1a)
+++ lib/grnxx/charset/euc-jp.cpp    2013-05-30 10:12:15 +0900 (f4371a2)
@@ -29,44 +29,44 @@ CharsetCode EUC_JP::code() const {
   return CHARSET_EUC_JP;
 }
 
-Slice EUC_JP::get_char(const Slice &slice) const {
-  return slice.prefix(get_char_size(slice));
+Bytes EUC_JP::get_char(const Bytes &bytes) const {
+  return bytes.prefix(get_char_size(bytes));
 }
 
-size_t EUC_JP::get_char_size(const Slice &slice) const {
-  if (!slice) {
+size_t EUC_JP::get_char_size(const Bytes &bytes) const {
+  if (!bytes) {
     return 0;
   }
   // Reference: http://ja.wikipedia.org/wiki/EUC-JP
-  if (slice[0] & 0x80) {
+  if (bytes[0] & 0x80) {
     // A 3-byte character starts with 0x8F.
-    if (slice[0] == 0x8F) {
+    if (bytes[0] == 0x8F) {
       // Return 0 if the character is incomplete.
-      if (slice.size() < 3) {
+      if (bytes.size() < 3) {
         return 0;
       }
       // Return 0 if the 2nd byte is invalid.
       // In fact, only bytes in [A1, A8], [B0, ED], and [F3, FE] are valid.
-      if (static_cast<unsigned>(slice[1] - 0xA1) > (0xFE - 0xA1)) {
+      if (static_cast<unsigned>(bytes[1] - 0xA1) > (0xFE - 0xA1)) {
         return 0;
       }
       // Return 0 if the 3rd byte is invalid.
-      if (static_cast<unsigned>(slice[2] - 0xA1) > (0xFE - 0xA1)) {
+      if (static_cast<unsigned>(bytes[2] - 0xA1) > (0xFE - 0xA1)) {
         return 0;
       }
       return 3;
     } else {
       // Return 0 if the 1st byte is invalid.
       // In fact, only bytes in [A1, A8], [AD, AD], and [B0, FE] are valid.
-      if (static_cast<unsigned>(slice[0] - 0xA1) > (0xFE - 0xA1)) {
+      if (static_cast<unsigned>(bytes[0] - 0xA1) > (0xFE - 0xA1)) {
         return 0;
       }
       // Return 0 if the character is incomplete.
-      if (slice.size() < 2) {
+      if (bytes.size() < 2) {
         return 0;
       }
       // Return 0 if the 2nd byte is invalid.
-      if (static_cast<unsigned>(slice[1] - 0xA1) > (0xFE - 0xA1)) {
+      if (static_cast<unsigned>(bytes[1] - 0xA1) > (0xFE - 0xA1)) {
         return 0;
       }
       return 2;
@@ -76,5 +76,13 @@ size_t EUC_JP::get_char_size(const Slice &slice) const {
   return 1;
 }
 
+Slice EUC_JP::get_char(const Slice &slice) const {
+  return slice.prefix(get_char_size(slice));
+}
+
+size_t EUC_JP::get_char_size(const Slice &slice) const {
+  return get_char_size(Bytes(slice.ptr(), slice.size()));
+}
+
 }  // namespace charset
 }  // namespace grnxx

  Modified: lib/grnxx/charset/euc-jp.hpp (+7 -0)
===================================================================
--- lib/grnxx/charset/euc-jp.hpp    2013-05-30 00:23:47 +0900 (57676aa)
+++ lib/grnxx/charset/euc-jp.hpp    2013-05-30 10:12:15 +0900 (8c8fb03)
@@ -20,7 +20,10 @@
 
 #include "grnxx/features.hpp"
 
+#include "grnxx/bytes.hpp"
 #include "grnxx/charset.hpp"
+
+// TODO: To be removed in future.
 #include "grnxx/slice.hpp"
 
 namespace grnxx {
@@ -33,6 +36,10 @@ class EUC_JP : public Charset {
 
   CharsetCode code() const;
 
+  Bytes get_char(const Bytes &bytes) const;
+  size_t get_char_size(const Bytes &bytes) const;
+
+  // TODO: To be removed in future.
   Slice get_char(const Slice &slice) const;
   size_t get_char_size(const Slice &slice) const;
 };

  Modified: lib/grnxx/charset/shift_jis.cpp (+15 -7)
===================================================================
--- lib/grnxx/charset/shift_jis.cpp    2013-05-30 00:23:47 +0900 (5501434)
+++ lib/grnxx/charset/shift_jis.cpp    2013-05-30 10:12:15 +0900 (bd7a3a0)
@@ -29,23 +29,23 @@ CharsetCode Shift_JIS::code() const {
   return CHARSET_SHIFT_JIS;
 }
 
-Slice Shift_JIS::get_char(const Slice &slice) const {
-  return slice.prefix(get_char_size(slice));
+Bytes Shift_JIS::get_char(const Bytes &bytes) const {
+  return bytes.prefix(get_char_size(bytes));
 }
 
-size_t Shift_JIS::get_char_size(const Slice &slice) const {
-  if (!slice) {
+size_t Shift_JIS::get_char_size(const Bytes &bytes) const {
+  if (!bytes) {
     return 0;
   }
   // The 1st byte of a multibyte character is in [81, 9F] or [E0, FC].
   // Reference: http://www.st.rim.or.jp/~phinloda/cqa/cqa15.html#Q4
-  if (static_cast<unsigned>((slice[0] ^ 0x20) - 0xA1) < 0x3C) {
+  if (static_cast<unsigned>((bytes[0] ^ 0x20) - 0xA1) < 0x3C) {
     // Return 0 if the character is incomplete.
-    if (slice.size() < 2) {
+    if (bytes.size() < 2) {
       return 0;
     }
     // Return 0 if the 2nd byte is invalid.
-    if (static_cast<unsigned>(slice[1] - 0x40) > (0xFC - 0x40)) {
+    if (static_cast<unsigned>(bytes[1] - 0x40) > (0xFC - 0x40)) {
       return 0;
     }
     return 2;
@@ -54,5 +54,13 @@ size_t Shift_JIS::get_char_size(const Slice &slice) const {
   return 1;
 }
 
+Slice Shift_JIS::get_char(const Slice &slice) const {
+  return slice.prefix(get_char_size(slice));
+}
+
+size_t Shift_JIS::get_char_size(const Slice &slice) const {
+  return get_char_size(Bytes(slice.ptr(), slice.size()));
+}
+
 }  // namespace charset
 }  // namespace grnxx

  Modified: lib/grnxx/charset/shift_jis.hpp (+7 -0)
===================================================================
--- lib/grnxx/charset/shift_jis.hpp    2013-05-30 00:23:47 +0900 (c7f38a4)
+++ lib/grnxx/charset/shift_jis.hpp    2013-05-30 10:12:15 +0900 (b3e2051)
@@ -20,7 +20,10 @@
 
 #include "grnxx/features.hpp"
 
+#include "grnxx/bytes.hpp"
 #include "grnxx/charset.hpp"
+
+// TODO: To be removed in future.
 #include "grnxx/slice.hpp"
 
 namespace grnxx {
@@ -33,6 +36,10 @@ class Shift_JIS : public Charset {
 
   CharsetCode code() const;
 
+  Bytes get_char(const Bytes &bytes) const;
+  size_t get_char_size(const Bytes &bytes) const;
+
+  // TODO: To be removed in future.
   Slice get_char(const Slice &slice) const;
   size_t get_char_size(const Slice &slice) const;
 };

  Modified: lib/grnxx/charset/utf-8.cpp (+18 -10)
===================================================================
--- lib/grnxx/charset/utf-8.cpp    2013-05-30 00:23:47 +0900 (5ef4a83)
+++ lib/grnxx/charset/utf-8.cpp    2013-05-30 10:12:15 +0900 (c6e0c13)
@@ -31,40 +31,40 @@ CharsetCode UTF_8::code() const {
   return CHARSET_UTF_8;
 }
 
-Slice UTF_8::get_char(const Slice &slice) const {
-  return slice.prefix(get_char_size(slice));
+Bytes UTF_8::get_char(const Bytes &bytes) const {
+  return bytes.prefix(get_char_size(bytes));
 }
 
-size_t UTF_8::get_char_size(const Slice &slice) const {
-  if (!slice) {
+size_t UTF_8::get_char_size(const Bytes &bytes) const {
+  if (!bytes) {
     return 0;
   }
-  if (slice[0] & 0x80) {
+  if (bytes[0] & 0x80) {
     // A multibyte character can be 2, 3, or 4 bytes long. Also, the 2nd,
     // 3rd, and 4th byte must be 10xxxxxx, the most significant 2 bits must
     // be 10.
     const size_t char_size =
-        31 - bit_scan_reverse(~(static_cast<uint32_t>(slice[0]) << 24));
+        31 - bit_scan_reverse(~(static_cast<uint32_t>(bytes[0]) << 24));
     // Return 0 if the character is incomplete.
-    if (char_size > slice.size()) {
+    if (char_size > bytes.size()) {
       return 0;
     }
     switch (char_size) {
       case 4: {
         // Return 0 if the 4th byte is invalid.
-        if ((slice[3] & 0xC0) != 0x80) {
+        if ((bytes[3] & 0xC0) != 0x80) {
           return 0;
         }
       }
       case 3: {
         // Return 0 if the 3rd byte is invalid.
-        if ((slice[2] & 0xC0) != 0x80) {
+        if ((bytes[2] & 0xC0) != 0x80) {
           return 0;
         }
       }
       case 2: {
         // Return 0 if the 2nd byte is invalid.
-        if ((slice[1] & 0xC0) != 0x80) {
+        if ((bytes[1] & 0xC0) != 0x80) {
           return 0;
         }
         return char_size;
@@ -79,5 +79,13 @@ size_t UTF_8::get_char_size(const Slice &slice) const {
   return 1;
 }
 
+Slice UTF_8::get_char(const Slice &slice) const {
+  return slice.prefix(get_char_size(slice));
+}
+
+size_t UTF_8::get_char_size(const Slice &slice) const {
+  return get_char_size(Bytes(slice.ptr(), slice.size()));
+}
+
 }  // namespace charset
 }  // namespace grnxx

  Modified: lib/grnxx/charset/utf-8.hpp (+7 -0)
===================================================================
--- lib/grnxx/charset/utf-8.hpp    2013-05-30 00:23:47 +0900 (3009165)
+++ lib/grnxx/charset/utf-8.hpp    2013-05-30 10:12:15 +0900 (27a1c21)
@@ -20,7 +20,10 @@
 
 #include "grnxx/features.hpp"
 
+#include "grnxx/bytes.hpp"
 #include "grnxx/charset.hpp"
+
+// TODO: To be removed in future.
 #include "grnxx/slice.hpp"
 
 namespace grnxx {
@@ -33,6 +36,10 @@ class UTF_8 : public Charset {
 
   CharsetCode code() const;
 
+  Bytes get_char(const Bytes &bytes) const;
+  size_t get_char_size(const Bytes &bytes) const;
+
+  // TODO: To be removed in future.
   Slice get_char(const Slice &slice) const;
   size_t get_char_size(const Slice &slice) const;
 };

  Modified: lib/grnxx/map/scanner_impl.cpp (+1 -5)
===================================================================
--- lib/grnxx/map/scanner_impl.cpp    2013-05-30 00:23:47 +0900 (c12d1f3)
+++ lib/grnxx/map/scanner_impl.cpp    2013-05-30 10:12:15 +0900 (26714f5)
@@ -25,9 +25,6 @@
 #include "grnxx/logger.hpp"
 #include "grnxx/map.hpp"
 
-// TODO: To be removed in future.
-#include "grnxx/slice.hpp"
-
 namespace grnxx {
 namespace map {
 
@@ -62,8 +59,7 @@ bool ScannerImpl<T>::next() {
     }
     // Move to the next character.
     if (charset_) {
-      // TODO: Charset should support Bytes.
-      this->offset_ += charset_->get_char_size(Slice(rest.ptr(), rest.size()));
+      this->offset_ += charset_->get_char_size(rest);
     } else {
       ++this->offset_;
     }
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Télécharger 



More information about the Groonga-commit mailing list
Back to archive index