utf8encoded_strings.hpp 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149
  1. // Copyright Takatoshi Kondo 2015
  2. //
  3. // Distributed under the Boost Software License, Version 1.0.
  4. // (See accompanying file LICENSE_1_0.txt or copy at
  5. // http://www.boost.org/LICENSE_1_0.txt)
  6. #if !defined(MQTT_UTF8ENCODED_STRINGS_HPP)
  7. #define MQTT_UTF8ENCODED_STRINGS_HPP
  8. #include <mqtt/namespace.hpp>
  9. #include <mqtt/string_view.hpp>
  10. namespace MQTT_NS {
  11. namespace utf8string {
  12. enum struct validation
  13. {
  14. /**
  15. * @brief UTF-8 string is well_formed.
  16. * See http://docs.oasis-open.org/mqtt/mqtt/v3.1.1/os/mqtt-v3.1.1-os.html#_Toc398718016
  17. * 1.5.3 UTF-8 encoded strings
  18. */
  19. well_formed = 0,
  20. /**
  21. * @brief UTF-8 string is ill_formed or contains null character.
  22. * See http://docs.oasis-open.org/mqtt/mqtt/v3.1.1/os/mqtt-v3.1.1-os.html#_Toc398718016
  23. * 1.5.3 UTF-8 encoded strings
  24. */
  25. ill_formed,
  26. /**
  27. * @brief UTF-8 string is well_formed and contains control character and non-character.
  28. * See http://docs.oasis-open.org/mqtt/mqtt/v3.1.1/os/mqtt-v3.1.1-os.html#_Toc398718016
  29. * 1.5.3 UTF-8 encoded strings
  30. */
  31. well_formed_with_non_charactor,
  32. };
  33. constexpr bool
  34. is_valid_length(string_view str) {
  35. return str.size() <= 0xffff;
  36. }
  37. constexpr validation
  38. validate_contents(string_view str) {
  39. // This code is based on https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c
  40. auto result = validation::well_formed;
  41. #if defined(MQTT_USE_STR_CHECK)
  42. auto it = str.begin();
  43. auto end = str.end();
  44. while (it != end) {
  45. if (static_cast<unsigned char>(*(it + 0)) < 0b1000'0000) {
  46. // 0xxxxxxxxx
  47. if (static_cast<unsigned char>(*(it + 0)) == 0x00) {
  48. result = validation::ill_formed;
  49. break;
  50. }
  51. if ((static_cast<unsigned char>(*(it + 0)) >= 0x01 &&
  52. static_cast<unsigned char>(*(it + 0)) <= 0x1f) ||
  53. static_cast<unsigned char>(*(it + 0)) == 0x7f) {
  54. result = validation::well_formed_with_non_charactor;
  55. }
  56. ++it;
  57. }
  58. else if ((static_cast<unsigned char>(*(it + 0)) & 0b1110'0000) == 0b1100'0000) {
  59. // 110XXXXx 10xxxxxx
  60. if (it + 1 >= end) {
  61. result = validation::ill_formed;
  62. break;
  63. }
  64. if ((static_cast<unsigned char>(*(it + 1)) & 0b1100'0000) != 0b1000'0000 ||
  65. (static_cast<unsigned char>(*(it + 0)) & 0b1111'1110) == 0b1100'0000) { // overlong
  66. result = validation::ill_formed;
  67. break;
  68. }
  69. if (static_cast<unsigned char>(*(it + 0)) == 0b1100'0010 &&
  70. static_cast<unsigned char>(*(it + 1)) >= 0b1000'0000 &&
  71. static_cast<unsigned char>(*(it + 1)) <= 0b1001'1111) {
  72. result = validation::well_formed_with_non_charactor;
  73. }
  74. it += 2;
  75. }
  76. else if ((static_cast<unsigned char>(*(it + 0)) & 0b1111'0000) == 0b1110'0000) {
  77. // 1110XXXX 10Xxxxxx 10xxxxxx
  78. if (it + 2 >= end) {
  79. result = validation::ill_formed;
  80. break;
  81. }
  82. if ((static_cast<unsigned char>(*(it + 1)) & 0b1100'0000) != 0b1000'0000 ||
  83. (static_cast<unsigned char>(*(it + 2)) & 0b1100'0000) != 0b1000'0000 ||
  84. (static_cast<unsigned char>(*(it + 0)) == 0b1110'0000 &&
  85. (static_cast<unsigned char>(*(it + 1)) & 0b1110'0000) == 0b1000'0000) || // overlong?
  86. (static_cast<unsigned char>(*(it + 0)) == 0b1110'1101 &&
  87. (static_cast<unsigned char>(*(it + 1)) & 0b1110'0000) == 0b1010'0000)) { // surrogate?
  88. result = validation::ill_formed;
  89. break;
  90. }
  91. if (static_cast<unsigned char>(*(it + 0)) == 0b1110'1111 &&
  92. static_cast<unsigned char>(*(it + 1)) == 0b1011'1111 &&
  93. (static_cast<unsigned char>(*(it + 2)) & 0b1111'1110) == 0b1011'1110) {
  94. // U+FFFE or U+FFFF?
  95. result = validation::well_formed_with_non_charactor;
  96. }
  97. it += 3;
  98. }
  99. else if ((static_cast<unsigned char>(*(it + 0)) & 0b1111'1000) == 0b1111'0000) {
  100. // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
  101. if (it + 3 >= end) {
  102. result = validation::ill_formed;
  103. break;
  104. }
  105. if ((static_cast<unsigned char>(*(it + 1)) & 0b1100'0000) != 0b1000'0000 ||
  106. (static_cast<unsigned char>(*(it + 2)) & 0b1100'0000) != 0b1000'0000 ||
  107. (static_cast<unsigned char>(*(it + 3)) & 0b1100'0000) != 0b1000'0000 ||
  108. (static_cast<unsigned char>(*(it + 0)) == 0b1111'0000 &&
  109. (static_cast<unsigned char>(*(it + 1)) & 0b1111'0000) == 0b1000'0000) || // overlong?
  110. (static_cast<unsigned char>(*(it + 0)) == 0b1111'0100 &&
  111. static_cast<unsigned char>(*(it + 1)) > 0b1000'1111) ||
  112. static_cast<unsigned char>(*(it + 0)) > 0b1111'0100) { // > U+10FFFF?
  113. result = validation::ill_formed;
  114. break;
  115. }
  116. if ((static_cast<unsigned char>(*(it + 1)) & 0b1100'1111) == 0b1000'1111 &&
  117. static_cast<unsigned char>(*(it + 2)) == 0b1011'1111 &&
  118. (static_cast<unsigned char>(*(it + 3)) & 0b1111'1110) == 0b1011'1110) {
  119. // U+nFFFE or U+nFFFF?
  120. result = validation::well_formed_with_non_charactor;
  121. }
  122. it += 4;
  123. }
  124. else {
  125. result = validation::ill_formed;
  126. break;
  127. }
  128. }
  129. #else // MQTT_USE_STR_CHECK
  130. static_cast<void>(str);
  131. #endif // MQTT_USE_STR_CHECK
  132. return result;
  133. }
  134. } // namespace utf8string
  135. } // namespace MQTT_NS
  136. #endif // MQTT_UTF8ENCODED_STRINGS_HPP