51 const unsigned char ch =
static_cast<unsigned char>(*str);
52 if ((ch & 0xF0) == 0xE0) {
54 }
else if ((ch & 0x80) == 0x00) {
56 }
else if ((ch & 0xE0) == 0xC0) {
58 }
else if ((ch & 0xF8) == 0xF0) {
60 }
else if ((ch & 0xFC) == 0xF8) {
62 }
else if ((ch & 0xFE) == 0xFC) {
83 const char* candidate = str - 1;
85 while (distance < 6) {
86 const unsigned char ch =
static_cast<unsigned char>(*candidate);
87 if ((ch & 0xC0) != 0x80) {
95 if (length == distance) {
// Returns the arity associated with `codePoint` as an ideographic description
// operator. Callers in this file special-case a result of 0 and otherwise use
// the result as the number of operand sequences to parse after the operator.
115 static size_t IdeographicDescriptionOperatorArity(uint32_t codePoint) {
// Examines the text at `str` for an ideographic description sequence (IDS).
// The first character is measured and decoded, a zero or over-long character
// length and a first code point with operator arity 0 are special-cased, and
// otherwise the recursive parser is run and its result compared against
// IDSParseStatus::Complete.
// NOTE(review): presumably returns the byte length of a complete sequence and
// 0 otherwise — confirm against the return statements.
141 static size_t NextIdeographicDescriptionSequenceLength(
const char* str,
// Limits handed to the parser as its depth budget and code-point budget.
143 const size_t kMaxIDSDepth = 16;
144 const size_t kMaxIDSCodePoints = 64;
148 const size_t charLen = NextCharLengthNoException(str);
// A length of 0, or a character longer than the remaining input, is rejected
// before any decoding.
149 if (charLen == 0 || charLen > len) {
152 const uint32_t codePoint = CodePointNoException(str, charLen);
// Arity 0: the first code point is handled separately from the parse below.
153 if (IdeographicDescriptionOperatorArity(codePoint) == 0) {
// Running count shared with the parser through a pointer; starts at zero.
158 size_t codePoints = 0;
159 if (ConsumeIdeographicDescriptionSequence(
160 str, len, kMaxIDSDepth, kMaxIDSCodePoints, &consumed,
161 &codePoints) == IDSParseStatus::Complete) {
// Reports whether the text at `str` parses as an unfinished ideographic
// description sequence: the final result is true exactly when the recursive
// parser returns IDSParseStatus::Incomplete. A zero or over-long first
// character and a first code point with operator arity 0 are special-cased
// before the parser is invoked.
167 static bool IsIncompleteIdeographicDescriptionSequencePrefix(
const char* str,
// Limits handed to the parser as its depth budget and code-point budget.
169 const size_t kMaxIDSDepth = 16;
170 const size_t kMaxIDSCodePoints = 64;
174 const size_t charLen = NextCharLengthNoException(str);
175 if (charLen == 0 || charLen > len) {
178 const uint32_t codePoint = CodePointNoException(str, charLen);
179 if (IdeographicDescriptionOperatorArity(codePoint) == 0) {
// Running count shared with the parser through a pointer; starts at zero.
184 size_t codePoints = 0;
185 return ConsumeIdeographicDescriptionSequence(
186 str, len, kMaxIDSDepth, kMaxIDSCodePoints, &consumed,
187 &codePoints) == IDSParseStatus::Incomplete;
// Returns true when `codePoint` lies in one of two inclusive ranges:
// U+FE00..U+FE0F or U+E0100..U+E01EF (the Unicode Variation Selectors and
// Variation Selectors Supplement blocks).
190 static bool IsVariationSelector(uint32_t codePoint) {
191 return (codePoint >= 0xFE00 && codePoint <= 0xFE0F) ||
192 (codePoint >= 0xE0100 && codePoint <= 0xE01EF);
// Walks the `len` bytes starting at `str` character by character and tests
// each decoded code point with IsVariationSelector.
// NOTE(review): presumably returns true on the first match and false
// otherwise — confirm against the return statements.
195 static bool ContainsVariationSelector(
const char* str,
size_t len) {
196 const char* pStr = str;
// One-past-the-end pointer; the scan is bounded by length, not by a
// terminating NUL.
197 const char* strEnd = str + len;
198 while (pStr < strEnd) {
199 const size_t remainingLength = strEnd - pStr;
200 const size_t charLen = NextCharLengthNoException(pStr);
// The character claims more bytes than remain in the buffer.
205 if (charLen > remainingLength) {
208 if (IsVariationSelector(CodePointNoException(pStr, charLen))) {
224 while (*str !=
'\0') {
233 while (i < charLen && str[i] !=
'\0') {
262 return ch ==
'\0' || ch ==
'\n' || ch ==
'\r';
// Copies exactly `length` bytes starting at `str` into `newStr`: the string
// is first sized to `length`, then filled with memcpy. The copy is driven by
// `length` alone, so `str` does not need to be NUL-terminated at that point.
268 static std::string
FromSubstr(
const char* str,
size_t length) {
270 newStr.resize(length);
// Writing through data() needs the non-const overload (C++17 and later).
271 memcpy(newStr.data(), str, length);
280 while (byteLength > 0) {
// Truncates the UTF-8 text at `str` against a byte budget of `maxByteLength`.
// The budget test is applied per character (running length plus the next
// character's length), so the decision is made on whole characters.
// NOTE(review): presumably the scan stops at the first character that would
// exceed the budget and the accepted prefix is returned — confirm.
294 static std::string
TruncateUTF8(
const char* str,
size_t maxByteLength) {
295 std::string wordTrunc;
298 const char* pStr = str;
// Would adding the next character push the running length past the budget?
301 if (len + charLength > maxByteLength) {
// Replaces occurrences of `from` inside `str` with `to`, modifying `str` in
// place. Each iteration searches from `pos` and rewrites the match found
// there; the loop ends when find() reports npos.
// NOTE(review): `toLen` is presumably used to advance `pos` past the inserted
// text so replacements are not rescanned — confirm.
// NOTE(review): an empty `from` matches at every search position
// (std::string::find semantics); confirm callers never pass one.
317 static void ReplaceAll(std::string& str,
const char* from,
const char* to) {
318 std::string::size_type pos = 0;
// Lengths are measured once up front with strlen, so both arguments must be
// NUL-terminated.
319 std::string::size_type fromLen = strlen(from);
320 std::string::size_type toLen = strlen(to);
321 while ((pos = str.find(from, pos)) != std::string::npos) {
322 str.replace(pos, fromLen, to);
// Joins the elements of `strings`, in order, through an ostringstream.
// NOTE(review): presumably `separator` is written between consecutive
// elements (not before the first or after the last) — confirm.
330 static std::string
Join(
const std::vector<std::string>& strings,
331 const std::string& separator) {
332 std::ostringstream buffer;
// Elements are visited by const reference; nothing is copied per iteration.
334 for (
const auto& str : strings) {
// Separator-less variant of Join: visits the elements of `strings` in order
// with an ostringstream as the accumulator.
// NOTE(review): presumably each element is appended directly and the
// concatenation is returned — confirm.
347 static std::string
Join(
const std::vector<std::string>& strings) {
348 std::ostringstream buffer;
349 for (
const auto& str : strings) {
// Fills `byteMap` so that entry i holds the byte offset, relative to `str`,
// at which the i-th character begins, for i in [0, utf8Length).
// `utf8Length` is a count of characters, not bytes: the cursor advances one
// character per iteration via NextChar.
355 static void GetByteMap(
const char* str,
const size_t utf8Length,
356 std::vector<size_t>* byteMap) {
// Grow the output only when it is too small; a larger vector keeps its size.
357 if (byteMap->size() < utf8Length) {
358 byteMap->resize(utf8Length);
360 const char* pstr = str;
361 for (
size_t i = 0; i < utf8Length; i++) {
// Record where this character starts before stepping over it.
362 (*byteMap)[i] = pstr - str;
363 pstr = NextChar(pstr);
// Variant of GetPlatformString that yields a std::wstring for a std::string
// input.
// NOTE(review): this and the std::string-returning GetPlatformString differ
// only in return type, so they cannot coexist as overloads; presumably a
// platform preprocessor guard selects one — confirm.
368 static std::wstring GetPlatformString(
const std::string& str) {
// Identity variant of GetPlatformString: returns a copy of `str` unchanged.
372 static std::string GetPlatformString(
const std::string& str) {
return str; }
// Converts the wide string `wstr` to UTF-8 (CP_UTF8) with the Win32
// WideCharToMultiByte API, using the two-call pattern: a sizing call followed
// by the call that writes the output.
376 static std::string U16ToU8(
const std::wstring& wstr) {
// The API takes an int count; the length is narrowed with static_cast.
// NOTE(review): inputs longer than INT_MAX units would not survive this cast.
378 int length =
static_cast<int>(wstr.length());
// Sizing call: a NULL output buffer with size 0 makes the API return the
// number of bytes required.
379 int convcnt = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), length, NULL, 0,
// Converting call: writes `convcnt` bytes into the storage of `ret`.
383 WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), length, &ret[0], convcnt,
// Converts the UTF-8 (CP_UTF8) string `str` to a wide string with the Win32
// MultiByteToWideChar API, using the two-call pattern: a sizing call followed
// by the call that writes the output.
389 static std::wstring U8ToU16(
const std::string& str) {
// The API takes an int count; the length is narrowed with static_cast.
// NOTE(review): inputs longer than INT_MAX bytes would not survive this cast.
391 int length =
static_cast<int>(str.length());
// Sizing call: a NULL output buffer with size 0 makes the API return the
// number of wide characters required.
392 int convcnt = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), length, NULL, 0);
// Converting call: writes `convcnt` wide characters into the storage of `ret`.
395 MultiByteToWideChar(CP_UTF8, 0, str.c_str(), length, &ret[0], convcnt);
// Result of parsing an ideographic description sequence. Code in this file
// uses the values Complete, Incomplete and Invalid.
402 enum class IDSParseStatus {
// Decodes the `charLen`-byte UTF-8 character at `str` into a code point:
// payload bits are taken from the lead byte, then six more bits are appended
// from each following byte.
// NOTE(review): the mask below yields 0x3F for charLen == 1, which would drop
// bit 6 of a single-byte character; presumably that case is handled before
// the mask is applied — confirm.
408 static uint32_t CodePointNoException(
const char* str,
size_t charLen) {
409 const unsigned char first =
static_cast<unsigned char>(str[0]);
// Keep the low (7 - charLen) bits of the lead byte: 0x1F for 2 bytes, 0x0F
// for 3 bytes, 0x07 for 4 bytes.
414 uint32_t codePoint = first & ((1U << (7 - charLen)) - 1);
415 for (
size_t i = 1; i < charLen; i++) {
// Shift in the low six bits (mask 0x3F) of each following byte.
416 codePoint = (codePoint << 6) |
417 (
static_cast<unsigned char>(str[i]) & 0x3FU);
// Recursive parser for one ideographic description sequence at `str`.
// It measures and decodes the leading character, looks up its operator arity,
// and then parses that many operand sequences by calling itself, each with
// one less level of `depthLeft`. `codePoints` is checked against
// `maxCodePoints` and passed unchanged to the recursive calls, so the budget
// is shared across the whole parse.
// Returns Complete, Incomplete or Invalid; an operand status other than
// Complete is returned to the caller as-is.
// NOTE(review): `consumed` presumably receives the number of bytes parsed on
// success — confirm.
422 static IDSParseStatus ConsumeIdeographicDescriptionSequence(
423 const char* str,
size_t len,
size_t depthLeft,
size_t maxCodePoints,
424 size_t* consumed,
size_t* codePoints) {
426 return IDSParseStatus::Incomplete;
// Nesting budget or code-point budget exhausted: give up rather than recurse.
428 if (depthLeft == 0 || *codePoints >= maxCodePoints) {
429 return IDSParseStatus::Invalid;
431 const size_t charLen = NextCharLengthNoException(str);
433 return IDSParseStatus::Invalid;
436 return IDSParseStatus::Incomplete;
440 const uint32_t codePoint = CodePointNoException(str, charLen);
441 const size_t arity = IdeographicDescriptionOperatorArity(codePoint);
444 return IDSParseStatus::Complete;
// Operands start immediately after the leading character.
447 size_t offset = charLen;
448 for (
size_t i = 0; i < arity; i++) {
450 return IDSParseStatus::Incomplete;
452 size_t operandLength = 0;
// Parse one operand from the remaining input with one less level of depth.
453 const IDSParseStatus operandStatus = ConsumeIdeographicDescriptionSequence(
454 str + offset, len - offset, depthLeft - 1, maxCodePoints,
455 &operandLength, codePoints);
// Propagate Incomplete or Invalid from the operand unchanged.
456 if (operandStatus != IDSParseStatus::Complete) {
457 return operandStatus;
459 offset += operandLength;
// All operands parsed successfully.
462 return IDSParseStatus::Complete;