license.bot | bf09a50 | 2008-08-24 00:55:55 | [diff] [blame] | 1 | // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
[email protected] | 6b27db80 | 2008-08-07 15:29:49 | [diff] [blame] | 4 | |
initial.commit | d7cae12 | 2008-07-26 21:49:38 | [diff] [blame] | 5 | #include "base/string_util.h" |
| 6 | |
| 7 | #include <string.h> |
| 8 | #include <vector> |
| 9 | |
| 10 | #include "base/basictypes.h" |
| 11 | #include "base/logging.h" |
| 12 | #include "base/singleton.h" |
| 13 | #include "unicode/ucnv.h" |
| 14 | #include "unicode/numfmt.h" |
| 15 | #include "unicode/ustring.h" |
| 16 | |
[email protected] | 6b27db80 | 2008-08-07 15:29:49 | [diff] [blame] | 17 | namespace { |
| 18 | |
| 19 | // ReadUnicodeCharacter -------------------------------------------------------- |
| 20 | |
| 21 | // Reads a UTF-8 stream, placing the next code point into the given output |
| 22 | // |*code_point|. |src| represents the entire string to read, and |*char_index| |
| 23 | // is the character offset within the string to start reading at. |*char_index| |
| 24 | // will be updated to index the last character read, such that incrementing it |
| 25 | // (as in a for loop) will take the reader to the next character. |
| 26 | // |
| 27 | // Returns true on success. On false, |*code_point| will be invalid. |
| 28 | bool ReadUnicodeCharacter(const char* src, int32 src_len, |
[email protected] | d6b0667 | 2008-08-19 00:31:24 | [diff] [blame] | 29 | int32* char_index, uint32* code_point_out) { |
| 30 | // U8_NEXT expects to be able to use -1 to signal an error, so we must |
| 31 | // use a signed type for code_point. But this function returns false |
| 32 | // on error anyway, so code_point_out is unsigned. |
| 33 | int32 code_point; |
| 34 | U8_NEXT(src, *char_index, src_len, code_point); |
| 35 | *code_point_out = static_cast<uint32>(code_point); |
[email protected] | 6b27db80 | 2008-08-07 15:29:49 | [diff] [blame] | 36 | |
| 37 | // The ICU macro above moves to the next char, we want to point to the last |
| 38 | // char consumed. |
| 39 | (*char_index)--; |
| 40 | |
| 41 | // Validate the decoded value. |
[email protected] | d6b0667 | 2008-08-19 00:31:24 | [diff] [blame] | 42 | return U_IS_UNICODE_CHAR(code_point); |
[email protected] | 6b27db80 | 2008-08-07 15:29:49 | [diff] [blame] | 43 | } |
| 44 | |
[email protected] | 39be424 | 2008-08-07 18:31:40 | [diff] [blame] | 45 | // Reads a UTF-16 character. The usage is the same as the 8-bit version above. |
[email protected] | e6da5e1f | 2008-08-07 20:27:57 | [diff] [blame] | 46 | bool ReadUnicodeCharacter(const char16* src, int32 src_len, |
[email protected] | 6b27db80 | 2008-08-07 15:29:49 | [diff] [blame] | 47 | int32* char_index, uint32* code_point) { |
| 48 | if (U16_IS_SURROGATE(src[*char_index])) { |
| 49 | if (!U16_IS_SURROGATE_LEAD(src[*char_index]) || |
| 50 | *char_index + 1 >= src_len || |
| 51 | !U16_IS_TRAIL(src[*char_index + 1])) { |
| 52 | // Invalid surrogate pair. |
| 53 | return false; |
| 54 | } |
| 55 | |
| 56 | // Valid surrogate pair. |
| 57 | *code_point = U16_GET_SUPPLEMENTARY(src[*char_index], |
| 58 | src[*char_index + 1]); |
| 59 | (*char_index)++; |
| 60 | } else { |
| 61 | // Not a surrogate, just one 16-bit word. |
| 62 | *code_point = src[*char_index]; |
| 63 | } |
| 64 | |
| 65 | return U_IS_UNICODE_CHAR(*code_point); |
| 66 | } |
[email protected] | e6da5e1f | 2008-08-07 20:27:57 | [diff] [blame] | 67 | |
| 68 | #if defined(WCHAR_T_IS_UTF32) |
[email protected] | 39be424 | 2008-08-07 18:31:40 | [diff] [blame] | 69 | // Reads UTF-32 character. The usage is the same as the 8-bit version above. |
[email protected] | a31e79e | 2008-08-07 22:36:01 | [diff] [blame] | 70 | bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, |
[email protected] | e6da5e1f | 2008-08-07 20:27:57 | [diff] [blame] | 71 | int32* char_index, uint32* code_point) { |
[email protected] | 6b27db80 | 2008-08-07 15:29:49 | [diff] [blame] | 72 | // Conversion is easy since the source is 32-bit. |
| 73 | *code_point = src[*char_index]; |
| 74 | |
| 75 | // Validate the value. |
| 76 | return U_IS_UNICODE_CHAR(*code_point); |
| 77 | } |
[email protected] | 39be424 | 2008-08-07 18:31:40 | [diff] [blame] | 78 | #endif // defined(WCHAR_T_IS_UTF32) |
[email protected] | 6b27db80 | 2008-08-07 15:29:49 | [diff] [blame] | 79 | |
| 80 | // WriteUnicodeCharacter ------------------------------------------------------- |
| 81 | |
| 82 | // Appends a UTF-8 character to the given 8-bit string. |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 83 | void WriteUnicodeCharacter(uint32 code_point, std::string* output) { |
[email protected] | 6b27db80 | 2008-08-07 15:29:49 | [diff] [blame] | 84 | if (code_point <= 0x7f) { |
| 85 | // Fast path the common case of one byte. |
| 86 | output->push_back(code_point); |
| 87 | return; |
| 88 | } |
| 89 | |
| 90 | // U8_APPEND_UNSAFE can append up to 4 bytes. |
| 91 | int32 char_offset = static_cast<int32>(output->length()); |
| 92 | output->resize(char_offset + U8_MAX_LENGTH); |
| 93 | |
| 94 | U8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); |
| 95 | |
| 96 | // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so |
| 97 | // it will represent the new length of the string. |
| 98 | output->resize(char_offset); |
| 99 | } |
| 100 | |
[email protected] | 39be424 | 2008-08-07 18:31:40 | [diff] [blame] | 101 | // Appends the given code point as a UTF-16 character to the STL string. |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 102 | void WriteUnicodeCharacter(uint32 code_point, string16* output) { |
[email protected] | 6b27db80 | 2008-08-07 15:29:49 | [diff] [blame] | 103 | if (U16_LENGTH(code_point) == 1) { |
| 104 | // Thie code point is in the Basic Multilingual Plane (BMP). |
[email protected] | e6da5e1f | 2008-08-07 20:27:57 | [diff] [blame] | 105 | output->push_back(static_cast<char16>(code_point)); |
[email protected] | 6b27db80 | 2008-08-07 15:29:49 | [diff] [blame] | 106 | } else { |
| 107 | // Non-BMP characters use a double-character encoding. |
| 108 | int32 char_offset = static_cast<int32>(output->length()); |
| 109 | output->resize(char_offset + U16_MAX_LENGTH); |
| 110 | U16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); |
| 111 | } |
| 112 | } |
[email protected] | e6da5e1f | 2008-08-07 20:27:57 | [diff] [blame] | 113 | |
| 114 | #if defined(WCHAR_T_IS_UTF32) |
[email protected] | 39be424 | 2008-08-07 18:31:40 | [diff] [blame] | 115 | // Appends the given UTF-32 character to the given 32-bit string. |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 116 | inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) { |
[email protected] | 6b27db80 | 2008-08-07 15:29:49 | [diff] [blame] | 117 | // This is the easy case, just append the character. |
| 118 | output->push_back(code_point); |
| 119 | } |
[email protected] | 39be424 | 2008-08-07 18:31:40 | [diff] [blame] | 120 | #endif // defined(WCHAR_T_IS_UTF32) |
[email protected] | 6b27db80 | 2008-08-07 15:29:49 | [diff] [blame] | 121 | |
| 122 | // Generalized Unicode converter ----------------------------------------------- |
| 123 | |
| 124 | // Converts the given source Unicode character type to the given destination |
| 125 | // Unicode character type as a STL string. The given input buffer and size |
| 126 | // determine the source, and the given output STL string will be replaced by |
| 127 | // the result. |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 128 | template<typename SRC_CHAR, typename DEST_STRING> |
| 129 | bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) { |
[email protected] | 6b27db80 | 2008-08-07 15:29:49 | [diff] [blame] | 130 | output->clear(); |
| 131 | |
| 132 | // ICU requires 32-bit numbers. |
| 133 | bool success = true; |
| 134 | int32 src_len32 = static_cast<int32>(src_len); |
| 135 | for (int32 i = 0; i < src_len32; i++) { |
| 136 | uint32 code_point; |
| 137 | if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) |
| 138 | WriteUnicodeCharacter(code_point, output); |
| 139 | else |
| 140 | success = false; |
| 141 | } |
| 142 | return success; |
| 143 | } |
| 144 | |
[email protected] | f0fcfd3 | 2008-08-26 19:27:24 | [diff] [blame] | 145 | |
// Reserves space in |output| for the UTF-8 form of the |src_len| characters
// at |src|, using the first character as a hint for the whole input: one byte
// per character if it is ASCII, three bytes per character otherwise.
//
// The comparison assumes an unsigned character type, and |src_len| must be
// greater than zero because |src[0]| is read unconditionally.
template<typename CHAR>
void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) {
  const bool starts_with_ascii = src[0] < 0x80;
  const size_t bytes_per_char = starts_with_ascii ? 1 : 3;
  output->reserve(src_len * bytes_per_char);
}
| 160 | |
// Reserves space in |output| (a UTF-16 or UTF-32 string) for the result of
// converting the |src_len| bytes of UTF-8 at |src|, using the first byte as a
// hint for the whole input. |src_len| must be greater than zero because
// |src[0]| is read unconditionally.
template<typename STRING>
void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) {
  const unsigned char lead_byte = static_cast<unsigned char>(src[0]);
  if (lead_byte >= 0x80) {
    // Non-ASCII start: guess that each character takes two UTF-8 bytes.
    output->reserve(src_len / 2);
    return;
  }

  // ASCII start: guess that bytes and output characters map one to one.
  output->reserve(src_len);
}
| 175 | |
[email protected] | 6b27db80 | 2008-08-07 15:29:49 | [diff] [blame] | 176 | } // namespace |
| 177 | |
[email protected] | e6da5e1f | 2008-08-07 20:27:57 | [diff] [blame] | 178 | // UTF-8 <-> Wide -------------------------------------------------------------- |
[email protected] | 6b27db80 | 2008-08-07 15:29:49 | [diff] [blame] | 179 | |
| 180 | std::string WideToUTF8(const std::wstring& wide) { |
| 181 | std::string ret; |
| 182 | if (wide.empty()) |
| 183 | return ret; |
| 184 | |
| 185 | // Ignore the success flag of this call, it will do the best it can for |
| 186 | // invalid input, which is what we want here. |
| 187 | WideToUTF8(wide.data(), wide.length(), &ret); |
| 188 | return ret; |
| 189 | } |
| 190 | |
| 191 | bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) { |
| 192 | if (src_len == 0) { |
| 193 | output->clear(); |
| 194 | return true; |
| 195 | } |
| 196 | |
[email protected] | f0fcfd3 | 2008-08-26 19:27:24 | [diff] [blame] | 197 | ReserveUTF8Output(src, src_len, output); |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 198 | return ConvertUnicode<wchar_t, std::string>(src, src_len, output); |
[email protected] | 6b27db80 | 2008-08-07 15:29:49 | [diff] [blame] | 199 | } |
| 200 | |
| 201 | std::wstring UTF8ToWide(const std::string& utf8) { |
| 202 | std::wstring ret; |
| 203 | if (utf8.empty()) |
| 204 | return ret; |
| 205 | |
| 206 | UTF8ToWide(utf8.data(), utf8.length(), &ret); |
| 207 | return ret; |
| 208 | } |
| 209 | |
| 210 | bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { |
| 211 | if (src_len == 0) { |
| 212 | output->clear(); |
| 213 | return true; |
| 214 | } |
| 215 | |
[email protected] | f0fcfd3 | 2008-08-26 19:27:24 | [diff] [blame] | 216 | ReserveUTF16Or32Output(src, src_len, output); |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 217 | return ConvertUnicode<char, std::wstring>(src, src_len, output); |
[email protected] | 6b27db80 | 2008-08-07 15:29:49 | [diff] [blame] | 218 | } |
| 219 | |
[email protected] | e6da5e1f | 2008-08-07 20:27:57 | [diff] [blame] | 220 | // UTF-16 <-> Wide ------------------------------------------------------------- |
| 221 | |
| 222 | #if defined(WCHAR_T_IS_UTF16) |
| 223 | |
| 224 | // When wide == UTF-16, then conversions are a NOP. |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 225 | string16 WideToUTF16(const std::wstring& wide) { |
[email protected] | e6da5e1f | 2008-08-07 20:27:57 | [diff] [blame] | 226 | return wide; |
| 227 | } |
| 228 | |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 229 | bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { |
[email protected] | e6da5e1f | 2008-08-07 20:27:57 | [diff] [blame] | 230 | output->assign(src, src_len); |
| 231 | return true; |
| 232 | } |
| 233 | |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 234 | std::wstring UTF16ToWide(const string16& utf16) { |
[email protected] | e6da5e1f | 2008-08-07 20:27:57 | [diff] [blame] | 235 | return utf16; |
| 236 | } |
| 237 | |
| 238 | bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { |
| 239 | output->assign(src, src_len); |
| 240 | return true; |
| 241 | } |
| 242 | |
| 243 | #elif defined(WCHAR_T_IS_UTF32) |
| 244 | |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 245 | string16 WideToUTF16(const std::wstring& wide) { |
| 246 | string16 ret; |
[email protected] | e6da5e1f | 2008-08-07 20:27:57 | [diff] [blame] | 247 | if (wide.empty()) |
| 248 | return ret; |
| 249 | |
[email protected] | a31e79e | 2008-08-07 22:36:01 | [diff] [blame] | 250 | WideToUTF16(wide.data(), wide.length(), &ret); |
[email protected] | e6da5e1f | 2008-08-07 20:27:57 | [diff] [blame] | 251 | return ret; |
| 252 | } |
| 253 | |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 254 | bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { |
[email protected] | e6da5e1f | 2008-08-07 20:27:57 | [diff] [blame] | 255 | if (src_len == 0) { |
| 256 | output->clear(); |
| 257 | return true; |
| 258 | } |
| 259 | |
| 260 | // Assume that normally we won't have any non-BMP characters so the counts |
| 261 | // will be the same. |
| 262 | output->reserve(src_len); |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 263 | return ConvertUnicode<wchar_t, string16>(src, src_len, output); |
[email protected] | e6da5e1f | 2008-08-07 20:27:57 | [diff] [blame] | 264 | } |
| 265 | |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 266 | std::wstring UTF16ToWide(const string16& utf16) { |
[email protected] | e6da5e1f | 2008-08-07 20:27:57 | [diff] [blame] | 267 | std::wstring ret; |
| 268 | if (utf16.empty()) |
| 269 | return ret; |
| 270 | |
[email protected] | a31e79e | 2008-08-07 22:36:01 | [diff] [blame] | 271 | UTF16ToWide(utf16.data(), utf16.length(), &ret); |
[email protected] | e6da5e1f | 2008-08-07 20:27:57 | [diff] [blame] | 272 | return ret; |
| 273 | } |
| 274 | |
| 275 | bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { |
| 276 | if (src_len == 0) { |
| 277 | output->clear(); |
| 278 | return true; |
| 279 | } |
| 280 | |
| 281 | // Assume that normally we won't have any non-BMP characters so the counts |
| 282 | // will be the same. |
| 283 | output->reserve(src_len); |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 284 | return ConvertUnicode<char16, std::wstring>(src, src_len, output); |
[email protected] | e6da5e1f | 2008-08-07 20:27:57 | [diff] [blame] | 285 | } |
| 286 | |
| 287 | #endif // defined(WCHAR_T_IS_UTF32) |
| 288 | |
[email protected] | f0fcfd3 | 2008-08-26 19:27:24 | [diff] [blame] | 289 | // UTF16 <-> UTF8 -------------------------------------------------------------- |
| 290 | |
| 291 | #if defined(WCHAR_T_IS_UTF32) |
| 292 | |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 293 | bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { |
[email protected] | f0fcfd3 | 2008-08-26 19:27:24 | [diff] [blame] | 294 | if (src_len == 0) { |
| 295 | output->clear(); |
| 296 | return true; |
| 297 | } |
| 298 | |
| 299 | ReserveUTF16Or32Output(src, src_len, output); |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 300 | return ConvertUnicode<char, string16>(src, src_len, output); |
[email protected] | f0fcfd3 | 2008-08-26 19:27:24 | [diff] [blame] | 301 | } |
| 302 | |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 303 | string16 UTF8ToUTF16(const std::string& utf8) { |
| 304 | string16 ret; |
[email protected] | f0fcfd3 | 2008-08-26 19:27:24 | [diff] [blame] | 305 | if (utf8.empty()) |
| 306 | return ret; |
| 307 | |
| 308 | // Ignore the success flag of this call, it will do the best it can for |
| 309 | // invalid input, which is what we want here. |
| 310 | UTF8ToUTF16(utf8.data(), utf8.length(), &ret); |
| 311 | return ret; |
| 312 | } |
| 313 | |
| 314 | bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { |
| 315 | if (src_len == 0) { |
| 316 | output->clear(); |
| 317 | return true; |
| 318 | } |
| 319 | |
| 320 | ReserveUTF8Output(src, src_len, output); |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 321 | return ConvertUnicode<char16, std::string>(src, src_len, output); |
[email protected] | f0fcfd3 | 2008-08-26 19:27:24 | [diff] [blame] | 322 | } |
| 323 | |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 324 | std::string UTF16ToUTF8(const string16& utf16) { |
[email protected] | f0fcfd3 | 2008-08-26 19:27:24 | [diff] [blame] | 325 | std::string ret; |
| 326 | if (utf16.empty()) |
| 327 | return ret; |
| 328 | |
| 329 | // Ignore the success flag of this call, it will do the best it can for |
| 330 | // invalid input, which is what we want here. |
| 331 | UTF16ToUTF8(utf16.data(), utf16.length(), &ret); |
| 332 | return ret; |
| 333 | } |
| 334 | |
| 335 | #elif defined(WCHAR_T_IS_UTF16) |
| 336 | // Easy case since we can use the "wide" versions we already wrote above. |
| 337 | |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 338 | bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { |
[email protected] | f0fcfd3 | 2008-08-26 19:27:24 | [diff] [blame] | 339 | return UTF8ToWide(src, src_len, output); |
| 340 | } |
| 341 | |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 342 | string16 UTF8ToUTF16(const std::string& utf8) { |
[email protected] | f0fcfd3 | 2008-08-26 19:27:24 | [diff] [blame] | 343 | return UTF8ToWide(utf8); |
| 344 | } |
| 345 | |
| 346 | bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { |
| 347 | return WideToUTF8(src, src_len, output); |
| 348 | } |
| 349 | |
[email protected] | d1370190 | 2008-08-27 20:57:35 | [diff] [blame] | 350 | std::string UTF16ToUTF8(const string16& utf16) { |
[email protected] | f0fcfd3 | 2008-08-26 19:27:24 | [diff] [blame] | 351 | return WideToUTF8(utf16); |
| 352 | } |
| 353 | |
| 354 | #endif |
| 355 | |
initial.commit | d7cae12 | 2008-07-26 21:49:38 | [diff] [blame] | 356 | // Codepage <-> Wide ----------------------------------------------------------- |
| 357 | |
| 358 | // Convert a unicode string into the specified codepage_name. If the codepage |
| 359 | // isn't found, return false. |
| 360 | bool WideToCodepage(const std::wstring& wide, |
| 361 | const char* codepage_name, |
| 362 | OnStringUtilConversionError::Type on_error, |
| 363 | std::string* encoded) { |
| 364 | encoded->clear(); |
| 365 | |
| 366 | UErrorCode status = U_ZERO_ERROR; |
| 367 | UConverter* converter = ucnv_open(codepage_name, &status); |
| 368 | if (!U_SUCCESS(status)) |
| 369 | return false; |
| 370 | |
| 371 | const UChar* uchar_src; |
| 372 | int uchar_len; |
[email protected] | 39be424 | 2008-08-07 18:31:40 | [diff] [blame] | 373 | #if defined(WCHAR_T_IS_UTF16) |
initial.commit | d7cae12 | 2008-07-26 21:49:38 | [diff] [blame] | 374 | uchar_src = wide.c_str(); |
| 375 | uchar_len = static_cast<int>(wide.length()); |
[email protected] | 39be424 | 2008-08-07 18:31:40 | [diff] [blame] | 376 | #elif defined(WCHAR_T_IS_UTF32) |
initial.commit | d7cae12 | 2008-07-26 21:49:38 | [diff] [blame] | 377 | // When wchar_t is wider than UChar (16 bits), transform |wide| into a |
| 378 | // UChar* string. Size the UChar* buffer to be large enough to hold twice |
[email protected] | 703f427e | 2008-08-13 01:17:18 | [diff] [blame] | 379 | // as many UTF-16 code points as there are UTF-16 characters, in case each |
initial.commit | d7cae12 | 2008-07-26 21:49:38 | [diff] [blame] | 380 | // character translates to a UTF-16 surrogate pair, and leave room for a NUL |
| 381 | // terminator. |
| 382 | std::vector<UChar> wide_uchar(wide.length() * 2 + 1); |
| 383 | u_strFromWCS(&wide_uchar[0], wide_uchar.size(), &uchar_len, |
| 384 | wide.c_str(), wide.length(), &status); |
| 385 | uchar_src = &wide_uchar[0]; |
| 386 | DCHECK(U_SUCCESS(status)) << "failed to convert wstring to UChar*"; |
[email protected] | 39be424 | 2008-08-07 18:31:40 | [diff] [blame] | 387 | #endif // defined(WCHAR_T_IS_UTF32) |
initial.commit | d7cae12 | 2008-07-26 21:49:38 | [diff] [blame] | 388 | |
| 389 | int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len, |
| 390 | ucnv_getMaxCharSize(converter)); |
| 391 | encoded->resize(encoded_max_length); |
| 392 | |
| 393 | // Setup our error handler. |
| 394 | switch (on_error) { |
| 395 | case OnStringUtilConversionError::FAIL: |
| 396 | ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0, |
| 397 | NULL, NULL, &status); |
| 398 | break; |
| 399 | case OnStringUtilConversionError::SKIP: |
| 400 | ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0, |
| 401 | NULL, NULL, &status); |
| 402 | break; |
| 403 | default: |
| 404 | NOTREACHED(); |
| 405 | } |
| 406 | |
| 407 | // ucnv_fromUChars returns size not including terminating null |
| 408 | int actual_size = ucnv_fromUChars(converter, &(*encoded)[0], |
| 409 | encoded_max_length, uchar_src, uchar_len, &status); |
| 410 | encoded->resize(actual_size); |
| 411 | ucnv_close(converter); |
| 412 | if (U_SUCCESS(status)) |
| 413 | return true; |
| 414 | encoded->clear(); // Make sure the output is empty on error. |
| 415 | return false; |
| 416 | } |
| 417 | |
| 418 | // Converts a string of the given codepage into unicode. |
| 419 | // If the codepage isn't found, return false. |
| 420 | bool CodepageToWide(const std::string& encoded, |
| 421 | const char* codepage_name, |
| 422 | OnStringUtilConversionError::Type on_error, |
| 423 | std::wstring* wide) { |
| 424 | wide->clear(); |
| 425 | |
| 426 | UErrorCode status = U_ZERO_ERROR; |
| 427 | UConverter* converter = ucnv_open(codepage_name, &status); |
| 428 | if (!U_SUCCESS(status)) |
| 429 | return false; |
| 430 | |
| 431 | // The worst case is all the input characters are non-BMP (32-bit) ones. |
| 432 | size_t uchar_max_length = encoded.length() * 2 + 1; |
| 433 | |
| 434 | UChar* uchar_dst; |
[email protected] | 39be424 | 2008-08-07 18:31:40 | [diff] [blame] | 435 | #if defined(WCHAR_T_IS_UTF16) |
initial.commit | d7cae12 | 2008-07-26 21:49:38 | [diff] [blame] | 436 | uchar_dst = WriteInto(wide, uchar_max_length); |
[email protected] | 39be424 | 2008-08-07 18:31:40 | [diff] [blame] | 437 | #elif defined(WCHAR_T_IS_UTF32) |
initial.commit | d7cae12 | 2008-07-26 21:49:38 | [diff] [blame] | 438 | // When wchar_t is wider than UChar (16 bits), convert into a temporary |
| 439 | // UChar* buffer. |
| 440 | std::vector<UChar> wide_uchar(uchar_max_length); |
| 441 | uchar_dst = &wide_uchar[0]; |
[email protected] | 39be424 | 2008-08-07 18:31:40 | [diff] [blame] | 442 | #endif // defined(WCHAR_T_IS_UTF32) |
initial.commit | d7cae12 | 2008-07-26 21:49:38 | [diff] [blame] | 443 | |
| 444 | // Setup our error handler. |
| 445 | switch (on_error) { |
| 446 | case OnStringUtilConversionError::FAIL: |
| 447 | ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0, |
| 448 | NULL, NULL, &status); |
| 449 | break; |
| 450 | case OnStringUtilConversionError::SKIP: |
| 451 | ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0, |
| 452 | NULL, NULL, &status); |
| 453 | break; |
| 454 | default: |
| 455 | NOTREACHED(); |
| 456 | } |
| 457 | |
| 458 | int actual_size = ucnv_toUChars(converter, |
| 459 | uchar_dst, |
| 460 | static_cast<int>(uchar_max_length), |
| 461 | encoded.data(), |
| 462 | static_cast<int>(encoded.length()), |
| 463 | &status); |
| 464 | ucnv_close(converter); |
| 465 | if (!U_SUCCESS(status)) { |
| 466 | wide->clear(); // Make sure the output is empty on error. |
| 467 | return false; |
| 468 | } |
| 469 | |
[email protected] | 39be424 | 2008-08-07 18:31:40 | [diff] [blame] | 470 | #ifdef WCHAR_T_IS_UTF32 |
initial.commit | d7cae12 | 2008-07-26 21:49:38 | [diff] [blame] | 471 | // When wchar_t is wider than UChar (16 bits), it's not possible to wind up |
| 472 | // with any more wchar_t elements than UChar elements. ucnv_toUChars |
| 473 | // returns the number of UChar elements not including the NUL terminator, so |
| 474 | // leave extra room for that. |
| 475 | u_strToWCS(WriteInto(wide, actual_size + 1), actual_size + 1, &actual_size, |
| 476 | uchar_dst, actual_size, &status); |
| 477 | DCHECK(U_SUCCESS(status)) << "failed to convert UChar* to wstring"; |
[email protected] | 39be424 | 2008-08-07 18:31:40 | [diff] [blame] | 478 | #endif // WCHAR_T_IS_UTF32 |
initial.commit | d7cae12 | 2008-07-26 21:49:38 | [diff] [blame] | 479 | |
| 480 | wide->resize(actual_size); |
| 481 | return true; |
| 482 | } |
| 483 | |
| 484 | // Number formatting ----------------------------------------------------------- |
| 485 | |
[email protected] | 8988699 | 2008-08-13 15:32:27 | [diff] [blame] | 486 | namespace { |
| 487 | |
| 488 | struct NumberFormatSingletonTraits |
| 489 | : public DefaultSingletonTraits<NumberFormat> { |
| 490 | static NumberFormat* New() { |
| 491 | UErrorCode status = U_ZERO_ERROR; |
| 492 | NumberFormat* formatter = NumberFormat::createInstance(status); |
| 493 | DCHECK(U_SUCCESS(status)); |
| 494 | return formatter; |
| 495 | } |
| 496 | // There's no ICU call to destroy a NumberFormat object other than |
| 497 | // operator delete, so use the default Delete, which calls operator delete. |
| 498 | // This can cause problems if a different allocator is used by this file than |
| 499 | // by ICU. |
| 500 | }; |
| 501 | |
| 502 | } // namespace |
initial.commit | d7cae12 | 2008-07-26 21:49:38 | [diff] [blame] | 503 | |
| 504 | std::wstring FormatNumber(int64 number) { |
[email protected] | 8988699 | 2008-08-13 15:32:27 | [diff] [blame] | 505 | NumberFormat* number_format = |
| 506 | Singleton<NumberFormat, NumberFormatSingletonTraits>::get(); |
| 507 | |
initial.commit | d7cae12 | 2008-07-26 21:49:38 | [diff] [blame] | 508 | if (!number_format) { |
| 509 | // As a fallback, just return the raw number in a string. |
| 510 | return StringPrintf(L"%lld", number); |
| 511 | } |
| 512 | UnicodeString ustr; |
| 513 | number_format->format(number, ustr); |
| 514 | |
[email protected] | 39be424 | 2008-08-07 18:31:40 | [diff] [blame] | 515 | #if defined(WCHAR_T_IS_UTF16) |
initial.commit | d7cae12 | 2008-07-26 21:49:38 | [diff] [blame] | 516 | return std::wstring(ustr.getBuffer(), |
| 517 | static_cast<std::wstring::size_type>(ustr.length())); |
[email protected] | 39be424 | 2008-08-07 18:31:40 | [diff] [blame] | 518 | #elif defined(WCHAR_T_IS_UTF32) |
initial.commit | d7cae12 | 2008-07-26 21:49:38 | [diff] [blame] | 519 | wchar_t buffer[64]; // A int64 is less than 20 chars long, so 64 chars |
| 520 | // leaves plenty of room for formating stuff. |
| 521 | int length = 0; |
| 522 | UErrorCode error = U_ZERO_ERROR; |
| 523 | u_strToWCS(buffer, 64, &length, ustr.getBuffer(), ustr.length() , &error); |
| 524 | if (U_FAILURE(error)) { |
| 525 | NOTREACHED(); |
| 526 | // As a fallback, just return the raw number in a string. |
| 527 | return StringPrintf(L"%lld", number); |
| 528 | } |
| 529 | return std::wstring(buffer, static_cast<std::wstring::size_type>(length)); |
[email protected] | 39be424 | 2008-08-07 18:31:40 | [diff] [blame] | 530 | #endif // defined(WCHAR_T_IS_UTF32) |
initial.commit | d7cae12 | 2008-07-26 21:49:38 | [diff] [blame] | 531 | } |
license.bot | bf09a50 | 2008-08-24 00:55:55 | [diff] [blame] | 532 | |