[email protected] | 421de2ab | 2011-04-13 18:43:05 | [diff] [blame] | 1 | // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
[email protected] | a3f72189 | 2013-02-07 03:59:06 | [diff] [blame] | 5 | #include "base/strings/utf_offset_string_conversions.h" |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 6 | |
avi | 84f37e1 | 2015-12-25 09:31:42 | [diff] [blame] | 7 | #include <stdint.h> |
| 8 | |
[email protected] | 421de2ab | 2011-04-13 18:43:05 | [diff] [blame] | 9 | #include <algorithm> |
dcheng | 093de9b | 2016-04-04 21:25:51 | [diff] [blame] | 10 | #include <memory> |
[email protected] | 421de2ab | 2011-04-13 18:43:05 | [diff] [blame] | 11 | |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 12 | #include "base/logging.h" |
[email protected] | eb62f726 | 2013-03-30 14:29:00 | [diff] [blame] | 13 | #include "base/strings/string_piece.h" |
[email protected] | a3f72189 | 2013-02-07 03:59:06 | [diff] [blame] | 14 | #include "base/strings/utf_string_conversion_utils.h" |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 15 | |
[email protected] | a3f72189 | 2013-02-07 03:59:06 | [diff] [blame] | 16 | namespace base { |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 17 | |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 18 | OffsetAdjuster::Adjustment::Adjustment(size_t original_offset, |
| 19 | size_t original_length, |
| 20 | size_t output_length) |
| 21 | : original_offset(original_offset), |
| 22 | original_length(original_length), |
| 23 | output_length(output_length) { |
| 24 | } |
| 25 | |
| 26 | // static |
tommycli | 7a4241c | 2017-07-13 01:37:32 | [diff] [blame] | 27 | void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments, |
| 28 | std::vector<size_t>* offsets_for_adjustment, |
| 29 | size_t limit) { |
| 30 | DCHECK(offsets_for_adjustment); |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 31 | for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin()); |
| 32 | i != offsets_for_adjustment->end(); ++i) |
tommycli | 7a4241c | 2017-07-13 01:37:32 | [diff] [blame] | 33 | AdjustOffset(adjustments, &(*i), limit); |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 34 | } |
| 35 | |
| 36 | // static |
| 37 | void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments, |
tommycli | 7a4241c | 2017-07-13 01:37:32 | [diff] [blame] | 38 | size_t* offset, |
| 39 | size_t limit) { |
| 40 | DCHECK(offset); |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 41 | if (*offset == string16::npos) |
| 42 | return; |
[email protected] | 529d4b5 | 2014-04-28 19:37:23 | [diff] [blame] | 43 | int adjustment = 0; |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 44 | for (Adjustments::const_iterator i = adjustments.begin(); |
| 45 | i != adjustments.end(); ++i) { |
| 46 | if (*offset <= i->original_offset) |
| 47 | break; |
| 48 | if (*offset < (i->original_offset + i->original_length)) { |
| 49 | *offset = string16::npos; |
| 50 | return; |
| 51 | } |
[email protected] | 529d4b5 | 2014-04-28 19:37:23 | [diff] [blame] | 52 | adjustment += static_cast<int>(i->original_length - i->output_length); |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 53 | } |
| 54 | *offset -= adjustment; |
tommycli | 7a4241c | 2017-07-13 01:37:32 | [diff] [blame] | 55 | |
| 56 | if (*offset > limit) |
| 57 | *offset = string16::npos; |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 58 | } |
| 59 | |
| 60 | // static |
[email protected] | 529d4b5 | 2014-04-28 19:37:23 | [diff] [blame] | 61 | void OffsetAdjuster::UnadjustOffsets( |
| 62 | const Adjustments& adjustments, |
| 63 | std::vector<size_t>* offsets_for_unadjustment) { |
| 64 | if (!offsets_for_unadjustment || adjustments.empty()) |
| 65 | return; |
| 66 | for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin()); |
| 67 | i != offsets_for_unadjustment->end(); ++i) |
| 68 | UnadjustOffset(adjustments, &(*i)); |
| 69 | } |
| 70 | |
| 71 | // static |
| 72 | void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments, |
| 73 | size_t* offset) { |
| 74 | if (*offset == string16::npos) |
| 75 | return; |
| 76 | int adjustment = 0; |
| 77 | for (Adjustments::const_iterator i = adjustments.begin(); |
| 78 | i != adjustments.end(); ++i) { |
| 79 | if (*offset + adjustment <= i->original_offset) |
| 80 | break; |
| 81 | adjustment += static_cast<int>(i->original_length - i->output_length); |
| 82 | if ((*offset + adjustment) < |
| 83 | (i->original_offset + i->original_length)) { |
| 84 | *offset = string16::npos; |
| 85 | return; |
| 86 | } |
| 87 | } |
| 88 | *offset += adjustment; |
| 89 | } |
| 90 | |
| 91 | // static |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 92 | void OffsetAdjuster::MergeSequentialAdjustments( |
| 93 | const Adjustments& first_adjustments, |
| 94 | Adjustments* adjustments_on_adjusted_string) { |
| 95 | Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin(); |
| 96 | Adjustments::const_iterator first_iter = first_adjustments.begin(); |
| 97 | // Simultaneously iterate over all |adjustments_on_adjusted_string| and |
| 98 | // |first_adjustments|, adding adjustments to or correcting the adjustments |
| 99 | // in |adjustments_on_adjusted_string| as we go. |shift| keeps track of the |
| 100 | // current number of characters collapsed by |first_adjustments| up to this |
| 101 | // point. |currently_collapsing| keeps track of the number of characters |
| 102 | // collapsed by |first_adjustments| into the current |adjusted_iter|'s |
| 103 | // length. These are characters that will change |shift| as soon as we're |
| 104 | // done processing the current |adjusted_iter|; they are not yet reflected in |
| 105 | // |shift|. |
| 106 | size_t shift = 0; |
| 107 | size_t currently_collapsing = 0; |
| 108 | while (adjusted_iter != adjustments_on_adjusted_string->end()) { |
| 109 | if ((first_iter == first_adjustments.end()) || |
| 110 | ((adjusted_iter->original_offset + shift + |
| 111 | adjusted_iter->original_length) <= first_iter->original_offset)) { |
| 112 | // Entire |adjusted_iter| (accounting for its shift and including its |
| 113 | // whole original length) comes before |first_iter|. |
| 114 | // |
| 115 | // Correct the offset at |adjusted_iter| and move onto the next |
| 116 | // adjustment that needs revising. |
| 117 | adjusted_iter->original_offset += shift; |
| 118 | shift += currently_collapsing; |
| 119 | currently_collapsing = 0; |
| 120 | ++adjusted_iter; |
| 121 | } else if ((adjusted_iter->original_offset + shift) > |
| 122 | first_iter->original_offset) { |
| 123 | // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|). |
| 124 | |
| 125 | // It's not possible for the adjustments to overlap. (It shouldn't |
| 126 | // be possible that we have an |adjusted_iter->original_offset| that, |
| 127 | // when adjusted by the computed |shift|, is in the middle of |
[email protected] | e2819765 | 2014-04-23 04:28:38 | [diff] [blame] | 128 | // |first_iter|'s output's length. After all, that would mean the |
| 129 | // current adjustment_on_adjusted_string somehow points to an offset |
| 130 | // that was supposed to have been eliminated by the first set of |
| 131 | // adjustments.) |
| 132 | DCHECK_LE(first_iter->original_offset + first_iter->output_length, |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 133 | adjusted_iter->original_offset + shift); |
| 134 | |
| 135 | // Add the |first_adjustment_iter| to the full set of adjustments while |
| 136 | // making sure |adjusted_iter| continues pointing to the same element. |
| 137 | // We do this by inserting the |first_adjustment_iter| right before |
| 138 | // |adjusted_iter|, then incrementing |adjusted_iter| so it points to |
| 139 | // the following element. |
| 140 | shift += first_iter->original_length - first_iter->output_length; |
| 141 | adjusted_iter = adjustments_on_adjusted_string->insert( |
| 142 | adjusted_iter, *first_iter); |
| 143 | ++adjusted_iter; |
| 144 | ++first_iter; |
| 145 | } else { |
| 146 | // The first adjustment adjusted something that then got further adjusted |
| 147 | // by the second set of adjustments. In other words, |first_iter| points |
| 148 | // to something in the range covered by |adjusted_iter|'s length (after |
| 149 | // accounting for |shift|). Precisely, |
| 150 | // adjusted_iter->original_offset + shift |
| 151 | // <= |
| 152 | // first_iter->original_offset |
| 153 | // <= |
| 154 | // adjusted_iter->original_offset + shift + |
| 155 | // adjusted_iter->original_length |
| 156 | |
| 157 | // Modify the current |adjusted_iter| to include whatever collapsing |
| 158 | // happened in |first_iter|, then advance to the next |first_adjustments| |
| 159 | // because we dealt with the current one. |
| 160 | const int collapse = static_cast<int>(first_iter->original_length) - |
| 161 | static_cast<int>(first_iter->output_length); |
| 162 | // This function does not know how to deal with a string that expands and |
| 163 | // then gets modified, only strings that collapse and then get modified. |
| 164 | DCHECK_GT(collapse, 0); |
| 165 | adjusted_iter->original_length += collapse; |
| 166 | currently_collapsing += collapse; |
| 167 | ++first_iter; |
| 168 | } |
| 169 | } |
| 170 | DCHECK_EQ(0u, currently_collapsing); |
| 171 | if (first_iter != first_adjustments.end()) { |
| 172 | // Only first adjustments are left. These do not need to be modified. |
| 173 | // (Their offsets are already correct with respect to the original string.) |
| 174 | // Append them all. |
| 175 | DCHECK(adjusted_iter == adjustments_on_adjusted_string->end()); |
| 176 | adjustments_on_adjusted_string->insert( |
| 177 | adjustments_on_adjusted_string->end(), first_iter, |
| 178 | first_adjustments.end()); |
| 179 | } |
| 180 | } |
| 181 | |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 182 | // Converts the given source Unicode character type to the given destination |
| 183 | // Unicode character type as a STL string. The given input buffer and size |
| 184 | // determine the source, and the given output STL string will be replaced by |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 185 | // the result. If non-NULL, |adjustments| is set to reflect the all the |
| 186 | // alterations to the string that are not one-character-to-one-character. |
| 187 | // It will always be sorted by increasing offset. |
[email protected] | cbf35e17 | 2011-09-08 02:18:10 | [diff] [blame] | 188 | template<typename SrcChar, typename DestStdString> |
| 189 | bool ConvertUnicode(const SrcChar* src, |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 190 | size_t src_len, |
[email protected] | cbf35e17 | 2011-09-08 02:18:10 | [diff] [blame] | 191 | DestStdString* output, |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 192 | OffsetAdjuster::Adjustments* adjustments) { |
| 193 | if (adjustments) |
| 194 | adjustments->clear(); |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 195 | // ICU requires 32-bit numbers. |
| 196 | bool success = true; |
avi | 84f37e1 | 2015-12-25 09:31:42 | [diff] [blame] | 197 | int32_t src_len32 = static_cast<int32_t>(src_len); |
| 198 | for (int32_t i = 0; i < src_len32; i++) { |
| 199 | uint32_t code_point; |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 200 | size_t original_i = i; |
| 201 | size_t chars_written = 0; |
| 202 | if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { |
| 203 | chars_written = WriteUnicodeCharacter(code_point, output); |
| 204 | } else { |
[email protected] | d7a3e8e | 2010-01-01 22:16:38 | [diff] [blame] | 205 | chars_written = WriteUnicodeCharacter(0xFFFD, output); |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 206 | success = false; |
| 207 | } |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 208 | |
| 209 | // Only bother writing an adjustment if this modification changed the |
| 210 | // length of this character. |
| 211 | // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last |
| 212 | // character read, not after it (so that incrementing it in the loop |
| 213 | // increment will place it at the right location), so we need to account |
| 214 | // for that in determining the amount that was read. |
| 215 | if (adjustments && ((i - original_i + 1) != chars_written)) { |
| 216 | adjustments->push_back(OffsetAdjuster::Adjustment( |
| 217 | original_i, i - original_i + 1, chars_written)); |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 218 | } |
| 219 | } |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 220 | return success; |
| 221 | } |
| 222 | |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 223 | bool UTF8ToUTF16WithAdjustments( |
| 224 | const char* src, |
| 225 | size_t src_len, |
| 226 | string16* output, |
| 227 | base::OffsetAdjuster::Adjustments* adjustments) { |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 228 | PrepareForUTF16Or32Output(src, src_len, output); |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 229 | return ConvertUnicode(src, src_len, output, adjustments); |
[email protected] | 421de2ab | 2011-04-13 18:43:05 | [diff] [blame] | 230 | } |
| 231 | |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 232 | string16 UTF8ToUTF16WithAdjustments( |
| 233 | const base::StringPiece& utf8, |
| 234 | base::OffsetAdjuster::Adjustments* adjustments) { |
[email protected] | 04866c4 | 2011-05-03 20:03:50 | [diff] [blame] | 235 | string16 result; |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 236 | UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments); |
[email protected] | 421de2ab | 2011-04-13 18:43:05 | [diff] [blame] | 237 | return result; |
| 238 | } |
| 239 | |
[email protected] | 04866c4 | 2011-05-03 20:03:50 | [diff] [blame] | 240 | string16 UTF8ToUTF16AndAdjustOffsets( |
| 241 | const base::StringPiece& utf8, |
[email protected] | 421de2ab | 2011-04-13 18:43:05 | [diff] [blame] | 242 | std::vector<size_t>* offsets_for_adjustment) { |
tommycli | 7a4241c | 2017-07-13 01:37:32 | [diff] [blame] | 243 | for (size_t& offset : *offsets_for_adjustment) { |
| 244 | if (offset > utf8.length()) |
| 245 | offset = string16::npos; |
| 246 | } |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 247 | OffsetAdjuster::Adjustments adjustments; |
| 248 | string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments); |
| 249 | OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); |
[email protected] | cbf35e17 | 2011-09-08 02:18:10 | [diff] [blame] | 250 | return result; |
| 251 | } |
| 252 | |
| 253 | std::string UTF16ToUTF8AndAdjustOffsets( |
| 254 | const base::StringPiece16& utf16, |
| 255 | std::vector<size_t>* offsets_for_adjustment) { |
tommycli | 7a4241c | 2017-07-13 01:37:32 | [diff] [blame] | 256 | for (size_t& offset : *offsets_for_adjustment) { |
| 257 | if (offset > utf16.length()) |
| 258 | offset = string16::npos; |
| 259 | } |
[email protected] | cbf35e17 | 2011-09-08 02:18:10 | [diff] [blame] | 260 | std::string result; |
| 261 | PrepareForUTF8Output(utf16.data(), utf16.length(), &result); |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 262 | OffsetAdjuster::Adjustments adjustments; |
| 263 | ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments); |
| 264 | OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); |
[email protected] | cbf35e17 | 2011-09-08 02:18:10 | [diff] [blame] | 265 | return result; |
| 266 | } |
| 267 | |
[email protected] | a3f72189 | 2013-02-07 03:59:06 | [diff] [blame] | 268 | } // namespace base |