blob: b91ee03832663e800d701409c22508024600c99d [file] [log] [blame]
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
[email protected]a3f721892013-02-07 03:59:065#include "base/strings/utf_offset_string_conversions.h"
[email protected]b9f93832009-11-13 19:27:486
avi84f37e12015-12-25 09:31:427#include <stdint.h>
8
[email protected]421de2ab2011-04-13 18:43:059#include <algorithm>
dcheng093de9b2016-04-04 21:25:5110#include <memory>
[email protected]421de2ab2011-04-13 18:43:0511
[email protected]a97376e2014-04-18 20:54:4412#include "base/logging.h"
[email protected]eb62f7262013-03-30 14:29:0013#include "base/strings/string_piece.h"
[email protected]a3f721892013-02-07 03:59:0614#include "base/strings/utf_string_conversion_utils.h"
[email protected]b9f93832009-11-13 19:27:4815
[email protected]a3f721892013-02-07 03:59:0616namespace base {
[email protected]b9f93832009-11-13 19:27:4817
[email protected]a97376e2014-04-18 20:54:4418OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
19 size_t original_length,
20 size_t output_length)
21 : original_offset(original_offset),
22 original_length(original_length),
23 output_length(output_length) {
24}
25
26// static
tommycli7a4241c2017-07-13 01:37:3227void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments,
28 std::vector<size_t>* offsets_for_adjustment,
29 size_t limit) {
30 DCHECK(offsets_for_adjustment);
[email protected]a97376e2014-04-18 20:54:4431 for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin());
32 i != offsets_for_adjustment->end(); ++i)
tommycli7a4241c2017-07-13 01:37:3233 AdjustOffset(adjustments, &(*i), limit);
[email protected]a97376e2014-04-18 20:54:4434}
35
36// static
37void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,
tommycli7a4241c2017-07-13 01:37:3238 size_t* offset,
39 size_t limit) {
40 DCHECK(offset);
[email protected]a97376e2014-04-18 20:54:4441 if (*offset == string16::npos)
42 return;
[email protected]529d4b52014-04-28 19:37:2343 int adjustment = 0;
[email protected]a97376e2014-04-18 20:54:4444 for (Adjustments::const_iterator i = adjustments.begin();
45 i != adjustments.end(); ++i) {
46 if (*offset <= i->original_offset)
47 break;
48 if (*offset < (i->original_offset + i->original_length)) {
49 *offset = string16::npos;
50 return;
51 }
[email protected]529d4b52014-04-28 19:37:2352 adjustment += static_cast<int>(i->original_length - i->output_length);
[email protected]a97376e2014-04-18 20:54:4453 }
54 *offset -= adjustment;
tommycli7a4241c2017-07-13 01:37:3255
56 if (*offset > limit)
57 *offset = string16::npos;
[email protected]a97376e2014-04-18 20:54:4458}
59
60// static
[email protected]529d4b52014-04-28 19:37:2361void OffsetAdjuster::UnadjustOffsets(
62 const Adjustments& adjustments,
63 std::vector<size_t>* offsets_for_unadjustment) {
64 if (!offsets_for_unadjustment || adjustments.empty())
65 return;
66 for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin());
67 i != offsets_for_unadjustment->end(); ++i)
68 UnadjustOffset(adjustments, &(*i));
69}
70
71// static
72void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,
73 size_t* offset) {
74 if (*offset == string16::npos)
75 return;
76 int adjustment = 0;
77 for (Adjustments::const_iterator i = adjustments.begin();
78 i != adjustments.end(); ++i) {
79 if (*offset + adjustment <= i->original_offset)
80 break;
81 adjustment += static_cast<int>(i->original_length - i->output_length);
82 if ((*offset + adjustment) <
83 (i->original_offset + i->original_length)) {
84 *offset = string16::npos;
85 return;
86 }
87 }
88 *offset += adjustment;
89}
90
91// static
[email protected]a97376e2014-04-18 20:54:4492void OffsetAdjuster::MergeSequentialAdjustments(
93 const Adjustments& first_adjustments,
94 Adjustments* adjustments_on_adjusted_string) {
95 Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin();
96 Adjustments::const_iterator first_iter = first_adjustments.begin();
97 // Simultaneously iterate over all |adjustments_on_adjusted_string| and
98 // |first_adjustments|, adding adjustments to or correcting the adjustments
99 // in |adjustments_on_adjusted_string| as we go. |shift| keeps track of the
100 // current number of characters collapsed by |first_adjustments| up to this
101 // point. |currently_collapsing| keeps track of the number of characters
102 // collapsed by |first_adjustments| into the current |adjusted_iter|'s
103 // length. These are characters that will change |shift| as soon as we're
104 // done processing the current |adjusted_iter|; they are not yet reflected in
105 // |shift|.
106 size_t shift = 0;
107 size_t currently_collapsing = 0;
108 while (adjusted_iter != adjustments_on_adjusted_string->end()) {
109 if ((first_iter == first_adjustments.end()) ||
110 ((adjusted_iter->original_offset + shift +
111 adjusted_iter->original_length) <= first_iter->original_offset)) {
112 // Entire |adjusted_iter| (accounting for its shift and including its
113 // whole original length) comes before |first_iter|.
114 //
115 // Correct the offset at |adjusted_iter| and move onto the next
116 // adjustment that needs revising.
117 adjusted_iter->original_offset += shift;
118 shift += currently_collapsing;
119 currently_collapsing = 0;
120 ++adjusted_iter;
121 } else if ((adjusted_iter->original_offset + shift) >
122 first_iter->original_offset) {
123 // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|).
124
125 // It's not possible for the adjustments to overlap. (It shouldn't
126 // be possible that we have an |adjusted_iter->original_offset| that,
127 // when adjusted by the computed |shift|, is in the middle of
[email protected]e28197652014-04-23 04:28:38128 // |first_iter|'s output's length. After all, that would mean the
129 // current adjustment_on_adjusted_string somehow points to an offset
130 // that was supposed to have been eliminated by the first set of
131 // adjustments.)
132 DCHECK_LE(first_iter->original_offset + first_iter->output_length,
[email protected]a97376e2014-04-18 20:54:44133 adjusted_iter->original_offset + shift);
134
135 // Add the |first_adjustment_iter| to the full set of adjustments while
136 // making sure |adjusted_iter| continues pointing to the same element.
137 // We do this by inserting the |first_adjustment_iter| right before
138 // |adjusted_iter|, then incrementing |adjusted_iter| so it points to
139 // the following element.
140 shift += first_iter->original_length - first_iter->output_length;
141 adjusted_iter = adjustments_on_adjusted_string->insert(
142 adjusted_iter, *first_iter);
143 ++adjusted_iter;
144 ++first_iter;
145 } else {
146 // The first adjustment adjusted something that then got further adjusted
147 // by the second set of adjustments. In other words, |first_iter| points
148 // to something in the range covered by |adjusted_iter|'s length (after
149 // accounting for |shift|). Precisely,
150 // adjusted_iter->original_offset + shift
151 // <=
152 // first_iter->original_offset
153 // <=
154 // adjusted_iter->original_offset + shift +
155 // adjusted_iter->original_length
156
157 // Modify the current |adjusted_iter| to include whatever collapsing
158 // happened in |first_iter|, then advance to the next |first_adjustments|
159 // because we dealt with the current one.
160 const int collapse = static_cast<int>(first_iter->original_length) -
161 static_cast<int>(first_iter->output_length);
162 // This function does not know how to deal with a string that expands and
163 // then gets modified, only strings that collapse and then get modified.
164 DCHECK_GT(collapse, 0);
165 adjusted_iter->original_length += collapse;
166 currently_collapsing += collapse;
167 ++first_iter;
168 }
169 }
170 DCHECK_EQ(0u, currently_collapsing);
171 if (first_iter != first_adjustments.end()) {
172 // Only first adjustments are left. These do not need to be modified.
173 // (Their offsets are already correct with respect to the original string.)
174 // Append them all.
175 DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());
176 adjustments_on_adjusted_string->insert(
177 adjustments_on_adjusted_string->end(), first_iter,
178 first_adjustments.end());
179 }
180}
181
[email protected]b9f93832009-11-13 19:27:48182// Converts the given source Unicode character type to the given destination
183// Unicode character type as a STL string. The given input buffer and size
184// determine the source, and the given output STL string will be replaced by
[email protected]a97376e2014-04-18 20:54:44185// the result. If non-NULL, |adjustments| is set to reflect the all the
186// alterations to the string that are not one-character-to-one-character.
187// It will always be sorted by increasing offset.
[email protected]cbf35e172011-09-08 02:18:10188template<typename SrcChar, typename DestStdString>
189bool ConvertUnicode(const SrcChar* src,
[email protected]b9f93832009-11-13 19:27:48190 size_t src_len,
[email protected]cbf35e172011-09-08 02:18:10191 DestStdString* output,
[email protected]a97376e2014-04-18 20:54:44192 OffsetAdjuster::Adjustments* adjustments) {
193 if (adjustments)
194 adjustments->clear();
[email protected]b9f93832009-11-13 19:27:48195 // ICU requires 32-bit numbers.
196 bool success = true;
avi84f37e12015-12-25 09:31:42197 int32_t src_len32 = static_cast<int32_t>(src_len);
198 for (int32_t i = 0; i < src_len32; i++) {
199 uint32_t code_point;
[email protected]b9f93832009-11-13 19:27:48200 size_t original_i = i;
201 size_t chars_written = 0;
202 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
203 chars_written = WriteUnicodeCharacter(code_point, output);
204 } else {
[email protected]d7a3e8e2010-01-01 22:16:38205 chars_written = WriteUnicodeCharacter(0xFFFD, output);
[email protected]b9f93832009-11-13 19:27:48206 success = false;
207 }
[email protected]a97376e2014-04-18 20:54:44208
209 // Only bother writing an adjustment if this modification changed the
210 // length of this character.
211 // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
212 // character read, not after it (so that incrementing it in the loop
213 // increment will place it at the right location), so we need to account
214 // for that in determining the amount that was read.
215 if (adjustments && ((i - original_i + 1) != chars_written)) {
216 adjustments->push_back(OffsetAdjuster::Adjustment(
217 original_i, i - original_i + 1, chars_written));
[email protected]b9f93832009-11-13 19:27:48218 }
219 }
[email protected]b9f93832009-11-13 19:27:48220 return success;
221}
222
[email protected]a97376e2014-04-18 20:54:44223bool UTF8ToUTF16WithAdjustments(
224 const char* src,
225 size_t src_len,
226 string16* output,
227 base::OffsetAdjuster::Adjustments* adjustments) {
[email protected]b9f93832009-11-13 19:27:48228 PrepareForUTF16Or32Output(src, src_len, output);
[email protected]a97376e2014-04-18 20:54:44229 return ConvertUnicode(src, src_len, output, adjustments);
[email protected]421de2ab2011-04-13 18:43:05230}
231
[email protected]a97376e2014-04-18 20:54:44232string16 UTF8ToUTF16WithAdjustments(
233 const base::StringPiece& utf8,
234 base::OffsetAdjuster::Adjustments* adjustments) {
[email protected]04866c42011-05-03 20:03:50235 string16 result;
[email protected]a97376e2014-04-18 20:54:44236 UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments);
[email protected]421de2ab2011-04-13 18:43:05237 return result;
238}
239
[email protected]04866c42011-05-03 20:03:50240string16 UTF8ToUTF16AndAdjustOffsets(
241 const base::StringPiece& utf8,
[email protected]421de2ab2011-04-13 18:43:05242 std::vector<size_t>* offsets_for_adjustment) {
tommycli7a4241c2017-07-13 01:37:32243 for (size_t& offset : *offsets_for_adjustment) {
244 if (offset > utf8.length())
245 offset = string16::npos;
246 }
[email protected]a97376e2014-04-18 20:54:44247 OffsetAdjuster::Adjustments adjustments;
248 string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments);
249 OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
[email protected]cbf35e172011-09-08 02:18:10250 return result;
251}
252
253std::string UTF16ToUTF8AndAdjustOffsets(
254 const base::StringPiece16& utf16,
255 std::vector<size_t>* offsets_for_adjustment) {
tommycli7a4241c2017-07-13 01:37:32256 for (size_t& offset : *offsets_for_adjustment) {
257 if (offset > utf16.length())
258 offset = string16::npos;
259 }
[email protected]cbf35e172011-09-08 02:18:10260 std::string result;
261 PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
[email protected]a97376e2014-04-18 20:54:44262 OffsetAdjuster::Adjustments adjustments;
263 ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments);
264 OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
[email protected]cbf35e172011-09-08 02:18:10265 return result;
266}
267
[email protected]a3f721892013-02-07 03:59:06268} // namespace base