blob: c5ce647a99601f2d6b1307e28644395cdd27a170 [file] [log] [blame]
[email protected]421de2ab2011-04-13 18:43:051// Copyright (c) 2011 The Chromium Authors. All rights reserved.
[email protected]b9f93832009-11-13 19:27:482// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
avi84f37e12015-12-25 09:31:425#include <stddef.h>
6
[email protected]421de2ab2011-04-13 18:43:057#include <algorithm>
8
[email protected]b9f93832009-11-13 19:27:489#include "base/logging.h"
avi84f37e12015-12-25 09:31:4210#include "base/macros.h"
[email protected]eb62f7262013-03-30 14:29:0011#include "base/strings/string_piece.h"
[email protected]a3f721892013-02-07 03:59:0612#include "base/strings/utf_offset_string_conversions.h"
[email protected]b9f93832009-11-13 19:27:4813#include "testing/gtest/include/gtest/gtest.h"
14
15namespace base {
16
17namespace {
18
[email protected]04866c42011-05-03 20:03:5019static const size_t kNpos = string16::npos;
[email protected]b9f93832009-11-13 19:27:4820
21} // namespace
22
23TEST(UTFOffsetStringConversionsTest, AdjustOffset) {
[email protected]04866c42011-05-03 20:03:5024 struct UTF8ToUTF16Case {
[email protected]b9f93832009-11-13 19:27:4825 const char* utf8;
26 size_t input_offset;
27 size_t output_offset;
[email protected]04866c42011-05-03 20:03:5028 } utf8_to_utf16_cases[] = {
[email protected]cf7ca8a2013-09-11 00:42:2829 {"", 0, 0},
30 {"", kNpos, kNpos},
[email protected]421de2ab2011-04-13 18:43:0531 {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, kNpos},
[email protected]b9f93832009-11-13 19:27:4832 {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1},
Mark Mentovai62ab7902017-11-06 22:26:0933 {"\xed\xb0\x80z", 3, 3},
[email protected]b9f93832009-11-13 19:27:4834 {"A\xF0\x90\x8C\x80z", 1, 1},
[email protected]421de2ab2011-04-13 18:43:0535 {"A\xF0\x90\x8C\x80z", 2, kNpos},
[email protected]b9f93832009-11-13 19:27:4836 {"A\xF0\x90\x8C\x80z", 5, 3},
[email protected]cf7ca8a2013-09-11 00:42:2837 {"A\xF0\x90\x8C\x80z", 6, 4},
38 {"A\xF0\x90\x8C\x80z", kNpos, kNpos},
[email protected]b9f93832009-11-13 19:27:4839 };
viettrungluu805eabb2014-10-16 04:02:4940 for (size_t i = 0; i < arraysize(utf8_to_utf16_cases); ++i) {
[email protected]a97376e2014-04-18 20:54:4441 const size_t offset = utf8_to_utf16_cases[i].input_offset;
42 std::vector<size_t> offsets;
43 offsets.push_back(offset);
44 UTF8ToUTF16AndAdjustOffsets(utf8_to_utf16_cases[i].utf8, &offsets);
45 EXPECT_EQ(utf8_to_utf16_cases[i].output_offset, offsets[0]);
[email protected]b9f93832009-11-13 19:27:4846 }
[email protected]cbf35e172011-09-08 02:18:1047
48 struct UTF16ToUTF8Case {
49 char16 utf16[10];
50 size_t input_offset;
51 size_t output_offset;
52 } utf16_to_utf8_cases[] = {
[email protected]cf7ca8a2013-09-11 00:42:2853 {{}, 0, 0},
[email protected]cbf35e172011-09-08 02:18:1054 // Converted to 3-byte utf-8 sequences
[email protected]cf7ca8a2013-09-11 00:42:2855 {{0x5909, 0x63DB}, 3, kNpos},
56 {{0x5909, 0x63DB}, 2, 6},
[email protected]cbf35e172011-09-08 02:18:1057 {{0x5909, 0x63DB}, 1, 3},
[email protected]cf7ca8a2013-09-11 00:42:2858 {{0x5909, 0x63DB}, 0, 0},
[email protected]cbf35e172011-09-08 02:18:1059 // Converted to 2-byte utf-8 sequences
60 {{'A', 0x00bc, 0x00be, 'z'}, 1, 1},
61 {{'A', 0x00bc, 0x00be, 'z'}, 2, 3},
62 {{'A', 0x00bc, 0x00be, 'z'}, 3, 5},
[email protected]cf7ca8a2013-09-11 00:42:2863 {{'A', 0x00bc, 0x00be, 'z'}, 4, 6},
[email protected]cbf35e172011-09-08 02:18:1064 // Surrogate pair
65 {{'A', 0xd800, 0xdf00, 'z'}, 1, 1},
66 {{'A', 0xd800, 0xdf00, 'z'}, 2, kNpos},
67 {{'A', 0xd800, 0xdf00, 'z'}, 3, 5},
[email protected]cf7ca8a2013-09-11 00:42:2868 {{'A', 0xd800, 0xdf00, 'z'}, 4, 6},
[email protected]cbf35e172011-09-08 02:18:1069 };
viettrungluu805eabb2014-10-16 04:02:4970 for (size_t i = 0; i < arraysize(utf16_to_utf8_cases); ++i) {
[email protected]cbf35e172011-09-08 02:18:1071 size_t offset = utf16_to_utf8_cases[i].input_offset;
[email protected]a97376e2014-04-18 20:54:4472 std::vector<size_t> offsets;
73 offsets.push_back(offset);
74 UTF16ToUTF8AndAdjustOffsets(utf16_to_utf8_cases[i].utf16, &offsets);
75 EXPECT_EQ(utf16_to_utf8_cases[i].output_offset, offsets[0]) << i;
[email protected]cbf35e172011-09-08 02:18:1076 }
[email protected]b9f93832009-11-13 19:27:4877}
78
[email protected]421de2ab2011-04-13 18:43:0579TEST(UTFOffsetStringConversionsTest, LimitOffsets) {
tommycli7a4241c2017-07-13 01:37:3280 const OffsetAdjuster::Adjustments kNoAdjustments;
[email protected]421de2ab2011-04-13 18:43:0581 const size_t kLimit = 10;
82 const size_t kItems = 20;
83 std::vector<size_t> size_ts;
tommycli7a4241c2017-07-13 01:37:3284 for (size_t t = 0; t < kItems; ++t) {
[email protected]421de2ab2011-04-13 18:43:0585 size_ts.push_back(t);
tommycli7a4241c2017-07-13 01:37:3286 OffsetAdjuster::AdjustOffset(kNoAdjustments, &size_ts.back(), kLimit);
87 }
[email protected]421de2ab2011-04-13 18:43:0588 size_t unlimited_count = 0;
89 for (std::vector<size_t>::iterator ti = size_ts.begin(); ti != size_ts.end();
90 ++ti) {
[email protected]cf7ca8a2013-09-11 00:42:2891 if (*ti != kNpos)
[email protected]421de2ab2011-04-13 18:43:0592 ++unlimited_count;
93 }
[email protected]cf7ca8a2013-09-11 00:42:2894 EXPECT_EQ(11U, unlimited_count);
[email protected]421de2ab2011-04-13 18:43:0595
96 // Reverse the values in the vector and try again.
97 size_ts.clear();
tommycli7a4241c2017-07-13 01:37:3298 for (size_t t = kItems; t > 0; --t) {
[email protected]421de2ab2011-04-13 18:43:0599 size_ts.push_back(t - 1);
tommycli7a4241c2017-07-13 01:37:32100 OffsetAdjuster::AdjustOffset(kNoAdjustments, &size_ts.back(), kLimit);
101 }
[email protected]421de2ab2011-04-13 18:43:05102 unlimited_count = 0;
103 for (std::vector<size_t>::iterator ti = size_ts.begin(); ti != size_ts.end();
104 ++ti) {
[email protected]cf7ca8a2013-09-11 00:42:28105 if (*ti != kNpos)
[email protected]421de2ab2011-04-13 18:43:05106 ++unlimited_count;
107 }
[email protected]cf7ca8a2013-09-11 00:42:28108 EXPECT_EQ(11U, unlimited_count);
[email protected]421de2ab2011-04-13 18:43:05109}
110
111TEST(UTFOffsetStringConversionsTest, AdjustOffsets) {
112 // Imagine we have strings as shown in the following cases where the
113 // X's represent encoded characters.
114 // 1: abcXXXdef ==> abcXdef
[email protected]04866c42011-05-03 20:03:50115 {
116 std::vector<size_t> offsets;
[email protected]cf7ca8a2013-09-11 00:42:28117 for (size_t t = 0; t <= 9; ++t)
[email protected]04866c42011-05-03 20:03:50118 offsets.push_back(t);
[email protected]a97376e2014-04-18 20:54:44119 OffsetAdjuster::Adjustments adjustments;
120 adjustments.push_back(OffsetAdjuster::Adjustment(3, 3, 1));
121 OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
[email protected]cf7ca8a2013-09-11 00:42:28122 size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6, 7};
[email protected]04866c42011-05-03 20:03:50123 EXPECT_EQ(offsets.size(), arraysize(expected_1));
124 for (size_t i = 0; i < arraysize(expected_1); ++i)
125 EXPECT_EQ(expected_1[i], offsets[i]);
126 }
[email protected]421de2ab2011-04-13 18:43:05127
128 // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
[email protected]04866c42011-05-03 20:03:50129 {
130 std::vector<size_t> offsets;
[email protected]cf7ca8a2013-09-11 00:42:28131 for (size_t t = 0; t <= 23; ++t)
[email protected]04866c42011-05-03 20:03:50132 offsets.push_back(t);
[email protected]a97376e2014-04-18 20:54:44133 OffsetAdjuster::Adjustments adjustments;
134 adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 1));
135 adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 2));
136 adjustments.push_back(OffsetAdjuster::Adjustment(10, 7, 4));
137 adjustments.push_back(OffsetAdjuster::Adjustment(20, 3, 1));
138 OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
[email protected]cf7ca8a2013-09-11 00:42:28139 size_t expected_2[] = {
140 0, kNpos, kNpos, 1, 2, kNpos, kNpos, kNpos, 4, 5, 6, kNpos, kNpos, kNpos,
141 kNpos, kNpos, kNpos, 10, 11, 12, 13, kNpos, kNpos, 14
142 };
[email protected]04866c42011-05-03 20:03:50143 EXPECT_EQ(offsets.size(), arraysize(expected_2));
144 for (size_t i = 0; i < arraysize(expected_2); ++i)
145 EXPECT_EQ(expected_2[i], offsets[i]);
146 }
[email protected]421de2ab2011-04-13 18:43:05147
148 // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
[email protected]04866c42011-05-03 20:03:50149 {
150 std::vector<size_t> offsets;
[email protected]cf7ca8a2013-09-11 00:42:28151 for (size_t t = 0; t <= 17; ++t)
[email protected]04866c42011-05-03 20:03:50152 offsets.push_back(t);
[email protected]a97376e2014-04-18 20:54:44153 OffsetAdjuster::Adjustments adjustments;
154 adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 0));
155 adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 4));
156 adjustments.push_back(OffsetAdjuster::Adjustment(11, 3, 3));
157 adjustments.push_back(OffsetAdjuster::Adjustment(15, 2, 0));
158 OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
[email protected]cf7ca8a2013-09-11 00:42:28159 size_t expected_3[] = {
160 0, kNpos, kNpos, 0, 1, kNpos, kNpos, kNpos, 5, 6, 7, 8, kNpos, kNpos, 11,
161 12, kNpos, 12
162 };
[email protected]04866c42011-05-03 20:03:50163 EXPECT_EQ(offsets.size(), arraysize(expected_3));
164 for (size_t i = 0; i < arraysize(expected_3); ++i)
165 EXPECT_EQ(expected_3[i], offsets[i]);
166 }
[email protected]421de2ab2011-04-13 18:43:05167}
168
[email protected]529d4b52014-04-28 19:37:23169TEST(UTFOffsetStringConversionsTest, UnadjustOffsets) {
170 // Imagine we have strings as shown in the following cases where the
171 // X's represent encoded characters.
172 // 1: abcXXXdef ==> abcXdef
173 {
174 std::vector<size_t> offsets;
175 for (size_t t = 0; t <= 7; ++t)
176 offsets.push_back(t);
177 OffsetAdjuster::Adjustments adjustments;
178 adjustments.push_back(OffsetAdjuster::Adjustment(3, 3, 1));
179 OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
180 size_t expected_1[] = {0, 1, 2, 3, 6, 7, 8, 9};
181 EXPECT_EQ(offsets.size(), arraysize(expected_1));
182 for (size_t i = 0; i < arraysize(expected_1); ++i)
183 EXPECT_EQ(expected_1[i], offsets[i]);
184 }
185
186 // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
187 {
188 std::vector<size_t> offsets;
189 for (size_t t = 0; t <= 14; ++t)
190 offsets.push_back(t);
191 OffsetAdjuster::Adjustments adjustments;
192 adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 1));
193 adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 2));
194 adjustments.push_back(OffsetAdjuster::Adjustment(10, 7, 4));
195 adjustments.push_back(OffsetAdjuster::Adjustment(20, 3, 1));
196 OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
197 size_t expected_2[] = {
198 0, 3, 4, kNpos, 8, 9, 10, kNpos, kNpos, kNpos, 17, 18, 19, 20, 23
199 };
200 EXPECT_EQ(offsets.size(), arraysize(expected_2));
201 for (size_t i = 0; i < arraysize(expected_2); ++i)
202 EXPECT_EQ(expected_2[i], offsets[i]);
203 }
204
205 // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
206 {
207 std::vector<size_t> offsets;
208 for (size_t t = 0; t <= 12; ++t)
209 offsets.push_back(t);
210 OffsetAdjuster::Adjustments adjustments;
211 adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 0));
212 adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 4));
213 adjustments.push_back(OffsetAdjuster::Adjustment(11, 3, 3));
214 adjustments.push_back(OffsetAdjuster::Adjustment(15, 2, 0));
215 OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
216 size_t expected_3[] = {
217 0, // this could just as easily be 3
218 4, kNpos, kNpos, kNpos, 8, 9, 10, 11, kNpos, kNpos, 14,
219 15 // this could just as easily be 17
220 };
221 EXPECT_EQ(offsets.size(), arraysize(expected_3));
222 for (size_t i = 0; i < arraysize(expected_3); ++i)
223 EXPECT_EQ(expected_3[i], offsets[i]);
224 }
225}
226
[email protected]a97376e2014-04-18 20:54:44227// MergeSequentialAdjustments is used by net/base/escape.{h,cc} and
228// net/base/net_util.{h,cc}. The two tests EscapeTest.AdjustOffset and
229// NetUtilTest.FormatUrlWithOffsets test its behavior extensively. This
230// is simply a short, additional test.
231TEST(UTFOffsetStringConversionsTest, MergeSequentialAdjustments) {
232 // Pretend the input string is "abcdefghijklmnopqrstuvwxyz".
233
234 // Set up |first_adjustments| to
235 // - remove the leading "a"
236 // - combine the "bc" into one character (call it ".")
237 // - remove the "f"
238 // - remove the "tuv"
239 // The resulting string should be ".deghijklmnopqrswxyz".
240 OffsetAdjuster::Adjustments first_adjustments;
241 first_adjustments.push_back(OffsetAdjuster::Adjustment(0, 1, 0));
242 first_adjustments.push_back(OffsetAdjuster::Adjustment(1, 2, 1));
243 first_adjustments.push_back(OffsetAdjuster::Adjustment(5, 1, 0));
244 first_adjustments.push_back(OffsetAdjuster::Adjustment(19, 3, 0));
245
246 // Set up |adjustments_on_adjusted_string| to
247 // - combine the "." character that replaced "bc" with "d" into one character
248 // (call it "?")
249 // - remove the "egh"
250 // - expand the "i" into two characters (call them "12")
251 // - combine the "jkl" into one character (call it "@")
252 // - expand the "z" into two characters (call it "34")
253 // The resulting string should be "?12@mnopqrswxy34".
254 OffsetAdjuster::Adjustments adjustments_on_adjusted_string;
255 adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
256 0, 2, 1));
257 adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
258 2, 3, 0));
259 adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
260 5, 1, 2));
261 adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
262 6, 3, 1));
263 adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
264 19, 1, 2));
265
266 // Now merge the adjustments and check the results.
267 OffsetAdjuster::MergeSequentialAdjustments(first_adjustments,
268 &adjustments_on_adjusted_string);
269 // The merged adjustments should look like
270 // - combine abcd into "?"
271 // - note: it's also reasonable for the Merge function to instead produce
272 // two adjustments instead of this, one to remove a and another to
273 // combine bcd into "?". This test verifies the current behavior.
274 // - remove efgh
275 // - expand i into "12"
276 // - combine jkl into "@"
277 // - remove tuv
278 // - expand z into "34"
279 ASSERT_EQ(6u, adjustments_on_adjusted_string.size());
280 EXPECT_EQ(0u, adjustments_on_adjusted_string[0].original_offset);
281 EXPECT_EQ(4u, adjustments_on_adjusted_string[0].original_length);
282 EXPECT_EQ(1u, adjustments_on_adjusted_string[0].output_length);
283 EXPECT_EQ(4u, adjustments_on_adjusted_string[1].original_offset);
284 EXPECT_EQ(4u, adjustments_on_adjusted_string[1].original_length);
285 EXPECT_EQ(0u, adjustments_on_adjusted_string[1].output_length);
286 EXPECT_EQ(8u, adjustments_on_adjusted_string[2].original_offset);
287 EXPECT_EQ(1u, adjustments_on_adjusted_string[2].original_length);
288 EXPECT_EQ(2u, adjustments_on_adjusted_string[2].output_length);
289 EXPECT_EQ(9u, adjustments_on_adjusted_string[3].original_offset);
290 EXPECT_EQ(3u, adjustments_on_adjusted_string[3].original_length);
291 EXPECT_EQ(1u, adjustments_on_adjusted_string[3].output_length);
292 EXPECT_EQ(19u, adjustments_on_adjusted_string[4].original_offset);
293 EXPECT_EQ(3u, adjustments_on_adjusted_string[4].original_length);
294 EXPECT_EQ(0u, adjustments_on_adjusted_string[4].output_length);
295 EXPECT_EQ(25u, adjustments_on_adjusted_string[5].original_offset);
296 EXPECT_EQ(1u, adjustments_on_adjusted_string[5].original_length);
297 EXPECT_EQ(2u, adjustments_on_adjusted_string[5].output_length);
298}
299
danakjc3762b92015-03-07 01:51:42300} // namespace base