Blame - base/strings/utf_offset_string_conversions.cc - chromium/src.git

blob: b91ee03832663e800d701409c22508024600c99d [file] [log] [blame]

[email protected]	421de2ab	2011-04-13 18:43:05	[diff] [blame]	1	// Copyright (c) 2011 The Chromium Authors. All rights reserved.
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
[email protected]	a3f72189	2013-02-07 03:59:06	[diff] [blame]	5	#include "base/strings/utf_offset_string_conversions.h"
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	6
avi	84f37e1	2015-12-25 09:31:42	[diff] [blame]	7	#include <stdint.h>
				8
[email protected]	421de2ab	2011-04-13 18:43:05	[diff] [blame]	9	#include <algorithm>
dcheng	093de9b	2016-04-04 21:25:51	[diff] [blame]	10	#include <memory>
[email protected]	421de2ab	2011-04-13 18:43:05	[diff] [blame]	11
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	12	#include "base/logging.h"
[email protected]	eb62f726	2013-03-30 14:29:00	[diff] [blame]	13	#include "base/strings/string_piece.h"
[email protected]	a3f72189	2013-02-07 03:59:06	[diff] [blame]	14	#include "base/strings/utf_string_conversion_utils.h"
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	15
[email protected]	a3f72189	2013-02-07 03:59:06	[diff] [blame]	16	namespace base {
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	17
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	18	OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
				19	size_t original_length,
				20	size_t output_length)
				21	: original_offset(original_offset),
				22	original_length(original_length),
				23	output_length(output_length) {
				24	}
				25
				26	// static
tommycli	7a4241c	2017-07-13 01:37:32	[diff] [blame]	27	void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments,
				28	std::vector<size_t>* offsets_for_adjustment,
				29	size_t limit) {
				30	DCHECK(offsets_for_adjustment);
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	31	for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin());
				32	i != offsets_for_adjustment->end(); ++i)
tommycli	7a4241c	2017-07-13 01:37:32	[diff] [blame]	33	AdjustOffset(adjustments, &(*i), limit);
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	34	}
				35
				36	// static
				37	void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,
tommycli	7a4241c	2017-07-13 01:37:32	[diff] [blame]	38	size_t* offset,
				39	size_t limit) {
				40	DCHECK(offset);
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	41	if (*offset == string16::npos)
				42	return;
[email protected]	529d4b5	2014-04-28 19:37:23	[diff] [blame]	43	int adjustment = 0;
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	44	for (Adjustments::const_iterator i = adjustments.begin();
				45	i != adjustments.end(); ++i) {
				46	if (*offset <= i->original_offset)
				47	break;
				48	if (*offset < (i->original_offset + i->original_length)) {
				49	*offset = string16::npos;
				50	return;
				51	}
[email protected]	529d4b5	2014-04-28 19:37:23	[diff] [blame]	52	adjustment += static_cast<int>(i->original_length - i->output_length);
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	53	}
				54	*offset -= adjustment;
tommycli	7a4241c	2017-07-13 01:37:32	[diff] [blame]	55
				56	if (*offset > limit)
				57	*offset = string16::npos;
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	58	}
				59
				60	// static
[email protected]	529d4b5	2014-04-28 19:37:23	[diff] [blame]	61	void OffsetAdjuster::UnadjustOffsets(
				62	const Adjustments& adjustments,
				63	std::vector<size_t>* offsets_for_unadjustment) {
				64	if (!offsets_for_unadjustment \|\| adjustments.empty())
				65	return;
				66	for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin());
				67	i != offsets_for_unadjustment->end(); ++i)
				68	UnadjustOffset(adjustments, &(*i));
				69	}
				70
				71	// static
				72	void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,
				73	size_t* offset) {
				74	if (*offset == string16::npos)
				75	return;
				76	int adjustment = 0;
				77	for (Adjustments::const_iterator i = adjustments.begin();
				78	i != adjustments.end(); ++i) {
				79	if (*offset + adjustment <= i->original_offset)
				80	break;
				81	adjustment += static_cast<int>(i->original_length - i->output_length);
				82	if ((*offset + adjustment) <
				83	(i->original_offset + i->original_length)) {
				84	*offset = string16::npos;
				85	return;
				86	}
				87	}
				88	*offset += adjustment;
				89	}
				90
				91	// static
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	92	void OffsetAdjuster::MergeSequentialAdjustments(
				93	const Adjustments& first_adjustments,
				94	Adjustments* adjustments_on_adjusted_string) {
				95	Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin();
				96	Adjustments::const_iterator first_iter = first_adjustments.begin();
				97	// Simultaneously iterate over all \|adjustments_on_adjusted_string\| and
				98	// \|first_adjustments\|, adding adjustments to or correcting the adjustments
				99	// in \|adjustments_on_adjusted_string\| as we go. \|shift\| keeps track of the
				100	// current number of characters collapsed by \|first_adjustments\| up to this
				101	// point. \|currently_collapsing\| keeps track of the number of characters
				102	// collapsed by \|first_adjustments\| into the current \|adjusted_iter\|'s
				103	// length. These are characters that will change \|shift\| as soon as we're
				104	// done processing the current \|adjusted_iter\|; they are not yet reflected in
				105	// \|shift\|.
				106	size_t shift = 0;
				107	size_t currently_collapsing = 0;
				108	while (adjusted_iter != adjustments_on_adjusted_string->end()) {
				109	if ((first_iter == first_adjustments.end()) \|\|
				110	((adjusted_iter->original_offset + shift +
				111	adjusted_iter->original_length) <= first_iter->original_offset)) {
				112	// Entire \|adjusted_iter\| (accounting for its shift and including its
				113	// whole original length) comes before \|first_iter\|.
				114	//
				115	// Correct the offset at \|adjusted_iter\| and move onto the next
				116	// adjustment that needs revising.
				117	adjusted_iter->original_offset += shift;
				118	shift += currently_collapsing;
				119	currently_collapsing = 0;
				120	++adjusted_iter;
				121	} else if ((adjusted_iter->original_offset + shift) >
				122	first_iter->original_offset) {
				123	// \|first_iter\| comes before the \|adjusted_iter\| (as adjusted by \|shift\|).
				124
				125	// It's not possible for the adjustments to overlap. (It shouldn't
				126	// be possible that we have an \|adjusted_iter->original_offset\| that,
				127	// when adjusted by the computed \|shift\|, is in the middle of
[email protected]	e2819765	2014-04-23 04:28:38	[diff] [blame]	128	// \|first_iter\|'s output's length. After all, that would mean the
				129	// current adjustment_on_adjusted_string somehow points to an offset
				130	// that was supposed to have been eliminated by the first set of
				131	// adjustments.)
				132	DCHECK_LE(first_iter->original_offset + first_iter->output_length,
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	133	adjusted_iter->original_offset + shift);
				134
				135	// Add the \|first_adjustment_iter\| to the full set of adjustments while
				136	// making sure \|adjusted_iter\| continues pointing to the same element.
				137	// We do this by inserting the \|first_adjustment_iter\| right before
				138	// \|adjusted_iter\|, then incrementing \|adjusted_iter\| so it points to
				139	// the following element.
				140	shift += first_iter->original_length - first_iter->output_length;
				141	adjusted_iter = adjustments_on_adjusted_string->insert(
				142	adjusted_iter, *first_iter);
				143	++adjusted_iter;
				144	++first_iter;
				145	} else {
				146	// The first adjustment adjusted something that then got further adjusted
				147	// by the second set of adjustments. In other words, \|first_iter\| points
				148	// to something in the range covered by \|adjusted_iter\|'s length (after
				149	// accounting for \|shift\|). Precisely,
				150	// adjusted_iter->original_offset + shift
				151	// <=
				152	// first_iter->original_offset
				153	// <=
				154	// adjusted_iter->original_offset + shift +
				155	// adjusted_iter->original_length
				156
				157	// Modify the current \|adjusted_iter\| to include whatever collapsing
				158	// happened in \|first_iter\|, then advance to the next \|first_adjustments\|
				159	// because we dealt with the current one.
				160	const int collapse = static_cast<int>(first_iter->original_length) -
				161	static_cast<int>(first_iter->output_length);
				162	// This function does not know how to deal with a string that expands and
				163	// then gets modified, only strings that collapse and then get modified.
				164	DCHECK_GT(collapse, 0);
				165	adjusted_iter->original_length += collapse;
				166	currently_collapsing += collapse;
				167	++first_iter;
				168	}
				169	}
				170	DCHECK_EQ(0u, currently_collapsing);
				171	if (first_iter != first_adjustments.end()) {
				172	// Only first adjustments are left. These do not need to be modified.
				173	// (Their offsets are already correct with respect to the original string.)
				174	// Append them all.
				175	DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());
				176	adjustments_on_adjusted_string->insert(
				177	adjustments_on_adjusted_string->end(), first_iter,
				178	first_adjustments.end());
				179	}
				180	}
				181
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	182	// Converts the given source Unicode character type to the given destination
				183	// Unicode character type as a STL string. The given input buffer and size
				184	// determine the source, and the given output STL string will be replaced by
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	185	// the result. If non-NULL, \|adjustments\| is set to reflect the all the
				186	// alterations to the string that are not one-character-to-one-character.
				187	// It will always be sorted by increasing offset.
[email protected]	cbf35e17	2011-09-08 02:18:10	[diff] [blame]	188	template<typename SrcChar, typename DestStdString>
				189	bool ConvertUnicode(const SrcChar* src,
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	190	size_t src_len,
[email protected]	cbf35e17	2011-09-08 02:18:10	[diff] [blame]	191	DestStdString* output,
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	192	OffsetAdjuster::Adjustments* adjustments) {
				193	if (adjustments)
				194	adjustments->clear();
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	195	// ICU requires 32-bit numbers.
				196	bool success = true;
avi	84f37e1	2015-12-25 09:31:42	[diff] [blame]	197	int32_t src_len32 = static_cast<int32_t>(src_len);
				198	for (int32_t i = 0; i < src_len32; i++) {
				199	uint32_t code_point;
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	200	size_t original_i = i;
				201	size_t chars_written = 0;
				202	if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
				203	chars_written = WriteUnicodeCharacter(code_point, output);
				204	} else {
[email protected]	d7a3e8e	2010-01-01 22:16:38	[diff] [blame]	205	chars_written = WriteUnicodeCharacter(0xFFFD, output);
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	206	success = false;
				207	}
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	208
				209	// Only bother writing an adjustment if this modification changed the
				210	// length of this character.
				211	// NOTE: ReadUnicodeCharacter() adjusts \|i\| to point _at_ the last
				212	// character read, not after it (so that incrementing it in the loop
				213	// increment will place it at the right location), so we need to account
				214	// for that in determining the amount that was read.
				215	if (adjustments && ((i - original_i + 1) != chars_written)) {
				216	adjustments->push_back(OffsetAdjuster::Adjustment(
				217	original_i, i - original_i + 1, chars_written));
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	218	}
				219	}
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	220	return success;
				221	}
				222
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	223	bool UTF8ToUTF16WithAdjustments(
				224	const char* src,
				225	size_t src_len,
				226	string16* output,
				227	base::OffsetAdjuster::Adjustments* adjustments) {
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	228	PrepareForUTF16Or32Output(src, src_len, output);
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	229	return ConvertUnicode(src, src_len, output, adjustments);
[email protected]	421de2ab	2011-04-13 18:43:05	[diff] [blame]	230	}
				231
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	232	string16 UTF8ToUTF16WithAdjustments(
				233	const base::StringPiece& utf8,
				234	base::OffsetAdjuster::Adjustments* adjustments) {
[email protected]	04866c4	2011-05-03 20:03:50	[diff] [blame]	235	string16 result;
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	236	UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments);
[email protected]	421de2ab	2011-04-13 18:43:05	[diff] [blame]	237	return result;
				238	}
				239
[email protected]	04866c4	2011-05-03 20:03:50	[diff] [blame]	240	string16 UTF8ToUTF16AndAdjustOffsets(
				241	const base::StringPiece& utf8,
[email protected]	421de2ab	2011-04-13 18:43:05	[diff] [blame]	242	std::vector<size_t>* offsets_for_adjustment) {
tommycli	7a4241c	2017-07-13 01:37:32	[diff] [blame]	243	for (size_t& offset : *offsets_for_adjustment) {
				244	if (offset > utf8.length())
				245	offset = string16::npos;
				246	}
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	247	OffsetAdjuster::Adjustments adjustments;
				248	string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments);
				249	OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
[email protected]	cbf35e17	2011-09-08 02:18:10	[diff] [blame]	250	return result;
				251	}
				252
				253	std::string UTF16ToUTF8AndAdjustOffsets(
				254	const base::StringPiece16& utf16,
				255	std::vector<size_t>* offsets_for_adjustment) {
tommycli	7a4241c	2017-07-13 01:37:32	[diff] [blame]	256	for (size_t& offset : *offsets_for_adjustment) {
				257	if (offset > utf16.length())
				258	offset = string16::npos;
				259	}
[email protected]	cbf35e17	2011-09-08 02:18:10	[diff] [blame]	260	std::string result;
				261	PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
[email protected]	a97376e	2014-04-18 20:54:44	[diff] [blame]	262	OffsetAdjuster::Adjustments adjustments;
				263	ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments);
				264	OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
[email protected]	cbf35e17	2011-09-08 02:18:10	[diff] [blame]	265	return result;
				266	}
				267
[email protected]	a3f72189	2013-02-07 03:59:06	[diff] [blame]	268	} // namespace base