Blame - base/utf_string_conversion_utils.h - chromium/src.git

blob: 71dc5276ae700bca29e792757ad0fe5386b21e96 [file] [log] [blame]

[email protected]	23bb71f	2011-04-21 22:22:10	[diff] [blame]	1	// Copyright (c) 2011 The Chromium Authors. All rights reserved.
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#ifndef BASE_UTF_STRING_CONVERSION_UTILS_H_
				6	#define BASE_UTF_STRING_CONVERSION_UTILS_H_
[email protected]	32b76ef	2010-07-26 23:08:24	[diff] [blame]	7	#pragma once
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	8
				9	// This should only be used by the various UTF string conversion files.
				10
[email protected]	0bea725	2011-08-05 15:34:00	[diff] [blame]	11	#include "base/base_export.h"
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	12	#include "base/string16.h"
				13
				14	namespace base {
				15
				16	inline bool IsValidCodepoint(uint32 code_point) {
[email protected]	858d4887	2010-01-16 17:56:08	[diff] [blame]	17	// Excludes the surrogate code points ([0xD800, 0xDFFF]) and
				18	// codepoints larger than 0x10FFFF (the highest codepoint allowed).
				19	// Non-characters and unassigned codepoints are allowed.
				20	return code_point < 0xD800u \|\|
				21	(code_point >= 0xE000u && code_point <= 0x10FFFFu);
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	22	}
				23
[email protected]	bce55e27	2010-02-26 23:39:58	[diff] [blame]	24	inline bool IsValidCharacter(uint32 code_point) {
				25	// Excludes non-characters (U+FDD0..U+FDEF, and all codepoints ending in
				26	// 0xFFFE or 0xFFFF) from the set of valid code points.
				27	return code_point < 0xD800u \|\| (code_point >= 0xE000u &&
				28	code_point < 0xFDD0u) \|\| (code_point > 0xFDEFu &&
				29	code_point <= 0x10FFFFu && (code_point & 0xFFFEu) != 0xFFFEu);
				30	}
				31
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	32	// ReadUnicodeCharacter --------------------------------------------------------
				33
				34	// Reads a UTF-8 stream, placing the next code point into the given output
				35	// \|code_point\|. \|src\| represents the entire string to read, and \|char_index\|
				36	// is the character offset within the string to start reading at. \|*char_index\|
				37	// will be updated to index the last character read, such that incrementing it
				38	// (as in a for loop) will take the reader to the next character.
				39	//
				40	// Returns true on success. On false, \|*code_point\| will be invalid.
[email protected]	0bea725	2011-08-05 15:34:00	[diff] [blame]	41	BASE_EXPORT bool ReadUnicodeCharacter(const char* src,
				42	int32 src_len,
				43	int32* char_index,
				44	uint32* code_point_out);
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	45
				46	// Reads a UTF-16 character. The usage is the same as the 8-bit version above.
[email protected]	0bea725	2011-08-05 15:34:00	[diff] [blame]	47	BASE_EXPORT bool ReadUnicodeCharacter(const char16* src,
				48	int32 src_len,
				49	int32* char_index,
				50	uint32* code_point);
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	51
				52	#if defined(WCHAR_T_IS_UTF32)
				53	// Reads UTF-32 character. The usage is the same as the 8-bit version above.
[email protected]	0bea725	2011-08-05 15:34:00	[diff] [blame]	54	BASE_EXPORT bool ReadUnicodeCharacter(const wchar_t* src,
				55	int32 src_len,
				56	int32* char_index,
				57	uint32* code_point);
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	58	#endif // defined(WCHAR_T_IS_UTF32)
				59
				60	// WriteUnicodeCharacter -------------------------------------------------------
				61
				62	// Appends a UTF-8 character to the given 8-bit string. Returns the number of
				63	// bytes written.
[email protected]	23bb71f	2011-04-21 22:22:10	[diff] [blame]	64	// TODO(brettw) Bug 79631: This function should not be exposed.
[email protected]	0bea725	2011-08-05 15:34:00	[diff] [blame]	65	BASE_EXPORT size_t WriteUnicodeCharacter(uint32 code_point,
				66	std::string* output);
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	67
				68	// Appends the given code point as a UTF-16 character to the given 16-bit
				69	// string. Returns the number of 16-bit values written.
[email protected]	3757776a	2011-12-04 03:14:54	[diff] [blame^]	70	BASE_EXPORT size_t WriteUnicodeCharacter(uint32 code_point, string16* output);
[email protected]	b9f9383	2009-11-13 19:27:48	[diff] [blame]	71
				72	#if defined(WCHAR_T_IS_UTF32)
				73	// Appends the given UTF-32 character to the given 32-bit string. Returns the
				74	// number of 32-bit values written.
				75	inline size_t WriteUnicodeCharacter(uint32 code_point, std::wstring* output) {
				76	// This is the easy case, just append the character.
				77	output->push_back(code_point);
				78	return 1;
				79	}
				80	#endif // defined(WCHAR_T_IS_UTF32)
				81
				82	// Generalized Unicode converter -----------------------------------------------
				83
				84	// Guesses the length of the output in UTF-8 in bytes, clears that output
				85	// string, and reserves that amount of space. We assume that the input
				86	// character types are unsigned, which will be true for UTF-16 and -32 on our
				87	// systems.
				88	template<typename CHAR>
				89	void PrepareForUTF8Output(const CHAR* src, size_t src_len, std::string* output);
				90
				91	// Prepares an output buffer (containing either UTF-16 or -32 data) given some
				92	// UTF-8 input that will be converted to it. See PrepareForUTF8Output().
				93	template<typename STRING>
				94	void PrepareForUTF16Or32Output(const char* src, size_t src_len, STRING* output);
				95
				96	} // namespace base
				97
				98	#endif // BASE_UTF_STRING_CONVERSION_UTILS_H_