Blame - url/url_canon_internal.cc - chromium/src.git

[email protected]

51bcc5d

2013-04-24 01:41:37

[diff] [blame]

1

2

// Use of this source code is governed by a BSD-style license that can be

3

// found in the LICENSE file.

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

4

[email protected]

15f2283

2014-05-02 15:58:31

[diff] [blame]

5

#include "url/url_canon_internal.h"

6

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

7

#include <errno.h>

avi

c0c6031

2015-12-21 21:03:50

[diff] [blame]

8

#include <stddef.h>

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

9

#include <stdlib.h>

[email protected]

318076b

2013-04-18 21:19:45

[diff] [blame]

#include <cstdio>

2013-04-10 20:10:52

[diff] [blame]

#include <string>

2014-05-02 15:58:31

[diff] [blame]

14

#include "base/strings/utf_string_conversion_utils.h"

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

15

[email protected]

0318f92

2014-04-22 00:09:23

[diff] [blame]

16

namespace url {

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

namespace {

template<typename CHAR, typename UCHAR>

21

void DoAppendStringOfType(const CHAR* source, int length,

22

SharedCharTypes type,

23

CanonOutput* output) {

24

for (int i = 0; i < length; i++) {

25

if (static_cast<UCHAR>(source[i]) >= 0x80) {

26

// ReadChar will fill the code point with kUnicodeReplacementCharacter

27

// when the input is invalid, which is what we want.

28

unsigned code_point;

29

ReadUTFChar(source, &i, length, &code_point);

30

AppendUTF8EscapedValue(code_point, output);

31

} else {

32

// Just append the 7-bit character, possibly escaping it.

33

unsigned char uch = static_cast<unsigned char>(source[i]);

34

if (!IsCharOfType(uch, type))

35

AppendEscapedChar(uch, output);

36

else

37

output->push_back(uch);

}

}

}

// This function assumes the input values are all contained in 8-bit,

43

// although it allows any type. Returns true if input is valid, false if not.

44

template<typename CHAR, typename UCHAR>

45

void DoAppendInvalidNarrowString(const CHAR* spec, int begin, int end,

46

CanonOutput* output) {

47

for (int i = begin; i < end; i++) {

48

UCHAR uch = static_cast<UCHAR>(spec[i]);

49

if (uch >= 0x80) {

50

// Handle UTF-8/16 encodings. This call will correctly handle the error

51

// case by appending the invalid character.

52

AppendUTF8EscapedChar(spec, &i, end, output);

53

} else if (uch <= ' ' || uch == 0x7f) {

54

// This function is for error handling, so we escape all control

55

// characters and spaces, but not anything else since we lack

56

// context to do something more specific.

57

AppendEscapedChar(static_cast<unsigned char>(uch), output);

58

} else {

59

output->push_back(static_cast<char>(uch));

}

}

}

2014-04-22 00:09:23

[diff] [blame]

64

// Overrides one component, see the Replacements structure for

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

65

// what the various combionations of source pointer and component mean.

66

void DoOverrideComponent(const char* override_source,

[email protected]

0318f92

2014-04-22 00:09:23

[diff] [blame]

67

const Component& override_component,

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

68

const char** dest,

[email protected]

0318f92

2014-04-22 00:09:23

[diff] [blame]

69

Component* dest_component) {

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

70

if (override_source) {

71

*dest = override_source;

72

*dest_component = override_component;

}

}

// Similar to DoOverrideComponent except that it takes a UTF-16 input and does

77

// not actually set the output character pointer.

78

//

79

// The input is converted to UTF-8 at the end of the given buffer as a temporary

[email protected]

186e280

2013-11-03 05:00:13

[diff] [blame]

80

// holding place. The component identifying the portion of the buffer used in

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

81

// the |utf8_buffer| will be specified in |*dest_component|.

82

//

83

// This will not actually set any |dest| pointer like DoOverrideComponent

84

// does because all of the pointers will point into the |utf8_buffer|, which

85

// may get resized while we're overriding a subsequent component. Instead, the

86

// caller should use the beginning of the |utf8_buffer| as the string pointer

87

// for all components once all overrides have been prepared.

[email protected]

0318f92

2014-04-22 00:09:23

[diff] [blame]

88

bool PrepareUTF16OverrideComponent(const base::char16* override_source,

89

const Component& override_component,

90

CanonOutput* utf8_buffer,

91

Component* dest_component) {

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

92

bool success = true;

93

if (override_source) {

94

if (!override_component.is_valid()) {

95

// Non-"valid" component (means delete), so we need to preserve that.

[email protected]

0318f92

2014-04-22 00:09:23

[diff] [blame]

96

*dest_component = Component();

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

97

} else {

98

// Convert to UTF-8.

99

dest_component->begin = utf8_buffer->length();

100

success = ConvertUTF16ToUTF8(&override_source[override_component.begin],

101

override_component.len, utf8_buffer);

102

dest_component->len = utf8_buffer->length() - dest_component->begin;

}

}

return success;

}

} // namespace

// See the header file for this array's declaration.

111

const unsigned char kSharedCharTypeTable[0x100] = {

112

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00 - 0x0f

113

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10 - 0x1f

114

0, // 0x20 ' ' (escape spaces in queries)

115

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x21 !

116

0, // 0x22 "

117

0, // 0x23 # (invalid in query since it marks the ref)

118

CHAR_QUERY | CHAR_USERINFO, // 0x24 $

119

CHAR_QUERY | CHAR_USERINFO, // 0x25 %

120

CHAR_QUERY | CHAR_USERINFO, // 0x26 &

[email protected]

e60479fb

2013-09-24 03:18:40

[diff] [blame]

121

0, // 0x27 ' (Try to prevent XSS.)

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

122

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x28 (

123

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x29 )

124

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x2a *

125

CHAR_QUERY | CHAR_USERINFO, // 0x2b +

126

CHAR_QUERY | CHAR_USERINFO, // 0x2c ,

127

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x2d -

128

CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x2e .

129

CHAR_QUERY, // 0x2f /

CHAR_QUERY, // 0x3a :

141

CHAR_QUERY, // 0x3b ;

142

0, // 0x3c < (Try to prevent certain types of XSS.)

143

CHAR_QUERY, // 0x3d =

144

0, // 0x3e > (Try to prevent certain types of XSS.)

145

CHAR_QUERY, // 0x3f ?

146

CHAR_QUERY, // 0x40 @

147

CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x41 A

148

CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x42 B

149

CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x43 C

150

CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x44 D

151

CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x45 E

152

CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x46 F

153

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x47 G

154

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x48 H

155

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x49 I

156

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4a J

157

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4b K

158

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4c L

159

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4d M

160

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4e N

161

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4f O

162

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x50 P

163

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x51 Q

164

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x52 R

165

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x53 S

166

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x54 T

167

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x55 U

168

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x56 V

169

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x57 W

170

CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x58 X

171

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x59 Y

172

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x5a Z

173

CHAR_QUERY, // 0x5b [

174

CHAR_QUERY, // 0x5c '\'

175

CHAR_QUERY, // 0x5d ]

176

CHAR_QUERY, // 0x5e ^

177

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x5f _

178

CHAR_QUERY, // 0x60 `

179

CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x61 a

180

CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x62 b

181

CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x63 c

182

CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x64 d

183

CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x65 e

184

CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x66 f

185

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x67 g

186

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x68 h

187

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x69 i

188

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6a j

189

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6b k

190

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6c l

191

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6d m

192

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6e n

193

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6f o

194

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x70 p

195

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x71 q

196

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x72 r

197

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x73 s

198

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x74 t

199

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x75 u

200

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x76 v

201

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x77 w

202

CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x78 x

203

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x79 y

204

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x7a z

205

CHAR_QUERY, // 0x7b {

206

CHAR_QUERY, // 0x7c |

207

CHAR_QUERY, // 0x7d }

208

CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x7e ~

209

0, // 0x7f

210

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0x8f

211

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 - 0x9f

212

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xa0 - 0xaf

213

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xb0 - 0xbf

214

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xc0 - 0xcf

215

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xd0 - 0xdf

216

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xe0 - 0xef

217

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff

218

};

219

220

// Uppercase hexadecimal digit for each value 0 - 15, indexed by that value.
const char kHexCharLookup[0x10] = {
    '0', '1', '2', '3',
    '4', '5', '6', '7',
    '8', '9', 'A', 'B',
    'C', 'D', 'E', 'F',
};

224

225

// One entry per block of 32 character values. Subtracting a block's entry
// from a hexadecimal digit character inside that block gives the digit's
// numeric value; blocks that contain no hexadecimal digits hold 0.
const char kCharToHexLookup[8] = {
    0,         // 0x00 - 0x1f
    '0',       // 0x20 - 0x3f: digits 0 - 9 are 0x30 - 0x39
    'A' - 10,  // 0x40 - 0x5f: letters A - F are 0x41 - 0x46
    'a' - 10,  // 0x60 - 0x7f: letters a - f are 0x61 - 0x66
    0,         // 0x80 - 0x9F
    0,         // 0xA0 - 0xBF
    0,         // 0xC0 - 0xDF
    0,         // 0xE0 - 0xFF
};

2013-06-11 21:21:57

[diff] [blame]

236

// Code point that the ReadUTFChar overloads in this file store when the input
// cannot be decoded to a valid character (U+FFFD).
const base::char16 kUnicodeReplacementCharacter = 0xFFFD;

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

237

238

void AppendStringOfType(const char* source, int length,

239

SharedCharTypes type,

240

CanonOutput* output) {

241

DoAppendStringOfType<char, unsigned char>(source, length, type, output);

}

2013-06-11 21:21:57

[diff] [blame]

244

// 16-bit overload: forwards to DoAppendStringOfType with base::char16 as both
// the character type and its unsigned view.
void AppendStringOfType(const base::char16* source,
                        int length,
                        SharedCharTypes type,
                        CanonOutput* output) {
  DoAppendStringOfType<base::char16, base::char16>(source, length, type,
                                                   output);
}

2014-05-02 15:58:31

[diff] [blame]

251

bool ReadUTFChar(const char* str, int* begin, int length,

252

unsigned* code_point_out) {

qyearsley

2bc727d

2015-08-14 20:17:15

[diff] [blame]

253

// This depends on ints and int32s being the same thing. If they're not, it

[email protected]

15f2283

2014-05-02 15:58:31

[diff] [blame]

254

// will fail to compile.

qyearsley

2bc727d

2015-08-14 20:17:15

[diff] [blame]

255

// TODO(mmenke): This should probably be fixed.

[email protected]

15f2283

2014-05-02 15:58:31

[diff] [blame]

256

if (!base::ReadUnicodeCharacter(str, length, begin, code_point_out) ||

257

!base::IsValidCharacter(*code_point_out)) {

258

*code_point_out = kUnicodeReplacementCharacter;

return false;

}

return true;

}

// Reads one character from the 16-bit string |str| via
// base::ReadUnicodeCharacter, passing |begin| as the position argument.
// Returns true and leaves the decoded value in |*code_point_out| when the
// read succeeds and base::IsValidCharacter accepts the result. Otherwise
// stores kUnicodeReplacementCharacter in |*code_point_out| and returns false.
bool ReadUTFChar(const base::char16* str, int* begin, int length,
                 unsigned* code_point_out) {
  // This depends on ints and int32s being the same thing. If they're not, it
  // will fail to compile.
  // TODO(mmenke): This should probably be fixed.
  const bool decoded =
      base::ReadUnicodeCharacter(str, length, begin, code_point_out);
  if (decoded && base::IsValidCharacter(*code_point_out))
    return true;

  *code_point_out = kUnicodeReplacementCharacter;
  return false;
}

2013-04-10 20:10:52

[diff] [blame]

277

void AppendInvalidNarrowString(const char* spec, int begin, int end,

278

CanonOutput* output) {

279

DoAppendInvalidNarrowString<char, unsigned char>(spec, begin, end, output);

}

2013-06-11 21:21:57

[diff] [blame]

282

// 16-bit overload: forwards to DoAppendInvalidNarrowString with base::char16
// as both the character type and its unsigned view.
void AppendInvalidNarrowString(const base::char16* spec,
                               int begin,
                               int end,
                               CanonOutput* output) {
  DoAppendInvalidNarrowString<base::char16, base::char16>(spec, begin, end,
                                                          output);
}

2013-06-11 21:21:57

[diff] [blame]

288

// Converts |input_len| UTF-16 units from |input| and appends the result to
// |output| with AppendUTF8Value. Every character is appended even when
// decoding fails (ReadUTFChar supplies kUnicodeReplacementCharacter in that
// case). Returns true only if every ReadUTFChar call succeeded.
bool ConvertUTF16ToUTF8(const base::char16* input, int input_len,
                        CanonOutput* output) {
  bool all_valid = true;
  int pos = 0;
  while (pos < input_len) {
    unsigned code_point;
    // ReadUTFChar may advance |pos| over additional units of the character.
    if (!ReadUTFChar(input, &pos, input_len, &code_point))
      all_valid = false;
    AppendUTF8Value(code_point, output);
    ++pos;
  }
  return all_valid;
}

bool ConvertUTF8ToUTF16(const char* input, int input_len,

[email protected]

3774f83

2013-06-11 21:21:57

[diff] [blame]

300

CanonOutputT<base::char16>* output) {

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

301

bool success = true;

302

for (int i = 0; i < input_len; i++) {

303

unsigned code_point;

304

success &= ReadUTFChar(input, &i, input_len, &code_point);

305

AppendUTF16Value(code_point, output);

}

return success;

}

void SetupOverrideComponents(const char* base,

311

const Replacements<char>& repl,

312

URLComponentSource<char>* source,

[email protected]

0318f92

2014-04-22 00:09:23

[diff] [blame]

313

Parsed* parsed) {

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

314

// Get the source and parsed structures of the things we are replacing.

315

const URLComponentSource<char>& repl_source = repl.sources();

[email protected]

0318f92

2014-04-22 00:09:23

[diff] [blame]

316

const Parsed& repl_parsed = repl.components();

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

317

318

DoOverrideComponent(repl_source.scheme, repl_parsed.scheme,

319

&source->scheme, &parsed->scheme);

320

DoOverrideComponent(repl_source.username, repl_parsed.username,

321

&source->username, &parsed->username);

322

DoOverrideComponent(repl_source.password, repl_parsed.password,

323

&source->password, &parsed->password);

324

325

// Our host should be empty if not present, so override the default setup.

326

DoOverrideComponent(repl_source.host, repl_parsed.host,

327

&source->host, &parsed->host);

328

if (parsed->host.len == -1)

329

parsed->host.len = 0;

330

331

DoOverrideComponent(repl_source.port, repl_parsed.port,

332

&source->port, &parsed->port);

333

DoOverrideComponent(repl_source.path, repl_parsed.path,

334

&source->path, &parsed->path);

335

DoOverrideComponent(repl_source.query, repl_parsed.query,

336

&source->query, &parsed->query);

337

DoOverrideComponent(repl_source.ref, repl_parsed.ref,

338

&source->ref, &parsed->ref);

339

}

340

341

bool SetupUTF16OverrideComponents(const char* base,

[email protected]

3774f83

2013-06-11 21:21:57

[diff] [blame]

342

const Replacements<base::char16>& repl,

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

343

CanonOutput* utf8_buffer,

344

URLComponentSource<char>* source,

[email protected]

0318f92

2014-04-22 00:09:23

[diff] [blame]

345

Parsed* parsed) {

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

346

bool success = true;

347

348

// Get the source and parsed structures of the things we are replacing.

[email protected]

3774f83

2013-06-11 21:21:57

[diff] [blame]

349

const URLComponentSource<base::char16>& repl_source = repl.sources();

[email protected]

0318f92

2014-04-22 00:09:23

[diff] [blame]

350

const Parsed& repl_parsed = repl.components();

[email protected]

e7bba5f8

2013-04-10 20:10:52

[diff] [blame]

351

352