Avi Drissman | 6459548 | 2022-09-14 20:52:29 | [diff] [blame] | 1 | // Copyright 2013 The Chromium Authors |
[email protected] | ca93c2aa | 2013-01-31 17:41:01 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
eroman | f6c3a1e6 | 2016-02-09 23:13:45 | [diff] [blame] | 5 | // This file contains a set of utility functions related to parsing, |
| 6 | // manipulating, and interacting with URLs and hostnames. These functions are |
| 7 | // intended to be of a text-processing nature, and should not attempt to use any |
| 8 | // networking or blocking services. |
| 9 | |
[email protected] | ca93c2aa | 2013-01-31 17:41:01 | [diff] [blame] | 10 | #ifndef NET_BASE_URL_UTIL_H_ |
| 11 | #define NET_BASE_URL_UTIL_H_ |
| 12 | |
| 13 | #include <string> |
| 14 | |
Ali Hijazi | 5517919 | 2022-11-09 16:28:51 | [diff] [blame] | 15 | #include "base/memory/raw_ref.h" |
tfarina | 77021d6 | 2015-10-11 20:19:03 | [diff] [blame] | 16 | #include "base/strings/string_piece.h" |
[email protected] | ca93c2aa | 2013-01-31 17:41:01 | [diff] [blame] | 17 | #include "net/base/net_export.h" |
Anudeep Palanki | c22db42 | 2022-10-07 19:36:26 | [diff] [blame] | 18 | #include "third_party/abseil-cpp/absl/types/optional.h" |
tfarina | 018de6e | 2015-05-26 17:41:20 | [diff] [blame] | 19 | #include "url/third_party/mozilla/url_parse.h" |
[email protected] | ca93c2aa | 2013-01-31 17:41:01 | [diff] [blame] | 20 | |
| 21 | class GURL; |
| 22 | |
tfarina | 7a4a7fd | 2016-01-20 14:23:44 | [diff] [blame] | 23 | namespace url { |
| 24 | struct CanonHostInfo; |
Matt Menke | 12b8a5b6 | 2021-12-16 15:15:13 | [diff] [blame] | 25 | class SchemeHostPort; |
Eric Orth | 35126b6 | 2022-12-01 22:12:18 | [diff] [blame] | 26 | } // namespace url |
tfarina | 7a4a7fd | 2016-01-20 14:23:44 | [diff] [blame] | 27 | |
[email protected] | ca93c2aa | 2013-01-31 17:41:01 | [diff] [blame] | 28 | namespace net { |
| 29 | |
| 30 | // Returns a new GURL by appending the given query parameter name and the |
| 31 | // value. Unsafe characters in the name and the value are escaped like |
| 32 | // %XX%XX. The original query component is preserved if it's present. |
| 33 | // |
| 34 | // Examples: |
| 35 | // |
| 36 | // AppendQueryParameter(GURL("http://example.com"), "name", "value").spec() |
| 37 | // => "http://example.com?name=value" |
| 38 | // AppendQueryParameter(GURL("http://example.com?x=y"), "name", "value").spec() |
| 39 | // => "http://example.com?x=y&name=value" |
| 40 | NET_EXPORT GURL AppendQueryParameter(const GURL& url, |
David Benjamin | 044f897 | 2022-10-24 18:50:08 | [diff] [blame] | 41 | base::StringPiece name, |
| 42 | base::StringPiece value); |
[email protected] | ca93c2aa | 2013-01-31 17:41:01 | [diff] [blame] | 43 | |
| 44 | // Returns a new GURL by appending or replacing the given query parameter name |
Anudeep Palanki | c22db42 | 2022-10-07 19:36:26 | [diff] [blame] | 45 | // and the value. If `name` appears more than once, only the first name-value |
[email protected] | ca93c2aa | 2013-01-31 17:41:01 | [diff] [blame] | 46 | // pair is replaced. Unsafe characters in the name and the value are escaped |
| 47 | // like %XX%XX. The original query component is preserved if it's present. |
Anudeep Palanki | c22db42 | 2022-10-07 19:36:26 | [diff] [blame] | 48 | // Using `absl::nullopt` for `value` will remove the `name` parameter. |
[email protected] | ca93c2aa | 2013-01-31 17:41:01 | [diff] [blame] | 49 | // |
| 50 | // Examples: |
| 51 | // |
| 52 | // AppendOrReplaceQueryParameter( |
| 53 | // GURL("http://example.com"), "name", "new").spec() |
| 54 | // => "http://example.com?name=new" |
| 55 | // AppendOrReplaceQueryParameter( |
| 56 | // GURL("http://example.com?x=y&name=old"), "name", "new").spec() |
| 57 | // => "http://example.com?x=y&name=new" |
Anudeep Palanki | c22db42 | 2022-10-07 19:36:26 | [diff] [blame] | 58 | // AppendOrReplaceQueryParameter( |
| 59 | // GURL("http://example.com?x=y&name=old"), "name", absl::nullopt).spec() |
| 60 | // => "http://example.com?x=y&" |
| 61 | NET_EXPORT GURL |
| 62 | AppendOrReplaceQueryParameter(const GURL& url, |
David Benjamin | 044f897 | 2022-10-24 18:50:08 | [diff] [blame] | 63 | base::StringPiece name, |
Anudeep Palanki | c22db42 | 2022-10-07 19:36:26 | [diff] [blame] | 64 | absl::optional<base::StringPiece> value); |
[email protected] | ca93c2aa | 2013-01-31 17:41:01 | [diff] [blame] | 65 | |
Etienne Noel | 8e79032 | 2022-11-08 19:19:18 | [diff] [blame] | 66 | // Returns a new GURL by appending the provided ref (also named fragment). |
| 67 | // Unsafe characters are escaped. The original fragment is replaced |
| 68 | // if it's present. |
| 69 | // |
| 70 | // Examples: |
| 71 | // |
| 72 | // AppendOrReplaceRef( |
| 73 | // GURL("http://example.com"), "ref").spec() |
| 74 | // => "http://example.com#ref" |
| 75 | // AppendOrReplaceRef( |
| 76 | // GURL("http://example.com#ref"), "ref2").spec() |
| 77 | // => "http://example.com#ref2" |
| 78 | NET_EXPORT GURL AppendOrReplaceRef(const GURL& url, |
| 79 | const base::StringPiece& ref); |
| 80 | |
[email protected] | 1a643611 | 2013-10-09 02:49:58 | [diff] [blame] | 81 | // Iterates over the key-value pairs in the query portion of `url`. |
Alex Kalugin | 0ecaa65 | 2021-12-15 04:06:53 | [diff] [blame] | 82 | // NOTE: QueryIterator stores a reference to `url` and creates base::StringPiece |
| 83 | // instances which refer to the data inside the `url` query. Therefore `url` must |
| 84 | // outlive the QueryIterator and all base::StringPiece objects returned from the |
| 85 | // GetKey and GetValue methods. |
[email protected] | 1a643611 | 2013-10-09 02:49:58 | [diff] [blame] | 86 | class NET_EXPORT QueryIterator { |
| 87 | public: |
| 88 | explicit QueryIterator(const GURL& url); |
David Bienvenu | a03ac8c | 2020-11-06 15:55:39 | [diff] [blame] | 89 | QueryIterator(const QueryIterator&) = delete; |
| 90 | QueryIterator& operator=(const QueryIterator&) = delete; |
[email protected] | 1a643611 | 2013-10-09 02:49:58 | [diff] [blame] | 91 | ~QueryIterator(); |
| 92 | |
Alex Kalugin | 0ecaa65 | 2021-12-15 04:06:53 | [diff] [blame] | 93 | base::StringPiece GetKey() const; |
| 94 | base::StringPiece GetValue() const; |
[email protected] | 1a643611 | 2013-10-09 02:49:58 | [diff] [blame] | 95 | const std::string& GetUnescapedValue(); |
| 96 | |
| 97 | bool IsAtEnd() const; |
| 98 | void Advance(); |
| 99 | |
| 100 | private: |
Ali Hijazi | 5517919 | 2022-11-09 16:28:51 | [diff] [blame] | 101 | const raw_ref<const GURL> url_; |
[email protected] | ce97ca36 | 2014-04-30 11:35:46 | [diff] [blame] | 102 | url::Component query_; |
[email protected] | 1a643611 | 2013-10-09 02:49:58 | [diff] [blame] | 103 | bool at_end_; |
[email protected] | ce97ca36 | 2014-04-30 11:35:46 | [diff] [blame] | 104 | url::Component key_; |
| 105 | url::Component value_; |
[email protected] | 1a643611 | 2013-10-09 02:49:58 | [diff] [blame] | 106 | std::string unescaped_value_; |
[email protected] | 1a643611 | 2013-10-09 02:49:58 | [diff] [blame] | 107 | }; |
| 108 | |
[email protected] | ca93c2aa | 2013-01-31 17:41:01 | [diff] [blame] | 109 | // Looks for `search_key` in the query portion of `url`. If the key is found, |
| 110 | // sets `*out_value` to the unescaped value for the key and returns true. |
| 111 | // Returns false if the key is not found. |
| 112 | NET_EXPORT bool GetValueForKeyInQuery(const GURL& url, |
David Benjamin | 044f897 | 2022-10-24 18:50:08 | [diff] [blame] | 113 | base::StringPiece search_key, |
[email protected] | ca93c2aa | 2013-01-31 17:41:01 | [diff] [blame] | 114 | std::string* out_value); |
| 115 | |
tfarina | 7a4a7fd | 2016-01-20 14:23:44 | [diff] [blame] | 116 | // Splits an input of the form <host>[":"<port>] into its constituent parts. |
| 117 | // Saves the result into `*host` and `*port`. If the input did not have |
| 118 | // the optional port, sets `*port` to -1. |
| 119 | // Returns true if the parsing was successful, false otherwise. |
| 120 | // The returned host is NOT canonicalized, and may be invalid. |
| 121 | // |
| 122 | // IPv6 literals must be specified in a bracketed form, for instance: |
| 123 | // [::1]:90 and [::1] |
| 124 | // |
| 125 | // The resultant `*host` in both cases will be "::1" (not bracketed). |
David Benjamin | 6e44604 | 2018-03-12 19:20:07 | [diff] [blame] | 126 | NET_EXPORT bool ParseHostAndPort(base::StringPiece input, |
tfarina | 7a4a7fd | 2016-01-20 14:23:44 | [diff] [blame] | 127 | std::string* host, |
| 128 | int* port); |
| 129 | |
| 130 | // Returns a host:port string for `url` (compare GetHostAndOptionalPort()). |
| 131 | NET_EXPORT std::string GetHostAndPort(const GURL& url); |
| 132 | |
| 133 | // Returns a host[:port] string for `url`, where the port is omitted |
| 134 | // if it is the default for the URL's scheme. |
| 135 | NET_EXPORT std::string GetHostAndOptionalPort(const GURL& url); |
tfarina | 77021d6 | 2015-10-11 20:19:03 | [diff] [blame] | 136 | |
Matt Menke | 12b8a5b6 | 2021-12-16 15:15:13 | [diff] [blame] | 137 | // Same as the GURL overload, but takes a url::SchemeHostPort. |
| 138 | NET_EXPORT std::string GetHostAndOptionalPort( |
| 139 | const url::SchemeHostPort& scheme_host_port); |
| 140 | |
tfarina | 77021d6 | 2015-10-11 20:19:03 | [diff] [blame] | 141 | // Returns `host` with its ending dot trimmed, if one exists. |
brettw | b65cd5c | 2016-01-23 00:46:38 | [diff] [blame] | 142 | NET_EXPORT std::string TrimEndingDot(base::StringPiece host); |
tfarina | 77021d6 | 2015-10-11 20:19:03 | [diff] [blame] | 143 | |
tfarina | 7a4a7fd | 2016-01-20 14:23:44 | [diff] [blame] | 144 | // Returns either the host from `url`, or, if the host is empty, the full spec. |
| 145 | NET_EXPORT std::string GetHostOrSpecFromURL(const GURL& url); |
| 146 | |
Lily Chen | da52493 | 2020-02-11 20:19:55 | [diff] [blame] | 147 | // Returns `domain` minus its leftmost label, or the empty string if `domain` is |
| 148 | // just a single label. For normal domain names (not IP addresses), this |
| 149 | // represents the "superdomain" of `domain`. |
| 150 | // Note that this does not take into account anything like the Public Suffix |
| 151 | // List, so the superdomain may end up being a bare eTLD. The returned string is |
| 152 | // not guaranteed to be a valid or canonical hostname, or to make any sense at |
| 153 | // all. |
| 154 | // |
| 155 | // Examples: |
| 156 | // |
| 157 | // GetSuperdomain("assets.example.com") -> "example.com" |
| 158 | // GetSuperdomain("example.net") -> "net" |
| 159 | // GetSuperdomain("littlebox") -> "" |
| 160 | // GetSuperdomain("127.0.0.1") -> "0.0.1" |
| 161 | NET_EXPORT std::string GetSuperdomain(base::StringPiece domain); |
| 162 | |
Lily Chen | f46d8ae8 | 2020-04-23 17:57:32 | [diff] [blame] | 163 | // Returns whether `subdomain` is a subdomain of (or identical to) |
| 164 | // `superdomain`, if both are hostnames (not IP addresses -- for which this |
| 165 | // function is nonsensical). Does not consider the Public Suffix List. |
Lily Chen | 033d702 | 2020-04-27 17:21:20 | [diff] [blame] | 166 | // Returns true if both input strings are empty. |
Lily Chen | f46d8ae8 | 2020-04-23 17:57:32 | [diff] [blame] | 167 | NET_EXPORT bool IsSubdomainOf(base::StringPiece subdomain, |
| 168 | base::StringPiece superdomain); |
| 169 | |
tfarina | 7a4a7fd | 2016-01-20 14:23:44 | [diff] [blame] | 170 | // Canonicalizes `host` and returns the result. Also fills `*host_info` with |
| 171 | // IP address information. `host_info` must not be null. |
brettw | b65cd5c | 2016-01-23 00:46:38 | [diff] [blame] | 172 | NET_EXPORT std::string CanonicalizeHost(base::StringPiece host, |
tfarina | 7a4a7fd | 2016-01-20 14:23:44 | [diff] [blame] | 173 | url::CanonHostInfo* host_info); |
| 174 | |
| 175 | // Returns true if `host` is not an IP address and is compliant with a set of |
| 176 | // rules based on RFC 1738 and tweaked to be compatible with the real world. |
| 177 | // The rules are: |
Eric Orth | 35126b6 | 2022-12-01 22:12:18 | [diff] [blame] | 178 | // * One or more non-empty labels separated by '.', each no more than 63 |
| 179 | // characters. |
tfarina | 7a4a7fd | 2016-01-20 14:23:44 | [diff] [blame] | 180 | // * Each component contains only alphanumeric characters and '-' or '_' |
| 181 | // * The last component begins with an alphanumeric character |
| 182 | // * Optional trailing dot after last component (means "treat as FQDN") |
Eric Orth | 35126b6 | 2022-12-01 22:12:18 | [diff] [blame] | 183 | // * Total size (including optional trailing dot, whether or not actually |
| 184 | // present in `host`) no more than 254 characters. |
tfarina | 7a4a7fd | 2016-01-20 14:23:44 | [diff] [blame] | 185 | // |
| 186 | // NOTE: You should only pass in hosts that have been returned from |
| 187 | // CanonicalizeHost(), or you may not get accurate results. |
David Benjamin | 044f897 | 2022-10-24 18:50:08 | [diff] [blame] | 188 | NET_EXPORT bool IsCanonicalizedHostCompliant(base::StringPiece host); |
tfarina | 7a4a7fd | 2016-01-20 14:23:44 | [diff] [blame] | 189 | |
tfarina | 3ad1745 | 2016-01-27 10:34:38 | [diff] [blame] | 190 | // Returns true if `hostname` contains a non-registerable or non-assignable |
| 191 | // domain name (e.g. a gTLD that has not been assigned by IANA) or an IP address |
Nathan Parker | 4a78e3d | 2018-04-11 01:16:20 | [diff] [blame] | 192 | // that falls in a range reserved for non-publicly routable networks. |
David Benjamin | 044f897 | 2022-10-24 18:50:08 | [diff] [blame] | 193 | NET_EXPORT bool IsHostnameNonUnique(base::StringPiece hostname); |
tfarina | 3ad1745 | 2016-01-27 10:34:38 | [diff] [blame] | 194 | |
Rob Wu | f79b3ba | 2018-01-14 01:54:31 | [diff] [blame] | 195 | // Returns true if the host part of `url` is a local host name according to |
| 196 | // HostStringIsLocalhost(). |
| 197 | NET_EXPORT bool IsLocalhost(const GURL& url); |
| 198 | |
tfarina | 7ba5a62 | 2016-02-23 23:21:44 | [diff] [blame] | 199 | // Returns true if `host` is one of the local hostnames |
| 200 | // (e.g. "localhost") or IP addresses (IPv4 127.0.0.0/8 or IPv6 ::1). |
Rob Wu | f79b3ba | 2018-01-14 01:54:31 | [diff] [blame] | 201 | // "[::1]" is not detected as a local hostname. Do not use this function to check |
| 202 | // whether the host part of a URL is a local host name; use IsLocalhost() instead. |
tfarina | 7ba5a62 | 2016-02-23 23:21:44 | [diff] [blame] | 203 | // |
| 204 | // Note that this function does not check for IP addresses other than |
| 205 | // the above, although other IP addresses may point to the local |
| 206 | // machine. |
Rob Wu | f79b3ba | 2018-01-14 01:54:31 | [diff] [blame] | 207 | NET_EXPORT bool HostStringIsLocalhost(base::StringPiece host); |
tfarina | 7ba5a62 | 2016-02-23 23:21:44 | [diff] [blame] | 208 | |
tfarina | 7a4a7fd | 2016-01-20 14:23:44 | [diff] [blame] | 209 | // Strips the portions of `url` that aren't core to the network request: |
| 210 | // - user name / password |
| 211 | // - reference section |
| 212 | NET_EXPORT GURL SimplifyUrlForRequest(const GURL& url); |
| 213 | |
Adam Rice | 9bd428b0a | 2019-02-15 06:31:36 | [diff] [blame] | 214 | // Changes scheme "ws" to "http" and "wss" to "https". This is useful for origin |
| 215 | // checks and authentication, where WebSocket URLs are treated as if they were |
| 216 | // HTTP. It is an error to call this function with a `url` whose scheme is not |
| 217 | // "ws" or "wss". |
| 218 | NET_EXPORT GURL ChangeWebSocketSchemeToHttpScheme(const GURL& url); |
| 219 | |
Lily Chen | e4070ef | 2020-12-22 16:14:38 | [diff] [blame] | 220 | // Returns whether the URL scheme `scheme` is of a standard scheme type that can |
| 221 | // have hostnames representing domains (i.e. network hosts). |
| 222 | // See url::SchemeType. |
| 223 | NET_EXPORT bool IsStandardSchemeWithNetworkHost(base::StringPiece scheme); |
| 224 | |
tfarina | c38cb95 | 2016-01-14 12:45:01 | [diff] [blame] | 225 | // Extracts the unescaped username/password from `url`, saving the results |
| 226 | // into `*username` and `*password`. |
| 227 | NET_EXPORT_PRIVATE void GetIdentityFromURL(const GURL& url, |
Jan Wilken Dörrie | 739ccc21 | 2021-03-11 18:13:05 | [diff] [blame] | 228 | std::u16string* username, |
| 229 | std::u16string* password); |
tfarina | c38cb95 | 2016-01-14 12:45:01 | [diff] [blame] | 230 | |
tfarina | 7a4a7fd | 2016-01-20 14:23:44 | [diff] [blame] | 231 | // Returns true if the host of `url` is a Google server. This should only be used |
| 232 | // for histograms and shouldn't be used to affect behavior. |
| 233 | NET_EXPORT_PRIVATE bool HasGoogleHost(const GURL& url); |
| 234 | |
David Benjamin | f89ca93 | 2019-04-24 23:55:04 | [diff] [blame] | 235 | // Returns true if `host` is the hostname of a Google server. This should only |
| 236 | // be used for histograms and shouldn't be used to affect behavior. |
| 237 | NET_EXPORT_PRIVATE bool IsGoogleHost(base::StringPiece host); |
| 238 | |
Tsuyoshi Horo | e74601e4 | 2022-10-31 08:25:51 | [diff] [blame] | 239 | // Returns true if `host` is the hostname of a Google server and the HTTPS DNS |
| 240 | // record of `host` is expected to indicate H3 support. This should only be used |
| 241 | // for histograms and shouldn't be used to affect behavior. |
| 242 | NET_EXPORT_PRIVATE bool IsGoogleHostWithAlpnH3(base::StringPiece host); |
| 243 | |
tfarina | 9ed7f8c5 | 2016-02-19 17:50:18 | [diff] [blame] | 244 | // Tests `host` to see if it is of any local hostname form. |
Frédéric Wang | 71698e6 | 2020-12-10 06:13:52 | [diff] [blame] | 245 | // `host` is normalized before being tested. |
| 246 | NET_EXPORT_PRIVATE bool IsLocalHostname(base::StringPiece host); |
tfarina | 9ed7f8c5 | 2016-02-19 17:50:18 | [diff] [blame] | 247 | |
Liviu Tinta | 8a22a878 | 2023-01-13 18:19:04 | [diff] [blame] | 248 | // The notion of unescaping used in the application/x-www-form-urlencoded |
| 249 | // parser: https://url.spec.whatwg.org/#concept-urlencoded-parser |
| 250 | NET_EXPORT_PRIVATE std::string UnescapePercentEncodedUrl( |
| 251 | base::StringPiece input); |
| 252 | |
[email protected] | ca93c2aa | 2013-01-31 17:41:01 | [diff] [blame] | 253 | } // namespace net |
| 254 | |
| 255 | #endif // NET_BASE_URL_UTIL_H_ |