// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
| 4 | |
| 5 | #include "components/link_header_util/link_header_util.h" |
| 6 | |
| 7 | #include "base/strings/string_util.h" |
| 8 | #include "net/http/http_util.h" |
| 9 | |
| 10 | namespace link_header_util { |
| 11 | |
| 12 | namespace { |
| 13 | |
| 14 | // A variation of base::StringTokenizer and net::HttpUtil::ValuesIterator. |
| 15 | // Takes the parsing of StringTokenizer and adds support for quoted strings that |
| 16 | // are quoted by matching <> (and does not support escaping in those strings). |
| 17 | // Also has the behavior of ValuesIterator where it strips whitespace from all |
| 18 | // values and only outputs non-empty values. |
| 19 | // Only supports ',' as separator and supports "" and <> as quote chars. |
| 20 | class ValueTokenizer { |
| 21 | public: |
| 22 | ValueTokenizer(std::string::const_iterator begin, |
| 23 | std::string::const_iterator end) |
| 24 | : token_begin_(begin), token_end_(begin), end_(end) {} |
| 25 | |
| 26 | std::string::const_iterator token_begin() const { return token_begin_; } |
| 27 | std::string::const_iterator token_end() const { return token_end_; } |
| 28 | |
| 29 | bool GetNext() { |
| 30 | while (GetNextInternal()) { |
| 31 | net::HttpUtil::TrimLWS(&token_begin_, &token_end_); |
| 32 | |
| 33 | // Only return non-empty values. |
| 34 | if (token_begin_ != token_end_) |
| 35 | return true; |
| 36 | } |
| 37 | return false; |
| 38 | } |
| 39 | |
| 40 | private: |
| 41 | // Updates token_begin_ and token_end_ to point to the (possibly empty) next |
| 42 | // token. Returns false if end-of-string was reached first. |
| 43 | bool GetNextInternal() { |
| 44 | // First time this is called token_end_ points to the first character in the |
| 45 | // input. Every other time token_end_ points to the delimiter at the end of |
| 46 | // the last returned token (which could be the end of the string). |
| 47 | |
| 48 | // End of string, return false. |
| 49 | if (token_end_ == end_) |
| 50 | return false; |
| 51 | |
| 52 | // Skip past the delimiter. |
| 53 | if (*token_end_ == ',') |
| 54 | ++token_end_; |
| 55 | |
| 56 | // Make token_begin_ point to the beginning of the next token, and search |
| 57 | // for the end of the token in token_end_. |
| 58 | token_begin_ = token_end_; |
| 59 | |
| 60 | // Set to true if we're currently inside a quoted string. |
| 61 | bool in_quote = false; |
| 62 | // Set to true if we're currently inside a quoted string, and have just |
| 63 | // encountered an escape character. In this case a closing quote will be |
| 64 | // ignored. |
| 65 | bool in_escape = false; |
| 66 | // If currently in a quoted string, this is the character that (when not |
| 67 | // escaped) indicates the end of the string. |
| 68 | char quote_close_char = '\0'; |
| 69 | // If currently in a quoted string, this is set to true if it is possible to |
| 70 | // escape the closing quote using '\'. |
| 71 | bool quote_allows_escape = false; |
| 72 | |
| 73 | while (token_end_ != end_) { |
| 74 | char c = *token_end_; |
| 75 | if (in_quote) { |
| 76 | if (in_escape) { |
| 77 | in_escape = false; |
| 78 | } else if (quote_allows_escape && c == '\\') { |
| 79 | in_escape = true; |
| 80 | } else if (c == quote_close_char) { |
| 81 | in_quote = false; |
| 82 | } |
| 83 | } else { |
| 84 | if (c == ',') |
| 85 | break; |
| 86 | if (c == '"' || c == '<') { |
| 87 | in_quote = true; |
| 88 | quote_close_char = (c == '<' ? '>' : c); |
| 89 | quote_allows_escape = (c != '<'); |
| 90 | } |
| 91 | } |
| 92 | ++token_end_; |
| 93 | } |
| 94 | return true; |
| 95 | } |
| 96 | |
| 97 | std::string::const_iterator token_begin_; |
| 98 | std::string::const_iterator token_end_; |
| 99 | std::string::const_iterator end_; |
| 100 | }; |
| 101 | |
| 102 | // Parses the URL part of a Link header. When successful |url_begin| points |
| 103 | // to the beginning of the url, |url_end| points to the end of the url and |
| 104 | // |params_begin| points to the first character after the '>' character at the |
| 105 | // end of the url. |
| 106 | bool ExtractURL(std::string::const_iterator begin, |
| 107 | std::string::const_iterator end, |
| 108 | std::string::const_iterator* url_begin, |
| 109 | std::string::const_iterator* url_end, |
| 110 | std::string::const_iterator* params_begin) { |
| 111 | // Extract the URL part (everything between '<' and first '>' character). |
| 112 | if (*begin != '<') |
| 113 | return false; |
| 114 | |
| 115 | ++begin; |
| 116 | *url_begin = begin; |
| 117 | *url_end = std::find(begin, end, '>'); |
| 118 | |
| 119 | // Fail if we did not find a '>'. |
| 120 | if (*url_end == end) |
| 121 | return false; |
| 122 | |
| 123 | *params_begin = *url_end; |
| 124 | // Skip the '>' at the end of the URL. |
| 125 | ++*params_begin; |
| 126 | |
| 127 | // Trim whitespace from the URL. |
| 128 | net::HttpUtil::TrimLWS(url_begin, url_end); |
| 129 | return true; |
| 130 | } |
| 131 | |
| 132 | } // namespace |
| 133 | |
| 134 | std::vector<StringIteratorPair> SplitLinkHeader(const std::string& header) { |
| 135 | std::vector<StringIteratorPair> values; |
| 136 | ValueTokenizer tokenizer(header.begin(), header.end()); |
| 137 | while (tokenizer.GetNext()) { |
| 138 | values.push_back( |
| 139 | StringIteratorPair(tokenizer.token_begin(), tokenizer.token_end())); |
| 140 | } |
| 141 | return values; |
| 142 | } |
| 143 | |
| 144 | // Parses one link in a link header into its url and parameters. |
| 145 | // A link is of the form "<some-url>; param1=value1; param2=value2". |
| 146 | // Returns false if parsing the link failed, returns true on success. This |
| 147 | // method is more lenient than the RFC. It doesn't fail on things like invalid |
| 148 | // characters in the URL, and also doesn't verify that certain parameters should |
| 149 | // or shouldn't be quoted strings. |
| 150 | // If a parameter occurs more than once in the link, only the first value is |
| 151 | // returned in params as this is the required behavior for all attributes chrome |
| 152 | // currently cares about in link headers. |
| 153 | bool ParseLinkHeaderValue( |
| 154 | std::string::const_iterator begin, |
| 155 | std::string::const_iterator end, |
| 156 | std::string* url, |
| 157 | std::unordered_map<std::string, base::Optional<std::string>>* params) { |
| 158 | // Can't parse an empty string. |
| 159 | if (begin == end) |
| 160 | return false; |
| 161 | |
| 162 | // Extract the URL part (everything between '<' and first '>' character). |
| 163 | std::string::const_iterator url_begin; |
| 164 | std::string::const_iterator url_end; |
| 165 | if (!ExtractURL(begin, end, &url_begin, &url_end, &begin)) |
| 166 | return false; |
| 167 | *url = std::string(url_begin, url_end); |
| 168 | |
| 169 | // Trim any remaining whitespace, and make sure there is a ';' separating |
| 170 | // parameters from the URL. |
| 171 | net::HttpUtil::TrimLWS(&begin, &end); |
| 172 | if (begin != end && *begin != ';') |
| 173 | return false; |
| 174 | |
| 175 | // Parse all the parameters. |
| 176 | net::HttpUtil::NameValuePairsIterator params_iterator( |
| 177 | begin, end, ';', |
| 178 | net::HttpUtil::NameValuePairsIterator::Values::NOT_REQUIRED, |
| 179 | net::HttpUtil::NameValuePairsIterator::Quotes::STRICT_QUOTES); |
| 180 | while (params_iterator.GetNext()) { |
| 181 | if (!net::HttpUtil::IsParmName(params_iterator.name_begin(), |
| 182 | params_iterator.name_end())) |
| 183 | return false; |
| 184 | std::string name = base::ToLowerASCII(base::StringPiece( |
| 185 | params_iterator.name_begin(), params_iterator.name_end())); |
| 186 | if (!params_iterator.value_is_quoted() && |
| 187 | params_iterator.value_begin() == params_iterator.value_end()) |
| 188 | params->insert(std::make_pair(name, base::nullopt)); |
| 189 | else |
| 190 | params->insert(std::make_pair(name, params_iterator.value())); |
| 191 | } |
| 192 | return params_iterator.valid(); |
| 193 | } |
| 194 | |
| 195 | } // namespace link_header_util |