[email protected] | 57319e1 | 2012-03-30 22:52:34 | [diff] [blame] | 1 | // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
[email protected] | 1632eb2 | 2009-10-01 18:14:12 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | // This file defines utility functions for working with html. |
| 6 | |
| 7 | #ifndef CHROME_FRAME_HTML_UTILS_H_ |
| 8 | #define CHROME_FRAME_HTML_UTILS_H_ |
| 9 | |
| 10 | #include <string> |
| 11 | #include <vector> |
| 12 | |
| 13 | #include "base/basictypes.h" |
[email protected] | 19118d5 | 2010-07-26 22:13:42 | [diff] [blame] | 14 | #include "base/gtest_prod_util.h" |
[email protected] | 3f55e87 | 2009-10-17 04:48:37 | [diff] [blame] | 15 | #include "net/http/http_util.h" |
[email protected] | 1632eb2 | 2009-10-01 18:14:12 | [diff] [blame] | 16 | |
| 17 | // Forward declaration of the unit test class befriended by HTMLScanner. |
| 18 | class HtmlUtilUnittest; |
| 19 | |
| 20 | // |
| 21 | // Class designed to take a string of HTML and extract from it named |
| 22 | // attribute values from named tags. |
| 23 | // |
| 24 | // Caveat: this class currently doesn't handle multi-word UTF-16 encoded |
| 25 | // characters (surrogate pairs). As a result, any data following such a |
| 26 | // character could possibly be misinterpreted. |
| 27 | // |
| 28 | class HTMLScanner { |
| 29 | public: |
| 30 | typedef std::wstring::const_iterator StrPos; |
| 31 | |
| 32 | // Structure holding a (start, end) pair of const_iterators into html_string_. |
| 33 | class StringRange { |
| 34 | friend class HTMLScanner; |
| 35 | public: |
| 36 | StringRange(); |
| 37 | StringRange(StrPos start, StrPos end); |
| 38 | |
| 39 | bool LowerCaseEqualsASCII(const char* other) const; |
| 40 | bool Equals(const wchar_t* other) const; |
| 41 | |
| 42 | // Returns a copy of the data described by this StringRange. |
| 43 | std::wstring Copy() const; |
| 44 | |
| 45 | // If this StringRange represents a tag, this method extracts the name of |
| 46 | // the tag and stores it in tag_name. |
| 47 | // Returns true if the tag name was successfully extracted. |
| 48 | // Returns false if this string doesn't look like a valid tag. |
| 49 | bool GetTagName(std::wstring* tag_name) const; |
| 50 | |
| 51 | // From a given string range, uses a string tokenizer to extract the value |
| 52 | // of the named attribute if a simple scan finds that the attribute name is |
| 53 | // present. |
| 54 | // |
| 55 | // Returns true if the named attribute can be located and it has a value |
| 56 | // which has been placed in attribute_value. |
| 57 | // |
| 58 | // Note that the attribute value is unquoted here as well, so that |
| 59 | // GetTagAttribute(*<foo bar="baz">*, L"bar", *out_value*) will stick |
| 60 | // 'baz' in out_value and not '"baz"'. |
| 61 | // |
| 62 | // Returns false if the named attribute is not present in the tag or if it |
| 63 | // did not have a value. |
| 64 | // |
| 65 | bool GetTagAttribute(const wchar_t* attribute_name, |
| 66 | StringRange* attribute_value) const; |
| 67 | |
| 68 | // Unquotes a StringRange by removing a matching pair of either ' or " |
| 69 | // characters from the beginning and end of the string if present. |
| 70 | // Returns true if the string was modified, false otherwise. |
| 71 | bool UnQuote(); |
| 72 | private: |
| 73 | StrPos start_; |
| 74 | StrPos end_; |
| 75 | }; |
| 76 | |
| 77 | typedef std::vector<StringRange> StringRangeList; |
| 78 | |
| 79 | // html_string must be a null-terminated string containing the HTML |
| 80 | // to be scanned. |
| 81 | explicit HTMLScanner(const wchar_t* html_string); |
| 82 | |
| 83 | // Collects in tag_list the ranges denoting HTML tags that match the given name. |
| 84 | // If stop_tag_name is given, then as soon as a tag with this name is |
| 85 | // encountered this method will return. |
| 86 | void GetTagsByName(const wchar_t* name, StringRangeList* tag_list, |
| 87 | const wchar_t* stop_tag_name); |
| 88 | |
| 89 | private: |
| 90 | friend class HtmlUtilUnittest; |
[email protected] | 19118d5 | 2010-07-26 22:13:42 | [diff] [blame] | 91 | FRIEND_TEST_ALL_PREFIXES(HtmlUtilUnittest, BasicTest); |
[email protected] | 1632eb2 | 2009-10-01 18:14:12 | [diff] [blame] | 92 | |
| 93 | // Given html_string, which represents the remaining html range, this method |
| 94 | // returns the next tag in 'tag' and advances html_string to one character |
| 95 | // after the end of 'tag'. This method is intended to be called repeatedly to |
| 96 | // extract all of the tags in sequence. |
| 97 | // |
| 98 | // Returns true if another tag was found and 'tag' was populated with a valid |
| 99 | // range. |
| 100 | // Returns false if we have reached the end of the html data. |
| 101 | bool NextTag(StringRange* html_string, StringRange* tag); |
| 102 | |
| 103 | // Returns true if c can be found in quotes_, false otherwise. |
| 104 | bool IsQuote(wchar_t c); |
| 105 | |
| 106 | // Returns true if pos refers to the last character in an HTML comment in a |
| 107 | // string described by html_string, false otherwise. |
| 108 | // For example, with html_string describing <!-- foo> -->, pos must refer to |
| 109 | // the last > for this method to return true. |
[email protected] | 9f2c802 | 2010-12-14 20:57:55 | [diff] [blame] | 110 | bool IsHTMLCommentClose(const StringRange* html_string, StrPos pos); |
| 111 | |
| 112 | // Returns true if pos refers to the last character in the terminator of the |
| 113 | // opening tag of a downlevel-hidden conditional comment in IE as per |
| 114 | // http://msdn.microsoft.com/en-us/library/ms537512(VS.85).aspx#syntax |
| 115 | // For example, with html_string describing <![if booga >wooga]>, pos must |
| 116 | // refer to the last > for this method to return true. |
| 117 | bool IsIEConditionalCommentClose(const StringRange* html_string, StrPos pos); |
[email protected] | 1632eb2 | 2009-10-01 18:14:12 | [diff] [blame] | 118 | |
| 119 | // We store a whitespace-collapsed copy of the html data. |
| 120 | const std::wstring html_string_; |
| 121 | |
| 122 | // Store the string of quote characters to avoid repeated construction. |
| 123 | const std::wstring quotes_; |
| 124 | |
| 125 | DISALLOW_COPY_AND_ASSIGN(HTMLScanner); |
| 126 | }; |
| 127 | |
[email protected] | 3f55e87 | 2009-10-17 04:48:37 | [diff] [blame] | 128 | namespace http_utils { |
| 129 | |
[email protected] | 57319e1 | 2012-03-30 22:52:34 | [diff] [blame] | 130 | // Adds "chromeframe/a.b.c.d" to the User-Agent string (a.b.c.d is the version). |
| 131 | // If the chromeframe tag has already been added to the string, the original |
| 132 | // string is returned. |
[email protected] | 3f55e87 | 2009-10-17 04:48:37 | [diff] [blame] | 133 | std::string AddChromeFrameToUserAgentValue(const std::string& value); |
| 134 | |
[email protected] | 57319e1 | 2012-03-30 22:52:34 | [diff] [blame] | 135 | // Removes "chromeframe/a.b.c.d" from the User-Agent string (a.b.c.d is the |
| 136 | // version). If the chromeframe tag is not present in the string, the original |
| 137 | // string is returned. |
| 138 | std::string RemoveChromeFrameFromUserAgentValue(const std::string& value); |
| 139 | |
[email protected] | 3f55e87 | 2009-10-17 04:48:37 | [diff] [blame] | 140 | // Fetches the user agent from urlmon and adds chrome frame to the |
| 141 | // comment section. |
| 142 | // NOTE: The returned string includes the "User-Agent: " header name. |
| 143 | std::string GetDefaultUserAgentHeaderWithCFTag(); |
| 144 | |
[email protected] | e67a73f4 | 2010-08-31 15:05:02 | [diff] [blame] | 145 | // Returns the User-Agent header as would be used by Chrome itself. |
| 146 | const char* GetChromeUserAgent(); |
| 147 | |
[email protected] | 3f55e87 | 2009-10-17 04:48:37 | [diff] [blame] | 148 | // Fetches the default user agent string from urlmon. |
| 149 | // This value does not include the "User-Agent:" header name. |
| 150 | std::string GetDefaultUserAgent(); |
| 151 | |
| 152 | // Returns the Chrome Frame user agent, e.g. "chromeframe/1.0". |
| 153 | // Note that in unit tests this will be "chromeframe/0.0" due to the version |
| 154 | // table not being present in the unit test executable. |
| 155 | const char* GetChromeFrameUserAgent(); |
| 156 | |
[email protected] | d578d30 | 2009-11-19 02:25:42 | [diff] [blame] | 157 | // Returns true if there is a frame busting header (other than the do-nothing |
[email protected] | bc2ff519 | 2010-06-01 22:05:45 | [diff] [blame] | 158 | // "X-Frame-Options: ALLOWALL") in the provided header block. Note that there |
| 159 | // may be multiple X-Frame-Options values specified; if there is one anywhere in |
| 160 | // the list with a value other than ALLOWALL, this returns true. |
[email protected] | d578d30 | 2009-11-19 02:25:42 | [diff] [blame] | 161 | bool HasFrameBustingHeader(const std::string& http_headers); |
| 162 | |
[email protected] | f855429 | 2010-09-10 05:06:29 | [diff] [blame] | 163 | // Returns the header named header_name from the headers list. |
| 164 | std::string GetHttpHeaderFromHeaderList(const std::string& header_name, |
| 165 | const std::string& headers); |
[email protected] | 3f55e87 | 2009-10-17 04:48:37 | [diff] [blame] | 166 | } // namespace http_utils |
| 167 | |
[email protected] | 1632eb2 | 2009-10-01 18:14:12 | [diff] [blame] | 168 | #endif // CHROME_FRAME_HTML_UTILS_H_ |