[email protected] | 12a936d | 2013-05-15 04:55:49 | [diff] [blame] | 1 | // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
license.bot | bf09a50 | 2008-08-24 00:55:55 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 4 | |
[email protected] | 12a936d | 2013-05-15 04:55:49 | [diff] [blame] | 5 | #include "content/renderer/savable_resources.h" |
[email protected] | 528c56d | 2010-07-30 19:28:44 | [diff] [blame] | 6 | |
[email protected] | 52bf465 | 2009-10-22 17:01:18 | [diff] [blame] | 7 | #include <set> |
| 8 | |
[email protected] | fa41969 | 2008-10-16 21:46:14 | [diff] [blame] | 9 | #include "base/compiler_specific.h" |
[email protected] | 38789d8 | 2010-11-17 06:03:44 | [diff] [blame] | 10 | #include "base/logging.h" |
[email protected] | 21aa9968 | 2013-06-11 07:17:01 | [diff] [blame] | 11 | #include "base/strings/string_util.h" |
[email protected] | 5c30b5e0 | 2013-05-30 03:46:08 | [diff] [blame] | 12 | #include "third_party/WebKit/public/platform/WebString.h" |
| 13 | #include "third_party/WebKit/public/platform/WebVector.h" |
[email protected] | 2255a933 | 2013-06-17 05:12:31 | [diff] [blame] | 14 | #include "third_party/WebKit/public/web/WebDocument.h" |
| 15 | #include "third_party/WebKit/public/web/WebElement.h" |
| 16 | #include "third_party/WebKit/public/web/WebFrame.h" |
| 17 | #include "third_party/WebKit/public/web/WebInputElement.h" |
| 18 | #include "third_party/WebKit/public/web/WebNode.h" |
| 19 | #include "third_party/WebKit/public/web/WebNodeCollection.h" |
| 20 | #include "third_party/WebKit/public/web/WebNodeList.h" |
| 21 | #include "third_party/WebKit/public/web/WebView.h" |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 22 | |
[email protected] | 180ef24 | 2013-11-07 06:50:46 | [diff] [blame^] | 23 | using blink::WebDocument; |
| 24 | using blink::WebElement; |
| 25 | using blink::WebFrame; |
| 26 | using blink::WebInputElement; |
| 27 | using blink::WebNode; |
| 28 | using blink::WebNodeCollection; |
| 29 | using blink::WebNodeList; |
| 30 | using blink::WebString; |
| 31 | using blink::WebVector; |
| 32 | using blink::WebView; |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 33 | |
[email protected] | 12a936d | 2013-05-15 04:55:49 | [diff] [blame] | 34 | namespace content { |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 35 | namespace { |
| 36 | |
| 37 | // Structure for storage the unique set of all savable resource links for |
| 38 | // making sure that no duplicated resource link in final result. The consumer |
| 39 | // of the SavableResourcesUniqueCheck is responsible for keeping these pointers |
| 40 | // valid for the lifetime of the SavableResourcesUniqueCheck instance. |
| 41 | struct SavableResourcesUniqueCheck { |
| 42 | // Unique set of all sub resource links. |
| 43 | std::set<GURL>* resources_set; |
| 44 | // Unique set of all frame links. |
| 45 | std::set<GURL>* frames_set; |
| 46 | // Collection of all frames we go through when getting all savable resource |
| 47 | // links. |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 48 | std::vector<WebFrame*>* frames; |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 49 | |
| 50 | SavableResourcesUniqueCheck() |
| 51 | : resources_set(NULL), |
| 52 | frames_set(NULL), |
| 53 | frames(NULL) {} |
| 54 | |
| 55 | SavableResourcesUniqueCheck(std::set<GURL>* resources_set, |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 56 | std::set<GURL>* frames_set, std::vector<WebFrame*>* frames) |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 57 | : resources_set(resources_set), |
| 58 | frames_set(frames_set), |
| 59 | frames(frames) {} |
| 60 | }; |
| 61 | |
| 62 | // Get all savable resource links from current element. One element might |
| 63 | // have more than one resource link. It is possible to have some links |
| 64 | // in one CSS stylesheet. |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 65 | void GetSavableResourceLinkForElement( |
| 66 | const WebElement& element, |
| 67 | const WebDocument& current_doc, |
| 68 | SavableResourcesUniqueCheck* unique_check, |
[email protected] | 12a936d | 2013-05-15 04:55:49 | [diff] [blame] | 69 | SavableResourcesResult* result) { |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 70 | |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 71 | // Handle frame and iframe tag. |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 72 | if (element.hasTagName("iframe") || |
| 73 | element.hasTagName("frame")) { |
| 74 | WebFrame* sub_frame = WebFrame::fromFrameOwnerElement(element); |
| 75 | if (sub_frame) |
| 76 | unique_check->frames->push_back(sub_frame); |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 77 | return; |
| 78 | } |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 79 | |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 80 | // Check whether the node has sub resource URL or not. |
[email protected] | 12a936d | 2013-05-15 04:55:49 | [diff] [blame] | 81 | WebString value = GetSubResourceLinkFromElement(element); |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 82 | if (value.isNull()) |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 83 | return; |
| 84 | // Get absolute URL. |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 85 | GURL u = current_doc.completeURL(value); |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 86 | // ignore invalid URL |
| 87 | if (!u.is_valid()) |
| 88 | return; |
| 89 | // Ignore those URLs which are not standard protocols. Because FTP |
| 90 | // protocol does no have cache mechanism, we will skip all |
| 91 | // sub-resources if they use FTP protocol. |
[email protected] | 91f568903 | 2013-08-22 01:43:33 | [diff] [blame] | 92 | if (!u.SchemeIsHTTPOrHTTPS() && !u.SchemeIs("file")) |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 93 | return; |
| 94 | // Ignore duplicated resource link. |
| 95 | if (!unique_check->resources_set->insert(u).second) |
| 96 | return; |
| 97 | result->resources_list->push_back(u); |
| 98 | // Insert referrer for above new resource link. |
[email protected] | c2d98651 | 2012-05-12 00:22:46 | [diff] [blame] | 99 | result->referrer_urls_list->push_back(GURL()); |
[email protected] | 180ef24 | 2013-11-07 06:50:46 | [diff] [blame^] | 100 | result->referrer_policies_list->push_back(blink::WebReferrerPolicyDefault); |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 101 | } |
| 102 | |
| 103 | // Get all savable resource links from current WebFrameImpl object pointer. |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 104 | void GetAllSavableResourceLinksForFrame(WebFrame* current_frame, |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 105 | SavableResourcesUniqueCheck* unique_check, |
[email protected] | 12a936d | 2013-05-15 04:55:49 | [diff] [blame] | 106 | SavableResourcesResult* result, |
[email protected] | dbeb395 | 2009-10-13 18:01:18 | [diff] [blame] | 107 | const char** savable_schemes) { |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 108 | // Get current frame's URL. |
[email protected] | 5426276 | 2011-06-24 00:25:27 | [diff] [blame] | 109 | GURL current_frame_url = current_frame->document().url(); |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 110 | |
[email protected] | dbeb395 | 2009-10-13 18:01:18 | [diff] [blame] | 111 | // If url of current frame is invalid, ignore it. |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 112 | if (!current_frame_url.is_valid()) |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 113 | return; |
[email protected] | dbeb395 | 2009-10-13 18:01:18 | [diff] [blame] | 114 | |
| 115 | // If url of current frame is not a savable protocol, ignore it. |
| 116 | bool is_valid_protocol = false; |
| 117 | for (int i = 0; savable_schemes[i] != NULL; ++i) { |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 118 | if (current_frame_url.SchemeIs(savable_schemes[i])) { |
[email protected] | dbeb395 | 2009-10-13 18:01:18 | [diff] [blame] | 119 | is_valid_protocol = true; |
| 120 | break; |
| 121 | } |
| 122 | } |
| 123 | if (!is_valid_protocol) |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 124 | return; |
[email protected] | dbeb395 | 2009-10-13 18:01:18 | [diff] [blame] | 125 | |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 126 | // If find same frame we have recorded, ignore it. |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 127 | if (!unique_check->frames_set->insert(current_frame_url).second) |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 128 | return; |
| 129 | |
| 130 | // Get current using document. |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 131 | WebDocument current_doc = current_frame->document(); |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 132 | // Go through all descent nodes. |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 133 | WebNodeCollection all = current_doc.all(); |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 134 | // Go through all node in this frame. |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 135 | for (WebNode node = all.firstItem(); !node.isNull(); |
| 136 | node = all.nextItem()) { |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 137 | // We only save HTML resources. |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 138 | if (!node.isElementNode()) |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 139 | continue; |
[email protected] | f40b49e | 2010-05-05 22:38:45 | [diff] [blame] | 140 | WebElement element = node.to<WebElement>(); |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 141 | GetSavableResourceLinkForElement(element, |
| 142 | current_doc, |
| 143 | unique_check, |
| 144 | result); |
| 145 | } |
| 146 | } |
| 147 | |
| 148 | } // namespace |
| 149 | |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 150 | WebString GetSubResourceLinkFromElement(const WebElement& element) { |
| 151 | const char* attribute_name = NULL; |
[email protected] | ff3a36d | 2012-10-15 03:47:30 | [diff] [blame] | 152 | if (element.hasHTMLTagName("img") || |
| 153 | element.hasHTMLTagName("script")) { |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 154 | attribute_name = "src"; |
[email protected] | ff3a36d | 2012-10-15 03:47:30 | [diff] [blame] | 155 | } else if (element.hasHTMLTagName("input")) { |
[email protected] | f40b49e | 2010-05-05 22:38:45 | [diff] [blame] | 156 | const WebInputElement input = element.toConst<WebInputElement>(); |
[email protected] | a11728e8 | 2010-09-22 00:11:03 | [diff] [blame] | 157 | if (input.isImageButton()) { |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 158 | attribute_name = "src"; |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 159 | } |
[email protected] | ff3a36d | 2012-10-15 03:47:30 | [diff] [blame] | 160 | } else if (element.hasHTMLTagName("body") || |
| 161 | element.hasHTMLTagName("table") || |
| 162 | element.hasHTMLTagName("tr") || |
| 163 | element.hasHTMLTagName("td")) { |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 164 | attribute_name = "background"; |
[email protected] | ff3a36d | 2012-10-15 03:47:30 | [diff] [blame] | 165 | } else if (element.hasHTMLTagName("blockquote") || |
| 166 | element.hasHTMLTagName("q") || |
| 167 | element.hasHTMLTagName("del") || |
| 168 | element.hasHTMLTagName("ins")) { |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 169 | attribute_name = "cite"; |
[email protected] | ff3a36d | 2012-10-15 03:47:30 | [diff] [blame] | 170 | } else if (element.hasHTMLTagName("link")) { |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 171 | // If the link element is not linked to css, ignore it. |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 172 | if (LowerCaseEqualsASCII(element.getAttribute("type"), "text/css")) { |
[email protected] | 7f328145 | 2010-02-24 21:27:02 | [diff] [blame] | 173 | // TODO(jnd): Add support for extracting links of sub-resources which |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 174 | // are inside style-sheet such as @import, url(), etc. |
| 175 | // See bug: http://b/issue?id=1111667. |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 176 | attribute_name = "href"; |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 177 | } |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 178 | } |
| 179 | if (!attribute_name) |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 180 | return WebString(); |
| 181 | WebString value = element.getAttribute(WebString::fromUTF8(attribute_name)); |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 182 | // If value has content and not start with "javascript:" then return it, |
| 183 | // otherwise return NULL. |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 184 | if (!value.isNull() && !value.isEmpty() && |
[email protected] | 7f328145 | 2010-02-24 21:27:02 | [diff] [blame] | 185 | !StartsWithASCII(value.utf8(), "javascript:", false)) |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 186 | return value; |
| 187 | |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 188 | return WebString(); |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 189 | } |
| 190 | |
| 191 | // Get all savable resource links from current webview, include main |
| 192 | // frame and sub-frame |
| 193 | bool GetAllSavableResourceLinksForCurrentPage(WebView* view, |
[email protected] | dbeb395 | 2009-10-13 18:01:18 | [diff] [blame] | 194 | const GURL& page_url, SavableResourcesResult* result, |
| 195 | const char** savable_schemes) { |
[email protected] | 26aa048 | 2009-09-30 16:55:27 | [diff] [blame] | 196 | WebFrame* main_frame = view->mainFrame(); |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 197 | if (!main_frame) |
| 198 | return false; |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 199 | |
| 200 | std::set<GURL> resources_set; |
| 201 | std::set<GURL> frames_set; |
[email protected] | d9ec5c0f | 2009-12-23 11:55:07 | [diff] [blame] | 202 | std::vector<WebFrame*> frames; |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 203 | SavableResourcesUniqueCheck unique_check(&resources_set, |
| 204 | &frames_set, |
| 205 | &frames); |
| 206 | |
[email protected] | 5426276 | 2011-06-24 00:25:27 | [diff] [blame] | 207 | GURL main_page_gurl(main_frame->document().url()); |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 208 | |
| 209 | // Make sure we are saving same page between embedder and webkit. |
| 210 | // If page has being navigated, embedder will get three empty vector, |
| 211 | // which will make the saving page job ended. |
| 212 | if (page_url != main_page_gurl) |
| 213 | return true; |
| 214 | |
| 215 | // First, process main frame. |
[email protected] | c322d20 | 2010-01-11 22:18:42 | [diff] [blame] | 216 | frames.push_back(main_frame); |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 217 | |
| 218 | // Check all resource in this page, include sub-frame. |
| 219 | for (int i = 0; i < static_cast<int>(frames.size()); ++i) { |
| 220 | // Get current frame's all savable resource links. |
[email protected] | dbeb395 | 2009-10-13 18:01:18 | [diff] [blame] | 221 | GetAllSavableResourceLinksForFrame(frames[i], &unique_check, result, |
| 222 | savable_schemes); |
initial.commit | f5b16fe | 2008-07-27 00:20:51 | [diff] [blame] | 223 | } |
| 224 | |
| 225 | // Since frame's src can also point to sub-resources link, so it is possible |
| 226 | // that some URLs in frames_list are also in resources_list. For those |
| 227 | // URLs, we will remove it from frame_list, only keep them in resources_list. |
| 228 | for (std::set<GURL>::iterator it = frames_set.begin(); |
| 229 | it != frames_set.end(); ++it) { |
| 230 | // Append unique frame source to savable frame list. |
| 231 | if (resources_set.find(*it) == resources_set.end()) |
| 232 | result->frames_list->push_back(*it); |
| 233 | } |
| 234 | |
| 235 | return true; |
| 236 | } |
| 237 | |
[email protected] | 12a936d | 2013-05-15 04:55:49 | [diff] [blame] | 238 | } // namespace content |