blob: 5e5e0478008de80b7b7dda63f18cacb8830bcb86 [file] [log] [blame]
[email protected]12a936d2013-05-15 04:55:491// Copyright (c) 2013 The Chromium Authors. All rights reserved.
license.botbf09a502008-08-24 00:55:552// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
initial.commitf5b16fe2008-07-27 00:20:514
[email protected]12a936d2013-05-15 04:55:495#include "content/renderer/savable_resources.h"
[email protected]528c56d2010-07-30 19:28:446
[email protected]52bf4652009-10-22 17:01:187#include <set>
8
[email protected]fa419692008-10-16 21:46:149#include "base/compiler_specific.h"
[email protected]38789d82010-11-17 06:03:4410#include "base/logging.h"
[email protected]21aa99682013-06-11 07:17:0111#include "base/strings/string_util.h"
[email protected]5c30b5e02013-05-30 03:46:0812#include "third_party/WebKit/public/platform/WebString.h"
13#include "third_party/WebKit/public/platform/WebVector.h"
[email protected]2255a9332013-06-17 05:12:3114#include "third_party/WebKit/public/web/WebDocument.h"
15#include "third_party/WebKit/public/web/WebElement.h"
16#include "third_party/WebKit/public/web/WebFrame.h"
17#include "third_party/WebKit/public/web/WebInputElement.h"
18#include "third_party/WebKit/public/web/WebNode.h"
19#include "third_party/WebKit/public/web/WebNodeCollection.h"
20#include "third_party/WebKit/public/web/WebNodeList.h"
21#include "third_party/WebKit/public/web/WebView.h"
initial.commitf5b16fe2008-07-27 00:20:5122
[email protected]180ef242013-11-07 06:50:4623using blink::WebDocument;
24using blink::WebElement;
25using blink::WebFrame;
26using blink::WebInputElement;
27using blink::WebNode;
28using blink::WebNodeCollection;
29using blink::WebNodeList;
30using blink::WebString;
31using blink::WebVector;
32using blink::WebView;
initial.commitf5b16fe2008-07-27 00:20:5133
[email protected]12a936d2013-05-15 04:55:4934namespace content {
initial.commitf5b16fe2008-07-27 00:20:5135namespace {
36
37// Structure for storage the unique set of all savable resource links for
38// making sure that no duplicated resource link in final result. The consumer
39// of the SavableResourcesUniqueCheck is responsible for keeping these pointers
40// valid for the lifetime of the SavableResourcesUniqueCheck instance.
41struct SavableResourcesUniqueCheck {
42 // Unique set of all sub resource links.
43 std::set<GURL>* resources_set;
44 // Unique set of all frame links.
45 std::set<GURL>* frames_set;
46 // Collection of all frames we go through when getting all savable resource
47 // links.
[email protected]d9ec5c0f2009-12-23 11:55:0748 std::vector<WebFrame*>* frames;
initial.commitf5b16fe2008-07-27 00:20:5149
50 SavableResourcesUniqueCheck()
51 : resources_set(NULL),
52 frames_set(NULL),
53 frames(NULL) {}
54
55 SavableResourcesUniqueCheck(std::set<GURL>* resources_set,
[email protected]d9ec5c0f2009-12-23 11:55:0756 std::set<GURL>* frames_set, std::vector<WebFrame*>* frames)
initial.commitf5b16fe2008-07-27 00:20:5157 : resources_set(resources_set),
58 frames_set(frames_set),
59 frames(frames) {}
60};
61
62// Get all savable resource links from current element. One element might
63// have more than one resource link. It is possible to have some links
64// in one CSS stylesheet.
[email protected]d9ec5c0f2009-12-23 11:55:0765void GetSavableResourceLinkForElement(
66 const WebElement& element,
67 const WebDocument& current_doc,
68 SavableResourcesUniqueCheck* unique_check,
[email protected]12a936d2013-05-15 04:55:4969 SavableResourcesResult* result) {
[email protected]d9ec5c0f2009-12-23 11:55:0770
initial.commitf5b16fe2008-07-27 00:20:5171 // Handle frame and iframe tag.
[email protected]d9ec5c0f2009-12-23 11:55:0772 if (element.hasTagName("iframe") ||
73 element.hasTagName("frame")) {
74 WebFrame* sub_frame = WebFrame::fromFrameOwnerElement(element);
75 if (sub_frame)
76 unique_check->frames->push_back(sub_frame);
initial.commitf5b16fe2008-07-27 00:20:5177 return;
78 }
[email protected]d9ec5c0f2009-12-23 11:55:0779
initial.commitf5b16fe2008-07-27 00:20:5180 // Check whether the node has sub resource URL or not.
[email protected]12a936d2013-05-15 04:55:4981 WebString value = GetSubResourceLinkFromElement(element);
[email protected]d9ec5c0f2009-12-23 11:55:0782 if (value.isNull())
initial.commitf5b16fe2008-07-27 00:20:5183 return;
84 // Get absolute URL.
[email protected]d9ec5c0f2009-12-23 11:55:0785 GURL u = current_doc.completeURL(value);
initial.commitf5b16fe2008-07-27 00:20:5186 // ignore invalid URL
87 if (!u.is_valid())
88 return;
89 // Ignore those URLs which are not standard protocols. Because FTP
90 // protocol does no have cache mechanism, we will skip all
91 // sub-resources if they use FTP protocol.
[email protected]91f5689032013-08-22 01:43:3392 if (!u.SchemeIsHTTPOrHTTPS() && !u.SchemeIs("file"))
initial.commitf5b16fe2008-07-27 00:20:5193 return;
94 // Ignore duplicated resource link.
95 if (!unique_check->resources_set->insert(u).second)
96 return;
97 result->resources_list->push_back(u);
98 // Insert referrer for above new resource link.
[email protected]c2d986512012-05-12 00:22:4699 result->referrer_urls_list->push_back(GURL());
[email protected]180ef242013-11-07 06:50:46100 result->referrer_policies_list->push_back(blink::WebReferrerPolicyDefault);
initial.commitf5b16fe2008-07-27 00:20:51101}
102
103// Get all savable resource links from current WebFrameImpl object pointer.
[email protected]d9ec5c0f2009-12-23 11:55:07104void GetAllSavableResourceLinksForFrame(WebFrame* current_frame,
initial.commitf5b16fe2008-07-27 00:20:51105 SavableResourcesUniqueCheck* unique_check,
[email protected]12a936d2013-05-15 04:55:49106 SavableResourcesResult* result,
[email protected]dbeb3952009-10-13 18:01:18107 const char** savable_schemes) {
initial.commitf5b16fe2008-07-27 00:20:51108 // Get current frame's URL.
[email protected]54262762011-06-24 00:25:27109 GURL current_frame_url = current_frame->document().url();
initial.commitf5b16fe2008-07-27 00:20:51110
[email protected]dbeb3952009-10-13 18:01:18111 // If url of current frame is invalid, ignore it.
[email protected]d9ec5c0f2009-12-23 11:55:07112 if (!current_frame_url.is_valid())
initial.commitf5b16fe2008-07-27 00:20:51113 return;
[email protected]dbeb3952009-10-13 18:01:18114
115 // If url of current frame is not a savable protocol, ignore it.
116 bool is_valid_protocol = false;
117 for (int i = 0; savable_schemes[i] != NULL; ++i) {
[email protected]d9ec5c0f2009-12-23 11:55:07118 if (current_frame_url.SchemeIs(savable_schemes[i])) {
[email protected]dbeb3952009-10-13 18:01:18119 is_valid_protocol = true;
120 break;
121 }
122 }
123 if (!is_valid_protocol)
initial.commitf5b16fe2008-07-27 00:20:51124 return;
[email protected]dbeb3952009-10-13 18:01:18125
initial.commitf5b16fe2008-07-27 00:20:51126 // If find same frame we have recorded, ignore it.
[email protected]d9ec5c0f2009-12-23 11:55:07127 if (!unique_check->frames_set->insert(current_frame_url).second)
initial.commitf5b16fe2008-07-27 00:20:51128 return;
129
130 // Get current using document.
[email protected]d9ec5c0f2009-12-23 11:55:07131 WebDocument current_doc = current_frame->document();
initial.commitf5b16fe2008-07-27 00:20:51132 // Go through all descent nodes.
[email protected]d9ec5c0f2009-12-23 11:55:07133 WebNodeCollection all = current_doc.all();
initial.commitf5b16fe2008-07-27 00:20:51134 // Go through all node in this frame.
[email protected]d9ec5c0f2009-12-23 11:55:07135 for (WebNode node = all.firstItem(); !node.isNull();
136 node = all.nextItem()) {
initial.commitf5b16fe2008-07-27 00:20:51137 // We only save HTML resources.
[email protected]d9ec5c0f2009-12-23 11:55:07138 if (!node.isElementNode())
initial.commitf5b16fe2008-07-27 00:20:51139 continue;
[email protected]f40b49e2010-05-05 22:38:45140 WebElement element = node.to<WebElement>();
initial.commitf5b16fe2008-07-27 00:20:51141 GetSavableResourceLinkForElement(element,
142 current_doc,
143 unique_check,
144 result);
145 }
146}
147
148} // namespace
149
[email protected]d9ec5c0f2009-12-23 11:55:07150WebString GetSubResourceLinkFromElement(const WebElement& element) {
151 const char* attribute_name = NULL;
[email protected]ff3a36d2012-10-15 03:47:30152 if (element.hasHTMLTagName("img") ||
153 element.hasHTMLTagName("script")) {
[email protected]d9ec5c0f2009-12-23 11:55:07154 attribute_name = "src";
[email protected]ff3a36d2012-10-15 03:47:30155 } else if (element.hasHTMLTagName("input")) {
[email protected]f40b49e2010-05-05 22:38:45156 const WebInputElement input = element.toConst<WebInputElement>();
[email protected]a11728e82010-09-22 00:11:03157 if (input.isImageButton()) {
[email protected]d9ec5c0f2009-12-23 11:55:07158 attribute_name = "src";
initial.commitf5b16fe2008-07-27 00:20:51159 }
[email protected]ff3a36d2012-10-15 03:47:30160 } else if (element.hasHTMLTagName("body") ||
161 element.hasHTMLTagName("table") ||
162 element.hasHTMLTagName("tr") ||
163 element.hasHTMLTagName("td")) {
[email protected]d9ec5c0f2009-12-23 11:55:07164 attribute_name = "background";
[email protected]ff3a36d2012-10-15 03:47:30165 } else if (element.hasHTMLTagName("blockquote") ||
166 element.hasHTMLTagName("q") ||
167 element.hasHTMLTagName("del") ||
168 element.hasHTMLTagName("ins")) {
[email protected]d9ec5c0f2009-12-23 11:55:07169 attribute_name = "cite";
[email protected]ff3a36d2012-10-15 03:47:30170 } else if (element.hasHTMLTagName("link")) {
initial.commitf5b16fe2008-07-27 00:20:51171 // If the link element is not linked to css, ignore it.
[email protected]d9ec5c0f2009-12-23 11:55:07172 if (LowerCaseEqualsASCII(element.getAttribute("type"), "text/css")) {
[email protected]7f3281452010-02-24 21:27:02173 // TODO(jnd): Add support for extracting links of sub-resources which
initial.commitf5b16fe2008-07-27 00:20:51174 // are inside style-sheet such as @import, url(), etc.
175 // See bug: http://b/issue?id=1111667.
[email protected]d9ec5c0f2009-12-23 11:55:07176 attribute_name = "href";
initial.commitf5b16fe2008-07-27 00:20:51177 }
initial.commitf5b16fe2008-07-27 00:20:51178 }
179 if (!attribute_name)
[email protected]d9ec5c0f2009-12-23 11:55:07180 return WebString();
181 WebString value = element.getAttribute(WebString::fromUTF8(attribute_name));
initial.commitf5b16fe2008-07-27 00:20:51182 // If value has content and not start with "javascript:" then return it,
183 // otherwise return NULL.
[email protected]d9ec5c0f2009-12-23 11:55:07184 if (!value.isNull() && !value.isEmpty() &&
[email protected]7f3281452010-02-24 21:27:02185 !StartsWithASCII(value.utf8(), "javascript:", false))
initial.commitf5b16fe2008-07-27 00:20:51186 return value;
187
[email protected]d9ec5c0f2009-12-23 11:55:07188 return WebString();
initial.commitf5b16fe2008-07-27 00:20:51189}
190
191// Get all savable resource links from current webview, include main
192// frame and sub-frame
193bool GetAllSavableResourceLinksForCurrentPage(WebView* view,
[email protected]dbeb3952009-10-13 18:01:18194 const GURL& page_url, SavableResourcesResult* result,
195 const char** savable_schemes) {
[email protected]26aa0482009-09-30 16:55:27196 WebFrame* main_frame = view->mainFrame();
initial.commitf5b16fe2008-07-27 00:20:51197 if (!main_frame)
198 return false;
initial.commitf5b16fe2008-07-27 00:20:51199
200 std::set<GURL> resources_set;
201 std::set<GURL> frames_set;
[email protected]d9ec5c0f2009-12-23 11:55:07202 std::vector<WebFrame*> frames;
initial.commitf5b16fe2008-07-27 00:20:51203 SavableResourcesUniqueCheck unique_check(&resources_set,
204 &frames_set,
205 &frames);
206
[email protected]54262762011-06-24 00:25:27207 GURL main_page_gurl(main_frame->document().url());
initial.commitf5b16fe2008-07-27 00:20:51208
209 // Make sure we are saving same page between embedder and webkit.
210 // If page has being navigated, embedder will get three empty vector,
211 // which will make the saving page job ended.
212 if (page_url != main_page_gurl)
213 return true;
214
215 // First, process main frame.
[email protected]c322d202010-01-11 22:18:42216 frames.push_back(main_frame);
initial.commitf5b16fe2008-07-27 00:20:51217
218 // Check all resource in this page, include sub-frame.
219 for (int i = 0; i < static_cast<int>(frames.size()); ++i) {
220 // Get current frame's all savable resource links.
[email protected]dbeb3952009-10-13 18:01:18221 GetAllSavableResourceLinksForFrame(frames[i], &unique_check, result,
222 savable_schemes);
initial.commitf5b16fe2008-07-27 00:20:51223 }
224
225 // Since frame's src can also point to sub-resources link, so it is possible
226 // that some URLs in frames_list are also in resources_list. For those
227 // URLs, we will remove it from frame_list, only keep them in resources_list.
228 for (std::set<GURL>::iterator it = frames_set.begin();
229 it != frames_set.end(); ++it) {
230 // Append unique frame source to savable frame list.
231 if (resources_set.find(*it) == resources_set.end())
232 result->frames_list->push_back(*it);
233 }
234
235 return true;
236}
237
[email protected]12a936d2013-05-15 04:55:49238} // namespace content