Support URL fragment resolution against non-hierarchical schemes

Support URL fragment resolution against non-hierarchical schemes
As a result, data: about: etc now have 'query' and 'ref' components
parsed; as a result a new GURL::GetContent() convenience is added to
retrieve the spec with the scheme stripped off.

A complication in supporting this is that we now need to allow whitespace
to trailing whitespace to be preserved when transferring url_parse::Parsed
structs between KURL and GURL. Without this, the URL prior to the
#fragment can change (i.e. whitespace stripped) when following an anchor
link which breaks the page (causes reload from source). See
http://crbug.com/291747 for more details on this.

[email protected]
[email protected]
BUG=291747

Review URL: https://codereview.chromium.org/23835019

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@236917 0039d316-1c4b-4281-b951-d872f2087c98
diff --git a/url/url_canon_pathurl.cc b/url/url_canon_pathurl.cc
index bc681f4..8f7dee4 100644
--- a/url/url_canon_pathurl.cc
+++ b/url/url_canon_pathurl.cc
@@ -13,6 +13,39 @@
 
 namespace {
 
+// Canonicalize the given |component| from |source| into |output| and
+// |new_component|. If |separator| is non-zero, it is pre-pended to |ouput|
+// prior to the canonicalized component; i.e. for the '?' or '#' characters.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizePathComponent(const CHAR* source,
+                                 const url_parse::Component& component,
+                                 CHAR seperator,
+                                 CanonOutput* output,
+                                 url_parse::Component* new_component) {
+  bool success = true;
+  if (component.is_valid()) {
+    if (seperator)
+      output->push_back(seperator);
+    // Copy the path using path URL's more lax escaping rules (think for
+    // javascript:). We convert to UTF-8 and escape non-ASCII, but leave all
+    // ASCII characters alone. This helps readability of JavaStript.
+    new_component->begin = output->length();
+    int end = component.end();
+    for (int i = component.begin; i < end; i++) {
+      UCHAR uch = static_cast<UCHAR>(source[i]);
+      if (uch < 0x20 || uch >= 0x80)
+        success &= AppendUTF8EscapedChar(source, &i, end, output);
+      else
+        output->push_back(static_cast<char>(uch));
+    }
+    new_component->len = output->length() - new_component->begin;
+  } else {
+    // Empty part.
+    new_component->reset();
+  }
+  return success;
+}
+
 template<typename CHAR, typename UCHAR>
 bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source,
                            const url_parse::Parsed& parsed,
@@ -28,29 +61,14 @@
   new_parsed->password.reset();
   new_parsed->host.reset();
   new_parsed->port.reset();
-
-  if (parsed.path.is_valid()) {
-    // Copy the path using path URL's more lax escaping rules (think for
-    // javascript:). We convert to UTF-8 and escape non-ASCII, but leave all
-    // ASCII characters alone. This helps readability of JavaStript.
-    new_parsed->path.begin = output->length();
-    int end = parsed.path.end();
-    for (int i = parsed.path.begin; i < end; i++) {
-      UCHAR uch = static_cast<UCHAR>(source.path[i]);
-      if (uch < 0x20 || uch >= 0x80)
-        success &= AppendUTF8EscapedChar(source.path, &i, end, output);
-      else
-        output->push_back(static_cast<char>(uch));
-    }
-    new_parsed->path.len = output->length() - new_parsed->path.begin;
-  } else {
-    // Empty path.
-    new_parsed->path.reset();
-  }
-
-  // Assume there's no query or ref.
-  new_parsed->query.reset();
-  new_parsed->ref.reset();
+  // We allow path URLs to have the path, query and fragment components, but we
+  // will canonicalize each of the via the weaker path URL rules.
+  success &= DoCanonicalizePathComponent<CHAR, UCHAR>(
+      source.path, parsed.path, 0, output, &new_parsed->path);
+  success &= DoCanonicalizePathComponent<CHAR, UCHAR>(
+      source.query, parsed.query, '?', output, &new_parsed->query);
+  success &= DoCanonicalizePathComponent<CHAR, UCHAR>(
+      source.ref, parsed.ref, '#', output, &new_parsed->ref);
 
   return success;
 }