string_bytes: Guarantee valid UTF-8 output
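Previously, V8's WriteUtf8 function would produce invalid UTF-8 output
when it encountered unmatched surrogate code units [1]. The new
REPLACE_INVALID_UTF8 option fixes that by replacing each unmatched
surrogate with the Unicode replacement character (U+FFFD). The option is
enabled by default; setting the NODE_INVALID_UTF8 environment variable
leaves it off and keeps the old behavior.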
[1]: JS strings are defined as sequences of 16-bit unsigned integers.
Unicode well-formedness is not enforced, so a string can easily end up
containing invalid sequences of UTF-16 code units, such as a lone
surrogate.
diff --git a/src/node.cc b/src/node.cc
index 8257604..5cb202f 100644
--- a/src/node.cc
+++ b/src/node.cc
@@ -176,6 +176,8 @@
// Declared in node_internals.h
Isolate* node_isolate = NULL;
+int WRITE_UTF8_FLAGS = v8::String::HINT_MANY_WRITES_EXPECTED |
+ v8::String::NO_NULL_TERMINATION;
static void Spin(uv_idle_t* handle, int status) {
assert((uv_idle_t*) handle == &tick_spinner);
@@ -3042,6 +3044,11 @@
}
int Start(int argc, char *argv[]) {
+ const char* replaceInvalid = getenv("NODE_INVALID_UTF8");
+
+ if (replaceInvalid == NULL)
+ WRITE_UTF8_FLAGS |= String::REPLACE_INVALID_UTF8;
+
// Hack aroung with the argv pointer. Used for process.title = "blah".
argv = uv_setup_args(argc, argv);