From dd0053a3595bf14981e576f4ffea2dd37dcd5367 Mon Sep 17 00:00:00 2001 From: Petka Antonov Date: Thu, 7 May 2015 16:10:40 +0300 Subject: [PATCH 1/3] url: improve url module performance --- lib/url.js | 1189 ++++++++++++++++++++++--------------- test/parallel/test-url.js | 20 +- 2 files changed, 738 insertions(+), 471 deletions(-) diff --git a/lib/url.js b/lib/url.js index 8da2f025dc8ca2..1b211e336133b0 100644 --- a/lib/url.js +++ b/lib/url.js @@ -22,48 +22,45 @@ function Url() { this.pathname = null; this.path = null; this.href = null; + this._prependSlash = false; } // Reference: RFC 3986, RFC 1808, RFC 2396 -// define these here so at least they only have to be -// compiled once on the first module load. -const protocolPattern = /^([a-z0-9.+-]+:)/i; -const portPattern = /:[0-9]*$/; - -// Special case for a simple path URL -const simplePathPattern = /^(\/\/?(?!\/)[^\?\s]*)(\?[^\s]*)?$/; +const _protocolCharacters = makeAsciiTable([ + [0x61, 0x7A] /*a-z*/, + [0x41, 0x5A] /*A-Z*/, + 0x2E /*'.'*/, 0x2B /*'+'*/, 0x2D /*'-'*/ +]); // RFC 2396: characters reserved for delimiting URLs. // We actually just auto-escape these. -const delims = ['<', '>', '"', '`', ' ', '\r', '\n', '\t']; - // RFC 2396: characters not allowed for various reasons. -const unwise = ['{', '}', '|', '\\', '^', '`'].concat(delims); - // Allowed by RFCs, but cause of XSS attacks. Always escape these. -const autoEscape = ['\''].concat(unwise); +const _autoEscape = [ + '<', '>', '\'', '`', ' ', '\r', '\n', '\t', '{', '}', '|', '\\', '^', '`', '"' +]; -// Characters that are never ever allowed in a hostname. -// Note that any invalid chars are also handled, but these -// are the ones that are *expected* to be seen, so we fast-path them. -const nonHostChars = ['%', '/', '?', ';', '#'].concat(autoEscape); -const hostEndingChars = ['/', '?', '#']; -const hostnameMaxLen = 255; -const hostnamePartPattern = /^[+a-z0-9A-Z_-]{0,63}$/; -const hostnamePartStart = /^([+a-z0-9A-Z_-]{0,63})(.*)$/; -// protocols that can allow "unsafe" and "unwise" chars. -const unsafeProtocol = { - 'javascript': true, - 'javascript:': true -}; -// protocols that never have a hostname. -const hostlessProtocol = { - 'javascript': true, - 'javascript:': true -}; -// protocols that always contain a // bit. -const slashedProtocol = { +const _autoEscapeMap = new Array(128); + +for (let i = 0, len = _autoEscapeMap.length; i < len; ++i) { + _autoEscapeMap[i] = ''; +} + +for (let i = 0, len = _autoEscape.length; i < len; ++i) { + let c = _autoEscape[i]; + let esc = encodeURIComponent(c); + if (esc === c) + esc = escape(c); + _autoEscapeMap[c.charCodeAt(0)] = esc; +} + +// Same as autoEscapeMap except \ is not escaped but is turned into /. +const _afterQueryAutoEscapeMap = _autoEscapeMap.slice(); +_autoEscapeMap[0x5C /*'\'*/] = '/'; + +// Protocols that always contain a // bit. +const _slashProtocols = { 'http': true, 'https': true, 'ftp': true, @@ -75,408 +72,268 @@ const slashedProtocol = { 'gopher:': true, 'file:': true }; + +const _autoEscapeCharacters = makeAsciiTable(_autoEscape.map(function(v) { + return v.charCodeAt(0); +})); + +// Characters that are never ever allowed in a hostname. +// Note that any invalid chars are also handled, but these +// are the ones that are *expected* to be seen, so we fast-path them. +const _hostEndingCharacters = makeAsciiTable( + ['#', '?', '/', '\\'].map(function(v) { + return v.charCodeAt(0); + })); +// If these characters end a host name, the path will not be prepended a /. +const _hostEndingCharactersNoPrependSlash = makeAsciiTable([ + '<', '>', '"', '`', ' ', '\r', '\n', '\t', '{', '}', '|', '^', '`', '\'', '%', + ';' +].map(function(v) { + return v.charCodeAt(0); +})); + const querystring = require('querystring'); function urlParse(url, parseQueryString, slashesDenoteHost) { - if (url instanceof Url) return url; + if (url instanceof Url) + return url; var u = new Url(); u.parse(url, parseQueryString, slashesDenoteHost); + u.href = u.format(); return u; } -Url.prototype.parse = function(url, parseQueryString, slashesDenoteHost) { - if (typeof url !== 'string') { - throw new TypeError("Parameter 'url' must be a string, not " + typeof url); - } - - // Copy chrome, IE, opera backslash-handling behavior. - // Back slashes before the query string get converted to forward slashes - // See: https://code.google.com/p/chromium/issues/detail?id=25916 - var queryIndex = url.indexOf('?'), - splitter = - (queryIndex !== -1 && queryIndex < url.indexOf('#')) ? '?' : '#', - uSplit = url.split(splitter), - slashRegex = /\\/g; - uSplit[0] = uSplit[0].replace(slashRegex, '/'); - url = uSplit.join(splitter); - - var rest = url; - - // trim before proceeding. - // This is to support parse stuff like " http://foo.com \n" - rest = rest.trim(); - - if (!slashesDenoteHost && url.split('#').length === 1) { - // Try fast path regexp - var simplePath = simplePathPattern.exec(rest); - if (simplePath) { - this.path = rest; - this.href = rest; - this.pathname = simplePath[1]; - if (simplePath[2]) { - this.search = simplePath[2]; - if (parseQueryString) { - this.query = querystring.parse(this.search.substr(1)); - } else { - this.query = this.search.substr(1); - } - } else if (parseQueryString) { - this.search = ''; - this.query = {}; - } - return this; - } +Url.prototype.parse = function(str, parseQueryString, slashesDenoteHost) { + if (typeof str !== 'string') { + throw new TypeError(`Parameter 'url' must be a string, not ` + + typeof str); } - - var proto = protocolPattern.exec(rest); - if (proto) { - proto = proto[0]; - var lowerProto = proto.toLowerCase(); - this.protocol = lowerProto; - rest = rest.substr(proto.length); + var start = 0; + var end = str.length - 1; + + // Trim leading and trailing ws. + while (str.charCodeAt(start) <= 0x20 /*' '*/) start++; + var trimmedStart = start; + while (str.charCodeAt(end) <= 0x20 /*' '*/) end--; + + start = this._parseProtocol(str, start, end); + + // Javascript doesn't have host. + if (this.protocol !== 'javascript:') { + start = this._parseHost(str, trimmedStart, start, end, slashesDenoteHost); + var proto = this.protocol; + if (this._isEmpty(this.hostname) && + (this.slashes || (!this._isEmpty(proto) && !_slashProtocols[proto]))) + this.hostname = this.host = ''; } - // figure out if it's got a host - // user@server is *always* interpreted as a hostname, and url - // resolution will treat //foo/bar as host=foo,path=bar because that's - // how the browser resolves relative URLs. - if (slashesDenoteHost || proto || rest.match(/^\/\/[^@\/]+@[^@\/]+/)) { - var slashes = rest.substr(0, 2) === '//'; - if (slashes && !(proto && hostlessProtocol[proto])) { - rest = rest.substr(2); - this.slashes = true; - } + if (start <= end) { + var ch = str.charCodeAt(start); + if (ch === 0x2F /*'/'*/ || ch === 0x5C /*'\'*/) + this._parsePath(str, start, end); + else if (ch === 0x3F /*'?'*/) + this._parseQuery(str, start, end); + else if (ch === 0x23 /*'#'*/) + this._parseHash(str, start, end); + else if (this.protocol !== 'javascript:') + this._parsePath(str, start, end); + else // For javascript the pathname is just the rest of it. + this.pathname = str.slice(start, end + 1); } - if (!hostlessProtocol[proto] && - (slashes || (proto && !slashedProtocol[proto]))) { - - // there's a hostname. - // the first instance of /, ?, ;, or # ends the host. - // - // If there is an @ in the hostname, then non-host chars *are* allowed - // to the left of the last @ sign, unless some host-ending character - // comes *before* the @-sign. - // URLs are obnoxious. - // - // ex: - // http://a@b@c/ => user:a@b host:c - // http://a@b?@c => user:a host:c path:/?@c - - // v0.12 TODO(isaacs): This is not quite how Chrome does things. - // Review our test case against browsers more comprehensively. - - // find the first instance of any hostEndingChars - var hostEnd = -1; - for (var i = 0; i < hostEndingChars.length; i++) { - var hec = rest.indexOf(hostEndingChars[i]); - if (hec !== -1 && (hostEnd === -1 || hec < hostEnd)) - hostEnd = hec; - } - - // at this point, either we have an explicit point where the - // auth portion cannot go past, or the last @ char is the decider. - var auth, atSign; - if (hostEnd === -1) { - // atSign can be anywhere. - atSign = rest.lastIndexOf('@'); - } else { - // atSign must be in auth portion. - // http://a@b/c@d => host:b auth:a path:/c@d - atSign = rest.lastIndexOf('@', hostEnd); - } - - // Now we have a portion which is definitely the auth. - // Pull that off. - if (atSign !== -1) { - auth = rest.slice(0, atSign); - rest = rest.slice(atSign + 1); - this.auth = decodeURIComponent(auth); - } - - // the host is the remaining to the left of the first non-host char - hostEnd = -1; - for (var i = 0; i < nonHostChars.length; i++) { - var hec = rest.indexOf(nonHostChars[i]); - if (hec !== -1 && (hostEnd === -1 || hec < hostEnd)) - hostEnd = hec; - } - // if we still have not hit it, then the entire thing is a host. - if (hostEnd === -1) - hostEnd = rest.length; - - this.host = rest.slice(0, hostEnd); - rest = rest.slice(hostEnd); - - // pull out port. - this.parseHost(); - - // we've indicated that there is a hostname, - // so even if it's empty, it has to be present. - this.hostname = this.hostname || ''; - - // if hostname begins with [ and ends with ] - // assume that it's an IPv6 address. - var ipv6Hostname = this.hostname[0] === '[' && - this.hostname[this.hostname.length - 1] === ']'; - - // validate a little. - if (!ipv6Hostname) { - var hostparts = this.hostname.split(/\./); - for (var i = 0, l = hostparts.length; i < l; i++) { - var part = hostparts[i]; - if (!part) continue; - if (!part.match(hostnamePartPattern)) { - var newpart = ''; - for (var j = 0, k = part.length; j < k; j++) { - if (part.charCodeAt(j) > 127) { - // we replace non-ASCII char with a temporary placeholder - // we need this to make sure size of hostname is not - // broken by replacing non-ASCII by nothing - newpart += 'x'; - } else { - newpart += part[j]; - } - } - // we test again with ASCII char only - if (!newpart.match(hostnamePartPattern)) { - var validParts = hostparts.slice(0, i); - var notHost = hostparts.slice(i + 1); - var bit = part.match(hostnamePartStart); - if (bit) { - validParts.push(bit[1]); - notHost.unshift(bit[2]); - } - if (notHost.length) { - rest = '/' + notHost.join('.') + rest; - } - this.hostname = validParts.join('.'); - break; - } - } - } - } - - if (this.hostname.length > hostnameMaxLen) { - this.hostname = ''; - } else { - // hostnames are always lower case. - this.hostname = this.hostname.toLowerCase(); - } - - if (!ipv6Hostname) { - // IDNA Support: Returns a punycoded representation of "domain". - // It only converts parts of the domain name that - // have non-ASCII characters, i.e. it doesn't matter if - // you call it with a domain that already is ASCII-only. - this.hostname = punycode.toASCII(this.hostname); - } + if (this._isEmpty(this.pathname) && + !this._isEmpty(this.hostname) && + _slashProtocols[this.protocol]) + this.pathname = '/'; - var p = this.port ? ':' + this.port : ''; - var h = this.hostname || ''; - this.host = h + p; - // strip [ and ] from the hostname - // the host field still retains them, though - if (ipv6Hostname) { - this.hostname = this.hostname.substr(1, this.hostname.length - 2); - if (rest[0] !== '/') { - rest = '/' + rest; - } - } - } + var pathname = this.pathname; + var search = this.search; - // now rest is set to the post-host stuff. - // chop off any delim chars. - if (!unsafeProtocol[lowerProto]) { - - // First, make 100% sure that any "autoEscape" chars get - // escaped, even if encodeURIComponent doesn't think they - // need to be. - for (var i = 0, l = autoEscape.length; i < l; i++) { - var ae = autoEscape[i]; - if (rest.indexOf(ae) === -1) - continue; - var esc = encodeURIComponent(ae); - if (esc === ae) { - esc = escape(ae); - } - rest = rest.split(ae).join(esc); - } - } + if (this._isEmpty(search)) { + // Bug-to-bug compat. + if (parseQueryString) + this.search = ''; + this.query = parseQueryString ? {} : search; - // chop off from the tail first. - var hash = rest.indexOf('#'); - if (hash !== -1) { - // got a fragment string. - this.hash = rest.substr(hash); - rest = rest.slice(0, hash); - } - var qm = rest.indexOf('?'); - if (qm !== -1) { - this.search = rest.substr(qm); - this.query = rest.substr(qm + 1); - if (parseQueryString) { - this.query = querystring.parse(this.query); - } - rest = rest.slice(0, qm); - } else if (parseQueryString) { - // no query string, but parseQueryString still requested - this.search = ''; - this.query = {}; - } - if (rest) this.pathname = rest; - if (slashedProtocol[lowerProto] && - this.hostname && !this.pathname) { - this.pathname = '/'; + if (!this._isEmpty(pathname)) + this.path = pathname; + } else { + var query = search.slice(1); + this.query = parseQueryString ? querystring.parse(query) : query; + this.path = this._isEmpty(pathname) ? search : pathname + search; } +}; - //to support http.request - if (this.pathname || this.search) { - var p = this.pathname || ''; - var s = this.search || ''; - this.path = p + s; - } +function urlResolve(source, relative) { + return urlParse(source, false, true).resolve(relative); +} - // finally, reconstruct the href based on what has been validated. - this.href = this.format(); - return this; +Url.prototype.resolve = function(relative) { + return this.resolveObject(urlParse(relative, false, true)).format(); }; -// format a parsed object into a url string +// Format a parsed object into a url string. function urlFormat(obj) { - // ensure it's an object, and not a string url. + // Ensure it's an object, and not a string url. // If it's an obj, this is a no-op. // this way, you can call url_format() on strings // to clean up potentially wonky urls. - if (typeof obj === 'string') obj = urlParse(obj); - - else if (typeof obj !== 'object' || obj === null) - throw new TypeError("Parameter 'urlObj' must be an object, not " + + if (typeof obj === 'string') { + obj = urlParse(obj); + } else if (typeof obj !== 'object' || obj === null) { + throw new TypeError('Parameter \'urlObj\' must be an object, not ' + obj === null ? 'null' : typeof obj); - - else if (!(obj instanceof Url)) return Url.prototype.format.call(obj); + } else if (!(obj instanceof Url)) { + return Url.prototype.format.call(obj); + } return obj.format(); } Url.prototype.format = function() { var auth = this.auth || ''; + if (auth) { auth = encodeURIComponent(auth); auth = auth.replace(/%3A/i, ':'); auth += '@'; } - var protocol = this.protocol || '', - pathname = this.pathname || '', - hash = this.hash || '', - host = false, - query = ''; + var protocol = this.protocol || ''; + var pathname = this.pathname || ''; + var hash = this.hash || ''; + var search = this.search || ''; + var query = ''; + var hostname = this.hostname || ''; + var port = this.port || ''; + var host = false; + var scheme = ''; + + var q = this.query; + if (q !== null && typeof q === 'object') + query = querystring.stringify(q); + + if (!search) + search = query ? '?' + query : ''; + + if (protocol && protocol.charCodeAt(protocol.length - 1) !== 0x3A /*':'*/) + protocol += ':'; if (this.host) { host = auth + this.host; - } else if (this.hostname) { - host = auth + (this.hostname.indexOf(':') === -1 ? - this.hostname : - '[' + this.hostname + ']'); - if (this.port) { - host += ':' + this.port; - } - } - - if (this.query !== null && - typeof this.query === 'object' && - Object.keys(this.query).length) { - query = querystring.stringify(this.query); + } else if (hostname) { + if (containsCharacter2(hostname, 0x3A/*':'*/, -1)) + hostname = '[' + hostname + ']'; + host = auth + hostname + (port ? ':' + port : ''); } - var search = this.search || (query && ('?' + query)) || ''; + var slashes = this.slashes || + ((!protocol || _slashProtocols[protocol]) && host !== false); - if (protocol && protocol.substr(-1) !== ':') protocol += ':'; + if (protocol) + scheme = protocol + (slashes ? '//' : ''); + else if (slashes) + scheme = '//'; - // only the slashedProtocols get the //. Not mailto:, xmpp:, etc. - // unless they had them to begin with. - if (this.slashes || - (!protocol || slashedProtocol[protocol]) && host !== false) { - host = '//' + (host || ''); - if (pathname && pathname.charAt(0) !== '/') pathname = '/' + pathname; - } else if (!host) { - host = ''; - } - - if (hash && hash.charAt(0) !== '#') hash = '#' + hash; - if (search && search.charAt(0) !== '?') search = '?' + search; + if (slashes && pathname && pathname.charCodeAt(0) !== 0x2F /*'/'*/) + pathname = '/' + pathname; + if (search && search.charCodeAt(0) !== 0x3F /*'?'*/) + search = '?' + search; + if (hash && hash.charCodeAt(0) !== 0x23 /*'#'*/) + hash = '#' + hash; - pathname = pathname.replace(/[?#]/g, function(match) { - return encodeURIComponent(match); - }); - search = search.replace('#', '%23'); + pathname = escapePathName(pathname); + search = escapeSearch(search); - return protocol + host + pathname + search + hash; + return scheme + (host === false ? '' : host) + pathname + search + hash; }; -function urlResolve(source, relative) { - return urlParse(source, false, true).resolve(relative); -} +Url.prototype._clone = function() { + var ret = new Url(); + ret.protocol = this.protocol; + ret.slashes = this.slashes; + ret.auth = this.auth; + ret.host = this.host; + ret.port = this.port; + ret.hostname = this.hostname; + ret.hash = this.hash; + ret.search = this.search; + ret.query = this.query; + ret.pathname = this.pathname; + ret.path = this.path; + ret.href = this.href; + ret._prependSlash = this._prependSlash; + return ret; +}; -Url.prototype.resolve = function(relative) { - return this.resolveObject(urlParse(relative, false, true)).format(); +Url.prototype._copyPropsTo = function(target, noProtocol) { + if (!noProtocol) + target.protocol = this.protocol; + + target.slashes = this.slashes; + target.auth = this.auth; + target.host = this.host; + target.port = this.port; + target.hostname = this.hostname; + target.hash = this.hash; + target.search = this.search; + target.query = this.query; + target.pathname = this.pathname; + target.path = this.path; + target.href = this.href; + target._prependSlash = this._prependSlash; }; function urlResolveObject(source, relative) { - if (!source) return relative; + if (!source) + return relative; return urlParse(source, false, true).resolveObject(relative); } +Url.prototype._resolveEmpty = function() { + var ret = this._clone(); + ret.hash = null; + ret.href = ret.format(); + return ret; +}; + Url.prototype.resolveObject = function(relative) { if (typeof relative === 'string') { - var rel = new Url(); - rel.parse(relative, false, true); - relative = rel; + if (relative.length === 0) + return this._resolveEmpty(); + + var u = new Url(); + u.parse(relative, false, true); + relative = u; + } else if (this._isEmpty(relative.href)) { + return this._resolveEmpty(); } - var result = new Url(); - var tkeys = Object.keys(this); - for (var tk = 0; tk < tkeys.length; tk++) { - var tkey = tkeys[tk]; - result[tkey] = this[tkey]; - } + var result = this._clone(); - // hash is always overridden, no matter what. - // even href="" will remove it. + // Hash is always overridden, no matter what. + // even href='' will remove it. result.hash = relative.hash; - // if the relative url is empty, then there's nothing left to do here. - if (relative.href === '') { - result.href = result.format(); - return result; - } + // Hrefs like //foo/bar always cut to the protocol. + if (relative.slashes && this._isEmpty(relative.protocol)) { + relative._copyPropsTo(result, true); - // hrefs like //foo/bar always cut to the protocol. - if (relative.slashes && !relative.protocol) { - // take everything except the protocol from relative - var rkeys = Object.keys(relative); - for (var rk = 0; rk < rkeys.length; rk++) { - var rkey = rkeys[rk]; - if (rkey !== 'protocol') - result[rkey] = relative[rkey]; - } - - //urlParse appends trailing / to urls like http://www.example.com - if (slashedProtocol[result.protocol] && - result.hostname && !result.pathname) { + if (_slashProtocols[result.protocol] && + !this._isEmpty(result.hostname) && + this._isEmpty(result.pathname)) result.path = result.pathname = '/'; - } result.href = result.format(); return result; } - if (relative.protocol && relative.protocol !== result.protocol) { - // if it's a known url protocol, then changing + if (!this._isEmpty(relative.protocol) && + relative.protocol !== result.protocol) { + // If it's a known url protocol, then changing // the protocol does weird things // first, if it's not file:, then we MUST have a host, // and if there was a path @@ -484,30 +341,31 @@ Url.prototype.resolveObject = function(relative) { // if it is file:, then the host is dropped, // because that's known to be hostless. // anything else is assumed to be absolute. - if (!slashedProtocol[relative.protocol]) { - var keys = Object.keys(relative); - for (var v = 0; v < keys.length; v++) { - var k = keys[v]; - result[k] = relative[k]; - } + if (!_slashProtocols[relative.protocol]) { + relative._copyPropsTo(result, false); result.href = result.format(); return result; } result.protocol = relative.protocol; - if (!relative.host && + if (this._isEmpty(relative.host) && !/^file:?$/.test(relative.protocol) && - !hostlessProtocol[relative.protocol]) { + relative.protocol !== 'javascript:') { var relPath = (relative.pathname || '').split('/'); while (relPath.length && !(relative.host = relPath.shift())); - if (!relative.host) relative.host = ''; - if (!relative.hostname) relative.hostname = ''; - if (relPath[0] !== '') relPath.unshift(''); - if (relPath.length < 2) relPath.unshift(''); + if (this._isEmpty(relative.host)) + relative.host = ''; + if (this._isEmpty(relative.hostname)) + relative.hostname = ''; + if (relPath[0] !== '') + relPath.unshift(''); + if (relPath.length < 2) + relPath.unshift(''); result.pathname = relPath.join('/'); } else { result.pathname = relative.pathname; } + result.search = relative.search; result.query = relative.query; result.host = relative.host || ''; @@ -525,19 +383,29 @@ Url.prototype.resolveObject = function(relative) { return result; } - var isSourceAbs = (result.pathname && result.pathname.charAt(0) === '/'), - isRelAbs = ( - relative.host || - relative.pathname && relative.pathname.charAt(0) === '/' - ), - mustEndAbs = (isRelAbs || isSourceAbs || - (result.host && relative.pathname)), - removeAllDots = mustEndAbs, - srcPath = result.pathname && result.pathname.split('/') || [], - relPath = relative.pathname && relative.pathname.split('/') || [], - psychotic = result.protocol && !slashedProtocol[result.protocol]; - - // if the url is a non-slashed url, then relative + var isSourceAbs = + (!this._isEmpty(result.pathname) && + result.pathname.charCodeAt(0) === 0x2F /*'/'*/); + var isRelAbs = ( + !this._isEmpty(relative.host) || + (!this._isEmpty(relative.pathname) && + relative.pathname.charCodeAt(0) === 0x2F /*'/'*/)); + + var mustEndAbs = (isRelAbs || + isSourceAbs || + (!this._isEmpty(result.host) && + !this._isEmpty(relative.pathname))); + + var removeAllDots = mustEndAbs; + + var srcPath = !this._isEmpty(result.pathname) && + result.pathname.split('/') || []; + var relPath = !this._isEmpty(relative.pathname) && + relative.pathname.split('/') || []; + var psychotic = !this._isEmpty(result.protocol) && + !_slashProtocols[result.protocol]; + + // If the url is a non-slashed url, then relative // links like ../.. should be able // to crawl up to the hostname, as well. This is strange. // result.protocol has already been set by now. @@ -545,135 +413,131 @@ Url.prototype.resolveObject = function(relative) { if (psychotic) { result.hostname = ''; result.port = null; - if (result.host) { - if (srcPath[0] === '') srcPath[0] = result.host; - else srcPath.unshift(result.host); + if (!this._isEmpty(result.host)) { + if (srcPath[0] === '') + srcPath[0] = result.host; + else + srcPath.unshift(result.host); } result.host = ''; - if (relative.protocol) { + if (!this._isEmpty(relative.protocol)) { relative.hostname = null; relative.port = null; if (relative.host) { - if (relPath[0] === '') relPath[0] = relative.host; - else relPath.unshift(relative.host); + if (relPath[0] === '') + relPath[0] = relative.host; + else + relPath.unshift(relative.host); } - relative.host = null; + relative.host = ''; } mustEndAbs = mustEndAbs && (relPath[0] === '' || srcPath[0] === ''); } if (isRelAbs) { // it's absolute. - result.host = (relative.host || relative.host === '') ? - relative.host : result.host; - result.hostname = (relative.hostname || relative.hostname === '') ? - relative.hostname : result.hostname; + result.host = !this._isEmpty(relative.host) ? relative.host : result.host; + result.hostname = + !this._isEmpty(relative.hostname) ? relative.hostname : result.hostname; result.search = relative.search; result.query = relative.query; srcPath = relPath; - // fall through to the dot-handling below. + // Fall through to the dot-handling below. } else if (relPath.length) { - // it's relative + // It's relative // throw away the existing file, and take the new path instead. - if (!srcPath) srcPath = []; + if (!srcPath) + srcPath = []; srcPath.pop(); srcPath = srcPath.concat(relPath); result.search = relative.search; result.query = relative.query; - } else if (relative.search !== null && relative.search !== undefined) { - // just pull out the search. + } else if (!this._isEmpty(relative.search)) { + // Just pull out the search. // like href='?foo'. // Put this after the other two cases because it simplifies the booleans if (psychotic) { result.hostname = result.host = srcPath.shift(); - //occationaly the auth can get stuck only in host - //this especialy happens in cases like - //url.resolveObject('mailto:local1@domain1', 'local2@domain2') - var authInHost = result.host && result.host.indexOf('@') > 0 ? - result.host.split('@') : false; - if (authInHost) { + // Occasionally the auth can get stuck only in host + // this especialy happens in cases like + // url.resolveObject('mailto:local1@domain1', 'local2@domain2'). + var authInHost = !this._isEmpty(result.host) && + result.host.indexOf('@') > 0 ? result.host.split('@') : false; + if (authInHost !== false) { result.auth = authInHost.shift(); result.host = result.hostname = authInHost.shift(); } } result.search = relative.search; result.query = relative.query; - //to support http.request - if (result.pathname !== null || result.search !== null) { - result.path = (result.pathname ? result.pathname : '') + - (result.search ? result.search : ''); - } + result.path = result.pathname !== null ? + result.pathname + result.search : result.search; result.href = result.format(); return result; } if (!srcPath.length) { - // no path at all. easy. + // No path at all. easy. // we've already handled the other stuff above. result.pathname = null; - //to support http.request - if (result.search) { - result.path = '/' + result.search; - } else { - result.path = null; - } + result.path = this._isEmpty(result.search) ? null : '/' + result.search; result.href = result.format(); return result; } - // if a url ENDs in . or .., then it must get a trailing slash. + // If a url ENDs in . or .., then it must get a trailing slash. // however, if it ends in anything else non-slashy, // then it must NOT get a trailing slash. var last = srcPath.slice(-1)[0]; var hasTrailingSlash = ( - (result.host || relative.host || srcPath.length > 1) && + (!this._isEmpty(result.host) || + !this._isEmpty(relative.host) || + srcPath.length > 1) && (last === '.' || last === '..') || last === ''); - // strip single dots, resolve double dots to parent dir - // if the path tries to go above the root, `up` ends up > 0 + // Strip single dots, resolve double dots to parent dir + // if the path tries to go above the root, `up` ends up > 0. var up = 0; for (var i = srcPath.length; i >= 0; i--) { last = srcPath[i]; if (last === '.') { - spliceOne(srcPath, i); + srcPath.splice(i, 1); } else if (last === '..') { - spliceOne(srcPath, i); + srcPath.splice(i, 1); up++; } else if (up) { - spliceOne(srcPath, i); + srcPath.splice(i, 1); up--; } } - // if the path is allowed to go above the root, restore leading ..s + // If the path is allowed to go above the root, restore leading ..s. if (!mustEndAbs && !removeAllDots) { - for (; up--; up) { + for (; up--; up) srcPath.unshift('..'); - } } if (mustEndAbs && srcPath[0] !== '' && - (!srcPath[0] || srcPath[0].charAt(0) !== '/')) { + (!srcPath[0] || srcPath[0].charCodeAt(0) !== 0x2F /*'/'*/)) { srcPath.unshift(''); } - if (hasTrailingSlash && (srcPath.join('/').substr(-1) !== '/')) { + if (hasTrailingSlash && (srcPath.join('/').substr(-1) !== '/')) srcPath.push(''); - } var isAbsolute = srcPath[0] === '' || - (srcPath[0] && srcPath[0].charAt(0) === '/'); + (srcPath[0] && srcPath[0].charCodeAt(0) === 0x2F /*'/'*/); // put the host back if (psychotic) { result.hostname = result.host = isAbsolute ? '' : - srcPath.length ? srcPath.shift() : ''; - //occationaly the auth can get stuck only in host - //this especialy happens in cases like - //url.resolveObject('mailto:local1@domain1', 'local2@domain2') - var authInHost = result.host && result.host.indexOf('@') > 0 ? - result.host.split('@') : false; - if (authInHost) { + srcPath.length ? srcPath.shift() : ''; + // Occasionally the auth can get stuck only in host + // this especialy happens in cases like + // url.resolveObject('mailto:local1@domain1', 'local2@domain2'). + var authInHost = !this._isEmpty(result.host) && + result.host.indexOf('@') > 0 ? result.host.split('@') : false; + if (authInHost !== false) { result.auth = authInHost.shift(); result.host = result.hostname = authInHost.shift(); } @@ -681,44 +545,439 @@ Url.prototype.resolveObject = function(relative) { mustEndAbs = mustEndAbs || (result.host && srcPath.length); - if (mustEndAbs && !isAbsolute) { + if (mustEndAbs && !isAbsolute) srcPath.unshift(''); - } - if (!srcPath.length) { + if (srcPath.length === 0) { result.pathname = null; - result.path = null; + result.path = this._isEmpty(result.search) ? null : result.search; } else { - result.pathname = srcPath.join('/'); + var pathname = srcPath.join('/'); + result.pathname = pathname; + result.path = + this._isEmpty(result.search) ? pathname : pathname + result.search; } - //to support request.http - if (result.pathname !== null || result.search !== null) { - result.path = (result.pathname ? result.pathname : '') + - (result.search ? result.search : ''); - } result.auth = relative.auth || result.auth; result.slashes = result.slashes || relative.slashes; result.href = result.format(); return result; }; -Url.prototype.parseHost = function() { - var host = this.host; - var port = portPattern.exec(host); - if (port) { - port = port[0]; - if (port !== ':') { - this.port = port.substr(1); +Url.prototype._parseProtocol = function(str, start, end) { + var needsLowerCasing = false; + var protocolCharacters = _protocolCharacters; + + for (var i = start; i <= end; ++i) { + var ch = str.charCodeAt(i); + + if (ch === 0x3A /*':'*/) { + if (i - start === 0) + return start; + var protocol = str.slice(start, i + 1); + if (needsLowerCasing) + protocol = protocol.toLowerCase(); + this.protocol = protocol; + return i + 1; + } else if (protocolCharacters[ch] === 1) { + if (ch < 0x61 /*'a'*/) + needsLowerCasing = true; + } else { + return start; + } + + } + return start; +}; + +Url.prototype._parseAuth = function(str, start, end, decode) { + var auth = str.slice(start, end + 1); + if (decode) + auth = decodeURIComponent(auth); + this.auth = auth; +}; + +Url.prototype._parsePort = function(str, start, end) { + for (var i = start; i <= end; ++i) { + var ch = str.charCodeAt(i); + + if (!(0x30 /*'0'*/ <= ch && ch <= 0x39 /*'9'*/)) { + this._prependSlash = true; + return false; + } + } + + if (i > start) + this.port = str.slice(start, i); + return true; +}; + +Url.prototype._parseHost = function(str, + trimmedStart, + start, + end, + slashesDenoteHost) { + var hostEndingCharacters = _hostEndingCharacters; + var first = str.charCodeAt(start); + var second = str.charCodeAt(start + 1); + if ((first === 0x2F /*'/'*/ || first === 0x5C /*'\'*/) && + (second === 0x2F /*'/'*/ || second === 0x5C /*'\'*/)) { + this.slashes = true; + + // The string starts with // or \\. + if (start === trimmedStart) { + // The string is just '//' or '\\'. + if (end - start === 1) + return start; + // If slashes do not denote host and there is no auth, + // there is no host when the string starts with // or \\. + var hasAuth = + containsCharacter(str, + 0x40 /*'@'*/, + trimmedStart + 2, + hostEndingCharacters); + if (!hasAuth && !slashesDenoteHost) { + this.slashes = null; + return start; + } + } + // There is a host that starts after the //. + start += 2; + } + // If there is no slashes, there is no hostname if + // 1. there was no protocol at all. + else if (this.protocol === null || + // 2. there was a protocol that requires slashes + // e.g. in 'http:asd' 'asd' is not a hostname. + _slashProtocols[this.protocol] + ) { + return start; + } + + var needsLowerCasing = false; + var idna = false; + var hostNameStart = start; + var hostnameEnd = end; + var lastCh = -1; + var hostEndsAtPortEnd = true; + var portStart = -1; + var charsAfterDot = 0; + var authNeedsDecoding = false; + + var j = -1; + + // Find the last occurrence of an @-sign until hostending character is met + // also mark if decoding is needed for the auth portion. + for (var i = start; i <= end; ++i) { + var ch = str.charCodeAt(i); + if (ch === 0x40 /*'@'*/) + j = i; + else if (ch === 0x25 /*'%'*/) + authNeedsDecoding = true; + else if (hostEndingCharacters[ch] === 1) + break; + } + + // @-sign was found at index j, everything to the left from it + // is auth part. + if (j > -1) { + this._parseAuth(str, start, j - 1, authNeedsDecoding); + // Hostname starts after the last @-sign- + start = hostNameStart = j + 1; + } + + var hostEndingCharactersNoPrependSlash = _hostEndingCharactersNoPrependSlash; + // Host name is starting with a [. + if (str.charCodeAt(start) === 0x5B /*'['*/) { + for (var i = start + 1; i <= end; ++i) { + var ch = str.charCodeAt(i); + + // Assume valid IP6 is between the brackets. + if (ch === 0x5D /*']'*/) { + var ip6End = i; + for (i = i + 1; i <= end; ++i) { + var ch = str.charCodeAt(i); + if (ch === 0x3A /*':'*/) + portStart = i + 1; + else if (hostEndingCharacters[ch] === 1 || + hostEndingCharactersNoPrependSlash[ch] === 1) + break; + } + + var portEnd = i - 1; + hostnameEnd = portStart !== -1 ? portStart - 1 : ip6End + 1; + + var isInvalidHost = portStart !== -1 && ip6End + 1 !== portStart - 1; + var hostname = + isInvalidHost ? str.slice(start, hostnameEnd).toLowerCase() : + str.slice(start + 1, hostnameEnd - 1).toLowerCase(); + + if (portStart !== -1) + hostEndsAtPortEnd = this._parsePort(str, portStart, portEnd); + + this.hostname = hostname; + + if (!isInvalidHost) + hostname = '[' + hostname + ']'; + + this.host = + this._isEmpty(this.port) ? hostname : hostname + ':' + this.port; + this.pathname = '/'; + return (hostEndsAtPortEnd ? portEnd + 1 : portStart - 1); + } } - host = host.substr(0, host.length - port.length); + // Empty hostname, [ starts a path. + return start; } - if (host) this.hostname = host; + + for (var i = start; i <= end; ++i) { + if (charsAfterDot > 62) { + this.hostname = this.host = str.slice(start, i); + return i; + } + var ch = str.charCodeAt(i); + + if (ch === 0x3A /*':'*/) { + portStart = i + 1; + } else if (ch < 0x61 /*'a'*/) { + if (ch === 0x2E /*'.'*/) { + // TODO(petkaantonov) This error is originally ignored: + // if (lastCh === 0x2E /*'.'*/ || lastCh === -1) { + // this.hostname = this.host = ''; + // return start; + // } + charsAfterDot = -1; + } else if (0x41 /*'A'*/ <= ch && ch <= 0x5A /*'Z'*/) { + needsLowerCasing = true; + } + // Valid characters other than ASCII letters -, _, +, 0-9. + else if (!(ch === 0x2D /*'-'*/ || + ch === 0x5F /*'_'*/ || + ch === 0x2B /*'+'*/ || + (0x30 /*'0'*/ <= ch && ch <= 0x39 /*'9'*/))) { + if (hostEndingCharacters[ch] === 0 && + hostEndingCharactersNoPrependSlash[ch] === 0) + this._prependSlash = true; + hostnameEnd = i - 1; + break; + } + } else if (ch >= 0x7B /*'{'*/) { + if (ch <= 0x7E /*'~'*/) { + if (hostEndingCharactersNoPrependSlash[ch] === 0) + this._prependSlash = true; + hostnameEnd = i - 1; + break; + } + idna = true; + needsLowerCasing = true; + } + lastCh = ch; + charsAfterDot++; + } + + var portEnd = i - 1; + hostnameEnd = portStart !== -1 ? portStart - 2 : hostnameEnd; + + if (portStart !== -1) + hostEndsAtPortEnd = this._parsePort(str, portStart, portEnd); + + // TODO(petkaantonov) This error is originally ignored: + // if (lastCh === 0x2E /*'.'*/) + // hostnameEnd-- + + if (hostnameEnd + 1 !== start && + hostnameEnd - hostNameStart <= 256) { + var hostname = str.slice(hostNameStart, hostnameEnd + 1); + if (needsLowerCasing) + hostname = hostname.toLowerCase(); + if (idna) + hostname = punycode.toASCII(hostname); + + this.hostname = hostname; + this.host = + this._isEmpty(this.port) ? hostname : hostname + ':' + this.port; + } + + return (hostEndsAtPortEnd ? portEnd + 1 : portStart - 1); }; -// About 1.5x faster than the two-arg version of Array#splice(). -function spliceOne(list, index) { - for (var i = index, k = i + 1, n = list.length; k < n; i += 1, k += 1) - list[i] = list[k]; - list.pop(); +Url.prototype._parsePath = function(str, start, end) { + var pathStart = start; + var pathEnd = end; + var escape = false; + var autoEscapeCharacters = _autoEscapeCharacters; + + for (var i = start; i <= end; ++i) { + var ch = str.charCodeAt(i); + + if (ch === 0x23 /*'#'*/) { + this._parseHash(str, i, end); + pathEnd = i - 1; + break; + } else if (ch === 0x3F /*'?'*/) { + this._parseQuery(str, i, end); + pathEnd = i - 1; + break; + } else if (!escape && autoEscapeCharacters[ch] === 1) { + escape = true; + } + } + + if (pathStart > pathEnd) { + this.pathname = '/'; + return; + } + + var path; + if (escape) + path = getComponentEscaped(str, pathStart, pathEnd, false); + else + path = str.slice(pathStart, pathEnd + 1); + + this.pathname = (this._prependSlash ? '/' + path : path); +}; + +Url.prototype._parseQuery = function(str, start, end) { + var queryStart = start; + var queryEnd = end; + var escape = false; + var autoEscapeCharacters = _autoEscapeCharacters; + + for (var i = start; i <= end; ++i) { + var ch = str.charCodeAt(i); + + if (ch === 0x23 /*'#'*/) { + this._parseHash(str, i, end); + queryEnd = i - 1; + break; + } else if (!escape && autoEscapeCharacters[ch] === 1) { + escape = true; + } + } + + if (queryStart > queryEnd) { + this.search = ''; + return; + } + + var query = escape ? + getComponentEscaped(str, queryStart, queryEnd, true) : + str.slice(queryStart, queryEnd + 1); + + this.search = query; +}; + +Url.prototype._parseHash = function(str, start, end) { + if (start > end) { + this.hash = ''; + return; + } + this.hash = getComponentEscaped(str, start, end, true); +}; + +// This must be a method because this getting inlined is crucial. +Url.prototype._isEmpty = function(value) { + return value === null || value === '' || value === undefined; +}; + + +// Search `char1` (integer code for a character) in `string` +// starting from `fromIndex` and ending at `string.length - 1` +// or when a stop character is found. +function containsCharacter(string, char1, fromIndex, stopCharacterTable) { + var len = string.length; + for (var i = fromIndex; i < len; ++i) { + var ch = string.charCodeAt(i); + + if (ch === char1) + return true; + else if (stopCharacterTable[ch] === 1) + return false; + } + return false; } + +// See if `char1` or `char2` (integer codes for characters) +// is contained in `string`. +function containsCharacter2(string, char1, char2) { + for (var i = 0, len = string.length; i < len; ++i) { + var ch = string.charCodeAt(i); + if (ch === char1 || ch === char2) + return true; + } + return false; +} + +// Makes an array of 128 uint8's which represent boolean values. +// Spec is an array of ascii code points or ascii code point ranges +// ranges are expressed as [start, end]. + +// For example, to create a table with the characters +// 0x30-0x39 (decimals '0' - '9') and +// 0x7A (lowercaseletter 'z') as `true`: + +// var a = makeAsciiTable([[0x30, 0x39], 0x7A]); +// a[0x30]; //1 +// a[0x15]; //0 +// a[0x35]; //1 +function makeAsciiTable(spec) { + var ret = new Uint8Array(128); + spec.forEach(function(item) { + if (typeof item === 'number') { + ret[item] = 1; + } else { + var start = item[0]; + var end = item[1]; + for (var j = start; j <= end; ++j) { + ret[j] = 1; + } + } + }); + + return ret; +} + +function escapePathName(pathname) { + if (!containsCharacter2(pathname, 0x23 /*'#'*/, 0x3F /*'?'*/)) + return pathname; + + return pathname.replace(/[?#]/g, function(match) { + return encodeURIComponent(match); + }); +} + +function escapeSearch(search) { + if (!containsCharacter2(search, 0x23 /*'#'*/, -1)) + return search; + + return search.replace(/#/g, function(match) { + return encodeURIComponent(match); + }); +} + +function getComponentEscaped(str, start, end, isAfterQuery) { + var cur = start; + var i = start; + var ret = ''; + var autoEscapeMap = isAfterQuery ? _afterQueryAutoEscapeMap : _autoEscapeMap; + for (; i <= end; ++i) { + var ch = str.charCodeAt(i); + var escaped = autoEscapeMap[ch]; + + if (escaped !== '' && escaped !== undefined) { + if (cur < i) + ret += str.slice(cur, i); + ret += escaped; + cur = i + 1; + } + } + if (cur < i + 1) + ret += str.slice(cur, i); + return ret; +} + +// Optimize back from normalized object caused by non-identifier keys. +function FakeConstructor() {} +FakeConstructor.prototype = _slashProtocols; +/*jshint nonew: false */ +new FakeConstructor(); diff --git a/test/parallel/test-url.js b/test/parallel/test-url.js index 121d6caaf612e3..90804e998d2a19 100644 --- a/test/parallel/test-url.js +++ b/test/parallel/test-url.js @@ -862,9 +862,10 @@ for (var u in parseTests) { expected = parseTests[u]; Object.keys(actual).forEach(function (i) { - if (expected[i] === undefined && actual[i] === null) { + if (i.charAt(0) === "_") + expected[i] = actual[i]; + else if (expected[i] === undefined && actual[i] === null) expected[i] = null; - } }); assert.deepEqual(actual, expected); @@ -931,11 +932,13 @@ var parseTestsWithQueryString = { for (var u in parseTestsWithQueryString) { var actual = url.parse(u, true); var expected = parseTestsWithQueryString[u]; - for (var i in actual) { - if (actual[i] === null && expected[i] === undefined) { + + Object.keys(actual).forEach(function (i) { + if (i.charAt(0) === "_") + expected[i] = actual[i]; + else if (expected[i] === undefined && actual[i] === null) expected[i] = null; - } - } + }); assert.deepEqual(actual, expected); } @@ -1550,6 +1553,11 @@ relativeTests2.forEach(function(relativeTest) { var actual = url.resolveObject(url.parse(relativeTest[1]), relativeTest[0]), expected = url.parse(relativeTest[2]); + Object.keys(actual).forEach(function (i) { + if (i.charAt(0) === "_") + expected[i] = actual[i]; + }); + assert.deepEqual(actual, expected); var expected = relativeTest[2], From 35921390c4a70bc338532ab14cbf7d0d52a57620 Mon Sep 17 00:00:00 2001 From: Petka Antonov Date: Mon, 11 May 2015 15:44:34 +0300 Subject: [PATCH 2/3] comma spacing --- lib/url.js | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/url.js b/lib/url.js index 1b211e336133b0..28984ffac9148c 100644 --- a/lib/url.js +++ b/lib/url.js @@ -28,9 +28,9 @@ function Url() { // Reference: RFC 3986, RFC 1808, RFC 2396 const _protocolCharacters = makeAsciiTable([ - [0x61, 0x7A] /*a-z*/, - [0x41, 0x5A] /*A-Z*/, - 0x2E /*'.'*/, 0x2B /*'+'*/, 0x2D /*'-'*/ + [0x61, 0x7A]/*a-z*/, + [0x41, 0x5A]/*A-Z*/, + 0x2E/*'.'*/, 0x2B/*'+'*/, 0x2D/*'-'*/ ]); // RFC 2396: characters reserved for delimiting URLs. @@ -633,7 +633,7 @@ Url.prototype._parseHost = function(str, // there is no host when the string starts with // or \\. var hasAuth = containsCharacter(str, - 0x40 /*'@'*/, + 0x40/*'@'*/, trimmedStart + 2, hostEndingCharacters); if (!hasAuth && !slashesDenoteHost) { @@ -938,7 +938,7 @@ function makeAsciiTable(spec) { } function escapePathName(pathname) { - if (!containsCharacter2(pathname, 0x23 /*'#'*/, 0x3F /*'?'*/)) + if (!containsCharacter2(pathname, 0x23/*'#'*/, 0x3F /*'?'*/)) return pathname; return pathname.replace(/[?#]/g, function(match) { @@ -947,7 +947,7 @@ function escapePathName(pathname) { } function escapeSearch(search) { - if (!containsCharacter2(search, 0x23 /*'#'*/, -1)) + if (!containsCharacter2(search, 0x23/*'#'*/, -1)) return search; return search.replace(/#/g, function(match) { From 01d2f88de59d837872b3fd24921b8ab9fefc9788 Mon Sep 17 00:00:00 2001 From: Petka Antonov Date: Mon, 11 May 2015 16:00:01 +0300 Subject: [PATCH 3/3] test delete --- test/parallel/test-url.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/parallel/test-url.js b/test/parallel/test-url.js index 90804e998d2a19..0aa441f5bbbb4a 100644 --- a/test/parallel/test-url.js +++ b/test/parallel/test-url.js @@ -1584,3 +1584,7 @@ for (var i = 0; i < throws.length; i++) { }; assert(url.format('') === ''); assert(url.format({}) === ''); + +var uri = url.parse("https://registry.lvh.me:8661/"); +delete uri.protocol; +assert(uri.format() === '//registry.lvh.me:8661/');