diff --git a/.editorconfig b/.editorconfig index c6681ba..ba0f541 100644 --- a/.editorconfig +++ b/.editorconfig @@ -4,6 +4,7 @@ root = true [*] end_of_line = lf insert_final_newline = true +trim_trailing_whitespace = true [{*.js,package.json,.travis.yml}] charset = utf-8 diff --git a/.eslintignore b/.eslintignore index 63e7ae6..1915bd1 100644 --- a/.eslintignore +++ b/.eslintignore @@ -1 +1,4 @@ +.vscode/ +.memsearch/ +node_modules/ **/*.ts diff --git a/.eslintrc.json b/.eslintrc.json index 8761f2d..964e9ca 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -1,4 +1,5 @@ { + "root": true, "env": { "node": true, "es6": true @@ -8,6 +9,31 @@ }, "extends": "eslint:recommended", "rules": { + "arrow-spacing": [ + "error", + { + "before": true, + "after": true + } + ], + "func-call-spacing": [ + "error", + "never" + ], + "key-spacing": [ + "error", + { + "beforeColon": false, + "afterColon": true + } + ], + "keyword-spacing": [ + "error", + { + "before": true, + "after": true + } + ], "indent": [ "error", 2, @@ -19,6 +45,10 @@ "error", "unix" ], + "object-curly-spacing": [ + "error", + "always" + ], "quotes": [ "error", "single" @@ -27,6 +57,17 @@ "error", "always" ], + "space-before-blocks": [ + "error", + "always" + ], + "space-before-function-paren": [ + "error", { + "anonymous": "always", + "named": "always", + "asyncArrow": "always" + } + ], "no-cond-assign": [ 0 ] diff --git a/README.md b/README.md index 02ea6dc..b76b006 100644 --- a/README.md +++ b/README.md @@ -106,22 +106,23 @@ You can also check out this nice [working implementation](https://github.com/scr ### options - `normalize` - Set to `false` to override Feedparser's default behavior, - which is to parse feeds into an object that contains the generic properties + which is to both parse feeds into an object that contains the generic properties patterned after (although not identical to) the RSS 2.0 format, regardless - of the feed's format. 
+ of the feed's format, as well as to resolve all relative urls, including those + embedded in HTML content fields. - `addmeta` - Set to `false` to override Feedparser's default behavior, which is to add the feed's `meta` information to each article. - `feedurl` - The url (string) of the feed. FeedParser is very good at - resolving relative urls in feeds. But some feeds use relative urls without - declaring the `xml:base` attribute any place in the feed. This is perfectly - valid, but we don't know know the feed's url before we start parsing the feed - and trying to resolve those relative urls. If we discover the feed's url, we - will go back and resolve the relative urls we've already seen, but this takes - a little time (not much). If you want to be sure we never have to re-resolve - relative urls (or if FeedParser is failing to properly resolve relative urls), - you should set the `feedurl` option. Otherwise, feel free to ignore this option. + resolving relative urls in feeds, including those embedded in HTML content + fields. But some feeds use relative urls without declaring the `xml:base` + attribute any place in the feed. This is perfectly valid, but we don't know + the feed's url before we start parsing the feed and trying to resolve those + relative urls. If we discover the feed's url, we will go back and resolve the + relative urls we've already seen, but this takes a little time (not much). + If you want to be sure we can resolve all relative urls, you should set the + `feedurl` option. - `resume_saxerror` - Set to `false` to override Feedparser's default behavior, which is to silently handle them and then automatically resume parsing. 
In diff --git a/bin/feedparser.js b/bin/feedparser.js index 90d1b30..da74fc3 100755 --- a/bin/feedparser.js +++ b/bin/feedparser.js @@ -36,7 +36,7 @@ var items = []; process.stdin.pipe(new FeedParser(argv)) .on('error', console.error) - .on('readable', function() { + .on('readable', function () { var stream = this, item; while (item = stream.read()) { if (argv.group) { diff --git a/examples/complete.js b/examples/complete.js index 49c7afe..3379081 100644 --- a/examples/complete.js +++ b/examples/complete.js @@ -8,7 +8,7 @@ var fetch = require('node-fetch') , FeedParser = require(__dirname+'/..') , iconv = require('iconv-lite'); -function get(feed) { +function get (feed) { // Get a response stream fetch(feed, { 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36', 'accept': 'text/html,application/xhtml+xml' }).then(function (res) { @@ -16,7 +16,7 @@ function get(feed) { var feedparser = new FeedParser(); feedparser.on('error', done); feedparser.on('end', done); - feedparser.on('readable', function() { + feedparser.on('readable', function () { var post; while (post = this.read()) { console.log(JSON.stringify(post, ' ', 4)); @@ -45,14 +45,14 @@ function maybeTranslate (res, charset) { // If we're using iconvStream, stream will be the output of iconvStream // otherwise it will remain the output of request res = res.pipe(iconvStream); - } catch(err) { + } catch (err) { res.emit('error', err); } } return res; } -function getParams(str) { +function getParams (str) { var params = str.split(';').reduce(function (params, param) { var parts = param.split('=').map(function (part) { return part.trim(); }); if (parts.length === 2) { @@ -63,7 +63,7 @@ function getParams(str) { return params; } -function done(err) { +function done (err) { if (err) { console.log(err, err.stack); return process.exit(1); diff --git a/examples/simple.js b/examples/simple.js index fe17455..c247137 100644 --- 
a/examples/simple.js +++ b/examples/simple.js @@ -19,7 +19,7 @@ fs.createReadStream(feed) .on('meta', function (meta) { console.log('===== %s =====', meta.title); }) - .on('readable', function() { + .on('readable', function () { var stream = this, item; while (item = stream.read()) { console.log('Got article: %s', item.title || item.description); diff --git a/lib/namespaces.js b/lib/constants.js similarity index 61% rename from lib/namespaces.js rename to lib/constants.js index 58442b6..ba25068 100644 --- a/lib/namespaces.js +++ b/lib/constants.js @@ -1,9 +1,10 @@ /* - * Default namespaces - * - * Lookup by URI - */ -module.exports = { +* Default namespaces +* +* Lookup by URI +*/ +/* eslint-disable key-spacing */ +var NAMESPACES = { 'http://www.w3.org/2005/Atom' :'atom', // v1.0 'http://purl.org/atom/ns#' :'atom', // v0.3 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' :'rdf', @@ -35,3 +36,49 @@ module.exports = { 'http://www.w3.org/1999/xhtml' :'xhtml', 'http://www.w3.org/XML/1998/namespace' :'xml' }; +/* eslint-enable key-spacing */ + +var HTML_URI_ATTRS = new Set([ + 'href', + 'src', + 'uri', + 'srcset', + 'cite', + 'longdesc', + 'action', + 'background', + 'data', + 'poster' +]); + +var HTML_TAGS = new Set([ + 'a', 'abbr', 'acronym', 'address', 'applet', 'area', 'article', 'aside', 'audio', + 'b', 'base', 'basefont', 'bdi', 'bdo', 'big', 'blink', 'blockquote', 'body', 'br', 'button', + 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', + 'data', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl', 'dt', + 'em', 'embed', + 'fieldset', 'figcaption', 'figure', 'font', 'footer', 'form', 'frame', 'frameset', + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'hr', 'html', + 'i', 'iframe', 'img', 'input', 'ins', 'isindex', + 'kbd', + 'label', 'legend', 'li', 'link', 'listing', + 'main', 'map', 'mark', 'marquee', 'menu', 'menuitem', 'meta', 'meter', 'multicol', + 'nav', 'nextid', 'nobr', 'noembed', 'noframes', 
'noscript', + 'object', 'ol', 'optgroup', 'option', 'output', + 'p', 'param', 'picture', 'plaintext', 'pre', 'progress', + 'q', + 'rb', 'rp', 'rt', 'rtc', 'ruby', + 's', 'samp', 'script', 'section', 'select', 'slot', 'small', 'source', 'spacer', 'span', 'strike', 'strong', 'style', 'sub', 'summary', 'sup', + 'table', 'tbody', 'td', 'template', 'textarea', 'tfoot', 'th', + 'thead', 'time', 'title', 'tr', 'track', 'tt', + 'u', 'ul', + 'var', 'video', + 'wbr', + 'xmp' +]); + +module.exports = { + NAMESPACES, + HTML_URI_ATTRS, + HTML_TAGS +}; diff --git a/lib/feedparser/index.js b/lib/feedparser.js similarity index 85% rename from lib/feedparser/index.js rename to lib/feedparser.js index 94840fb..9be6fb8 100644 --- a/lib/feedparser/index.js +++ b/lib/feedparser.js @@ -9,12 +9,18 @@ /** * Module dependencies. */ -var sax = require('sax') - , addressparser = require('addressparser') - , indexOfObject = require('array-indexofobject') - , util = require('util') - , TransformStream = require('readable-stream').Transform - , _ = require('../utils'); +const sax = require('sax'); +const addressparser = require('addressparser'); +const indexOfObject = require('array-indexofobject'); +const { inherits } = require('util'); +const { Transform: TransformStream } = require('readable-stream'); +const { HTML_URI_ATTRS } = require('./constants'); +const _ = { + ...require('./utils'), + has: require('lodash.has'), + assign: require('lodash.assign'), + uniq: require('lodash.uniq') +}; /** * FeedParser constructor. 
@@ -56,7 +62,7 @@ var sax = require('sax') * - categories {Array} * * @this {FeedParserInstance} - * @param {import('../../index').Options} [options] + * @param {import('../index').Options} [options] */ function FeedParser (options) { if (!(this instanceof FeedParser)) return new FeedParser(options); @@ -81,19 +87,19 @@ function FeedParser (options) { // @ts-expect-error sax.MAX_BUFFER_LENGTH = 16 * 1024 * 1024; // 16M versus the 64K default } - if (this.options.feedurl) this.xmlbase.unshift({ '#name': 'xml', '#': this.options.feedurl}); + if (this.options.feedurl) this.xmlbase.unshift({ '#name': 'xml', '#': this.options.feedurl }); // See https://github.com/isaacs/sax-js for more info - this.stream = sax.createStream(this.options.strict /* strict mode - no by default */, {lowercase: true, xmlns: true }); + this.stream = sax.createStream(this.options.strict /* strict mode - no by default */, { lowercase: true, xmlns: true }); this.stream.on('error', this.handleSaxError.bind(this)); this.stream.on('processinginstruction', this.handleProcessingInstruction.bind(this)); this.stream.on('opentag', this.handleOpenTag.bind(this)); - this.stream.on('closetag',this.handleCloseTag.bind(this)); + this.stream.on('closetag', this.handleCloseTag.bind(this)); this.stream.on('text', this.handleText.bind(this)); this.stream.on('cdata', this.handleText.bind(this)); this.stream.on('end', this.handleEnd.bind(this)); } -util.inherits(FeedParser, TransformStream); +inherits(FeedParser, TransformStream); /* * Initializes the SAX stream @@ -101,7 +107,7 @@ util.inherits(FeedParser, TransformStream); * Initializes the class-variables */ /** @this {FeedParserInstance} */ -FeedParser.prototype.init = function (){ +FeedParser.prototype.init = function () { this.meta = { '#ns': [], '@': [], @@ -119,7 +125,7 @@ FeedParser.prototype.init = function (){ }; /** @this {FeedParserInstance} */ -FeedParser.prototype.handleEnd = function (){ +FeedParser.prototype.handleEnd = function () { // We made it 
to the end without throwing, but let's make sure we were actually // parsing a feed if (!(this.meta && this.meta['#type'])) { @@ -174,7 +180,7 @@ FeedParser.prototype.handleProcessingInstruction = function (node) { * @this {FeedParserInstance} * @param {import('sax').QualifiedTag} node */ -FeedParser.prototype.handleOpenTag = function (node){ +FeedParser.prototype.handleOpenTag = function (node) { var n = {}; n['#name'] = node.name; // Avoid namespace collissions later... n['#prefix'] = node.prefix; // The current ns prefix @@ -189,23 +195,23 @@ FeedParser.prototype.handleOpenTag = function (node){ if (this.in_xhtml && this.xhtml['#name'] != n['#name']) { // We are in an xhtml node // This builds the opening tag, e.g.,
- this.xhtml['#'] += '<'+n['#name']; - Object.keys(n['@']).forEach(function(name){ - this.xhtml['#'] += ' '+ name +'="'+ n['@'][name] + '"'; - }, this); + this.xhtml['#'] += '<' + n['#name']; + Object.keys(n['@']).forEach((name) => { + this.xhtml['#'] += ' ' + name + '="' + n['@'][name] + '"'; + }); this.xhtml['#'] += '>'; - } else if ( this.stack.length === 0 && - (n['#name'] === 'rss' || - (n['#local'] === 'rdf' && _.nslookup([n['#uri']], 'rdf')) || - (n['#local'] === 'feed'&& _.nslookup([n['#uri']], 'atom')) ) ) { - Object.keys(n['@']).forEach(function(name) { + } else if (this.stack.length === 0 && + (n['#name'] === 'rss' || + (n['#local'] === 'rdf' && _.nslookup([n['#uri']], 'rdf')) || + (n['#local'] === 'feed' && _.nslookup([n['#uri']], 'atom')))) { + Object.keys(n['@']).forEach((name) => { var o = {}; if (name != 'version') { o[name] = n['@'][name]; this.meta['@'].push(o); } - }, this); - switch(n['#local']) { + }); + switch (n['#local']) { case 'rss': this.meta['#type'] = 'rss'; this.meta['#version'] = n['@']['version']; @@ -224,14 +230,15 @@ FeedParser.prototype.handleOpenTag = function (node){ }; /** @this {FeedParserInstance} */ -FeedParser.prototype.handleCloseTag = function (el){ +FeedParser.prototype.handleCloseTag = function (el) { var node = { '#name': el, '#prefix': '', - '#local' : '' + '#local': '' } , stdEl , item + , base , baseurl , isIllegallyNested = false ; @@ -261,7 +268,8 @@ FeedParser.prototype.handleCloseTag = function (el){ delete n['#uri']; if (this.xmlbase && this.xmlbase.length) { - baseurl = this.xmlbase[0]['#']; + base = this.xmlbase[0]; + baseurl = base['#']; } var mayHaveResolvableUrl = ( @@ -272,12 +280,18 @@ FeedParser.prototype.handleCloseTag = function (el){ node['#local'] === 'link' // include rss:link, even though it should _never_ be a relative URL ) ); + + var mayHaveEmbeddedHtml = _.mayHaveEmbeddedHtml(node['#name'], n) || _.mayHaveEmbeddedHtml(node['#local'], n); if (baseurl && mayHaveResolvableUrl) { // Apply xml:base 
to these elements as they appear // rather than leaving it to the ultimate parser n['#'] = _.resolve(baseurl, n['#']); } + if (baseurl && this.options.normalize && mayHaveEmbeddedHtml) { + n['#'] = _.resolveHtmlUris(n['#'], baseurl); + } + if (this.xmlbase.length && (el == this.xmlbase[0]['#name'])) { void this.xmlbase.shift(); } @@ -314,15 +328,15 @@ FeedParser.prototype.handleCloseTag = function (el){ } if (node['#name'] === 'item' || - node['#name'] === 'entry' || - (node['#local'] === 'item' && (node['#prefix'] === '' || node['#type'] === 'rdf')) || - (node['#local'] == 'entry' && (node['#prefix'] === '' || node['#type'] === 'atom'))) { // We have an article! + node['#name'] === 'entry' || + (node['#local'] === 'item' && (node['#prefix'] === '' || node['#type'] === 'rdf')) || + (node['#local'] == 'entry' && (node['#prefix'] === '' || node['#type'] === 'atom'))) { // We have an article! isIllegallyNested = ( - ( node['#name'] === 'item' && this.stack[0]['#name'] === 'item' ) || - ( node['#name'] === 'entry' && this.stack[0]['#name'] === 'entry' ) || - ( (node['#local'] === 'item' && (node['#prefix'] === '' || node['#type'] === 'rdf')) && this.stack[0]['#name'] === 'item' ) || - ( (node['#local'] == 'entry' && (node['#prefix'] === '' || node['#type'] === 'atom')) && this.stack[0]['#name'] === 'entry' ) + (node['#name'] === 'item' && this.stack[0]['#name'] === 'item') || + (node['#name'] === 'entry' && this.stack[0]['#name'] === 'entry') || + ((node['#local'] === 'item' && (node['#prefix'] === '' || node['#type'] === 'rdf')) && this.stack[0]['#name'] === 'item') || + ((node['#local'] == 'entry' && (node['#prefix'] === '' || node['#type'] === 'atom')) && this.stack[0]['#name'] === 'entry') ); if (isIllegallyNested) { @@ -346,10 +360,10 @@ FeedParser.prototype.handleCloseTag = function (el){ if (this.meta.author && !item.author) item.author = this.meta.author; this.push(item); } else if (!this.meta.title && // We haven't yet parsed all the metadata - (node['#name'] 
=== 'channel' || - node['#name'] === 'feed' || - (node['#local'] === 'channel' && (node['#prefix'] === '' || node['#type'] === 'rdf')) || - (node['#local'] === 'feed' && (node['#prefix'] === '' || node['#type'] === 'atom')) ) ) { + (node['#name'] === 'channel' || + node['#name'] === 'feed' || + (node['#local'] === 'channel' && (node['#prefix'] === '' || node['#type'] === 'rdf')) || + (node['#local'] === 'feed' && (node['#prefix'] === '' || node['#type'] === 'atom')))) { _.assign(this.meta, this.handleMeta(n, this.meta['#type'], this.options)); if (!this._emitted_meta) { this.emit('meta', this.meta); @@ -379,7 +393,7 @@ FeedParser.prototype.handleCloseTag = function (el){ * @this {FeedParserInstance} * @param {string} text */ -FeedParser.prototype.handleText = function (text){ +FeedParser.prototype.handleText = function (text) { if (this.in_xhtml) { this.xhtml['#'] += text; } else { @@ -419,7 +433,7 @@ FeedParser.prototype.handleAttributes = function handleAttributes (attrs, el) { basepath = this.xmlbase[0]['#']; } - Object.keys(attrs).forEach(/** @this {FeedParserInstance} */ function(key){ + Object.keys(attrs).forEach(/** @type (key: string) => void */ (key) => { var attr = attrs[key] , ns = {} , prefix = '' @@ -434,12 +448,12 @@ FeedParser.prototype.handleAttributes = function handleAttributes (attrs, el) { // If the feed is using a non-default prefix, we'll use it, too // But we force the use of the 'xml' prefix if (attr.uri && attr.prefix && !_.nslookup(attr.uri, attr.prefix) || _.nslookup(attr.uri, 'xml')) { - prefix = ( _.nsprefix(attr.uri) || attr.prefix ) + ( attr.local ? ':' : '' ); + prefix = (_.nsprefix(attr.uri) || attr.prefix) + (attr.local ? 
':' : ''); } - if (basepath && (attr.local == 'href' || attr.local == 'src' || attr.local == 'uri')) { + if (basepath && HTML_URI_ATTRS.has(attr.local)) { // Apply xml:base to these elements as they appear // rather than leaving it to the ultimate parser - attr.value = _.resolve(basepath, attr.value); + attr.value = _.resolveHtmlAttributeValue(basepath, attr.local, attr.value); } else if (attr.local === 'base' && _.nslookup(attr.uri, 'xml')) { // Keep track of the xml:base for the current node if (basepath) { @@ -448,22 +462,22 @@ FeedParser.prototype.handleAttributes = function handleAttributes (attrs, el) { // Per RFC 3986 §4.4, an empty or "#"-only xml:base is a same-document reference. // It does not change the effective base URI, so skip pushing it. if (attr.value && !/^#/.test(attr.value)) { - this.xmlbase.unshift({ '#name': el, '#': attr.value}); + this.xmlbase.unshift({ '#name': el, '#': attr.value }); } } else if (attr.name === 'type' && attr.value === 'xhtml') { this.in_xhtml = true; - this.xhtml = {'#name': el, '#': ''}; + this.xhtml = { '#name': el, '#': '' }; } simplifiedAttributes[prefix + attr.local] = attr.value ? 
attr.value.trim() : ''; - }, this); + }); return simplifiedAttributes; }; /** * @this {FeedParserInstance} * @param {ParsedNode} node - * @param {import('../../index').Type} type - * @param {import('../../index').Options} options + * @param {import('../index').Type} type + * @param {import('../index').Options} options * @returns {Object} */ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) { @@ -474,7 +488,7 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) { ; if (normalize) { - ['title','description','date', 'pubdate', 'pubDate','link', 'xmlurl', 'xmlUrl','author','language','favicon','copyright','generator'].forEach(function (property){ + ['title', 'description', 'date', 'pubdate', 'pubDate', 'link', 'xmlurl', 'xmlUrl', 'author', 'language', 'favicon', 'copyright', 'generator'].forEach(function (property) { meta[property] = null; }); meta.cloud = {}; @@ -482,24 +496,24 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) { meta.categories = []; } - Object.keys(node).forEach(function(name){ + Object.keys(node).forEach((name) => { var el = node[name]; if (normalize) { - switch(name){ - case('title'): + switch (name) { + case ('title'): meta.title = _.get(el); break; - case('description'): - case('subtitle'): + case ('description'): + case ('subtitle'): meta.description = _.get(el); break; - case('pubdate'): - case('lastbuilddate'): - case('published'): - case('modified'): - case('updated'): - case('dc:date'): + case ('pubdate'): + case ('lastbuilddate'): + case ('published'): + case ('modified'): + case ('updated'): + case ('dc:date'): var date = _.get(el) ? 
new Date(_.get(el)) : null; if (!date) break; if (meta.pubdate === null || name == 'pubdate' || name == 'published') @@ -507,11 +521,11 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) { if (meta.date === null || name == 'lastbuilddate' || name == 'modified' || name == 'updated') meta.date = date; break; - case('link'): - case('atom:link'): - case('atom10:link'): + case ('link'): + case ('atom:link'): + case ('atom10:link'): if (Array.isArray(el)) { - el.forEach(function (link){ + el.forEach((link) => { if (link['@']['href']) { // Atom if (_.get(link['@'], 'rel')) { if (link['@']['rel'] == 'alternate') { @@ -538,13 +552,13 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) { meta.link = _.get(link); } if (_.isAbsoluteUrl(meta.link) && this.xmlbase && this.xmlbase.length === 0) { - this.xmlbase.unshift({ '#name': 'xml', '#': meta.link}); + this.xmlbase.unshift({ '#name': 'xml', '#': meta.link }); this.stack[0] = _.reresolve(this.stack[0], meta.link); } else if (this.xmlbase && this.xmlbase.length > 0) { meta.link = _.resolve(_.get(this.xmlbase[0], '#'), meta.link); } - }, this); + }); } else { if (el['@']['href']) { // Atom if (_.get(el['@'], 'rel')) { @@ -554,7 +568,7 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) { else if (el['@']['rel'] == 'self') { meta.xmlurl = meta.xmlUrl = el['@']['href']; if (_.isAbsoluteUrl(meta.xmlurl) && this.xmlbase && this.xmlbase.length === 0) { - this.xmlbase.unshift({ '#name': 'xml', '#': meta.xmlurl}); + this.xmlbase.unshift({ '#name': 'xml', '#': meta.xmlurl }); this.stack[0] = _.reresolve(this.stack[0], meta.xmlurl); } else if (this.xmlbase && this.xmlbase.length > 0) { @@ -572,7 +586,7 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) { if (!meta.link) meta.link = _.get(el); } if (_.isAbsoluteUrl(meta.link) && this.xmlbase && this.xmlbase.length === 0) { - this.xmlbase.unshift({ '#name': 'xml', '#': meta.link}); 
+ this.xmlbase.unshift({ '#name': 'xml', '#': meta.link }); this.stack[0] = _.reresolve(this.stack[0], meta.link); } else if (this.xmlbase && this.xmlbase.length > 0) { @@ -580,9 +594,9 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) { } } break; - case('managingeditor'): - case('webmaster'): - case('author'): + case ('managingeditor'): + case ('webmaster'): + case ('author'): var author = {}; if (name == 'author') { meta.author = _.get(el.name) || _.get(el.email) || _.get(el.uri); @@ -598,7 +612,7 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) { } } break; - case('cloud'): + case ('cloud'): // I can't believe someone actually would put two cloud elements in their channel // but it happened // Nevertheless, there can be only one @@ -621,11 +635,11 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) { } meta.cloud.type = 'rsscloud'; break; - case('language'): + case ('language'): meta.language = _.get(el); break; - case('image'): - case('logo'): + case ('image'): + case ('logo'): if (el.url) meta.image.url = _.get(el.url); if (el.title) @@ -633,46 +647,46 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) { if (!meta.image.url && _.get(el)) meta.image.url = _.get(el); break; - case('icon'): + case ('icon'): meta.favicon = _.get(el); break; - case('copyright'): - case('rights'): - case('dc:rights'): + case ('copyright'): + case ('rights'): + case ('dc:rights'): meta.copyright = _.get(el); break; - case('generator'): + case ('generator'): meta.generator = _.get(el); if (_.get(el['@'], 'version')) meta.generator += (meta.generator ? ' ' : '') + 'v' + el['@'].version; if (_.get(el['@'], 'uri')) meta.generator += meta.generator ? 
' (' + el['@'].uri + ')' : el['@'].uri; break; - case('category'): - case('dc:subject'): - case('itunes:category'): - case('media:category'): + case ('category'): + case ('dc:subject'): + case ('itunes:category'): + case ('media:category'): /* We handle all the kinds of categories within the switch loop because meta.categories - * is an array, unlike the other properties, and therefore can handle multiple values - */ + * is an array, unlike the other properties, and therefore can handle multiple values + */ var _category = '' , _categories = [] - ; + ; if (Array.isArray(el)) { - el.forEach(function (category){ + el.forEach(function (category) { var _categoryValue; if ('category' == name && 'atom' == type) { if (category['@'] && (_categoryValue = _.safeTrim(_.get(category['@'], 'term')))) { meta.categories.push(_categoryValue); } } - else if ('category' == name && 'rss' == type){ + else if ('category' == name && 'rss' == type) { if ((_categoryValue = _.safeTrim(_.get(category)))) { meta.categories.push(_categoryValue); } } else if ('dc:subject' == name && (_categoryValue = _.safeTrim(_.get(category)))) { - _categories = _categoryValue.split(' ').map(function (cat){ return cat.trim(); }); + _categories = _categoryValue.split(' ').map(function (cat) { return cat.trim(); }); if (_categories.length) { meta.categories = meta.categories.concat(_categories); } @@ -681,7 +695,7 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) { if (category['@'] && _.safeTrim(_.get(category['@'], 'text'))) _category = _.safeTrim(_.get(category['@'], 'text')); if (category[name]) { if (Array.isArray(category[name])) { - category[name].forEach(function (subcategory){ + category[name].forEach(function (subcategory) { var _subcategoryValue; if (subcategory['@'] && (_subcategoryValue = _.safeTrim(_.get(subcategory['@'], 'text')))) { meta.categories.push(_category + '/' + _subcategoryValue); @@ -712,7 +726,7 @@ FeedParser.prototype.handleMeta = function handleMeta 
(node, type, options) { } } else if ('dc:subject' == name && (_category = _.safeTrim(_.get(el)))) { - _categories = _category.split(' ').map(function (cat){ return cat.trim(); }); + _categories = _category.split(' ').map(function (cat) { return cat.trim(); }); if (_categories.length) { meta.categories = meta.categories.concat(_categories); } @@ -721,7 +735,7 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) { if (el['@'] && _.safeTrim(_.get(el['@'], 'text'))) _category = _.safeTrim(_.get(el['@'], 'text')); if (el[name]) { if (Array.isArray(el[name])) { - el[name].forEach(function (subcategory){ + el[name].forEach(function (subcategory) { var _subcategoryValue; if (subcategory['@'] && (_subcategoryValue = _.safeTrim(_.get(subcategory['@'], 'text')))) { meta.categories.push(_category + '/' + _subcategoryValue); @@ -748,7 +762,7 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) { if (~name.indexOf(':')) meta[name] = el; else meta[type + ':' + name] = el; } - }, this); // forEach end + }); // forEach end if (normalize) { if (!meta.description) { @@ -816,11 +830,11 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) { /** * @this {FeedParserInstance} * @param {ParsedNode} node - * @param {import('../../index').Type} type - * @param {import('../../index').Options} options + * @param {import('../index').Type} type + * @param {import('../index').Options} options * @returns {Object} */ -FeedParser.prototype.handleItem = function handleItem (node, type, options){ +FeedParser.prototype.handleItem = function handleItem (node, type, options) { if (!type || !node) return {}; var item = {} @@ -828,7 +842,7 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){ ; if (normalize) { - ['title','description','summary','date','pubdate','pubDate','link','guid','author','comments', 'origlink'].forEach(function (property){ + ['title', 'description', 'summary', 'date', 'pubdate', 
'pubDate', 'link', 'guid', 'author', 'comments', 'origlink'].forEach(function (property) { item[property] = null; }); item.image = {}; @@ -837,30 +851,30 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){ item.enclosures = []; } - Object.keys(node).forEach(function(name){ + Object.keys(node).forEach((name) => { var el = node[name] , attrs = _.get(el, '@') , enclosure; if (normalize) { - switch(name){ - case('title'): + switch (name) { + case ('title'): item.title = _.get(el); break; - case('description'): - case('summary'): + case ('description'): + case ('summary'): item.summary = _.get(el); if (!item.description) item.description = _.get(el); break; - case('content'): - case('content:encoded'): + case ('content'): + case ('content:encoded'): item.description = _.get(el); break; - case('pubdate'): - case('published'): - case('issued'): - case('modified'): - case('updated'): - case('dc:date'): + case ('pubdate'): + case ('published'): + case ('issued'): + case ('modified'): + case ('updated'): + case ('dc:date'): var date = _.get(el) ? 
new Date(_.get(el)) : null; if (!date) break; if (item.pubdate === null || name == 'pubdate' || name == 'published' || name == 'issued') @@ -868,9 +882,9 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){ if (item.date === null || name == 'modified' || name == 'updated') item.date = date; break; - case('link'): + case ('link'): if (Array.isArray(el)) { - el.forEach(function (link){ + el.forEach(function (link) { if (link['@']['href']) { // Atom if (_.get(link['@'], 'rel')) { if (link['@']['rel'] == 'canonical') item.origlink = link['@']['href']; @@ -878,7 +892,7 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){ if (link['@']['rel'] == 'self' && !item.link) item.link = link['@']['href']; if (link['@']['rel'] == 'replies') item.comments = link['@']['href']; if (link['@']['rel'] == 'enclosure') { - enclosure = /** @type {import('../../index').Enclosure} */ ({}); + enclosure = /** @type {import('../index').Enclosure} */ ({}); enclosure.url = link['@']['href']; enclosure.type = _.get(link['@'], 'type'); enclosure.length = _.get(link['@'], 'length'); @@ -901,7 +915,7 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){ if (el['@']['rel'] == 'self' && !item.link) item.link = el['@']['href']; if (el['@']['rel'] == 'replies') item.comments = el['@']['href']; if (el['@']['rel'] == 'enclosure') { - enclosure = /** @type {import('../../index').Enclosure} */ ({}); + enclosure = /** @type {import('../index').Enclosure} */ ({}); enclosure.url = el['@']['href']; enclosure.type = _.get(el['@'], 'type'); enclosure.length = _.get(el['@'], 'length'); @@ -918,8 +932,8 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){ } if (!item.guid) item.guid = item.link; break; - case('guid'): - case('id'): + case ('guid'): + case ('id'): item.guid = _.get(el); // http://cyber.law.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt // If the guid element has an attribute named 
"isPermaLink" with a value @@ -933,7 +947,7 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){ item.permalink = item.guid; } break; - case('author'): + case ('author'): var author = {}; if (_.get(el)) { // RSS author = addressparser(_.get(el))[0]; @@ -950,13 +964,13 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){ item.author = _.get(el.name) || _.get(el.email) || _.get(el.uri); } break; - case('dc:creator'): + case ('dc:creator'): item.author = _.get(el); break; - case('comments'): + case ('comments'): item.comments = _.get(el); break; - case('source'): + case ('source'): if ('rss' == type) { item.source['title'] = _.get(el); item.source['url'] = _.get(el['@'], 'url'); @@ -969,7 +983,7 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){ if (item.source['url'] && !this.meta.xmlurl) { this.meta.xmlurl = this.meta.xmlUrl = item.source['url']; if (_.isAbsoluteUrl(item.source['url']) && this.xmlbase && this.xmlbase.length === 0) { - this.xmlbase.unshift({ '#name': 'xml', '#': item.source['url']}); + this.xmlbase.unshift({ '#name': 'xml', '#': item.source['url'] }); this.stack[0] = _.reresolve(this.stack[0], item.source['url']); } else if (this.xmlbase && this.xmlbase.length > 0) { @@ -977,10 +991,10 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){ } } break; - case('enclosure'): + case ('enclosure'): if (Array.isArray(el)) { - el.forEach(function (enc){ - enclosure = /** @type {import('../../index').Enclosure} */ ({}); + el.forEach(function (enc) { + enclosure = /** @type {import('../index').Enclosure} */ ({}); enclosure.url = _.get(enc['@'], 'url'); enclosure.type = _.get(enc['@'], 'type'); enclosure.length = _.get(enc['@'], 'length'); @@ -991,7 +1005,7 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){ } }); } else { - enclosure = /** @type {import('../../index').Enclosure} */ ({}); + enclosure = /** @type 
{import('../index').Enclosure} */ ({}); enclosure.url = _.get(el['@'], 'url'); enclosure.type = _.get(el['@'], 'type'); enclosure.length = _.get(el['@'], 'length'); @@ -1002,11 +1016,11 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){ } } break; - case('media:content'): + case ('media:content'): var optionalAttributes = ['bitrate', 'framerate', 'samplingrate', 'duration', 'height', 'width']; if (Array.isArray(el)) { - el.forEach(function (enc){ - enclosure = /** @type {import('../../index').Enclosure} */ ({}); + el.forEach(function (enc) { + enclosure = /** @type {import('../index').Enclosure} */ ({}); enclosure.url = _.get(enc['@'], 'url'); enclosure.type = _.get(enc['@'], 'type') || _.get(enc['@'], 'medium'); enclosure.length = _.get(enc['@'], 'filesize'); @@ -1024,7 +1038,7 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){ } }); } else { - enclosure = /** @type {import('../../index').Enclosure} */ ({}); + enclosure = /** @type {import('../index').Enclosure} */ ({}); enclosure.url = _.get(el['@'], 'url'); enclosure.type = _.get(el['@'], 'type') || _.get(el['@'], 'medium'); enclosure.length = _.get(el['@'], 'filesize'); @@ -1042,32 +1056,32 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){ } } break; - case('enc:enclosure'): // Can't find this in use for an example to debug. Only example found does not comply with the spec -- can't code THAT! + case ('enc:enclosure'): // Can't find this in use for an example to debug. Only example found does not comply with the spec -- can't code THAT! 
break; - case('category'): - case('dc:subject'): - case('itunes:category'): - case('media:category'): + case ('category'): + case ('dc:subject'): + case ('itunes:category'): + case ('media:category'): /* We handle all the kinds of categories within the switch loop because item.categories - * is an array, unlike the other properties, and therefore can handle multiple values - */ + * is an array, unlike the other properties, and therefore can handle multiple values + */ var _category = '' , _categories = [] - ; + ; if (Array.isArray(el)) { - el.forEach(function (category){ + el.forEach(function (category) { if ('category' == name && 'atom' == type) { if (category['@'] && _.get(category['@'], 'term')) item.categories.push(_.get(category['@'], 'term')); } else if ('category' == name && _.get(category) && 'rss' == type) { item.categories.push(_.get(category).trim()); } else if ('dc:subject' == name && _.get(category)) { - _categories = _.get(category).split(' ').map(function (cat){ return cat.trim(); }); + _categories = _.get(category).split(' ').map(function (cat) { return cat.trim(); }); if (_categories.length) item.categories = item.categories.concat(_categories); } else if ('itunes:category' == name) { if (category['@'] && _.get(category['@'], 'text')) _category = _.get(category['@'], 'text'); if (category[name]) { if (Array.isArray(category[name])) { - category[name].forEach(function (subcategory){ + category[name].forEach(function (subcategory) { if (subcategory['@'] && _.get(subcategory['@'], 'text')) item.categories.push(_category + '/' + _.get(subcategory['@'], 'text')); }); } else { @@ -1087,13 +1101,13 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){ } else if ('category' == name && _.get(el) && 'rss' == type) { item.categories.push(_.get(el).trim()); } else if ('dc:subject' == name && _.get(el)) { - _categories = _.get(el).split(' ').map(function (cat){ return cat.trim(); }); + _categories = _.get(el).split(' ').map(function 
(cat) { return cat.trim(); }); if (_categories.length) item.categories = item.categories.concat(_categories); } else if ('itunes:category' == name) { if (el['@'] && _.get(el['@'], 'text')) _category = _.get(el['@'], 'text'); if (el[name]) { if (Array.isArray(el[name])) { - el[name].forEach(function (subcategory){ + el[name].forEach(function (subcategory) { if (subcategory['@'] && _.get(subcategory['@'], 'text')) item.categories.push(_category + '/' + _.get(subcategory['@'], 'text')); }); } else { @@ -1108,8 +1122,8 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){ } } break; - case('feedburner:origlink'): - case('pheedo:origlink'): + case ('feedburner:origlink'): + case ('pheedo:origlink'): if (!item.origlink) { item.origlink = _.get(el); } @@ -1121,7 +1135,7 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){ if (~name.indexOf(':')) item[name] = el; else item[type + ':' + name] = el; } - }, this); // forEach end + }); // forEach end if (normalize) { if (!item.description) { @@ -1216,7 +1230,7 @@ FeedParser.prototype._flush = function (done) { * @typedef {Object} FeedParserState * Instance properties set up by FeedParser.prototype.init and the constructor. 
* @property {Object} meta - Parsed feed metadata; shape evolves during parsing - * @property {import('../../index').Options} options + * @property {import('../index').Options} options * @property {Object.} _namespaces * @property {boolean} _emitted_meta * @property {Array.} stack @@ -1235,8 +1249,8 @@ FeedParser.prototype._flush = function (done) { * @property {function(string): void} handleCloseTag * @property {function(string): void} handleText * @property {function(Object., string): Object.} handleAttributes - * @property {function(ParsedNode, import('../../index').Type, import('../../index').Options): Object} handleMeta - * @property {function(ParsedNode, import('../../index').Type, import('../../index').Options): Object} handleItem + * @property {function(ParsedNode, import('../index').Type, import('../index').Options): Object} handleMeta + * @property {function(ParsedNode, import('../index').Type, import('../index').Options): Object} handleItem */ /** @@ -1249,14 +1263,14 @@ FeedParser.prototype[Symbol.asyncIterator] = async function* () { var error = null; var ended = false; - function onReadable() { + function onReadable () { if (resolve) { resolve(); resolve = null; } } - function onEnd() { + function onEnd () { ended = true; if (resolve) { resolve(); resolve = null; } } - function onError(err) { + function onError (err) { error = err; if (resolve) { resolve(); resolve = null; } } diff --git a/lib/utils.js b/lib/utils.js index b6c8a33..7a54fac 100644 --- a/lib/utils.js +++ b/lib/utils.js @@ -1,12 +1,7 @@ -var WHATWGURL = require('url').URL - , namespaces = require('./namespaces') - ; +const { URL: WHATWGURL } = require('url'); +const { NAMESPACES, HTML_URI_ATTRS, HTML_TAGS } = require('./constants'); -exports.has = require('lodash.has'); -exports.assign = require('lodash.assign'); -exports.uniq = require('lodash.uniq'); - -var _get = require('lodash.get'); +const _get = require('lodash.get'); /** * lodash.get, but wrapped to provide a default subkey (a/k/a 
path) of "#" * and defaultValue of "null" @@ -25,7 +20,7 @@ var _get = require('lodash.get'); * @returns {*} The value of the selected key, or null if undefined. * @private */ -function get(obj, subkey, defaultValue) { +function get (obj, subkey, defaultValue) { if (!subkey) { subkey = '#'; } @@ -41,7 +36,6 @@ function get(obj, subkey, defaultValue) { return _get(obj, subkey, defaultValue); } } -exports.get = get; /** * Safely trim a value if it's a String @@ -55,7 +49,6 @@ function safeTrim (val) { } return val; } -exports.safeTrim = safeTrim; /* * Resolve a URL against a base URL, returning the original pathUrl if @@ -68,13 +61,55 @@ exports.safeTrim = safeTrim; */ function resolve (baseUrl, pathUrl) { if (!baseUrl || !pathUrl) return pathUrl; + if (typeof pathUrl !== 'string') return pathUrl; try { return new WHATWGURL(pathUrl, baseUrl).href; } catch (e) { return pathUrl; } } -exports.resolve = resolve; + +/* + * Resolve the URLs in a srcset attribute value against a base URL. + * @param {string} baseUrl + * @param {string} candidate + * @returns {string} + * @private + */ +function resolveSrcsetCandidate (baseUrl, candidate) { + var match = candidate.match(/^(\s*)(\S+)([\s\S]*)$/); + if (!match) return candidate; + return match[1] + resolve(baseUrl, match[2]) + match[3]; +} + +function resolveSrcset (baseUrl, srcset) { + if (!baseUrl || !srcset || typeof srcset !== 'string') return srcset; + + var out = ''; + var start = 0; + var depth = 0; + var i; + for (i = 0; i < srcset.length; i++) { + if (srcset[i] === '(') { + depth++; + } else if (srcset[i] === ')' && depth) { + depth--; + } else if (srcset[i] === ',' && depth === 0) { + // Do not split commas that are part of functional URL notation. 
+ out += resolveSrcsetCandidate(baseUrl, srcset.slice(start, i)) + ','; + start = i + 1; + } + } + + return out + resolveSrcsetCandidate(baseUrl, srcset.slice(start)); +} + +function resolveHtmlAttributeValue (baseUrl, name, value) { + var attrName = name.toLowerCase(); + if (attrName === 'srcset') return resolveSrcset(baseUrl, value); + if (HTML_URI_ATTRS.has(attrName)) return resolve(baseUrl, value); + return value; +} /* * Check whether a given uri is an absolute URL @@ -90,7 +125,6 @@ function isAbsoluteUrl (uri) { return false; } } -exports.isAbsoluteUrl = isAbsoluteUrl; /* * Check whether a given namespace URI matches the given default @@ -101,9 +135,8 @@ exports.isAbsoluteUrl = isAbsoluteUrl; * @private */ function nslookup (uri, def) { - return namespaces[uri] === def; + return NAMESPACES[uri] === def; } -exports.nslookup = nslookup; /* * Return the "default" namespace prefix for a given namespace URI @@ -113,9 +146,8 @@ exports.nslookup = nslookup; * @private */ function nsprefix (uri) { - return namespaces[uri]; + return NAMESPACES[uri]; } -exports.nsprefix = nsprefix; /* * Walk a node and re-resolve the urls using the given baseurl @@ -132,7 +164,7 @@ function reresolve (node, baseurl) { function resolveLevel (level) { var els = Object.keys(level); - els.forEach(function(el){ + els.forEach(function (el) { if (Array.isArray(level[el])) { // The shape of the array of element items is different than if the element is not an array. // We need it to be the same shape to enable using the same function for recursion. 
@@ -158,13 +190,16 @@ function reresolve (node, baseurl) { if ('@' in level[el]) { var attrs = Object.keys(level[el]['@']); attrs.forEach(function (name) { - if (name == 'href' || name == 'src' || name == 'uri') { + if (HTML_URI_ATTRS.has(name)) { if ('string' === typeof level[el]['@'][name]) { - level[el]['@'][name] = resolve(baseurl, level[el]['@'][name]); + level[el]['@'][name] = resolveHtmlAttributeValue(baseurl, name, level[el]['@'][name]); } } }); } + if (mayHaveEmbeddedHtml(el, level[el])) { + level[el]['#'] = resolveHtmlUris(level[el]['#'], baseurl); + } } } }); @@ -173,33 +208,6 @@ function reresolve (node, baseurl) { return resolveLevel(node); } -exports.reresolve = reresolve; - -var HTML_TAGS = new Set([ - 'a', 'abbr', 'acronym', 'address', 'applet', 'area', 'article', 'aside', 'audio', - 'b', 'base', 'basefont', 'bdi', 'bdo', 'big', 'blink', 'blockquote', 'body', 'br', 'button', - 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', - 'data', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl', 'dt', - 'em', 'embed', - 'fieldset', 'figcaption', 'figure', 'font', 'footer', 'form', 'frame', 'frameset', - 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'hr', 'html', - 'i', 'iframe', 'img', 'input', 'ins', 'isindex', - 'kbd', - 'label', 'legend', 'li', 'link', 'listing', - 'main', 'map', 'mark', 'marquee', 'menu', 'menuitem', 'meta', 'meter', 'multicol', - 'nav', 'nextid', 'nobr', 'noembed', 'noframes', 'noscript', - 'object', 'ol', 'optgroup', 'option', 'output', - 'p', 'param', 'picture', 'plaintext', 'pre', 'progress', - 'q', - 'rb', 'rp', 'rt', 'rtc', 'ruby', - 's', 'samp', 'script', 'section', 'select', 'slot', 'small', 'source', 'spacer', 'span', 'strike', 'strong', 'style', 'sub', 'summary', 'sup', - 'table', 'tbody', 'td', 'template', 'textarea', 'tfoot', 'th', - 'thead', 'time', 'title', 'tr', 'track', 'tt', - 'u', 'ul', - 'var', 'video', - 'wbr', - 'xmp' -]); /* * Scan markup starting at str[i] 
(which must be '<') and return its length @@ -258,13 +266,94 @@ function readMarkupAt (str, i) { } else if (ch === '"' || ch === '\'') { quote = ch; } else if (ch === '>') { - return { tagName: tagName, len: j + 1 - i }; + return { tagName: tagName, isClosing: isClosing, len: j + 1 - i }; } j++; } return null; // unclosed tag } +function rewriteHtmlTagUris (tag, baseUrl) { + var i = 1; + var out = ''; + var last = 0; + + if (tag[i] === '/') return tag; + while (i < tag.length && !/[\s/>]/.test(tag[i])) i++; + + while (i < tag.length) { + while (i < tag.length && /\s/.test(tag[i])) i++; + if (i >= tag.length || tag[i] === '>' || tag[i] === '/') break; + + var nameStart = i; + while (i < tag.length && !/[\s=/>]/.test(tag[i])) i++; + var name = tag.slice(nameStart, i); + + while (i < tag.length && /\s/.test(tag[i])) i++; + if (tag[i] !== '=') continue; + i++; + while (i < tag.length && /\s/.test(tag[i])) i++; + + var quote = null; + if (tag[i] === '"' || tag[i] === '\'') { + quote = tag[i]; + i++; + } + + var valueStart = i; + if (quote) { + while (i < tag.length && tag[i] !== quote) i++; + } else { + while (i < tag.length && !/[\s/>]/.test(tag[i])) i++; + } + var valueEnd = i; + var value = tag.slice(valueStart, valueEnd); + var resolved = resolveHtmlAttributeValue(baseUrl, name, value); + if (resolved !== value) { + out += tag.slice(last, valueStart) + resolved; + last = valueEnd; + } + if (quote && tag[i] === quote) i++; + } + + return out ? 
out + tag.slice(last) : tag; +} + +function resolveHtmlUris (html, baseUrl) { + if (!baseUrl || !html || typeof html !== 'string') return html; + + var out = ''; + var i = 0; + while (i < html.length) { + if (html[i] === '<') { + var markup = readMarkupAt(html, i); + if (markup && !markup.alwaysStrip && !markup.isClosing && HTML_TAGS.has(markup.tagName)) { + out += rewriteHtmlTagUris(html.slice(i, i + markup.len), baseUrl); + i += markup.len; + continue; + } + } + out += html[i]; + i++; + } + return out; +} + +function mayHaveEmbeddedHtml (name, el) { + if (!el || typeof el['#'] !== 'string') return false; + + var type = get(el['@'], 'type'); + + if (name === 'content:encoded') return true; + if (name === 'description' || name === 'summary' || name === 'tagline') return true; + + if (name === 'content' || name === 'title' || name === 'subtitle' || name === 'rights') { + return type === 'html' || type === 'xhtml'; + } + + return false; +} + /* * Strip HTML tags, leaving bare text content. * Scans the string for markup - HTML tags, comments, doctypes, and processing @@ -292,5 +381,17 @@ function stripHtml (str) { return out; } -exports.HTML_TAGS = HTML_TAGS; -exports.stripHtml = stripHtml; +module.exports = { + get, + safeTrim, + resolve, + resolveSrcset, + resolveHtmlAttributeValue, + isAbsoluteUrl, + nslookup, + nsprefix, + reresolve, + resolveHtmlUris, + mayHaveEmbeddedHtml, + stripHtml +}; diff --git a/test/api.js b/test/api.js index 4be194f..cf40825 100644 --- a/test/api.js +++ b/test/api.js @@ -54,4 +54,4 @@ describe('api', function () { }); }); -}); \ No newline at end of file +}); diff --git a/test/async-iterator.js b/test/async-iterator.js index ada8aea..e6ff940 100644 --- a/test/async-iterator.js +++ b/test/async-iterator.js @@ -4,7 +4,7 @@ var pipeline = require('util').promisify(require('stream').pipeline); describe('async iterator usage', function () { // These tests use .pipe() only to allow testing in older Node versions. 
- // In modern Node versions, you can use pipeline() with async iterators + // In modern Node versions, you can use pipeline() with async iterators // instead of .pipe(). If you use .pipe, you must add your own error handling // to avoid uncaught exceptions on errors. it('should work as an async iterator', async function () { @@ -46,7 +46,7 @@ describe('async iterator usage', function () { var items = []; var caught = null; var uncaught = null; - function onUncaught(err) { + function onUncaught (err) { uncaught = err; } process.prependOnceListener('uncaughtException', onUncaught); diff --git a/test/bad.js b/test/bad.js index b74b98e..dce6f4f 100644 --- a/test/bad.js +++ b/test/bad.js @@ -1,4 +1,4 @@ -describe('bad feeds', function(){ +describe('bad feeds', function () { describe('not a feed', function () { diff --git a/test/category.js b/test/category.js index 1d77039..94f8d09 100644 --- a/test/category.js +++ b/test/category.js @@ -1,4 +1,4 @@ -describe('categories', function(){ +describe('categories', function () { var feed = __dirname + '/feeds/category-feed.xml'; diff --git a/test/duplicate-enclosures.js b/test/duplicate-enclosures.js index 8215489..725c10b 100644 --- a/test/duplicate-enclosures.js +++ b/test/duplicate-enclosures.js @@ -1,8 +1,8 @@ -describe('duplicate enclosures', function(){ +describe('duplicate enclosures', function () { var feed = __dirname + '/feeds/mediacontent-dupes.xml'; - it('should not have duplicate enclosures from different elements', function (done){ + it('should not have duplicate enclosures from different elements', function (done) { fs.createReadStream(feed).pipe(new FeedParser()) .once('readable', function () { var stream = this; diff --git a/test/feeds/rss-with-item-scoped-html-base.xml b/test/feeds/rss-with-item-scoped-html-base.xml new file mode 100644 index 0000000..71a8ae9 --- /dev/null +++ b/test/feeds/rss-with-item-scoped-html-base.xml @@ -0,0 +1,18 @@ + + + + Item Scoped HTML Base + https://example.com/ + Fixture for 
scoped xml:base in item HTML. + + First + first + First

]]>
+
+ + Second + second + Second

]]>
+
+
+
diff --git a/test/feeds/rss-with-relative-html-urls-no-base.xml b/test/feeds/rss-with-relative-html-urls-no-base.xml new file mode 100644 index 0000000..3122b44 --- /dev/null +++ b/test/feeds/rss-with-relative-html-urls-no-base.xml @@ -0,0 +1,13 @@ + + + + Relative HTML URLs Without Base + https://example.com/feed/ + Fixture for relative links in embedded HTML without xml:base. + + Post + posts/post-1 + ReadComments

]]>
+
+
+
diff --git a/test/feeds/rss-with-relative-html-urls.xml b/test/feeds/rss-with-relative-html-urls.xml new file mode 100644 index 0000000..dabc687 --- /dev/null +++ b/test/feeds/rss-with-relative-html-urls.xml @@ -0,0 +1,13 @@ + + + + Relative HTML URLs + https://example.com/ + Fixture for relative links in embedded HTML. + + Post + posts/post-1 + ReadComments

]]>
+
+
+
diff --git a/test/illegally-nested.js b/test/illegally-nested.js index 642832c..fe465a7 100644 --- a/test/illegally-nested.js +++ b/test/illegally-nested.js @@ -1,8 +1,8 @@ -describe('illegally nested', function(){ +describe('illegally nested', function () { var feed = __dirname + '/feeds/illegally-nested.xml'; - it('should ignore illegally-nested items', function (done){ + it('should ignore illegally-nested items', function (done) { var itemCount = 0; fs.createReadStream(feed).pipe(new FeedParser()) .on('readable', function () { diff --git a/test/link.js b/test/link.js index 52d56ca..8996b3d 100644 --- a/test/link.js +++ b/test/link.js @@ -1,4 +1,4 @@ -describe('links', function(){ +describe('links', function () { var feed = __dirname + '/feeds/non-text-alternate-links.xml'; @@ -20,7 +20,7 @@ describe('links', function(){ var items = []; var sawDeprecation = false; var origEmit = process.emit; - process.emit = function(event, warning) { + process.emit = function (event, warning) { if (event === 'warning' && warning && warning.name === 'DeprecationWarning') { sawDeprecation = true; } diff --git a/test/namespaces.js b/test/namespaces.js index fbced78..3ae393c 100644 --- a/test/namespaces.js +++ b/test/namespaces.js @@ -1,6 +1,6 @@ -describe('namespaced elements', function(){ +describe('namespaced elements', function () { - describe('standard namespaces', function(){ + describe('standard namespaces', function () { var feed = __dirname + '/feeds/wapowellness.xml'; @@ -21,7 +21,7 @@ describe('namespaced elements', function(){ }); - describe('non-standard namespaces', function(){ + describe('non-standard namespaces', function () { var feed = __dirname + '/feeds/complexNamespaceFeed.xml'; @@ -41,7 +41,7 @@ describe('namespaced elements', function(){ }); - describe('nondefaultnamespace-baseline', function(){ + describe('nondefaultnamespace-baseline', function () { var feed = __dirname + '/feeds/nondefaultnamespace-baseline.atom'; @@ -62,7 +62,7 @@ describe('namespaced 
elements', function(){ }); - describe('nondefaultnamespace Test case 1', function(){ + describe('nondefaultnamespace Test case 1', function () { var feed = __dirname + '/feeds/nondefaultnamespace.atom'; @@ -83,7 +83,7 @@ describe('namespaced elements', function(){ }); - describe('nondefaultnamespace Test case 2', function(){ + describe('nondefaultnamespace Test case 2', function () { var feed = __dirname + '/feeds/nondefaultnamespace-xhtml.atom'; @@ -104,7 +104,7 @@ describe('namespaced elements', function(){ }); - describe('nondefaultnamespace Test case 3', function(){ + describe('nondefaultnamespace Test case 3', function () { var feed = __dirname + '/feeds/unknown-namespace.atom'; diff --git a/test/utils.js b/test/utils.js index bb7f69f..94c7405 100644 --- a/test/utils.js +++ b/test/utils.js @@ -1,4 +1,5 @@ var utils = require('../lib/utils'); +var { HTML_TAGS } = require('../lib/constants'); describe('utils', function () { @@ -120,6 +121,10 @@ describe('utils', function () { assert.strictEqual(utils.resolve('http://example.com/', undefined), undefined); }); + it('returns pathUrl when pathUrl is not a string', function () { + assert.strictEqual(utils.resolve('http://example.com/', 42), 42); + }); + it('returns pathUrl for tag: URIs that the URL constructor rejects', function () { var tagUri = 'tag:example.com,2003:posts/1'; assert.strictEqual(utils.resolve('http://example.com/', tagUri), tagUri); @@ -277,6 +282,18 @@ describe('utils', function () { assert.strictEqual(node.el['@'].uri, 'http://example.com/resource'); }); + it('resolves HTML URI attributes', function () { + var node = { video: { '@': { poster: '/poster.png' } } }; + utils.reresolve(node, 'http://example.com/'); + assert.strictEqual(node.video['@'].poster, 'http://example.com/poster.png'); + }); + + it('resolves srcset attributes', function () { + var node = { img: { '@': { srcset: 'small.png 480w, /large.png 2x' } } }; + utils.reresolve(node, 'http://example.com/path/'); + 
assert.strictEqual(node.img['@'].srcset, 'http://example.com/path/small.png 480w, http://example.com/large.png 2x'); + }); + it('handles array of element items', function () { var node = { link: [ @@ -443,7 +460,7 @@ describe('utils', function () { assert.strictEqual(utils.stripHtml(' 0\'>link'), 'link'); }); - utils.HTML_TAGS.forEach(function (tag) { + HTML_TAGS.forEach(function (tag) { it(`strips ${tag} HTML tag opening and closing and self-closing`, function () { assert.strictEqual(utils.stripHtml('<' + tag + '>content and <' + tag + ' />more'), 'content and more', 'expected <' + tag + '> to be stripped'); }); diff --git a/test/xmlbase.js b/test/xmlbase.js index 7b53c17..519230e 100644 --- a/test/xmlbase.js +++ b/test/xmlbase.js @@ -1,4 +1,4 @@ -describe('xmlbase', function(){ +describe('xmlbase', function () { it('should resolve relative URIs in meta elements with no root xml:base', function (done) { var feed = __dirname + '/feeds/intertwingly.atom'; @@ -167,4 +167,111 @@ describe('xmlbase', function(){ }); }); + it('should resolve relative URLs in embedded item HTML with xml:base', function (done) { + var feed = __dirname + '/feeds/rss-with-relative-html-urls.xml'; + var descriptions = []; + + fs.createReadStream(feed).pipe(new FeedParser()) + .on('readable', function () { + var item; + while ((item = this.read())) { + descriptions.push(item.description); + } + }) + .on('error', function (err) { + assert.ifError(err); + done(err); + }) + .on('end', function () { + assert.equal(descriptions[0], '

ReadComments

'); + done(); + }); + }); + + it('should resolve relative URLs in embedded item HTML with an inferred base', function (done) { + var feed = __dirname + '/feeds/rss-with-relative-html-urls-no-base.xml'; + var items = []; + + fs.createReadStream(feed).pipe(new FeedParser()) + .on('readable', function () { + var item; + while ((item = this.read())) { + items.push(item); + } + }) + .on('error', function (err) { + assert.ifError(err); + done(err); + }) + .on('end', function () { + assert.equal(items[0].link, 'https://example.com/feed/posts/post-1'); + assert.equal(items[0].description, '

ReadComments

'); + done(); + }); + }); + + it('should resolve relative URLs in embedded item HTML with feedurl option', function (done) { + var feed = __dirname + '/feeds/rss-with-relative-html-urls-no-base.xml'; + var descriptions = []; + + fs.createReadStream(feed).pipe(new FeedParser({ feedurl: 'https://example.com/feed/' })) + .on('readable', function () { + var item; + while ((item = this.read())) { + descriptions.push(item.description); + } + }) + .on('error', function (err) { + assert.ifError(err); + done(err); + }) + .on('end', function () { + assert.equal(descriptions[0], '

ReadComments

'); + done(); + }); + }); + + it('should not use item xml:base for sibling embedded item HTML', function (done) { + var feed = __dirname + '/feeds/rss-with-item-scoped-html-base.xml'; + var descriptions = []; + + fs.createReadStream(feed).pipe(new FeedParser()) + .on('readable', function () { + var item; + while ((item = this.read())) { + descriptions.push(item.description); + } + }) + .on('error', function (err) { + assert.ifError(err); + done(err); + }) + .on('end', function () { + assert.equal(descriptions[0], '

First

'); + assert.equal(descriptions[1], '

Second

'); + done(); + }); + }); + + it('should not resolve relative URLs in embedded item HTML when normalize is false', function (done) { + var feed = __dirname + '/feeds/rss-with-relative-html-urls.xml'; + var descriptions = []; + + fs.createReadStream(feed).pipe(new FeedParser({ normalize: false })) + .on('readable', function () { + var item; + while ((item = this.read())) { + descriptions.push(item['rss:description']['#']); + } + }) + .on('error', function (err) { + assert.ifError(err); + done(err); + }) + .on('end', function () { + assert.equal(descriptions[0], '

ReadComments

'); + done(); + }); + }); + });