diff --git a/.editorconfig b/.editorconfig
index c6681ba..ba0f541 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -4,6 +4,7 @@ root = true
[*]
end_of_line = lf
insert_final_newline = true
+trim_trailing_whitespace = true
[{*.js,package.json,.travis.yml}]
charset = utf-8
diff --git a/.eslintignore b/.eslintignore
index 63e7ae6..1915bd1 100644
--- a/.eslintignore
+++ b/.eslintignore
@@ -1 +1,4 @@
+.vscode/
+.memsearch/
+node_modules/
**/*.ts
diff --git a/.eslintrc.json b/.eslintrc.json
index 8761f2d..964e9ca 100644
--- a/.eslintrc.json
+++ b/.eslintrc.json
@@ -1,4 +1,5 @@
{
+ "root": true,
"env": {
"node": true,
"es6": true
@@ -8,6 +9,31 @@
},
"extends": "eslint:recommended",
"rules": {
+ "arrow-spacing": [
+ "error",
+ {
+ "before": true,
+ "after": true
+ }
+ ],
+ "func-call-spacing": [
+ "error",
+ "never"
+ ],
+ "key-spacing": [
+ "error",
+ {
+ "beforeColon": false,
+ "afterColon": true
+ }
+ ],
+ "keyword-spacing": [
+ "error",
+ {
+ "before": true,
+ "after": true
+ }
+ ],
"indent": [
"error",
2,
@@ -19,6 +45,10 @@
"error",
"unix"
],
+ "object-curly-spacing": [
+ "error",
+ "always"
+ ],
"quotes": [
"error",
"single"
@@ -27,6 +57,17 @@
"error",
"always"
],
+ "space-before-blocks": [
+ "error",
+ "always"
+ ],
+ "space-before-function-paren": [
+ "error", {
+ "anonymous": "always",
+ "named": "always",
+ "asyncArrow": "always"
+ }
+ ],
"no-cond-assign": [
0
]
diff --git a/README.md b/README.md
index 02ea6dc..b76b006 100644
--- a/README.md
+++ b/README.md
@@ -106,22 +106,23 @@ You can also check out this nice [working implementation](https://github.com/scr
### options
- `normalize` - Set to `false` to override Feedparser's default behavior,
- which is to parse feeds into an object that contains the generic properties
+ which is both to parse feeds into an object that contains the generic properties
patterned after (although not identical to) the RSS 2.0 format, regardless
- of the feed's format.
+ of the feed's format, and to resolve all relative urls, including those
+ embedded in HTML content fields.
- `addmeta` - Set to `false` to override Feedparser's default behavior, which
is to add the feed's `meta` information to each article.
- `feedurl` - The url (string) of the feed. FeedParser is very good at
- resolving relative urls in feeds. But some feeds use relative urls without
- declaring the `xml:base` attribute any place in the feed. This is perfectly
- valid, but we don't know know the feed's url before we start parsing the feed
- and trying to resolve those relative urls. If we discover the feed's url, we
- will go back and resolve the relative urls we've already seen, but this takes
- a little time (not much). If you want to be sure we never have to re-resolve
- relative urls (or if FeedParser is failing to properly resolve relative urls),
- you should set the `feedurl` option. Otherwise, feel free to ignore this option.
+ resolving relative urls in feeds, including those embedded in HTML content
+ fields. But some feeds use relative urls without declaring the `xml:base`
+ attribute any place in the feed. This is perfectly valid, but we don't know
+ the feed's url before we start parsing the feed and trying to resolve those
+ relative urls. If we discover the feed's url, we will go back and resolve the
+ relative urls we've already seen, but this takes a little time (not much).
+ If you want to be sure we can resolve all relative urls, you should set the
+ `feedurl` option.
- `resume_saxerror` - Set to `false` to override Feedparser's default behavior, which
is to silently handle them and then automatically resume parsing. In
diff --git a/bin/feedparser.js b/bin/feedparser.js
index 90d1b30..da74fc3 100755
--- a/bin/feedparser.js
+++ b/bin/feedparser.js
@@ -36,7 +36,7 @@ var items = [];
process.stdin.pipe(new FeedParser(argv))
.on('error', console.error)
- .on('readable', function() {
+ .on('readable', function () {
var stream = this, item;
while (item = stream.read()) {
if (argv.group) {
diff --git a/examples/complete.js b/examples/complete.js
index 49c7afe..3379081 100644
--- a/examples/complete.js
+++ b/examples/complete.js
@@ -8,7 +8,7 @@ var fetch = require('node-fetch')
, FeedParser = require(__dirname+'/..')
, iconv = require('iconv-lite');
-function get(feed) {
+function get (feed) {
// Get a response stream
fetch(feed, { 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36', 'accept': 'text/html,application/xhtml+xml' }).then(function (res) {
@@ -16,7 +16,7 @@ function get(feed) {
var feedparser = new FeedParser();
feedparser.on('error', done);
feedparser.on('end', done);
- feedparser.on('readable', function() {
+ feedparser.on('readable', function () {
var post;
while (post = this.read()) {
console.log(JSON.stringify(post, ' ', 4));
@@ -45,14 +45,14 @@ function maybeTranslate (res, charset) {
// If we're using iconvStream, stream will be the output of iconvStream
// otherwise it will remain the output of request
res = res.pipe(iconvStream);
- } catch(err) {
+ } catch (err) {
res.emit('error', err);
}
}
return res;
}
-function getParams(str) {
+function getParams (str) {
var params = str.split(';').reduce(function (params, param) {
var parts = param.split('=').map(function (part) { return part.trim(); });
if (parts.length === 2) {
@@ -63,7 +63,7 @@ function getParams(str) {
return params;
}
-function done(err) {
+function done (err) {
if (err) {
console.log(err, err.stack);
return process.exit(1);
diff --git a/examples/simple.js b/examples/simple.js
index fe17455..c247137 100644
--- a/examples/simple.js
+++ b/examples/simple.js
@@ -19,7 +19,7 @@ fs.createReadStream(feed)
.on('meta', function (meta) {
console.log('===== %s =====', meta.title);
})
- .on('readable', function() {
+ .on('readable', function () {
var stream = this, item;
while (item = stream.read()) {
console.log('Got article: %s', item.title || item.description);
diff --git a/lib/namespaces.js b/lib/constants.js
similarity index 61%
rename from lib/namespaces.js
rename to lib/constants.js
index 58442b6..ba25068 100644
--- a/lib/namespaces.js
+++ b/lib/constants.js
@@ -1,9 +1,10 @@
/*
- * Default namespaces
- *
- * Lookup by URI
- */
-module.exports = {
+* Default namespaces
+*
+* Lookup by URI
+*/
+/* eslint-disable key-spacing */
+var NAMESPACES = {
'http://www.w3.org/2005/Atom' :'atom', // v1.0
'http://purl.org/atom/ns#' :'atom', // v0.3
'http://www.w3.org/1999/02/22-rdf-syntax-ns#' :'rdf',
@@ -35,3 +36,49 @@ module.exports = {
'http://www.w3.org/1999/xhtml' :'xhtml',
'http://www.w3.org/XML/1998/namespace' :'xml'
};
+/* eslint-enable key-spacing */
+
+var HTML_URI_ATTRS = new Set([
+ 'href',
+ 'src',
+ 'uri',
+ 'srcset',
+ 'cite',
+ 'longdesc',
+ 'action',
+ 'background',
+ 'data',
+ 'poster'
+]);
+
+var HTML_TAGS = new Set([
+ 'a', 'abbr', 'acronym', 'address', 'applet', 'area', 'article', 'aside', 'audio',
+ 'b', 'base', 'basefont', 'bdi', 'bdo', 'big', 'blink', 'blockquote', 'body', 'br', 'button',
+ 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
+ 'data', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl', 'dt',
+ 'em', 'embed',
+ 'fieldset', 'figcaption', 'figure', 'font', 'footer', 'form', 'frame', 'frameset',
+ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'hr', 'html',
+ 'i', 'iframe', 'img', 'input', 'ins', 'isindex',
+ 'kbd',
+ 'label', 'legend', 'li', 'link', 'listing',
+ 'main', 'map', 'mark', 'marquee', 'menu', 'menuitem', 'meta', 'meter', 'multicol',
+ 'nav', 'nextid', 'nobr', 'noembed', 'noframes', 'noscript',
+ 'object', 'ol', 'optgroup', 'option', 'output',
+ 'p', 'param', 'picture', 'plaintext', 'pre', 'progress',
+ 'q',
+ 'rb', 'rp', 'rt', 'rtc', 'ruby',
+ 's', 'samp', 'script', 'section', 'select', 'slot', 'small', 'source', 'spacer', 'span', 'strike', 'strong', 'style', 'sub', 'summary', 'sup',
+ 'table', 'tbody', 'td', 'template', 'textarea', 'tfoot', 'th',
+ 'thead', 'time', 'title', 'tr', 'track', 'tt',
+ 'u', 'ul',
+ 'var', 'video',
+ 'wbr',
+ 'xmp'
+]);
+
+module.exports = {
+ NAMESPACES,
+ HTML_URI_ATTRS,
+ HTML_TAGS
+};
diff --git a/lib/feedparser/index.js b/lib/feedparser.js
similarity index 85%
rename from lib/feedparser/index.js
rename to lib/feedparser.js
index 94840fb..9be6fb8 100644
--- a/lib/feedparser/index.js
+++ b/lib/feedparser.js
@@ -9,12 +9,18 @@
/**
* Module dependencies.
*/
-var sax = require('sax')
- , addressparser = require('addressparser')
- , indexOfObject = require('array-indexofobject')
- , util = require('util')
- , TransformStream = require('readable-stream').Transform
- , _ = require('../utils');
+const sax = require('sax');
+const addressparser = require('addressparser');
+const indexOfObject = require('array-indexofobject');
+const { inherits } = require('util');
+const { Transform: TransformStream } = require('readable-stream');
+const { HTML_URI_ATTRS } = require('./constants');
+const _ = {
+ ...require('./utils'),
+ has: require('lodash.has'),
+ assign: require('lodash.assign'),
+ uniq: require('lodash.uniq')
+};
/**
* FeedParser constructor.
@@ -56,7 +62,7 @@ var sax = require('sax')
* - categories {Array}
*
* @this {FeedParserInstance}
- * @param {import('../../index').Options} [options]
+ * @param {import('../index').Options} [options]
*/
function FeedParser (options) {
if (!(this instanceof FeedParser)) return new FeedParser(options);
@@ -81,19 +87,19 @@ function FeedParser (options) {
// @ts-expect-error
sax.MAX_BUFFER_LENGTH = 16 * 1024 * 1024; // 16M versus the 64K default
}
- if (this.options.feedurl) this.xmlbase.unshift({ '#name': 'xml', '#': this.options.feedurl});
+ if (this.options.feedurl) this.xmlbase.unshift({ '#name': 'xml', '#': this.options.feedurl });
// See https://github.com/isaacs/sax-js for more info
- this.stream = sax.createStream(this.options.strict /* strict mode - no by default */, {lowercase: true, xmlns: true });
+ this.stream = sax.createStream(this.options.strict /* strict mode - no by default */, { lowercase: true, xmlns: true });
this.stream.on('error', this.handleSaxError.bind(this));
this.stream.on('processinginstruction', this.handleProcessingInstruction.bind(this));
this.stream.on('opentag', this.handleOpenTag.bind(this));
- this.stream.on('closetag',this.handleCloseTag.bind(this));
+ this.stream.on('closetag', this.handleCloseTag.bind(this));
this.stream.on('text', this.handleText.bind(this));
this.stream.on('cdata', this.handleText.bind(this));
this.stream.on('end', this.handleEnd.bind(this));
}
-util.inherits(FeedParser, TransformStream);
+inherits(FeedParser, TransformStream);
/*
* Initializes the SAX stream
@@ -101,7 +107,7 @@ util.inherits(FeedParser, TransformStream);
* Initializes the class-variables
*/
/** @this {FeedParserInstance} */
-FeedParser.prototype.init = function (){
+FeedParser.prototype.init = function () {
this.meta = {
'#ns': [],
'@': [],
@@ -119,7 +125,7 @@ FeedParser.prototype.init = function (){
};
/** @this {FeedParserInstance} */
-FeedParser.prototype.handleEnd = function (){
+FeedParser.prototype.handleEnd = function () {
// We made it to the end without throwing, but let's make sure we were actually
// parsing a feed
if (!(this.meta && this.meta['#type'])) {
@@ -174,7 +180,7 @@ FeedParser.prototype.handleProcessingInstruction = function (node) {
* @this {FeedParserInstance}
* @param {import('sax').QualifiedTag} node
*/
-FeedParser.prototype.handleOpenTag = function (node){
+FeedParser.prototype.handleOpenTag = function (node) {
var n = {};
n['#name'] = node.name; // Avoid namespace collissions later...
n['#prefix'] = node.prefix; // The current ns prefix
@@ -189,23 +195,23 @@ FeedParser.prototype.handleOpenTag = function (node){
if (this.in_xhtml && this.xhtml['#name'] != n['#name']) { // We are in an xhtml node
// This builds the opening tag, e.g.,
- this.xhtml['#'] += '<'+n['#name'];
- Object.keys(n['@']).forEach(function(name){
- this.xhtml['#'] += ' '+ name +'="'+ n['@'][name] + '"';
- }, this);
+ this.xhtml['#'] += '<' + n['#name'];
+ Object.keys(n['@']).forEach((name) => {
+ this.xhtml['#'] += ' ' + name + '="' + n['@'][name] + '"';
+ });
this.xhtml['#'] += '>';
- } else if ( this.stack.length === 0 &&
- (n['#name'] === 'rss' ||
- (n['#local'] === 'rdf' && _.nslookup([n['#uri']], 'rdf')) ||
- (n['#local'] === 'feed'&& _.nslookup([n['#uri']], 'atom')) ) ) {
- Object.keys(n['@']).forEach(function(name) {
+ } else if (this.stack.length === 0 &&
+ (n['#name'] === 'rss' ||
+ (n['#local'] === 'rdf' && _.nslookup([n['#uri']], 'rdf')) ||
+ (n['#local'] === 'feed' && _.nslookup([n['#uri']], 'atom')))) {
+ Object.keys(n['@']).forEach((name) => {
var o = {};
if (name != 'version') {
o[name] = n['@'][name];
this.meta['@'].push(o);
}
- }, this);
- switch(n['#local']) {
+ });
+ switch (n['#local']) {
case 'rss':
this.meta['#type'] = 'rss';
this.meta['#version'] = n['@']['version'];
@@ -224,14 +230,15 @@ FeedParser.prototype.handleOpenTag = function (node){
};
/** @this {FeedParserInstance} */
-FeedParser.prototype.handleCloseTag = function (el){
+FeedParser.prototype.handleCloseTag = function (el) {
var node = {
'#name': el,
'#prefix': '',
- '#local' : ''
+ '#local': ''
}
, stdEl
, item
+ , base
, baseurl
, isIllegallyNested = false
;
@@ -261,7 +268,8 @@ FeedParser.prototype.handleCloseTag = function (el){
delete n['#uri'];
if (this.xmlbase && this.xmlbase.length) {
- baseurl = this.xmlbase[0]['#'];
+ base = this.xmlbase[0];
+ baseurl = base['#'];
}
var mayHaveResolvableUrl = (
@@ -272,12 +280,18 @@ FeedParser.prototype.handleCloseTag = function (el){
node['#local'] === 'link' // include rss:link, even though it should _never_ be a relative URL
)
);
+
+ var mayHaveEmbeddedHtml = _.mayHaveEmbeddedHtml(node['#name'], n) || _.mayHaveEmbeddedHtml(node['#local'], n);
if (baseurl && mayHaveResolvableUrl) {
// Apply xml:base to these elements as they appear
// rather than leaving it to the ultimate parser
n['#'] = _.resolve(baseurl, n['#']);
}
+ if (baseurl && this.options.normalize && mayHaveEmbeddedHtml) {
+ n['#'] = _.resolveHtmlUris(n['#'], baseurl);
+ }
+
if (this.xmlbase.length && (el == this.xmlbase[0]['#name'])) {
void this.xmlbase.shift();
}
@@ -314,15 +328,15 @@ FeedParser.prototype.handleCloseTag = function (el){
}
if (node['#name'] === 'item' ||
- node['#name'] === 'entry' ||
- (node['#local'] === 'item' && (node['#prefix'] === '' || node['#type'] === 'rdf')) ||
- (node['#local'] == 'entry' && (node['#prefix'] === '' || node['#type'] === 'atom'))) { // We have an article!
+ node['#name'] === 'entry' ||
+ (node['#local'] === 'item' && (node['#prefix'] === '' || node['#type'] === 'rdf')) ||
+ (node['#local'] == 'entry' && (node['#prefix'] === '' || node['#type'] === 'atom'))) { // We have an article!
isIllegallyNested = (
- ( node['#name'] === 'item' && this.stack[0]['#name'] === 'item' ) ||
- ( node['#name'] === 'entry' && this.stack[0]['#name'] === 'entry' ) ||
- ( (node['#local'] === 'item' && (node['#prefix'] === '' || node['#type'] === 'rdf')) && this.stack[0]['#name'] === 'item' ) ||
- ( (node['#local'] == 'entry' && (node['#prefix'] === '' || node['#type'] === 'atom')) && this.stack[0]['#name'] === 'entry' )
+ (node['#name'] === 'item' && this.stack[0]['#name'] === 'item') ||
+ (node['#name'] === 'entry' && this.stack[0]['#name'] === 'entry') ||
+ ((node['#local'] === 'item' && (node['#prefix'] === '' || node['#type'] === 'rdf')) && this.stack[0]['#name'] === 'item') ||
+ ((node['#local'] == 'entry' && (node['#prefix'] === '' || node['#type'] === 'atom')) && this.stack[0]['#name'] === 'entry')
);
if (isIllegallyNested) {
@@ -346,10 +360,10 @@ FeedParser.prototype.handleCloseTag = function (el){
if (this.meta.author && !item.author) item.author = this.meta.author;
this.push(item);
} else if (!this.meta.title && // We haven't yet parsed all the metadata
- (node['#name'] === 'channel' ||
- node['#name'] === 'feed' ||
- (node['#local'] === 'channel' && (node['#prefix'] === '' || node['#type'] === 'rdf')) ||
- (node['#local'] === 'feed' && (node['#prefix'] === '' || node['#type'] === 'atom')) ) ) {
+ (node['#name'] === 'channel' ||
+ node['#name'] === 'feed' ||
+ (node['#local'] === 'channel' && (node['#prefix'] === '' || node['#type'] === 'rdf')) ||
+ (node['#local'] === 'feed' && (node['#prefix'] === '' || node['#type'] === 'atom')))) {
_.assign(this.meta, this.handleMeta(n, this.meta['#type'], this.options));
if (!this._emitted_meta) {
this.emit('meta', this.meta);
@@ -379,7 +393,7 @@ FeedParser.prototype.handleCloseTag = function (el){
* @this {FeedParserInstance}
* @param {string} text
*/
-FeedParser.prototype.handleText = function (text){
+FeedParser.prototype.handleText = function (text) {
if (this.in_xhtml) {
this.xhtml['#'] += text;
} else {
@@ -419,7 +433,7 @@ FeedParser.prototype.handleAttributes = function handleAttributes (attrs, el) {
basepath = this.xmlbase[0]['#'];
}
- Object.keys(attrs).forEach(/** @this {FeedParserInstance} */ function(key){
+ Object.keys(attrs).forEach(/** @type (key: string) => void */ (key) => {
var attr = attrs[key]
, ns = {}
, prefix = ''
@@ -434,12 +448,12 @@ FeedParser.prototype.handleAttributes = function handleAttributes (attrs, el) {
// If the feed is using a non-default prefix, we'll use it, too
// But we force the use of the 'xml' prefix
if (attr.uri && attr.prefix && !_.nslookup(attr.uri, attr.prefix) || _.nslookup(attr.uri, 'xml')) {
- prefix = ( _.nsprefix(attr.uri) || attr.prefix ) + ( attr.local ? ':' : '' );
+ prefix = (_.nsprefix(attr.uri) || attr.prefix) + (attr.local ? ':' : '');
}
- if (basepath && (attr.local == 'href' || attr.local == 'src' || attr.local == 'uri')) {
+ if (basepath && HTML_URI_ATTRS.has(attr.local)) {
// Apply xml:base to these elements as they appear
// rather than leaving it to the ultimate parser
- attr.value = _.resolve(basepath, attr.value);
+ attr.value = _.resolveHtmlAttributeValue(basepath, attr.local, attr.value);
} else if (attr.local === 'base' && _.nslookup(attr.uri, 'xml')) {
// Keep track of the xml:base for the current node
if (basepath) {
@@ -448,22 +462,22 @@ FeedParser.prototype.handleAttributes = function handleAttributes (attrs, el) {
// Per RFC 3986 §4.4, an empty or "#"-only xml:base is a same-document reference.
// It does not change the effective base URI, so skip pushing it.
if (attr.value && !/^#/.test(attr.value)) {
- this.xmlbase.unshift({ '#name': el, '#': attr.value});
+ this.xmlbase.unshift({ '#name': el, '#': attr.value });
}
} else if (attr.name === 'type' && attr.value === 'xhtml') {
this.in_xhtml = true;
- this.xhtml = {'#name': el, '#': ''};
+ this.xhtml = { '#name': el, '#': '' };
}
simplifiedAttributes[prefix + attr.local] = attr.value ? attr.value.trim() : '';
- }, this);
+ });
return simplifiedAttributes;
};
/**
* @this {FeedParserInstance}
* @param {ParsedNode} node
- * @param {import('../../index').Type} type
- * @param {import('../../index').Options} options
+ * @param {import('../index').Type} type
+ * @param {import('../index').Options} options
* @returns {Object}
*/
FeedParser.prototype.handleMeta = function handleMeta (node, type, options) {
@@ -474,7 +488,7 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) {
;
if (normalize) {
- ['title','description','date', 'pubdate', 'pubDate','link', 'xmlurl', 'xmlUrl','author','language','favicon','copyright','generator'].forEach(function (property){
+ ['title', 'description', 'date', 'pubdate', 'pubDate', 'link', 'xmlurl', 'xmlUrl', 'author', 'language', 'favicon', 'copyright', 'generator'].forEach(function (property) {
meta[property] = null;
});
meta.cloud = {};
@@ -482,24 +496,24 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) {
meta.categories = [];
}
- Object.keys(node).forEach(function(name){
+ Object.keys(node).forEach((name) => {
var el = node[name];
if (normalize) {
- switch(name){
- case('title'):
+ switch (name) {
+ case ('title'):
meta.title = _.get(el);
break;
- case('description'):
- case('subtitle'):
+ case ('description'):
+ case ('subtitle'):
meta.description = _.get(el);
break;
- case('pubdate'):
- case('lastbuilddate'):
- case('published'):
- case('modified'):
- case('updated'):
- case('dc:date'):
+ case ('pubdate'):
+ case ('lastbuilddate'):
+ case ('published'):
+ case ('modified'):
+ case ('updated'):
+ case ('dc:date'):
var date = _.get(el) ? new Date(_.get(el)) : null;
if (!date) break;
if (meta.pubdate === null || name == 'pubdate' || name == 'published')
@@ -507,11 +521,11 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) {
if (meta.date === null || name == 'lastbuilddate' || name == 'modified' || name == 'updated')
meta.date = date;
break;
- case('link'):
- case('atom:link'):
- case('atom10:link'):
+ case ('link'):
+ case ('atom:link'):
+ case ('atom10:link'):
if (Array.isArray(el)) {
- el.forEach(function (link){
+ el.forEach((link) => {
if (link['@']['href']) { // Atom
if (_.get(link['@'], 'rel')) {
if (link['@']['rel'] == 'alternate') {
@@ -538,13 +552,13 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) {
meta.link = _.get(link);
}
if (_.isAbsoluteUrl(meta.link) && this.xmlbase && this.xmlbase.length === 0) {
- this.xmlbase.unshift({ '#name': 'xml', '#': meta.link});
+ this.xmlbase.unshift({ '#name': 'xml', '#': meta.link });
this.stack[0] = _.reresolve(this.stack[0], meta.link);
}
else if (this.xmlbase && this.xmlbase.length > 0) {
meta.link = _.resolve(_.get(this.xmlbase[0], '#'), meta.link);
}
- }, this);
+ });
} else {
if (el['@']['href']) { // Atom
if (_.get(el['@'], 'rel')) {
@@ -554,7 +568,7 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) {
else if (el['@']['rel'] == 'self') {
meta.xmlurl = meta.xmlUrl = el['@']['href'];
if (_.isAbsoluteUrl(meta.xmlurl) && this.xmlbase && this.xmlbase.length === 0) {
- this.xmlbase.unshift({ '#name': 'xml', '#': meta.xmlurl});
+ this.xmlbase.unshift({ '#name': 'xml', '#': meta.xmlurl });
this.stack[0] = _.reresolve(this.stack[0], meta.xmlurl);
}
else if (this.xmlbase && this.xmlbase.length > 0) {
@@ -572,7 +586,7 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) {
if (!meta.link) meta.link = _.get(el);
}
if (_.isAbsoluteUrl(meta.link) && this.xmlbase && this.xmlbase.length === 0) {
- this.xmlbase.unshift({ '#name': 'xml', '#': meta.link});
+ this.xmlbase.unshift({ '#name': 'xml', '#': meta.link });
this.stack[0] = _.reresolve(this.stack[0], meta.link);
}
else if (this.xmlbase && this.xmlbase.length > 0) {
@@ -580,9 +594,9 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) {
}
}
break;
- case('managingeditor'):
- case('webmaster'):
- case('author'):
+ case ('managingeditor'):
+ case ('webmaster'):
+ case ('author'):
var author = {};
if (name == 'author') {
meta.author = _.get(el.name) || _.get(el.email) || _.get(el.uri);
@@ -598,7 +612,7 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) {
}
}
break;
- case('cloud'):
+ case ('cloud'):
// I can't believe someone actually would put two cloud elements in their channel
// but it happened
// Nevertheless, there can be only one
@@ -621,11 +635,11 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) {
}
meta.cloud.type = 'rsscloud';
break;
- case('language'):
+ case ('language'):
meta.language = _.get(el);
break;
- case('image'):
- case('logo'):
+ case ('image'):
+ case ('logo'):
if (el.url)
meta.image.url = _.get(el.url);
if (el.title)
@@ -633,46 +647,46 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) {
if (!meta.image.url && _.get(el))
meta.image.url = _.get(el);
break;
- case('icon'):
+ case ('icon'):
meta.favicon = _.get(el);
break;
- case('copyright'):
- case('rights'):
- case('dc:rights'):
+ case ('copyright'):
+ case ('rights'):
+ case ('dc:rights'):
meta.copyright = _.get(el);
break;
- case('generator'):
+ case ('generator'):
meta.generator = _.get(el);
if (_.get(el['@'], 'version'))
meta.generator += (meta.generator ? ' ' : '') + 'v' + el['@'].version;
if (_.get(el['@'], 'uri'))
meta.generator += meta.generator ? ' (' + el['@'].uri + ')' : el['@'].uri;
break;
- case('category'):
- case('dc:subject'):
- case('itunes:category'):
- case('media:category'):
+ case ('category'):
+ case ('dc:subject'):
+ case ('itunes:category'):
+ case ('media:category'):
/* We handle all the kinds of categories within the switch loop because meta.categories
- * is an array, unlike the other properties, and therefore can handle multiple values
- */
+ * is an array, unlike the other properties, and therefore can handle multiple values
+ */
var _category = ''
, _categories = []
- ;
+ ;
if (Array.isArray(el)) {
- el.forEach(function (category){
+ el.forEach(function (category) {
var _categoryValue;
if ('category' == name && 'atom' == type) {
if (category['@'] && (_categoryValue = _.safeTrim(_.get(category['@'], 'term')))) {
meta.categories.push(_categoryValue);
}
}
- else if ('category' == name && 'rss' == type){
+ else if ('category' == name && 'rss' == type) {
if ((_categoryValue = _.safeTrim(_.get(category)))) {
meta.categories.push(_categoryValue);
}
}
else if ('dc:subject' == name && (_categoryValue = _.safeTrim(_.get(category)))) {
- _categories = _categoryValue.split(' ').map(function (cat){ return cat.trim(); });
+ _categories = _categoryValue.split(' ').map(function (cat) { return cat.trim(); });
if (_categories.length) {
meta.categories = meta.categories.concat(_categories);
}
@@ -681,7 +695,7 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) {
if (category['@'] && _.safeTrim(_.get(category['@'], 'text'))) _category = _.safeTrim(_.get(category['@'], 'text'));
if (category[name]) {
if (Array.isArray(category[name])) {
- category[name].forEach(function (subcategory){
+ category[name].forEach(function (subcategory) {
var _subcategoryValue;
if (subcategory['@'] && (_subcategoryValue = _.safeTrim(_.get(subcategory['@'], 'text')))) {
meta.categories.push(_category + '/' + _subcategoryValue);
@@ -712,7 +726,7 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) {
}
}
else if ('dc:subject' == name && (_category = _.safeTrim(_.get(el)))) {
- _categories = _category.split(' ').map(function (cat){ return cat.trim(); });
+ _categories = _category.split(' ').map(function (cat) { return cat.trim(); });
if (_categories.length) {
meta.categories = meta.categories.concat(_categories);
}
@@ -721,7 +735,7 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) {
if (el['@'] && _.safeTrim(_.get(el['@'], 'text'))) _category = _.safeTrim(_.get(el['@'], 'text'));
if (el[name]) {
if (Array.isArray(el[name])) {
- el[name].forEach(function (subcategory){
+ el[name].forEach(function (subcategory) {
var _subcategoryValue;
if (subcategory['@'] && (_subcategoryValue = _.safeTrim(_.get(subcategory['@'], 'text')))) {
meta.categories.push(_category + '/' + _subcategoryValue);
@@ -748,7 +762,7 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) {
if (~name.indexOf(':')) meta[name] = el;
else meta[type + ':' + name] = el;
}
- }, this); // forEach end
+ }); // forEach end
if (normalize) {
if (!meta.description) {
@@ -816,11 +830,11 @@ FeedParser.prototype.handleMeta = function handleMeta (node, type, options) {
/**
* @this {FeedParserInstance}
* @param {ParsedNode} node
- * @param {import('../../index').Type} type
- * @param {import('../../index').Options} options
+ * @param {import('../index').Type} type
+ * @param {import('../index').Options} options
* @returns {Object}
*/
-FeedParser.prototype.handleItem = function handleItem (node, type, options){
+FeedParser.prototype.handleItem = function handleItem (node, type, options) {
if (!type || !node) return {};
var item = {}
@@ -828,7 +842,7 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){
;
if (normalize) {
- ['title','description','summary','date','pubdate','pubDate','link','guid','author','comments', 'origlink'].forEach(function (property){
+ ['title', 'description', 'summary', 'date', 'pubdate', 'pubDate', 'link', 'guid', 'author', 'comments', 'origlink'].forEach(function (property) {
item[property] = null;
});
item.image = {};
@@ -837,30 +851,30 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){
item.enclosures = [];
}
- Object.keys(node).forEach(function(name){
+ Object.keys(node).forEach((name) => {
var el = node[name]
, attrs = _.get(el, '@')
, enclosure;
if (normalize) {
- switch(name){
- case('title'):
+ switch (name) {
+ case ('title'):
item.title = _.get(el);
break;
- case('description'):
- case('summary'):
+ case ('description'):
+ case ('summary'):
item.summary = _.get(el);
if (!item.description) item.description = _.get(el);
break;
- case('content'):
- case('content:encoded'):
+ case ('content'):
+ case ('content:encoded'):
item.description = _.get(el);
break;
- case('pubdate'):
- case('published'):
- case('issued'):
- case('modified'):
- case('updated'):
- case('dc:date'):
+ case ('pubdate'):
+ case ('published'):
+ case ('issued'):
+ case ('modified'):
+ case ('updated'):
+ case ('dc:date'):
var date = _.get(el) ? new Date(_.get(el)) : null;
if (!date) break;
if (item.pubdate === null || name == 'pubdate' || name == 'published' || name == 'issued')
@@ -868,9 +882,9 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){
if (item.date === null || name == 'modified' || name == 'updated')
item.date = date;
break;
- case('link'):
+ case ('link'):
if (Array.isArray(el)) {
- el.forEach(function (link){
+ el.forEach(function (link) {
if (link['@']['href']) { // Atom
if (_.get(link['@'], 'rel')) {
if (link['@']['rel'] == 'canonical') item.origlink = link['@']['href'];
@@ -878,7 +892,7 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){
if (link['@']['rel'] == 'self' && !item.link) item.link = link['@']['href'];
if (link['@']['rel'] == 'replies') item.comments = link['@']['href'];
if (link['@']['rel'] == 'enclosure') {
- enclosure = /** @type {import('../../index').Enclosure} */ ({});
+ enclosure = /** @type {import('../index').Enclosure} */ ({});
enclosure.url = link['@']['href'];
enclosure.type = _.get(link['@'], 'type');
enclosure.length = _.get(link['@'], 'length');
@@ -901,7 +915,7 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){
if (el['@']['rel'] == 'self' && !item.link) item.link = el['@']['href'];
if (el['@']['rel'] == 'replies') item.comments = el['@']['href'];
if (el['@']['rel'] == 'enclosure') {
- enclosure = /** @type {import('../../index').Enclosure} */ ({});
+ enclosure = /** @type {import('../index').Enclosure} */ ({});
enclosure.url = el['@']['href'];
enclosure.type = _.get(el['@'], 'type');
enclosure.length = _.get(el['@'], 'length');
@@ -918,8 +932,8 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){
}
if (!item.guid) item.guid = item.link;
break;
- case('guid'):
- case('id'):
+ case ('guid'):
+ case ('id'):
item.guid = _.get(el);
// http://cyber.law.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt
// If the guid element has an attribute named "isPermaLink" with a value
@@ -933,7 +947,7 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){
item.permalink = item.guid;
}
break;
- case('author'):
+ case ('author'):
var author = {};
if (_.get(el)) { // RSS
author = addressparser(_.get(el))[0];
@@ -950,13 +964,13 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){
item.author = _.get(el.name) || _.get(el.email) || _.get(el.uri);
}
break;
- case('dc:creator'):
+ case ('dc:creator'):
item.author = _.get(el);
break;
- case('comments'):
+ case ('comments'):
item.comments = _.get(el);
break;
- case('source'):
+ case ('source'):
if ('rss' == type) {
item.source['title'] = _.get(el);
item.source['url'] = _.get(el['@'], 'url');
@@ -969,7 +983,7 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){
if (item.source['url'] && !this.meta.xmlurl) {
this.meta.xmlurl = this.meta.xmlUrl = item.source['url'];
if (_.isAbsoluteUrl(item.source['url']) && this.xmlbase && this.xmlbase.length === 0) {
- this.xmlbase.unshift({ '#name': 'xml', '#': item.source['url']});
+ this.xmlbase.unshift({ '#name': 'xml', '#': item.source['url'] });
this.stack[0] = _.reresolve(this.stack[0], item.source['url']);
}
else if (this.xmlbase && this.xmlbase.length > 0) {
@@ -977,10 +991,10 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){
}
}
break;
- case('enclosure'):
+ case ('enclosure'):
if (Array.isArray(el)) {
- el.forEach(function (enc){
- enclosure = /** @type {import('../../index').Enclosure} */ ({});
+ el.forEach(function (enc) {
+ enclosure = /** @type {import('../index').Enclosure} */ ({});
enclosure.url = _.get(enc['@'], 'url');
enclosure.type = _.get(enc['@'], 'type');
enclosure.length = _.get(enc['@'], 'length');
@@ -991,7 +1005,7 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){
}
});
} else {
- enclosure = /** @type {import('../../index').Enclosure} */ ({});
+ enclosure = /** @type {import('../index').Enclosure} */ ({});
enclosure.url = _.get(el['@'], 'url');
enclosure.type = _.get(el['@'], 'type');
enclosure.length = _.get(el['@'], 'length');
@@ -1002,11 +1016,11 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){
}
}
break;
- case('media:content'):
+ case ('media:content'):
var optionalAttributes = ['bitrate', 'framerate', 'samplingrate', 'duration', 'height', 'width'];
if (Array.isArray(el)) {
- el.forEach(function (enc){
- enclosure = /** @type {import('../../index').Enclosure} */ ({});
+ el.forEach(function (enc) {
+ enclosure = /** @type {import('../index').Enclosure} */ ({});
enclosure.url = _.get(enc['@'], 'url');
enclosure.type = _.get(enc['@'], 'type') || _.get(enc['@'], 'medium');
enclosure.length = _.get(enc['@'], 'filesize');
@@ -1024,7 +1038,7 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){
}
});
} else {
- enclosure = /** @type {import('../../index').Enclosure} */ ({});
+ enclosure = /** @type {import('../index').Enclosure} */ ({});
enclosure.url = _.get(el['@'], 'url');
enclosure.type = _.get(el['@'], 'type') || _.get(el['@'], 'medium');
enclosure.length = _.get(el['@'], 'filesize');
@@ -1042,32 +1056,32 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){
}
}
break;
- case('enc:enclosure'): // Can't find this in use for an example to debug. Only example found does not comply with the spec -- can't code THAT!
+ case ('enc:enclosure'): // Can't find this in use for an example to debug. Only example found does not comply with the spec -- can't code THAT!
break;
- case('category'):
- case('dc:subject'):
- case('itunes:category'):
- case('media:category'):
+ case ('category'):
+ case ('dc:subject'):
+ case ('itunes:category'):
+ case ('media:category'):
/* We handle all the kinds of categories within the switch loop because item.categories
- * is an array, unlike the other properties, and therefore can handle multiple values
- */
+ * is an array, unlike the other properties, and therefore can handle multiple values
+ */
var _category = ''
, _categories = []
- ;
+ ;
if (Array.isArray(el)) {
- el.forEach(function (category){
+ el.forEach(function (category) {
if ('category' == name && 'atom' == type) {
if (category['@'] && _.get(category['@'], 'term')) item.categories.push(_.get(category['@'], 'term'));
} else if ('category' == name && _.get(category) && 'rss' == type) {
item.categories.push(_.get(category).trim());
} else if ('dc:subject' == name && _.get(category)) {
- _categories = _.get(category).split(' ').map(function (cat){ return cat.trim(); });
+ _categories = _.get(category).split(' ').map(function (cat) { return cat.trim(); });
if (_categories.length) item.categories = item.categories.concat(_categories);
} else if ('itunes:category' == name) {
if (category['@'] && _.get(category['@'], 'text')) _category = _.get(category['@'], 'text');
if (category[name]) {
if (Array.isArray(category[name])) {
- category[name].forEach(function (subcategory){
+ category[name].forEach(function (subcategory) {
if (subcategory['@'] && _.get(subcategory['@'], 'text')) item.categories.push(_category + '/' + _.get(subcategory['@'], 'text'));
});
} else {
@@ -1087,13 +1101,13 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){
} else if ('category' == name && _.get(el) && 'rss' == type) {
item.categories.push(_.get(el).trim());
} else if ('dc:subject' == name && _.get(el)) {
- _categories = _.get(el).split(' ').map(function (cat){ return cat.trim(); });
+ _categories = _.get(el).split(' ').map(function (cat) { return cat.trim(); });
if (_categories.length) item.categories = item.categories.concat(_categories);
} else if ('itunes:category' == name) {
if (el['@'] && _.get(el['@'], 'text')) _category = _.get(el['@'], 'text');
if (el[name]) {
if (Array.isArray(el[name])) {
- el[name].forEach(function (subcategory){
+ el[name].forEach(function (subcategory) {
if (subcategory['@'] && _.get(subcategory['@'], 'text')) item.categories.push(_category + '/' + _.get(subcategory['@'], 'text'));
});
} else {
@@ -1108,8 +1122,8 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){
}
}
break;
- case('feedburner:origlink'):
- case('pheedo:origlink'):
+ case ('feedburner:origlink'):
+ case ('pheedo:origlink'):
if (!item.origlink) {
item.origlink = _.get(el);
}
@@ -1121,7 +1135,7 @@ FeedParser.prototype.handleItem = function handleItem (node, type, options){
if (~name.indexOf(':')) item[name] = el;
else item[type + ':' + name] = el;
}
- }, this); // forEach end
+ }); // forEach end
if (normalize) {
if (!item.description) {
@@ -1216,7 +1230,7 @@ FeedParser.prototype._flush = function (done) {
* @typedef {Object} FeedParserState
* Instance properties set up by FeedParser.prototype.init and the constructor.
* @property {Object} meta - Parsed feed metadata; shape evolves during parsing
- * @property {import('../../index').Options} options
+ * @property {import('../index').Options} options
* @property {Object.
} _namespaces
* @property {boolean} _emitted_meta
* @property {Array.} stack
@@ -1235,8 +1249,8 @@ FeedParser.prototype._flush = function (done) {
* @property {function(string): void} handleCloseTag
* @property {function(string): void} handleText
* @property {function(Object., string): Object.} handleAttributes
- * @property {function(ParsedNode, import('../../index').Type, import('../../index').Options): Object} handleMeta
- * @property {function(ParsedNode, import('../../index').Type, import('../../index').Options): Object} handleItem
+ * @property {function(ParsedNode, import('../index').Type, import('../index').Options): Object} handleMeta
+ * @property {function(ParsedNode, import('../index').Type, import('../index').Options): Object} handleItem
*/
/**
@@ -1249,14 +1263,14 @@ FeedParser.prototype[Symbol.asyncIterator] = async function* () {
var error = null;
var ended = false;
- function onReadable() {
+ function onReadable () {
if (resolve) { resolve(); resolve = null; }
}
- function onEnd() {
+ function onEnd () {
ended = true;
if (resolve) { resolve(); resolve = null; }
}
- function onError(err) {
+ function onError (err) {
error = err;
if (resolve) { resolve(); resolve = null; }
}
diff --git a/lib/utils.js b/lib/utils.js
index b6c8a33..7a54fac 100644
--- a/lib/utils.js
+++ b/lib/utils.js
@@ -1,12 +1,7 @@
-var WHATWGURL = require('url').URL
- , namespaces = require('./namespaces')
- ;
+const { URL: WHATWGURL } = require('url');
+const { NAMESPACES, HTML_URI_ATTRS, HTML_TAGS } = require('./constants');
-exports.has = require('lodash.has');
-exports.assign = require('lodash.assign');
-exports.uniq = require('lodash.uniq');
-
-var _get = require('lodash.get');
+const _get = require('lodash.get');
/**
* lodash.get, but wrapped to provide a default subkey (a/k/a path) of "#"
* and defaultValue of "null"
@@ -25,7 +20,7 @@ var _get = require('lodash.get');
* @returns {*} The value of the selected key, or null if undefined.
* @private
*/
-function get(obj, subkey, defaultValue) {
+function get (obj, subkey, defaultValue) {
if (!subkey) {
subkey = '#';
}
@@ -41,7 +36,6 @@ function get(obj, subkey, defaultValue) {
return _get(obj, subkey, defaultValue);
}
}
-exports.get = get;
/**
* Safely trim a value if it's a String
@@ -55,7 +49,6 @@ function safeTrim (val) {
}
return val;
}
-exports.safeTrim = safeTrim;
/*
* Resolve a URL against a base URL, returning the original pathUrl if
@@ -68,13 +61,55 @@ exports.safeTrim = safeTrim;
*/
function resolve (baseUrl, pathUrl) {
if (!baseUrl || !pathUrl) return pathUrl;
+ if (typeof pathUrl !== 'string') return pathUrl;
try {
return new WHATWGURL(pathUrl, baseUrl).href;
} catch (e) {
return pathUrl;
}
}
-exports.resolve = resolve;
+
+/*
+ * Resolve the URL of a single srcset image candidate against a base URL.
+ *
+ * The candidate is split into three parts: any leading whitespace, the first
+ * run of non-whitespace characters (the URL), and everything that follows
+ * (e.g. descriptors such as "480w" or "2x"). Only the URL part is passed to
+ * resolve(); the other two parts are copied through verbatim.
+ *
+ * @param {string} baseUrl
+ * @param {string} candidate
+ * @returns {string} The candidate with its URL resolved, or the candidate
+ *                   unchanged when it contains no non-whitespace characters
+ * @private
+ */
+function resolveSrcsetCandidate (baseUrl, candidate) {
+  var parts = candidate.match(/^(\s*)(\S+)([\s\S]*)$/);
+  if (!parts) {
+    return candidate;
+  }
+
+  var leading = parts[1];
+  var url = parts[2];
+  var trailing = parts[3];
+  return leading + resolve(baseUrl, url) + trailing;
+}
+
+/*
+ * Resolve every image candidate URL in a srcset attribute value against a
+ * base URL, leaving descriptors and separators exactly as written.
+ *
+ * Candidates are located the way the HTML srcset parser does it:
+ *  - whitespace and commas between candidates are separators;
+ *  - the URL is the following run of non-whitespace characters, so a comma
+ *    inside it (data: URIs, CDN transformation paths like "w_100,h_100")
+ *    belongs to the URL and must not split the candidate;
+ *  - commas at the very end of that run terminate the candidate;
+ *  - otherwise descriptors run up to the next comma outside parentheses.
+ *
+ * @param {string} baseUrl
+ * @param {string} srcset
+ * @returns {string} The srcset with candidate URLs resolved. The input is
+ *                   returned unchanged when baseUrl or srcset is empty or
+ *                   srcset is not a string.
+ * @private
+ */
+function resolveSrcset (baseUrl, srcset) {
+  if (!baseUrl || !srcset || typeof srcset !== 'string') return srcset;
+
+  var out = '';
+  var i = 0;
+  var len = srcset.length;
+
+  while (i < len) {
+    // Copy separators (whitespace and commas) through untouched.
+    var sepStart = i;
+    while (i < len && /[\s,]/.test(srcset[i])) i++;
+    out += srcset.slice(sepStart, i);
+    if (i >= len) break;
+
+    // The URL is everything up to the next whitespace character.
+    var start = i;
+    while (i < len && !/\s/.test(srcset[i])) i++;
+
+    // Commas trailing the URL are not part of it: they end the candidate.
+    var urlEnd = i;
+    while (urlEnd > start && srcset[urlEnd - 1] === ',') urlEnd--;
+
+    if (urlEnd < i) {
+      // Candidate has no descriptors; the commas are handled as separators
+      // on the next pass.
+      i = urlEnd;
+    } else {
+      // Consume descriptors up to the next comma that is not inside parentheses.
+      var depth = 0;
+      while (i < len) {
+        if (srcset[i] === '(') {
+          depth++;
+        } else if (srcset[i] === ')' && depth) {
+          depth--;
+        } else if (srcset[i] === ',' && depth === 0) {
+          break;
+        }
+        i++;
+      }
+    }
+
+    out += resolveSrcsetCandidate(baseUrl, srcset.slice(start, i));
+  }
+
+  return out;
+}
+
+/*
+ * Resolve the value of one HTML attribute against a base URL, if the
+ * attribute is one that holds URLs.
+ *
+ * The attribute name is matched case-insensitively. "srcset" values go
+ * through resolveSrcset(); names listed in HTML_URI_ATTRS go through
+ * resolve(); any other attribute value is returned untouched.
+ *
+ * @param {string} baseUrl
+ * @param {string} name - Attribute name
+ * @param {*} value - Attribute value
+ * @returns {*}
+ * @private
+ */
+function resolveHtmlAttributeValue (baseUrl, name, value) {
+  var key = name.toLowerCase();
+
+  if (key !== 'srcset') {
+    return HTML_URI_ATTRS.has(key) ? resolve(baseUrl, value) : value;
+  }
+  return resolveSrcset(baseUrl, value);
+}
/*
* Check whether a given uri is an absolute URL
@@ -90,7 +125,6 @@ function isAbsoluteUrl (uri) {
return false;
}
}
-exports.isAbsoluteUrl = isAbsoluteUrl;
/*
* Check whether a given namespace URI matches the given default
@@ -101,9 +135,8 @@ exports.isAbsoluteUrl = isAbsoluteUrl;
* @private
*/
function nslookup (uri, def) {
- return namespaces[uri] === def;
+ return NAMESPACES[uri] === def;
}
-exports.nslookup = nslookup;
/*
* Return the "default" namespace prefix for a given namespace URI
@@ -113,9 +146,8 @@ exports.nslookup = nslookup;
* @private
*/
function nsprefix (uri) {
- return namespaces[uri];
+ return NAMESPACES[uri];
}
-exports.nsprefix = nsprefix;
/*
* Walk a node and re-resolve the urls using the given baseurl
@@ -132,7 +164,7 @@ function reresolve (node, baseurl) {
function resolveLevel (level) {
var els = Object.keys(level);
- els.forEach(function(el){
+ els.forEach(function (el) {
if (Array.isArray(level[el])) {
// The shape of the array of element items is different than if the element is not an array.
// We need it to be the same shape to enable using the same function for recursion.
@@ -158,13 +190,16 @@ function reresolve (node, baseurl) {
if ('@' in level[el]) {
var attrs = Object.keys(level[el]['@']);
attrs.forEach(function (name) {
- if (name == 'href' || name == 'src' || name == 'uri') {
+ if (HTML_URI_ATTRS.has(name)) {
if ('string' === typeof level[el]['@'][name]) {
- level[el]['@'][name] = resolve(baseurl, level[el]['@'][name]);
+ level[el]['@'][name] = resolveHtmlAttributeValue(baseurl, name, level[el]['@'][name]);
}
}
});
}
+ if (mayHaveEmbeddedHtml(el, level[el])) {
+ level[el]['#'] = resolveHtmlUris(level[el]['#'], baseurl);
+ }
}
}
});
@@ -173,33 +208,6 @@ function reresolve (node, baseurl) {
return resolveLevel(node);
}
-exports.reresolve = reresolve;
-
-var HTML_TAGS = new Set([
- 'a', 'abbr', 'acronym', 'address', 'applet', 'area', 'article', 'aside', 'audio',
- 'b', 'base', 'basefont', 'bdi', 'bdo', 'big', 'blink', 'blockquote', 'body', 'br', 'button',
- 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
- 'data', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl', 'dt',
- 'em', 'embed',
- 'fieldset', 'figcaption', 'figure', 'font', 'footer', 'form', 'frame', 'frameset',
- 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'hr', 'html',
- 'i', 'iframe', 'img', 'input', 'ins', 'isindex',
- 'kbd',
- 'label', 'legend', 'li', 'link', 'listing',
- 'main', 'map', 'mark', 'marquee', 'menu', 'menuitem', 'meta', 'meter', 'multicol',
- 'nav', 'nextid', 'nobr', 'noembed', 'noframes', 'noscript',
- 'object', 'ol', 'optgroup', 'option', 'output',
- 'p', 'param', 'picture', 'plaintext', 'pre', 'progress',
- 'q',
- 'rb', 'rp', 'rt', 'rtc', 'ruby',
- 's', 'samp', 'script', 'section', 'select', 'slot', 'small', 'source', 'spacer', 'span', 'strike', 'strong', 'style', 'sub', 'summary', 'sup',
- 'table', 'tbody', 'td', 'template', 'textarea', 'tfoot', 'th',
- 'thead', 'time', 'title', 'tr', 'track', 'tt',
- 'u', 'ul',
- 'var', 'video',
- 'wbr',
- 'xmp'
-]);
/*
* Scan markup starting at str[i] (which must be '<') and return its length
@@ -258,13 +266,94 @@ function readMarkupAt (str, i) {
} else if (ch === '"' || ch === '\'') {
quote = ch;
} else if (ch === '>') {
- return { tagName: tagName, len: j + 1 - i };
+ return { tagName: tagName, isClosing: isClosing, len: j + 1 - i };
}
j++;
}
return null; // unclosed tag
}
+/*
+ * Rewrite the URL-valued attributes of a single HTML tag so that relative
+ * URLs are resolved against baseUrl.
+ *
+ * Only attribute values that actually change are replaced; the tag name,
+ * attribute names, quoting and whitespace are copied through verbatim. When
+ * nothing changes the original tag string is returned.
+ *
+ * @param {string} tag - Markup for one tag, beginning with "<"
+ * @param {string} baseUrl
+ * @returns {string}
+ * @private
+ */
+function rewriteHtmlTagUris (tag, baseUrl) {
+  var i = 1;
+  var out = '';
+  var last = 0; // everything in `tag` before this index is already in `out`
+
+  // Closing tags are returned untouched.
+  if (tag[i] === '/') return tag;
+  // Skip over the tag name.
+  while (i < tag.length && !/[\s/>]/.test(tag[i])) i++;
+
+  while (i < tag.length) {
+    while (i < tag.length && /\s/.test(tag[i])) i++;
+    if (i >= tag.length || tag[i] === '>' || tag[i] === '/') break;
+
+    var nameStart = i;
+    while (i < tag.length && !/[\s=/>]/.test(tag[i])) i++;
+    var name = tag.slice(nameStart, i);
+
+    while (i < tag.length && /\s/.test(tag[i])) i++;
+    // No "=" means a valueless attribute; move on to the next one.
+    if (tag[i] !== '=') continue;
+    i++;
+    while (i < tag.length && /\s/.test(tag[i])) i++;
+
+    var quote = null;
+    if (tag[i] === '"' || tag[i] === '\'') {
+      quote = tag[i];
+      i++;
+    }
+
+    var valueStart = i;
+    if (quote) {
+      while (i < tag.length && tag[i] !== quote) i++;
+    } else {
+      // An unquoted value runs up to whitespace or ">". It must not stop at
+      // "/": otherwise path-like values such as href=/a/b or src=../img/x.png
+      // are cut at the first slash and resolved piecemeal (or not at all).
+      while (i < tag.length && !/[\s>]/.test(tag[i])) i++;
+    }
+    var valueEnd = i;
+    var value = tag.slice(valueStart, valueEnd);
+    var resolved = resolveHtmlAttributeValue(baseUrl, name, value);
+    if (resolved !== value) {
+      // Flush the untouched text before this value, then the new value.
+      out += tag.slice(last, valueStart) + resolved;
+      last = valueEnd;
+    }
+    if (quote && tag[i] === quote) i++;
+  }
+
+  return out ? out + tag.slice(last) : tag;
+}
+
+/*
+ * Resolve relative URLs inside a string of HTML against a base URL.
+ *
+ * The string is scanned left to right. At each "<", readMarkupAt() is asked
+ * to describe the markup found there; if it is a non-closing tag whose name
+ * is in HTML_TAGS (and it is not flagged alwaysStrip), the tag text is passed
+ * through rewriteHtmlTagUris(). All other characters are copied as they are.
+ *
+ * @param {*} html
+ * @param {string} baseUrl
+ * @returns {*} The rewritten HTML, or the input unchanged when it is empty,
+ *              not a string, or no baseUrl is given
+ * @private
+ */
+function resolveHtmlUris (html, baseUrl) {
+  if (typeof html !== 'string' || !html || !baseUrl) {
+    return html;
+  }
+
+  var result = '';
+  var pos = 0;
+  while (pos < html.length) {
+    var found = html[pos] === '<' ? readMarkupAt(html, pos) : null;
+    var rewritable = Boolean(found) &&
+      !found.alwaysStrip &&
+      !found.isClosing &&
+      HTML_TAGS.has(found.tagName);
+
+    if (rewritable) {
+      result += rewriteHtmlTagUris(html.slice(pos, pos + found.len), baseUrl);
+      pos += found.len;
+    } else {
+      result += html[pos];
+      pos += 1;
+    }
+  }
+  return result;
+}
+
+/*
+ * Decide whether an element's text content ("#") should be treated as
+ * possibly containing HTML.
+ *
+ * Elements without a string "#" never qualify. "content:encoded",
+ * "description", "summary" and "tagline" always qualify. "content",
+ * "title", "subtitle" and "rights" qualify only when their "type" attribute
+ * is "html" or "xhtml". Every other element name does not qualify.
+ *
+ * @param {string} name - Element name
+ * @param {Object} el - Parsed element
+ * @returns {boolean}
+ * @private
+ */
+function mayHaveEmbeddedHtml (name, el) {
+  if (!el || typeof el['#'] !== 'string') {
+    return false;
+  }
+
+  var declaredType = get(el['@'], 'type');
+
+  switch (name) {
+    case 'content:encoded':
+    case 'description':
+    case 'summary':
+    case 'tagline':
+      return true;
+    case 'content':
+    case 'title':
+    case 'subtitle':
+    case 'rights':
+      return declaredType === 'html' || declaredType === 'xhtml';
+    default:
+      return false;
+  }
+}
+
+
/*
* Strip HTML tags, leaving bare text content.
* Scans the string for markup - HTML tags, comments, doctypes, and processing
@@ -292,5 +381,17 @@ function stripHtml (str) {
return out;
}
-exports.HTML_TAGS = HTML_TAGS;
-exports.stripHtml = stripHtml;
+// Single export point for the helpers defined above. Functions that are not
+// listed here (e.g. resolveSrcsetCandidate) stay private to this module.
+module.exports = {
+ get,
+ safeTrim,
+ resolve,
+ resolveSrcset,
+ resolveHtmlAttributeValue,
+ isAbsoluteUrl,
+ nslookup,
+ nsprefix,
+ reresolve,
+ resolveHtmlUris,
+ mayHaveEmbeddedHtml,
+ stripHtml
+};
diff --git a/test/api.js b/test/api.js
index 4be194f..cf40825 100644
--- a/test/api.js
+++ b/test/api.js
@@ -54,4 +54,4 @@ describe('api', function () {
});
});
-});
\ No newline at end of file
+});
diff --git a/test/async-iterator.js b/test/async-iterator.js
index ada8aea..e6ff940 100644
--- a/test/async-iterator.js
+++ b/test/async-iterator.js
@@ -4,7 +4,7 @@ var pipeline = require('util').promisify(require('stream').pipeline);
describe('async iterator usage', function () {
// These tests use .pipe() only to allow testing in older Node versions.
- // In modern Node versions, you can use pipeline() with async iterators
+ // In modern Node versions, you can use pipeline() with async iterators
// instead of .pipe(). If you use .pipe, you must add your own error handling
// to avoid uncaught exceptions on errors.
it('should work as an async iterator', async function () {
@@ -46,7 +46,7 @@ describe('async iterator usage', function () {
var items = [];
var caught = null;
var uncaught = null;
- function onUncaught(err) {
+ function onUncaught (err) {
uncaught = err;
}
process.prependOnceListener('uncaughtException', onUncaught);
diff --git a/test/bad.js b/test/bad.js
index b74b98e..dce6f4f 100644
--- a/test/bad.js
+++ b/test/bad.js
@@ -1,4 +1,4 @@
-describe('bad feeds', function(){
+describe('bad feeds', function () {
describe('not a feed', function () {
diff --git a/test/category.js b/test/category.js
index 1d77039..94f8d09 100644
--- a/test/category.js
+++ b/test/category.js
@@ -1,4 +1,4 @@
-describe('categories', function(){
+describe('categories', function () {
var feed = __dirname + '/feeds/category-feed.xml';
diff --git a/test/duplicate-enclosures.js b/test/duplicate-enclosures.js
index 8215489..725c10b 100644
--- a/test/duplicate-enclosures.js
+++ b/test/duplicate-enclosures.js
@@ -1,8 +1,8 @@
-describe('duplicate enclosures', function(){
+describe('duplicate enclosures', function () {
var feed = __dirname + '/feeds/mediacontent-dupes.xml';
- it('should not have duplicate enclosures from different elements', function (done){
+ it('should not have duplicate enclosures from different elements', function (done) {
fs.createReadStream(feed).pipe(new FeedParser())
.once('readable', function () {
var stream = this;
diff --git a/test/feeds/rss-with-item-scoped-html-base.xml b/test/feeds/rss-with-item-scoped-html-base.xml
new file mode 100644
index 0000000..71a8ae9
--- /dev/null
+++ b/test/feeds/rss-with-item-scoped-html-base.xml
@@ -0,0 +1,18 @@
+
+
+
+ Item Scoped HTML Base
+ https://example.com/
+ Fixture for scoped xml:base in item HTML.
+ -
+ First
+ first
+ First]]>
+
+ -
+ Second
+ second
+ Second]]>
+
+
+
diff --git a/test/feeds/rss-with-relative-html-urls-no-base.xml b/test/feeds/rss-with-relative-html-urls-no-base.xml
new file mode 100644
index 0000000..3122b44
--- /dev/null
+++ b/test/feeds/rss-with-relative-html-urls-no-base.xml
@@ -0,0 +1,13 @@
+
+
+
+ Relative HTML URLs Without Base
+ https://example.com/feed/
+ Fixture for relative links in embedded HTML without xml:base.
+ -
+ Post
+ posts/post-1
+ ReadComments
]]>
+
+
+
diff --git a/test/feeds/rss-with-relative-html-urls.xml b/test/feeds/rss-with-relative-html-urls.xml
new file mode 100644
index 0000000..dabc687
--- /dev/null
+++ b/test/feeds/rss-with-relative-html-urls.xml
@@ -0,0 +1,13 @@
+
+
+
+ Relative HTML URLs
+ https://example.com/
+ Fixture for relative links in embedded HTML.
+ -
+ Post
+ posts/post-1
+ ReadComments
]]>
+
+
+
diff --git a/test/illegally-nested.js b/test/illegally-nested.js
index 642832c..fe465a7 100644
--- a/test/illegally-nested.js
+++ b/test/illegally-nested.js
@@ -1,8 +1,8 @@
-describe('illegally nested', function(){
+describe('illegally nested', function () {
var feed = __dirname + '/feeds/illegally-nested.xml';
- it('should ignore illegally-nested items', function (done){
+ it('should ignore illegally-nested items', function (done) {
var itemCount = 0;
fs.createReadStream(feed).pipe(new FeedParser())
.on('readable', function () {
diff --git a/test/link.js b/test/link.js
index 52d56ca..8996b3d 100644
--- a/test/link.js
+++ b/test/link.js
@@ -1,4 +1,4 @@
-describe('links', function(){
+describe('links', function () {
var feed = __dirname + '/feeds/non-text-alternate-links.xml';
@@ -20,7 +20,7 @@ describe('links', function(){
var items = [];
var sawDeprecation = false;
var origEmit = process.emit;
- process.emit = function(event, warning) {
+ process.emit = function (event, warning) {
if (event === 'warning' && warning && warning.name === 'DeprecationWarning') {
sawDeprecation = true;
}
diff --git a/test/namespaces.js b/test/namespaces.js
index fbced78..3ae393c 100644
--- a/test/namespaces.js
+++ b/test/namespaces.js
@@ -1,6 +1,6 @@
-describe('namespaced elements', function(){
+describe('namespaced elements', function () {
- describe('standard namespaces', function(){
+ describe('standard namespaces', function () {
var feed = __dirname + '/feeds/wapowellness.xml';
@@ -21,7 +21,7 @@ describe('namespaced elements', function(){
});
- describe('non-standard namespaces', function(){
+ describe('non-standard namespaces', function () {
var feed = __dirname + '/feeds/complexNamespaceFeed.xml';
@@ -41,7 +41,7 @@ describe('namespaced elements', function(){
});
- describe('nondefaultnamespace-baseline', function(){
+ describe('nondefaultnamespace-baseline', function () {
var feed = __dirname + '/feeds/nondefaultnamespace-baseline.atom';
@@ -62,7 +62,7 @@ describe('namespaced elements', function(){
});
- describe('nondefaultnamespace Test case 1', function(){
+ describe('nondefaultnamespace Test case 1', function () {
var feed = __dirname + '/feeds/nondefaultnamespace.atom';
@@ -83,7 +83,7 @@ describe('namespaced elements', function(){
});
- describe('nondefaultnamespace Test case 2', function(){
+ describe('nondefaultnamespace Test case 2', function () {
var feed = __dirname + '/feeds/nondefaultnamespace-xhtml.atom';
@@ -104,7 +104,7 @@ describe('namespaced elements', function(){
});
- describe('nondefaultnamespace Test case 3', function(){
+ describe('nondefaultnamespace Test case 3', function () {
var feed = __dirname + '/feeds/unknown-namespace.atom';
diff --git a/test/utils.js b/test/utils.js
index bb7f69f..94c7405 100644
--- a/test/utils.js
+++ b/test/utils.js
@@ -1,4 +1,5 @@
var utils = require('../lib/utils');
+var { HTML_TAGS } = require('../lib/constants');
describe('utils', function () {
@@ -120,6 +121,10 @@ describe('utils', function () {
assert.strictEqual(utils.resolve('http://example.com/', undefined), undefined);
});
+ it('returns pathUrl when pathUrl is not a string', function () {
+ assert.strictEqual(utils.resolve('http://example.com/', 42), 42);
+ });
+
it('returns pathUrl for tag: URIs that the URL constructor rejects', function () {
var tagUri = 'tag:example.com,2003:posts/1';
assert.strictEqual(utils.resolve('http://example.com/', tagUri), tagUri);
@@ -277,6 +282,18 @@ describe('utils', function () {
assert.strictEqual(node.el['@'].uri, 'http://example.com/resource');
});
+ it('resolves HTML URI attributes', function () {
+ var node = { video: { '@': { poster: '/poster.png' } } };
+ utils.reresolve(node, 'http://example.com/');
+ assert.strictEqual(node.video['@'].poster, 'http://example.com/poster.png');
+ });
+
+ it('resolves srcset attributes', function () {
+ var node = { img: { '@': { srcset: 'small.png 480w, /large.png 2x' } } };
+ utils.reresolve(node, 'http://example.com/path/');
+ assert.strictEqual(node.img['@'].srcset, 'http://example.com/path/small.png 480w, http://example.com/large.png 2x');
+ });
+
it('handles array of element items', function () {
var node = {
link: [
@@ -443,7 +460,7 @@ describe('utils', function () {
assert.strictEqual(utils.stripHtml(' 0\'>link'), 'link');
});
- utils.HTML_TAGS.forEach(function (tag) {
+ HTML_TAGS.forEach(function (tag) {
it(`strips ${tag} HTML tag opening and closing and self-closing`, function () {
assert.strictEqual(utils.stripHtml('<' + tag + '>content' + tag + '> and <' + tag + ' />more'), 'content and more', 'expected <' + tag + '> to be stripped');
});
diff --git a/test/xmlbase.js b/test/xmlbase.js
index 7b53c17..519230e 100644
--- a/test/xmlbase.js
+++ b/test/xmlbase.js
@@ -1,4 +1,4 @@
-describe('xmlbase', function(){
+describe('xmlbase', function () {
it('should resolve relative URIs in meta elements with no root xml:base', function (done) {
var feed = __dirname + '/feeds/intertwingly.atom';
@@ -167,4 +167,111 @@ describe('xmlbase', function(){
});
});
+ // Streams the fixture through a FeedParser created with default options,
+ // collects every item's description, and checks the first description once
+ // the stream has ended.
+ it('should resolve relative URLs in embedded item HTML with xml:base', function (done) {
+ var feed = __dirname + '/feeds/rss-with-relative-html-urls.xml';
+ var descriptions = [];
+
+ fs.createReadStream(feed).pipe(new FeedParser())
+ .on('readable', function () {
+ var item;
+ while ((item = this.read())) {
+ descriptions.push(item.description);
+ }
+ })
+ .on('error', function (err) {
+ assert.ifError(err);
+ done(err);
+ })
+ .on('end', function () {
+ assert.equal(descriptions[0], 'ReadComments
');
+ done();
+ });
+ });
+
+ // Uses the "no-base" fixture with default options and keeps whole items so
+ // that both the resolved item link and the item description can be checked
+ // after the stream ends.
+ it('should resolve relative URLs in embedded item HTML with an inferred base', function (done) {
+ var feed = __dirname + '/feeds/rss-with-relative-html-urls-no-base.xml';
+ var items = [];
+
+ fs.createReadStream(feed).pipe(new FeedParser())
+ .on('readable', function () {
+ var item;
+ while ((item = this.read())) {
+ items.push(item);
+ }
+ })
+ .on('error', function (err) {
+ assert.ifError(err);
+ done(err);
+ })
+ .on('end', function () {
+ assert.equal(items[0].link, 'https://example.com/feed/posts/post-1');
+ assert.equal(items[0].description, 'ReadComments
');
+ done();
+ });
+ });
+
+ // Same "no-base" fixture as the previous test, but the parser is given an
+ // explicit feedurl option; only the first item's description is checked.
+ it('should resolve relative URLs in embedded item HTML with feedurl option', function (done) {
+ var feed = __dirname + '/feeds/rss-with-relative-html-urls-no-base.xml';
+ var descriptions = [];
+
+ fs.createReadStream(feed).pipe(new FeedParser({ feedurl: 'https://example.com/feed/' }))
+ .on('readable', function () {
+ var item;
+ while ((item = this.read())) {
+ descriptions.push(item.description);
+ }
+ })
+ .on('error', function (err) {
+ assert.ifError(err);
+ done(err);
+ })
+ .on('end', function () {
+ assert.equal(descriptions[0], 'ReadComments
');
+ done();
+ });
+ });
+
+ // Parses the item-scoped-base fixture with default options and checks the
+ // descriptions of the first two items independently of each other.
+ it('should not use item xml:base for sibling embedded item HTML', function (done) {
+ var feed = __dirname + '/feeds/rss-with-item-scoped-html-base.xml';
+ var descriptions = [];
+
+ fs.createReadStream(feed).pipe(new FeedParser())
+ .on('readable', function () {
+ var item;
+ while ((item = this.read())) {
+ descriptions.push(item.description);
+ }
+ })
+ .on('error', function (err) {
+ assert.ifError(err);
+ done(err);
+ })
+ .on('end', function () {
+ assert.equal(descriptions[0], 'First
');
+ assert.equal(descriptions[1], 'Second
');
+ done();
+ });
+ });
+
+ // Runs the parser with normalize: false, so the description is read from
+ // the raw 'rss:description' element's '#' text rather than item.description.
+ it('should not resolve relative URLs in embedded item HTML when normalize is false', function (done) {
+ var feed = __dirname + '/feeds/rss-with-relative-html-urls.xml';
+ var descriptions = [];
+
+ fs.createReadStream(feed).pipe(new FeedParser({ normalize: false }))
+ .on('readable', function () {
+ var item;
+ while ((item = this.read())) {
+ descriptions.push(item['rss:description']['#']);
+ }
+ })
+ .on('error', function (err) {
+ assert.ifError(err);
+ done(err);
+ })
+ .on('end', function () {
+ assert.equal(descriptions[0], 'ReadComments
');
+ done();
+ });
+ });
+
});