Skip to content

Commit 961d8fe

Browse files
committed
Preliminary version (works with Atom feeds)
1 parent 10389d9 commit 961d8fe

File tree

7 files changed

+343
-192
lines changed

7 files changed

+343
-192
lines changed

index.js

Lines changed: 49 additions & 187 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@ import validateNames from 'jsdom/lib/jsdom/living/helpers/validate-names.js';
1212

1313
import nunjucks from 'nunjucks';
1414
import css from 'css';
15-
import { Readability } from '@mozilla/readability';
16-
import createDOMPurify from 'dompurify';
1715
import MimeType from 'whatwg-mimetype';
1816

1917
/*
@@ -35,28 +33,16 @@ import humanDate from './src/util/human-date.js';
3533
import outputPath from './src/util/output-path.js';
3634
import getCssPageFormat from './src/util/get-css-page-format.js';
3735
import { resolveSequence, resolveParallel } from './src/util/promises.js';
38-
import { getUrlOrigin } from './src/util/url-origin.js';
3936
import addExif from './src/exif.js';
40-
import { hyphenateDom } from './src/hyphenate.js';
41-
import { textToIso6391, getLanguageAttribute } from './src/util/language.js';
42-
import { setIdsAndReturnHeadings, nestHeadings } from './src/headings.js';
4337

44-
import {
45-
ampToHtml,
46-
fixLazyLoadedImages,
47-
imagesAtFullSize,
48-
wikipediaSpecific,
49-
noUselessHref,
50-
relativeToAbsoluteURIs,
51-
singleImgToFigure,
52-
expandDetailsElements,
53-
githubSpecific,
54-
wrapPreBlocks
55-
} from './src/enhancements.js';
56-
import mapRemoteResources from './src/remote-resources.js';
57-
import inlineImages from './src/inline-images.js';
38+
import { isURL } from './src/util/url.js';
39+
import { isFeed, processFeed } from './src/util/feeds.js';
40+
5841
import get_style_attribute_value from './src/get-style-attribute-value.js';
5942

43+
import cleanupItem from './src/cleanup-item.js';
44+
import wrapHTMLFragment from './src/util/wrap-html-fragment.js';
45+
6046
const out = process.stdout;
6147
const err = process.stderr;
6248

@@ -74,24 +60,6 @@ const JUSTIFY_CSS = `
7460
}
7561
`;
7662

77-
const enhancePage = function (dom) {
78-
// Note: the order of the enhancements matters!
79-
[
80-
ampToHtml,
81-
fixLazyLoadedImages,
82-
relativeToAbsoluteURIs,
83-
imagesAtFullSize,
84-
singleImgToFigure,
85-
noUselessHref,
86-
expandDetailsElements,
87-
wikipediaSpecific,
88-
githubSpecific,
89-
wrapPreBlocks
90-
].forEach(enhancement => {
91-
enhancement(dom.window.document);
92-
});
93-
};
94-
9563
/*
9664
Some setup
9765
----------
@@ -141,16 +109,6 @@ function launch(options, size) {
141109
-----------------------------------
142110
*/
143111

144-
function isURL(ref) {
145-
try {
146-
new URL(ref);
147-
return true;
148-
} catch (err) {
149-
// no-op
150-
}
151-
return false;
152-
}
153-
154112
async function fetchContent(ref, fetchOptions = {}) {
155113
if (ref instanceof stream.Readable) {
156114
return {
@@ -248,152 +206,55 @@ async function cleanup(url, options) {
248206
url: final_url
249207
});
250208

251-
// Force relative URL resolution
252-
dom.window.document.body.setAttribute(null, null);
253-
254-
const sanitizer = createDOMPurify(dom.window);
255-
256-
const amp = dom.window.document.querySelector('link[rel~=amphtml]');
257-
if (amp && options.amp) {
258-
err.write('\nFound AMP version (use `--no-amp` to ignore)\n');
259-
return cleanup(amp.href, options, amp.href);
260-
}
209+
const doc = dom.window.document;
261210

262-
err.write(`Enhancing web page: ${url}`);
211+
// Stop-gap solution as some of these are still referenced in cleanupItem()
212+
const ENV = {
213+
err,
214+
out,
215+
UA
216+
};
263217

264-
/*
265-
Run enhancements
266-
----------------
218+
/*
219+
If the file is a valid RSS/Atom feed,
220+
extract the feed entries to be processed further.
267221
*/
268-
enhancePage(dom);
269-
270-
// Run through readability and return
271-
const R = new Readability(dom.window.document, {
272-
classesToPreserve: [
273-
'no-href',
274-
275-
/*
276-
Placed on some <a> elements
277-
as in-page anchors
278-
*/
279-
'anchor'
280-
],
281-
/*
282-
Change Readability's serialization to return
283-
a DOM element (instead of a HTML string)
284-
as the `.content` property returned from `.parse()`
285-
286-
This makes it easier for us to run subsequent
287-
transformations (sanitization, hyphenation, etc.)
288-
without having to parse/serialize the HTML repeatedly.
289-
*/
290-
serializer: el => el
291-
});
292-
293-
// TODO: find better solution to prevent Readability from
294-
// making img srcs relative.
295-
if (options.mapRemoteResources || options.inline) {
296-
R._fixRelativeUris = () => {};
297-
}
298-
299-
const parsed = R.parse() || {};
300-
301-
let remoteResources;
302-
if (options.mapRemoteResources) {
303-
remoteResources = mapRemoteResources(parsed.content);
304-
}
305-
306-
// Hyphenate the text
307-
const textContent = sanitizer.sanitize(parsed.textContent);
308-
const lang =
309-
getLanguageAttribute(dom.window.document) ||
310-
textToIso6391(textContent);
311-
312-
err.write(' ✓\n');
313-
314-
if (options.inline) {
315-
await inlineImages(
316-
parsed.content,
317-
{
318-
headers: {
319-
'user-agent': UA
320-
},
321-
/*
322-
Send the referrer as the browser would
323-
when fetching the image to render it.
324-
325-
The referrer policy would take care of
326-
stripping the URL down to its origin,
327-
but just in case, let’s strip it ourselves.
328-
*/
329-
referrer: getUrlOrigin(final_url),
330-
referrerPolicy: 'strict-origin-when-cross-origin',
331-
timeout: 10 * 1000
332-
},
333-
options.debug ? out : undefined
222+
if (isFeed(doc)) {
223+
return await Promise.all(
224+
processFeed(doc).map(async it => {
225+
const itemDOM = new JSDOM(wrapHTMLFragment(it), {
226+
url: it.url
227+
});
228+
it.content = itemDOM.window.document.body;
229+
const clean = await cleanupItem(itemDOM, options, it, ENV);
230+
return {
231+
...clean,
232+
// TODO
233+
originalContent: {
234+
buffer: null,
235+
contentType: 'text/html'
236+
}
237+
};
238+
})
334239
);
335240
}
336241

337242
/*
338-
Select the appropriate serialization method
339-
based on the bundle target. EPUBs need the
340-
content to be XHTML (produced by a XML serializer),
341-
rather than normal HTML.
342-
*/
343-
const serializer = options.xhtml
344-
? arr => {
345-
const xs = new dom.window.XMLSerializer();
346-
return arr.map(el => xs.serializeToString(el)).join('');
347-
}
348-
: arr => arr.map(el => el.innerHTML).join('');
349-
350-
/*
351-
When dompurify returns a DOM node, it always wraps it
352-
in a HTMLBodyElement. We only need its children.
353-
*/
354-
const sanitize_to_dom = dirty =>
355-
Array.from(
356-
sanitizer.sanitize(dirty, { RETURN_DOM: true }).children
357-
);
358-
359-
const content_els = sanitize_to_dom(parsed.content);
360-
361-
// `--toc-level` implies `--toc`, unless disabled with `--no-toc`.
362-
let headings = [];
363-
if (options['toc-level'] > 1 && options.toc !== false) {
364-
headings = setIdsAndReturnHeadings(
365-
content_els,
366-
options['toc-level']
367-
).map(heading => {
368-
return {
369-
id: heading.id,
370-
level: heading.level,
371-
// Plain text used in EPUB
372-
text: heading.node.textContent.trim(),
373-
// Sanitized marked-up text used in HTML/PDF
374-
content: serializer([heading.node])
375-
};
376-
});
243+
Use the AMP version of the article
244+
if one is available and the user has not opted out.
245+
*/
246+
const amp = doc.querySelector('link[rel~=amphtml]');
247+
if (amp && options.amp) {
248+
err.write('\nFound AMP version (use `--no-amp` to ignore)\n');
249+
return cleanup(amp.href, options, amp.href);
377250
}
378251

379252
return {
380-
id: `percollate-page-${uuid()}`,
381-
url: final_url,
382-
title: sanitizer.sanitize(parsed.title),
383-
byline: sanitizer.sanitize(parsed.byline),
384-
dir: sanitizer.sanitize(parsed.dir),
385-
excerpt: serializer(sanitize_to_dom(parsed.excerpt)),
386-
content: serializer(
387-
options.hyphenate === true
388-
? content_els.map(el => hyphenateDom(el, lang))
389-
: content_els
390-
),
391-
lang,
392-
textContent,
393-
toc: nestHeadings(headings || []),
394-
length: parsed.length,
395-
siteName: sanitizer.sanitize(parsed.siteName),
396-
remoteResources,
253+
...(await cleanupItem(dom, options, null, ENV)),
254+
/*
255+
Augment with the original content, useful when
256+
percollate is used as an API.
257+
*/
397258
originalContent: {
398259
buffer,
399260
contentType
@@ -762,7 +623,9 @@ async function generate(fn, urls, options = {}) {
762623
},
763624
w
764625
)
765-
).filter(it => it);
626+
)
627+
.filter(it => it)
628+
.flat();
766629

767630
if (options.individual) {
768631
await Promise.all(items.map(item => fn([item], options)));
@@ -995,6 +858,5 @@ async function epubgen(data, output_path, options) {
995858
export { configure, pdf, epub, html, md };
996859

997860
export const __test__ = {
998-
fetchContent,
999-
isURL
861+
fetchContent
1000862
};

0 commit comments

Comments
 (0)