@@ -12,8 +12,6 @@ import validateNames from 'jsdom/lib/jsdom/living/helpers/validate-names.js';
12
12
13
13
import nunjucks from 'nunjucks' ;
14
14
import css from 'css' ;
15
- import { Readability } from '@mozilla/readability' ;
16
- import createDOMPurify from 'dompurify' ;
17
15
import MimeType from 'whatwg-mimetype' ;
18
16
19
17
/*
@@ -35,28 +33,16 @@ import humanDate from './src/util/human-date.js';
35
33
import outputPath from './src/util/output-path.js' ;
36
34
import getCssPageFormat from './src/util/get-css-page-format.js' ;
37
35
import { resolveSequence , resolveParallel } from './src/util/promises.js' ;
38
- import { getUrlOrigin } from './src/util/url-origin.js' ;
39
36
import addExif from './src/exif.js' ;
40
- import { hyphenateDom } from './src/hyphenate.js' ;
41
- import { textToIso6391 , getLanguageAttribute } from './src/util/language.js' ;
42
- import { setIdsAndReturnHeadings , nestHeadings } from './src/headings.js' ;
43
37
44
- import {
45
- ampToHtml ,
46
- fixLazyLoadedImages ,
47
- imagesAtFullSize ,
48
- wikipediaSpecific ,
49
- noUselessHref ,
50
- relativeToAbsoluteURIs ,
51
- singleImgToFigure ,
52
- expandDetailsElements ,
53
- githubSpecific ,
54
- wrapPreBlocks
55
- } from './src/enhancements.js' ;
56
- import mapRemoteResources from './src/remote-resources.js' ;
57
- import inlineImages from './src/inline-images.js' ;
38
+ import { isURL } from './src/util/url.js' ;
39
+ import { isFeed , processFeed } from './src/util/feeds.js' ;
40
+
58
41
import get_style_attribute_value from './src/get-style-attribute-value.js' ;
59
42
43
+ import cleanupItem from './src/cleanup-item.js' ;
44
+ import wrapHTMLFragment from './src/util/wrap-html-fragment.js' ;
45
+
60
46
const out = process . stdout ;
61
47
const err = process . stderr ;
62
48
@@ -74,24 +60,6 @@ const JUSTIFY_CSS = `
74
60
}
75
61
` ;
76
62
77
- const enhancePage = function ( dom ) {
78
- // Note: the order of the enhancements matters!
79
- [
80
- ampToHtml ,
81
- fixLazyLoadedImages ,
82
- relativeToAbsoluteURIs ,
83
- imagesAtFullSize ,
84
- singleImgToFigure ,
85
- noUselessHref ,
86
- expandDetailsElements ,
87
- wikipediaSpecific ,
88
- githubSpecific ,
89
- wrapPreBlocks
90
- ] . forEach ( enhancement => {
91
- enhancement ( dom . window . document ) ;
92
- } ) ;
93
- } ;
94
-
95
63
/*
96
64
Some setup
97
65
----------
@@ -141,16 +109,6 @@ function launch(options, size) {
141
109
-----------------------------------
142
110
*/
143
111
144
- function isURL ( ref ) {
145
- try {
146
- new URL ( ref ) ;
147
- return true ;
148
- } catch ( err ) {
149
- // no-op
150
- }
151
- return false ;
152
- }
153
-
154
112
async function fetchContent ( ref , fetchOptions = { } ) {
155
113
if ( ref instanceof stream . Readable ) {
156
114
return {
@@ -248,152 +206,55 @@ async function cleanup(url, options) {
248
206
url : final_url
249
207
} ) ;
250
208
251
- // Force relative URL resolution
252
- dom . window . document . body . setAttribute ( null , null ) ;
253
-
254
- const sanitizer = createDOMPurify ( dom . window ) ;
255
-
256
- const amp = dom . window . document . querySelector ( 'link[rel~=amphtml]' ) ;
257
- if ( amp && options . amp ) {
258
- err . write ( '\nFound AMP version (use `--no-amp` to ignore)\n' ) ;
259
- return cleanup ( amp . href , options , amp . href ) ;
260
- }
209
+ const doc = dom . window . document ;
261
210
262
- err . write ( `Enhancing web page: ${ url } ` ) ;
211
+ // Stop-gap solution as some of these are still referenced in cleanupItem()
212
+ const ENV = {
213
+ err,
214
+ out,
215
+ UA
216
+ } ;
263
217
264
- /*
265
- Run enhancements
266
- ----------------
218
+ /*
219
+ If the file is a valid RSS/Atom feed,
220
+ extract the feed entries to be processed further.
267
221
*/
268
- enhancePage ( dom ) ;
269
-
270
- // Run through readability and return
271
- const R = new Readability ( dom . window . document , {
272
- classesToPreserve : [
273
- 'no-href' ,
274
-
275
- /*
276
- Placed on some <a> elements
277
- as in-page anchors
278
- */
279
- 'anchor'
280
- ] ,
281
- /*
282
- Change Readability's serialization to return
283
- a DOM element (instead of a HTML string)
284
- as the `.content` property returned from `.parse()`
285
-
286
- This makes it easier for us to run subsequent
287
- transformations (sanitization, hyphenation, etc.)
288
- without having to parse/serialize the HTML repeatedly.
289
- */
290
- serializer : el => el
291
- } ) ;
292
-
293
- // TODO: find better solution to prevent Readability from
294
- // making img srcs relative.
295
- if ( options . mapRemoteResources || options . inline ) {
296
- R . _fixRelativeUris = ( ) => { } ;
297
- }
298
-
299
- const parsed = R . parse ( ) || { } ;
300
-
301
- let remoteResources ;
302
- if ( options . mapRemoteResources ) {
303
- remoteResources = mapRemoteResources ( parsed . content ) ;
304
- }
305
-
306
- // Hyphenate the text
307
- const textContent = sanitizer . sanitize ( parsed . textContent ) ;
308
- const lang =
309
- getLanguageAttribute ( dom . window . document ) ||
310
- textToIso6391 ( textContent ) ;
311
-
312
- err . write ( ' ✓\n' ) ;
313
-
314
- if ( options . inline ) {
315
- await inlineImages (
316
- parsed . content ,
317
- {
318
- headers : {
319
- 'user-agent' : UA
320
- } ,
321
- /*
322
- Send the referrer as the browser would
323
- when fetching the image to render it.
324
-
325
- The referrer policy would take care of
326
- stripping the URL down to its origin,
327
- but just in case, let’s strip it ourselves.
328
- */
329
- referrer : getUrlOrigin ( final_url ) ,
330
- referrerPolicy : 'strict-origin-when-cross-origin' ,
331
- timeout : 10 * 1000
332
- } ,
333
- options . debug ? out : undefined
222
+ if ( isFeed ( doc ) ) {
223
+ return await Promise . all (
224
+ processFeed ( doc ) . map ( async it => {
225
+ const itemDOM = new JSDOM ( wrapHTMLFragment ( it ) , {
226
+ url : it . url
227
+ } ) ;
228
+ it . content = itemDOM . window . document . body ;
229
+ const clean = await cleanupItem ( itemDOM , options , it , ENV ) ;
230
+ return {
231
+ ...clean ,
232
+ // TODO
233
+ originalContent : {
234
+ buffer : null ,
235
+ contentType : 'text/html'
236
+ }
237
+ } ;
238
+ } )
334
239
) ;
335
240
}
336
241
337
242
/*
338
- Select the appropriate serialization method
339
- based on the bundle target. EPUBs need the
340
- content to be XHTML (produced by a XML serializer),
341
- rather than normal HTML.
342
- */
343
- const serializer = options . xhtml
344
- ? arr => {
345
- const xs = new dom . window . XMLSerializer ( ) ;
346
- return arr . map ( el => xs . serializeToString ( el ) ) . join ( '' ) ;
347
- }
348
- : arr => arr . map ( el => el . innerHTML ) . join ( '' ) ;
349
-
350
- /*
351
- When dompurify returns a DOM node, it always wraps it
352
- in a HTMLBodyElement. We only need its children.
353
- */
354
- const sanitize_to_dom = dirty =>
355
- Array . from (
356
- sanitizer . sanitize ( dirty , { RETURN_DOM : true } ) . children
357
- ) ;
358
-
359
- const content_els = sanitize_to_dom ( parsed . content ) ;
360
-
361
- // `--toc-level` implies `--toc`, unless disabled with `--no-toc`.
362
- let headings = [ ] ;
363
- if ( options [ 'toc-level' ] > 1 && options . toc !== false ) {
364
- headings = setIdsAndReturnHeadings (
365
- content_els ,
366
- options [ 'toc-level' ]
367
- ) . map ( heading => {
368
- return {
369
- id : heading . id ,
370
- level : heading . level ,
371
- // Plain text used in EPUB
372
- text : heading . node . textContent . trim ( ) ,
373
- // Sanitized marked-up text used in HTML/PDF
374
- content : serializer ( [ heading . node ] )
375
- } ;
376
- } ) ;
243
+ Use the AMP version of the article
244
+ if one is available and the user has not opted out.
245
+ */
246
+ const amp = doc . querySelector ( 'link[rel~=amphtml]' ) ;
247
+ if ( amp && options . amp ) {
248
+ err . write ( '\nFound AMP version (use `--no-amp` to ignore)\n' ) ;
249
+ return cleanup ( amp . href , options , amp . href ) ;
377
250
}
378
251
379
252
return {
380
- id : `percollate-page-${ uuid ( ) } ` ,
381
- url : final_url ,
382
- title : sanitizer . sanitize ( parsed . title ) ,
383
- byline : sanitizer . sanitize ( parsed . byline ) ,
384
- dir : sanitizer . sanitize ( parsed . dir ) ,
385
- excerpt : serializer ( sanitize_to_dom ( parsed . excerpt ) ) ,
386
- content : serializer (
387
- options . hyphenate === true
388
- ? content_els . map ( el => hyphenateDom ( el , lang ) )
389
- : content_els
390
- ) ,
391
- lang,
392
- textContent,
393
- toc : nestHeadings ( headings || [ ] ) ,
394
- length : parsed . length ,
395
- siteName : sanitizer . sanitize ( parsed . siteName ) ,
396
- remoteResources,
253
+ ...( await cleanupItem ( dom , options , null , ENV ) ) ,
254
+ /*
255
+ Augment with the original content, useful when
256
+ percollate is used as an API.
257
+ */
397
258
originalContent : {
398
259
buffer,
399
260
contentType
@@ -762,7 +623,9 @@ async function generate(fn, urls, options = {}) {
762
623
} ,
763
624
w
764
625
)
765
- ) . filter ( it => it ) ;
626
+ )
627
+ . filter ( it => it )
628
+ . flat ( ) ;
766
629
767
630
if ( options . individual ) {
768
631
await Promise . all ( items . map ( item => fn ( [ item ] , options ) ) ) ;
@@ -995,6 +858,5 @@ async function epubgen(data, output_path, options) {
995
858
export { configure , pdf , epub , html , md } ;
996
859
997
860
export const __test__ = {
998
- fetchContent,
999
- isURL
861
+ fetchContent
1000
862
} ;
0 commit comments