From 0308566b4c95933bf307a676b5c44bf34cdcbeb1 Mon Sep 17 00:00:00 2001 From: xxcdd <42600601+xxcdd@users.noreply.github.com> Date: Wed, 6 Mar 2024 13:31:31 +0800 Subject: [PATCH 1/2] getCoreContentText for any websites using https://github.com/mozilla/readability --- package-lock.json | 9 +++++ package.json | 1 + src/utils/get-core-content-text.mjs | 57 ++++------------------------- 3 files changed, 17 insertions(+), 50 deletions(-) diff --git a/package-lock.json b/package-lock.json index 3872d5a6..22ac4c04 100644 --- a/package-lock.json +++ b/package-lock.json @@ -6,6 +6,7 @@ "": { "name": "chatgptbox", "dependencies": { + "@mozilla/readability": "^0.5.0", "@nem035/gpt-3-encoder": "^1.1.7", "@picocss/pico": "^1.5.9", "@primer/octicons-react": "^18.3.0", @@ -2077,6 +2078,14 @@ "@jridgewell/sourcemap-codec": "^1.4.14" } }, + "node_modules/@mozilla/readability": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.5.0.tgz", + "integrity": "sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==", + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/@nem035/gpt-3-encoder": { "version": "1.1.7", "resolved": "https://registry.npmjs.org/@nem035/gpt-3-encoder/-/gpt-3-encoder-1.1.7.tgz", diff --git a/package.json b/package.json index 52212e15..9956b537 100644 --- a/package.json +++ b/package.json @@ -19,6 +19,7 @@ "lint" ], "dependencies": { + "@mozilla/readability": "^0.5.0", "@nem035/gpt-3-encoder": "^1.1.7", "@picocss/pico": "^1.5.9", "@primer/octicons-react": "^18.3.0", diff --git a/src/utils/get-core-content-text.mjs b/src/utils/get-core-content-text.mjs index bef4dc8f..dc008698 100644 --- a/src/utils/get-core-content-text.mjs +++ b/src/utils/get-core-content-text.mjs @@ -1,9 +1,5 @@ import { getPossibleElementByQuerySelector } from './get-possible-element-by-query-selector.mjs' - -function getArea(e) { - const rect = e.getBoundingClientRect() - return rect.width * rect.height -} +import { Readability } from "@mozilla/readability" const adapters = { 'scholar.google': ['#gs_res_ccl_mid'], @@ -17,31 +13,6 @@ const adapters = { 'new.qq.com': ['.content-article'], } -function findLargestElement(e) { - if (!e) { - return null - } - let maxArea = 0 - let largestElement = null - const limitedArea = 0.8 * getArea(e) - - function traverseDOM(node) { - if (node.nodeType === Node.ELEMENT_NODE) { - const area = getArea(node) - - if (area > maxArea && area < limitedArea) { - maxArea = area - largestElement = node - } - - Array.from(node.children).forEach(traverseDOM) - } - } - - traverseDOM(e) - return largestElement -} - export function getCoreContentText() { function getTextFrom(e) { return e.innerText || e.textContent @@ -60,24 +31,10 @@ export function getCoreContentText() { return getTextFrom(element) } - const largestElement = findLargestElement(document.body) - const secondLargestElement = findLargestElement(largestElement) - console.log(largestElement) - console.log(secondLargestElement) - - let ret - if (!largestElement) { - ret = getTextFrom(document.body) - console.log('use document.body') - } else if ( - secondLargestElement && - getArea(secondLargestElement) > 0.5 * getArea(largestElement) - ) { - ret = getTextFrom(secondLargestElement) - console.log('use second') - } else { - ret = getTextFrom(largestElement) - console.log('use first') - } - return ret.trim().replaceAll(' ', '').replaceAll('\n\n', '').replaceAll(',,', '') + let article = new Readability(document.cloneNode(true), { + keepClasses: true + }).parse() + let content = article.textContent.trim().replaceAll(' ', '').replaceAll('\t', '').replaceAll('\n\n', '').replaceAll(',,', '') + console.log(content) + return content } From 777f405943fe36b43f49fe0ba1113c7f2aa82aba Mon Sep 17 00:00:00 2001 From: josc146 Date: Fri, 22 Mar 2024 15:59:58 +0800 Subject: [PATCH 2/2] improve use of @mozilla/readability --- src/utils/get-core-content-text.mjs | 85 +++++++++++++++++++++++++---- 1 file changed, 73 insertions(+), 12 deletions(-) diff --git a/src/utils/get-core-content-text.mjs b/src/utils/get-core-content-text.mjs index dc008698..54c004cb 100644 --- a/src/utils/get-core-content-text.mjs +++ b/src/utils/get-core-content-text.mjs @@ -1,5 +1,5 @@ import { getPossibleElementByQuerySelector } from './get-possible-element-by-query-selector.mjs' -import { Readability } from "@mozilla/readability" +import { Readability, isProbablyReaderable } from '@mozilla/readability' const adapters = { 'scholar.google': ['#gs_res_ccl_mid'], @@ -13,28 +13,89 @@ const adapters = { 'new.qq.com': ['.content-article'], } -export function getCoreContentText() { - function getTextFrom(e) { - return e.innerText || e.textContent +function getArea(e) { + const rect = e.getBoundingClientRect() + return rect.width * rect.height +} + +function findLargestElement(e) { + if (!e) { + return null + } + let maxArea = 0 + let largestElement = null + const limitedArea = 0.8 * getArea(e) + + function traverseDOM(node) { + if (node.nodeType === Node.ELEMENT_NODE) { + const area = getArea(node) + + if (area > maxArea && area < limitedArea) { + maxArea = area + largestElement = node + } + + Array.from(node.children).forEach(traverseDOM) + } } + traverseDOM(e) + return largestElement +} + +function getTextFrom(e) { + return e.innerText || e.textContent +} + +function postProcessText(text) { + return text + .trim() + .replaceAll(' ', '') + .replaceAll('\t', '') + .replaceAll('\n\n', '') + .replaceAll(',,', '') +} + +export function getCoreContentText() { for (const [siteName, selectors] of Object.entries(adapters)) { if (location.hostname.includes(siteName)) { const element = getPossibleElementByQuerySelector(selectors) - if (element) return getTextFrom(element) + if (element) return postProcessText(getTextFrom(element)) break } } const element = document.querySelector('article') if (element) { - return getTextFrom(element) + return postProcessText(getTextFrom(element)) } - let article = new Readability(document.cloneNode(true), { - keepClasses: true - }).parse() - let content = article.textContent.trim().replaceAll(' ', '').replaceAll('\t', '').replaceAll('\n\n', '').replaceAll(',,', '') - console.log(content) - return content + if (isProbablyReaderable(document)) { + let article = new Readability(document.cloneNode(true), { + keepClasses: true, + }).parse() + console.log('readerable') + return postProcessText(article.textContent) + } + + const largestElement = findLargestElement(document.body) + const secondLargestElement = findLargestElement(largestElement) + console.log(largestElement) + console.log(secondLargestElement) + + let ret + if (!largestElement) { + ret = getTextFrom(document.body) + console.log('use document.body') + } else if ( + secondLargestElement && + getArea(secondLargestElement) > 0.5 * getArea(largestElement) + ) { + ret = getTextFrom(secondLargestElement) + console.log('use second') + } else { + ret = getTextFrom(largestElement) + console.log('use first') + } + return postProcessText(ret) }