Skip to content

Commit eb88fc2

Browse files
xxcdd and josStorer authored
getCoreContentText for any websites using mozilla/readability (#641)
* getCoreContentText for any websites using https://github.com/mozilla/readability
* improve use of @mozilla/readability

---------

Co-authored-by: josc146 <[email protected]>
1 parent a6fa0ed commit eb88fc2

File tree

3 files changed

+40
-12
lines changed

3 files changed

+40
-12
lines changed

package-lock.json

Lines changed: 9 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
"lint"
2020
],
2121
"dependencies": {
22+
"@mozilla/readability": "^0.5.0",
2223
"@nem035/gpt-3-encoder": "^1.1.7",
2324
"@picocss/pico": "^1.5.9",
2425
"@primer/octicons-react": "^18.3.0",

src/utils/get-core-content-text.mjs

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,5 @@
11
import { getPossibleElementByQuerySelector } from './get-possible-element-by-query-selector.mjs'
2-
3-
function getArea(e) {
4-
const rect = e.getBoundingClientRect()
5-
return rect.width * rect.height
6-
}
2+
import { Readability, isProbablyReaderable } from '@mozilla/readability'
73

84
const adapters = {
95
'scholar.google': ['#gs_res_ccl_mid'],
@@ -17,6 +13,11 @@ const adapters = {
1713
'new.qq.com': ['.content-article'],
1814
}
1915

16+
function getArea(e) {
17+
const rect = e.getBoundingClientRect()
18+
return rect.width * rect.height
19+
}
20+
2021
function findLargestElement(e) {
2122
if (!e) {
2223
return null
@@ -42,22 +43,39 @@ function findLargestElement(e) {
4243
return largestElement
4344
}
4445

45-
export function getCoreContentText() {
46-
function getTextFrom(e) {
47-
return e.innerText || e.textContent
48-
}
46+
function getTextFrom(e) {
47+
return e.innerText || e.textContent
48+
}
4949

50+
function postProcessText(text) {
51+
return text
52+
.trim()
53+
.replaceAll(' ', '')
54+
.replaceAll('\t', '')
55+
.replaceAll('\n\n', '')
56+
.replaceAll(',,', '')
57+
}
58+
59+
export function getCoreContentText() {
5060
for (const [siteName, selectors] of Object.entries(adapters)) {
5161
if (location.hostname.includes(siteName)) {
5262
const element = getPossibleElementByQuerySelector(selectors)
53-
if (element) return getTextFrom(element)
63+
if (element) return postProcessText(getTextFrom(element))
5464
break
5565
}
5666
}
5767

5868
const element = document.querySelector('article')
5969
if (element) {
60-
return getTextFrom(element)
70+
return postProcessText(getTextFrom(element))
71+
}
72+
73+
if (isProbablyReaderable(document)) {
74+
let article = new Readability(document.cloneNode(true), {
75+
keepClasses: true,
76+
}).parse()
77+
console.log('readerable')
78+
return postProcessText(article.textContent)
6179
}
6280

6381
const largestElement = findLargestElement(document.body)
@@ -79,5 +97,5 @@ export function getCoreContentText() {
7997
ret = getTextFrom(largestElement)
8098
console.log('use first')
8199
}
82-
return ret.trim().replaceAll(' ', '').replaceAll('\n\n', '').replaceAll(',,', '')
100+
return postProcessText(ret)
83101
}

0 commit comments

Comments
 (0)