Skip to content

Commit

Permalink
chore(readability): use worker threads
Browse files Browse the repository at this point in the history
happy-dom throws unhandled exceptions depending on the target URL's markup, so running it in a worker thread is the only way to avoid polluting the main Node.js process.
  • Loading branch information
Kikobeats committed Jan 19, 2025
1 parent b7e00b7 commit da0eacc
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 32 deletions.
9 changes: 7 additions & 2 deletions packages/metascraper-readability/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"dependencies": {
"@metascraper/helpers": "workspace:*",
"@mozilla/readability": "~0.5.0",
"async-memoize-one": "~1.1.8",
"happy-dom": "~16.5.3"
},
"devDependencies": {
Expand All @@ -37,7 +38,11 @@
"src"
],
"scripts": {
"test": "NODE_PATH=.. TZ=UTC ava --timeout 15s"
"test": "NODE_PATH=.. TZ=UTC ava"
},
"license": "MIT"
"license": "MIT",
"ava": {
"workerThreads": false,
"timeout": "15s"
}
}
1 change: 0 additions & 1 deletion packages/metascraper-readability/src/index.d.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
/** Options accepted by the function this package exports. */
type Options = {
/** Builds the DOM `Document` to analyse from the page URL and its HTML markup. */
getDocument: ({url: string, html: string }) => Document,
/** Forwarded as the second argument of the `Readability` constructor. */
readabilityOpts: import('readability').ReadabilityOptions,
}

Expand Down
47 changes: 18 additions & 29 deletions packages/metascraper-readability/src/index.js
Original file line number Diff line number Diff line change
@@ -1,36 +1,25 @@
'use strict'

const { memoizeOne, composeRule } = require('@metascraper/helpers')
const { Readability } = require('@mozilla/readability')

/**
 * Best-effort call to `reader.parse()`.
 *
 * @param {{ parse: Function }} reader - object exposing a `parse()` method.
 * @returns {*} whatever `reader.parse()` returns, or an empty object when
 *   `parse()` throws.
 */
const parseReader = reader => {
  let article
  try {
    article = reader.parse()
  } catch (_) {
    // Parsing is optional metadata enrichment: fall back to "nothing found".
    article = {}
  }
  return article
}

/**
 * Default document factory: creates a happy-dom `Window` for `url` and
 * injects `html` as the inner markup of its root element.
 *
 * @param {{ url: string, html: string }} input - page URL and its markup.
 * @returns {Document} the window's document, populated with `html`.
 */
const defaultGetDocument = ({ url, html }) => {
  // happy-dom is required lazily, on first call rather than at module load.
  const happyDom = require('happy-dom')
  const { document } = new happyDom.Window({ url })
  document.documentElement.innerHTML = html
  return document
}

module.exports = ({
getDocument = defaultGetDocument,
readabilityOpts
} = {}) => {
const readability = memoizeOne((url, html, getDocument) => {
const document = getDocument({ url, html })
const reader = new Readability(document, readabilityOpts)
return parseReader(reader)
}, memoizeOne.EqualityFirstArgument)

const asyncMemoizeOne = require('async-memoize-one')
const { Worker } = require('worker_threads')
const path = require('path')

const SCRIPT_PATH = path.resolve(__dirname, 'worker.js')

/**
 * Run the readability extraction for a page inside a dedicated worker thread,
 * so that exceptions raised while building or parsing the DOM cannot take
 * down the main process.
 *
 * The call is memoized through `asyncMemoizeOne` using
 * `memoizeOne.EqualityFirstArgument` (presumably: a call is considered cached
 * when the first argument, `url`, matches; confirm against the helper).
 *
 * @param {string} url - page URL, forwarded to the worker.
 * @param {string} html - page markup, forwarded to the worker.
 * @param {object} [readabilityOpts] - options forwarded to the worker; must be
 *   structured-cloneable because it travels through `workerData`.
 * @returns {Promise<*>} resolves with the JSON-decoded message posted by the
 *   worker; rejects when the worker errors, posts an undecodable message, or
 *   exits before posting anything.
 */
const readability = asyncMemoizeOne((url, html, readabilityOpts) => {
  const worker = new Worker(SCRIPT_PATH, {
    workerData: { url, html, readabilityOpts }
  })

  // `new Promise` instead of `Promise.withResolvers()`: same behaviour, but it
  // also works on Node.js releases that do not ship `withResolvers` yet.
  return new Promise((resolve, reject) => {
    worker.once('message', message => {
      try {
        // A throw inside an event listener would surface as an uncaught
        // exception in the main process, so turn it into a rejection instead.
        resolve(JSON.parse(message))
      } catch (error) {
        reject(error)
      } finally {
        // The result is already delivered; stop the thread even if the DOM
        // implementation left timers or handles open. A failure to terminate
        // is not actionable here, hence the deliberate no-op catch.
        worker.terminate().catch(() => {})
      }
    })

    worker.once('error', reject)

    // If the worker stops without ever posting a message, neither 'message'
    // nor 'error' fires and the promise would hang forever. Once the promise
    // is settled this extra reject is a no-op.
    worker.once('exit', code => {
      reject(
        new Error(
          `readability worker exited with code ${code} before returning a result`
        )
      )
    })
  })
}, memoizeOne.EqualityFirstArgument)

module.exports = ({ readabilityOpts } = {}) => {
const getReadbility = composeRule(($, url) =>
readability(url, $.html(), getDocument)
readability(url, $.html(), readabilityOpts)
)

const rules = {
Expand Down
28 changes: 28 additions & 0 deletions packages/metascraper-readability/src/worker.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
'use strict'

const { workerData, parentPort } = require('node:worker_threads')
const { Readability } = require('@mozilla/readability')

/**
 * Best-effort call to `reader.parse()`.
 *
 * @param {{ parse: Function }} reader - object exposing a `parse()` method.
 * @returns {*} whatever `reader.parse()` returns, or an empty object when
 *   `parse()` throws.
 */
const parseReader = reader => {
  let article
  try {
    article = reader.parse()
  } catch (_) {
    // Parsing is optional metadata enrichment: fall back to "nothing found".
    article = {}
  }
  return article
}

/**
 * Creates a happy-dom `Window` for `url` and injects `html` as the inner
 * markup of its root element.
 *
 * @param {{ url: string, html: string }} input - page URL and its markup.
 * @returns {Document} the window's document, populated with `html`.
 */
const getDocument = ({ url, html }) => {
  // happy-dom is required lazily, on first call rather than at module load.
  const happyDom = require('happy-dom')
  const { document } = new happyDom.Window({ url })
  document.documentElement.innerHTML = html
  return document
}

/**
 * Worker entry point: builds the document, wraps it in a `Readability`
 * instance and parses it.
 *
 * Declared `async` so that a synchronous throw while building the document or
 * the reader surfaces as a rejected promise.
 *
 * @param {{ url?: string, html?: string, readabilityOpts?: object }} [input]
 * @returns {Promise<*>} the value returned by `parseReader`.
 */
const main = async ({ url, html, readabilityOpts } = {}) =>
  parseReader(new Readability(getDocument({ url, html }), readabilityOpts))

// Run the extraction with the data handed over by the parent thread and send
// the result back as a JSON string.
main(workerData).then(result => {
  const payload = JSON.stringify(result)
  parentPort.postMessage(payload)
})

0 comments on commit da0eacc

Please sign in to comment.