-
Notifications
You must be signed in to change notification settings - Fork 9.5k
feat(route): add 福建日报电子报 route #21472
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 3 commits
e392228
1cb62ff
875ae44
b6c392b
dd708fe
deb329d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,182 @@ | ||
| import type { CheerioAPI } from 'cheerio'; | ||
| import { load } from 'cheerio'; | ||
|
|
||
| import type { Route } from '@/types'; | ||
| import cache from '@/utils/cache'; | ||
| import got from '@/utils/got'; | ||
| import { parseDate } from '@/utils/parse-date'; | ||
| import timezone from '@/utils/timezone'; | ||
|
|
||
| const ROOT_URL = 'https://fjrb.fjdaily.com'; | ||
|
|
||
/**
 * Turns a raw HTML fragment scraped from an article page into an item description.
 *
 * HTML comments are stripped, the `referrerpolicy` attribute is removed from media
 * elements, and the result is discarded when it carries neither text nor media.
 *
 * @param html - Inner HTML of a container element; `null`/`undefined` when the container is missing.
 * @returns The cleaned fragment, or `undefined` when nothing meaningful remains.
 */
const getDescription = (html: string | null | undefined) => {
    const cleaned = html?.replaceAll(/<!--[\s\S]*?-->/g, '').trim();

    if (!cleaned) {
        return;
    }

    // Parse in fragment mode (third argument `isDocument = false`). In the default
    // document mode `$.html()` would serialise the fragment wrapped in
    // `<html><head></head><body>…</body></html>`, polluting the description.
    const $ = load(cleaned, null, false);
    const media = $('img, video, audio, source');
    media.removeAttr('referrerpolicy');

    const hasText = $.text().replaceAll(/\s+/g, '').length > 0;
    const hasMedia = media.length > 0;

    return hasText || hasMedia ? $.html() : undefined;
};
|
|
||
/**
 * Builds a description from the `.attachment` container of a loaded article page.
 *
 * @param $ - Cheerio instance of the article page.
 * @returns Whatever `getDescription` returns for the container's inner HTML.
 */
const getAttachmentDescription = ($: CheerioAPI) => {
    const attachmentHtml = $('.attachment').html();
    return getDescription(attachmentHtml);
};
|
|
||
/**
 * Combines the main article description with media found in the attachment description.
 *
 * When either side is missing, the other one is returned unchanged. Otherwise every
 * media element of the attachment whose `src` does not already occur in the main
 * description is serialised and prepended to the main description.
 *
 * @param mainDescription - Description built from the article body.
 * @param attachmentDescription - Description built from the attachment container.
 * @returns The merged description, or `undefined` when both inputs are empty.
 */
const mergeDescription = (mainDescription: string | undefined, attachmentDescription: string | undefined) => {
    if (!mainDescription || !attachmentDescription) {
        return mainDescription ? mainDescription : attachmentDescription;
    }

    const mediaSelector = 'img, video, audio, source';
    const $main = load(mainDescription);
    const $attachment = load(attachmentDescription);

    // Collect every non-empty `src` already present in the main description.
    const knownSources = new Set<string>();
    $main(mediaSelector).each((_, node) => {
        const src = $main(node).attr('src');
        if (src) {
            knownSources.add(src);
        }
    });

    // Serialise attachment media that the main description does not contain yet.
    let extraMedia = '';
    for (const node of $attachment(mediaSelector).toArray()) {
        const src = $attachment(node).attr('src');
        if (!src || knownSources.has(src)) {
            continue;
        }

        const markup = $attachment.html(node);
        if (markup) {
            extraMedia += markup;
        }
    }

    return extraMedia ? `${extraMedia}${mainDescription}` : mainDescription;
};
|
|
||
/**
 * Resolves which issue to fetch.
 *
 * With an explicit `date` the value is validated and split; without one, the index
 * page is fetched and the date is parsed from the link of the first entry in `#list`.
 *
 * @param date - Requested issue date in `YYYYMMDD` form, or `undefined` for the latest issue.
 * @returns The `yearMonth` (`YYYYMM`) and `day` (`DD`) path segments of the issue.
 * @throws Error when `date` is malformed or not a real calendar date, or when the
 *         latest issue cannot be located or parsed from the index page.
 */
const getIssueDate = async (date: string | undefined) => {
    if (date) {
        if (!/^\d{8}$/.test(date)) {
            throw new Error('Invalid date format. Expected YYYYMMDD, for example `20260316`.');
        }

        // Eight digits are not necessarily a date (e.g. `20261345`). Round-trip through
        // `Date` and reject values that get normalised to a different day, so the caller
        // sees a clear error instead of an upstream HTTP failure.
        const year = Number(date.slice(0, 4));
        const month = Number(date.slice(4, 6));
        const dayOfMonth = Number(date.slice(6, 8));
        const probe = new Date(Date.UTC(year, month - 1, dayOfMonth));
        const isRealDate = probe.getUTCFullYear() === year && probe.getUTCMonth() === month - 1 && probe.getUTCDate() === dayOfMonth;

        if (!isRealDate) {
            throw new Error(`Invalid date \`${date}\`. Expected an existing calendar date in YYYYMMDD format, for example \`20260316\`.`);
        }

        return {
            yearMonth: date.slice(0, 6),
            day: date.slice(6, 8),
        };
    }

    const indexResponse = await got(`${ROOT_URL}/pc/col/index.html`);
    const $ = load(indexResponse.data);
    const latestPath = $('#list li:first-child a').attr('href');

    if (!latestPath) {
        throw new Error('Failed to locate the latest Fujian Daily edition.');
    }

    // The link is expected to contain `<YYYYMM>/<DD>/node_<n>.html`.
    const [, yearMonth, day] = latestPath.match(/(\d{6})\/(\d{2})\/node_\d+\.html/) ?? [];

    if (!yearMonth || !day) {
        throw new Error('Failed to parse the latest Fujian Daily edition date.');
    }

    return {
        yearMonth,
        day,
    };
};
|
|
||
/**
 * Route definition for the 福建日报 (Fujian Daily) e-paper feed.
 *
 * The optional `date` parameter selects an issue; when omitted the handler resolves
 * the latest published issue from the index page.
 */
export const route: Route = {
    path: '/:date?',
    categories: ['traditional-media'],
    example: '/fjrb/20260316',
    // Kept in line with `description` and the handler: an empty value means the latest
    // published issue, which is not necessarily today's.
    parameters: { date: '日期,格式为 `YYYYMMDD`,留空时抓取最新一期全部版面,例如 `20260316`' },
    features: {
        requireConfig: false,
        requirePuppeteer: false,
        antiCrawler: false,
        supportBT: false,
        supportPodcast: false,
        supportScihub: false,
    },
    radar: [
        {
            source: ['fjrb.fjdaily.com/pc/col/index.html'],
            target: '/',
        },
        {
            source: ['fjrb.fjdaily.com/pc/col/:yearmonth/:day/node_:id.html'],
        },
    ],
    name: '电子报',
    maintainers: ['nczitzk'],
    handler,
    description: '留空时抓取最新一期全部版面,也可以通过日期参数抓取指定日期的全部版面内容。',
};
|
|
||
/**
 * Builds the feed for one issue of the e-paper.
 *
 * The article list is read from the `#catalog` element of the issue's `/pad/` page;
 * `li.verson` entries are treated as section headers and the following entries as
 * articles of that section. Each article is then fetched from its `/pc/` URL
 * (through the cache) to extract author, publication date and description.
 *
 * @param ctx - Request context; only `ctx.req.param('date')` is read.
 * @returns Feed data with `title`, `link` and `item`.
 * @throws Error when the issue contains no articles (plus anything `getIssueDate` throws).
 */
async function handler(ctx) {
    const date = ctx.req.param('date');
    const { yearMonth, day } = await getIssueDate(date);
    const padUrl = `${ROOT_URL}/pad/col/${yearMonth}/${day}/node_01.html`;
    const pageUrl = `${ROOT_URL}/pc/col/${yearMonth}/${day}/node_01.html`;

    const pageResponse = await got(padUrl);
    const content = load(pageResponse.data);

    // Strips a leading "<digits>版" page marker from a section header.
    const pagePrefix = /^\d+版\s*/;

    let currentCategory = '';
    const list: Array<{ title: string; link: string; category: string[] }> = [];

    content('#catalog li').each((_, item) => {
        const element = content(item);
        if (element.hasClass('verson')) {
            currentCategory = element.text().replaceAll(/\s+/g, ' ').trim();
            return;
        }

        const a = element.find('a').first();
        const href = a.attr('href');
        if (!href) {
            return;
        }

        const section = currentCategory.replace(pagePrefix, '');

        list.push({
            title: a.text().replaceAll(/\s+/g, ' ').trim(),
            // Links in the catalog are resolved against the pad page, then pointed at the pc edition.
            link: new URL(href, padUrl).toString().replace('/pad/', '/pc/'),
            // Do not emit an empty category string for articles listed before any section header.
            category: section ? [section] : [],
        });
    });

    if (list.length === 0) {
        throw new Error(`No articles were found for ${yearMonth}${day}.`);
    }

    const items = await Promise.all(
        list.map((item) =>
            cache.tryGet(item.link, async () => {
                const detailResponse = await got(item.link);
                const detail = load(detailResponse.data);
                const pubDate = detail('#NewsArticlePubDay').text().trim();
                const author = detail('#NewsArticleAuthor').text().trim();
                // Prefer `#content`; fall back to `#ozoom` when it yields nothing usable.
                const mainDescription = getDescription(detail('#content').html()) || getDescription(detail('#ozoom').html());
                const attachmentDescription = getAttachmentDescription(detail);

                return {
                    ...item,
                    author: author || undefined,
                    description: mergeDescription(mainDescription, attachmentDescription) || undefined,
                    pubDate: pubDate ? timezone(parseDate(pubDate), 8) : undefined,
                };
            })
        )
    );

    return {
        title: `福建日报 - ${yearMonth.slice(0, 4)}-${yearMonth.slice(4, 6)}-${day}`,
        link: pageUrl,
        item: items,
    };
}
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Wrong namespace is used.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks. I followed the
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Updated to use the |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| import type { Namespace } from '@/types'; | ||
|
|
||
/** Namespace metadata shared by the 福建日报 (Fujian Daily) routes. */
export const namespace: Namespace = {
    name: '福建日报',
    url: 'fjrb.fjdaily.com',
    lang: 'zh-CN',
};

Uh oh!
There was an error while loading. Please reload this page.