Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 182 additions & 0 deletions lib/routes/fjrb/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
import type { CheerioAPI } from 'cheerio';
import { load } from 'cheerio';

import type { Route } from '@/types';
import cache from '@/utils/cache';
import got from '@/utils/got';
import { parseDate } from '@/utils/parse-date';
import timezone from '@/utils/timezone';

// Origin of the Fujian Daily e-paper site; every URL fetched in this file is built from it.
const ROOT_URL = 'https://fjrb.fjdaily.com';

/**
 * Normalizes an HTML snippet so it can be used as a feed item description.
 *
 * HTML comments are stripped, `referrerpolicy` attributes are removed from media
 * elements, and snippets that contain neither text nor media are discarded.
 *
 * @param html Inner HTML of a container, or `null`/`undefined` when the container is absent.
 * @returns The cleaned HTML fragment, or `undefined` when nothing meaningful remains.
 */
const getDescription = (html: string | null | undefined) => {
    const cleaned = html?.replaceAll(/<!--[\s\S]*?-->/g, '').trim();

    if (!cleaned) {
        return;
    }

    // Parse as a fragment (third argument `isDocument = false`). In the default document
    // mode `$.html()` would serialize the snippet wrapped in
    // <html><head></head><body>…</body></html>, which must not leak into the description.
    const $ = load(cleaned, null, false);
    const media = $('img, video, audio, source');
    media.removeAttr('referrerpolicy');

    const hasText = $.text().replaceAll(/\s+/g, '').length > 0;
    const hasMedia = media.length > 0;

    return hasText || hasMedia ? $.html() : undefined;
};

/**
 * Extracts the cleaned description of the `.attachment` element of a parsed page.
 *
 * @param $ Parsed article page.
 * @returns Whatever `getDescription` yields for the inner HTML of `.attachment`.
 */
const getAttachmentDescription = ($: CheerioAPI) => {
    const attachmentHtml = $('.attachment').html();
    return getDescription(attachmentHtml);
};

/**
 * Combines the main description with media taken from the attachment description.
 *
 * When either side is missing, the other one is returned unchanged. Otherwise every
 * media element of the attachment that has a `src` not already used by a media element
 * of the main description is serialized and placed in front of the main description.
 *
 * @param mainDescription HTML of the article body, if any.
 * @param attachmentDescription HTML of the attachment block, if any.
 * @returns The merged HTML, or the single available side.
 */
const mergeDescription = (mainDescription: string | undefined, attachmentDescription: string | undefined) => {
    if (!mainDescription || !attachmentDescription) {
        // Main missing -> attachment (whatever it is); attachment missing -> main.
        return mainDescription || attachmentDescription;
    }

    const mediaSelector = 'img, video, audio, source';
    const $main = load(mainDescription);
    const $attachment = load(attachmentDescription);

    // Collect every media source already present in the main description.
    const knownSources = new Set<string>();
    for (const node of $main(mediaSelector).toArray()) {
        const src = $main(node).attr('src');
        if (src) {
            knownSources.add(src);
        }
    }

    // Serialize attachment media whose source is not in the main description yet.
    let extraMedia = '';
    for (const node of $attachment(mediaSelector).toArray()) {
        const src = $attachment(node).attr('src');
        if (!src || knownSources.has(src)) {
            continue;
        }
        extraMedia += $attachment.html(node) || '';
    }

    if (!extraMedia) {
        return mainDescription;
    }

    return `${extraMedia}${mainDescription}`;
};

/**
 * Resolves which issue to fetch, split into the two path segments used by the site.
 *
 * With an explicit date the value must be eight digits (`YYYYMMDD`). Without one, the
 * index page is fetched and the date is taken from the href of the first entry in `#list`.
 *
 * @param date Optional date in `YYYYMMDD` form.
 * @returns `yearMonth` (six digits) and `day` (two digits) of the issue.
 * @throws Error when the date is malformed, or when the latest issue cannot be located or parsed.
 */
const getIssueDate = async (date: string | undefined) => {
    if (!date) {
        // No date supplied: discover the latest issue from the index page.
        const { data } = await got(`${ROOT_URL}/pc/col/index.html`);
        const latestHref = load(data)('#list li:first-child a').attr('href');

        if (!latestHref) {
            throw new Error('Failed to locate the latest Fujian Daily edition.');
        }

        const matched = /(\d{6})\/(\d{2})\/node_\d+\.html/.exec(latestHref);
        const yearMonth = matched?.[1];
        const day = matched?.[2];

        if (!yearMonth || !day) {
            throw new Error('Failed to parse the latest Fujian Daily edition date.');
        }

        return { yearMonth, day };
    }

    const isEightDigits = /^\d{8}$/.test(date);
    if (!isEightDigits) {
        throw new Error('Invalid date format. Expected YYYYMMDD, for example `20260316`.');
    }

    return {
        yearMonth: date.slice(0, 6),
        day: date.slice(6, 8),
    };
};

/**
 * Route definition for the Fujian Daily e-paper feed.
 *
 * The optional `date` parameter selects an issue; when it is omitted the handler
 * resolves the latest published issue from the site index (not necessarily today's).
 */
export const route: Route = {
    path: '/:date?',
    categories: ['traditional-media'],
    example: '/fjrb/20260316',
    // Wording kept in line with `description` below: an empty date means the latest issue.
    parameters: { date: '日期,格式为 `YYYYMMDD`,留空时抓取最新一期全部版面,例如 `20260316`' },
    features: {
        requireConfig: false,
        requirePuppeteer: false,
        antiCrawler: false,
        supportBT: false,
        supportPodcast: false,
        supportScihub: false,
    },
    radar: [
        {
            source: ['fjrb.fjdaily.com/pc/col/index.html'],
            target: '/',
        },
        {
            source: ['fjrb.fjdaily.com/pc/col/:yearmonth/:day/node_:id.html'],
        },
    ],
    name: '电子报',
    maintainers: ['nczitzk'],
    handler,
    description: '留空时抓取最新一期全部版面,也可以通过日期参数抓取指定日期的全部版面内容。',
};

/**
 * Route handler: builds the feed for one Fujian Daily issue.
 *
 * Reads the optional date route parameter, resolves the issue through getIssueDate,
 * collects article links from the catalog of the pad edition page, then loads every
 * article page (through cache.tryGet keyed by the article link) to fill in author,
 * description and pubDate.
 *
 * @param ctx Request context; only ctx.req.param is used here.
 * @returns An object with title, link and item (the resolved article list).
 * @throws Error when the catalog yields no article links; errors from getIssueDate propagate.
 */
async function handler(ctx) {
const date = ctx.req.param('date');
const { yearMonth, day } = await getIssueDate(date);
// The pad edition page is the one fetched for the catalog; the pc URL is used as the feed link.
const padUrl = `${ROOT_URL}/pad/col/${yearMonth}/${day}/node_01.html`;
const pageUrl = `${ROOT_URL}/pc/col/${yearMonth}/${day}/node_01.html`;

const pageResponse = await got(padUrl);
const content = load(pageResponse.data);

// Catalog rows are visited in order: a row with class verson sets the category that is
// attached to the article rows following it.
let currentCategory = '';
const list: Array<{ title: string; link: string; category: string[] }> = [];
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do not reinvent the wheels.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the category handling, I’m currently tracking it from the verson nodes. Let me know if you prefer a different pattern or existing helper here.


content('#catalog li').each((_, item) => {
const element = content(item);
if (element.hasClass('verson')) {
// Heading row: remember its whitespace-normalized text and move on.
currentCategory = element.text().replaceAll(/\s+/g, ' ').trim();
return;
}

const a = element.find('a').first();
const href = a.attr('href');
// Rows without a link cannot become feed items.
if (!href) {
return;
}

list.push({
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use map instead of push.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed. Updated to use map.

title: a.text().replaceAll(/\s+/g, ' ').trim(),
// Resolve the href against the pad page, then point the link at the pc edition.
link: new URL(href, padUrl).toString().replace('/pad/', '/pc/'),
// Drop a leading prefix made of digits followed by 版 from the category label.
category: [currentCategory.replace(/^\d+版\s*/, '')],
});
});

if (list.length === 0) {
throw new Error(`No articles were found for ${yearMonth}${day}.`);
}

const items = await Promise.all(
list.map((item) =>
cache.tryGet(item.link, async () => {
const detailResponse = await got(item.link);
const detail = load(detailResponse.data);
const pubDate = detail('#NewsArticlePubDay').text().trim();
const author = detail('#NewsArticleAuthor').text().trim();
// Prefer #content; fall back to #ozoom when #content yields no usable description.
const mainDescription = getDescription(detail('#content').html()) || getDescription(detail('#ozoom').html());
const attachmentDescription = getAttachmentDescription(detail);
const description = mergeDescription(mainDescription, attachmentDescription);
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't use load more than once per item.link

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed. Reused the parsed document for each item.link.


return {
...item,
// Empty strings are turned into undefined so the fields are omitted.
author: author || undefined,
description: description || undefined,
// NOTE(review): the 8 passed to timezone is presumably a UTC+8 offset; confirm against utils/timezone.
pubDate: pubDate ? timezone(parseDate(pubDate), 8) : undefined,
};
})
)
);

return {
title: `福建日报 - ${yearMonth.slice(0, 4)}-${yearMonth.slice(4, 6)}-${day}`,
link: pageUrl,
item: items,
};
}
7 changes: 7 additions & 0 deletions lib/routes/fjrb/namespace.ts
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wrong namespace is used.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. I followed the fjksbm namespace (which uses lang) and used fjrb based on the subdomain. Should this instead use fjdaily, and should lang be removed?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use the second level domain.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated to use the fjdaily namespace (second-level domain).

Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import type { Namespace } from '@/types';

/**
 * Namespace metadata for the Fujian Daily routes: display name, site host and content language.
 */
export const namespace: Namespace = {
    lang: 'zh-CN',
    url: 'fjrb.fjdaily.com',
    name: '福建日报',
};
Loading