Skip to content

Commit 68bd9ed

Browse files
dbym4820 and claude committed
v2.0.0 - AI-powered paper extraction without RSS
Major Features:
- AI-based paper extraction: Extract papers directly from journal web pages using Claude/OpenAI
- No RSS required: Works with any journal website that lists papers
- Automatic data normalization: AI handles date formats, DOI extraction, author parsing
- Background PDF processing: Queue-based PDF download and text extraction

Changes:
- AiRssGeneratorService: New extractPapersWithAi() for direct AI extraction
- RssFetcherService: Separate processing paths for RSS and AI-generated feeds
- PaperController: COALESCE sorting to handle NULL published_date
- PaperList: Default date filter changed to 'all'
- New migrations for jobs table and pdf_status column

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent f39f266 commit 68bd9ed

26 files changed

+1259
-404
lines changed

.env.example

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ DB_SOCKET=
1818
# MAMP MySQL 5.7: /Applications/MAMP/Library/bin/mysql57/bin/mysql
1919
MYSQL_BIN=
2020

21+
# キュー設定(PDF処理などの非同期ジョブ用)
22+
# database: DBを使用(推奨),sync: 同期処理(デバッグ用)
23+
QUEUE_CONNECTION=database
24+
2125
# 初期管理者ユーザー(マイグレーション時に自動作成)
2226
ADMIN_USER_ID=
2327
ADMIN_USERNAME=

app/Http/Controllers/AdminController.php

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -149,11 +149,16 @@ public function createJournal(Request $request): JsonResponse
149149

150150
// 初回フェッチを実行
151151
if ($journal->isAiGenerated()) {
152-
// AI生成の場合
153-
$fetchResult = $this->aiRssGenerator->generateFeed($journal, $user);
152+
// AI生成の場合:ページ構造を解析してセレクタを保存
153+
$setupResult = $this->aiRssGenerator->setupFeed($journal, $user);
154154
$message = '論文誌を追加しました';
155-
if ($fetchResult['success']) {
156-
$message .= '' . ($fetchResult['papers_count'] ?? 0) . '件の論文を検出)';
155+
if ($setupResult['success']) {
156+
// HTMLからパースした論文情報をデータベースに登録
157+
$fetchResult = $this->rssFetcher->fetchJournal($journal);
158+
$newPapers = $fetchResult['new_papers'] ?? 0;
159+
$message .= '' . $newPapers . '件の論文を登録)';
160+
} else {
161+
$fetchResult = $setupResult;
157162
}
158163
} else {
159164
// 通常RSSの場合

app/Http/Controllers/GeneratedRssController.php

Lines changed: 24 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -3,63 +3,24 @@
33
namespace App\Http\Controllers;
44

55
use Illuminate\Http\Request;
6-
use Illuminate\Http\Response;
76
use Illuminate\Http\JsonResponse;
8-
use App\Models\GeneratedFeed;
97
use App\Models\Journal;
108
use App\Services\AiRssGeneratorService;
9+
use App\Services\RssFetcherService;
1110

1211
class GeneratedRssController extends Controller
1312
{
14-
private AiRssGeneratorService $rssGenerator;
13+
private AiRssGeneratorService $aiGenerator;
14+
private RssFetcherService $rssFetcher;
1515

16-
public function __construct(AiRssGeneratorService $rssGenerator)
16+
public function __construct(AiRssGeneratorService $aiGenerator, RssFetcherService $rssFetcher)
1717
{
18-
$this->rssGenerator = $rssGenerator;
18+
$this->aiGenerator = $aiGenerator;
19+
$this->rssFetcher = $rssFetcher;
1920
}
2021

2122
/**
22-
* Serve RSS feed by feed token (public, no authentication required)
23-
* Dynamically fetches and parses the source page using saved selectors
24-
*/
25-
public function serve(string $feedToken): Response
26-
{
27-
$feed = GeneratedFeed::where('feed_token', $feedToken)
28-
->with('journal')
29-
->first();
30-
31-
if (!$feed) {
32-
return response('Feed not found', 404)
33-
->header('Content-Type', 'text/plain');
34-
}
35-
36-
// Check if we have valid selectors
37-
$selectors = $feed->extraction_config['selectors'] ?? null;
38-
if (!$selectors || empty($selectors['title'])) {
39-
return response('Feed not configured. Please regenerate the feed.', 503)
40-
->header('Content-Type', 'text/plain');
41-
}
42-
43-
// Dynamically generate RSS by fetching and parsing the source page
44-
try {
45-
$rssXml = $this->rssGenerator->generateRssDynamically($feed);
46-
47-
return response($rssXml)
48-
->header('Content-Type', 'application/rss+xml; charset=utf-8')
49-
->header('Cache-Control', 'public, max-age=1800'); // Cache for 30 minutes
50-
} catch (\Exception $e) {
51-
\Log::error('Failed to generate RSS dynamically', [
52-
'feed_token' => $feedToken,
53-
'error' => $e->getMessage(),
54-
]);
55-
56-
return response('Failed to fetch feed: ' . $e->getMessage(), 503)
57-
->header('Content-Type', 'text/plain');
58-
}
59-
}
60-
61-
/**
62-
* Regenerate feed (requires authentication)
23+
* Reanalyze page structure and fetch papers (requires authentication)
6324
*/
6425
public function regenerate(Request $request, string $journalId): JsonResponse
6526
{
@@ -81,21 +42,30 @@ public function regenerate(Request $request, string $journalId): JsonResponse
8142
], 400);
8243
}
8344

84-
$result = $this->rssGenerator->generateFeed($journal, $user);
45+
// AIでページ構造を再解析してセレクタを更新
46+
$result = $this->aiGenerator->reanalyzeStructure($journal, $user);
8547

8648
if (!$result['success']) {
87-
return response()->json([
49+
$response = [
8850
'success' => false,
8951
'error' => $result['error'],
90-
], 400);
52+
];
53+
if (!empty($result['debug'])) {
54+
$response['debug'] = $result['debug'];
55+
}
56+
return response()->json($response, 400);
9157
}
9258

59+
// HTMLからパースした論文情報をデータベースに登録
60+
$fetchResult = $this->rssFetcher->fetchJournal($journal);
61+
$newPapers = $fetchResult['new_papers'] ?? 0;
62+
9363
return response()->json([
9464
'success' => true,
95-
'message' => 'Feed regenerated successfully',
65+
'message' => 'ページを再解析しました(' . $newPapers . '件の新規論文を登録)',
9666
'papers_count' => $result['papers_count'],
97-
'feed_token' => $result['feed_token'],
98-
'provider' => $result['provider'] ?? ($result['method'] === 'selector' ? 'selector' : null),
67+
'new_papers' => $newPapers,
68+
'provider' => $result['provider'] ?? null,
9969
]);
10070
}
10171

@@ -134,9 +104,9 @@ public function testPage(Request $request): JsonResponse
134104

135105
// Use auto-redirect version if enabled
136106
if ($autoRedirect) {
137-
$result = $this->rssGenerator->testPageAnalysisWithRedirect($url, $user);
107+
$result = $this->aiGenerator->testPageAnalysisWithRedirect($url, $user);
138108
} else {
139-
$result = $this->rssGenerator->testPageAnalysis($url, $user);
109+
$result = $this->aiGenerator->testPageAnalysis($url, $user);
140110
}
141111

142112
$response = [

app/Http/Controllers/JournalController.php

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ public function index(Request $request): JsonResponse
3939
if ($j->generatedFeed) {
4040
$data['generated_feed'] = [
4141
'id' => $j->generatedFeed->id,
42-
'feed_token' => $j->generatedFeed->feed_token,
4342
'source_url' => $j->generatedFeed->source_url,
4443
'ai_provider' => $j->generatedFeed->ai_provider,
4544
'ai_model' => $j->generatedFeed->ai_model,
@@ -94,7 +93,6 @@ public function show(Request $request, string $id): JsonResponse
9493
if ($journal->generatedFeed) {
9594
$response['generated_feed'] = [
9695
'id' => $journal->generatedFeed->id,
97-
'feed_token' => $journal->generatedFeed->feed_token,
9896
'source_url' => $journal->generatedFeed->source_url,
9997
'ai_provider' => $journal->generatedFeed->ai_provider,
10098
'ai_model' => $journal->generatedFeed->ai_model,

app/Http/Controllers/PaperController.php

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
use Illuminate\Support\Facades\Storage;
88
use Symfony\Component\HttpFoundation\StreamedResponse;
99
use App\Models\Paper;
10+
use App\Services\QueueRunnerService;
1011
use Illuminate\Support\Facades\DB;
1112

1213
class PaperController extends Controller
@@ -42,7 +43,8 @@ public function index(Request $request): JsonResponse
4243
$offset = (int) ($request->offset ?? 0);
4344

4445
$total = $query->count();
45-
$papers = $query->orderBy('published_date', 'desc')
46+
// Use COALESCE to handle NULL published_date (AI-generated papers may not have dates)
47+
$papers = $query->orderByRaw('COALESCE(published_date, DATE(fetched_at)) DESC')
4648
->orderBy('fetched_at', 'desc')
4749
->offset($offset)
4850
->limit($limit)
@@ -154,6 +156,51 @@ public function downloadPdf(Request $request, int $id): StreamedResponse|JsonRes
154156
]);
155157
}
156158

159+
/**
160+
* Check the PDF processing status and, when jobs are queued, try to start the worker.
161+
* Also returns the pdf_status of each paper.
162+
*
* The current user is read from the request's 'user' attribute
* (NOTE(review): presumably set by an auth middleware — confirm).
*
* @param Request $request Incoming request carrying the 'user' attribute.
* @return JsonResponse JSON with the keys success, processing_count, pending_jobs,
*                      worker_running, worker_started and paper_statuses.
*/
163+
public function processingStatus(Request $request): JsonResponse
164+
{
165+
$user = $request->attributes->get('user');
166+
167+
// Fetch id and status of the user's papers that take part in PDF processing
// (pending/processing/completed/failed), i.e. those with a non-null pdf_status
168+
$paperStatuses = Paper::forUser($user->id)
169+
->whereNotNull('pdf_status')
170+
->select('id', 'pdf_status', 'pdf_path')
171+
->get()
172+
->map(function ($paper) {
173+
return [
174+
'id' => $paper->id,
175+
'pdf_status' => $paper->pdf_status,
176+
'has_local_pdf' => !empty($paper->pdf_path),
177+
];
178+
});
179+
180+
// Count the papers that are still waiting for or undergoing processing
$processingCount = $paperStatuses->whereIn('pdf_status', ['pending', 'processing'])->count();
181+
182+
// Number of jobs in the queue
183+
$pendingJobs = QueueRunnerService::getPendingJobCount('pdf-processing');
184+
185+
// Worker state (sampled before any start attempt below)
186+
$workerRunning = QueueRunnerService::isWorkerRunning('pdf-processing');
187+
$workerStarted = false;
188+
189+
// Whenever there are jobs, always attempt to start the worker
190+
if ($pendingJobs > 0) {
191+
$workerStarted = QueueRunnerService::startWorkerIfNeeded('pdf-processing');
192+
}
193+
194+
return response()->json([
195+
'success' => true,
196+
'processing_count' => $processingCount,
197+
'pending_jobs' => $pendingJobs,
// Reported as running if it already was, or if this call just started it
198+
'worker_running' => $workerRunning || $workerStarted,
199+
'worker_started' => $workerStarted,
200+
'paper_statuses' => $paperStatuses,
201+
]);
202+
}
203+
157204
private function formatPaper(Paper $paper, bool $detailed = false): array
158205
{
159206
$data = [
@@ -182,6 +229,7 @@ private function formatPaper(Paper $paper, bool $detailed = false): array
182229
'full_text_source' => $paper->full_text_source,
183230
'pdf_url' => $paper->pdf_url,
184231
'has_local_pdf' => $paper->hasLocalPdf(),
232+
'pdf_status' => $paper->pdf_status,
185233
// Always include summaries for frontend to show existing summaries
186234
'summaries' => $paper->summaries->map(function ($s) {
187235
return [
app/Jobs/ProcessPaperFullTextJob.php

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
<?php
2+
3+
namespace App\Jobs;
4+
5+
use App\Models\Paper;
6+
use App\Services\FullTextFetcherService;
7+
use Illuminate\Bus\Queueable;
8+
use Illuminate\Contracts\Queue\ShouldQueue;
9+
use Illuminate\Foundation\Bus\Dispatchable;
10+
use Illuminate\Queue\InteractsWithQueue;
11+
use Illuminate\Queue\SerializesModels;
12+
use Illuminate\Support\Facades\Log;
13+
14+
/**
 * Queued job that fetches the full text of a single paper through
 * FullTextFetcherService and records the outcome in the paper's pdf_status
 * ('processing', then 'completed' or 'failed'). Dispatched onto the
 * 'pdf-processing' queue.
 */
class ProcessPaperFullTextJob implements ShouldQueue
15+
{
16+
use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
17+
18+
/**
19+
* Job timeout (seconds).
20+
* Leaves enough time for PDF parsing.
21+
*/
22+
public int $timeout = 300; // 5 minutes
23+
24+
/**
25+
* Number of attempts before the job is considered failed.
26+
*/
27+
public int $tries = 2;
28+
29+
/**
30+
* Number of seconds to wait before retrying.
31+
*/
32+
public int $backoff = 60;
33+
34+
// ID of the paper to process; the model itself is looked up in handle()
protected int $paperId;
35+
36+
/**
37+
* Create a new job instance.
38+
*
* @param int $paperId ID of the paper whose full text should be fetched.
*/
39+
public function __construct(int $paperId)
40+
{
41+
$this->paperId = $paperId;
42+
$this->onQueue('pdf-processing');
43+
}
44+
45+
/**
46+
* Execute the job.
47+
*
* Returns early when the paper no longer exists or is already 'completed'.
* Otherwise marks the paper 'processing', calls fetchFullText() and stores
* either the fetched text (status 'completed') or status 'failed'.
*
* @param FullTextFetcherService $fullTextFetcher Service used to fetch the full text.
* @throws \Throwable Any exception raised while fetching/updating is logged and rethrown.
*/
48+
public function handle(FullTextFetcherService $fullTextFetcher): void
49+
{
50+
$paper = Paper::find($this->paperId);
51+
52+
if (!$paper) {
53+
Log::warning("ProcessPaperFullTextJob: Paper {$this->paperId} not found");
54+
return;
55+
}
56+
57+
// Skip if the paper has already been processed
58+
if ($paper->pdf_status === 'completed') {
59+
Log::debug("ProcessPaperFullTextJob: Paper {$this->paperId} already completed");
60+
return;
61+
}
62+
63+
// Update the status to "processing"
64+
$paper->update(['pdf_status' => 'processing']);
65+
66+
Log::info("ProcessPaperFullTextJob: Processing paper {$this->paperId} - {$paper->title}");
67+
68+
try {
69+
$result = $fullTextFetcher->fetchFullText($paper);
70+
71+
if ($result['success']) {
72+
$updateData = [
73+
'full_text' => $result['text'],
74+
'full_text_source' => $result['source'],
75+
'full_text_fetched_at' => now(),
76+
'pdf_status' => 'completed',
77+
];
78+
79+
// pdf_url / pdf_path are only written when the result provides non-empty values
if (!empty($result['pdf_url'])) {
80+
$updateData['pdf_url'] = $result['pdf_url'];
81+
}
82+
if (!empty($result['pdf_path'])) {
83+
$updateData['pdf_path'] = $result['pdf_path'];
84+
}
85+
86+
$paper->update($updateData);
87+
88+
Log::info("ProcessPaperFullTextJob: Successfully processed paper {$this->paperId}");
89+
} else {
90+
// Also update the status when fetching the PDF fails (not subject to retry)
91+
$paper->update(['pdf_status' => 'failed']);
92+
Log::debug("ProcessPaperFullTextJob: Failed to fetch full text for paper {$this->paperId}: " . ($result['error'] ?? 'Unknown error'));
93+
}
94+
} catch (\Throwable $e) {
95+
// NOTE(review): pdf_status is left as 'processing' here; it is only set to 'failed' in failed()
Log::error("ProcessPaperFullTextJob: Exception processing paper {$this->paperId}: " . $e->getMessage());
96+
throw $e; // Rethrow the exception so the job can be retried
97+
}
98+
}
99+
100+
/**
101+
* Handle a job failure.
102+
*
* Logs the failure and, if the paper still exists, marks it 'failed'.
*
* @param \Throwable $exception The exception that caused the failure.
*/
103+
public function failed(\Throwable $exception): void
104+
{
105+
Log::error("ProcessPaperFullTextJob: Job failed for paper {$this->paperId}: " . $exception->getMessage());
106+
107+
// Update the status when the job has ultimately failed
108+
$paper = Paper::find($this->paperId);
109+
if ($paper) {
110+
$paper->update(['pdf_status' => 'failed']);
111+
}
112+
}
113+
}

0 commit comments

Comments
 (0)