Skip to content

Commit d009c44

Browse files
authored
Merge pull request #527 from ConardLi/dev
1.4.0
2 parents 32b173a + 41cc4d4 commit d009c44

60 files changed

Lines changed: 4757 additions & 287 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,13 +141,15 @@ services:
141141
ports:
142142
- '1717:1717'
143143
volumes:
144-
- ${LOCAL_DB_PATH}:/app/local-db
145-
- ${LOCAL_PRISMA_PATH}:/app/prisma
144+
- ./local-db:/app/local-db
145+
# - ./prisma:/app/prisma  # If mounting is required, please manually initialize the database file first.
146146
restart: unless-stopped
147147
```
148148
149149
> **Note:** The compose file above mounts the `local-db` folder (and, optionally, the `prisma` folder) from the current code repository directory, which keeps the database paths consistent with those used when starting via NPM. If you want to store the local database somewhere else, replace `./local-db` and `./prisma` with the actual paths you want to use.
150150

151+
> **Note:** If you need to mount the Prisma database file, run `npm run db:push` beforehand to initialize it.
152+
151153
3. Start with docker-compose:
152154

153155
```bash

README.zh-CN.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,8 +141,8 @@ services:
141141
ports:
142142
- '1717:1717'
143143
volumes:
144-
- ${LOCAL_DB_PATH}:/app/local-db
145-
- ${LOCAL_PRISMA_PATH}:/app/prisma
144+
- ./local-db:/app/local-db
145+
# - ./prisma:/app/prisma  # 如果需要挂载请先手动初始化数据库文件
146146
restart: unless-stopped
147147
```
148148
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import { NextResponse } from 'next/server';
2+
import logger from '@/lib/util/logger';
3+
import cleanService from '@/lib/services/clean';
4+
5+
/**
 * Run data cleaning on a single text chunk of a project.
 *
 * Expects a JSON object body of the form `{ model, language? }`; `language`
 * defaults to '中文' when it is omitted.
 *
 * @param {Request} request - Incoming request carrying the JSON body.
 * @param {{ params: { projectId: string, chunkId: string } }} context - Route parameters.
 * @returns {Promise<Response>} 200 with the cleaning summary, 400 when the
 *   route parameters or body are invalid, 500 when the cleaning service throws.
 */
export async function POST(request, { params }) {
  try {
    const { projectId, chunkId } = params;

    // Both identifiers are required to locate the chunk.
    if (!projectId || !chunkId) {
      return NextResponse.json({ error: 'Project ID or text block ID cannot be empty' }, { status: 400 });
    }

    // A malformed or non-object body is a client error: report it as 400
    // instead of letting the parse/destructuring failure surface as a 500.
    let body;
    try {
      body = await request.json();
    } catch {
      body = null;
    }
    if (body === null || typeof body !== 'object' || Array.isArray(body)) {
      return NextResponse.json({ error: 'Request body must be a JSON object' }, { status: 400 });
    }

    const { model, language = '中文' } = body;

    if (!model) {
      return NextResponse.json({ error: 'Model cannot be empty' }, { status: 400 });
    }

    // Delegate the actual cleaning to the clean service.
    const result = await cleanService.cleanDataForChunk(projectId, chunkId, {
      model,
      language
    });

    // Report the summary fields exposed by the service result.
    return NextResponse.json({
      chunkId,
      originalLength: result.originalLength,
      cleanedLength: result.cleanedLength,
      success: result.success,
      message: '数据清洗完成'
    });
  } catch (error) {
    logger.error('Error cleaning data:', error);
    return NextResponse.json({ error: error.message || 'Error cleaning data' }, { status: 500 });
  }
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import { getChunkContentsByNames } from '@/lib/db/chunks';
2+
import { NextResponse } from 'next/server';
3+
4+
/**
 * Fetch the contents of several text chunks of a project in one request.
 *
 * Expects a JSON body of the form `{ chunkNames: [...] }` and responds with
 * whatever `getChunkContentsByNames` returns for those names.
 *
 * @param {Request} request - Incoming request carrying the JSON body.
 * @param {{ params: { projectId: string } }} context - Route parameters.
 * @returns {Promise<Response>} 200 with the lookup result, 400 when
 *   `chunkNames` is missing or not an array, 500 when the lookup throws.
 */
export async function POST(request, { params }) {
  try {
    const { projectId } = params;

    // A malformed or null body must fall through to the 400 below rather than
    // throwing during parsing/destructuring and being reported as a 500.
    let body;
    try {
      body = await request.json();
    } catch {
      body = null;
    }
    const chunkNames = body?.chunkNames;

    if (!Array.isArray(chunkNames)) {
      return NextResponse.json({ error: 'chunkNames 参数必须是数组' }, { status: 400 });
    }

    const chunkContentMap = await getChunkContentsByNames(projectId, chunkNames);

    return NextResponse.json(chunkContentMap);
  } catch (error) {
    console.error('批量获取文本块内容失败:', error);
    return NextResponse.json({ error: '批量获取文本块内容失败' }, { status: 500 });
  }
}

app/api/projects/[projectId]/datasets/[datasetId]/route.js

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { NextResponse } from 'next/server';
2-
import { getDatasetsById, getDatasetsCounts, getNavigationItems } from '@/lib/db/datasets';
2+
import { getDatasetsById, getDatasetsCounts, getNavigationItems, updateDatasetMetadata } from '@/lib/db/datasets';
33

44
/**
55
* 获取项目的所有数据集
@@ -34,3 +34,49 @@ export async function GET(request, { params }) {
3434
);
3535
}
3636
}
37+
38+
/**
 * Update a dataset's metadata (score, tags, note).
 *
 * Expects a JSON object body of the form `{ score?, tags?, note? }`. The three
 * fields are forwarded to `updateDatasetMetadata` as-is; fields that were not
 * supplied are forwarded as `undefined`.
 *
 * @param {Request} request - Incoming request carrying the JSON body.
 * @param {{ params: { projectId: string, datasetId: string } }} context - Route parameters.
 * @returns {Promise<Response>} 200 with `{ success, dataset }`, 400 on invalid
 *   input, 500 when the update throws.
 */
export async function PATCH(request, { params }) {
  try {
    const { projectId, datasetId } = params;

    // Validate route parameters.
    if (!projectId) {
      return NextResponse.json({ error: '项目ID不能为空' }, { status: 400 });
    }
    if (!datasetId) {
      return NextResponse.json({ error: '数据集ID不能为空' }, { status: 400 });
    }

    // A malformed or non-object body is a client error, not a server failure.
    let body;
    try {
      body = await request.json();
    } catch {
      body = null;
    }
    if (body === null || typeof body !== 'object' || Array.isArray(body)) {
      return NextResponse.json({ error: 'Request body must be a JSON object' }, { status: 400 });
    }
    const { score, tags, note } = body;

    // A bare range comparison lets non-numeric values through (e.g. "abc" or
    // null compare as false on both sides), so require a finite number first.
    const scoreIsValid = typeof score === 'number' && Number.isFinite(score) && score >= 0 && score <= 5;
    if (score !== undefined && !scoreIsValid) {
      return NextResponse.json({ error: '评分必须在0-5之间' }, { status: 400 });
    }

    // Tags, when supplied, must be an array.
    if (tags !== undefined && !Array.isArray(tags)) {
      return NextResponse.json({ error: '标签必须是数组格式' }, { status: 400 });
    }

    const updatedDataset = await updateDatasetMetadata(datasetId, { score, tags, note });

    return NextResponse.json({
      success: true,
      dataset: updatedDataset
    });
  } catch (error) {
    console.error('更新数据集元数据失败:', String(error));
    return NextResponse.json(
      {
        error: error.message || '更新数据集元数据失败'
      },
      { status: 500 }
    );
  }
}

app/api/projects/[projectId]/datasets/export/route.js

Lines changed: 84 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
import { NextResponse } from 'next/server';
2-
import { getDatasets } from '@/lib/db/datasets';
2+
import {
3+
getDatasets,
4+
getBalancedDatasetsByTags,
5+
getTagsWithDatasetCounts,
6+
getDatasetsBatch,
7+
getBalancedDatasetsByTagsBatch
8+
} from '@/lib/db/datasets';
39

410
/**
511
* 获取导出数据集
/**
 * Parse an integer query parameter, falling back when it is missing,
 * non-numeric, or below the allowed minimum.
 */
function parseIntParam(raw, fallback, min) {
  const value = Number.parseInt(raw, 10);
  return Number.isInteger(value) && value >= min ? value : fallback;
}

/**
 * Export the datasets of a project.
 *
 * Query parameters read by this handler:
 * - `status`: 'confirmed' / 'unconfirmed' filter; any other value means no filter.
 * - `batchMode`: 'true' switches to paged export (`{ data, hasMore, offset }`).
 * - `offset`, `batchSize`: paging controls for batch mode (defaults 0 and 1000).
 * - `balanceMode` + `balanceConfig`: when `balanceMode` is 'true' and
 *   `balanceConfig` (a JSON string) is present, the tag-balanced queries are used.
 *
 * @param {Request} request - Incoming request; only `request.url` is read.
 * @param {{ params: { projectId: string } }} context - Route parameters.
 * @returns {Promise<Response>} 200 with the export payload, 400 on invalid
 *   input, 500 when a query throws.
 */
export async function GET(request, { params }) {
  try {
    const { projectId } = params;
    const { searchParams } = new URL(request.url);

    // Validate the project ID.
    if (!projectId) {
      return NextResponse.json({ error: 'Project ID cannot be empty' }, { status: 400 });
    }

    const status = searchParams.get('status');
    let confirmed;
    if (status === 'confirmed') confirmed = true;
    if (status === 'unconfirmed') confirmed = false;

    // Paging controls. Negative or non-numeric values fall back to the
    // defaults instead of being handed to the query layer.
    const batchMode = searchParams.get('batchMode') === 'true';
    const offset = parseIntParam(searchParams.get('offset'), 0, 0);
    const batchSize = parseIntParam(searchParams.get('batchSize'), 1000, 1);

    // Balanced export needs both the flag and a config string.
    const balanceConfig = searchParams.get('balanceConfig');
    const useBalance = searchParams.get('balanceMode') === 'true' && Boolean(balanceConfig);

    let parsedConfig;
    if (useBalance) {
      // The config comes straight from the query string, so invalid JSON is a
      // client error (400), not a server failure (500).
      try {
        parsedConfig = JSON.parse(balanceConfig);
      } catch {
        return NextResponse.json({ error: 'Invalid balanceConfig: must be valid JSON' }, { status: 400 });
      }
    }

    if (batchMode) {
      if (useBalance) {
        // Balanced, paged export: the query itself reports whether more rows remain.
        const result = await getBalancedDatasetsByTagsBatch(projectId, parsedConfig, confirmed, offset, batchSize);
        return NextResponse.json({
          data: result.data,
          hasMore: result.hasMore,
          offset: offset + result.data.length
        });
      }

      // Regular paged export: a full page implies there may be more rows.
      const page = await getDatasetsBatch(projectId, confirmed, offset, batchSize);
      return NextResponse.json({
        data: page,
        hasMore: page.length === batchSize,
        offset: offset + page.length
      });
    }

    // Legacy one-shot export (kept for backward compatibility).
    const datasets = useBalance
      ? await getBalancedDatasetsByTags(projectId, parsedConfig, confirmed)
      : await getDatasets(projectId, confirmed);
    return NextResponse.json(datasets);
  } catch (error) {
    console.error('Failed to get datasets:', String(error));
    return NextResponse.json(
      {
        error: error.message || 'Failed to get datasets'
      },
      { status: 500 }
    );
  }
}
81+
82+
/**
83+
* 获取标签统计信息
84+
*/
85+
export async function POST(request, { params }) {
86+
try {
87+
const { projectId } = params;
88+
const body = await request.json();
89+
const { confirmed } = body;
90+
91+
// 验证项目ID
92+
if (!projectId) {
93+
return NextResponse.json({ error: 'Project ID cannot be empty' }, { status: 400 });
94+
}
95+
96+
// 获取标签统计信息
97+
const tagStats = await getTagsWithDatasetCounts(projectId, confirmed);
98+
return NextResponse.json(tagStats);
2299
} catch (error) {
23-
console.error('获取数据集失败:', String(error));
100+
console.error('Failed to get tag statistics:', String(error));
24101
return NextResponse.json(
25102
{
26-
error: error.message || '获取数据集失败'
103+
error: error.message || 'Failed to get tag statistics'
27104
},
28105
{ status: 500 }
29106
);
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
import { NextResponse } from 'next/server';
2+
import { createDataset } from '@/lib/db/datasets';
3+
import { nanoid } from 'nanoid';
4+
5+
/**
 * Serialize an optional JSON-ish field for storage: strings are kept as-is,
 * arrays/objects are stringified, anything else yields the fallback text.
 */
function toJsonText(value, fallback) {
  if (typeof value === 'string') {
    return value;
  }
  if (value && typeof value === 'object') {
    try {
      return JSON.stringify(value);
    } catch {
      return fallback;
    }
  }
  return fallback;
}

/**
 * Build the `createDataset` payload for one imported record, or return `null`
 * when the record lacks a non-empty string `question` or `answer`.
 */
function buildImportedDataset(projectId, dataset) {
  const question = typeof dataset?.question === 'string' ? dataset.question.trim() : '';
  const answer = typeof dataset?.answer === 'string' ? dataset.answer.trim() : '';
  if (!question || !answer) {
    return null;
  }

  return {
    projectId,
    questionId: nanoid(), // fresh ID for every imported question
    question,
    answer,
    chunkName: dataset?.chunkName || 'Imported Data',
    chunkContent: dataset?.chunkContent || 'Imported from external source',
    model: dataset?.model || 'imported',
    questionLabel: dataset?.questionLabel || '',
    cot: typeof dataset?.cot === 'string' ? dataset.cot : '',
    confirmed: typeof dataset?.confirmed === 'boolean' ? dataset.confirmed : false,
    score: typeof dataset?.score === 'number' ? dataset.score : 0,
    tags: toJsonText(dataset?.tags, '[]'),
    note: typeof dataset?.note === 'string' ? dataset.note : '',
    other: toJsonText(dataset?.other, '{}')
  };
}

/**
 * Import a list of datasets into a project.
 *
 * Expects a JSON body of the form `{ datasets: [...], sourceInfo? }`. Records
 * are created one at a time, in order. A record without a usable
 * question/answer is skipped; a record whose creation throws is counted as
 * failed. Both kinds add a message to `errors`, but they are counted
 * separately so that `success + failed + skipped === total`.
 *
 * @param {Request} request - Incoming request carrying the JSON body.
 * @param {{ params: { projectId: string } }} context - Route parameters.
 * @returns {Promise<Response>} 200 with the import summary, 400 when
 *   `datasets` is missing or not an array, 500 on unexpected errors.
 */
export async function POST(request, { params }) {
  try {
    const { projectId } = params;

    // A malformed or null body is reported as invalid input (400), not as a 500.
    let payload;
    try {
      payload = await request.json();
    } catch {
      payload = null;
    }
    const datasets = payload?.datasets;
    const sourceInfo = payload?.sourceInfo;

    if (!Array.isArray(datasets)) {
      return NextResponse.json({ error: 'Invalid datasets data' }, { status: 400 });
    }

    const errors = [];
    let successCount = 0;
    let skippedCount = 0;
    let failedCount = 0;

    for (const [index, dataset] of datasets.entries()) {
      const position = index + 1;
      try {
        const record = buildImportedDataset(projectId, dataset);

        // Missing required fields: skip the record instead of failing the import.
        if (record === null) {
          errors.push(`第 ${position} 条记录缺少必填字段（question/answer），已跳过`);
          skippedCount++;
          continue;
        }

        await createDataset(record);
        successCount++;
      } catch (error) {
        errors.push(`第 ${position} 条记录: ${error.message}`);
        failedCount++;
      }
    }

    return NextResponse.json({
      success: successCount,
      total: datasets.length,
      // Only genuine creation failures; skipped records are reported separately.
      failed: failedCount,
      skipped: skippedCount,
      errors,
      sourceInfo
    });
  } catch (error) {
    console.error('Import datasets error:', error);
    return NextResponse.json({ error: error.message }, { status: 500 });
  }
}

0 commit comments

Comments
 (0)