ConardLi
diff --git a/‎README.md‎
Lines changed: 4 additions & 2 deletions b/‎README.md‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎README.zh-CN.md‎
Lines changed: 2 additions & 2 deletions b/‎README.zh-CN.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎app/api/projects/[projectId]/chunks/[chunkId]/clean/route.js‎
Lines changed: 40 additions & 0 deletions b/‎app/api/projects/[projectId]/chunks/[chunkId]/clean/route.js‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎app/api/projects/[projectId]/chunks/batch-content/route.js‎
Lines changed: 20 additions & 0 deletions b/‎app/api/projects/[projectId]/chunks/batch-content/route.js‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎app/api/projects/[projectId]/datasets/[datasetId]/route.js‎
Lines changed: 47 additions & 1 deletion b/‎app/api/projects/[projectId]/datasets/[datasetId]/route.js‎
Lines changed: 47 additions & 1 deletion
diff --git a/‎app/api/projects/[projectId]/datasets/export/route.js‎
Lines changed: 84 additions & 7 deletions b/‎app/api/projects/[projectId]/datasets/export/route.js‎
Lines changed: 84 additions & 7 deletions
diff --git a/‎app/api/projects/[projectId]/datasets/import/route.js‎
Lines changed: 109 additions & 0 deletions b/‎app/api/projects/[projectId]/datasets/import/route.js‎
Lines changed: 109 additions & 0 deletions
@@ -141,13 +141,15 @@ services:
     ports:
       - '1717:1717'
     volumes:
-      - ${LOCAL_DB_PATH}:/app/local-db
-      - ${LOCAL_PRISMA_PATH}:/app/prisma
+      - ./local-db:/app/local-db
+      # - ./prisma:/app/prisma  If mounting is required, please manually initialize the database file first.
     restart: unless-stopped
 ```
 
 > **Note:** Replace `{YOUR_LOCAL_DB_PATH}` and `{LOCAL_PRISMA_PATH}` with the actual paths where you want to store the local database. It is recommended to use the `local-db` and `prisma` folders in the current code repository directory to maintain consistency with the database paths when starting via NPM.
 
+> **Note:** If you want to mount the Prisma database directory, run `npm run db:push` beforehand to initialize the database file.
+
 3. Start with docker-compose:
 
 ```bash
 
@@ -141,8 +141,8 @@ services:
     ports:
       - '1717:1717'
     volumes:
-      - ${LOCAL_DB_PATH}:/app/local-db
-      - ${LOCAL_PRISMA_PATH}:/app/prisma
+      - ./local-db:/app/local-db
+      # - ./prisma:/app/prisma 如果需要挂载请先手动初始化数据库文件
     restart: unless-stopped
 ```
 
 
@@ -0,0 +1,40 @@
+import { NextResponse } from 'next/server';
+import logger from '@/lib/util/logger';
+import cleanService from '@/lib/services/clean';
+
+/**
+ * Run data cleaning for one text chunk of a project.
+ *
+ * Reads `model` (required) and `language` (optional, defaults to '中文') from the
+ * JSON request body and delegates to `cleanService.cleanDataForChunk`.
+ *
+ * Responds with the chunk ID plus the lengths and success flag reported by the
+ * service, with 400 when an ID or the model is missing, and with 500 when
+ * anything throws.
+ */
+export async function POST(request, { params }) {
+  try {
+    const { projectId, chunkId } = params;
+
+    // Both route identifiers are required before doing any work
+    const hasBothIds = Boolean(projectId) && Boolean(chunkId);
+    if (!hasBothIds) {
+      return NextResponse.json({ error: 'Project ID or text block ID cannot be empty' }, { status: 400 });
+    }
+
+    // Parse the request body
+    const payload = await request.json();
+    const { model, language = '中文' } = payload;
+
+    if (!model) {
+      return NextResponse.json({ error: 'Model cannot be empty' }, { status: 400 });
+    }
+
+    // Delegate to the data cleaning service
+    const cleaningOptions = { model, language };
+    const outcome = await cleanService.cleanDataForChunk(projectId, chunkId, cleaningOptions);
+
+    // Report the cleaning outcome
+    const responseBody = {
+      chunkId,
+      originalLength: outcome.originalLength,
+      cleanedLength: outcome.cleanedLength,
+      success: outcome.success,
+      message: '数据清洗完成'
+    };
+    return NextResponse.json(responseBody);
+  } catch (error) {
+    logger.error('Error cleaning data:', error);
+    const failureBody = { error: error.message || 'Error cleaning data' };
+    return NextResponse.json(failureBody, { status: 500 });
+  }
+}
@@ -0,0 +1,20 @@
+import { getChunkContentsByNames } from '@/lib/db/chunks';
+import { NextResponse } from 'next/server';
+
+/**
+ * Fetch the contents of several text chunks in a single request.
+ *
+ * Expects a JSON body of the form `{ chunkNames: [...] }` and responds with
+ * whatever `getChunkContentsByNames` returns for the project. Responds with
+ * 400 when `chunkNames` is not an array and with 500 when anything throws.
+ */
+export async function POST(request, { params }) {
+  try {
+    const { projectId } = params;
+    const payload = await request.json();
+    const { chunkNames } = payload;
+
+    // Array.isArray is already false for null/undefined, so one check suffices
+    if (Array.isArray(chunkNames)) {
+      const contentsByName = await getChunkContentsByNames(projectId, chunkNames);
+      return NextResponse.json(contentsByName);
+    }
+
+    return NextResponse.json({ error: 'chunkNames 参数必须是数组' }, { status: 400 });
+  } catch (error) {
+    console.error('批量获取文本块内容失败:', error);
+    return NextResponse.json({ error: '批量获取文本块内容失败' }, { status: 500 });
+  }
+}
@@ -1,5 +1,5 @@
 import { NextResponse } from 'next/server';
-import { getDatasetsById, getDatasetsCounts, getNavigationItems } from '@/lib/db/datasets';
+import { getDatasetsById, getDatasetsCounts, getNavigationItems, updateDatasetMetadata } from '@/lib/db/datasets';
 
 /**
  * 获取项目的所有数据集
@@ -34,3 +34,49 @@ export async function GET(request, { params }) {
     );
   }
 }
+
+/**
+ * Update a dataset's metadata (score, tags, note).
+ *
+ * Request body (JSON): `{ score?, tags?, note? }`.
+ * - `score`, when provided, must be numeric and lie within 0-5.
+ * - `tags`, when provided, must be an array.
+ *
+ * Responds with `{ success: true, dataset }` on success, with 400 when a route
+ * parameter is missing or validation fails, and with 500 when the update throws.
+ */
+export async function PATCH(request, { params }) {
+  try {
+    const { projectId, datasetId } = params;
+
+    // Validate route parameters
+    if (!projectId) {
+      return NextResponse.json({ error: '项目ID不能为空' }, { status: 400 });
+    }
+    if (!datasetId) {
+      return NextResponse.json({ error: '数据集ID不能为空' }, { status: 400 });
+    }
+
+    const body = await request.json();
+    const { score, tags, note } = body;
+
+    // Validate the score range. A non-numeric value such as "abc" coerces to
+    // NaN, and every comparison against NaN is false, so a plain range check
+    // would let it through; reject NaN explicitly.
+    if (score !== undefined) {
+      const numericScore = Number(score);
+      if (Number.isNaN(numericScore) || numericScore < 0 || numericScore > 5) {
+        return NextResponse.json({ error: '评分必须在0-5之间' }, { status: 400 });
+      }
+    }
+
+    // Validate the tags format
+    if (tags !== undefined && !Array.isArray(tags)) {
+      return NextResponse.json({ error: '标签必须是数组格式' }, { status: 400 });
+    }
+
+    // Persist the metadata changes
+    const updatedDataset = await updateDatasetMetadata(datasetId, { score, tags, note });
+
+    return NextResponse.json({
+      success: true,
+      dataset: updatedDataset
+    });
+  } catch (error) {
+    console.error('更新数据集元数据失败:', String(error));
+    return NextResponse.json(
+      {
+        error: error.message || '更新数据集元数据失败'
+      },
+      { status: 500 }
+    );
+  }
+}
@@ -1,5 +1,11 @@
 import { NextResponse } from 'next/server';
-import { getDatasets } from '@/lib/db/datasets';
+import {
+  getDatasets,
+  getBalancedDatasetsByTags,
+  getTagsWithDatasetCounts,
+  getDatasetsBatch,
+  getBalancedDatasetsByTagsBatch
+} from '@/lib/db/datasets';
 
 /**
  * 获取导出数据集
@@ -8,22 +14,93 @@ export async function GET(request, { params }) {
   try {
     const { projectId } = params;
     const { searchParams } = new URL(request.url);
+
     // 验证项目ID
     if (!projectId) {
-      return NextResponse.json({ error: '项目ID不能为空' }, { status: 400 });
+      return NextResponse.json({ error: 'Project ID cannot be empty' }, { status: 400 });
     }
+
     let status = searchParams.get('status');
     let confirmed = undefined;
     if (status === 'confirmed') confirmed = true;
     if (status === 'unconfirmed') confirmed = false;
-    // 获取数据集
-    let datasets = await getDatasets(projectId, confirmed);
-    return NextResponse.json(datasets);
+
+    // 检查是否是分批导出模式
+    const batchMode = searchParams.get('batchMode');
+    const offset = parseInt(searchParams.get('offset')) || 0;
+    const batchSize = parseInt(searchParams.get('batchSize')) || 1000;
+
+    // 检查是否是平衡导出
+    const balanceMode = searchParams.get('balanceMode');
+    const balanceConfig = searchParams.get('balanceConfig');
+
+    if (batchMode === 'true') {
+      // 分批导出模式
+      if (balanceMode === 'true' && balanceConfig) {
+        // 平衡分批导出
+        const parsedConfig = JSON.parse(balanceConfig);
+        const result = await getBalancedDatasetsByTagsBatch(projectId, parsedConfig, confirmed, offset, batchSize);
+        return NextResponse.json({
+          data: result.data,
+          hasMore: result.hasMore,
+          offset: offset + result.data.length
+        });
+      } else {
+        // 常规分批导出
+        const datasets = await getDatasetsBatch(projectId, confirmed, offset, batchSize);
+        const hasMore = datasets.length === batchSize;
+        return NextResponse.json({
+          data: datasets,
+          hasMore,
+          offset: offset + datasets.length
+        });
+      }
+    } else {
+      // 传统一次性导出模式（保持向后兼容）
+      if (balanceMode === 'true' && balanceConfig) {
+        // 平衡导出模式
+        const parsedConfig = JSON.parse(balanceConfig);
+        const datasets = await getBalancedDatasetsByTags(projectId, parsedConfig, confirmed);
+        return NextResponse.json(datasets);
+      } else {
+        // 常规导出模式
+        const datasets = await getDatasets(projectId, confirmed);
+        return NextResponse.json(datasets);
+      }
+    }
+  } catch (error) {
+    console.error('Failed to get datasets:', String(error));
+    return NextResponse.json(
+      {
+        error: error.message || 'Failed to get datasets'
+      },
+      { status: 500 }
+    );
+  }
+}
+
+/**
+ * 获取标签统计信息
+ */
+export async function POST(request, { params }) {
+  try {
+    const { projectId } = params;
+    const body = await request.json();
+    const { confirmed } = body;
+
+    // 验证项目ID
+    if (!projectId) {
+      return NextResponse.json({ error: 'Project ID cannot be empty' }, { status: 400 });
+    }
+
+    // 获取标签统计信息
+    const tagStats = await getTagsWithDatasetCounts(projectId, confirmed);
+    return NextResponse.json(tagStats);
   } catch (error) {
-    console.error('获取数据集失败:', String(error));
+    console.error('Failed to get tag statistics:', String(error));
     return NextResponse.json(
       {
-        error: error.message || '获取数据集失败'
+        error: error.message || 'Failed to get tag statistics'
       },
       { status: 500 }
     );
 
@@ -0,0 +1,109 @@
+import { NextResponse } from 'next/server';
+import { createDataset } from '@/lib/db/datasets';
+import { nanoid } from 'nanoid';
+
+/**
+ * Serialize `value` to JSON, returning `fallback` if serialization throws.
+ */
+function stringifyOr(value, fallback) {
+  try {
+    return JSON.stringify(value);
+  } catch {
+    return fallback;
+  }
+}
+
+/**
+ * Import a list of dataset records into a project.
+ *
+ * Request body (JSON): `{ datasets: [...], sourceInfo? }`. Each record needs a
+ * non-empty string `question` and `answer`; records lacking either are skipped.
+ * Optional fields fall back to defaults, and `tags`/`other` are stored as
+ * strings (objects and arrays are JSON-serialized, strings are kept as given).
+ *
+ * Responds with `{ success, total, failed, skipped, errors, sourceInfo }`, where
+ * `failed` counts only records whose creation threw and `skipped` counts records
+ * rejected by validation, so `success + failed + skipped === total`. `errors`
+ * holds one message per failed or skipped record. Responds with 400 when
+ * `datasets` is not an array and with 500 on unexpected errors.
+ */
+export async function POST(request, { params }) {
+  try {
+    const { projectId } = params;
+    const { datasets, sourceInfo } = await request.json();
+
+    if (!datasets || !Array.isArray(datasets)) {
+      return NextResponse.json({ error: 'Invalid datasets data' }, { status: 400 });
+    }
+
+    const errors = [];
+    let successCount = 0;
+    let skippedCount = 0;
+    // Tracked separately from `errors`, which also receives skip messages
+    let failedCount = 0;
+
+    // Records are created one at a time so a failure only affects that record
+    for (let i = 0; i < datasets.length; i++) {
+      try {
+        const dataset = datasets[i];
+
+        // Read and trim the required fields defensively
+        const q = typeof dataset?.question === 'string' ? dataset.question.trim() : '';
+        const a = typeof dataset?.answer === 'string' ? dataset.answer.trim() : '';
+
+        // Skip records missing a required field
+        if (!q || !a) {
+          errors.push(`第 ${i + 1} 条记录缺少必填字段(question/answer)，已跳过`);
+          skippedCount++;
+          continue;
+        }
+
+        // Normalize optional fields
+        const chunkName = dataset?.chunkName || 'Imported Data';
+        const chunkContent = dataset?.chunkContent || 'Imported from external source';
+        const model = dataset?.model || 'imported';
+        const questionLabel = dataset?.questionLabel || '';
+        const cot = typeof dataset?.cot === 'string' ? dataset.cot : '';
+        const confirmed = typeof dataset?.confirmed === 'boolean' ? dataset.confirmed : false;
+        const score = typeof dataset?.score === 'number' ? dataset.score : 0;
+
+        // tags: strings are stored as given; arrays and other objects are serialized
+        let tags = '[]';
+        if (typeof dataset?.tags === 'string') {
+          tags = dataset.tags;
+        } else if (dataset?.tags && typeof dataset.tags === 'object') {
+          tags = stringifyOr(dataset.tags, '[]');
+        }
+
+        // other: strings are stored as given; objects are serialized
+        let other = '{}';
+        if (typeof dataset?.other === 'string') {
+          other = dataset.other;
+        } else if (dataset?.other && typeof dataset.other === 'object') {
+          other = stringifyOr(dataset.other, '{}');
+        }
+
+        const note = typeof dataset?.note === 'string' ? dataset.note : '';
+
+        // Create the dataset record
+        await createDataset({
+          projectId,
+          questionId: nanoid(), // generate a unique question ID
+          question: q,
+          answer: a,
+          chunkName,
+          chunkContent,
+          model,
+          questionLabel,
+          cot,
+          confirmed,
+          score,
+          tags,
+          note,
+          other
+        });
+
+        successCount++;
+      } catch (error) {
+        errors.push(`第 ${i + 1} 条记录: ${error.message}`);
+        failedCount++;
+      }
+    }
+
+    return NextResponse.json({
+      success: successCount,
+      total: datasets.length,
+      failed: failedCount,
+      skipped: skippedCount,
+      errors,
+      sourceInfo
+    });
+  } catch (error) {
+    console.error('Import datasets error:', error);
+    return NextResponse.json({ error: error.message }, { status: 500 });
+  }
+}