⚡️ perf: improve knowledge base RAG prompts (#4544)

* ⚡️ perf: improve knowledge base qa prompts * ⚡️ perf: improve knowledge base qa performance * ⚡️ perf: improve knowledge base qa performance * ✅ test: add tests * ✅ test: add tests for rag actions * ✅ test: fix tests
lobehub · Oct 29, 2024 · b4e3f60 · b4e3f60
1 parent 58397db
commit b4e3f60
Show file tree

Hide file tree

Showing 12 changed files with 537 additions and 26 deletions.
diff --git a/src/database/server/models/chunk.ts b/src/database/server/models/chunk.ts
@@ -3,7 +3,7 @@ import { and, desc, isNull } from 'drizzle-orm/expressions';
 import { chunk } from 'lodash-es';
 
 import { serverDB } from '@/database/server';
-import { ChunkMetadata, FileChunk, SemanticSearchChunk } from '@/types/chunk';
+import { ChunkMetadata, FileChunk } from '@/types/chunk';
 
 import {
   NewChunkItem,
@@ -148,6 +148,8 @@ export class ChunkModel {
 
     const data = await serverDB
       .select({
+        fileId: fileChunks.fileId,
+        fileName: files.name,
         id: chunks.id,
         index: chunks.index,
         metadata: chunks.metadata,
@@ -158,16 +160,15 @@ export class ChunkModel {
       .from(chunks)
       .leftJoin(embeddings, eq(chunks.id, embeddings.chunkId))
       .leftJoin(fileChunks, eq(chunks.id, fileChunks.chunkId))
+      .leftJoin(files, eq(fileChunks.fileId, files.id))
       .where(fileIds ? inArray(fileChunks.fileId, fileIds) : undefined)
       .orderBy((t) => desc(t.similarity))
       .limit(30);
 
-    return data.map(
-      (item): SemanticSearchChunk => ({
-        ...item,
-        metadata: item.metadata as ChunkMetadata,
-      }),
-    );
+    return data.map((item) => ({
+      ...item,
+      metadata: item.metadata as ChunkMetadata,
+    }));
   }
 
   async semanticSearchForChat({
@@ -187,7 +188,7 @@ export class ChunkModel {
     const result = await serverDB
       .select({
         fileId: files.id,
-        filename: files.name,
+        fileName: files.name,
         id: chunks.id,
         index: chunks.index,
         metadata: chunks.metadata,
@@ -205,6 +206,8 @@ export class ChunkModel {
 
     return result.map((item) => {
       return {
+        fileId: item.fileId,
+        fileName: item.fileName,
         id: item.id,
         index: item.index,
         similarity: item.similarity,

diff --git a/src/prompts/knowledgeBaseQA/__snapshots__/index.test.ts.snap b/src/prompts/knowledgeBaseQA/__snapshots__/index.test.ts.snap
@@ -0,0 +1,26 @@
+// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
+
+exports[`knowledgeBaseQAPrompts > should generate prompt with all parameters 1`] = `
+"<knowledge_base_qa_info>
+You are also a helpful assistant good answering questions related to Test Knowledge. And you'll be provided with a question and several passages that might be relevant. And currently your task is to provide answer based on the question and passages.
+<knowledge_base_anwser_instruction>
+- Note that passages might not be relevant to the question, please only use the passages that are relevant.
+- if there is no relevant passage, please answer using your knowledge.
+- Answer should use the same original language as the question and follow markdown syntax.
+</knowledge_base_anwser_instruction>
+<knowledge_bases>
+<knowledge_bases_docstring>here are the knowledge base scope we retrieve chunks from:</knowledge_bases_docstring>
+<knowledge id="kb1" name="Test Knowledge" type="file" fileType="txt" >Test description</knowledge>
+</knowledge_bases>
+<retrieved_chunks>
+<retrieved_chunks_docstring>here are retrived chunks you can refer to:</retrieved_chunks_docstring>
+<chunk fileId="file1" fileName="test.txt" similarity="0.8"  pageNumber="1" >This is a test chunk</chunk>
+</retrieved_chunks>
+<user_query>
+<user_query_docstring>to make result better, we may rewrite user's question.If there is a rewrite query, it will be wrapper with \`rewrite_query\` tag.</user_query_docstring>
+
+<raw_query>What is the test about?</raw_query>
+<rewrite_query>Could you explain the content of the test?</rewrite_query>
+<user_query>
+</knowledge_base_qa_info>"
+`;
diff --git a/src/prompts/knowledgeBaseQA/chunk.ts b/src/prompts/knowledgeBaseQA/chunk.ts
@@ -0,0 +1,15 @@
+import { ChatSemanticSearchChunk } from '@/types/chunk';
+
+/**
+ * Renders a single retrieved chunk as a `<chunk>` element.
+ *
+ * `fileId`, `fileName` and `similarity` are always written as attributes; the
+ * chunk text becomes the element body. Values are interpolated as-is (no
+ * escaping is applied).
+ */
+const chunkPrompt = (item: ChatSemanticSearchChunk) => {
+  const { fileId, fileName, pageNumber, similarity, text } = item;
+
+  // The pageNumber attribute is only emitted when the value is truthy.
+  const pageAttr = pageNumber ? ` pageNumber="${pageNumber}" ` : '';
+
+  return `<chunk fileId="${fileId}" fileName="${fileName}" similarity="${similarity}" ${pageAttr}>${text}</chunk>`;
+};
+
+/**
+ * Builds the `<retrieved_chunks>` section of the knowledge base QA prompt.
+ *
+ * @param fileList - retrieved chunks to render, one `<chunk>` element per line
+ * @returns the wrapped section, or an empty string when there are no chunks
+ */
+export const chunkPrompts = (fileList: ChatSemanticSearchChunk[]) => {
+  // Nothing retrieved: emit no section at all rather than an empty wrapper.
+  if (fileList.length === 0) return '';
+
+  const renderedChunks = fileList.map((item) => chunkPrompt(item)).join('\n');
+
+  const sections = [
+    '<retrieved_chunks>',
+    '<retrieved_chunks_docstring>here are retrived chunks you can refer to:</retrieved_chunks_docstring>',
+    renderedChunks,
+    '</retrieved_chunks>',
+  ];
+
+  return sections.join('\n').trim();
+};
diff --git a/src/prompts/knowledgeBaseQA/index.test.ts b/src/prompts/knowledgeBaseQA/index.test.ts
@@ -0,0 +1,146 @@
+import { describe, expect, it } from 'vitest';
+
+import { ChatSemanticSearchChunk } from '@/types/chunk';
+import { KnowledgeItem, KnowledgeType } from '@/types/knowledgeBase';
+
+import { knowledgeBaseQAPrompts } from './index';
+
+// Unit tests for the knowledge base QA prompt builder. They cover the
+// empty-input short circuit, the optional sections (knowledge list, rewrite
+// query) and the exact rendering of the full prompt via a snapshot.
+describe('knowledgeBaseQAPrompts', () => {
+  // Shared fixtures: one retrieved chunk and one knowledge item.
+  const mockChunks: ChatSemanticSearchChunk[] = [
+    {
+      id: '1',
+      fileId: 'file1',
+      fileName: 'test.txt',
+      text: 'This is a test chunk',
+      similarity: 0.8,
+      pageNumber: 1,
+    },
+  ];
+
+  const mockKnowledge: KnowledgeItem[] = [
+    {
+      id: 'kb1',
+      name: 'Test Knowledge',
+      type: KnowledgeType.File,
+      fileType: 'txt',
+      description: 'Test description',
+    },
+  ];
+
+  const userQuery = 'What is the test about?';
+  const rewriteQuery = 'Could you explain the content of the test?';
+
+  // Without any retrieved chunk the builder returns no prompt at all.
+  it('should return empty string if chunks is empty', () => {
+    const result = knowledgeBaseQAPrompts({
+      chunks: [],
+      knowledge: mockKnowledge,
+      userQuery,
+    });
+
+    expect(result).toBe('');
+  });
+
+  it('should return empty string if chunks is undefined', () => {
+    const result = knowledgeBaseQAPrompts({
+      knowledge: mockKnowledge,
+      userQuery,
+    });
+
+    expect(result).toBe('');
+  });
+
+  it('should generate prompt with all parameters', () => {
+    const result = knowledgeBaseQAPrompts({
+      chunks: mockChunks,
+      knowledge: mockKnowledge,
+      userQuery,
+      rewriteQuery,
+    });
+
+    // The snapshot pins the full prompt text, including whitespace.
+    expect(result).toMatchSnapshot();
+  });
+
+  it('should generate prompt without rewriteQuery', () => {
+    const result = knowledgeBaseQAPrompts({
+      chunks: mockChunks,
+      knowledge: mockKnowledge,
+      userQuery,
+    });
+
+    expect(result).toContain('<raw_query>What is the test about?</raw_query>');
+    expect(result).not.toContain('<rewrite_query>');
+  });
+
+  it('should generate prompt without knowledge', () => {
+    const result = knowledgeBaseQAPrompts({
+      chunks: mockChunks,
+      userQuery,
+    });
+
+    expect(result).toContain(
+      'You are also a helpful assistant good answering questions related to',
+    );
+    expect(result).not.toContain('<knowledge_bases>');
+  });
+
+  it('should handle empty knowledge array', () => {
+    const result = knowledgeBaseQAPrompts({
+      chunks: mockChunks,
+      knowledge: [],
+      userQuery,
+    });
+
+    expect(result).toContain(
+      'You are also a helpful assistant good answering questions related to',
+    );
+    expect(result).not.toContain('<knowledge_bases>');
+  });
+
+  // The assertions below require the raw characters to appear verbatim, i.e.
+  // the builder passes input through without escaping it.
+  it('should keep special characters in input unescaped', () => {
+    const specialChunks: ChatSemanticSearchChunk[] = [
+      {
+        id: '1',
+        fileId: 'file1',
+        fileName: 'test&.txt',
+        text: 'This is a test with & < > "quotes"',
+        similarity: 0.8,
+      },
+    ];
+
+    const result = knowledgeBaseQAPrompts({
+      chunks: specialChunks,
+      userQuery: 'Test with & < > "quotes"',
+    });
+
+    expect(result).toContain('test&.txt');
+    expect(result).toContain('This is a test with & < > "quotes"');
+    expect(result).toContain('Test with & < > "quotes"');
+  });
+
+  it('should handle multiple knowledge items', () => {
+    const multipleKnowledge: KnowledgeItem[] = [
+      {
+        id: 'kb1',
+        name: 'Knowledge 1',
+        type: KnowledgeType.File,
+      },
+      {
+        id: 'kb2',
+        name: 'Knowledge 2',
+        type: KnowledgeType.KnowledgeBase,
+      },
+    ];
+
+    const result = knowledgeBaseQAPrompts({
+      chunks: mockChunks,
+      knowledge: multipleKnowledge,
+      userQuery,
+    });
+
+    // Knowledge names are joined with "/" in the introductory sentence.
+    expect(result).toContain('Knowledge 1/Knowledge 2');
+    expect(result).toContain('<knowledge id="kb1"');
+    expect(result).toContain('<knowledge id="kb2"');
+  });
+});
diff --git a/src/prompts/knowledgeBaseQA/index.ts b/src/prompts/knowledgeBaseQA/index.ts
@@ -0,0 +1,33 @@
+import { chunkPrompts } from '@/prompts/knowledgeBaseQA/chunk';
+import { knowledgePrompts } from '@/prompts/knowledgeBaseQA/knowledge';
+import { userQueryPrompt } from '@/prompts/knowledgeBaseQA/userQuery';
+import { ChatSemanticSearchChunk } from '@/types/chunk';
+import { KnowledgeItem } from '@/types/knowledgeBase';
+
+/**
+ * Assembles the complete `<knowledge_base_qa_info>` prompt block.
+ *
+ * The block contains, in order: an introductory sentence naming the knowledge
+ * items, the answer instructions, the knowledge list section, the retrieved
+ * chunks section and the user query section.
+ *
+ * @param chunks - retrieved chunks; when missing or empty no prompt is produced
+ * @param knowledge - knowledge items the chunks were retrieved from
+ * @param userQuery - the user's original question
+ * @param rewriteQuery - optional rewritten form of the question
+ * @returns the prompt text, or an empty string when there are no chunks
+ */
+export const knowledgeBaseQAPrompts = ({
+  chunks,
+  knowledge,
+  userQuery,
+  rewriteQuery,
+}: {
+  chunks?: ChatSemanticSearchChunk[];
+  knowledge?: KnowledgeItem[];
+  rewriteQuery?: string;
+  userQuery: string;
+}) => {
+  // No retrieved chunks means there is nothing to ground an answer on.
+  if (!chunks || chunks.length === 0) return '';
+
+  // Knowledge names are joined with "/" to describe the answer domain.
+  const domains = (knowledge || []).map((v) => v.name).join('/');
+
+  const lines = [
+    '<knowledge_base_qa_info>',
+    `You are also a helpful assistant good answering questions related to ${domains}. And you'll be provided with a question and several passages that might be relevant. And currently your task is to provide answer based on the question and passages.`,
+    '<knowledge_base_anwser_instruction>',
+    '- Note that passages might not be relevant to the question, please only use the passages that are relevant.',
+    '- if there is no relevant passage, please answer using your knowledge.',
+    '- Answer should use the same original language as the question and follow markdown syntax.',
+    '</knowledge_base_anwser_instruction>',
+    knowledgePrompts(knowledge),
+    chunkPrompts(chunks),
+    userQueryPrompt(userQuery, rewriteQuery),
+    '</knowledge_base_qa_info>',
+  ];
+
+  return lines.join('\n');
+};
diff --git a/src/prompts/knowledgeBaseQA/knowledge.ts b/src/prompts/knowledgeBaseQA/knowledge.ts
@@ -0,0 +1,15 @@
+import { KnowledgeItem } from '@/types/knowledgeBase';
+
+/**
+ * Renders a single knowledge item as a `<knowledge>` element.
+ *
+ * `id`, `name` and `type` are always written as attributes; the description
+ * (or an empty string when it is missing) becomes the element body.
+ */
+const knowledgePrompt = (item: KnowledgeItem) => {
+  // The fileType attribute is only emitted when the value is truthy.
+  const fileTypeAttr = item.fileType ? ` fileType="${item.fileType}" ` : '';
+  const body = item.description || '';
+
+  return `<knowledge id="${item.id}" name="${item.name}" type="${item.type}"${fileTypeAttr}>${body}</knowledge>`;
+};
+
+/**
+ * Builds the `<knowledge_bases>` section of the knowledge base QA prompt.
+ *
+ * @param list - knowledge items to render, one `<knowledge>` element per line
+ * @returns the wrapped section, or an empty string when the list is missing
+ *          or empty
+ */
+export const knowledgePrompts = (list?: KnowledgeItem[]) => {
+  // A missing and an empty list are treated the same: no section is emitted.
+  if (!list || list.length === 0) return '';
+
+  const renderedItems = list.map((item) => knowledgePrompt(item)).join('\n');
+
+  const sections = [
+    '<knowledge_bases>',
+    '<knowledge_bases_docstring>here are the knowledge base scope we retrieve chunks from:</knowledge_bases_docstring>',
+    renderedItems,
+    '</knowledge_bases>',
+  ];
+
+  return sections.join('\n').trim();
+};
diff --git a/src/prompts/knowledgeBaseQA/userQuery.ts b/src/prompts/knowledgeBaseQA/userQuery.ts
@@ -0,0 +1,8 @@
+/**
+ * Builds the `<user_query>` section of the knowledge base QA prompt.
+ *
+ * The section always carries the trimmed original question in `<raw_query>`.
+ * When a rewritten query is supplied it is added, trimmed, in
+ * `<rewrite_query>`; otherwise that line is left empty.
+ *
+ * @param userQuery - the user's original question
+ * @param rewriteQuery - optional rewritten form of the question
+ * @returns the section text, wrapped in a matching `<user_query>` pair
+ */
+export const userQueryPrompt = (userQuery: string, rewriteQuery?: string) => {
+  // The final line must be the CLOSING tag. It used to repeat the opening tag,
+  // which left the section unterminated. Snapshots that captured the old
+  // output need to be regenerated.
+  return `<user_query>
+<user_query_docstring>to make result better, we may rewrite user's question.If there is a rewrite query, it will be wrapper with \`rewrite_query\` tag.</user_query_docstring>
+
+<raw_query>${userQuery.trim()}</raw_query>
+${rewriteQuery ? `<rewrite_query>${rewriteQuery.trim()}</rewrite_query>` : ''}
+</user_query>`;
+};
diff --git a/...s/aiChat/actions/__tests__/action.test.ts → .../actions/__tests__/generateAIChat.test.ts b/...s/aiChat/actions/__tests__/action.test.ts → .../actions/__tests__/generateAIChat.test.ts