Skip to content

Commit

Permalink
⚡️ perf: improve knowledge base RAG prompts (#4544)
Browse files Browse the repository at this point in the history
* ⚡️ perf: improve knowledge base qa prompts

* ⚡️ perf: improve knowledge base qa performance

* ⚡️ perf: improve knowledge base qa performance

* ✅ test: add tests

* ✅ test: add tests for rag actions

* ✅ test: fix tests
  • Loading branch information
arvinxx authored Oct 29, 2024
1 parent 58397db commit b4e3f60
Show file tree
Hide file tree
Showing 12 changed files with 537 additions and 26 deletions.
19 changes: 11 additions & 8 deletions src/database/server/models/chunk.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { and, desc, isNull } from 'drizzle-orm/expressions';
import { chunk } from 'lodash-es';

import { serverDB } from '@/database/server';
import { ChunkMetadata, FileChunk, SemanticSearchChunk } from '@/types/chunk';
import { ChunkMetadata, FileChunk } from '@/types/chunk';

import {
NewChunkItem,
Expand Down Expand Up @@ -148,6 +148,8 @@ export class ChunkModel {

const data = await serverDB
.select({
fileId: fileChunks.fileId,
fileName: files.name,
id: chunks.id,
index: chunks.index,
metadata: chunks.metadata,
Expand All @@ -158,16 +160,15 @@ export class ChunkModel {
.from(chunks)
.leftJoin(embeddings, eq(chunks.id, embeddings.chunkId))
.leftJoin(fileChunks, eq(chunks.id, fileChunks.chunkId))
.leftJoin(files, eq(fileChunks.fileId, files.id))
.where(fileIds ? inArray(fileChunks.fileId, fileIds) : undefined)
.orderBy((t) => desc(t.similarity))
.limit(30);

return data.map(
(item): SemanticSearchChunk => ({
...item,
metadata: item.metadata as ChunkMetadata,
}),
);
return data.map((item) => ({
...item,
metadata: item.metadata as ChunkMetadata,
}));
}

async semanticSearchForChat({
Expand All @@ -187,7 +188,7 @@ export class ChunkModel {
const result = await serverDB
.select({
fileId: files.id,
filename: files.name,
fileName: files.name,
id: chunks.id,
index: chunks.index,
metadata: chunks.metadata,
Expand All @@ -205,6 +206,8 @@ export class ChunkModel {

return result.map((item) => {
return {
fileId: item.fileId,
fileName: item.fileName,
id: item.id,
index: item.index,
similarity: item.similarity,
Expand Down
26 changes: 26 additions & 0 deletions src/prompts/knowledgeBaseQA/__snapshots__/index.test.ts.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html

exports[`knowledgeBaseQAPrompts > should generate prompt with all parameters 1`] = `
"<knowledge_base_qa_info>
You are also a helpful assistant good answering questions related to Test Knowledge. And you'll be provided with a question and several passages that might be relevant. And currently your task is to provide answer based on the question and passages.
<knowledge_base_anwser_instruction>
- Note that passages might not be relevant to the question, please only use the passages that are relevant.
- if there is no relevant passage, please answer using your knowledge.
- Answer should use the same original language as the question and follow markdown syntax.
</knowledge_base_anwser_instruction>
<knowledge_bases>
<knowledge_bases_docstring>here are the knowledge base scope we retrieve chunks from:</knowledge_bases_docstring>
<knowledge id="kb1" name="Test Knowledge" type="file" fileType="txt" >Test description</knowledge>
</knowledge_bases>
<retrieved_chunks>
<retrieved_chunks_docstring>here are retrived chunks you can refer to:</retrieved_chunks_docstring>
<chunk fileId="file1" fileName="test.txt" similarity="0.8" pageNumber="1" >This is a test chunk</chunk>
</retrieved_chunks>
<user_query>
<user_query_docstring>to make result better, we may rewrite user's question.If there is a rewrite query, it will be wrapper with \`rewrite_query\` tag.</user_query_docstring>
<raw_query>What is the test about?</raw_query>
<rewrite_query>Could you explain the content of the test?</rewrite_query>
<user_query>
</knowledge_base_qa_info>"
`;
15 changes: 15 additions & 0 deletions src/prompts/knowledgeBaseQA/chunk.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import { ChatSemanticSearchChunk } from '@/types/chunk';

/**
 * Renders a single retrieved chunk as a `<chunk>` element.
 *
 * Field values are interpolated verbatim (no escaping is applied). The
 * `pageNumber` attribute is only emitted when `item.pageNumber` is truthy.
 */
const chunkPrompt = (item: ChatSemanticSearchChunk) => {
  const pageAttribute = item.pageNumber ? ` pageNumber="${item.pageNumber}" ` : '';

  return `<chunk fileId="${item.fileId}" fileName="${item.fileName}" similarity="${item.similarity}" ${pageAttribute}>${item.text}</chunk>`;
};

/**
 * Builds the `<retrieved_chunks>` section of the knowledge base QA prompt.
 *
 * @param fileList - chunks returned by the semantic search
 * @returns the rendered section, or an empty string when there are no chunks
 */
export const chunkPrompts = (fileList: ChatSemanticSearchChunk[]) => {
  // Nothing retrieved: emit no section at all rather than an empty wrapper.
  if (fileList.length === 0) return '';

  const renderedChunks = fileList.map((chunk) => chunkPrompt(chunk)).join('\n');

  return `<retrieved_chunks>
<retrieved_chunks_docstring>here are retrived chunks you can refer to:</retrieved_chunks_docstring>
${renderedChunks}
</retrieved_chunks>`.trim();
};
146 changes: 146 additions & 0 deletions src/prompts/knowledgeBaseQA/index.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import { describe, expect, it } from 'vitest';

import { ChatSemanticSearchChunk } from '@/types/chunk';
import { KnowledgeItem, KnowledgeType } from '@/types/knowledgeBase';

import { knowledgeBaseQAPrompts } from './index';

// Unit tests for knowledgeBaseQAPrompts: the empty-chunks short-circuit, the
// optional knowledge / rewrite-query sections, and verbatim pass-through of input text.
describe('knowledgeBaseQAPrompts', () => {
// Shared fixtures: one retrieved chunk and one file-type knowledge item.
const mockChunks: ChatSemanticSearchChunk[] = [
{
id: '1',
fileId: 'file1',
fileName: 'test.txt',
text: 'This is a test chunk',
similarity: 0.8,
pageNumber: 1,
},
];

const mockKnowledge: KnowledgeItem[] = [
{
id: 'kb1',
name: 'Test Knowledge',
type: KnowledgeType.File,
fileType: 'txt',
description: 'Test description',
},
];

const userQuery = 'What is the test about?';
const rewriteQuery = 'Could you explain the content of the test?';

// An empty chunk list yields an empty prompt even when knowledge is supplied.
it('should return empty string if chunks is empty', () => {
const result = knowledgeBaseQAPrompts({
chunks: [],
knowledge: mockKnowledge,
userQuery,
});

expect(result).toBe('');
});

// Omitting `chunks` entirely behaves the same as passing an empty list.
it('should return empty string if chunks is undefined', () => {
const result = knowledgeBaseQAPrompts({
knowledge: mockKnowledge,
userQuery,
});

expect(result).toBe('');
});

it('should generate prompt with all parameters', () => {
const result = knowledgeBaseQAPrompts({
chunks: mockChunks,
knowledge: mockKnowledge,
userQuery,
rewriteQuery,
});

// The full prompt text is pinned by the stored snapshot; any change to the
// prompt wording or layout requires the snapshot to be updated.
expect(result).toMatchSnapshot();
});

// Without a rewrite query only the raw query tag is expected in the output.
it('should generate prompt without rewriteQuery', () => {
const result = knowledgeBaseQAPrompts({
chunks: mockChunks,
knowledge: mockKnowledge,
userQuery,
});

expect(result).toContain('<raw_query>What is the test about?</raw_query>');
expect(result).not.toContain('<rewrite_query>');
});

// With `knowledge` omitted the intro sentence is still present, but the
// <knowledge_bases> section is not emitted.
it('should generate prompt without knowledge', () => {
const result = knowledgeBaseQAPrompts({
chunks: mockChunks,
userQuery,
});

expect(result).toContain(
'You are also a helpful assistant good answering questions related to',
);
expect(result).not.toContain('<knowledge_bases>');
});

// An empty knowledge array is expected to behave like an omitted one.
it('should handle empty knowledge array', () => {
const result = knowledgeBaseQAPrompts({
chunks: mockChunks,
knowledge: [],
userQuery,
});

expect(result).toContain(
'You are also a helpful assistant good answering questions related to',
);
expect(result).not.toContain('<knowledge_bases>');
});

// NOTE(review): despite the test name, the assertions below expect `&`, `<`,
// `>` and `"` to appear verbatim in the output, i.e. they pin that NO escaping
// is applied to file names, chunk text or the user query.
it('should properly escape special characters in input', () => {
const specialChunks: ChatSemanticSearchChunk[] = [
{
id: '1',
fileId: 'file1',
fileName: 'test&.txt',
text: 'This is a test with & < > "quotes"',
similarity: 0.8,
},
];

const result = knowledgeBaseQAPrompts({
chunks: specialChunks,
userQuery: 'Test with & < > "quotes"',
});

expect(result).toContain('test&.txt');
expect(result).toContain('This is a test with & < > "quotes"');
expect(result).toContain('Test with & < > "quotes"');
});

// Several knowledge items: their names are joined with '/' and each item
// gets its own <knowledge> element.
it('should handle multiple knowledge items', () => {
const multipleKnowledge: KnowledgeItem[] = [
{
id: 'kb1',
name: 'Knowledge 1',
type: KnowledgeType.File,
},
{
id: 'kb2',
name: 'Knowledge 2',
type: KnowledgeType.KnowledgeBase,
},
];

const result = knowledgeBaseQAPrompts({
chunks: mockChunks,
knowledge: multipleKnowledge,
userQuery,
});

expect(result).toContain('Knowledge 1/Knowledge 2');
expect(result).toContain('<knowledge id="kb1"');
expect(result).toContain('<knowledge id="kb2"');
});
});
33 changes: 33 additions & 0 deletions src/prompts/knowledgeBaseQA/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import { chunkPrompts } from '@/prompts/knowledgeBaseQA/chunk';
import { knowledgePrompts } from '@/prompts/knowledgeBaseQA/knowledge';
import { userQueryPrompt } from '@/prompts/knowledgeBaseQA/userQuery';
import { ChatSemanticSearchChunk } from '@/types/chunk';
import { KnowledgeItem } from '@/types/knowledgeBase';

/**
 * Assembles the knowledge base QA prompt wrapped in `<knowledge_base_qa_info>`.
 *
 * The prompt consists of a fixed instruction preamble followed by the
 * knowledge section, the retrieved chunks section and the user query section,
 * each on its own line.
 *
 * @param chunks - retrieved chunks; when missing or empty no prompt is produced
 * @param knowledge - knowledge items whose names are joined with `/` in the preamble
 * @param userQuery - the user's original question
 * @param rewriteQuery - optional rewritten form of the question
 * @returns the prompt text, or an empty string when there are no chunks
 */
export const knowledgeBaseQAPrompts = ({
  chunks,
  knowledge,
  userQuery,
  rewriteQuery,
}: {
  chunks?: ChatSemanticSearchChunk[];
  knowledge?: KnowledgeItem[];
  rewriteQuery?: string;
  userQuery: string;
}) => {
  // Without retrieved passages there is nothing to answer from.
  if (!chunks || chunks.length === 0) return '';

  const domains = (knowledge ?? []).map((entry) => entry.name).join('/');

  // Sections are rendered in the same order in which they appear in the prompt.
  const knowledgeSection = knowledgePrompts(knowledge);
  const chunkSection = chunkPrompts(chunks);
  const querySection = userQueryPrompt(userQuery, rewriteQuery);

  return `<knowledge_base_qa_info>
You are also a helpful assistant good answering questions related to ${domains}. And you'll be provided with a question and several passages that might be relevant. And currently your task is to provide answer based on the question and passages.
<knowledge_base_anwser_instruction>
- Note that passages might not be relevant to the question, please only use the passages that are relevant.
- if there is no relevant passage, please answer using your knowledge.
- Answer should use the same original language as the question and follow markdown syntax.
</knowledge_base_anwser_instruction>
${knowledgeSection}
${chunkSection}
${querySection}
</knowledge_base_qa_info>`;
};
15 changes: 15 additions & 0 deletions src/prompts/knowledgeBaseQA/knowledge.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import { KnowledgeItem } from '@/types/knowledgeBase';

/**
 * Renders one knowledge item as a `<knowledge>` element.
 *
 * The `fileType` attribute is only emitted when `item.fileType` is truthy, and
 * a missing description is rendered as empty element content.
 */
const knowledgePrompt = (item: KnowledgeItem) => {
  const fileTypeAttribute = item.fileType ? ` fileType="${item.fileType}" ` : '';
  const description = item.description || '';

  return `<knowledge id="${item.id}" name="${item.name}" type="${item.type}"${fileTypeAttribute}>${description}</knowledge>`;
};

/**
 * Builds the `<knowledge_bases>` section listing the knowledge items.
 *
 * @param list - knowledge items to list; may be omitted
 * @returns the rendered section, or an empty string when the list is missing or empty
 */
export const knowledgePrompts = (list?: KnowledgeItem[]) => {
  if (!list || list.length === 0) return '';

  const renderedItems = list.map((entry) => knowledgePrompt(entry)).join('\n');

  return `<knowledge_bases>
<knowledge_bases_docstring>here are the knowledge base scope we retrieve chunks from:</knowledge_bases_docstring>
${renderedItems}
</knowledge_bases>`.trim();
};
8 changes: 8 additions & 0 deletions src/prompts/knowledgeBaseQA/userQuery.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
/**
 * Builds the `<user_query>` section of the knowledge base QA prompt.
 *
 * Both queries are trimmed before being embedded. When no rewrite query is
 * given, the line reserved for it is left empty.
 *
 * @param userQuery - the user's original question
 * @param rewriteQuery - optional rewritten form of the question
 * @returns the rendered `<user_query>` element
 */
export const userQueryPrompt = (userQuery: string, rewriteQuery?: string) => {
  // The element must end with a real closing tag; emitting a second opening
  // `<user_query>` here would leave the section unterminated.
  return `<user_query>
<user_query_docstring>to make result better, we may rewrite user's question.If there is a rewrite query, it will be wrapper with \`rewrite_query\` tag.</user_query_docstring>
<raw_query>${userQuery.trim()}</raw_query>
${rewriteQuery ? `<rewrite_query>${rewriteQuery.trim()}</rewrite_query>` : ''}
</user_query>`;
};
Loading

0 comments on commit b4e3f60

Please sign in to comment.