def read_knowledge_base() -> str:
    # Load the whole markdown knowledge base into memory.
    with open("knowledge_base.md", "r", encoding="utf-8") as f:
        return f.read()


def divide_chunks() -> list[str]:
    # Blank lines delimit markdown paragraphs and headings.
    text = read_knowledge_base()
    tmp_chunks = text.split("\n\n")

    # A heading is not a chunk by itself: carry it forward and prepend it to
    # the next body paragraph so the stored chunk keeps its context.
    chunks = []
    title = ""
    for c in tmp_chunks:
        if c.startswith("#"):
            title += f"{c}\n"
        else:
            chunks.append(f"{title}{c}")
            title = ""

    return chunks
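
# For example (hypothetical file content, not the real knowledge_base.md):
#   "# Swordplay\n\nLinghu Chong trains.\n\nHe rests."
# yields ["# Swordplay\nLinghu Chong trains.", "He rests."] — only the
# paragraph immediately after a heading inherits it, and a heading never
# becomes a chunk on its own.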
import os

from google import genai

# Read the key from the environment rather than hardcoding it; a real API key
# should never be committed to source.
genai_client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
EMBED_MODEL = "gemini-embedding-exp-03-07"


def embed(text: str, store: bool) -> list[float]:
    # Gemini distinguishes the two ends of retrieval: chunks are embedded as
    # RETRIEVAL_DOCUMENT, user questions as RETRIEVAL_QUERY.
    result = genai_client.models.embed_content(
        model=EMBED_MODEL,
        contents=text,
        config={"task_type": "RETRIEVAL_DOCUMENT" if store else "RETRIEVAL_QUERY"},
    )
    assert result.embeddings
    assert result.embeddings[0].values
    return result.embeddings[0].values
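
# Minimal sketch of what retrieval relies on (the sample strings below are
# made up for illustration, not taken from the knowledge base): document and
# query embeddings share one vector space, so a related doc/query pair should
# score clearly higher than an unrelated one.
def _similarity_demo() -> None:
    import math

    doc = embed("Linghu Chong mastered the Dugu Nine Swords.", store=True)
    qry = embed("What sword art did Linghu Chong learn?", store=False)
    dot = sum(a * b for a, b in zip(doc, qry))
    norm = math.sqrt(sum(a * a for a in doc)) * math.sqrt(sum(b * b for b in qry))
    print(f"cosine similarity: {dot / norm:.3f}")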
import chromadb

# Persist vectors on disk so the index survives between runs.
chromadb_client = chromadb.PersistentClient("./chroma.db")
chromadb_collection = chromadb_client.get_or_create_collection("linghuchong")


def store_db() -> None:
    chunks = divide_chunks()

    for idx, c in enumerate(chunks):
        print(f"Process: {c}")
        vec = embed(c, store=True)
        # upsert keyed on the chunk index makes re-indexing idempotent.
        chromadb_collection.upsert(ids=str(idx), documents=c, embeddings=vec)
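
# Batched alternative (sketch; same Chroma upsert API, just with parallel
# lists): one call for the whole corpus instead of one per chunk reduces
# per-call overhead as the knowledge base grows.
# chromadb_collection.upsert(
#     ids=[str(i) for i in range(len(chunks))],
#     documents=chunks,
#     embeddings=[embed(c, store=True) for c in chunks],
# )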
def query_db(question: str) -> list[str]:
    # Embed the question in the same space as the stored chunks, then fetch
    # the five nearest ones.
    vec = embed(question, store=False)
    result = chromadb_collection.query(query_embeddings=vec, n_results=5)

    assert result["documents"]
    return result["documents"][0]
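
# Shape note: Chroma nests results per query embedding, so with a single
# query vector result["documents"][0] is that query's top-n_results chunk
# list, e.g. ["# heading\nparagraph", ...].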
LLM_MODEL = "gemini-2.5-flash-preview-05-20"

if __name__ == "__main__":
    # Assumes store_db() has already been run once to populate the collection.
    question = "令狐冲领悟了什么魔法?"  # "What magic did Linghu Chong come to master?"
    context = query_db(question)

    # Stuff the retrieved chunks into the prompt alongside the question.
    system_prompt = "You are clever\n"
    user_prompt = "Please answer user's question according to context\n"
    user_prompt += f"Question: {question}\n"
    user_prompt += "Context:\n"
    for c in context:
        user_prompt += f"{c}\n"
        user_prompt += "-------------\n"
    prompt = system_prompt + user_prompt

    result = genai_client.models.generate_content(model=LLM_MODEL, contents=prompt)
    # Print the model's answer text rather than the raw response object.
    print(result.text)
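
# Typical flow (an assumption about intended usage, not shown explicitly
# above): call store_db() once to build ./chroma.db, then re-run the
# __main__ block for each question; the persistent collection survives
# between runs, so indexing need not be repeated.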