01
优化验证大致思路
async def main():
    """Build a local-search engine, request a summary, and save a generated Q&A dataset.

    Steps, in order:
      1. ``process_and_embed()`` supplies the entities, the description
         embedding store and the text embedder.
      2. A ``LocalSearchMixedContext`` and a ``LocalSearch`` engine are built
         from them, using ``DeepSeekClientWrapper(deepseek_client)`` as the LLM.
      3. The engine is asked (via ``run_rag_query``) for a summary of the paper.
      4. If a summary came back, ``generate_and_answer_questions`` produces the
         dataset, which is dumped to ``prompt_optimization_dataset.yaml``.

    ``process_and_embed``, ``run_rag_query``, ``generate_and_answer_questions``
    and ``deepseek_client`` are not defined in this snippet; they are expected
    to exist elsewhere in the script.

    NOTE(review): the indentation was lost in the published snippet and has been
    reconstructed. The save step is placed inside the ``if`` because ``dataset``
    only exists on that branch.
    """
    entities, description_embedding_store, jina_embedder = process_and_embed()

    context_builder = LocalSearchMixedContext(
        entities=entities,
        entity_text_embeddings=description_embedding_store,
        embedding_vectorstore_key=EntityVectorStoreKey.ID,
        text_embedder=jina_embedder,
    )

    # Parameters handed to the context builder on every search.
    local_context_params = {
        "top_k_mapped_entities": 5,
        "include_entity_rank": True,
        "embedding_vectorstore_key": EntityVectorStoreKey.ID,
        "max_tokens": 4000,
    }

    search_engine = LocalSearch(
        llm=DeepSeekClientWrapper(deepseek_client),
        context_builder=context_builder,
        llm_params={"max_tokens": 1000, "temperature": 0.0},
        context_builder_params=local_context_params,
        response_type="multiple paragraphs",
    )

    initial_summary = await run_rag_query(search_engine, "总结这篇论文的主要观点,包括研究方法、实验流程和主要结果")
    if initial_summary:
        dataset = await generate_and_answer_questions(deepseek_client, search_engine, initial_summary)
        # allow_unicode keeps the Chinese text readable in the YAML file instead of escaping it.
        with open('prompt_optimization_dataset.yaml', 'w', encoding='utf-8') as yaml_file:
            yaml.dump(dataset, yaml_file, allow_unicode=True, sort_keys=False)
        print("数据集已保存为 prompt_optimization_dataset.yaml")
if __name__ == "__main__":
    # nest_asyncio is imported and applied before asyncio.run();
    # presumably so the script also works where an event loop is already
    # running (e.g. a notebook) - confirm against the runtime environment.
    import nest_asyncio
    nest_asyncio.apply()
    asyncio.run(main())


# --- Second snippet: a minimal RAG query over the generated dataset ---
def main():
    """Answer one fixed question with a simple retriever over the YAML dataset.

    Loads ``prompt_optimization_dataset.yaml``, turns every record into a
    "Question/Answer" document, retrieves the top 3 documents for the question
    and prints the generated answer.

    ``load_dataset``, ``SimpleRetriever`` and ``generator`` are not defined in
    this snippet; they are expected to exist elsewhere in the script.
    """
    # Load the dataset first: the document list below is built from it.
    # (The published snippet read `dataset` before assigning it, which raises
    # UnboundLocalError.)
    dataset = load_dataset('prompt_optimization_dataset.yaml')
    # Build the document list
    documents = [
        {'content': f"Question: {item['question']}\nAnswer: {item['answer']}"}
        for item in dataset
    ]
    # Create the retriever
    retriever = SimpleRetriever(documents)
    # Run the query
    question = "多文档问答任务中,模型是如何具体评估其应用能力的?"
    retrieved_docs = retriever.retrieve(question, top_k=3)
    answer = generator.generate(question, retrieved_docs)
    print("问题:", question)
    print("答案:", answer)


if __name__ == "__main__":
    main()
用这个简单的RAG去问个问题
xiumao代码运行截图2
02
开始优化
对数据集进行优化的意义在于:你可以整理并规范数据集中的内容,在需要时将其用于生产系统的部署,或者作为问答系统的知识库——总之,优化就对了。只需用最简单的Python基础语句,让DeepSeek作为模型,批量总结、整理数据集中的字段,使其符合你希望的字数和格式。别说控制不了模型的生成字数:用好基础语句(例如写一个惩罚函数),连标点符号都能控制,就看你想要什么。所以,大道至简,越是基础的知识越有力量,也越不容易出错。最后,整理完的内容保存到refined_dataset.yaml中。
import yaml
from openai import OpenAI
# Configure the DeepSeek language model
class DeepSeekLM:
    """Callable wrapper that sends a prompt through an OpenAI-compatible chat client.

    Every call sends a fixed system message (an instruction, in Chinese, to
    summarize the given answer in 120-180 characters) followed by the caller's
    prompt as the user message, and returns the text of the first choice.
    """

    def __init__(self, api_key, base_url, model):
        """Create the client and store the model name and default request options.

        Args:
            api_key: Passed to ``OpenAI(api_key=...)``.
            base_url: Passed to ``OpenAI(base_url=...)``.
            model: Model name sent with every request.
        """
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.model = model
        # Default request options; keyword arguments given per call override them.
        self.kwargs = {
            "temperature": 0.7,
            "max_tokens": 200,
            "top_p": 1,
            "frequency_penalty": 0,
            "presence_penalty": 0,
        }

    def __call__(self, prompt, **kwargs):
        """Send ``prompt`` to the model and return the reply text.

        Args:
            prompt: Content of the user message.
            **kwargs: Request options merged over ``self.kwargs`` for this call
                only; ``self.kwargs`` itself is not modified.

        Returns:
            ``response.choices[0].message.content`` of the completion.
        """
        merged_kwargs = {**self.kwargs, **kwargs}
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "你是一个答案总结助手。请将给定的答案简明扼要地总结为120-180个字符。"},
                {"role": "user", "content": prompt},
            ],
            **merged_kwargs
        )
        return response.choices[0].message.content
# Load the dataset produced by the first step
print("正在加载数据集...")
with open('prompt_optimization_dataset.yaml', 'r', encoding='utf-8') as file:
    # safe_load builds only plain Python objects (no arbitrary object construction).
    dataset = yaml.safe_load(file)
print(f"成功加载数据集,共 {len(dataset)}条记录")
xiumao代码运行截图3
03
验证优化结果
随后,优化后的数据集已保存到refined_dataset.yaml。接着用这个yaml文件替换"xiumao代码运行截图2"所示的简单RAG中优化前的数据集prompt_optimization_dataset.yaml,并用同样的问题提问,结果如下:
| 欢迎光临 链载Ai (https://www.lianzai.com/) | Powered by Discuz! X3.5 |