RAG工程落地：处理文档中表格数据 - 链载Ai

在 RAG（Retrieval-Augmented Generation）工程落地过程中，处理文档中的表格数据是一个非常重要但复杂的问题，特别是针对技术文档、报告、论文等结构化强的资料。比如PDF文档里的表格数据，如下：

我们该如何切割该PDF文档呢？又该怎么精准地检索查询出表格中的数据呢？

直接对表格做向量存储索引的检索通常效果欠佳，可以借助大模型生成表格摘要用于嵌入与检索。这有利于提高检索精确度，加强大模型对表格的理解。在检索阶段，再通过递归检索找回原始表格，用于后续的生成。

建议在切分文档之前，将所有非结构化的文档（比如 PDF、Word、PPT、TXT 等）都转成 Markdown 格式的文档，这么做的好处很多，以后有空再聊。

from pathlib import Path

import pymupdf4llm

# Settings
pdf_path = r"D:\Test\muxue\data2\caiwubaogao.pdf"  # source PDF; replace with your own path
output_md = r"D:\Test\muxue\data2\caiwubaogao.md"  # Markdown file to write
image_dir = r"D:\Test\muxue\data2\images"  # directory that receives the images
dpi = 300  # image resolution
image_format = "png"  # image format, e.g. "png" or "jpg"

# Create the image directory (including missing parents) before converting.
Path(image_dir).mkdir(parents=True, exist_ok=True)

# Convert the PDF to Markdown and extract its images.
md_text = pymupdf4llm.to_markdown(
    doc=pdf_path,
    write_images=True,
    image_path=image_dir,
    image_format=image_format,
    dpi=dpi,
)

# Save the Markdown text to a file.
with open(output_md, "w", encoding="utf-8") as f:
    f.write(md_text)

print(f"Markdown 内容已保存到{output_md}")
print(f"图片已保存到目录{image_dir}")

import os
import re

import mammoth


def docx_to_markdown_with_images(docx_path, output_md_path=None, image_dir="images"):
    """Convert a .docx file to Markdown, writing its embedded images to disk.

    Args:
        docx_path: Path of the Word document to convert.
        output_md_path: Optional path of a Markdown file to write. Nothing is
            written when it is falsy. Its parent directory is created if
            missing.
        image_dir: Directory that receives the extracted images. Created if
            missing.

    Returns:
        The Markdown text produced by mammoth.
    """
    os.makedirs(image_dir, exist_ok=True)

    extensions = {
        "image/png": ".png",
        "image/jpeg": ".jpg",
        "image/gif": ".gif",
    }
    # Path separators, Windows-reserved characters and control characters must
    # not reach the file system: alt text comes from the document itself.
    unsafe_chars = re.compile(r'[\\/:*?"<>|\x00-\x1f]')
    max_name_length = 100

    def save_image(image):
        """Write one embedded image to ``image_dir`` and return its <img> attributes."""
        raw_name = image.alt_text.replace(" ", "_") if image.alt_text else "image"
        image_name = unsafe_chars.sub("_", raw_name)[:max_name_length].strip(". ")
        if not image_name:
            # Alt text such as ".." collapses to nothing once sanitised.
            image_name = "image"
        ext = extensions.get(image.content_type, ".bin")

        # Avoid clobbering an existing file: append _1, _2, ... until free.
        image_path = os.path.join(image_dir, f"{image_name}{ext}")
        counter = 1
        while os.path.exists(image_path):
            image_path = os.path.join(image_dir, f"{image_name}_{counter}{ext}")
            counter += 1

        # Copy the image bytes out of the document into the target file.
        with image.open() as img_file, open(image_path, "wb") as out_file:
            out_file.write(img_file.read())

        # Path used for the image in the Markdown output; change this if a
        # relative path or a URL is wanted instead.
        return {"src": image_path.replace("\\", "/")}

    with open(docx_path, "rb") as docx_file:
        result = mammoth.convert_to_markdown(
            docx_file,
            convert_image=mammoth.images.img_element(save_image),
        )
    markdown_text = result.value

    if output_md_path:
        output_dir = os.path.dirname(output_md_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_md_path, "w", encoding="utf-8") as f:
            f.write(markdown_text)

    return markdown_text
# Example usage
# NOTE(review): output_md_path omits the "Test" folder that the other two
# paths use — confirm this is the intended destination.
markdown = docx_to_markdown_with_images(
    r"D:\Test\muxue\data2\caiwubaogao.docx",
    output_md_path=r"D:\muxue\data2\caiwubaogao.md",
    image_dir=r"D:\Test\muxue\data2\images",
)
print(markdown)

LlamaIndex 提供了几个用于切分 Markdown 文件的切割器，其中比较常用的是 MarkdownNodeParser，示例代码如下：

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import MarkdownNodeParser

# Load the Markdown documents.
documents = SimpleDirectoryReader(
    input_dir=r"D:\Test\RAGTest\data\markdown",
    required_exts=[".md"],
).load_data()

# Create the Markdown node parser.
node_parser = MarkdownNodeParser.from_defaults(
    include_metadata=True,  # include metadata
    include_prev_next_rel=True,  # include previous/next node relationships
    header_path_separator="/",
)

# Parse the documents into a list of nodes.
nodes = node_parser.get_nodes_from_documents(documents)

为了处理表格，我们需要使用另一个切割器——MarkdownElementNodeParser。它会将 Markdown 文档中的文本、标题、表格等元素分别解析为不同类型的节点：普通文本为TextNode，表格为IndexNode（且“完美表格”会被转为pandas DataFrame，非标准表格则以原始文本存储）。解析后，节点类型和内容可直接区分，便于后续检索和处理。

MarkdownElementNodeParser 与普通的数据分割器的区别主要在于它对其中的表格内容借助大模型生成了内容摘要与结构描述，并构造成索引 Node（IndexNode），然后在查询时通过索引 Node 找到表格内容 Node，将其一起输入大模型进行生成。

from llama_index.core.llms.mock import MockLLM
from llama_index.core.node_parser.relational.markdown_element import (
    MarkdownElementNodeParser,
)
from llama_index.core.schema import Document, TextNode, IndexNode

# Sample Markdown text containing prose, headings and tables.
md_text = """
# 第一章
这是第一章的内容。

| 年份 | 收益 |
| ---- | ---- |
| 2020 | 12000 |
| 2021 | 15000 |

## 第二节
这是第二节的内容。

| 产品 | 数量 | 价格 |
| ---- | ---- | ---- |
| A  | 10  | 5  |
| B  | 20  | 8  |
"""

# Build the Document object.
doc = Document(text=md_text)

# Initialise the MarkdownElementNodeParser.
parser = MarkdownElementNodeParser(llm=MockLLM())

# Parse the document into nodes.
nodes = parser.get_nodes_from_documents([doc])

# Print the type and content of every node. The index is interpolated with
# {i}; the original "(i)" printed the literal text instead of the number.
for i, node in enumerate(nodes):
    print(f"Node {i}: 类型:{type(node).__name__}")
    print(f"内容:{getattr(node, 'text', getattr(node, 'table', ''))}\n")

"""Splitting and querying table data in Markdown."""
import os

from llama_index.core import VectorStoreIndex, Settings, SimpleDirectoryReader
from llama_index.llms.openai_like import OpenAILike
from llama_index.embeddings.openai_like import OpenAILikeEmbedding
from llama_index.core.node_parser.relational.markdown_element import (
    MarkdownElementNodeParser,
)
from llama_index.core.llms.mock import MockLLM


# ================== Model initialisation ==================
def init_models(*, api_base="https://api.siliconflow.cn/v1", api_key=None):
    """Create the embedding model and the LLM, register them, and verify.

    Both models are installed on ``Settings`` and one test embedding is
    requested, whose length is printed.

    Args:
        api_base: Base URL passed to both the embedding model and the LLM.
        api_key: API key passed to both. When ``None``, the
            ``SILICONFLOW_API_KEY`` environment variable is used, falling back
            to the ``"sk-xxx"`` placeholder.

    Returns:
        A ``(embed_model, llm)`` tuple.
    """
    if api_key is None:
        # Prefer the environment over a key committed to source.
        api_key = os.environ.get("SILICONFLOW_API_KEY", "sk-xxx")

    # Embedding model
    embed_model = OpenAILikeEmbedding(
        model_name="BAAI/bge-m3",
        api_base=api_base,
        api_key=api_key,
        embed_batch_size=10,
    )

    llm = OpenAILike(
        model="DeepSeek-ai/DeepSeek-V3",
        api_base=api_base,
        api_key=api_key,
        context_window=128000,
        is_chat_model=True,
        is_function_calling_model=False,
    )

    Settings.embed_model = embed_model
    Settings.llm = llm

    # Verify the embedding model by embedding a short test string.
    test_embedding = embed_model.get_text_embedding("测试文本")
    print(f"Embedding维度验证：{len(test_embedding)}")

    return embed_model, llm
from llama_index.core.query_engine import CitationQueryEngine

init_models()

# Load the Markdown documents that will be split into chunks.
documents = SimpleDirectoryReader(
    r"D:\Test\muxue\data2",
    required_exts=[".md"],
).load_data()

# 2. The table-aware splitter.
# NOTE(review): MockLLM is passed as the parser's llm, so no real model is
# called while parsing — confirm this is intended, since the article relies
# on LLM-generated table summaries.
node_parser = MarkdownElementNodeParser(llm=MockLLM())
nodes = node_parser.get_nodes_from_documents(documents)

index = VectorStoreIndex(nodes)

query_engine = CitationQueryEngine.from_args(
    index,
    similarity_top_k=3,
    # here we can control how granular citation sources are, the default is 512
    citation_chunk_size=512,
)

res = query_engine.query("股本增减变动幅度多大?请使用中文回答")
print(res.response)  # the LLM's answer
print("------来源---------------")
for node in res.source_nodes:
    print("相关片段：", node.text)
    print("片段分数：", node.score)
    print("片段元数据：", node.metadata)
    print("=" * 40)