# Chunk the sample text
text = "This is the text I would like to chunk up. It is the example text for this exercise."

# Set the chunk size
chunk_size = 35

# Initialize a list to hold the chunks
chunks = []

# Iterate over the text to create the chunks
for i in range(0, len(text), chunk_size):
    chunk = text[i:i + chunk_size]
    chunks.append(chunk)

# Display the chunks
print(chunks)
# Output: ['This is the text I would like to ch', 'unk up. It is the example text for ', 'this exercise.']
Use LangChain's CharacterTextSplitter to achieve the same result:
from langchain.text_splitter import CharacterTextSplitter
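Only the import is shown above; the rest might look like the following minimal sketch, reusing the same sample text. The separator, chunk_size, and chunk_overlap values here are illustrative assumptions. Note that, unlike the manual loop, CharacterTextSplitter cuts at the separator, so chunks land on word boundaries rather than mid-word.

# A minimal sketch completing the example above (parameter values are
# illustrative assumptions, not from the original).
text = "This is the text I would like to chunk up. It is the example text for this exercise."

text_splitter = CharacterTextSplitter(
    separator=" ",       # split on spaces instead of the default "\n\n"
    chunk_size=35,       # same target size as the manual example
    chunk_overlap=0,
    length_function=len,
)

documents = text_splitter.create_documents([text])
for doc in documents:
    print(doc.page_content)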
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Sample text to chunk
text = """
The Olympic Games, originally held in ancient Greece, were revived in 1896 and have since become the world’s foremost sports competition, bringing together athletes from around the globe.
"""

# Initialize the recursive character text splitter with specified chunk size
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=30,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

# Create documents using the text splitter
documents = text_splitter.create_documents([text])

# Display the created documents
for doc in documents:
    print(doc.page_content)

# Output:
# "The Olympic Games, originally"
# "held in ancient Greece, were"
# "revived in 1896 and have"
# "have since become the world’s"
# "world’s foremost sports"
# "competition, bringing together"
# "together athletes from around"
# "around the globe."
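What makes this splitter "recursive" is its separator hierarchy: it first tries to split on paragraph breaks, then line breaks, then spaces, and finally individual characters, until each piece fits under chunk_size. A brief sketch with that hierarchy made explicit (the separators list shown is LangChain's documented default):

# Same splitter with the default separator hierarchy spelled out:
# paragraphs first, then lines, then words, then single characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=30,
    chunk_overlap=20,
)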
from langchain.text_splitter import MarkdownTextSplitter

# Sample Markdown text
markdown_text = """
# Fun in California

## Driving

Try driving on the 1 down to San Diego

### Food

Make sure to eat a burrito while you're there

## Hiking

Go to Yosemite
"""

# Initialize the Markdown text splitter
splitter = MarkdownTextSplitter(chunk_size=40, chunk_overlap=0)

# Create documents using the text splitter
documents = splitter.create_documents([markdown_text])

# Display the created documents
for doc in documents:
    print(doc.page_content)

# Output:
# # Fun in California\n\n## Driving
# Try driving on the 1 down to San Diego
# ### Food
# Make sure to eat a burrito while you're
# there
# ## Hiking\n\nGo to Yosemite
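As a design note, if you want each chunk to carry its section headers as metadata rather than splitting purely by size, LangChain also provides MarkdownHeaderTextSplitter. A brief sketch, not part of the original example, reusing the markdown_text defined above:

from langchain_text_splitters import MarkdownHeaderTextSplitter

# Each tuple maps a Markdown heading prefix to a metadata key
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

header_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
docs = header_splitter.split_text(markdown_text)

for doc in docs:
    print(doc.metadata, doc.page_content)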
Chunking Python Code
from langchain.text_splitter import PythonCodeTextSplitter

# Sample Python code
python_text = """
class Person:
    def __init__(self, name, age):
        self.name = name
        self.age = age

p1 = Person("John", 36)

for i in range(10):
    print(i)
"""

# Initialize the Python code text splitter
python_splitter = PythonCodeTextSplitter(chunk_size=100, chunk_overlap=0)

# Create documents using the text splitter
documents = python_splitter.create_documents([python_text])

# Display the created documents
for doc in documents:
    print(doc.page_content)

# Output:
# class Person:\n    def __init__(self, name, age):\n        self.name = name\n        self.age = age
# p1 = Person("John", 36)\n\nfor i in range(10):\n    print(i)
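A note on the API: newer LangChain versions expose the same behavior through the language-aware factory method RecursiveCharacterTextSplitter.from_language. A brief sketch, reusing the python_text defined above (parameter values mirror the example):

from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# Equivalent splitter built from the factory method; it uses
# Python-specific separators (class/def boundaries, then newlines).
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=100,
    chunk_overlap=0,
)

documents = python_splitter.create_documents([python_text])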
from sklearn.metrics.pairwise import cosine_similarity
from langchain.embeddings import OpenAIEmbeddings
import numpy as np
import re

# Sample text
text = """
One of the most important things I didn't understand about the world when I was a child is the degree to which the returns for performance are superlinear.

Teachers and coaches implicitly told us the returns were linear. "You get out," I heard a thousand times, "what you put in." They meant well, but this is rarely true. If your product is only half as good as your competitor's, you don't get half as many customers. You get no customers, and you go out of business.

It's obviously true that the returns for performance are superlinear in business. Some think this is a flaw of capitalism, and that if we changed the rules it would stop being true. But superlinear returns for performance are a feature of the world, not an artifact of rules we've invented. We see the same pattern in fame, power, military victories, knowledge, and even benefit to humanity. In all of these, the rich get richer.
"""

# Split the text into sentences
sentences = re.split(r'(?<=[.?!])\s+', text)
sentences = [{'sentence': x, 'index': i} for i, x in enumerate(sentences)]

# Combine each sentence with its neighbors for context
def combine_sentences(sentences, buffer_size=1):
    for i in range(len(sentences)):
        combined_sentence = ''
        for j in range(i - buffer_size, i):
            if j >= 0:
                combined_sentence += sentences[j]['sentence'] + ' '
        combined_sentence += sentences[i]['sentence']
        for j in range(i + 1, i + 1 + buffer_size):
            if j < len(sentences):
                combined_sentence += ' ' + sentences[j]['sentence']
        sentences[i]['combined_sentence'] = combined_sentence
    return sentences

sentences = combine_sentences(sentences)

# Generate embeddings
oai_embeds = OpenAIEmbeddings()
embeddings = oai_embeds.embed_documents([x['combined_sentence'] for x in sentences])

# Add embeddings to sentences
for i, sentence in enumerate(sentences):
    sentence['combined_sentence_embedding'] = embeddings[i]

# Calculate cosine distances between consecutive sentences
def calculate_cosine_distances(sentences):
    distances = []
    for i in range(len(sentences) - 1):
        embedding_current = sentences[i]['combined_sentence_embedding']
        embedding_next = sentences[i + 1]['combined_sentence_embedding']
        similarity = cosine_similarity([embedding_current], [embedding_next])[0][0]
        distance = 1 - similarity
        distances.append(distance)
        sentences[i]['distance_to_next'] = distance
    return distances, sentences

distances, sentences = calculate_cosine_distances(sentences)

# Determine breakpoints: split where the distance exceeds the 95th percentile
breakpoint_distance_threshold = np.percentile(distances, 95)
indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]

# Combine sentences into chunks
chunks = []
start_index = 0
for index in indices_above_thresh:
    end_index = index
    group = sentences[start_index:end_index + 1]
    combined_text = ' '.join([d['sentence'] for d in group])
    chunks.append(combined_text)
    start_index = index + 1

if start_index < len(sentences):
    combined_text = ' '.join([d['sentence'] for d in sentences[start_index:]])
    chunks.append(combined_text)

# Display the created chunks
for i, chunk in enumerate(chunks):
    print(f"Chunk #{i+1}:\n{chunk}\n")
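For reference, LangChain packages this same percentile-breakpoint approach as SemanticChunker in the langchain_experimental package. A minimal sketch, assuming that package is installed and reusing the text defined above:

from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import OpenAIEmbeddings

# SemanticChunker embeds sentence windows and splits where the cosine
# distance between neighbors crosses a percentile threshold, as above.
semantic_splitter = SemanticChunker(
    OpenAIEmbeddings(),
    breakpoint_threshold_type="percentile",
)

docs = semantic_splitter.create_documents([text])
for doc in docs:
    print(doc.page_content)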
# Step 3: LLM Decision Node
decision_node = LLMDecisionNode(
    input=splitter_node.output,
    prompt_template="Does the sentence '{next_sentence}' belong to the same chunk as '{current_chunk}'?",
    name="LLM Decision"
)
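LLMDecisionNode and splitter_node above belong to a pipeline whose earlier steps are not shown here. As a self-contained illustration of the same decision step, the prompt could be posed directly to a chat model; in this sketch the client setup, model name, and the belongs_to_chunk helper are all assumptions, not part of the original pipeline:

from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def belongs_to_chunk(current_chunk: str, next_sentence: str) -> bool:
    """Ask the model whether next_sentence continues the current chunk."""
    prompt = (
        f"Does the sentence '{next_sentence}' belong to the same chunk "
        f"as '{current_chunk}'? Answer yes or no."
    )
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # illustrative model choice
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content.strip().lower().startswith("yes")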