import pprint

from langchain.document_loaders import DirectoryLoader

# Load HTML files already saved in a local directory.
path = "../../RAG/rtdocs_new/"
global_pattern = '*.html'
loader = DirectoryLoader(path=path, glob=global_pattern)
docs = loader.load()

# Print num documents and a preview.
print(f"loaded {len(docs)} documents")
if docs:
    # Preview only the first document: its raw text, then its metadata dict.
    print(docs[0].page_content)
    pprint.pprint(docs[0].metadata)
else:
    # Indexing docs[0] on an empty result would raise IndexError; report the
    # likely cause (wrong path or glob) instead.
    print(f"no files matching {global_pattern!r} found under {path!r}")
import torch
from sentence_transformers import SentenceTransformer

# Initialize torch settings for device-agnostic code.
N_GPU = torch.cuda.device_count()
# Use the default CUDA device when one is available. The device count is NOT
# a valid device index (indices run from 0 to N_GPU - 1), and the previous
# literal string 'cuda:N_GPU' was not a valid device string at all.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Download the model from huggingface model hub.
model_name = "BAAI/bge-large-en-v1.5"
encoder = SentenceTransformer(model_name, device=DEVICE)

# Get the model parameters and save for later.
EMBEDDING_DIM = encoder.get_sentence_embedding_dimension()
MAX_SEQ_LENGTH_IN_TOKENS = encoder.get_max_seq_length()

# Inspect model parameters.
print(f"model_name: {model_name}")
print(f"EMBEDDING_DIM: {EMBEDDING_DIM}")
print(f"MAX_SEQ_LENGTH: {MAX_SEQ_LENGTH_IN_TOKENS}")
import numpy as np
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter

CHUNK_SIZE = 512
# 10% overlap between neighbouring chunks; kept as a plain int rather than a
# NumPy float because it is a size/count passed to the splitter.
chunk_overlap = int(round(CHUNK_SIZE * 0.10))
print(f"chunk_size: {CHUNK_SIZE}, chunk_overlap: {chunk_overlap}")

# Define the splitter.
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=chunk_overlap,
)

# Chunk the docs.
chunks = child_splitter.split_documents(docs)
print(f"{len(docs)} docs split into {len(chunks)} child documents.")

# Encoder input is doc.page_content as strings.
# One string per chunk, in order: the strings must stay aligned with `chunks`
# because the two sequences are zipped together below.
list_of_strings = [doc.page_content for doc in chunks]

# Embedding inference using HuggingFace encoder.
embeddings = torch.tensor(encoder.encode(list_of_strings))

# Normalize the embeddings.
# Each row (one embedding per chunk) is scaled to unit L2 length. Dividing by
# np.linalg.norm(embeddings) would instead scale every vector by the norm of
# the whole matrix, leaving none of them unit length.
embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1).numpy()

# Milvus expects a list of `numpy.ndarray` of `numpy.float32` numbers.
converted_values = list(embeddings.astype(np.float32))

# Create dict_list for Milvus insertion.
# Each entry carries the embedding vector, the original text chunk, and the
# chunk's source taken from its metadata ("" when absent).
dict_list = [
    {
        'chunk': chunk.page_content,
        'source': chunk.metadata.get('source', ""),
        'vector': vector,
    }
    for chunk, vector in zip(chunks, converted_values)
]
import time

from pymilvus import MilvusClient

# Connect a client to the Milvus Lite server.
mc = MilvusClient("milvus_demo.db")

# Create a collection with flexible schema and AUTOINDEX.
COLLECTION_NAME = "MilvusDocs"
mc.create_collection(
    COLLECTION_NAME,
    EMBEDDING_DIM,
    consistency_level="Eventually",
    auto_id=True,
    overwrite=True,
)

# Insert data into the Milvus collection.
print("Start inserting entities")
# perf_counter() is monotonic, so the measured interval cannot be skewed by
# wall-clock adjustments.
start_time = time.perf_counter()
mc.insert(
    COLLECTION_NAME,
    data=dict_list,
    progress_bar=True,
)
end_time = time.perf_counter()

print(f"Milvus insert time for {len(dict_list)} vectors: ", end="")
print(f"{round(end_time - start_time, 2)} seconds")
import numpy as np
import torch
import torch.nn.functional as F

SAMPLE_QUESTION = "What do the parameters for HNSW mean?"

# Embed the question using the same encoder.
# The question is wrapped in a one-element list so that encode() returns a
# 2-D (1, dim) batch; a bare string yields a 1-D vector, on which the
# `dim=1` normalization below would fail.
query_embeddings = torch.tensor(encoder.encode([SAMPLE_QUESTION]))

# Normalize embeddings to unit length.
query_embeddings = F.normalize(query_embeddings, p=2, dim=1)

# Convert the embeddings to list of list of np.float32.
query_embeddings = list(query_embeddings.numpy().astype(np.float32))

# Define metadata fields you can filter on.
# Every field stored with each entity except the raw vector itself.
OUTPUT_FIELDS = [field for field in dict_list[0] if field != 'vector']

# Define how many top-k results you want to retrieve.
TOP_K = 2

# Run semantic vector search using your query and the vector database.
results = mc.search(
    COLLECTION_NAME,
    data=query_embeddings,
    output_fields=OUTPUT_FIELDS,
    limit=TOP_K,
    consistency_level="Eventually",
)
# (Recommended) Create a new conda environment.
conda create -n myenv python=3.11 -y
conda activate myenv

# Install vLLM with CUDA 12.1.
pip install -U vllm transformers torch
import shutil
import subprocess

import torch
import vllm
from vllm import LLM, SamplingParams

# Clear the GPU memory cache.
torch.cuda.empty_cache()

# Check the GPU.
# `!nvidia-smi` is IPython-only syntax; running the tool through subprocess
# keeps this cell valid Python both inside and outside a notebook. The output
# is captured and printed so that it shows up in the cell output.
if shutil.which("nvidia-smi"):
    smi = subprocess.run(
        ["nvidia-smi"], capture_output=True, text=True, check=False
    )
    print(smi.stdout or smi.stderr)
else:
    print("nvidia-smi not found on PATH; skipping GPU check.")
import os

from huggingface_hub import login

# Login to HuggingFace using your new token.
# In Google Colab the token is read from the notebook's stored secrets; when
# the Colab package is not importable (e.g. a local conda environment), fall
# back to the HF_TOKEN environment variable.
try:
    from google.colab import userdata
except ImportError:
    hf_token = os.environ.get('HF_TOKEN')
else:
    hf_token = userdata.get('HF_TOKEN')

login(token=hf_token, add_to_git_credential=True)
# Step 1: pick the Hugging Face model id to serve.
MODELTORUN = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Step 2: release cached GPU memory before loading the model weights.
torch.cuda.empty_cache()

# Step 3: build the vLLM engine.
# NOTE(review): the per-argument notes below paraphrase the vLLM engine
# arguments; confirm against the installed vLLM version.
llm = LLM(
    model=MODELTORUN,
    enforce_eager=True,            # run in eager mode
    dtype=torch.bfloat16,          # weight/activation precision
    gpu_memory_utilization=0.5,    # fraction of GPU memory vLLM may use
    max_model_len=1000,            # cap on context length, in tokens
    seed=415,                      # fixed seed for reproducible sampling
    max_num_batched_tokens=3000,   # cap on tokens batched together
)
# Collect the retrieved text chunks and their sources from the search hits.
# NOTE(review): `contexts` and `sources` are not defined in any visible cell,
# so they are built here from `results`. This assumes `results[0]` holds the
# hits for the single query, best match first, each shaped like
# {'entity': {'chunk': ..., 'source': ...}} - confirm against the pymilvus
# version in use.
hits = results[0]
contexts = [hit['entity']['chunk'] for hit in hits]
sources = [hit['entity']['source'] for hit in hits]

# Separate all the context together by space.
# Lance Martin, LangChain, says put the best contexts at the end, hence the
# reversed order.
contexts_combined = ' '.join(reversed(contexts))

# Separate all the unique sources together by comma.
# dict.fromkeys() drops duplicate sources while preserving their order.
source_combined = ', '.join(reversed(list(dict.fromkeys(sources))))

SYSTEM_PROMPT = f"""First, check if the provided Context is relevant to
the user's question. Second, only if the provided Context is strongly relevant, answer the question using the Context. Otherwise, if the Context is not strongly relevant, answer the question without using the Context.
Be clear, concise, relevant. Answer clearly, in fewer than 2 sentences.
Grounding sources: {source_combined}
Context: {contexts_combined}
User's question: {SAMPLE_QUESTION}
"""

# llm.generate() takes a list of prompts; here there is exactly one.
prompts = [SYSTEM_PROMPT]
import pprint

# Sampling parameters
sampling_params = SamplingParams(temperature=0.2, top_p=0.95)

# Invoke the vLLM model.
outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
for output in outputs:
    # Text of the first completion returned for this prompt.
    generated_text = output.outputs[0].text
    # !r calls repr(), which prints a string inside quotes.
    print()
    print(f"Question: {SAMPLE_QUESTION!r}")
    pprint.pprint(f"Generated text: {generated_text!r}")
| 欢迎光临 链载Ai (https://www.lianzai.com/) | Powered by Discuz! X3.5 |