Self Query, Long-Context Reorder #3660

Open · wants to merge 2 commits into master
146 changes: 146 additions & 0 deletions Long_Context_Reorder.ipynb
@@ -0,0 +1,146 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"#参考:https://zhuanlan.zhihu.com/p/682641846"
],
"metadata": {
"id": "8wS-HNEc6Ll-"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# ----------------- 导入必要的package ----------------- #\n",
"import torch\n",
"from langchain.document_loaders import PyPDFLoader\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain_community.vectorstores import Chroma\n",
"from langchain import PromptTemplate\n",
"from langchain_community.document_transformers import (\n",
" LongContextReorder,\n",
")\n",
"from langchain import HuggingFacePipeline\n",
"from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM\n",
"from langchain.embeddings import HuggingFaceEmbeddings\n",
"from langchain.chains import LLMChain, StuffDocumentsChain\n",
"\n",
"# ----------------- 配置项 ---------------------------- #\n",
"model_path = \"../../models/Baichuan2-13B-Chat\"\n",
"embed_path = \"../../models/bge-large-zh-v1.5\"\n",
"# ----------------- 加载embedding模型 ----------------- #\n",
"embeddings = HuggingFaceEmbeddings(\n",
" model_name=embed_path,\n",
" model_kwargs={\"device\": \"cuda\"},\n",
" encode_kwargs={\"normalize_embeddings\": True},\n",
")\n",
"# ----------------- 加载LLM -------------------------- #\n",
"tokenizer = AutoTokenizer.from_pretrained(model_path,\n",
" device_map=\"auto\",\n",
" trust_remote_code=True,\n",
" torch_dtype=torch.float16)\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" model_path,\n",
" torch_dtype=torch.float16,\n",
" trust_remote_code=True,\n",
" device_map=\"auto\",\n",
")\n",
"\n",
"pipeline = pipeline(\n",
" \"text-generation\",\n",
" model=model,\n",
" tokenizer=tokenizer,\n",
" return_full_text=True,\n",
")\n",
"\n",
"llm = HuggingFacePipeline(pipeline=pipeline)\n",
"# ----------------- 加载文件 --------------------------- #\n",
"loader = PyPDFLoader(\"../data/中华人民共和国证券法(2019修订).pdf\")\n",
"documents = loader.load_and_split()\n",
"text_splitter = RecursiveCharacterTextSplitter(separators=[\"。\"], chunk_size=512, chunk_overlap=32)\n",
"texts_chunks = text_splitter.split_documents(documents)\n",
"文本存入向量库后创建retriever,设置返回10个文本块。\n",
"\n",
"# ----------------- 存入向量库,创建retriever ------------ #\n",
"vectorstore = Chroma.from_documents(texts_chunks, embeddings, persist_directory=\"db\")\n",
"retriever = vectorstore.as_retriever(search_kwargs={\"k\": 10})\n",
"利用get_relevant_documents()获取相关文档,然后用LongContextReorder()进行重排序。\n",
"\n",
"# ----------------- 文档重排序 -------------------------- #\n",
"query = \"公司首次公开发行新股,应当符合哪些条件?\"\n",
"docs = retriever.get_relevant_documents(query)\n",
"\n",
"# 相关性小的文档放在中间,相关性大的文档放在首尾两端\n",
"reordering = LongContextReorder()\n",
"reordered_docs = reordering.transform_documents(docs)"
],
"metadata": {
"id": "oTYhN0SX6RqG"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# ----------------- 构造提示模板 -------------------------- #\n",
"document_prompt = PromptTemplate(\n",
" input_variables=[\"page_content\"], template=\"{page_content}\"\n",
")\n",
"document_variable_name = \"context\"\n",
"\n",
"template = \"\"\"你是一名智能助手,可以根据上下文回答用户的问题。\n",
"\n",
"已知内容:\n",
"{context}\n",
"\n",
"问题:\n",
"{question}\n",
"\"\"\"\n",
"prompt = PromptTemplate(template=template, input_variables=[\"context\", \"question\"])"
],
"metadata": {
"id": "uU-U2T5F6ga4"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# ----------------- 初始化chain并测试 -------------------------- #\n",
"llm_chain = LLMChain(llm=llm, prompt=prompt)\n",
"chain = StuffDocumentsChain(\n",
" llm_chain=llm_chain,\n",
" document_prompt=document_prompt,\n",
" document_variable_name=document_variable_name,\n",
")\n",
"result = chain.run(input_documents=reordered_docs, question=query)\n",
"print(result)"
],
"metadata": {
"id": "hesh8ePh6kKA"
},
"execution_count": null,
"outputs": []
}
]
}
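
For reviewers skimming the notebook, here is a minimal, self-contained sketch of what the LongContextReorder step does on its own. It is not part of this PR; the toy documents and their contents are made up, and the input list is assumed to already be sorted by relevance, most relevant first (the order a retriever returns results in).

# Minimal sketch of the long-context reorder step in isolation.
from langchain_community.document_transformers import LongContextReorder
from langchain_core.documents import Document

# Hypothetical documents, most relevant first.
docs = [Document(page_content=f"doc {i}") for i in range(1, 7)]

reordered = LongContextReorder().transform_documents(docs)

print([d.page_content for d in docs])       # ['doc 1', 'doc 2', ..., 'doc 6']
print([d.page_content for d in reordered])  # the most relevant documents end up at
                                            # both ends, the least relevant in the middle

StuffDocumentsChain then concatenates the documents in this order, so the strongest evidence sits at the start and end of the stuffed context, where long-context models tend to use it most reliably.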