
[LLM serving] Fix timeout setting bug #2398

Closed
wants to merge 75 commits into from
Changes from all commits
75 commits
9e7ef07
add codee
Sep 25, 2023
4aa21bd
add copyright
jiangjiajun Sep 25, 2023
6fd06f7
fix some bugs
jiangjiajun Sep 25, 2023
9f569a7
Update prefix_utils.py
jiangjiajun Sep 25, 2023
bdf2748
Update triton_model.py
jiangjiajun Sep 25, 2023
52aaffb
Update triton_model.py
jiangjiajun Sep 25, 2023
0199cac
fix tokenizer
jiangjiajun Sep 25, 2023
fa151a7
Add check for prefix len
jiangjiajun Sep 25, 2023
800c6a9
Create README.md
jiangjiajun Sep 26, 2023
91ea8bb
Create test_client.py
jiangjiajun Sep 26, 2023
5e8221e
Update task.py
jiangjiajun Sep 26, 2023
9897924
add debug log and fix ptuning
jiangjiajun Sep 26, 2023
8d1e691
update version
jiangjiajun Sep 26, 2023
388eb9b
Update triton_model.py
jiangjiajun Oct 7, 2023
68c15b6
Update README.md
jiangjiajun Oct 8, 2023
30a3beb
Support chatglm-6b (#2223)
jiangjiajun Oct 10, 2023
b96a92b
Support bloom (#2232)
jiangjiajun Oct 11, 2023
80bb8ed
Support multicards (#2234)
jiangjiajun Oct 11, 2023
986b233
[LLM] Add prefix for chatglm (#2233)
rainyfly Oct 12, 2023
9fa04c3
Update engine.py
jiangjiajun Oct 12, 2023
e6a7d4e
[LLM] Fix P-Tuning difference (#2240)
jiangjiajun Oct 13, 2023
51d8697
[LLM] Support prefix for bloom (#2237)
rainyfly Oct 16, 2023
73c1507
Support bloom prefix (#2245)
rainyfly Oct 17, 2023
528e976
[LLM] Fix serving (#2246)
jiangjiajun Oct 18, 2023
1cbbaee
fix chatglm
jiangjiajun Oct 18, 2023
2f2c824
Update config.py
jiangjiajun Oct 18, 2023
66a4897
[LLM] Support bloom prefix (#2248)
rainyfly Oct 19, 2023
4d956d3
[LLM] Add simple client
jiangjiajun Oct 19, 2023
a5a261b
add requirements
jiangjiajun Oct 19, 2023
4c21588
[LLM] Support dynamic batching for chatglm (#2251)
jiangjiajun Oct 20, 2023
8ff24d6
[LLM] Support dybatch for bloom (#2255)
jiangjiajun Oct 20, 2023
3a4f8a9
remove +1 for chatglm
jiangjiajun Oct 20, 2023
e5da0f1
Update setup.py
jiangjiajun Oct 20, 2023
6da9555
Add check for prefix and compatible with lite
jiangjiajun Oct 24, 2023
10eefcb
add requires
jiangjiajun Oct 24, 2023
fb0f276
Support gpt
jiangjiajun Oct 24, 2023
7193337
Fix triton model problem
jiangjiajun Oct 25, 2023
70f8469
Update version
jiangjiajun Oct 25, 2023
b116e3e
Add some tools
jiangjiajun Oct 26, 2023
b86524e
test
Nov 6, 2023
2e6bc1a
Update triton_model.py
jiangjiajun Nov 7, 2023
cabebc3
Update setup.py
jiangjiajun Nov 7, 2023
cdc0ff2
Update README.md
jiangjiajun Nov 7, 2023
7c23864
test FastDeploy
Nov 7, 2023
cca470f
Merge branch 'llm' into llm
karagg Nov 7, 2023
fb5d5c5
test
Nov 8, 2023
2d2274c
[LLM] Add ci test scripts (#2272)
karagg Nov 9, 2023
a55837e
delete run.sh
Nov 14, 2023
f9c8581
Merge branch 'PaddlePaddle:llm' into llm
karagg Nov 14, 2023
1f76abf
delete run.sh
Nov 14, 2023
9c6b2de
update run.sh
Nov 14, 2023
ceb49a4
update run.sh ci.py
Nov 14, 2023
9499199
update ci.py
Nov 15, 2023
8bf70a1
update ci.py
Nov 15, 2023
6e15209
[LLM]update ci test script (#2285)
karagg Nov 15, 2023
be12232
debug
Nov 15, 2023
f884c1a
debug
Nov 15, 2023
57e7608
Merge pull request #2286 from karagg/llm
Zeref996 Nov 15, 2023
7b80d70
debug
Nov 15, 2023
bb68a7e
Merge pull request #2288 from karagg/llm
Zeref996 Nov 16, 2023
6cb1474
debug
Nov 16, 2023
71652e3
Merge pull request #2289 from karagg/llm
Zeref996 Nov 16, 2023
261e519
update run.sh
Nov 17, 2023
836d21f
add comment
Nov 20, 2023
87f53ea
do not merge
Nov 20, 2023
66c4563
Rename test_max_batch_size.sh to test_max_batch_size.py
jiangjiajun Nov 23, 2023
79e6a1e
update
Dec 4, 2023
3376284
Merge pull request #2291 from karagg/llm
Zeref996 Dec 5, 2023
fda8c37
Improve robustness for llm (#2321)
rainyfly Dec 14, 2023
cc89731
detail log for llm (#2325)
rainyfly Dec 14, 2023
67ca253
Fix a bug for llm serving (#2326)
rainyfly Dec 14, 2023
7bddc67
Add warning for server hangs (#2333)
rainyfly Dec 27, 2023
c18abc6
Add fastapi support (#2371)
rainyfly Feb 27, 2024
a843a3c
Add fastapi support (#2383)
rainyfly Feb 28, 2024
6b127d5
Fix timeout setting bug
rainyfly Mar 6, 2024
69 changes: 69 additions & 0 deletions llm/README.md
@@ -0,0 +1,69 @@
# Environment Setup

- Step 1. Install the develop version of PaddlePaddle
- Step 2. Install PaddleNLP from source
- Step 3. Enter PaddleNLP/csrc in the source tree and run `python3 setup_cuda.py install --user` to install the custom operators


## Export the Model
```
cd PaddleNLP/llm
python export_model.py \
    --model_name_or_path meta-llama/Llama-2-7b-chat \
    --output_path ./inference \
    --dtype float16
```

## Test the Model Locally

```
wget https://bj.bcebos.com/paddle2onnx/third_libs/inputs_63.jsonl
mkdir res
```
The test script is shown below; prediction results will be saved in the `res` directory.
```
import fastdeploy_llm as fdlm
import copy

config = fdlm.Config("chatglm-6b")
config.max_batch_size = 1
config.mp_num = 1
config.max_dec_len = 1024
config.max_seq_len = 1024
config.decode_strategy = "sampling"
config.stop_threshold = 2
config.disable_dynamic_batching = 1
config.max_queue_num = 512
config.is_ptuning = 0

# Each line of inputs_63.jsonl is a dict literal whose prompt is stored under "src"
inputs = list()
with open("inputs_63.jsonl", "r") as f:
    for line in f:
        data = eval(line.strip())
        prompt = data["src"]
        inputs.append((prompt, data))

model = fdlm.ServingModel(config)


# Called once per generated token; appends the token tuple to res/<task_id>
def call_back(call_back_task, token_tuple, index, is_last_token, sender=None):
    with open("res/{}".format(call_back_task.task_id), "a+") as f:
        f.write("{}\n".format(token_tuple))


for i, ipt in enumerate(inputs):
    task = fdlm.Task()
    task.text = ipt[0]
    task.max_dec_len = 1024
    task.min_dec_len = 1
    task.penalty_score = 1.0
    task.temperature = 1.0
    task.topp = 0.0
    task.frequency_score = 0.0
    task.eos_token_id = 2
    task.presence_score = 0.0
    task.task_id = i
    task.call_back_func = call_back
    model.add_request(task)

model.start()
# Stop accepting new requests; once all queued requests are processed, everything exits on its own
model.stop()
```
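Each task's output is written as one token tuple per line under `res/<task_id>`. As a follow-up, here is a small sketch (not part of this PR) for inspecting those files after the run completes:
```
import os

# Count the generated tokens recorded for each task by the callback above
for name in sorted(os.listdir("res")):
    with open(os.path.join("res", name)) as f:
        num_tokens = sum(1 for line in f if line.strip())
    print("task {}: {} tokens".format(name, num_tokens))
```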
20 changes: 20 additions & 0 deletions llm/fastdeploy_llm/__init__.py
@@ -0,0 +1,20 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .model import Model
from .serving.serving_model import ServingModel
from .task import Task, BatchTask
from .config import Config
from . import utils
from .client import GrpcClient
1 change: 1 addition & 0 deletions llm/fastdeploy_llm/client/__init__.py
@@ -0,0 +1 @@
from .grpc_client import GrpcClient
191 changes: 191 additions & 0 deletions llm/fastdeploy_llm/client/grpc_client.py
@@ -0,0 +1,191 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import queue
import json
import sys
from functools import partial
import os
import time
import numpy as np
import subprocess
from fastdeploy_llm.utils.logging_util import logger
# tritonclient is an optional dependency; it is only needed when a request is
# actually issued, so an import failure is tolerated here.
try:
    import tritonclient.grpc as grpcclient
    from tritonclient.utils import *
except ImportError:
    pass


class UserData:
    """Holds the streamed responses pushed back by the Triton callback."""

    def __init__(self):
        self._completed_requests = queue.Queue()


def callback(user_data, result, error):
    # Triton stream callback: enqueue either the result or the error object
    if error:
        user_data._completed_requests.put(error)
    else:
        user_data._completed_requests.put(result)


class GrpcClient:
    def __init__(self,
                 url: str,
                 model_name: str,
                 model_version: str="1",
                 timeout: int=1000000,
                 openai_port: int=None):
        """
        Args:
            url (`str`): inference server grpc url
            model_name (`str`): name of the deployed model on the server
            model_version (`str`): default "1"
            timeout (`int`): inference timeout in seconds
            openai_port (`int`)
        """
        self._model_name = model_name
        self._model_version = model_version
        self.timeout = timeout
        self.url = url

    def generate(self,
                 prompt: str,
                 request_id: str="0",
                 top_p: float=0.0,
                 temperature: float=1.0,
                 max_dec_len: int=1024,
                 min_dec_len: int=2,
                 penalty_score: float=1.0,
                 frequency_score: float=0.99,
                 eos_token_id: int=2,
                 presence_score: float=0.0,
                 stream: bool=False):
        import tritonclient.grpc as grpcclient

        req_dict = {
            "text": prompt,
            "topp": top_p,
            "temperature": temperature,
            "max_dec_len": max_dec_len,
            "min_dec_len": min_dec_len,
            "penalty_score": penalty_score,
            "frequency_score": frequency_score,
            "eos_token_id": eos_token_id,
            "model_test": "test",
            "presence_score": presence_score
        }

        inputs = [
            grpcclient.InferInput("IN", [1], np_to_triton_dtype(np.object_))
        ]
        outputs = [grpcclient.InferRequestedOutput("OUT")]

        in_data = np.array([json.dumps(req_dict)], dtype=np.object_)

        user_data = UserData()
        with grpcclient.InferenceServerClient(
                url=self.url, verbose=False) as triton_client:
            triton_client.start_stream(callback=partial(callback, user_data))
            inputs[0].set_data_from_numpy(in_data)
            triton_client.async_stream_infer(
                model_name=self._model_name,
                inputs=inputs,
                request_id=request_id,
                outputs=outputs)
            response = dict()
            response["token_ids"] = list()
            response["token_strs"] = list()
            response["input"] = req_dict
            # Collect streamed chunks until the server marks the last one
            while True:
                data_item = user_data._completed_requests.get(
                    timeout=self.timeout)
                if type(data_item) == InferenceServerException:
                    logger.error(
                        "Error happened while generating, status={}, msg={}".
                        format(data_item.status(), data_item.message()))
                    response["error_info"] = (data_item.status(),
                                              data_item.message())
                    break
                else:
                    results = data_item.as_numpy("OUT")[0]
                    data = json.loads(results)
                    response["token_ids"] += data["token_ids"]
                    response["token_strs"].append(data["result"])
                    if data.get("is_end", False):
                        break
        return response

    def async_generate(self,
                       prompt: str,
                       request_id: str="0",
                       top_p: float=0.0,
                       temperature: float=1.0,
                       max_dec_len: int=1024,
                       min_dec_len: int=2,
                       penalty_score: float=1.0,
                       frequency_score: float=0.99,
                       eos_token_id: int=2,
                       presence_score: float=0.0,
                       stream: bool=False):
        import tritonclient.grpc as grpcclient

        req_dict = {
            "text": prompt,
            "topp": top_p,
            "temperature": temperature,
            "max_dec_len": max_dec_len,
            "min_dec_len": min_dec_len,
            "penalty_score": penalty_score,
            "frequency_score": frequency_score,
            "eos_token_id": eos_token_id,
            "model_test": "test",
            "presence_score": presence_score
        }

        inputs = [
            grpcclient.InferInput("IN", [1], np_to_triton_dtype(np.object_))
        ]
        outputs = [grpcclient.InferRequestedOutput("OUT")]

        in_data = np.array([json.dumps(req_dict)], dtype=np.object_)

        user_data = UserData()
        with grpcclient.InferenceServerClient(
                url=self.url, verbose=False) as triton_client:
            triton_client.start_stream(callback=partial(callback, user_data))
            inputs[0].set_data_from_numpy(in_data)
            triton_client.async_stream_infer(
                model_name=self._model_name,
                inputs=inputs,
                request_id=request_id,
                outputs=outputs)
            # Yield each streamed chunk as soon as it arrives
            while True:
                data_item = user_data._completed_requests.get(
                    timeout=self.timeout)
                if type(data_item) == InferenceServerException:
                    logger.error(
                        "Error happened while generating, status={}, msg={}".
                        format(data_item.status(), data_item.message()))
                    break
                else:
                    results = data_item.as_numpy("OUT")[0]
                    data = json.loads(results)
                    yield data
                    if data.get("is_end", False):
                        break
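For reference, a minimal usage sketch of the `GrpcClient` added in this PR (not part of the diff); the server address and model name below are placeholders for a running Triton inference server deployment:
```
from fastdeploy_llm.client import GrpcClient

# Placeholder address and model name; point these at your own deployment
client = GrpcClient(url="127.0.0.1:8001", model_name="model")

# Blocking call: collects all streamed chunks into a single response dict
response = client.generate(prompt="Hello, who are you?", request_id="demo-0")
if "error_info" in response:
    print("request failed:", response["error_info"])
else:
    print("".join(response["token_strs"]))

# Streaming call: async_generate yields each chunk as it arrives,
# finishing when the server sets "is_end" in the payload
for chunk in client.async_generate(prompt="Hello, who are you?", request_id="demo-1"):
    print(chunk.get("result", ""), end="", flush=True)
```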