```python
from vllm import LLM, SamplingParams
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse
import uvicorn
import time
import json


def create_vllm_server(
    model: str,
    served_model_name: str,
    tensor_parallel_size: int,
    host: str,
    port: int,
    top_p: float,
    temperature: float,
    max_tokens: int,
    gpu_memory_utilization: float,
    dtype: str,
) -> FastAPI:
    # Initialize only the offline LLM engine
    llm = LLM(
        model=model,
        tensor_parallel_size=tensor_parallel_size,
        gpu_memory_utilization=gpu_memory_utilization,
        dtype=dtype,
    )

    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )

    app = FastAPI()

    @app.post("/v1/chat/completions")
    async def chat_completions(request: Request):
        try:
            data = await request.json()
            messages = data["messages"]
            tools = data.get("tools")  # optional OpenAI-style tools parameter
            created_time = int(time.time())
            request_id = f"chatcmpl-{created_time}"

            # Call llm.chat() and pass the tools definition through.
            # Note: LLM.chat() is a blocking, offline call that returns a list of
            # RequestOutput objects only after generation has finished.
            outputs = llm.chat(
                messages=messages,
                sampling_params=sampling_params,
                tools=tools,
            )

            if data.get("stream"):
                # Pseudo-streaming: generation has already completed above, so the
                # finished text is emitted as SSE chunks followed by [DONE].
                def generate():
                    full_text = ""
                    for output in outputs:
                        new_text = output.outputs[0].text[len(full_text):]
                        full_text = output.outputs[0].text
                        response_data = {
                            "id": request_id,
                            "model": served_model_name,
                            "created": created_time,
                            "choices": [{
                                "index": 0,
                                "delta": {"content": new_text},
                                "finish_reason": output.outputs[0].finish_reason,
                            }],
                        }
                        yield f"data: {json.dumps(response_data)}\n\n"
                    yield "data: [DONE]\n\n"

                return StreamingResponse(generate(), media_type="text/event-stream")
            else:
                return {
                    "id": request_id,
                    "model": served_model_name,
                    "created": created_time,
                    "choices": [{
                        "index": 0,
                        "message": {
                            "role": "assistant",
                            "content": outputs[0].outputs[0].text,
                        },
                        "finish_reason": outputs[0].outputs[0].finish_reason,
                    }],
                }

        except Exception as e:
            # Returning a (dict, status) tuple does not set the status code in FastAPI;
            # use an explicit JSONResponse instead.
            return JSONResponse(status_code=400, content={"error": str(e)})

    return app


if __name__ == "__main__":
    # Configuration
    CONFIG = {
        "model": "/home/Qwen/Qwen2.5-7B-Instruct",
        "served_model_name": "Qwen2.5-7B-Instruct",
        "host": "192.168.28.196",
        "port": 8000,
        "tensor_parallel_size": 4,
        "top_p": 0.9,
        "temperature": 0.7,
        "max_tokens": 8192,
        "gpu_memory_utilization": 0.9,
        "dtype": "float16",
    }

    # Create the app
    app = create_vllm_server(**CONFIG)

    # Start the server (a single worker, since the LLM engine lives in this process)
    uvicorn.run(
        app,
        host=CONFIG["host"],
        port=CONFIG["port"],
        workers=1,
    )
```
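As a quick sanity check, the endpoint can be exercised with a small client. The sketch below assumes the server is reachable at the host and port from `CONFIG` and that the `requests` package is installed; the message content is illustrative.

```python
import json
import requests

BASE_URL = "http://192.168.28.196:8000"  # host/port taken from CONFIG above

# Non-streaming request: the reply arrives as a single JSON body
resp = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    json={"messages": [{"role": "user", "content": "Hello, who are you?"}]},
    timeout=300,
)
print(resp.json()["choices"][0]["message"]["content"])

# "Streaming" request: the server emits SSE chunks terminated by [DONE]
with requests.post(
    f"{BASE_URL}/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Hello, who are you?"}],
        "stream": True,
    },
    stream=True,
    timeout=300,
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        payload = line[len("data: "):]
        if payload == "[DONE]":
            break
        chunk = json.loads(payload)
        print(chunk["choices"][0]["delta"]["content"], end="", flush=True)
```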