from vllm import LLM, SamplingParams
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse
import uvicorn
import time
import json


def create_vllm_server(
    model: str,
    served_model_name: str,
    host: str,
    port: int,
    tensor_parallel_size: int,
    top_p: float,
    temperature: float,
    max_tokens: int,
    gpu_memory_utilization: float,
    dtype: str,
) -> FastAPI:
    # Initialize only the offline LLM engine
    llm = LLM(
        model=model,
        tensor_parallel_size=tensor_parallel_size,
        gpu_memory_utilization=gpu_memory_utilization,
        dtype=dtype,
    )

    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )

    app = FastAPI()

    @app.post("/v1/chat/completions")
    async def chat_completions(request: Request):
        try:
            data = await request.json()
            messages = data["messages"]
            tools = data.get("tools")  # support the OpenAI-style tools parameter
            created_time = int(time.time())
            request_id = f"chatcmpl-{created_time}"

            # Call llm.chat() and pass tools through (requires a vLLM version
            # whose LLM.chat() accepts a `tools` argument). Note that this is a
            # blocking call and will occupy the event loop until generation finishes.
            outputs = llm.chat(
                messages=messages,
                sampling_params=sampling_params,
                tools=tools,
            )

            if data.get("stream"):
                def generate():
                    # The offline LLM engine only returns finished outputs, so this
                    # is pseudo-streaming: each completed output is sent as a single
                    # SSE chunk rather than token by token.
                    full_text = ""
                    for output in outputs:
                        new_text = output.outputs[0].text[len(full_text):]
                        full_text = output.outputs[0].text
                        response_data = {
                            "id": request_id,
                            "model": served_model_name,
                            "created": created_time,
                            "choices": [{
                                "index": 0,
                                "delta": {"content": new_text},
                                "finish_reason": output.outputs[0].finish_reason,
                            }],
                        }
                        yield f"data: {json.dumps(response_data)}\n\n"
                    yield "data: [DONE]\n\n"

                return StreamingResponse(generate(), media_type="text/event-stream")
            else:
                return {
                    "id": request_id,
                    "model": served_model_name,
                    "created": created_time,
                    "choices": [{
                        "index": 0,
                        "message": {
                            "role": "assistant",
                            "content": outputs[0].outputs[0].text,
                        },
                        "finish_reason": outputs[0].outputs[0].finish_reason,
                    }],
                }
        except Exception as e:
            # Return an explicit 400 response; returning a (dict, status) tuple is
            # Flask-style and does not set the status code in FastAPI.
            return JSONResponse(status_code=400, content={"error": str(e)})

    return app


if __name__ == "__main__":
    # Configuration
    CONFIG = {
        "model": "/mnt/d/Qwen/Qwen2.5-1.5B-Instruct",
        "served_model_name": "Qwen2.5-1.5B-Instruct",
        # "model": "/mnt/d/Deepseek/DeepSeek-R1-Distill-Qwen-1.5B",
        # "served_model_name": "DeepSeek-R1-Distill-Qwen-1.5B",
        "host": "172.25.231.226",
        "port": 8000,
        "tensor_parallel_size": 1,
        "top_p": 0.9,
        "temperature": 0.7,
        "max_tokens": 8192,
        "gpu_memory_utilization": 0.9,
        "dtype": "float16",
    }

    # Create the app
    app = create_vllm_server(**CONFIG)

    # Start the server (workers must stay at 1 when passing an app instance directly)
    uvicorn.run(
        app,
        host=CONFIG["host"],
        port=CONFIG["port"],
        workers=1,
    )
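
# Example request (illustrative sketch; assumes the server above is running at the
# host/port from CONFIG — adjust the address to your own environment):
#
#   curl http://172.25.231.226:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello"}], "stream": false}'
#
# The response follows the OpenAI-style chat-completions shape built in the handler:
#   {"id": "chatcmpl-...", "model": "Qwen2.5-1.5B-Instruct", "created": ...,
#    "choices": [{"index": 0,
#                 "message": {"role": "assistant", "content": "..."},
#                 "finish_reason": "..."}]}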