from vllm import LLM, SamplingParams
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse, JSONResponse
import uvicorn
import time
import json

def create_vllm_server(
    model: str,
    served_model_name: str,
    host: str,
    port: int,
    tensor_parallel_size: int,
    top_p: float,
    temperature: float,
    max_tokens: int,
    gpu_memory_utilization: float,
    dtype: str,
) -> FastAPI:
    # Initialize the LLM engine once, outside the request handler
    llm = LLM(
        model=model,
        tensor_parallel_size=tensor_parallel_size,
        gpu_memory_utilization=gpu_memory_utilization,
        dtype=dtype,
    )

    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )

    app = FastAPI()

    @app.post("/v1/chat/completions")
    async def chat_completions(request: Request):
        try:
            data = await request.json()
            messages = data["messages"]
            tools = data.get("tools")  # optional OpenAI-style tool definitions
            created_time = int(time.time())
            request_id = f"chatcmpl-{created_time}"

            # Call llm.chat() and pass the tools through; it applies the model's
            # chat template and generates synchronously, returning one
            # RequestOutput per request.
            outputs = llm.chat(
                messages=messages,
                sampling_params=sampling_params,
                tools=tools,
            )

            if data.get("stream"):
                # Note: llm.chat() blocks until generation finishes, so this is not
                # token-by-token streaming; each completed output is emitted as a
                # single SSE chunk, followed by the [DONE] sentinel.
                def generate():
                    full_text = ""
                    for output in outputs:
                        new_text = output.outputs[0].text[len(full_text):]
                        full_text = output.outputs[0].text
                        response_data = {
                            "id": request_id,
                            "model": served_model_name,
                            "created": created_time,
                            "choices": [{
                                "index": 0,
                                "delta": {"content": new_text},
                                "finish_reason": output.outputs[0].finish_reason,
                            }],
                        }
                        yield f"data: {json.dumps(response_data)}\n\n"
                    yield "data: [DONE]\n\n"

                return StreamingResponse(generate(), media_type="text/event-stream")
            else:
                return {
                    "id": request_id,
                    "model": served_model_name,
                    "created": created_time,
                    "choices": [{
                        "index": 0,
                        "message": {
                            "role": "assistant",
                            "content": outputs[0].outputs[0].text,
                        },
                        "finish_reason": outputs[0].outputs[0].finish_reason,
                    }],
                }

        except Exception as e:
            # Returning a (dict, status) tuple does not set the HTTP status in FastAPI;
            # use JSONResponse to actually send a 400.
            return JSONResponse(status_code=400, content={"error": str(e)})

    return app

if __name__ == "__main__":
    # Server configuration
    CONFIG = {
        "model": "/mnt/d/Qwen/Qwen2.5-1.5B-Instruct",
        "served_model_name": "Qwen2.5-1.5B-Instruct",
        # "model": "/mnt/d/Deepseek/DeepSeek-R1-Distill-Qwen-1.5B",
        # "served_model_name": "DeepSeek-R1-Distill-Qwen-1.5B",
        "host": "172.25.231.226",
        "port": 8000,
        "tensor_parallel_size": 1,
        "top_p": 0.9,
        "temperature": 0.7,
        "max_tokens": 8192,
        "gpu_memory_utilization": 0.9,
        "dtype": "float16",
    }

    # Build the FastAPI app
    app = create_vllm_server(**CONFIG)

    # Start the server
    uvicorn.run(
        app,
        host=CONFIG["host"],
        port=CONFIG["port"],
        workers=1,
    )
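
Once the script is running, the endpoint can be exercised from a separate process. Below is a minimal client sketch using `requests`; the URL and request body are assumptions based on the CONFIG values and the handler above, so adjust the host and port to wherever the server is actually reachable.

import requests

# Example client call (run separately once the server is up).
# The address matches CONFIG["host"] / CONFIG["port"] from the script above.
resp = requests.post(
    "http://172.25.231.226:8000/v1/chat/completions",
    json={"messages": [{"role": "user", "content": "Hello, who are you?"}]},
    timeout=600,
)
print(resp.json()["choices"][0]["message"]["content"])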