---
# RayService running the `vllm_serve:deployment` Ray Serve application
# (named `mistral`), autoscaled between 1 and 4 replicas with one GPU each.
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: vllm
  namespace: rayserve-vllm
spec:
  # Ray Serve configuration, embedded as a literal block scalar (a YAML
  # document inside a string). The env_vars values are quoted so they parse
  # as strings rather than numbers.
  # NOTE(review): newer Ray Serve releases name the autoscaling target
  # `target_ongoing_requests`; confirm against the Ray version in use before
  # renaming `target_num_ongoing_requests_per_replica`.
  serveConfigV2: |
    applications:
      - name: mistral
        import_path: "vllm_serve:deployment"
        runtime_env:
          env_vars:
            MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.2"
            GPU_MEMORY_UTILIZATION: "0.9"
            MAX_MODEL_LEN: "8192"
        deployments:
          - name: mistral-deployment
            autoscaling_config:
              min_replicas: 1
              max_replicas: 4
              target_num_ongoing_requests_per_replica: 20
            ray_actor_options:
              num_gpus: 1
  rayClusterConfig:
    # NOTE(review): no headGroupSpec, worker groupName or container spec is
    # present in this manifest as seen here; confirm they are defined
    # elsewhere before applying.
    workerGroupSpecs:
      # Worker group bounds (1..4) match the Serve replica bounds above.
      - replicas: 1
        minReplicas: 1
        maxReplicas: 4
        template:
          spec:
            # Pin workers to the GPU node group selected by this label.
            nodeSelector:
              NodeGroupType: g5-gpu-karpenter
            tolerations:
              # `Exists` tolerates the GPU taint whatever its value. Without
              # an explicit operator Kubernetes defaults to `Equal`, which
              # here would match only a taint with an empty value.
              - key: "nvidia.com/gpu"
                operator: "Exists"
                effect: "NoSchedule"