@@ -51,7 +51,13 @@ struct ChatCompletionRequest {
5151 messages : Vec < ChatMessage > ,
5252 temperature : f32 ,
5353 top_p : f32 ,
54- max_tokens : u32 ,
54+ /// OpenAI's newer models require `max_completion_tokens`; local LLMs use `max_tokens`.
55+ /// Exactly one of these two fields is `Some`: `chat_completion()` picks which one to
56+ /// populate based on the backend, and `skip_serializing_if` drops the `None` field.
57+ #[ serde( skip_serializing_if = "Option::is_none" ) ]
58+ max_tokens : Option < u32 > ,
59+ #[ serde( skip_serializing_if = "Option::is_none" ) ]
60+ max_completion_tokens : Option < u32 > ,
5561 stream : bool ,
5662}
5763
@@ -103,6 +109,13 @@ pub async fn chat_completion(
103109 ) ,
104110 } ;
105111
112+ // OpenAI's newer models reject `max_tokens` and require `max_completion_tokens`.
113+ // Local LLM servers (llama.cpp, Ollama) still expect `max_tokens`.
114+ let ( max_tokens, max_completion_tokens) = match backend {
115+ AiBackend :: Local { .. } => ( Some ( options. max_tokens ) , None ) ,
116+ AiBackend :: OpenAi { .. } => ( None , Some ( options. max_tokens ) ) ,
117+ } ;
118+
106119 let request_body = ChatCompletionRequest {
107120 model : model_name,
108121 messages : vec ! [
@@ -117,7 +130,8 @@ pub async fn chat_completion(
117130 ] ,
118131 temperature : options. temperature ,
119132 top_p : options. top_p ,
120- max_tokens : options. max_tokens ,
133+ max_tokens,
134+ max_completion_tokens,
121135 stream : false ,
122136 } ;
123137
0 commit comments