diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp
index 810fcdc50a..4d6b5d3eaf 100644
--- a/src/llm/apis/openai_completions.cpp
+++ b/src/llm/apis/openai_completions.cpp
@@ -409,17 +409,20 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco
     // choices: array of size N, where N is related to n request parameter
     jsonResponse.StartArray("choices");
-    int index = 0;
-    for (int i = 0; i < results.tokens.size(); i++) {
+    for (size_t i = 0; i < results.tokens.size(); ++i) {
         const std::vector<int64_t>& tokens = results.tokens[i];
         SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated tokens: {}", tokens);
         ParsedOutput parsedOutput = parseOutputIfNeeded(tokens);
         jsonResponse.StartObject();
-        // finish_reason: "stop" in regular scenario, "tool_calls" if output contains tool calls
-        auto finishReason = mapFinishReason(ov::genai::GenerationFinishReason::STOP, !parsedOutput.toolCalls.empty());
+        if (results.finish_reasons.empty()) {
+            throw std::runtime_error("Missing finish reason in unary LM generation result");
+        }
+        // Current generation flow uses batch=1, so only finish_reasons[0] is expected here.
+        const ov::genai::GenerationFinishReason finishReasonRaw = results.finish_reasons[0];
+        auto finishReason = mapFinishReason(finishReasonRaw, !parsedOutput.toolCalls.empty());
         jsonResponse.FinishReason(finishReason.value_or("unknown"));
         // index: integer; Choice index, only n=1 supported anyway
-        jsonResponse.Index(index++);
+        jsonResponse.Index(static_cast<int>(i));
 
         if (endpoint == Endpoint::CHAT_COMPLETIONS) {
             jsonResponse.MessageObject(parsedOutput);
@@ -480,8 +483,12 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD
     SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated tokens: {}", generatedTokens);
     ParsedOutput parsedOutput = parseOutputIfNeeded(generatedTokens);
     jsonResponse.StartObject();
-    // finish_reason: "stop" in regular scenario, "tool_calls" if output contains tool calls
-    auto finishReason = mapFinishReason(ov::genai::GenerationFinishReason::STOP, !parsedOutput.toolCalls.empty());
+    if (results.finish_reasons.empty()) {
+        throw std::runtime_error("Missing finish reason in unary VLM generation result");
+    }
+    // Current generation flow uses batch=1, so only finish_reasons[0] is expected here.
+    const ov::genai::GenerationFinishReason finishReasonRaw = results.finish_reasons[0];
+    auto finishReason = mapFinishReason(finishReasonRaw, !parsedOutput.toolCalls.empty());
     jsonResponse.FinishReason(finishReason.value_or("unknown"));
     // index: integer; Choice index, only n=1 supported anyway
     jsonResponse.Index(index++);
diff --git a/src/llm/apis/openai_responses.cpp b/src/llm/apis/openai_responses.cpp
index 60ec1c4f08..e3ac155a03 100644
--- a/src/llm/apis/openai_responses.cpp
+++ b/src/llm/apis/openai_responses.cpp
@@ -652,17 +652,30 @@ std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::EncodedRes
     OVMS_PROFILE_FUNCTION();
     usage.promptTokens = results.perf_metrics.get_num_input_tokens();
     usage.completionTokens = results.perf_metrics.get_num_generated_tokens();
+    if (results.finish_reasons.empty()) {
+        throw std::runtime_error("Missing finish reason in unary LM responses generation result");
+    }
     std::vector<ParsedOutput> parsedOutputs;
+    ov::genai::GenerationFinishReason responsesFinishReason = ov::genai::GenerationFinishReason::STOP;
     for (const auto& tokens : results.tokens) {
         parsedOutputs.push_back(parseOutputIfNeeded(tokens));
     }
-    return serializeUnaryResponseImpl(parsedOutputs);
+    for (const auto& finishReason : results.finish_reasons) {
+        if (finishReason == ov::genai::GenerationFinishReason::LENGTH) {
+            responsesFinishReason = ov::genai::GenerationFinishReason::LENGTH;
+            break;
+        }
+    }
+    return serializeUnaryResponseImpl(parsedOutputs, responsesFinishReason);
 }
 
 std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::VLMDecodedResults& results, const std::string& textResponse) {
     OVMS_PROFILE_FUNCTION();
     usage.promptTokens = results.perf_metrics.get_num_input_tokens();
     usage.completionTokens = results.perf_metrics.get_num_generated_tokens();
+    if (results.finish_reasons.empty()) {
+        throw std::runtime_error("Missing finish reason in unary VLM responses generation result");
+    }
     // Usage is already set from perf_metrics above; no need for updateUsage.
     std::vector<ParsedOutput> parsedOutputs;
     if (!textResponse.empty()) {
@@ -677,7 +690,14 @@ std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::VLMDecoded
             parsedOutputs.push_back(std::move(output));
         }
     }
-    return serializeUnaryResponseImpl(parsedOutputs);
+    ov::genai::GenerationFinishReason responsesFinishReason = ov::genai::GenerationFinishReason::STOP;
+    for (const auto& finishReason : results.finish_reasons) {
+        if (finishReason == ov::genai::GenerationFinishReason::LENGTH) {
+            responsesFinishReason = ov::genai::GenerationFinishReason::LENGTH;
+            break;
+        }
+    }
+    return serializeUnaryResponseImpl(parsedOutputs, responsesFinishReason);
 }
 
 // --- Streaming event building blocks ---
diff --git a/src/llm/language_model/legacy/servable.cpp b/src/llm/language_model/legacy/servable.cpp
index 4234088a2a..a3ac669565 100644
--- a/src/llm/language_model/legacy/servable.cpp
+++ b/src/llm/language_model/legacy/servable.cpp
@@ -229,7 +229,12 @@ absl::Status LegacyServable::preparePartialResponse(std::shared_ptr
     if (!executionContext->lastStreamerCallbackOutput.empty()) {
         lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
     }
-    std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, ov::genai::GenerationFinishReason::STOP);
+    if (legacyExecutionContext->results.finish_reasons.empty()) {
+        return absl::InternalError("Missing finish reason in legacy LM streaming generation result");
+    }
+    // Legacy generation path always runs with batch=1, so we read the single finish reason at index 0.
+    ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons[0];
+    std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
     if (!serializedChunk.empty()) {
         executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
     }
diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp
index 1bb2367001..798b6af741 100644
--- a/src/llm/visual_language_model/legacy/servable.cpp
+++ b/src/llm/visual_language_model/legacy/servable.cpp
@@ -245,7 +245,12 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar
     if (!executionContext->lastStreamerCallbackOutput.empty()) {
         lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
     }
-    std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, ov::genai::GenerationFinishReason::STOP);
+    if (legacyExecutionContext->results.finish_reasons.empty()) {
+        return absl::InternalError("Missing finish reason in legacy VLM streaming generation result");
+    }
+    // Legacy generation path always runs with batch=1, so we read the single finish reason at index 0.
+    ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons[0];
+    std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
     if (!serializedChunk.empty()) {
         executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
     }
diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp
index c3a40cba3c..98aeb231b5 100644
--- a/src/test/http_openai_handler_test.cpp
+++ b/src/test/http_openai_handler_test.cpp
@@ -1018,6 +1018,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseEncodedResultsReturns
     ov::genai::EncodedResults results;
     results.tokens = {createHermes3ToolCallTokens(*tokenizer)};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results);
     ASSERT_NE(serialized.find("\"finish_reason\":\"tool_calls\""), std::string::npos) << serialized;
@@ -1049,6 +1050,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseVLMSupportsToolCallsF
     ov::genai::VLMDecodedResults results;
     std::string toolCall = R"({"name": "example_tool", "arguments": {"arg1": "value1", "arg2": 42}})";
     results.texts = {toolCall};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results, toolCall);
     ASSERT_NE(serialized.find("\"finish_reason\":\"tool_calls\""), std::string::npos) << serialized;
@@ -1076,6 +1078,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesContainsO
     ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
     int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
     results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results);
     ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
@@ -1107,6 +1110,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesContainsR
     ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
     int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
     results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results);
     ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
@@ -1145,6 +1149,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesOmitsReas
     ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
     int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
     results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results);
     ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
@@ -2741,6 +2746,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, SerializeResponsesUnaryResponseContainsFunc
     ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
     int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
     results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
     std::string serialized =
         apiHandler->serializeUnaryResponse(results);
     ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
@@ -2786,6 +2792,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, SerializeResponsesUnaryResponseContainsFunc
     ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
     int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
     results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
     std::string serialized = apiHandler->serializeUnaryResponse(results);
     ASSERT_NE(serialized.find("\"tool_choice\":{"), std::string::npos) << serialized;
@@ -3095,6 +3102,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, SerializeUnaryResponseVLMDecodedResultsWith
     std::string vlmText = "I will call a tool.{\"name\":\"get_weather\",\"arguments\":{\"location\":\"Paris\"}}";
     results.texts.push_back(vlmText);
+    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results, vlmText);
diff --git a/src/test/llm/llmnode_test.cpp b/src/test/llm/llmnode_test.cpp
index 99d9e8743c..fb47f53751 100644
--- a/src/test/llm/llmnode_test.cpp
+++ b/src/test/llm/llmnode_test.cpp
@@ -2685,7 +2685,7 @@ INSTANTIATE_TEST_SUITE_P(
     ::testing::Values(
         // params: model name, generate expected output, check logprobs, check finish reason, test speculative decoding, supports empty handshake msg
         TestParameters{"lm_cb_regular", true, true, true, false, true},
-        TestParameters{"lm_legacy_regular", false, false, false, false, false},
+        TestParameters{"lm_legacy_regular", false, false, true, false, false},
         TestParameters{"vlm_cb_regular", false, true, true, false, true},
         TestParameters{"vlm_legacy_regular", false, false, false, false, false}));
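
Reviewer note: the change above stops hardcoding `GenerationFinishReason::STOP` and instead threads the real finish reasons from the GenAI results into serialization; for the Responses API it additionally collapses per-sequence reasons into one response-level reason, with LENGTH taking precedence over STOP. Below is a minimal, self-contained sketch of that reduction and the resulting wire strings. It is an illustration, not the OVMS code: the enum here is a stand-in for `ov::genai::GenerationFinishReason`, and `toWireString` is a hypothetical helper mirroring the behavior implied by `mapFinishReason` and the test assertions.

```cpp
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

// Stand-in for ov::genai::GenerationFinishReason (simplified for illustration).
enum class GenerationFinishReason { NONE, STOP, LENGTH };

// Response-level reduction used by the Responses API hunks above:
// an empty vector is an internal error, and LENGTH (token budget exhausted
// by any sequence) takes precedence over STOP.
GenerationFinishReason reduceFinishReasons(const std::vector<GenerationFinishReason>& reasons) {
    if (reasons.empty()) {
        throw std::runtime_error("Missing finish reason in generation result");
    }
    for (const auto& reason : reasons) {
        if (reason == GenerationFinishReason::LENGTH) {
            return GenerationFinishReason::LENGTH;
        }
    }
    return GenerationFinishReason::STOP;
}

// Hypothetical helper showing how a reason maps onto the OpenAI wire format;
// a detected tool call overrides a plain STOP, as the tests assert
// ("finish_reason":"tool_calls").
std::string toWireString(GenerationFinishReason reason, bool hasToolCalls) {
    switch (reason) {
    case GenerationFinishReason::LENGTH:
        return "length";
    case GenerationFinishReason::STOP:
        return hasToolCalls ? "tool_calls" : "stop";
    default:
        return "unknown";
    }
}

int main() {
    // One sequence stopped naturally, another hit the token limit:
    // the response-level reason becomes "length".
    std::vector<GenerationFinishReason> reasons = {GenerationFinishReason::STOP,
                                                   GenerationFinishReason::LENGTH};
    std::cout << toWireString(reduceFinishReasons(reasons), /*hasToolCalls=*/false) << "\n";
}
```

The throw-on-empty guard matches the diff's defensive checks: a missing finish reason indicates a contract violation upstream rather than a condition to silently default to STOP.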