
Commit 538e483

feat: return logprobs in OpenAIChatGenerator and OpenAIResponsesChatGenerator (#10035)
* Return logprobs
* Add log probs in responses
* Get logprobs from streaming
* Fix linting
* Update tests
* Fix formatting
* Update
* Fix tests
* Loosen up tests
* updates
* Update logprobs
* linting
* Fix test
1 parent 4db967d commit 538e483

6 files changed: 109 additions, 50 deletions

haystack/components/generators/chat/openai.py

Lines changed: 32 additions & 35 deletions
@@ -22,7 +22,7 @@
 from pydantic import BaseModel
 
 from haystack import component, default_from_dict, default_to_dict, logging
-from haystack.components.generators.utils import _convert_streaming_chunks_to_chat_message
+from haystack.components.generators.utils import _convert_streaming_chunks_to_chat_message, _serialize_object
 from haystack.dataclasses import (
     AsyncStreamingCallbackT,
     ChatMessage,
@@ -563,16 +563,17 @@ def _convert_chat_completion_to_chat_message(
             _arguments=arguments_str,
         )
 
-    chat_message = ChatMessage.from_assistant(
-        text=text,
-        tool_calls=tool_calls,
-        meta={
-            "model": completion.model,
-            "index": choice.index,
-            "finish_reason": choice.finish_reason,
-            "usage": _serialize_usage(completion.usage),
-        },
-    )
+    logprobs = _serialize_object(choice.logprobs) if choice.logprobs else None
+    meta = {
+        "model": completion.model,
+        "index": choice.index,
+        "finish_reason": choice.finish_reason,
+        "usage": _serialize_object(completion.usage),
+    }
+    if logprobs:
+        meta["logprobs"] = logprobs
+
+    chat_message = ChatMessage.from_assistant(text=text, tool_calls=tool_calls, meta=meta)
 
     return chat_message
 
@@ -610,7 +611,7 @@ def _convert_chat_completion_chunk_to_streaming_chunk(
             meta={
                 "model": chunk.model,
                 "received_at": datetime.now().isoformat(),
-                "usage": _serialize_usage(chunk.usage),
+                "usage": _serialize_object(chunk.usage),
             },
         )
 
@@ -643,7 +644,7 @@ def _convert_chat_completion_chunk_to_streaming_chunk(
                 "tool_calls": choice.delta.tool_calls,
                 "finish_reason": choice.finish_reason,
                 "received_at": datetime.now().isoformat(),
-                "usage": _serialize_usage(chunk.usage),
+                "usage": _serialize_object(chunk.usage),
             },
         )
         return chunk_message
@@ -658,6 +659,23 @@ def _convert_chat_completion_chunk_to_streaming_chunk(
     # NOTE: We may need to revisit this if OpenAI allows planning/thinking content before tool calls like
     # Anthropic Claude
     resolved_index = 0
+
+    # Initialize meta dictionary
+    meta = {
+        "model": chunk.model,
+        "index": choice.index,
+        "tool_calls": choice.delta.tool_calls,
+        "finish_reason": choice.finish_reason,
+        "received_at": datetime.now().isoformat(),
+        "usage": _serialize_object(chunk.usage),
+    }
+
+    # check if logprobs are present
+    # logprobs are returned only for text content
+    logprobs = _serialize_object(choice.logprobs) if choice.logprobs else None
+    if logprobs:
+        meta["logprobs"] = logprobs
+
     chunk_message = StreamingChunk(
         content=choice.delta.content or "",
         component_info=component_info,
@@ -666,27 +684,6 @@ def _convert_chat_completion_chunk_to_streaming_chunk(
         # and previous_chunks is length 1 then this is the start of text content.
         start=len(previous_chunks) == 1,
         finish_reason=finish_reason_mapping.get(choice.finish_reason) if choice.finish_reason else None,
-        meta={
-            "model": chunk.model,
-            "index": choice.index,
-            "tool_calls": choice.delta.tool_calls,
-            "finish_reason": choice.finish_reason,
-            "received_at": datetime.now().isoformat(),
-            "usage": _serialize_usage(chunk.usage),
-        },
+        meta=meta,
     )
     return chunk_message
-
-
-def _serialize_usage(usage):
-    """Convert OpenAI usage object to serializable dict recursively"""
-    if hasattr(usage, "model_dump"):
-        return usage.model_dump()
-    elif hasattr(usage, "__dict__"):
-        return {k: _serialize_usage(v) for k, v in usage.__dict__.items() if not k.startswith("_")}
-    elif isinstance(usage, dict):
-        return {k: _serialize_usage(v) for k, v in usage.items()}
-    elif isinstance(usage, list):
-        return [_serialize_usage(item) for item in usage]
-    else:
-        return usage
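
With this change, logprobs are requested through the generation kwargs and surface in the reply's metadata. A minimal usage sketch mirroring the updated test_live_run; the exact shape of meta["logprobs"] is whatever the OpenAI client returns for choice.logprobs, flattened to plain dicts by _serialize_object:

```python
from haystack.components.generators.chat.openai import OpenAIChatGenerator
from haystack.dataclasses import ChatMessage

# Requires OPENAI_API_KEY in the environment, as in the integration tests.
generator = OpenAIChatGenerator(generation_kwargs={"n": 1, "logprobs": True})
result = generator.run([ChatMessage.from_user("What's the capital of France?")])

reply = result["replies"][0]
print(reply.text)                   # e.g. "The capital of France is Paris."
print(reply.meta["finish_reason"])  # "stop"
print(reply.meta.get("logprobs"))   # serialized token logprobs; the key is absent if not requested
```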

haystack/components/generators/chat/openai_responses.py

Lines changed: 28 additions & 6 deletions
@@ -13,6 +13,7 @@
 from pydantic import BaseModel
 
 from haystack import component, default_from_dict, default_to_dict, logging
+from haystack.components.generators.utils import _serialize_object
 from haystack.dataclasses import (
     AsyncStreamingCallbackT,
     ChatMessage,
@@ -516,10 +517,17 @@ def _convert_response_to_chat_message(responses: Union[Response, ParsedResponse]
 
     tool_calls = []
     reasoning = None
+    logprobs: list[dict] = []
     for output in responses.output:
         if isinstance(output, ResponseOutputRefusal):
             logger.warning("OpenAI returned a refusal output: {output}", output=output)
             continue
+
+        if output.type == "message":
+            for content in output.content:
+                if hasattr(content, "logprobs") and content.logprobs is not None:
+                    logprobs.append(_serialize_object(content.logprobs))
+
         if output.type == "reasoning":
             # openai doesn't return the reasoning tokens, but we can view summary if its enabled
             # https://platform.openai.com/docs/guides/reasoning#reasoning-summaries
@@ -547,11 +555,17 @@ def _convert_response_to_chat_message(responses: Union[Response, ParsedResponse]
                 _name=output.name,
                 _arguments=output.arguments,
             )
+            arguments = {}
 
     # we save the response as dict because it contains resp_id etc.
     meta = responses.to_dict()
+
     # remove output from meta because it contains toolcalls, reasoning, text etc.
     meta.pop("output")
+
+    if logprobs:
+        meta["logprobs"] = logprobs
+
     chat_message = ChatMessage.from_assistant(
         text=responses.output_text if responses.output_text else None,
         reasoning=reasoning,
@@ -569,6 +583,7 @@ def _convert_response_chunk_to_streaming_chunk( # pylint: disable=too-many-retu
     Converts the streaming response chunk from the OpenAI Responses API to a StreamingChunk.
 
     :param chunk: The chunk returned by the OpenAI Responses API.
+    :param previous_chunks: A list of previously received StreamingChunks.
     :param component_info: An optional `ComponentInfo` object containing information about the component that
         generated the chunk, such as the component name and type.
     :returns:
@@ -676,18 +691,22 @@ def _convert_streaming_chunks_to_chat_message(chunks: list[StreamingChunk]) -> C
 
     :returns: The ChatMessage.
     """
+
     # Get the full text by concatenating all text chunks
     text = "".join([chunk.content for chunk in chunks])
+    logprobs = []
+    for chunk in chunks:
+        if chunk.meta.get("logprobs"):
+            logprobs.append(chunk.meta.get("logprobs"))
 
     # Gather reasoning information if present
     reasoning_id = None
     reasoning_text = ""
     for chunk in chunks:
-        if not chunk.reasoning:
-            continue
-        reasoning_text += chunk.reasoning.reasoning_text
-        if chunk.reasoning.extra.get("id"):
-            reasoning_id = chunk.reasoning.extra.get("id")
+        if chunk.reasoning:
+            reasoning_text += chunk.reasoning.reasoning_text
+            if chunk.reasoning.extra.get("id"):
+                reasoning_id = chunk.reasoning.extra.get("id")
 
     # Process tool calls if present in any chunk
     tool_call_data: dict[str, dict[str, Any]] = {}  # Track tool calls by id
@@ -731,7 +750,10 @@ def _convert_streaming_chunks_to_chat_message(chunks: list[StreamingChunk]) -> C
     )
 
     # We dump the entire final response into meta to be consistent with non-streaming response
-    final_response = chunks[-1].meta.get("response")
+    final_response = chunks[-1].meta.get("response") or {}
+    final_response.pop("output", None)
+    if logprobs:
+        final_response["logprobs"] = logprobs
 
     # Add reasoning content if both id and text are available
     reasoning = None
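
For the Responses API, the change collects one serialized logprobs entry per "message" output content item and stores the list under meta["logprobs"]. A hedged sketch of how a caller would request and read them, following the updated integration test (the model name and the include value are taken from that test, not a recommendation):

```python
from haystack.components.generators.chat.openai_responses import OpenAIResponsesChatGenerator
from haystack.dataclasses import ChatMessage

# The Responses API exposes logprobs through the `include` option rather than a boolean flag.
generator = OpenAIResponsesChatGenerator(
    model="gpt-4",
    generation_kwargs={"include": ["message.output_text.logprobs"]},
)
result = generator.run([ChatMessage.from_user("What's the capital of France?")])

reply = result["replies"][0]
# A list with one serialized logprobs entry per message content item, if the API returned any.
print(reply.meta.get("logprobs"))
```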

haystack/components/generators/utils.py

Lines changed: 21 additions & 0 deletions
@@ -84,6 +84,10 @@ def _convert_streaming_chunks_to_chat_message(chunks: list[StreamingChunk]) -> C
     :returns: The ChatMessage.
     """
     text = "".join([chunk.content for chunk in chunks])
+    logprobs = []
+    for chunk in chunks:
+        if chunk.meta.get("logprobs"):
+            logprobs.append(chunk.meta.get("logprobs"))
     tool_calls = []
 
     # Process tool calls if present in any chunk
@@ -134,4 +138,21 @@ def _convert_streaming_chunks_to_chat_message(chunks: list[StreamingChunk]) -> C
         "usage": chunks[-1].meta.get("usage"),  # last chunk has the final usage data if available
     }
 
+    if logprobs:
+        meta["logprobs"] = logprobs
+
     return ChatMessage.from_assistant(text=text or None, tool_calls=tool_calls, meta=meta)
+
+
+def _serialize_object(obj):
+    """Convert an object to a serializable dict recursively"""
+    if hasattr(obj, "model_dump"):
+        return obj.model_dump()
+    elif hasattr(obj, "__dict__"):
+        return {k: _serialize_object(v) for k, v in obj.__dict__.items() if not k.startswith("_")}
+    elif isinstance(obj, dict):
+        return {k: _serialize_object(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [_serialize_object(item) for item in obj]
+    else:
+        return obj
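
The old `_serialize_usage` helper from openai.py is generalized here into `_serialize_object`, so the same recursion can flatten usage objects, logprobs, or anything else the client hands back. A rough illustration of its behavior on plain Python values; `FakeLogprob` below is a made-up stand-in for an SDK object without `model_dump()`, not an OpenAI type:

```python
from haystack.components.generators.utils import _serialize_object

class FakeLogprob:
    """Illustrative stand-in for a client object that only exposes __dict__."""
    def __init__(self, token: str, logprob: float):
        self.token = token
        self.logprob = logprob
        self._client = object()  # underscore-prefixed attributes are dropped

print(_serialize_object([FakeLogprob("Paris", -0.01), {"bytes": None}]))
# -> [{'token': 'Paris', 'logprob': -0.01}, {'bytes': None}]
```

Pydantic models (anything exposing `model_dump`) take the first branch, so OpenAI client objects serialize in one step; the remaining branches handle bare objects, dicts, lists, and scalars.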
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    If logprobs are enabled in the generation kwargs, return logprobs in ChatMessage.meta for `OpenAIChatGenerator` and `OpenAIResponsesChatGenerator`.
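
The same applies to streaming: each text StreamingChunk may carry a logprobs entry in its meta, and the chunk-to-message conversion gathers those entries into a list on the final reply. A sketch assuming the same generation kwargs as the updated streaming test:

```python
from haystack.components.generators.chat.openai import OpenAIChatGenerator
from haystack.dataclasses import ChatMessage, StreamingChunk

def on_chunk(chunk: StreamingChunk) -> None:
    # Per-chunk logprobs are attached only for text content, when the API returns them.
    if chunk.meta.get("logprobs"):
        print(chunk.content, chunk.meta["logprobs"])

generator = OpenAIChatGenerator(
    streaming_callback=on_chunk,
    generation_kwargs={"stream_options": {"include_usage": True}, "logprobs": True},
)
result = generator.run([ChatMessage.from_user("What's the capital of France?")])

# The aggregated reply collects the per-chunk entries into a list under meta["logprobs"].
print(result["replies"][0].meta.get("logprobs"))
```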

test/components/generators/chat/test_openai.py

Lines changed: 6 additions & 2 deletions
@@ -281,6 +281,7 @@ def test_to_dict_with_parameters(self, monkeypatch, calendar_event_model):
                 "max_completion_tokens": 10,
                 "some_test_param": "test-params",
                 "response_format": calendar_event_model,
+                "logprobs": True,
             },
             tools=[tool],
             tools_strict=True,
@@ -303,6 +304,7 @@ def test_to_dict_with_parameters(self, monkeypatch, calendar_event_model):
             "generation_kwargs": {
                 "max_completion_tokens": 10,
                 "some_test_param": "test-params",
+                "logprobs": True,
                 "response_format": {
                     "type": "json_schema",
                     "json_schema": {
@@ -804,14 +806,15 @@ def test_run_with_response_format_and_streaming_pydantic_model(self, calendar_ev
     @pytest.mark.integration
     def test_live_run(self):
         chat_messages = [ChatMessage.from_user("What's the capital of France")]
-        component = OpenAIChatGenerator(generation_kwargs={"n": 1})
+        component = OpenAIChatGenerator(generation_kwargs={"n": 1, "logprobs": True})
         results = component.run(chat_messages)
         assert len(results["replies"]) == 1
         message: ChatMessage = results["replies"][0]
         assert "Paris" in message.text
         assert "gpt-4o" in message.meta["model"]
         assert message.meta["finish_reason"] == "stop"
         assert message.meta["usage"]["prompt_tokens"] > 0
+        assert message.meta["logprobs"] is not None
 
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
@@ -987,7 +990,7 @@ def __call__(self, chunk: StreamingChunk) -> None:
 
         callback = Callback()
         component = OpenAIChatGenerator(
-            streaming_callback=callback, generation_kwargs={"stream_options": {"include_usage": True}}
+            streaming_callback=callback, generation_kwargs={"stream_options": {"include_usage": True}, "logprobs": True}
         )
         results = component.run([ChatMessage.from_user("What's the capital of France?")])
 
@@ -1002,6 +1005,7 @@ def __call__(self, chunk: StreamingChunk) -> None:
         metadata = message.meta
         assert "gpt-4o" in metadata["model"]
         assert metadata["finish_reason"] == "stop"
+        assert metadata["logprobs"] is not None
 
         # Usage information checks
         assert isinstance(metadata.get("usage"), dict), "meta.usage not a dict"

test/components/generators/chat/test_openai_responses.py

Lines changed: 18 additions & 7 deletions
@@ -539,15 +539,18 @@ def warm_up(self):
     @pytest.mark.integration
     def test_live_run(self):
         chat_messages = [ChatMessage.from_user("What's the capital of France")]
-        component = OpenAIResponsesChatGenerator()
+        component = OpenAIResponsesChatGenerator(
+            model="gpt-4", generation_kwargs={"include": ["message.output_text.logprobs"]}
+        )
         results = component.run(chat_messages)
         assert len(results["replies"]) == 1
         message: ChatMessage = results["replies"][0]
         assert "Paris" in message.text
-        assert "gpt-5-mini" in message.meta["model"]
+        assert "gpt-4" in message.meta["model"]
         assert message.meta["status"] == "completed"
         assert message.meta["usage"]["total_tokens"] > 0
         assert message.meta["id"] is not None
+        assert message.meta["logprobs"] is not None
 
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
@@ -692,7 +695,9 @@ def __call__(self, chunk: StreamingChunk) -> None:
             self.responses += chunk.content if chunk.content else ""
 
         callback = Callback()
-        component = OpenAIResponsesChatGenerator(streaming_callback=callback)
+        component = OpenAIResponsesChatGenerator(
+            model="gpt-4", streaming_callback=callback, generation_kwargs={"include": ["message.output_text.logprobs"]}
+        )
         results = component.run([ChatMessage.from_user("What's the capital of France?")])
 
         # Basic response checks
@@ -704,8 +709,8 @@ def __call__(self, chunk: StreamingChunk) -> None:
 
         # Metadata checks
         metadata = message.meta
-        assert "gpt-5-mini" in metadata["model"]
-
+        assert "gpt-4" in metadata["model"]
+        assert metadata["logprobs"] is not None
         # Usage information checks
         assert isinstance(metadata.get("usage"), dict), "meta.usage not a dict"
         usage = metadata["usage"]
@@ -755,7 +760,7 @@ def __call__(self, chunk: StreamingChunk) -> None:
     def test_live_run_with_tools_streaming(self, tools):
         chat_messages = [ChatMessage.from_user("What's the weather like in Paris and Berlin?")]
 
-        component = OpenAIResponsesChatGenerator(tools=tools, streaming_callback=print_streaming_chunk)
+        component = OpenAIResponsesChatGenerator(model="gpt-5", tools=tools, streaming_callback=print_streaming_chunk)
         results = component.run(chat_messages)
         assert len(results["replies"]) == 1
         message = results["replies"][0]
@@ -764,12 +769,18 @@ def test_live_run_with_tools_streaming(self, tools):
         assert not message.text
         assert message.tool_calls
         tool_calls = message.tool_calls
-        assert len(tool_calls) > 0
+        assert len(tool_calls) == 2
 
         for tool_call in tool_calls:
            assert isinstance(tool_call, ToolCall)
            assert tool_call.tool_name == "weather"
 
+        arguments = [tool_call.arguments for tool_call in tool_calls]
+        # Extract city names (handle cases like "Berlin, Germany" -> "Berlin")
+        city_values = [arg["city"].split(",")[0].strip().lower() for arg in arguments]
+        assert "berlin" in city_values and "paris" in city_values
+        assert len(city_values) == 2
+
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
         reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
