
Commit 06a9eb0

refactor(embed_stream): move to manually maintained files, fix magic numbers
- Move embed_stream() from auto-generated base_client.py to client.py (.fernignore)
- Move StreamedEmbedding and extraction logic to manually_maintained/streaming_embed.py
- Replace magic batch_size=10 with embed_stream_batch_size=96 from config.py (API max)
- Remove overengineered StreamingEmbedParser and ijson dependency
- Remove MEMORY_OPTIMIZATION_PROPOSAL.md
- Revert base_client.py and v2/client.py to Fern baseline
- 9 unit tests, all Fern-safe
1 parent 101d3db commit 06a9eb0

9 files changed

Lines changed: 233 additions & 1058 deletions


MEMORY_OPTIMIZATION_PROPOSAL.md

Lines changed: 0 additions & 145 deletions
This file was deleted.

src/cohere/base_client.py

Lines changed: 0 additions & 104 deletions
@@ -1128,110 +1128,6 @@ def embed(
         )
         return _response.data
 
-    def embed_stream(
-        self,
-        *,
-        texts: typing.Optional[typing.Sequence[str]] = OMIT,
-        model: typing.Optional[str] = OMIT,
-        input_type: typing.Optional[EmbedInputType] = OMIT,
-        embedding_types: typing.Optional[typing.Sequence[EmbeddingType]] = OMIT,
-        truncate: typing.Optional[EmbedRequestTruncate] = OMIT,
-        batch_size: int = 10,
-        request_options: typing.Optional[RequestOptions] = None,
-    ) -> typing.Iterator[typing.Any]:  # Returns Iterator[StreamedEmbedding]
-        """
-        Memory-efficient streaming version of embed that yields embeddings one at a time.
-
-        This method processes texts in batches and yields individual embeddings as they are
-        parsed from the response, without loading all embeddings into memory at once.
-        Ideal for processing large datasets where memory usage is a concern.
-
-        Parameters
-        ----------
-        texts : typing.Optional[typing.Sequence[str]]
-            An array of strings for the model to embed. Will be processed in batches.
-
-        model : typing.Optional[str]
-            ID of one of the available [Embedding models](https://docs.cohere.com/docs/cohere-embed).
-
-        input_type : typing.Optional[EmbedInputType]
-            Specifies the type of input passed to the model.
-
-        embedding_types : typing.Optional[typing.Sequence[EmbeddingType]]
-            Specifies the types of embeddings you want to get back.
-
-        truncate : typing.Optional[EmbedRequestTruncate]
-            One of `NONE|START|END` to specify how the API will handle inputs longer than the maximum token length.
-
-        batch_size : int
-            Number of texts to process in each batch. Default is 10.
-            Lower values use less memory but may be slower overall.
-
-        request_options : typing.Optional[RequestOptions]
-            Request-specific configuration.
-
-        Yields
-        ------
-        StreamedEmbedding
-            Individual embeddings as they are parsed from the response.
-
-        Examples
-        --------
-        from cohere import Client
-
-        client = Client(
-            client_name="YOUR_CLIENT_NAME",
-            token="YOUR_TOKEN",
-        )
-
-        # Process embeddings one at a time without loading all into memory
-        for embedding in client.embed_stream(
-            texts=["hello", "goodbye", "how are you"],
-            model="embed-v4.0",
-            batch_size=2
-        ):
-            print(f"Embedding {embedding.index}: {embedding.embedding[:5]}...")
-            # Process/save embedding immediately
-        """
-        # Validate inputs
-        if texts is None or texts is OMIT:
-            return
-        if batch_size < 1:
-            raise ValueError("batch_size must be at least 1")
-
-        from .streaming_utils import StreamingEmbedParser
-
-        # Process texts in batches
-        texts_list = list(texts)
-        if not texts_list:
-            return
-
-        # Track text index separately from embedding index (for multiple embedding types)
-        global_text_index = 0
-
-        for batch_start in range(0, len(texts_list), batch_size):
-            batch_end = min(batch_start + batch_size, len(texts_list))
-            batch_texts = texts_list[batch_start:batch_end]
-
-            # Get response for this batch
-            response = self._raw_client.embed(
-                texts=batch_texts,
-                model=model,
-                input_type=input_type,
-                embedding_types=embedding_types,
-                truncate=truncate,
-                request_options=request_options,
-            )
-
-            # Parse embeddings from response incrementally
-            parser = StreamingEmbedParser(response._response, batch_texts)
-            for embedding in parser.iter_embeddings():
-                # The parser tracks text index per embedding type
-                # Adjust text reference to use batch_texts mapping
-                text_index_in_batch = batch_texts.index(embedding.text) if embedding.text in batch_texts else 0
-                embedding.index = batch_start + text_index_in_batch
-                yield embedding
-
     def rerank(
         self,
         *,
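
A side note on the deleted index remapping: `batch_texts.index(embedding.text)` returns the first occurrence, so a batch containing duplicate texts would assign them all the same index. A minimal standalone illustration (hypothetical input, not SDK code):

    # Hypothetical batch with a duplicate text, as could be passed to embed_stream.
    batch_texts = ["hello", "world", "hello"]

    # list.index always finds the first match, so the duplicate collapses:
    indices = [batch_texts.index(t) for t in batch_texts]
    print(indices)  # [0, 1, 0] -- the third text's embedding is mislabeled as 0

The replacement in client.py sidesteps this by deriving the index positionally from batch_start instead of from the text value.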

src/cohere/client.py

Lines changed: 56 additions & 1 deletion
@@ -12,7 +12,7 @@
 
 from . import EmbedResponse, EmbedInputType, EmbeddingType, EmbedRequestTruncate
 from .base_client import BaseCohere, AsyncBaseCohere, OMIT
-from .config import embed_batch_size
+from .config import embed_batch_size, embed_stream_batch_size
 from .core import RequestOptions
 from .environment import ClientEnvironment
 from .manually_maintained.cache import CacheMixin
@@ -223,6 +223,61 @@ def embed(
 
         return merge_embed_responses(responses)
 
+    def embed_stream(
+        self,
+        *,
+        texts: typing.Sequence[str],
+        model: typing.Optional[str] = OMIT,
+        input_type: typing.Optional[EmbedInputType] = OMIT,
+        embedding_types: typing.Optional[typing.Sequence[EmbeddingType]] = OMIT,
+        truncate: typing.Optional[EmbedRequestTruncate] = OMIT,
+        batch_size: int = embed_stream_batch_size,
+        request_options: typing.Optional[RequestOptions] = None,
+    ) -> typing.Iterator[typing.Any]:
+        """
+        Memory-efficient embed that yields embeddings one batch at a time.
+
+        Processes texts in batches and yields individual StreamedEmbedding objects
+        as they come back, so you can write to a vector store incrementally without
+        holding all embeddings in memory.
+
+        Args:
+            texts: Texts to embed.
+            model: Embedding model ID.
+            input_type: Input type (search_document, search_query, etc.).
+            embedding_types: Types of embeddings to return (float, int8, etc.).
+            truncate: How to handle inputs longer than the max token length.
+            batch_size: Texts per API call. Defaults to 96 (API max).
+            request_options: Request-specific configuration.
+
+        Yields:
+            StreamedEmbedding with index, embedding, embedding_type, and text.
+        """
+        from .manually_maintained.streaming_embed import extract_embeddings_from_response
+
+        if not texts:
+            return
+        if batch_size < 1:
+            raise ValueError("batch_size must be at least 1")
+
+        texts_list = list(texts)
+
+        for batch_start in range(0, len(texts_list), batch_size):
+            batch_texts = texts_list[batch_start : batch_start + batch_size]
+
+            response = BaseCohere.embed(
+                self,
+                texts=batch_texts,
+                model=model,
+                input_type=input_type,
+                embedding_types=embedding_types,
+                truncate=truncate,
+                request_options=request_options,
+            )
+
+            response_data = response.dict() if hasattr(response, "dict") else response.__dict__
+            yield from extract_embeddings_from_response(response_data, batch_texts, batch_start)
+
     """
     The following methods have been moved or deprecated in cohere==5.0.0. Please update your usage.
     Issues may be filed in https://github.com/cohere-ai/cohere-python/issues.
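
manually_maintained/streaming_embed.py itself is not shown in this view. The following is a plausible sketch of it, assuming only what the diff above establishes: the StreamedEmbedding fields documented in the docstring (index, embedding, embedding_type, text), the extract_embeddings_from_response(response_data, batch_texts, batch_start) call signature, and the two dict shapes a Cohere embed response can take (a plain list of vectors for embeddings_floats, a per-type mapping for embeddings_by_type). The committed implementation may differ:

    import dataclasses
    import typing


    @dataclasses.dataclass
    class StreamedEmbedding:
        """One embedding yielded by embed_stream (fields per the docstring above)."""

        index: int  # Global position of the source text across all batches
        embedding: typing.Any  # List[float] for float embeddings, List[int] for int8, etc.
        embedding_type: str  # "float", "int8", ... ("float" for embeddings_floats responses)
        text: str  # The text this embedding was computed for


    def extract_embeddings_from_response(
        response_data: typing.Dict[str, typing.Any],
        batch_texts: typing.Sequence[str],
        batch_start: int,
    ) -> typing.Iterator[StreamedEmbedding]:
        """Yield StreamedEmbedding objects from one embed() response dict."""
        embeddings = response_data.get("embeddings")
        if embeddings is None:
            return
        if isinstance(embeddings, dict):
            # embeddings_by_type: {"float": [[...], ...], "int8": [[...], ...], ...}
            for embedding_type, vectors in embeddings.items():
                if vectors is None:
                    continue
                for i, vector in enumerate(vectors):
                    yield StreamedEmbedding(
                        index=batch_start + i,
                        embedding=vector,
                        embedding_type=embedding_type,
                        text=batch_texts[i],
                    )
        else:
            # embeddings_floats: a plain list of float vectors, one per input text
            for i, vector in enumerate(embeddings):
                yield StreamedEmbedding(
                    index=batch_start + i,
                    embedding=vector,
                    embedding_type="float",
                    text=batch_texts[i],
                )

Deriving index positionally (batch_start + i) keeps it correct even when the same text appears twice, which is exactly where the removed text-based remapping went wrong.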

src/cohere/config.py

Lines changed: 1 addition & 0 deletions
@@ -1 +1,2 @@
 embed_batch_size = 96
+embed_stream_batch_size = 96  # Max texts per API request (API limit)
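
End to end, usage matches the example from the removed base_client.py docstring, minus the magic batch size (the 96-text default from config.py now applies; credentials are placeholders):

    from cohere import Client

    client = Client(
        client_name="YOUR_CLIENT_NAME",
        token="YOUR_TOKEN",
    )

    # Embeddings arrive batch by batch (96 texts per API call by default),
    # so each one can be persisted immediately instead of buffered in memory.
    for embedding in client.embed_stream(
        texts=["hello", "goodbye", "how are you"],
        model="embed-v4.0",
    ):
        print(f"Embedding {embedding.index}: {embedding.embedding[:5]}...")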
