Text-to-speech use case page (#3017)

tsavina · yatarkan · web-flow · commit ca33467743a8 · 2025-11-17T12:51:44.000Z
## Description This PR adds new Speech Generation use case page to OpenVINO GenAI documentation. Preview: [Text-to-speech use case](https://tsavina.github.io/openvino.genai/docs/use-cases/speech-generation/) ## Ticket CVS-169351 ## Checklist: - [ ] Tests have been updated or added to cover the new code - N/A - [ ] This patch fully addresses the ticket.  - [ ] I have made corresponding changes to the documentation.  --------- Co-authored-by: Yaroslav Tarkan <yaroslav.tarkan@intel.com>
diff --git a/site/docs/use-cases/speech-generation/_sections/_run_model/_code_example_cpp.mdx b/site/docs/use-cases/speech-generation/_sections/_run_model/_code_example_cpp.mdx
@@ -0,0 +1,21 @@
+import CodeBlock from '@theme/CodeBlock';
+
+<CodeBlock language="cpp" showLineNumbers>
+{`#include "audio_utils.hpp"
+#include "openvino/genai/speech_generation/text2speech_pipeline.hpp"
+
+int main(int argc, char* argv[]) {
+    std::string model_path = argv[1];  // argv[1]: converted model directory
+    ov::genai::Text2SpeechPipeline pipeline(model_path, "${props.device || 'CPU'}");
+
+    auto result = pipeline.generate("Hello OpenVINO GenAI");  // default speaker
+
+    auto waveform_size = result.speeches[0].get_size();
+    auto waveform_ptr = result.speeches[0].data<const float>();
+    auto bits_per_sample = result.speeches[0].get_element_type().bitwidth();
+    utils::audio::save_to_wav(waveform_ptr, waveform_size, "output_audio.wav", bits_per_sample);
+
+    return 0;
+}
+`}
+</CodeBlock>
diff --git a/site/docs/use-cases/speech-generation/_sections/_run_model/_code_example_python.mdx b/site/docs/use-cases/speech-generation/_sections/_run_model/_code_example_python.mdx
@@ -0,0 +1,15 @@
+import CodeBlock from '@theme/CodeBlock';
+
+<CodeBlock language="python" showLineNumbers>
+{`import openvino_genai
+import soundfile as sf
+
+pipeline = openvino_genai.Text2SpeechPipeline(model_path, "${props.device || 'CPU'}")
+
+# Generate audio using the default speaker
+result = pipeline.generate("Hello OpenVINO GenAI")
+# speech tensor contains the waveform of the spoken phrase
+speech = result.speeches[0]
+sf.write("output_audio.wav", speech.data[0], samplerate=16000)
+`}
+</CodeBlock>
diff --git a/site/docs/use-cases/speech-generation/_sections/_run_model/index.mdx b/site/docs/use-cases/speech-generation/_sections/_run_model/index.mdx
@@ -0,0 +1,97 @@
+import CodeExampleCPP from './_code_example_cpp.mdx';
+import CodeExamplePython from './_code_example_python.mdx';
+
+
+## Run Model Using OpenVINO GenAI
+
+The [`Text2SpeechPipeline`](https://docs.openvino.ai/2025/api/genai_api/_autosummary/openvino_genai.Text2SpeechPipeline.html) is the main object for generating speech from text.
+It automatically loads the TTS model and vocoder from the converted model directory.
+
+<LanguageTabs>
+    <TabItemPython>
+        <Tabs groupId="device">
+            <TabItem label="CPU" value="cpu">
+                <CodeExamplePython device="CPU" />
+            </TabItem>
+            <TabItem label="GPU" value="gpu">
+                <CodeExamplePython device="GPU" />
+            </TabItem>
+        </Tabs>
+    </TabItemPython>
+    <TabItemCpp>
+        <Tabs groupId="device">
+            <TabItem label="CPU" value="cpu">
+                <CodeExampleCPP device="CPU" />
+            </TabItem>
+            <TabItem label="GPU" value="gpu">
+                <CodeExampleCPP device="GPU" />
+            </TabItem>
+        </Tabs>
+    </TabItemCpp>
+</LanguageTabs>
+
+:::tip
+Switch between CPU and GPU by changing only the device name — no other code changes are required.
+:::
+
+## Additional Usage Options
+
+:::tip
+Check out [Python](https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/python/speech_generation/text2speech.py) and [C++](https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/cpp/speech_generation/text2speech.cpp) speech generation samples.
+:::
+
+### Use Speaker Embedding File
+
+When generating speech with the SpeechT5 TTS model, you can select a target voice by providing a speaker embedding file.
+
+This file must contain 512 32-bit floating-point values that represent the voice characteristics of the target speaker. The model will use these characteristics to synthesize the input text in the specified voice.
+
+If no speaker embedding is provided, the model uses the default built-in speaker.
+
+You can generate a speaker embedding using the [create_speaker_embedding.py](https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/python/speech_generation/create_speaker_embedding.py) script. This script records 5 seconds of audio from your microphone and extracts a speaker embedding vector from the recording.
+
+```bash
+python create_speaker_embedding.py
+```
+
+<LanguageTabs>
+    <TabItemPython>
+        ```python
+        import openvino_genai
+        import openvino as ov
+        import numpy as np
+        import soundfile as sf
+
+        pipeline = openvino_genai.Text2SpeechPipeline(model_path, "CPU")
+
+        speaker_embedding = np.fromfile(args.speaker_embedding_file_path, dtype=np.float32).reshape(1, 512)
+        speaker_embedding = ov.Tensor(speaker_embedding)
+        result = pipeline.generate("Hello OpenVINO GenAI", speaker_embedding)
+
+        speech = result.speeches[0]
+        sf.write("output_audio.wav", speech.data[0], samplerate=16000)
+        ```
+    </TabItemPython>
+    <TabItemCpp>
+        ```cpp
+        #include "openvino/genai/speech_generation/text2speech_pipeline.hpp"
+        #include "audio_utils.hpp"
+
+        int main(int argc, char* argv[]) {
+            std::string model_path = argv[1];  // argv[1]: converted model directory
+            ov::genai::Text2SpeechPipeline pipeline(model_path, "CPU");
+
+            auto speaker_embedding = utils::audio::read_speaker_embedding(argv[2]);  // argv[2]: speaker embedding file
+            auto result = pipeline.generate("Hello OpenVINO GenAI", speaker_embedding);
+
+            auto waveform_size = result.speeches[0].get_size();
+            auto waveform_ptr = result.speeches[0].data<const float>();
+            auto bits_per_sample = result.speeches[0].get_element_type().bitwidth();
+            utils::audio::save_to_wav(waveform_ptr, waveform_size, "output_audio.wav", bits_per_sample);
+
+            return 0;
+        }
+        ```
+    </TabItemCpp>
+</LanguageTabs>
+
diff --git a/site/docs/use-cases/speech-generation/index.mdx b/site/docs/use-cases/speech-generation/index.mdx
@@ -0,0 +1,27 @@
+---
+sidebar_position: 7
+---
+import OptimumCLI from '@site/src/components/OptimumCLI';
+import ConvertModelSection from '../_shared/_convert_model.mdx';
+import RunModelSection from './_sections/_run_model/index.mdx';
+
+
+# Speech Generation Using SpeechT5
+
+:::info Note
+Currently, the speech generation pipeline supports the SpeechT5 TTS model.
+The generated audio signal is a single-channel (mono) waveform with a sampling rate of 16 kHz.
+:::
+
+<ConvertModelSection>
+    Download a model (e.g. [speecht5_tts](https://huggingface.co/microsoft/speecht5_tts)) and its vocoder from Hugging Face and convert them to OpenVINO format.
+    SpeechT5 requires specifying a vocoder via `--model-kwargs`:
+
+    <OptimumCLI model='microsoft/speecht5_tts' outputDir='speecht5_tts' weightFormat='int4' modelKwargs={{
+        vocoder: "microsoft/speecht5_hifigan"
+    }} />
+
+    See all supported [Speech Generation Models](/docs/supported-models/#speech-generation-models).
+</ConvertModelSection>
+
+<RunModelSection />
diff --git a/site/src/components/OptimumCLI/index.tsx b/site/src/components/OptimumCLI/index.tsx
@@ -6,6 +6,7 @@ type OptimumCLIProps = {
   weightFormat?: 'fp32' | 'fp16' | 'int8' | 'int4';
   task?: string;
   trustRemoteCode?: boolean;
+  modelKwargs?: Record<string, string>;
 };
 
 export default function OptimumCLI({
@@ -14,6 +15,7 @@ export default function OptimumCLI({
   weightFormat,
   task,
   trustRemoteCode,
+  modelKwargs,
 }: OptimumCLIProps): React.JSX.Element {
   const args = [`--model ${model}`];
   if (weightFormat) {
@@ -25,6 +27,10 @@ export default function OptimumCLI({
   if (trustRemoteCode) {
     args.push('--trust-remote-code');
   }
+  if (modelKwargs) {
+    const kwargsString = JSON.stringify(modelKwargs);
+    args.push(`--model-kwargs '${kwargsString}'`);
+  }
   return (
     <CodeBlock language="bash">{`optimum-cli export openvino ${args.join(
       ' '
diff --git a/site/src/pages/_sections/UseCasesSection/components/speech-generation.tsx b/site/src/pages/_sections/UseCasesSection/components/speech-generation.tsx
@@ -0,0 +1,34 @@
+import Button from '@site/src/components/Button';
+import { LanguageTabs, TabItemCpp, TabItemPython } from '@site/src/components/LanguageTabs';
+
+import UseCaseCard from './UseCaseCard';
+
+import CodeExampleCpp from '@site/docs/use-cases/speech-generation/_sections/_run_model/_code_example_cpp.mdx';
+import CodeExamplePython from '@site/docs/use-cases/speech-generation/_sections/_run_model/_code_example_python.mdx';
+
+export const SpeechGeneration = () => (
+  <UseCaseCard>
+    <UseCaseCard.Title>Speech Generation Using SpeechT5</UseCaseCard.Title>
+    <UseCaseCard.Description>
+      Convert text to speech using SpeechT5 TTS models.
+    </UseCaseCard.Description>
+    <UseCaseCard.Features>
+      <li>Generate natural and expressive speech from text prompts</li>
+      <li>Use speaker embeddings for personalized voice synthesis</li>
+    </UseCaseCard.Features>
+    <UseCaseCard.Code>
+      <LanguageTabs>
+        <TabItemPython>
+          <CodeExamplePython />
+        </TabItemPython>
+        <TabItemCpp>
+          <CodeExampleCpp />
+        </TabItemCpp>
+      </LanguageTabs>
+    </UseCaseCard.Code>
+    <UseCaseCard.Actions>
+      <Button label="Explore Use Case" link="docs/use-cases/speech-generation" variant="primary" />
+      <Button label="View Code Samples" link="docs/samples" variant="primary" outline />
+    </UseCaseCard.Actions>
+  </UseCaseCard>
+);
diff --git a/site/src/pages/_sections/UseCasesSection/index.tsx b/site/src/pages/_sections/UseCasesSection/index.tsx
@@ -5,6 +5,7 @@ import Heading from '@theme/Heading';
 import Link from '@docusaurus/Link';
 import { ImageGeneration } from './components/image-generation';
 import { ImageProcessing } from './components/image-processing';
+import { SpeechGeneration } from './components/speech-generation';
 import { SpeechRecognition } from './components/speech-recognition';
 import { TextGeneration } from './components/text-generation';
 import { TextRerank } from './components/text-rerank';
@@ -19,6 +20,7 @@ export const UseCasesSection = () => (
       <TextGeneration />
       <ImageGeneration />
       <SpeechRecognition />
+      <SpeechGeneration />
       <ImageProcessing />
       <TextEmbedding />
       <TextRerank />