diff --git a/.changeset/perfect-camels-try.md b/.changeset/perfect-camels-try.md
new file mode 100644
index 0000000000..409a598531
--- /dev/null
+++ b/.changeset/perfect-camels-try.md
@@ -0,0 +1,6 @@
+---
+'firebase': minor
+'@firebase/vertexai': minor
+---
+
+Add support for Gemini multimodal output
diff --git a/common/api-review/vertexai.api.md b/common/api-review/vertexai.api.md
index e7f00c2f4e..f9cf3dac5b 100644
--- a/common/api-review/vertexai.api.md
+++ b/common/api-review/vertexai.api.md
@@ -124,6 +124,7 @@ export { Date_2 as Date }
export interface EnhancedGenerateContentResponse extends GenerateContentResponse {
// (undocumented)
functionCalls: () => FunctionCall[] | undefined;
+ inlineDataParts: () => InlineDataPart[] | undefined;
text: () => string;
}
@@ -304,6 +305,8 @@ export interface GenerationConfig {
// (undocumented)
presencePenalty?: number;
responseMimeType?: string;
+ // @beta
+ responseModalities?: ResponseModality[];
responseSchema?: TypedSchema | SchemaRequest;
// (undocumented)
stopSequences?: string[];
@@ -596,6 +599,15 @@ export interface RequestOptions {
timeout?: number;
}
+// @beta
+export const ResponseModality: {
+ readonly TEXT: "TEXT";
+ readonly IMAGE: "IMAGE";
+};
+
+// @beta
+export type ResponseModality = (typeof ResponseModality)[keyof typeof ResponseModality];
+
// @public (undocumented)
export interface RetrievedContextAttribution {
// (undocumented)
diff --git a/docs-devsite/vertexai.enhancedgeneratecontentresponse.md b/docs-devsite/vertexai.enhancedgeneratecontentresponse.md
index 535fb9def8..b557219bf8 100644
--- a/docs-devsite/vertexai.enhancedgeneratecontentresponse.md
+++ b/docs-devsite/vertexai.enhancedgeneratecontentresponse.md
@@ -24,6 +24,7 @@ export interface EnhancedGenerateContentResponse extends GenerateContentResponse
| Property | Type | Description |
| --- | --- | --- |
| [functionCalls](./vertexai.enhancedgeneratecontentresponse.md#enhancedgeneratecontentresponsefunctioncalls) | () => [FunctionCall](./vertexai.functioncall.md#functioncall_interface)\[\] \| undefined | |
+| [inlineDataParts](./vertexai.enhancedgeneratecontentresponse.md#enhancedgeneratecontentresponseinlinedataparts) | () => [InlineDataPart](./vertexai.inlinedatapart.md#inlinedatapart_interface)\[\] \| undefined | Aggregates and returns all [InlineDataPart](./vertexai.inlinedatapart.md#inlinedatapart_interface)s from the [GenerateContentResponse](./vertexai.generatecontentresponse.md#generatecontentresponse_interface)'s first candidate. |
| [text](./vertexai.enhancedgeneratecontentresponse.md#enhancedgeneratecontentresponsetext) | () => string | Returns the text string from the response, if available. Throws if the prompt or candidate was blocked. |
## EnhancedGenerateContentResponse.functionCalls
@@ -34,6 +35,16 @@ export interface EnhancedGenerateContentResponse extends GenerateContentResponse
functionCalls: () => FunctionCall[] | undefined;
```
+## EnhancedGenerateContentResponse.inlineDataParts
+
+Aggregates and returns all [InlineDataPart](./vertexai.inlinedatapart.md#inlinedatapart_interface)s from the [GenerateContentResponse](./vertexai.generatecontentresponse.md#generatecontentresponse_interface)'s first candidate.
+
+Signature:
+
+```typescript
+inlineDataParts: () => InlineDataPart[] | undefined;
+```
+
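+A minimal usage sketch, assuming `model` was created with a `generationConfig` that requests image output (see [GenerationConfig.responseModalities](./vertexai.generationconfig.md#generationconfigresponsemodalities)); the prompt and variable names are illustrative, not part of this API:
+
+```typescript
+// Inside an async function.
+const result = await model.generateContent('Generate an image of a sunset.');
+
+// All inline data (for example, generated images) from the first candidate,
+// or undefined if the response contained none.
+const imageParts = result.response.inlineDataParts();
+if (imageParts) {
+  for (const part of imageParts) {
+    console.log(part.inlineData.mimeType); // e.g. 'image/png'
+  }
+}
+```
+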
## EnhancedGenerateContentResponse.text
Returns the text string from the response, if available. Throws if the prompt or candidate was blocked.
diff --git a/docs-devsite/vertexai.generationconfig.md b/docs-devsite/vertexai.generationconfig.md
index d3e9879f93..360ef70941 100644
--- a/docs-devsite/vertexai.generationconfig.md
+++ b/docs-devsite/vertexai.generationconfig.md
@@ -27,6 +27,7 @@ export interface GenerationConfig
| [maxOutputTokens](./vertexai.generationconfig.md#generationconfigmaxoutputtokens) | number | |
| [presencePenalty](./vertexai.generationconfig.md#generationconfigpresencepenalty) | number | |
| [responseMimeType](./vertexai.generationconfig.md#generationconfigresponsemimetype) | string | Output response MIME type of the generated candidate text. Supported MIME types are text/plain (default, text output), application/json (JSON response in the candidates), and text/x.enum. |
+| [responseModalities](./vertexai.generationconfig.md#generationconfigresponsemodalities) | [ResponseModality](./vertexai.md#responsemodality)\[\] | (Public Preview) Generation modalities to be returned in generation responses. |
| [responseSchema](./vertexai.generationconfig.md#generationconfigresponseschema) | [TypedSchema](./vertexai.md#typedschema) \| [SchemaRequest](./vertexai.schemarequest.md#schemarequest_interface) | Output response schema of the generated candidate text. This value can be a class generated with a [Schema](./vertexai.schema.md#schema_class) static method like Schema.string() or Schema.object() or it can be a plain JS object matching the [SchemaRequest](./vertexai.schemarequest.md#schemarequest_interface) interface.
Note: This only applies when the specified responseMIMEType supports a schema; currently this is limited to application/json and text/x.enum. |
| [stopSequences](./vertexai.generationconfig.md#generationconfigstopsequences) | string\[\] | |
| [temperature](./vertexai.generationconfig.md#generationconfigtemperature) | number | |
@@ -75,6 +76,21 @@ Output response MIME type of the generated candidate text. Supported MIME types
responseMimeType?: string;
```
+## GenerationConfig.responseModalities
+
+> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
+>
+
+Generation modalities to be returned in generation responses.
+
+- Multimodal response generation is only supported by some Gemini models and versions; see [model versions](https://firebase.google.com/docs/vertex-ai/models).
+- Only image generation (`ResponseModality.IMAGE`) is supported.
+
+Signature:
+
+```typescript
+responseModalities?: ResponseModality[];
+```
+
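+A minimal configuration sketch, assuming `vertexAI` is an existing `VertexAI` instance; the import path and model name below are illustrative assumptions:
+
+```typescript
+import { getGenerativeModel, ResponseModality } from 'firebase/vertexai';
+
+// Request both text and image parts in responses. Only model versions that
+// support multimodal output will honor ResponseModality.IMAGE.
+const model = getGenerativeModel(vertexAI, {
+  model: 'gemini-2.0-flash-exp', // illustrative model name
+  generationConfig: {
+    responseModalities: [ResponseModality.TEXT, ResponseModality.IMAGE]
+  }
+});
+```
+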
## GenerationConfig.responseSchema
Output response schema of the generated candidate text. This value can be a class generated with a [Schema](./vertexai.schema.md#schema_class) static method like `Schema.string()` or `Schema.object()` or it can be a plain JS object matching the [SchemaRequest](./vertexai.schemarequest.md#schemarequest_interface) interface.
Note: This only applies when the specified `responseMIMEType` supports a schema; currently this is limited to `application/json` and `text/x.enum`.
diff --git a/docs-devsite/vertexai.md b/docs-devsite/vertexai.md
index f67254eef2..47d45a492e 100644
--- a/docs-devsite/vertexai.md
+++ b/docs-devsite/vertexai.md
@@ -125,12 +125,14 @@ The Vertex AI in Firebase Web SDK.
| Variable | Description |
| --- | --- |
| [POSSIBLE\_ROLES](./vertexai.md#possible_roles) | Possible roles. |
+| [ResponseModality](./vertexai.md#responsemodality) | (Public Preview) Generation modalities to be returned in generation responses. |
## Type Aliases
| Type Alias | Description |
| --- | --- |
| [Part](./vertexai.md#part) | Content part - includes text, image/video, or function call/response part types. |
+| [ResponseModality](./vertexai.md#responsemodality) | (Public Preview) Generation modalities to be returned in generation responses. |
| [Role](./vertexai.md#role) | Role is the producer of the content. |
| [Tool](./vertexai.md#tool) | Defines a tool that model can call to access external knowledge. |
| [TypedSchema](./vertexai.md#typedschema) | A type that includes all specific Schema types. |
@@ -223,6 +225,22 @@ Possible roles.
POSSIBLE_ROLES: readonly ["user", "model", "function", "system"]
```
+## ResponseModality
+
+> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
+>
+
+Generation modalities to be returned in generation responses.
+
+Signature:
+
+```typescript
+ResponseModality: {
+ readonly TEXT: "TEXT";
+ readonly IMAGE: "IMAGE";
+}
+```
+
## Part
Content part - includes text, image/video, or function call/response part types.
@@ -233,6 +251,19 @@ Content part - includes text, image/video, or function call/response part types.
export type Part = TextPart | InlineDataPart | FunctionCallPart | FunctionResponsePart | FileDataPart;
```
+## ResponseModality
+
+> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
+>
+
+Generation modalities to be returned in generation responses.
+
+Signature:
+
+```typescript
+export type ResponseModality = (typeof ResponseModality)[keyof typeof ResponseModality];
+```
+
## Role
Role is the producer of the content.
diff --git a/packages/vertexai/src/requests/response-helpers.test.ts b/packages/vertexai/src/requests/response-helpers.test.ts
index 5371d04025..97dd2f9fe3 100644
--- a/packages/vertexai/src/requests/response-helpers.test.ts
+++ b/packages/vertexai/src/requests/response-helpers.test.ts
@@ -29,6 +29,7 @@ import {
FinishReason,
GenerateContentResponse,
ImagenGCSImage,
+ InlineDataPart,
ImagenInlineImage
} from '../types';
import { getMockResponse } from '../../test-utils/mock-response';
@@ -132,6 +133,44 @@ const fakeResponseMixed3: GenerateContentResponse = {
]
};
+const inlineDataPart1: InlineDataPart = {
+ inlineData: {
+ mimeType: 'image/png',
+ data: 'base64encoded...'
+ }
+};
+
+const inlineDataPart2: InlineDataPart = {
+ inlineData: {
+ mimeType: 'image/jpeg',
+ data: 'anotherbase64...'
+ }
+};
+
+const fakeResponseInlineData: GenerateContentResponse = {
+ candidates: [
+ {
+ index: 0,
+ content: {
+ role: 'model',
+ parts: [inlineDataPart1, inlineDataPart2]
+ }
+ }
+ ]
+};
+
+const fakeResponseTextAndInlineData: GenerateContentResponse = {
+ candidates: [
+ {
+ index: 0,
+ content: {
+ role: 'model',
+ parts: [{ text: 'Describe this:' }, inlineDataPart1]
+ }
+ }
+ ]
+};
+
const badFakeResponse: GenerateContentResponse = {
promptFeedback: {
blockReason: BlockReason.SAFETY,
@@ -148,6 +187,7 @@ describe('response-helpers methods', () => {
const enhancedResponse = addHelpers(fakeResponseText);
expect(enhancedResponse.text()).to.equal('Some text and some more text');
expect(enhancedResponse.functionCalls()).to.be.undefined;
+ expect(enhancedResponse.inlineDataParts()).to.be.undefined;
});
it('good response functionCall', async () => {
const enhancedResponse = addHelpers(fakeResponseFunctionCall);
@@ -155,6 +195,7 @@ describe('response-helpers methods', () => {
expect(enhancedResponse.functionCalls()).to.deep.equal([
functionCallPart1.functionCall
]);
+ expect(enhancedResponse.inlineDataParts()).to.be.undefined;
});
it('good response functionCalls', async () => {
const enhancedResponse = addHelpers(fakeResponseFunctionCalls);
@@ -163,6 +204,7 @@ describe('response-helpers methods', () => {
functionCallPart1.functionCall,
functionCallPart2.functionCall
]);
+ expect(enhancedResponse.inlineDataParts()).to.be.undefined;
});
it('good response text/functionCall', async () => {
const enhancedResponse = addHelpers(fakeResponseMixed1);
@@ -170,6 +212,7 @@ describe('response-helpers methods', () => {
functionCallPart2.functionCall
]);
expect(enhancedResponse.text()).to.equal('some text');
+ expect(enhancedResponse.inlineDataParts()).to.be.undefined;
});
it('good response functionCall/text', async () => {
const enhancedResponse = addHelpers(fakeResponseMixed2);
@@ -177,6 +220,7 @@ describe('response-helpers methods', () => {
functionCallPart1.functionCall
]);
expect(enhancedResponse.text()).to.equal('some text');
+ expect(enhancedResponse.inlineDataParts()).to.be.undefined;
});
it('good response text/functionCall/text', async () => {
const enhancedResponse = addHelpers(fakeResponseMixed3);
@@ -184,10 +228,30 @@ describe('response-helpers methods', () => {
functionCallPart1.functionCall
]);
expect(enhancedResponse.text()).to.equal('some text and more text');
+ expect(enhancedResponse.inlineDataParts()).to.be.undefined;
});
it('bad response safety', async () => {
const enhancedResponse = addHelpers(badFakeResponse);
expect(enhancedResponse.text).to.throw('SAFETY');
+ expect(enhancedResponse.functionCalls).to.throw('SAFETY');
+ expect(enhancedResponse.inlineDataParts).to.throw('SAFETY');
+ });
+ it('good response inlineData', async () => {
+ const enhancedResponse = addHelpers(fakeResponseInlineData);
+ expect(enhancedResponse.text()).to.equal('');
+ expect(enhancedResponse.functionCalls()).to.be.undefined;
+ expect(enhancedResponse.inlineDataParts()).to.deep.equal([
+ inlineDataPart1,
+ inlineDataPart2
+ ]);
+ });
+ it('good response text/inlineData', async () => {
+ const enhancedResponse = addHelpers(fakeResponseTextAndInlineData);
+ expect(enhancedResponse.text()).to.equal('Describe this:');
+ expect(enhancedResponse.functionCalls()).to.be.undefined;
+ expect(enhancedResponse.inlineDataParts()).to.deep.equal([
+ inlineDataPart1
+ ]);
});
});
describe('getBlockString', () => {
diff --git a/packages/vertexai/src/requests/response-helpers.ts b/packages/vertexai/src/requests/response-helpers.ts
index 6d0e3bf2a0..d820f100a5 100644
--- a/packages/vertexai/src/requests/response-helpers.ts
+++ b/packages/vertexai/src/requests/response-helpers.ts
@@ -23,6 +23,7 @@ import {
GenerateContentResponse,
ImagenGCSImage,
ImagenInlineImage,
+ InlineDataPart,
VertexAIErrorCode
} from '../types';
import { VertexAIError } from '../errors';
@@ -89,6 +90,40 @@ export function addHelpers(
}
return '';
};
+ (response as EnhancedGenerateContentResponse).inlineDataParts = ():
+ | InlineDataPart[]
+ | undefined => {
+ if (response.candidates && response.candidates.length > 0) {
+ if (response.candidates.length > 1) {
+ logger.warn(
+ `This response had ${response.candidates.length} ` +
+ `candidates. Returning data from the first candidate only. ` +
+ `Access response.candidates directly to use the other candidates.`
+ );
+ }
+ if (hadBadFinishReason(response.candidates[0])) {
+ throw new VertexAIError(
+ VertexAIErrorCode.RESPONSE_ERROR,
+ `Response error: ${formatBlockErrorMessage(
+ response
+ )}. Response body stored in error.response`,
+ {
+ response
+ }
+ );
+ }
+ return getInlineDataParts(response);
+ } else if (response.promptFeedback) {
+ throw new VertexAIError(
+ VertexAIErrorCode.RESPONSE_ERROR,
+ `Data not available. ${formatBlockErrorMessage(response)}`,
+ {
+ response
+ }
+ );
+ }
+ return undefined;
+ };
(response as EnhancedGenerateContentResponse).functionCalls = () => {
if (response.candidates && response.candidates.length > 0) {
if (response.candidates.length > 1) {
@@ -164,6 +199,31 @@ export function getFunctionCalls(
}
}
+/**
+ * Returns {@link InlineDataPart}s in the first candidate if present.
+ *
+ * @internal
+ */
+export function getInlineDataParts(
+ response: GenerateContentResponse
+): InlineDataPart[] | undefined {
+ const data: InlineDataPart[] = [];
+
+ if (response.candidates?.[0].content?.parts) {
+ for (const part of response.candidates?.[0].content?.parts) {
+ if (part.inlineData) {
+ data.push(part);
+ }
+ }
+ }
+
+ if (data.length > 0) {
+ return data;
+ } else {
+ return undefined;
+ }
+}
+
const badFinishReasons = [FinishReason.RECITATION, FinishReason.SAFETY];
function hadBadFinishReason(candidate: GenerateContentCandidate): boolean {
diff --git a/packages/vertexai/src/types/enums.ts b/packages/vertexai/src/types/enums.ts
index a9481d40f5..d6702a0f1a 100644
--- a/packages/vertexai/src/types/enums.ts
+++ b/packages/vertexai/src/types/enums.ts
@@ -240,3 +240,29 @@ export enum Modality {
*/
DOCUMENT = 'DOCUMENT'
}
+
+/**
+ * Generation modalities to be returned in generation responses.
+ *
+ * @beta
+ */
+export const ResponseModality = {
+ /**
+ * Text.
+ * @beta
+ */
+ TEXT: 'TEXT',
+ /**
+ * Image.
+ * @beta
+ */
+ IMAGE: 'IMAGE'
+} as const;
+
+/**
+ * Generation modalities to be returned in generation responses.
+ *
+ * @beta
+ */
+export type ResponseModality =
+ (typeof ResponseModality)[keyof typeof ResponseModality];
diff --git a/packages/vertexai/src/types/requests.ts b/packages/vertexai/src/types/requests.ts
index c15258b06d..ee45b63667 100644
--- a/packages/vertexai/src/types/requests.ts
+++ b/packages/vertexai/src/types/requests.ts
@@ -21,7 +21,8 @@ import {
FunctionCallingMode,
HarmBlockMethod,
HarmBlockThreshold,
- HarmCategory
+ HarmCategory,
+ ResponseModality
} from './enums';
import { ObjectSchemaInterface, SchemaRequest } from './schema';
@@ -95,6 +96,16 @@ export interface GenerationConfig {
* this is limited to `application/json` and `text/x.enum`.
*/
responseSchema?: TypedSchema | SchemaRequest;
+ /**
+ * Generation modalities to be returned in generation responses.
+ *
+ * @remarks
+ * - Multimodal response generation is only supported by some Gemini models and versions; see {@link https://firebase.google.com/docs/vertex-ai/models | model versions}.
+ * - Only image generation (`ResponseModality.IMAGE`) is supported.
+ *
+ * @beta
+ */
+ responseModalities?: ResponseModality[];
}
/**
diff --git a/packages/vertexai/src/types/responses.ts b/packages/vertexai/src/types/responses.ts
index 7f68df1e67..e4a247bec4 100644
--- a/packages/vertexai/src/types/responses.ts
+++ b/packages/vertexai/src/types/responses.ts
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-import { Content, FunctionCall } from './content';
+import { Content, FunctionCall, InlineDataPart } from './content';
import {
BlockReason,
FinishReason,
@@ -59,6 +59,15 @@ export interface EnhancedGenerateContentResponse
* Throws if the prompt or candidate was blocked.
*/
text: () => string;
+ /**
+ * Aggregates and returns all {@link InlineDataPart}s from the {@link GenerateContentResponse}'s
+ * first candidate.
+ *
+ * @returns An array of {@link InlineDataPart}s containing data from the response, if available.
+ *
+ * @throws If the prompt or candidate was blocked.
+ */
+ inlineDataParts: () => InlineDataPart[] | undefined;
functionCalls: () => FunctionCall[] | undefined;
}