transcript store enhancements (#1879)

yujonglee · web-flow · commit 05181e21a5c2 · 2025-11-25T21:15:02.000+09:00
* feat(transcript): preserve and reindex partial hints after filter

Update transcript listener to correctly handle partial words and their
associated hints when older partial words are removed due to final
segments. Replace inline filtering logic with a two-step approach:
- collect existing partial words into a variable and filter them into
  remainingPartialWords
- build a mapping from old word indices to new indices for the kept
  partial words, then filter and remap partialHints accordingly
This prevents mismatches where hints still point to removed word
indices and ensures hints reference the correct remaining partial
word after filtering. Add a unit test that simulates partial then
final responses and asserts that remaining partial words and hints
are consistent and correctly reindexed.

* Make partial speaker hints channel-aware

Avoid cross-channel hint mis-assignment by scoping partial speaker hints to their channel. The change renames partialHints to partialHintsByChannel (a Record<number, RuntimeSpeakerHint[]>), updates the initial state and all reads/writes to use partialHintsByChannel[channelIndex], and adjusts the filtering/remapping and reset logic to operate on per-channel hint arrays. Tests and a component consumer were updated to read from partialHintsByChannel as well.

* Fix hint wordIndex alignment across channels

Flatten remainingWords once and reindex per-channel hints so RuntimeSpeakerHint.wordIndex refers to positions in the flattened array. Previously hints were flattened independently and kept per-channel indices, which misaligned hints for channels > 0. This change computes offsets for each channel by accumulating prior partialWords lengths, adjusts each hint.wordIndex by its channel offset, and passes the flattened words and reindexed hints to handlePersist.

* Reindex flattened speaker hints by cumulative word offsets

Flattening partial hints across channels left each hint.wordIndex relative
to its original channel, producing incorrect indices when channels are
concatenated. Compute cumulative word offsets for each channel (summing the
lengths of prior channels' partialWords) and reindex each RuntimeSpeakerHint
by adding the channel's offset. Also import the RuntimeSpeakerHint type so
reindexed hints have the correct shape.
diff --git a/apps/desktop/src/components/main/body/sessions/note-input/transcript/shared/index.tsx b/apps/desktop/src/components/main/body/sessions/note-input/transcript/shared/index.tsx
@@ -4,6 +4,7 @@ import { cn } from "@hypr/utils";
 
 import { useListener } from "../../../../../../../contexts/listener";
 import * as main from "../../../../../../../store/tinybase/main";
+import type { RuntimeSpeakerHint } from "../../../../../../../utils/segment";
 import { useAutoScroll, useScrollDetection } from "./hooks";
 import { Operations } from "./operations";
 import { RenderTranscript } from "./render-transcript";
@@ -32,7 +33,32 @@ export function TranscriptContainer({
   const partialWords = useListener((state) =>
     Object.values(state.partialWordsByChannel).flat(),
   );
-  const partialHints = useListener((state) => state.partialHints);
+  const partialHints = useListener((state) => {
+    const channelIndices = Object.keys(state.partialWordsByChannel)
+      .map(Number)
+      .sort((a, b) => a - b);
+
+    const offsetByChannel = new Map<number, number>();
+    let currentOffset = 0;
+    for (const channelIndex of channelIndices) {
+      offsetByChannel.set(channelIndex, currentOffset);
+      currentOffset += state.partialWordsByChannel[channelIndex]?.length ?? 0;
+    }
+
+    const reindexedHints: RuntimeSpeakerHint[] = [];
+    for (const channelIndex of channelIndices) {
+      const hints = state.partialHintsByChannel[channelIndex] ?? [];
+      const offset = offsetByChannel.get(channelIndex) ?? 0;
+      for (const hint of hints) {
+        reindexedHints.push({
+          ...hint,
+          wordIndex: hint.wordIndex + offset,
+        });
+      }
+    }
+
+    return reindexedHints;
+  });
 
   const containerRef = useRef<HTMLDivElement>(null);
   const [scrollElement, setScrollElement] = useState<HTMLDivElement | null>(
diff --git a/apps/desktop/src/store/zustand/listener/transcript.test.ts b/apps/desktop/src/store/zustand/listener/transcript.test.ts
@@ -101,9 +101,9 @@ describe("transcript slice", () => {
       " Another",
       " problem",
     ]);
-    expect(stateAfterFirst.partialHints).toHaveLength(2);
-    expect(stateAfterFirst.partialHints[0]?.wordIndex).toBe(0);
-    expect(stateAfterFirst.partialHints[1]?.wordIndex).toBe(1);
+    expect(stateAfterFirst.partialHintsByChannel[0]).toHaveLength(2);
+    expect(stateAfterFirst.partialHintsByChannel[0]?.[0]?.wordIndex).toBe(0);
+    expect(stateAfterFirst.partialHintsByChannel[0]?.[1]?.wordIndex).toBe(1);
 
     const extendedPartial = createResponse({
       words: [
@@ -132,9 +132,9 @@ describe("transcript slice", () => {
       " problem",
       " exists",
     ]);
-    expect(stateAfterSecond.partialHints).toHaveLength(3);
-    const lastPartialHint =
-      stateAfterSecond.partialHints[stateAfterSecond.partialHints.length - 1];
+    const channelHints = stateAfterSecond.partialHintsByChannel[0] ?? [];
+    expect(channelHints).toHaveLength(3);
+    const lastPartialHint = channelHints[channelHints.length - 1];
     expect(lastPartialHint?.wordIndex).toBe(2);
   });
 
@@ -187,4 +187,91 @@ describe("transcript slice", () => {
     expect(persist).toHaveBeenCalledTimes(1);
     expect(store.getState().finalWordsMaxEndMsByChannel[0]).toBe(1500);
   });
+
+  test("adjusts partial hint indices after filtering partial words", () => {
+    const persist = vi.fn();
+    store.getState().setTranscriptPersist(persist);
+
+    const partialResponse = createResponse({
+      words: [
+        {
+          word: "hello",
+          punctuated_word: "Hello",
+          start: 0,
+          end: 0.5,
+          confidence: 1,
+          speaker: 0,
+          language: "en",
+        },
+        {
+          word: "world",
+          punctuated_word: "world",
+          start: 0.5,
+          end: 1.0,
+          confidence: 1,
+          speaker: 1,
+          language: "en",
+        },
+        {
+          word: "test",
+          punctuated_word: "test",
+          start: 1.1,
+          end: 1.5,
+          confidence: 1,
+          speaker: 0,
+          language: "en",
+        },
+      ],
+      transcript: "Hello world test",
+      isFinal: false,
+    });
+
+    store.getState().handleTranscriptResponse(partialResponse);
+
+    const stateAfterPartial = store.getState();
+    expect(stateAfterPartial.partialWordsByChannel[0]).toHaveLength(3);
+    expect(stateAfterPartial.partialHintsByChannel[0]).toHaveLength(3);
+
+    const finalResponse = createResponse({
+      words: [
+        {
+          word: "hello",
+          punctuated_word: "Hello",
+          start: 0,
+          end: 0.5,
+          confidence: 1,
+          speaker: 0,
+          language: "en",
+        },
+        {
+          word: "world",
+          punctuated_word: "world",
+          start: 0.5,
+          end: 1.0,
+          confidence: 1,
+          speaker: 1,
+          language: "en",
+        },
+      ],
+      transcript: "Hello world",
+      isFinal: true,
+    });
+
+    store.getState().handleTranscriptResponse(finalResponse);
+
+    const stateAfterFinal = store.getState();
+    const remainingPartialWords = stateAfterFinal.partialWordsByChannel[0];
+    const remainingHints = stateAfterFinal.partialHintsByChannel[0] ?? [];
+
+    expect(remainingPartialWords).toHaveLength(1);
+    expect(remainingPartialWords?.[0]?.text).toBe(" test");
+
+    expect(remainingHints).toHaveLength(1);
+    expect(remainingHints[0]?.wordIndex).toBe(0);
+
+    const hintedWord =
+      remainingPartialWords?.[remainingHints[0]?.wordIndex ?? -1];
+    expect(hintedWord).toBeDefined();
+    expect(hintedWord?.text).toBe(" test");
+  });
 });
diff --git a/apps/desktop/src/store/zustand/listener/transcript.ts b/apps/desktop/src/store/zustand/listener/transcript.ts
@@ -16,7 +16,7 @@ export type HandlePersistCallback = (
 export type TranscriptState = {
   finalWordsMaxEndMsByChannel: Record<number, number>;
   partialWordsByChannel: WordsByChannel;
-  partialHints: RuntimeSpeakerHint[];
+  partialHintsByChannel: Record<number, RuntimeSpeakerHint[]>;
   handlePersist?: HandlePersistCallback;
 };
 
@@ -29,7 +29,7 @@ export type TranscriptActions = {
 const initialState: TranscriptState = {
   finalWordsMaxEndMsByChannel: {},
   partialWordsByChannel: {},
-  partialHints: [],
+  partialHintsByChannel: {},
   handlePersist: undefined,
 };
 
@@ -46,7 +46,7 @@ export const createTranscriptSlice = <
   ): void => {
     const {
       partialWordsByChannel,
-      partialHints,
+      partialHintsByChannel,
       handlePersist,
       finalWordsMaxEndMsByChannel,
     } = get();
@@ -69,20 +69,32 @@ export const createTranscriptSlice = <
         wordIndex: hint.wordIndex - firstNewWordIndex,
       }));
 
-    const remainingPartialWords = (
-      partialWordsByChannel[channelIndex] ?? []
-    ).filter((word) => word.start_ms > lastEndMs);
+    const existingPartialWords = partialWordsByChannel[channelIndex] ?? [];
+    const remainingPartialWords = existingPartialWords.filter(
+      (word) => word.start_ms > lastEndMs,
+    );
 
-    const remainingPartialHints = partialHints.filter((hint) => {
-      const partialWords = partialWordsByChannel[channelIndex] ?? [];
-      const word = partialWords[hint.wordIndex];
-      return word && word.start_ms > lastEndMs;
-    });
+    const oldToNewIndex = new Map<number, number>();
+    let newIdx = 0;
+    for (let oldIdx = 0; oldIdx < existingPartialWords.length; oldIdx++) {
+      if (existingPartialWords[oldIdx].start_ms > lastEndMs) {
+        oldToNewIndex.set(oldIdx, newIdx);
+        newIdx++;
+      }
+    }
+
+    const existingPartialHints = partialHintsByChannel[channelIndex] ?? [];
+    const remainingPartialHints = existingPartialHints
+      .filter((hint) => oldToNewIndex.has(hint.wordIndex))
+      .map((hint) => ({
+        ...hint,
+        wordIndex: oldToNewIndex.get(hint.wordIndex)!,
+      }));
 
     set((state) =>
       mutate(state, (draft) => {
         draft.partialWordsByChannel[channelIndex] = remainingPartialWords;
-        draft.partialHints = remainingPartialHints;
+        draft.partialHintsByChannel[channelIndex] = remainingPartialHints;
         draft.finalWordsMaxEndMsByChannel[channelIndex] = lastEndMs;
       }),
     );
@@ -95,7 +107,7 @@ export const createTranscriptSlice = <
     words: WordLike[],
     hints: RuntimeSpeakerHint[],
   ): void => {
-    const { partialWordsByChannel, partialHints } = get();
+    const { partialWordsByChannel, partialHintsByChannel } = get();
     const existing = partialWordsByChannel[channelIndex] ?? [];
 
     const firstStartMs = getFirstStartMs(words);
@@ -113,7 +125,8 @@ export const createTranscriptSlice = <
       wordIndex: before.length + hint.wordIndex,
     }));
 
-    const filteredOldHints = partialHints.filter((hint) => {
+    const existingHints = partialHintsByChannel[channelIndex] ?? [];
+    const filteredOldHints = existingHints.filter((hint) => {
       const word = existing[hint.wordIndex];
       return (
         word && (word.end_ms <= firstStartMs || word.start_ms >= lastEndMs)
@@ -123,7 +136,10 @@ export const createTranscriptSlice = <
     set((state) =>
       mutate(state, (draft) => {
         draft.partialWordsByChannel[channelIndex] = newWords;
-        draft.partialHints = [...filteredOldHints, ...hintsWithAdjustedIndices];
+        draft.partialHintsByChannel[channelIndex] = [
+          ...filteredOldHints,
+          ...hintsWithAdjustedIndices,
+        ];
       }),
     );
   };
@@ -160,17 +176,42 @@ export const createTranscriptSlice = <
       }
     },
     resetTranscript: () => {
-      const { partialWordsByChannel, partialHints, handlePersist } = get();
+      const { partialWordsByChannel, partialHintsByChannel, handlePersist } =
+        get();
 
       const remainingWords = Object.values(partialWordsByChannel).flat();
+
+      const channelIndices = Object.keys(partialWordsByChannel)
+        .map(Number)
+        .sort((a, b) => a - b);
+
+      const offsetByChannel = new Map<number, number>();
+      let currentOffset = 0;
+      for (const channelIndex of channelIndices) {
+        offsetByChannel.set(channelIndex, currentOffset);
+        currentOffset += partialWordsByChannel[channelIndex]?.length ?? 0;
+      }
+
+      const remainingHints: RuntimeSpeakerHint[] = [];
+      for (const channelIndex of channelIndices) {
+        const hints = partialHintsByChannel[channelIndex] ?? [];
+        const offset = offsetByChannel.get(channelIndex) ?? 0;
+        for (const hint of hints) {
+          remainingHints.push({
+            ...hint,
+            wordIndex: hint.wordIndex + offset,
+          });
+        }
+      }
+
       if (remainingWords.length > 0) {
-        handlePersist?.(remainingWords, partialHints);
+        handlePersist?.(remainingWords, remainingHints);
       }
 
       set((state) =>
         mutate(state, (draft) => {
           draft.partialWordsByChannel = {};
-          draft.partialHints = [];
+          draft.partialHintsByChannel = {};
           draft.finalWordsMaxEndMsByChannel = {};
           draft.handlePersist = undefined;
         }),