Skip to content

Commit f687b08

Browse files
committed
Updated speechmatics adapter to create paragraphs according to speaker name instead of end-of-sentence symbols.
1 parent 33b7523 commit f687b08

File tree

1 file changed

+34
-30
lines changed
  • packages/stt-adapters/speechmatics

1 file changed

+34
-30
lines changed

packages/stt-adapters/speechmatics/index.js

Lines changed: 34 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -5,34 +5,6 @@
55

66
import generateEntitiesRanges from '../generate-entities-ranges/index.js';
77

8-
/**
9-
* groups words list from speechmatics based on punctuation.
10-
* @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
11-
* @todo As this function is also used in the bbc-kaldi adapter, should it be refactored into its own file?
12-
* @param {array} words - array of words objects from speechmatics transcript
13-
*/
14-
const groupWordsInParagraphs = (words) => {
15-
const results = [];
16-
let paragraph = { words: [], text: [] };
17-
debugger;
18-
19-
words.forEach((word) => {
20-
// if word contains punctuation
21-
if (/[.?!]/.test(word.punct)) {
22-
paragraph.words.push(word);
23-
paragraph.text.push(word.punct);
24-
results.push(paragraph);
25-
// reset paragraph
26-
paragraph = { words: [], text: [] };
27-
} else {
28-
paragraph.words.push(word);
29-
paragraph.text.push(word.punct);
30-
}
31-
});
32-
33-
return results;
34-
};
35-
368
/**
379
* Determines the speaker of a paragraph by comparing the start time of the paragraph with
3810
* the speaker times.
@@ -51,6 +23,38 @@ const getSpeaker = (start, speakers) => {
5123
return 'UNK';
5224
};
5325

26+
/**
27+
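* groups words list from speechmatics into paragraphs, starting a new paragraph whenever the speaker (looked up from `speakers` by each word's start time) changes.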
28+
* @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
29+
* @todo As this function is also used in the bbc-kaldi adapter, should it be refactored into its own file?
30+
* @param {array} words - array of words objects from speechmatics transcript
31+
*/
32+
const groupWordsInParagraphs = (words, speakers) => {
33+
const results = [];
34+
let paragraph = { words: [], text: [], speaker: '' };
35+
let oldSpeaker = getSpeaker(words[0].start, speakers);
36+
let newSpeaker;
37+
38+
words.forEach((word) => {
39+
newSpeaker = getSpeaker(word.start, speakers);
40+
// if speaker changes
41+
if (newSpeaker !== oldSpeaker) {
42+
paragraph.speaker = oldSpeaker;
43+
results.push(paragraph);
44+
oldSpeaker = newSpeaker;
45+
// reset paragraph
46+
paragraph = { words: [], text: [] };
47+
}
48+
paragraph.words.push(word);
49+
paragraph.text.push(word.punct);
50+
});
51+
52+
paragraph.speaker = oldSpeaker;
53+
results.push(paragraph);
54+
55+
return results;
56+
};
57+
5458
/**
5559
* Speechmatics treats punctuation as its own words. This function merges punctuations with
5660
* the previous word and adjusts the total duration of the word.
@@ -97,15 +101,15 @@ const speechmaticsToDraft = (speechmaticsJson) => {
97101
});
98102
});
99103

100-
const wordsByParagraphs = groupWordsInParagraphs(tmpWords);
104+
const wordsByParagraphs = groupWordsInParagraphs(tmpWords, tmpSpeakers);
101105

102106
wordsByParagraphs.forEach((paragraph) => {
103107
const paragraphStart = paragraph.words[0].start;
104108
const draftJsContentBlockParagraph = {
105109
text: paragraph.text.join(' '),
106110
type: 'paragraph',
107111
data: {
108-
speaker: getSpeaker(paragraphStart, tmpSpeakers),
112+
speaker: paragraph.speaker,
109113
words: paragraph.words,
110114
start: paragraphStart
111115
},

0 commit comments

Comments
 (0)