Updated speechmatics adapter to create paragraphs according to speaker name instead of end-of-sentence symbols.

murezzda · murezzda · commit f687b08cf8e1 · 2019-08-23T11:21:19.000+02:00
diff --git a/packages/stt-adapters/speechmatics/index.js b/packages/stt-adapters/speechmatics/index.js
@@ -5,34 +5,6 @@
 
 import generateEntitiesRanges from '../generate-entities-ranges/index.js';
 
-/**
- * groups words list from speechmatics based on punctuation.
- * @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
- * @todo As this function is also used in the bbc-kaldi adapter, should it be refactored into its own file?
- * @param {array} words - array of words objects from speechmatics transcript
- */
-const groupWordsInParagraphs = (words) => {
-  const results = [];
-  let paragraph = { words: [], text: [] };
-  debugger;
-
-  words.forEach((word) => {
-    // if word contains punctuation
-    if (/[.?!]/.test(word.punct)) {
-      paragraph.words.push(word);
-      paragraph.text.push(word.punct);
-      results.push(paragraph);
-      // reset paragraph
-      paragraph = { words: [], text: [] };
-    } else {
-      paragraph.words.push(word);
-      paragraph.text.push(word.punct);
-    }
-  });
-
-  return results;
-};
-
 /**
  * Determines the speaker of a paragraph by comparing the start time of the paragraph with
  * the speaker times.
@@ -51,6 +23,38 @@ const getSpeaker = (start, speakers) => {
   return 'UNK';
 };
 
+/**
+ * Groups the words list from speechmatics into paragraphs by speaker: a new
+ * paragraph starts whenever the speaker returned by getSpeaker changes.
+ * @todo As this function is also used in the bbc-kaldi adapter, should it be refactored into its own file?
+ * @param {array} words - array of words objects from speechmatics transcript
+ * @param {*} speakers - speaker timing data, passed unchanged to getSpeaker
+ * @returns {array} paragraphs shaped { words, text, speaker }; empty when words is empty
+ */
+const groupWordsInParagraphs = (words, speakers) => {
+  const results = [];
+  // an empty transcript has no words[0] to read the first speaker from
+  if (words.length === 0) {
+    return results;
+  }
+  let paragraph = { words: [], text: [], speaker: getSpeaker(words[0].start, speakers) };
+
+  words.forEach((word) => {
+    const speaker = getSpeaker(word.start, speakers);
+    // if speaker changes, close the current paragraph and open one for the new speaker
+    if (speaker !== paragraph.speaker) {
+      results.push(paragraph);
+      paragraph = { words: [], text: [], speaker };
+    }
+    paragraph.words.push(word);
+    paragraph.text.push(word.punct);
+  });
+
+  // the paragraph still being filled when the loop ends is the last one
+  results.push(paragraph);
+  return results;
+};
+
 /**
  * Speechmatics treats punctuation as own words. This function merges punctuations with
  * the pevious word and adjusts the total duration of the word.
@@ -97,15 +101,15 @@ const speechmaticsToDraft = (speechmaticsJson) => {
     });
   });
 
-  const wordsByParagraphs = groupWordsInParagraphs(tmpWords);
+  const wordsByParagraphs = groupWordsInParagraphs(tmpWords, tmpSpeakers);
 
   wordsByParagraphs.forEach((paragraph) => {
     const paragraphStart = paragraph.words[0].start;
     const draftJsContentBlockParagraph = {
       text: paragraph.text.join(' '),
       type: 'paragraph',
       data: {
-        speaker: getSpeaker(paragraphStart, tmpSpeakers),
+        speaker: paragraph.speaker,
         words: paragraph.words,
         start: paragraphStart
       },