bbc · pietrop · Mar 22, 2019 · Jan 17, 2019 · Jan 18, 2019 · Jan 18, 2019
diff --git a/.eslintrc b/.eslintrc
@@ -13,6 +13,8 @@
     "jest": true
   },
   "rules": {
+    "prefer-const": 1,
+    "space-infix-ops": ["error", {"int32Hint": false}],
     "no-unused-expressions": "error",
     "no-trailing-spaces": "error",
     "no-nested-ternary": "error",
@@ -28,7 +30,6 @@
     "quotes": [ 1, "single", "avoid-escape" ],
     "no-use-before-define": [ 2, { "functions": false } ],
     "semi": [1, "always"],
-    "prefer-const": 1,
     "react/prefer-es6-class": 0,
     "react/jsx-filename-extension": 0,
     "react/jsx-curly-spacing": [ 2, "always" ],

diff --git a/src/index.js b/src/index.js
@@ -3,7 +3,8 @@ import { render } from 'react-dom';
 
 import { TranscriptEditor } from './lib';
 
-import kaldiTedTalkTranscript from './sample-data/KateDarling_2018S-bbc-kaldi.json';
+// import kaldiTedTalkTranscript from './sample-data/KateDarling_2018S-bbc-kaldi.json';
+import kaldiTedTalkTranscript from './sample-data/KateDarling-bbcKaldiTranscriptWithSpeakerSegments.json';
 import style from './index.module.css';
 import SttTypeSelect from './select-stt-json-type';
 import ExportFormatSelect from './select-export-format';

diff --git a/src/lib/Util/adapters/bbc-kaldi/group-words-by-speakers.js b/src/lib/Util/adapters/bbc-kaldi/group-words-by-speakers.js
@@ -0,0 +1,118 @@
+/**
+edge cases
+- more segments than words - not an issue if you start by matching words with segments
+and handle the edge case where no match is found
+- more words than segments - orphan words
+ */
+function groupWordsInParagraphsBySpeakers(words, segments) {
+  // add speakers to each word
+  const wordsWithSpeakers = addSpeakerToEachWord(words, segments.segments);
+  // group words by speakers sequentially
+  const result = groupWordsBySpeaker(wordsWithSpeakers);
+
+  return result;
+};
+
+/**
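+* Adds a speaker label to each word (sets `word.speaker` on the word objects passed in).
+* If no segment contains the word, it gets the unknown speaker label `U_UKN`.
+* @param {array} words - list of word objects
+* @param {array} segments - list of segment objects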
+*/
+function addSpeakerToEachWord(words, segments) {
+  const tmpWordsWithSpeakers = [];
+  words.forEach((word) => {
+    const tmpSpeakerSegment = findSegmentForWord(word, segments);
+
+    word.speaker = formatSpeakerName(tmpSpeakerSegment.speaker);
+    tmpWordsWithSpeakers.push(word);
+  });
+
+  return tmpWordsWithSpeakers;
+}
+
+/**
+ * Groups Words by speaker attribute
+ * @param {array} wordsWithSpeakers - same as kaldi words list but with a `speaker` label attribute on each word
+ * @return {array} - list of paragraph objects, with words, text and speaker attributes.
+ * where words is an array and the other two are strings.
+ */
+function groupWordsBySpeaker(wordsWithSpeakers) {
+  let currentSpeaker = wordsWithSpeakers[0].speaker;
+  const results = [ ];
+  let paragraph = { words: [], text: '', speaker: '' };
+  wordsWithSpeakers.forEach((word) => {
+    // if current speaker same as word speaker add words to paragraph
+    if (currentSpeaker === word.speaker) {
+      paragraph.words.push(word);
+      paragraph.text += word.punct + ' ';
+      paragraph.speaker = currentSpeaker;
+    }
+    // if it's not same speaker
+    else {
+      // update current speaker
+      currentSpeaker = word.speaker;
+      // remove spacing in text
+      paragraph.text = paragraph.text.trim();
+      // save the previous paragraph
+      results.push(paragraph);
+      // reset paragraph (speaker stays 'U_UKN' unless the next word has the same speaker)
+      paragraph = { words: [], text: '', speaker: 'U_UKN' };
+      // add the word to the new paragraph
+      paragraph.words.push(word);
+      paragraph.text += word.punct + ' ';
+    }
+  });
+  // add last paragraph
+  results.push(paragraph);
+
+  return results;
+}
+
+/**
+* Helper functions
+*/
+
+/**
+* Given a word's start and end time attributes,
+* looks for the segment whose time range contains that word.
+* If it doesn't find any, it returns a segment with `UKN`
+* speaker attributes.
+* @param {object} word - word object
+* @param {array} segments - list of segments objects
+* @return {object} - a single segment whose range contains the word
+*/
+function findSegmentForWord(word, segments) {
+
+  const tmpSegment = segments.find((seg) => {
+    const segEnd = seg.start + seg.duration;
+
+    return ((word.start >= seg.start) && (word.end <= segEnd));
+  });
+  // Array.prototype.find returns undefined when no element matches
+  if (tmpSegment === undefined) {
+    // covering edge case orphan word not belonging to any segments
+    // adding UKN speaker label
+    return {
+      '@type': 'Segment',
+      // keeping both speaker id and gender as this is used later
+      // to format speaker label combining the two
+      speaker: { '@id': 'UKN', gender: 'U' }
+    };
+  } else {
+    // find returns the first element that matches the criteria
+    return tmpSegment;
+  }
+}
+
+/**
+* Formats a kaldi speaker object into a string,
+* combining gender and speaker id.
+* @param {object} speaker - BBC kaldi speaker object
+* @return {string} - `<gender>_<@id>`, e.g. `U_UKN`
+*/
+function formatSpeakerName(speaker) {
+  return speaker.gender + '_' + speaker['@id'];
+}
+
+export default groupWordsInParagraphsBySpeakers;
diff --git a/src/lib/Util/adapters/bbc-kaldi/groups-words-by-speakers.test.js b/src/lib/Util/adapters/bbc-kaldi/groups-words-by-speakers.test.js
@@ -0,0 +1,25 @@
+import groupWordsInParagraphsBySpeakers from './group-words-by-speakers';
+
+import kaldiTedTalkTranscript from './sample/bbcKaldiTranscriptWithSpeakerSegments.sample.json';
+
+const segmentation = kaldiTedTalkTranscript.retval.segmentation;
+const words = kaldiTedTalkTranscript.retval.words;
+
+describe('groupWordsInParagraphsBySpeakers', () => {
+  /**
+     * It is hard to test whether the segmentation algorithm is working properly,
+     * but one basic test for now is to check that the result contains
+     * the same number of words as the input.
+     */
+  it('Expect same word count in results', ( ) => {
+
+    const wordsByParagraphs = groupWordsInParagraphsBySpeakers(words, segmentation);
+
+    const resultWordCount = wordsByParagraphs.reduce(reduceFunction, 0);
+    function reduceFunction(total, currentParagraph) {
+      return total + currentParagraph.words.length;
+    };
+
+    expect(resultWordCount).toBe(words.length);
+  });
+});
diff --git a/src/lib/Util/adapters/bbc-kaldi/index.js b/src/lib/Util/adapters/bbc-kaldi/index.js
@@ -5,22 +5,23 @@
  */
 
 import generateEntitiesRanges from '../generate-entities-ranges/index.js';
-
+import groupWordsInParagraphsBySpeakers from './group-words-by-speakers.js';
 /**
  * groups words list from kaldi transcript based on punctuation.
  * @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
  * @param {array} words - array of words opbjects from kaldi transcript
  */
 
-const groupWordsInParagraphs = (words) => {
+const groupWordsInParagraphs = words => {
   const results = [];
   let paragraph = { words: [], text: [] };
 
-  words.forEach((word) => {
+  words.forEach(word => {
     // if word contains punctuation
     if (/[.?!]/.test(word.punct)) {
       paragraph.words.push(word);
       paragraph.text.push(word.punct);
+      paragraph.text = paragraph.text.join(' ');
       results.push(paragraph);
       // reset paragraph
       paragraph = { words: [], text: [] };
@@ -33,35 +34,55 @@ const groupWordsInParagraphs = (words) => {
   return results;
 };
 
-const bbcKaldiToDraft = (bbcKaldiJson) => {
+const bbcKaldiToDraft = bbcKaldiJson => {
   const results = [];
   let tmpWords;
+  let speakerSegmentation = null;
+  let wordsByParagraphs = [];
 
   // BBC Octo Labs API Response wraps Kaldi response around retval,
   // while kaldi contains word attribute at root
   if (bbcKaldiJson.retval !== undefined) {
     tmpWords = bbcKaldiJson.retval.words;
+    if (bbcKaldiJson.retval.segmentation !== undefined) {
+      speakerSegmentation = bbcKaldiJson.retval.segmentation;
+    }
   } else {
     tmpWords = bbcKaldiJson.words;
+    if (bbcKaldiJson.segmentation !== undefined) {
+      speakerSegmentation = bbcKaldiJson.segmentation;
+    }
   }
 
-  const wordsByParagraphs = groupWordsInParagraphs(tmpWords);
+  if (speakerSegmentation === null) {
+    wordsByParagraphs = groupWordsInParagraphs(tmpWords);
+  } else {
+    wordsByParagraphs = groupWordsInParagraphsBySpeakers(tmpWords, speakerSegmentation);
+  }
 
   wordsByParagraphs.forEach((paragraph, i) => {
-    const draftJsContentBlockParagraph = {
-      text: paragraph.text.join(' '),
-      type: 'paragraph',
-      data: {
-        speaker: `TBC ${ i }`,
-        words: paragraph.words,
-        start: paragraph.words[0].start
-      },
-      // the entities as ranges are each word in the space-joined text,
-      // so it needs to be compute for each the offset from the beginning of the paragraph and the length
-      entityRanges: generateEntitiesRanges(paragraph.words, 'punct'), // wordAttributeName
-    };
-    // console.log(JSON.stringify(draftJsContentBlockParagraph,null,2))
-    results.push(draftJsContentBlockParagraph);
+    // if the paragraph contains words
+    // e.g. sometimes the speaker segmentation might not contain words :man-shrugging:
+    if (paragraph.words[0] !== undefined) {
+      let speakerLabel = `TBC ${ i }`;
+      if (speakerSegmentation !== null) {
+        speakerLabel = paragraph.speaker;
+      }
+
+      const draftJsContentBlockParagraph = {
+        text: paragraph.text,
+        type: 'paragraph',
+        data: {
+          speaker: speakerLabel,
+          words: paragraph.words,
+          start: paragraph.words[0].start
+        },
+        // the entity ranges correspond to each word in the space-joined text,
+        // so for each word the offset from the beginning of the paragraph and the length need to be computed
+        entityRanges: generateEntitiesRanges(paragraph.words, 'punct') // wordAttributeName
+      };
+      results.push(draftJsContentBlockParagraph);
+    }
   });
 
   return results;