bbc · pietrop · Mar 10, 2019 · Mar 10, 2019 · Mar 10, 2019 · Mar 10, 2019
diff --git a/package-lock.json b/package-lock.json
diff --git a/src/lib/TranscriptEditor/MediaPlayer/index.js b/src/lib/TranscriptEditor/MediaPlayer/index.js
@@ -10,7 +10,6 @@ import returnHotKeys from './defaultHotKeys';
 import styles from './index.module.css';
 
 import { secondsToTimecode, timecodeToSeconds } from '../../Util/timecode-converter/index';
-import { timingSafeEqual } from 'crypto';
 
 const PLAYBACK_RATES = [
   { value: 0.2, label: '0.2' },

diff --git a/src/lib/TranscriptEditor/TimedTextEditor/index.js b/src/lib/TranscriptEditor/TimedTextEditor/index.js
@@ -12,8 +12,7 @@ import {
   convertFromRaw,
   convertToRaw,
   getDefaultKeyBinding,
-  Modifier,
-  KeyBindingUtil
+  Modifier
 } from 'draft-js';
 
 import Word from './Word';
@@ -23,8 +22,6 @@ import sttJsonAdapter from '../../Util/adapters/index.js';
 import exportAdapter from '../../Util/export-adapters/index.js';
 import style from './index.module.css';
 
-const { hasCommandModifier } = KeyBindingUtil;
-
 class TimedTextEditor extends React.Component {
   constructor(props) {
     super(props);

diff --git a/src/lib/Util/adapters/amazon-transcribe/example-usage.js b/src/lib/Util/adapters/amazon-transcribe/example-usage.js
@@ -0,0 +1,4 @@
+import amazonTranscribeToDraft from './index';
+import amazonTranscribeTedTalkTranscript from './sample/amazonTranscribe.sample.json';
+
+// Convert the imported sample transcript and print the resulting blocks as JSON indented by 2 spaces.
+const draftBlocks = amazonTranscribeToDraft(amazonTranscribeTedTalkTranscript);
+const prettyPrinted = JSON.stringify(draftBlocks, null, 2);
+console.log(prettyPrinted);
diff --git a/src/lib/Util/adapters/amazon-transcribe/index.js b/src/lib/Util/adapters/amazon-transcribe/index.js
@@ -0,0 +1,109 @@
+import generateEntitiesRanges from '../generate-entities-ranges/index.js';
+
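+/**
+ * NOTE(review): the two comment blocks below appear to describe generateEntitiesRanges
+ * (imported above) rather than the function that follows them - confirm and move if so.
+ *
+ * Helper function to generate draft.js entities;
+ * see the unit test for an example of the data structure.
+ * It adds an offset and a length to each word so that draft.js can recognise it.
+ */
+
+/**
+ *  @param {json} words - List of words
+ *  @param {string} wordAttributeName - e.g. 'punct' or 'text';
+ *  the attribute of the word object that contains the text, e.g. word = { punct: 'hello', ... }
+ *  or word = { text: 'hello', ... }
+ */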
+
+/**
+ * Picks the alternative to use for one Amazon Transcribe item.
+ *
+ * Punctuation items: the first alternative is returned with its confidence forced to 1,
+ * because Transcribe doesn't provide a confidence for punctuation.
+ * Any other item: the alternative with the highest numeric confidence is returned;
+ * when two alternatives compare equal (or a confidence is not a number), the later one wins.
+ *
+ * @param {object} word - item carrying a `type` string and an `alternatives` array
+ * @returns {object} the chosen alternative; for punctuation this is a copy,
+ *  so the caller's JSON is left untouched
+ * @throws {TypeError} when a non-punctuation item has an empty `alternatives` array
+ */
+const getBestAlternativeForWord = (word) => {
+  if (/punctuation/.test(word.type)) {
+    // Copy into a fresh object: assigning onto word.alternatives[0] rewrote the input in place.
+    return Object.assign({}, word.alternatives[0], { confidence: 1 });
+  }
+
+  return word.alternatives.reduce((best, candidate) => {
+    return (parseFloat(best.confidence) > parseFloat(candidate.confidence)) ? best : candidate;
+  });
+};
+
+/**
+ * Normalizes one Amazon Transcribe item into the { start, end, text, confidence }
+ * shape consumed by amazonTranscribeToDraft().
+ *
+ * For punctuation items the timings are synthesised from the previous item:
+ * start is its end_time + 0.05 and end is its end_time + 0.06, both formatted
+ * with toFixed(2) (so they are strings). For every other item start/end are the
+ * item's own start_time/end_time parsed as numbers.
+ *
+ * @param {object} currentWord - item to normalize
+ * @param {object} previousWord - item preceding currentWord; only read when
+ *  currentWord is punctuation, in which case it must carry an `end_time`
+ * @returns {{start: (number|string), end: (number|string), text: string, confidence: number}}
+ */
+const normalizedWord = (currentWord, previousWord) => {
+  const bestAlternative = getBestAlternativeForWord(currentWord);
+  const isPunctuation = /punctuation/.test(currentWord.type);
+
+  let start;
+  let end;
+  if (isPunctuation) {
+    // Both bounds are anchored on the previous item's END time. The end bound used to be
+    // anchored on its start_time, which could place `end` before `start`.
+    const previousEnd = parseFloat(previousWord.end_time);
+    start = (previousEnd + 0.05).toFixed(2);
+    end = (previousEnd + 0.06).toFixed(2);
+  } else {
+    start = parseFloat(currentWord.start_time);
+    end = parseFloat(currentWord.end_time);
+  }
+
+  return {
+    start,
+    end,
+    text: bestAlternative.content,
+    confidence: parseFloat(bestAlternative.confidence)
+  };
+};
+
+/**
+ * Groups the items of an Amazon Transcribe result into paragraphs, closing a
+ * paragraph at every punctuation item whose content contains '.', '?' or '!'.
+ * Items left over after the last such mark form a final paragraph instead of
+ * being dropped.
+ *
+ * @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
+ * @param {array} words - array of item objects; each needs a `type` and a
+ *  non-empty `alternatives` array
+ * @returns {array} paragraphs, each `{ words: [...normalized words], text: [...strings] }`
+ */
+const groupWordsInParagraphs = (words) => {
+  const results = [];
+  let paragraph = {
+    words: [],
+    text: []
+  };
+
+  words.forEach((word, index) => {
+    const content = word.alternatives[0].content;
+    const isPunctuation = word.type === 'punctuation';
+    // normalizedWord derives punctuation timings from the preceding item, so every
+    // punctuation mark (not only ',', '.', '?', '!') is handed that item.
+    // Assumes the very first item is never punctuation.
+    const previousWord = isPunctuation ? words[index - 1] : {};
+
+    paragraph.words.push(normalizedWord(word, previousWord));
+    paragraph.text.push(content);
+
+    if (isPunctuation && /[.?!]/.test(content)) {
+      results.push(paragraph);
+      // reset paragraph
+      paragraph = {
+        words: [],
+        text: []
+      };
+    }
+  });
+
+  // Keep the tail of a transcript that does not end with '.', '?' or '!'.
+  if (paragraph.words.length > 0) {
+    results.push(paragraph);
+  }
+
+  return results;
+};
+
+/**
+ * Converts an Amazon Transcribe JSON result into a list of draft.js paragraph blocks.
+ *
+ * The items under `results.items` are grouped into paragraphs, and each paragraph
+ * becomes one block holding its space-joined text, a placeholder speaker label
+ * (`TBC <paragraph index>`), its words, and its start time.
+ *
+ * @param {object} amazonTranscribeJson - parsed Amazon Transcribe output
+ * @returns {array} one block object per paragraph
+ */
+const amazonTranscribeToDraft = (amazonTranscribeJson) => {
+  const items = amazonTranscribeJson.results.items;
+
+  return groupWordsInParagraphs(items).map((paragraph, paragraphIndex) => {
+    const { words, text } = paragraph;
+
+    return {
+      text: text.join(' '),
+      type: 'paragraph',
+      data: {
+        speaker: `TBC ${ paragraphIndex }`,
+        words,
+        start: parseFloat(words[0].start)
+      },
+      // Each entity range is one word of the space-joined text: generateEntitiesRanges
+      // works out its offset from the start of the paragraph and its length,
+      // reading the word's text from the 'text' attribute (wordAttributeName).
+      entityRanges: generateEntitiesRanges(words, 'text')
+    };
+  });
+};
+
+export default amazonTranscribeToDraft;
diff --git a/src/lib/Util/adapters/amazon-transcribe/index.test.js b/src/lib/Util/adapters/amazon-transcribe/index.test.js
@@ -0,0 +1,14 @@
+import amazonTranscribeToDraft from './index';
+import amazonTranscribeTedTalkTranscript from './sample/amazonTranscribe.sample.json';
+import draftTranscriptSample from './sample/amazonTranscribe.sample.js';
+
+describe('amazonTranscribeToDraft', function() {
+  // The sample transcript is converted once; both specs inspect the same output.
+  const draftBlocks = amazonTranscribeToDraft(amazonTranscribeTedTalkTranscript);
+
+  it('Should be defined', function() {
+    expect(draftBlocks).toBeDefined();
+  });
+
+  it('Should be equal to expected value', function() {
+    expect(draftBlocks).toEqual(draftTranscriptSample);
+  });
+});