bbc · pietrop · Mar 18, 2019 · Mar 10, 2019 · Mar 10, 2019 · Mar 10, 2019
diff --git a/package-lock.json b/package-lock.json
diff --git a/src/lib/TranscriptEditor/MediaPlayer/index.js b/src/lib/TranscriptEditor/MediaPlayer/index.js
@@ -10,7 +10,6 @@ import returnHotKeys from './defaultHotKeys';
 import styles from './index.module.css';
 
 import { secondsToTimecode, timecodeToSeconds } from '../../Util/timecode-converter/index';
-import { timingSafeEqual } from 'crypto';
 
 const PLAYBACK_RATES = [
   { value: 0.2, label: '0.2' },

diff --git a/src/lib/TranscriptEditor/TimedTextEditor/index.js b/src/lib/TranscriptEditor/TimedTextEditor/index.js
@@ -12,8 +12,7 @@ import {
   convertFromRaw,
   convertToRaw,
   getDefaultKeyBinding,
-  Modifier,
-  KeyBindingUtil
+  Modifier
 } from 'draft-js';
 
 import Word from './Word';
@@ -23,8 +22,6 @@ import sttJsonAdapter from '../../Util/adapters/index.js';
 import exportAdapter from '../../Util/export-adapters/index.js';
 import style from './index.module.css';
 
-const { hasCommandModifier } = KeyBindingUtil;
-
 class TimedTextEditor extends React.Component {
   constructor(props) {
     super(props);

diff --git a/src/lib/Util/adapters/amazon-transcribe/example-usage.js b/src/lib/Util/adapters/amazon-transcribe/example-usage.js
@@ -0,0 +1,4 @@
+import amazonTranscribeToDraft from './index';
+import amazonTranscribeTedTalkTranscript from './sample/amazonTranscribe.sample.json';
+
+// Convert the bundled sample transcript and pretty-print the resulting draft.js blocks.
+const draftBlocks = amazonTranscribeToDraft(amazonTranscribeTedTalkTranscript);
+console.log(JSON.stringify(draftBlocks, null, 2));
diff --git a/src/lib/Util/adapters/amazon-transcribe/index.js b/src/lib/Util/adapters/amazon-transcribe/index.js
@@ -0,0 +1,134 @@
+import generateEntitiesRanges from '../generate-entities-ranges/index.js';
+
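+/**
+ * Helper functions to generate draft.js entities from an Amazon Transcribe
+ * transcript; see the unit tests for example data structures.
+ * The imported generateEntitiesRanges() adds the offset and length used to
+ * recognise each word in draft.js.
+ */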
+
+/**
+ * Parameters expected by generateEntitiesRanges(words, wordAttributeName):
+ *  @param {Array} words - list of word objects
+ *  @param {string} wordAttributeName - name of the attribute of each word object
+ *  that contains the text, e.g. 'punct' for word = { punct: 'hello', ... }
+ *  or 'text' for word = { text: 'hello', ... }
+ */
+
+/**
+ * Picks the alternative to use for an Amazon Transcribe item.
+ *
+ * Punctuation items get their first alternative with `confidence` forced to 1;
+ * every other item gets the alternative with the highest confidence
+ * (on a tie the later alternative wins).
+ *
+ * @param {Object} word - transcript item with `type` and a non-empty `alternatives` array
+ * @returns {Object} the chosen alternative; for punctuation this is a copy, so
+ * the caller's `word` is never modified
+ * @throws {TypeError} if `alternatives` is empty for a non-punctuation item
+ */
+export const getBestAlternativeForWord = (word) => {
+  const { alternatives } = word;
+
+  if (/punctuation/.test(word.type)) {
+    // Transcribe doesn't provide a confidence for punctuation.
+    // Return a copy rather than Object.assign-ing onto alternatives[0], which
+    // would overwrite the confidence stored in the caller's transcript.
+    return { ...alternatives[0], confidence: 1 };
+  }
+
+  return alternatives.reduce((best, candidate) => (
+    parseFloat(best.confidence) > parseFloat(candidate.confidence) ? best : candidate
+  ));
+};
+
+/**
+ * Converts an Amazon Transcribe item into the flat word shape that is passed
+ * to the generic generateEntitiesRanges() helper.
+ *
+ * @param {Object} currentWord - item with `start_time`, `end_time` and `alternatives`
+ * @param {Object} previousWord - accepted but not read by this function
+ * @returns {Object} `{ start, end, text, confidence }`, where `start`, `end` and
+ * `confidence` are parsed with parseFloat and `text` is the best alternative's content
+ */
+const normalizeWord = (currentWord, previousWord) => {
+  const { content, confidence } = getBestAlternativeForWord(currentWord);
+  const start = parseFloat(currentWord.start_time);
+  const end = parseFloat(currentWord.end_time);
+
+  return { start, end, text: content, confidence: parseFloat(confidence) };
+};
+
+/**
+ * Builds a copy of `previousWord` in which the content of the first
+ * alternative of `punctuation` (minus one leading whitespace character) is
+ * appended to the content of every alternative.
+ *
+ * @param {Object} punctuation - punctuation item; only `alternatives[0].content` is read
+ * @param {Object} previousWord - word item whose alternatives receive the punctuation
+ * @returns {Object} new word object; neither argument is modified
+ */
+export const appendPunctuationToPreviousWord = (punctuation, previousWord) => {
+  const punctuationMark = punctuation.alternatives[0].content;
+  const mergedAlternatives = previousWord.alternatives.map((alternative) => {
+    const content = alternative.content + stripLeadingSpace(punctuationMark);
+
+    return { ...alternative, content };
+  });
+
+  return { ...previousWord, alternatives: mergedAlternatives };
+};
+
+/**
+ * Folds every punctuation item into the item that precedes it and returns a
+ * new list without the standalone punctuation items.
+ *
+ * Punctuation is always appended to the most recent item already in the
+ * result, so several punctuation items in a row all end up on the same word.
+ * A punctuation item with nothing before it (first in the list) has no word
+ * to attach to and is kept unchanged.
+ *
+ * @param {Array} words - Amazon Transcribe items (`results.items`)
+ * @returns {Array} new array; the input array and its items are not modified
+ */
+export const mapPunctuationItemsToWords = (words) => {
+  return words.reduce((merged, item) => {
+    const lastIndex = merged.length - 1;
+
+    if (item.type === 'punctuation' && lastIndex >= 0) {
+      // Replace the previous entry with a merged copy instead of looking at
+      // words[index - 1], which is undefined for a leading punctuation item
+      // and is itself a punctuation item when two follow each other.
+      merged[lastIndex] = appendPunctuationToPreviousWord(item, merged[lastIndex]);
+    } else {
+      merged.push(item);
+    }
+
+    return merged;
+  }, []);
+};
+
+/**
+ * Removes a single leading whitespace character from `word`, if present.
+ *
+ * @param {string} word - text to trim at the start
+ * @returns {string} `word` without its first character when that character is whitespace
+ */
+export const stripLeadingSpace = (word) => word.replace(/^\s/, '');
+
+/**
+ * Groups the word list from an Amazon Transcribe transcript into paragraphs,
+ * closing a paragraph after every word whose text contains '.', '?' or '!'.
+ * Words that follow the last such word are returned as a final paragraph
+ * instead of being dropped.
+ * @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
+ * @param {array} words - array of word objects from the Amazon Transcribe transcript
+ * @returns {array} paragraphs, each `{ words: [normalized word], text: [string] }`
+ */
+
+const groupWordsInParagraphs = (words) => {
+  const sentenceEnd = /[.?!]/;
+  const results = [];
+  let paragraph = { words: [], text: [] };
+
+  words.forEach((word) => {
+    const normalizedWord = normalizeWord(word);
+    // normalizeWord() already selected the best alternative; reuse its text
+    // rather than looking the alternative up a second time.
+    const content = normalizedWord.text;
+
+    paragraph.words.push(normalizedWord);
+    paragraph.text.push(content);
+
+    if (sentenceEnd.test(content)) {
+      results.push(paragraph);
+      // reset paragraph
+      paragraph = { words: [], text: [] };
+    }
+  });
+
+  // Flush words that were not followed by sentence-ending punctuation.
+  if (paragraph.words.length > 0) {
+    results.push(paragraph);
+  }
+
+  return results;
+};
+
+/**
+ * Converts an Amazon Transcribe JSON result into a list of draft.js
+ * content-block descriptions, one per paragraph.
+ *
+ * @param {Object} amazonTranscribeJson - transcript; `results.items` is read
+ * @returns {Array} blocks of the form `{ text, type, data, entityRanges }`
+ */
+const amazonTranscribeToDraft = (amazonTranscribeJson) => {
+  const { items } = amazonTranscribeJson.results;
+  const paragraphs = groupWordsInParagraphs(mapPunctuationItemsToWords(items));
+
+  return paragraphs.map(({ text, words }, i) => ({
+    text: text.join(' '),
+    type: 'paragraph',
+    data: {
+      speaker: `TBC ${ i }`,
+      words,
+      start: parseFloat(words[0].start)
+    },
+    // Each entity range is one word of the space-joined text, so the offset
+    // from the start of the paragraph and the length are computed per word.
+    entityRanges: generateEntitiesRanges(words, 'text') // wordAttributeName
+  }));
+};
+
+export default amazonTranscribeToDraft;
diff --git a/src/lib/Util/adapters/amazon-transcribe/index.test.js b/src/lib/Util/adapters/amazon-transcribe/index.test.js
@@ -0,0 +1,140 @@
+import amazonTranscribeToDraft, {
+  mapPunctuationItemsToWords,
+  stripLeadingSpace,
+  appendPunctuationToPreviousWord,
+  getBestAlternativeForWord
+} from './index';
+import amazonTranscribeTedTalkTranscript from './sample/amazonTranscribe.sample.json';
+import draftTranscriptSample from './sample/amazonTranscribe.sample.js';
+
+// Skipped end-to-end comparison against the stored sample.
+// The conversion runs inside each test: describe callbacks are executed while
+// Jest collects tests, so work placed directly in the describe body would
+// presumably still run (and could throw) even though the suite is skipped.
+describe.skip('amazonTranscribeToDraft', () => {
+  it('Should be defined', () => {
+    const result = amazonTranscribeToDraft(amazonTranscribeTedTalkTranscript);
+    expect(result).toBeDefined();
+  });
+
+  it('Should be equal to expected value', () => {
+    const result = amazonTranscribeToDraft(amazonTranscribeTedTalkTranscript);
+    expect(result).toEqual(draftTranscriptSample);
+  });
+});
+
+// A trailing punctuation item should be folded into the word before it and
+// the standalone punctuation item should not appear in the result.
+describe('punctuation line item should be added to previous word and return a new array without that item', () => {
+  const startWords = [
+    {
+      start_time: '18.72',
+      end_time: '19.16',
+      alternatives: [{ confidence: '0.9993', content: 'upside' }],
+      type: 'pronunciation'
+    },
+    {
+      start_time: '19.16',
+      end_time: '19.55',
+      alternatives: [{ confidence: '1.0000', content: 'down' }],
+      type: 'pronunciation'
+    },
+    {
+      alternatives: [{ confidence: null, content: '.' }],
+      type: 'punctuation'
+    }
+  ];
+
+  const expected = [
+    {
+      start_time: '18.72',
+      end_time: '19.16',
+      alternatives: [{ confidence: '0.9993', content: 'upside' }],
+      type: 'pronunciation'
+    },
+    {
+      start_time: '19.16',
+      end_time: '19.55',
+      alternatives: [{ confidence: '1.0000', content: 'down.' }],
+      type: 'pronunciation'
+    }
+  ];
+
+  it('should be equal to expected value', () => {
+    // Call the function inside the test so a thrown error is reported as a
+    // failure of this test instead of aborting collection of the file.
+    const result = mapPunctuationItemsToWords(startWords);
+    expect(result).toEqual(expected);
+  });
+});
+
+// The alternative with the highest confidence should be selected.
+describe('Best alternative for word should be returned', () => {
+  const startWord = {
+    start_time: '18.72',
+    end_time: '19.16',
+    alternatives: [
+      { confidence: '0.9993', content: 'upside' },
+      { confidence: '0.88', content: 'topside' }
+    ],
+    type: 'pronunciation'
+  };
+  const expected = { confidence: '0.9993', content: 'upside' };
+
+  it('Should be equal to expected value', () => {
+    expect(getBestAlternativeForWord(startWord)).toEqual(expected);
+  });
+});
+
+// Only the whitespace in front of the punctuation mark is removed.
+describe('Leading space should be removed from punctuation item', () => {
+  it('should be equal to expected value', () => {
+    const startWord = ' , ';
+    const expected = ', ';
+
+    expect(stripLeadingSpace(startWord)).toEqual(expected);
+  });
+});
+
+// The punctuation content, minus its leading space, is appended to the word.
+describe('a word item and punctuation item should be merged', () => {
+  const startWord = {
+    start_time: '19.16',
+    end_time: '19.55',
+    alternatives: [{ confidence: '1.0000', content: 'down' }],
+    type: 'pronunciation'
+  };
+  const startPunctuation = {
+    alternatives: [{ confidence: null, content: ' . ' }],
+    type: 'punctuation'
+  };
+  const expected = {
+    start_time: '19.16',
+    end_time: '19.55',
+    alternatives: [{ confidence: '1.0000', content: 'down. ' }],
+    type: 'pronunciation'
+  };
+
+  it('should be equal to expected value', () => {
+    expect(appendPunctuationToPreviousWord(startPunctuation, startWord)).toEqual(expected);
+  });
+});