Skip to content

Add punctuation changes #119

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18,467 changes: 0 additions & 18,467 deletions package-lock.json

This file was deleted.

1 change: 0 additions & 1 deletion src/lib/TranscriptEditor/MediaPlayer/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import returnHotKeys from './defaultHotKeys';
import styles from './index.module.css';

import { secondsToTimecode, timecodeToSeconds } from '../../Util/timecode-converter/index';
import { timingSafeEqual } from 'crypto';

const PLAYBACK_RATES = [
{ value: 0.2, label: '0.2' },
Expand Down
5 changes: 1 addition & 4 deletions src/lib/TranscriptEditor/TimedTextEditor/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ import {
convertFromRaw,
convertToRaw,
getDefaultKeyBinding,
Modifier,
KeyBindingUtil
Modifier
} from 'draft-js';

import Word from './Word';
Expand All @@ -23,8 +22,6 @@ import sttJsonAdapter from '../../Util/adapters/index.js';
import exportAdapter from '../../Util/export-adapters/index.js';
import style from './index.module.css';

const { hasCommandModifier } = KeyBindingUtil;

class TimedTextEditor extends React.Component {
constructor(props) {
super(props);
Expand Down
4 changes: 4 additions & 0 deletions src/lib/Util/adapters/amazon-transcribe/example-usage.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import amazonTranscribeToDraft from './index';
import amazonTranscribeTedTalkTranscript from './sample/amazonTranscribe.sample.json';

// Convert the imported sample transcript and print the result as indented JSON.
const convertedSample = amazonTranscribeToDraft(amazonTranscribeTedTalkTranscript);
const prettyPrinted = JSON.stringify(convertedSample, null, 2);
console.log(prettyPrinted);
134 changes: 134 additions & 0 deletions src/lib/Util/adapters/amazon-transcribe/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import generateEntitiesRanges from '../generate-entities-ranges/index.js';

/**
 * Helper functions to generate draft.js entities;
 * see the unit tests for example data structures.
 * Offset and length are added so that each word can be recognised in draft.js.
 */

/**
 * Arguments passed to the imported generateEntitiesRanges() helper:
 * @param {json} words - List of words
 * @param {string} wordAttributeName - eg 'punct' or 'text' etc.
 * The attribute of the word object that contains the text, eg word = { punct: 'hello', ... }
 * or eg word = { text: 'hello', ... }
 */

/**
 * Picks the alternative to use for a single transcript item.
 *
 * - Items whose `type` matches "punctuation": returns a copy of the first
 *   alternative with `confidence` set to 1.
 * - Any other item: returns the alternative with the highest confidence
 *   (compared numerically via parseFloat). When no alternative is strictly
 *   greater, the later one in the list is returned.
 *
 * The input item is never modified.
 *
 * @param {object} word - transcript item with a `type` and a non-empty
 * `alternatives` array of `{ confidence, content }` objects
 * @returns {object} the selected alternative
 * @throws {TypeError} for a non-punctuation item whose `alternatives` is empty
 */
export const getBestAlternativeForWord = (word) => {
  if (/punctuation/.test(word.type)) {
    // Transcribe doesn't provide a confidence for punctuation, so treat it as certain.
    // Spread into a fresh object instead of assigning onto the caller's alternative.
    return { ...word.alternatives[0], confidence: 1 };
  }

  return word.alternatives.reduce((best, candidate) => (
    parseFloat(best.confidence) > parseFloat(candidate.confidence) ? best : candidate
  ));
};

/**
 * Normalizes a transcript item so it can be used in
 * the generic generateEntitiesRanges() method.
 *
 * @param {object} currentWord - transcript item with `start_time`, `end_time`
 * and `alternatives`
 * @param {object} previousWord - not used by this function
 * @returns {object} `{ start, end, text, confidence }` where `start`, `end`
 * and `confidence` have been run through parseFloat
 */
const normalizeWord = (currentWord, previousWord) => {
  const { content, confidence } = getBestAlternativeForWord(currentWord);
  const { start_time: startTime, end_time: endTime } = currentWord;

  return {
    start: parseFloat(startTime),
    end: parseFloat(endTime),
    text: content,
    confidence: parseFloat(confidence),
  };
};

/**
 * Removes a single leading whitespace character from a string, if present.
 *
 * @param {string} word - text to clean up
 * @returns {string} the text without its first character when that character is whitespace
 */
export const stripLeadingSpace = (word) => {
  return word.replace(/^\s/, '');
};

/**
 * Returns a copy of `previousWord` in which the content of the punctuation
 * item's first alternative (minus one leading whitespace character) has been
 * appended to the content of every alternative. Neither argument is modified.
 *
 * @param {object} punctuation - punctuation item; only `alternatives[0].content` is read
 * @param {object} previousWord - item the punctuation should be attached to
 * @returns {object} new item with the same fields as `previousWord` and updated alternatives
 */
export const appendPunctuationToPreviousWord = (punctuation, previousWord) => {
  const punctuationContent = stripLeadingSpace(punctuation.alternatives[0].content);

  return {
    ...previousWord,
    alternatives: previousWord.alternatives.map((alternative) => ({
      ...alternative,
      content: alternative.content + punctuationContent,
    })),
  };
};

/**
 * Folds every item of type 'punctuation' into the item that precedes it and
 * returns a new array without the standalone punctuation items.
 *
 * The result is built incrementally, so several punctuation items in a row
 * (e.g. "?" followed by "!") are all appended to the same preceding word.
 * A punctuation item with nothing before it is kept unchanged because there
 * is no word to attach it to.
 *
 * @param {array} words - transcript items, each with `type` and `alternatives`
 * @returns {array} new list of items; the input array and its items are not modified
 */
export const mapPunctuationItemsToWords = (words) => {
  return words.reduce((merged, item) => {
    const lastIndex = merged.length - 1;

    if (item.type === 'punctuation' && lastIndex >= 0) {
      // Replace the most recent entry with a copy carrying the punctuation.
      merged[lastIndex] = appendPunctuationToPreviousWord(item, merged[lastIndex]);
    } else {
      merged.push(item);
    }

    return merged;
  }, []);
};

/**
 * Groups the words list from an Amazon Transcribe transcript into paragraphs.
 * A paragraph is closed after every word whose text contains '.', '?' or '!'.
 * Words that follow the last such word are returned as a final paragraph
 * instead of being discarded.
 * @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
 * @param {array} words - array of word objects from the transcript
 * @returns {array} list of `{ words, text }` paragraphs, where `words` holds the
 * normalized words and `text` the matching list of word strings
 */

const groupWordsInParagraphs = (words) => {
  const sentenceEnd = /[.?!]/;
  const results = [];
  let paragraph = { words: [], text: [] };

  words.forEach((word) => {
    // normalizeWord already resolves the best alternative, so reuse its text
    // rather than looking the alternative up a second time.
    const normalizedWord = normalizeWord(word);
    const content = normalizedWord.text;

    paragraph.words.push(normalizedWord);
    paragraph.text.push(content);

    if (sentenceEnd.test(content)) {
      results.push(paragraph);
      // reset paragraph
      paragraph = { words: [], text: [] };
    }
  });

  // Keep trailing words when the transcript does not end with terminal punctuation.
  if (paragraph.words.length > 0) {
    results.push(paragraph);
  }

  return results;
};

/**
 * Converts an Amazon Transcribe JSON result into a list of paragraph blocks.
 *
 * Reads `amazonTranscribeJson.results.items`, merges punctuation items into
 * the words before them, groups the words into paragraphs and builds one
 * block per paragraph.
 *
 * @param {object} amazonTranscribeJson - parsed transcript with `results.items`
 * @returns {array} list of `{ text, type, data, entityRanges }` blocks
 */
const amazonTranscribeToDraft = (amazonTranscribeJson) => {
  const { items } = amazonTranscribeJson.results;
  const paragraphs = groupWordsInParagraphs(mapPunctuationItemsToWords(items));

  return paragraphs.map(({ text, words }, paragraphIndex) => ({
    text: text.join(' '),
    type: 'paragraph',
    data: {
      speaker: `TBC ${paragraphIndex}`,
      words,
      start: parseFloat(words[0].start),
    },
    // Each entity range covers one word of the space-joined text, so the offset
    // from the start of the paragraph and the length are computed per word.
    entityRanges: generateEntitiesRanges(words, 'text'), // wordAttributeName
  }));
};

export default amazonTranscribeToDraft;
140 changes: 140 additions & 0 deletions src/lib/Util/adapters/amazon-transcribe/index.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import amazonTranscribeToDraft, {
mapPunctuationItemsToWords,
stripLeadingSpace,
appendPunctuationToPreviousWord,
getBestAlternativeForWord
} from './index';
import amazonTranscribeTedTalkTranscript from './sample/amazonTranscribe.sample.json';
import draftTranscriptSample from './sample/amazonTranscribe.sample.js';

// Skipped suite: compares the adapter output for the sample transcript with the stored sample.
describe.skip('amazonTranscribeToDraft', () => {
  let result;

  // Jest still runs the body of a skipped describe while collecting tests, so the
  // conversion lives in a hook that only executes when the suite is enabled.
  beforeAll(() => {
    result = amazonTranscribeToDraft(amazonTranscribeTedTalkTranscript);
  });

  it('Should be defined', () => {
    expect(result).toBeDefined();
  });

  it('Should be equal to expected value', () => {
    expect(result).toEqual(draftTranscriptSample);
  });
});

// Checks that a trailing punctuation item is folded into the word before it.
describe('punctuation line item should be added to previous word and return a new array without that item', () => {
  // Builds a fresh pronunciation item on every call so fixtures never share references.
  const pronunciation = (startTime, endTime, confidence, content) => ({
    start_time: startTime,
    end_time: endTime,
    alternatives: [{ confidence, content }],
    type: 'pronunciation',
  });

  const startWords = [
    pronunciation('18.72', '19.16', '0.9993', 'upside'),
    pronunciation('19.16', '19.55', '1.0000', 'down'),
    {
      alternatives: [{ confidence: null, content: '.' }],
      type: 'punctuation',
    },
  ];

  const expected = [
    pronunciation('18.72', '19.16', '0.9993', 'upside'),
    pronunciation('19.16', '19.55', '1.0000', 'down.'),
  ];

  const result = mapPunctuationItemsToWords(startWords);

  it('should be equal to expected value', () => {
    expect(result).toEqual(expected);
  });
});

// Checks that the alternative with the highest confidence is selected.
describe('Best alternative for word should be returned', () => {
  const upside = { confidence: '0.9993', content: 'upside' };
  const topside = { confidence: '0.88', content: 'topside' };

  const startWord = {
    start_time: '18.72',
    end_time: '19.16',
    alternatives: [upside, topside],
    type: 'pronunciation',
  };
  const expected = { confidence: '0.9993', content: 'upside' };

  it('Should be equal to expected value', () => {
    expect(getBestAlternativeForWord(startWord)).toEqual(expected);
  });
});

// Checks that only the leading whitespace is removed; the trailing space stays.
describe('Leading space should be removed from punctuation item', () => {
  it('should be equal to expected value', () => {
    const input = ' , ';
    const stripped = stripLeadingSpace(input);

    expect(stripped).toEqual(', ');
  });
});

// Checks that punctuation content (minus its leading space) is appended to the word.
describe('a word item and punctuation item should be merged', () => {
  // Fresh object per call so the input and the expectation never share references.
  const downWord = (content) => ({
    start_time: '19.16',
    end_time: '19.55',
    alternatives: [{ confidence: '1.0000', content }],
    type: 'pronunciation',
  });

  const startWord = downWord('down');
  const startPunctuation = {
    alternatives: [{ confidence: null, content: ' . ' }],
    type: 'punctuation',
  };
  const expected = downWord('down. ');

  it('should be equal to expected value', () => {
    const merged = appendPunctuationToPreviousWord(startPunctuation, startWord);

    expect(merged).toEqual(expected);
  });
});
Loading