Skip to content

Speechmatics adapter #94

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Feb 13, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/lib/Util/adapters/index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import bbcKaldiToDraft from './bbc-kaldi/index';
import autoEdit2ToDraft from './autoEdit2/index';
import speechmaticsToDraft from './speechmatics/index';
/**
* Adapters for STT conversion
* @param {json} transcriptData - A json transcript with some word accurate timecode
Expand Down Expand Up @@ -37,6 +38,10 @@ const sttJsonAdapter = (transcriptData, sttJsonType) => {
case 'autoedit2':
blocks = autoEdit2ToDraft(transcriptData);

return { blocks, entityMap: createEntityMap(blocks) };
case 'speechmatics':
blocks = speechmaticsToDraft(transcriptData);

return { blocks, entityMap: createEntityMap(blocks) };
case 'draftjs':
return transcriptData; // (typeof transcriptData === 'string')? JSON.parse(transcriptData): transcriptData;
Expand Down
7 changes: 7 additions & 0 deletions src/lib/Util/adapters/speechmatics/example-usage.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// Manual run of the Speechmatics adapter against the bundled sample transcript.
// using require, because of testing outside of React app
const convertSpeechmatics = require('./index');
const sampleTranscript = require('./sample/speechmaticsTedTalkTranscript.sample.json');

// Print the converted blocks so the output can be inspected by eye.
console.log(convertSpeechmatics(sampleTranscript));
120 changes: 120 additions & 0 deletions src/lib/Util/adapters/speechmatics/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/**
 * Convert a Speechmatics transcript JSON into Draft.js content blocks.
 */

import generateEntitiesRanges from '../generate-entities-ranges/index.js';

/**
 * Groups the word list from Speechmatics into paragraphs based on punctuation.
 * A paragraph is closed after every word whose `punct` text contains `.`, `?` or `!`.
 * Words left over after the last closing punctuation are returned as a final
 * paragraph instead of being dropped.
 * @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
 * @todo As this function is also used in the bbc-kaldi adapter, should it be refactored into its own file?
 * @param {array} words - array of word objects; each is expected to carry a `punct` string
 * @returns {array} list of paragraphs shaped `{ words: [...], text: [...] }`, where `text`
 * holds the `punct` value of each word in order
 */
const groupWordsInParagraphs = (words) => {
  const results = [];
  let paragraph = { words: [], text: [] };

  words.forEach((word) => {
    paragraph.words.push(word);
    paragraph.text.push(word.punct);

    // sentence-ending punctuation closes the current paragraph
    if (/[.?!]/.test(word.punct)) {
      results.push(paragraph);
      paragraph = { words: [], text: [] };
    }
  });

  // keep trailing words that were not terminated by punctuation
  if (paragraph.words.length > 0) {
    results.push(paragraph);
  }

  return results;
};

/**
 * Determines the speaker of a paragraph by comparing the start time of the paragraph with
 * the speaker times.
 * Times are parsed with `parseFloat` before comparing, because the adapter passes them
 * as strings and string comparison is lexicographic (e.g. `'10.2' < '9.5'`).
 * @param {float|string} start - Starting point of paragraph
 * @param {array} speakers - list of all speakers, each with `start`, `end` and `name`
 * @returns {string} name of the first speaker whose `[start, end)` interval contains
 * `start`, or `'UNK'` when none matches
 */
const getSpeaker = (start, speakers) => {
  const startTime = parseFloat(start);

  // Object.values keeps array order and tolerates a missing speakers list
  for (const speaker of Object.values(speakers || {})) {
    if (startTime >= parseFloat(speaker.start) && startTime < parseFloat(speaker.end)) {
      return speaker.name;
    }
  }

  return 'UNK';
};

/**
 * Speechmatics treats punctuation as own words. This function merges punctuations with
 * the previous word and adjusts the total duration of the word.
 * The input objects are not modified: every kept word is a shallow copy.
 * A punctuation token with no preceding word (e.g. at the very start of the list)
 * is kept as its own entry instead of throwing.
 * @param {array} words - array of words objects from speechmatics transcript,
 * each with a `name` and a `duration`
 * @returns {array} new array of word objects with punctuation appended to the preceding word
 */
const curatePunctuation = (words) => {
  const curatedWords = [];

  words.forEach((word) => {
    const previous = curatedWords[curatedWords.length - 1];

    if (previous !== undefined && /[.?!]/.test(word.name)) {
      // fold the punctuation token into the word before it
      previous.name = previous.name + word.name;
      previous.duration = (parseFloat(previous.duration) + parseFloat(word.duration)).toString();
    } else {
      // copy so that the merge above never writes into the caller's objects
      curatedWords.push(Object.assign({}, word));
    }
  });

  return curatedWords;
};

/**
 * Converts a Speechmatics transcript into a list of Draft.js content blocks.
 * Punctuation tokens are merged into the preceding word, words are grouped into
 * paragraphs on sentence-ending punctuation, and each paragraph is labelled with a speaker.
 * @param {object} speechmaticsJson - Speechmatics transcript with a `words` array
 * (`name`, `time`, `duration`, `confidence`) and a `speakers` array (`name`, `time`, `duration`).
 * A missing `speakers` key is treated as an empty list.
 * @returns {array} one block per paragraph: `{ text, type: 'paragraph', data: { speaker, words, start }, entityRanges }`
 */
const speechmaticsToDraft = (speechmaticsJson) => {
  const words = curatePunctuation(speechmaticsJson.words).map((element, index) => ({
    start: element.time,
    end: (parseFloat(element.time) + parseFloat(element.duration)).toString(),
    confidence: element.confidence,
    // `word` is the lower-cased token without sentence punctuation; `punct` keeps the display form
    word: element.name.toLowerCase().replace(/[.?!]/g, ''),
    punct: element.name,
    index,
  }));

  // tolerate transcripts without a speakers list; getSpeaker then falls back to its default
  const speakers = (speechmaticsJson.speakers || []).map((element) => ({
    start: element.time,
    end: (parseFloat(element.time) + parseFloat(element.duration)).toString(),
    name: element.name,
  }));

  return groupWordsInParagraphs(words).map((paragraph) => {
    const paragraphStart = paragraph.words[0].start;

    return {
      text: paragraph.text.join(' '),
      type: 'paragraph',
      data: {
        speaker: getSpeaker(paragraphStart, speakers),
        words: paragraph.words,
        start: paragraphStart,
      },
      // the entity ranges are the individual words in the space-joined text,
      // so the offset from the beginning of the paragraph and the length are computed for each word
      entityRanges: generateEntitiesRanges(paragraph.words, 'punct'), // wordAttributeName
    };
  });
};

export default speechmaticsToDraft;
17 changes: 17 additions & 0 deletions src/lib/Util/adapters/speechmatics/index.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import speechmaticsToDraft from './index';

import draftTranscriptExample from './sample/speechmaticsToDraft.sample.js';
import speechmaticsTedTalkTranscript from './sample/speechmaticsTedTalkTranscript.sample.json';

// TODO: figure out why the second of these two tests hang
// might need to review the draftJS data structure output
// Test suite for the Speechmatics adapter: converts the sample transcript and
// checks the result against the stored expected output.
describe('speechmaticsToDraft', () => {
// The conversion runs once, when the describe callback executes; both tests share `result`.
const result = speechmaticsToDraft(speechmaticsTedTalkTranscript);
// Sanity check: the adapter returned something.
it('Should be defined', ( ) => {
expect(result).toBeDefined();
});

// Compares the whole converted output with the expected sample via `toEqual`.
it('Should be equal to expected value', ( ) => {
expect(result).toEqual(draftTranscriptExample);
});
});
Loading