Skip to content

Kaldi bbc speakers segment #93

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Mar 22, 2019
3 changes: 2 additions & 1 deletion .eslintrc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
"jest": true
},
"rules": {
"prefer-const": 1,
"space-infix-ops": ["error", {"int32Hint": false}],
"no-unused-expressions": "error",
"no-trailing-spaces": "error",
"no-nested-ternary": "error",
Expand All @@ -28,7 +30,6 @@
"quotes": [ 1, "single", "avoid-escape" ],
"no-use-before-define": [ 2, { "functions": false } ],
"semi": [1, "always"],
"prefer-const": 1,
"react/prefer-es6-class": 0,
"react/jsx-filename-extension": 0,
"react/jsx-curly-spacing": [ 2, "always" ],
Expand Down
3 changes: 2 additions & 1 deletion src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ import { render } from 'react-dom';

import { TranscriptEditor } from './lib';

import kaldiTedTalkTranscript from './sample-data/KateDarling_2018S-bbc-kaldi.json';
// import kaldiTedTalkTranscript from './sample-data/KateDarling_2018S-bbc-kaldi.json';
import kaldiTedTalkTranscript from './sample-data/KateDarling-bbcKaldiTranscriptWithSpeakerSegments.json';
import style from './index.module.css';
import SttTypeSelect from './select-stt-json-type';
import ExportFormatSelect from './select-export-format';
Expand Down
118 changes: 118 additions & 0 deletions src/lib/Util/adapters/bbc-kaldi/group-words-by-speakers.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/**
 * Groups a flat list of words into paragraphs, one per consecutive run of
 * words attributed to the same speaker.
 *
 * Edge cases to keep in mind:
 * - more segments than words - not an issue, because matching starts from the
 *   words and looks up a segment for each of them.
 * - more words than segments - orphan words that match no segment still need
 *   to be handled by the per-word segment lookup.
 *
 * @param {array} words - list of word objects
 * @param {object} segments - segmentation object; its `segments` property is
 * the list of segments that is matched against the words
 * @return {array} - whatever `groupWordsBySpeaker` returns for the labelled words
 */
function groupWordsInParagraphsBySpeakers(words, segments) {
  // first label every word with a speaker, then group sequentially by that label
  return groupWordsBySpeaker(addSpeakerToEachWord(words, segments.segments));
}

/**
 * Labels every word with the speaker of the segment it belongs to.
 * Each word object is modified in place (a `speaker` string attribute is set
 * on it) and collected, in order, into a new array.
 * @param {array} words - list of word objects
 * @param {array} segments - list of segment objects to look the words up in
 * @return {array} - new array holding the same word objects, now with `speaker`
 */
function addSpeakerToEachWord(words, segments) {
  const labelledWords = [];

  for (const word of words) {
    // only the speaker part of the matching segment is needed for the label
    const { speaker } = findSegmentForWord(word, segments);
    word.speaker = formatSpeakerName(speaker);
    labelledWords.push(word);
  }

  return labelledWords;
}

/**
 * Groups words by their `speaker` attribute, sequentially: every run of
 * consecutive words sharing the same speaker becomes one paragraph.
 * @param {array} wordsWithSpeakers - same as kaldi words list but with a `speaker` label attribute on each word
 * @return {array} - list of paragraph objects, with `words`, `text` and `speaker` attributes,
 * where `words` is an array and the other two are strings.
 * `text` is the `punct` attribute of the paragraph's words joined by single spaces.
 * Returns an empty list when there are no words.
 */
function groupWordsBySpeaker(wordsWithSpeakers) {
  const results = [];
  // paragraph currently being filled; null until the first word is seen,
  // which also makes an empty input safe (nothing is dereferenced up front)
  let paragraph = null;

  wordsWithSpeakers.forEach((word) => {
    // open a new paragraph on the first word and on every speaker change,
    // labelling it straight away with the speaker of the word that opens it
    // so that single-word paragraphs keep their real speaker
    if (paragraph === null || paragraph.speaker !== word.speaker) {
      paragraph = { words: [], text: '', speaker: word.speaker };
      results.push(paragraph);
    }
    paragraph.words.push(word);
    paragraph.text += `${ word.punct } `;
  });

  // remove the trailing separator from every paragraph, the last one included
  results.forEach((groupedParagraph) => {
    groupedParagraph.text = groupedParagraph.text.trim();
  });

  return results;
}

/**
* Helper functions
*/

/**
 * Looks up the segment whose time range contains a given word.
 * A segment spans from its `start` to `start + duration`; a word is contained
 * when its `start` and `end` both fall inside that range (bounds included).
 * When several segments qualify the first one in the list wins.
 * When none qualifies a placeholder segment with an `UKN` speaker is returned.
 * @param {object} word - word object with `start` and `end` attributes
 * @param {array} segments - list of segment objects with `start` and `duration` attributes
 * @return {object} - the matching segment, or the placeholder segment
 */
function findSegmentForWord(word, segments) {
  for (const segment of segments) {
    const segmentEnd = segment.start + segment.duration;

    if (word.start >= segment.start && word.end <= segmentEnd) {
      // first segment containing the word, no need to look any further
      return segment;
    }
  }

  // orphan word not belonging to any segment: hand back an unknown speaker.
  // Both speaker id and gender are kept, as the two are combined later on
  // to format the speaker label.
  return {
    '@type': 'Segment',
    speaker: { '@id': 'UKN', gender: 'U' }
  };
}

/**
 * Builds a speaker label string out of a kaldi speaker object,
 * by joining its gender and its id with an underscore, eg `F_S1`.
 * @param {object} speaker - BBC kaldi speaker object, with `gender` and `@id` attributes
 * @return {string} - label in the form `<gender>_<@id>`
 */
function formatSpeakerName(speaker) {
  const { gender, '@id': id } = speaker;

  return `${ gender }_${ id }`;
}

export default groupWordsInParagraphsBySpeakers;
25 changes: 25 additions & 0 deletions src/lib/Util/adapters/bbc-kaldi/groups-words-by-speakers.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import groupWordsInParagraphsBySpeakers from './group-words-by-speakers';

import kaldiTedTalkTranscript from './sample/bbcKaldiTranscriptWithSpeakerSegments.sample.json';

const segmentation = kaldiTedTalkTranscript.retval.segmentation;
const words = kaldiTedTalkTranscript.retval.words;

describe('groupWordsInParagraphsBySpeakers', () => {
  /**
   * It is hard to verify that the segmentation algorithm assigns every word
   * to the right paragraph. As a basic sanity check for now, make sure that
   * no word is lost or duplicated: the paragraphs together must hold exactly
   * as many words as the input.
   */
  it('Expect same word count in results', () => {
    const paragraphs = groupWordsInParagraphsBySpeakers(words, segmentation);

    // sum up the number of words held by each paragraph
    const totalWordCount = paragraphs.reduce(
      (runningTotal, paragraph) => runningTotal + paragraph.words.length,
      0
    );

    expect(totalWordCount).toBe(words.length);
  });
});
59 changes: 40 additions & 19 deletions src/lib/Util/adapters/bbc-kaldi/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,23 @@
*/

import generateEntitiesRanges from '../generate-entities-ranges/index.js';

import groupWordsInParagraphsBySpeakers from './group-words-by-speakers.js';
/**
* groups words list from kaldi transcript based on punctuation.
* @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
* @param {array} words - array of word objects from kaldi transcript
*/

const groupWordsInParagraphs = (words) => {
const groupWordsInParagraphs = words => {
const results = [];
let paragraph = { words: [], text: [] };

words.forEach((word) => {
words.forEach(word => {
// if word contains punctuation
if (/[.?!]/.test(word.punct)) {
paragraph.words.push(word);
paragraph.text.push(word.punct);
paragraph.text = paragraph.text.join(' ');
results.push(paragraph);
// reset paragraph
paragraph = { words: [], text: [] };
Expand All @@ -33,35 +34,55 @@ const groupWordsInParagraphs = (words) => {
return results;
};

const bbcKaldiToDraft = (bbcKaldiJson) => {
const bbcKaldiToDraft = bbcKaldiJson => {
const results = [];
let tmpWords;
let speakerSegmentation = null;
let wordsByParagraphs = [];

// BBC Octo Labs API Response wraps Kaldi response around retval,
// while kaldi contains word attribute at root
if (bbcKaldiJson.retval !== undefined) {
tmpWords = bbcKaldiJson.retval.words;
if (bbcKaldiJson.retval.segmentation !== undefined) {
speakerSegmentation = bbcKaldiJson.retval.segmentation;
}
} else {
tmpWords = bbcKaldiJson.words;
if (bbcKaldiJson.segmentation !== undefined) {
speakerSegmentation = bbcKaldiJson.segmentation;
}
}

const wordsByParagraphs = groupWordsInParagraphs(tmpWords);
if (speakerSegmentation === null) {
wordsByParagraphs = groupWordsInParagraphs(tmpWords);
} else {
wordsByParagraphs = groupWordsInParagraphsBySpeakers(tmpWords, speakerSegmentation);
}

wordsByParagraphs.forEach((paragraph, i) => {
const draftJsContentBlockParagraph = {
text: paragraph.text.join(' '),
type: 'paragraph',
data: {
speaker: `TBC ${ i }`,
words: paragraph.words,
start: paragraph.words[0].start
},
// the entities as ranges are each word in the space-joined text,
// so it needs to be compute for each the offset from the beginning of the paragraph and the length
entityRanges: generateEntitiesRanges(paragraph.words, 'punct'), // wordAttributeName
};
// console.log(JSON.stringify(draftJsContentBlockParagraph,null,2))
results.push(draftJsContentBlockParagraph);
// if paragraph contain words
// eg sometimes the speaker segmentation might not contain words :man-shrugging:
if (paragraph.words[0] !== undefined) {
let speakerLabel = `TBC ${ i }`;
if (speakerSegmentation !== null) {
speakerLabel = paragraph.speaker;
}

const draftJsContentBlockParagraph = {
text: paragraph.text,
type: 'paragraph',
data: {
speaker: speakerLabel,
words: paragraph.words,
start: paragraph.words[0].start
},
// the entities as ranges are each word in the space-joined text,
// so it needs to be compute for each the offset from the beginning of the paragraph and the length
entityRanges: generateEntitiesRanges(paragraph.words, 'punct') // wordAttributeName
};
results.push(draftJsContentBlockParagraph);
}
});

return results;
Expand Down
Loading