Skip to content

Commit 25c69f8

Browse files
authored
Kaldi bbc speakers segment (#93)
* speaker segments * added transcript speaker info sample json * Removed conflict * setup the function for segmenation not working, needs re-thinking how to break words into segments * first pass at speaker segmentation * refactor segmentation * Added test for group words by speaker segments counts if word count is preserved * light refactor * fixed PR requested changes * eslint * Save * Removed example stt json as they are now contained in the adapters
1 parent 27bb7c0 commit 25c69f8

11 files changed

+30005
-26273
lines changed

.eslintrc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
"jest": true
1414
},
1515
"rules": {
16+
"prefer-const": 1,
17+
"space-infix-ops": ["error", {"int32Hint": false}],
1618
"no-unused-expressions": "error",
1719
"no-trailing-spaces": "error",
1820
"no-nested-ternary": "error",
@@ -28,7 +30,6 @@
2830
"quotes": [ 1, "single", "avoid-escape" ],
2931
"no-use-before-define": [ 2, { "functions": false } ],
3032
"semi": [1, "always"],
31-
"prefer-const": 1,
3233
"react/prefer-es6-class": 0,
3334
"react/jsx-filename-extension": 0,
3435
"react/jsx-curly-spacing": [ 2, "always" ],

src/index.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ import { render } from 'react-dom';
33

44
import { TranscriptEditor } from './lib';
55

6-
import kaldiTedTalkTranscript from './sample-data/KateDarling_2018S-bbc-kaldi.json';
6+
// import kaldiTedTalkTranscript from './sample-data/KateDarling_2018S-bbc-kaldi.json';
7+
import kaldiTedTalkTranscript from './sample-data/KateDarling-bbcKaldiTranscriptWithSpeakerSegments.json';
78
import style from './index.module.css';
89
import SttTypeSelect from './select-stt-json-type';
910
import ExportFormatSelect from './select-export-format';
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
/**
 * Groups a flat list of words into paragraphs, one per consecutive run of
 * words attributed to the same speaker.
 *
 * Edge cases:
 * - more segments than words - not an issue, because matching starts from the
 *   words and looks up a segment for each of them, with a fallback for the
 *   case where no segment matches.
 * - more words than segments - orphan words.
 *
 * @param {array} words - list of word objects
 * @param {object} segments - segmentation object, its `segments` attribute is the list of speaker segments
 * @return {array} - list of paragraphs, as returned by `groupWordsBySpeaker`
 */
function groupWordsInParagraphsBySpeakers(words, segments) {
  // first label every word with its speaker,
  // then split the labelled list sequentially into one paragraph per speaker run
  return groupWordsBySpeaker(addSpeakerToEachWord(words, segments.segments));
}
15+
16+
/**
 * Labels every word with a speaker.
 * Each word object is modified in place: its `speaker` attribute is set to the
 * formatted name of the speaker of the segment found for that word.
 * Words that belong to no segment get the unknown label `U_UKN`.
 * @param {array} words - list of word objects
 * @param {array} segments - list of segment objects
 * @return {array} - new list holding the same, now labelled, word objects
 */
function addSpeakerToEachWord(words, segments) {
  const labelledWords = [];

  words.forEach((currentWord) => {
    const { speaker } = findSegmentForWord(currentWord, segments);
    currentWord.speaker = formatSpeakerName(speaker);
    labelledWords.push(currentWord);
  });

  return labelledWords;
}
33+
34+
/**
 * Groups words into paragraphs by their `speaker` attribute.
 * A new paragraph starts every time the speaker changes from one word to the next.
 * @param {array} wordsWithSpeakers - same as kaldi words list but with a `speaker` label attribute on each word
 * @return {array} - list of paragraph objects, with words, text and speaker attributes,
 * where words is an array, text is the space separated `punct` of the words
 * and speaker is the label shared by the words. Empty list if there are no words.
 */
function groupWordsBySpeaker(wordsWithSpeakers) {
  const results = [];
  let paragraph = null;

  const closeParagraph = () => {
    // remove the trailing space left by the word concatenation
    paragraph.text = paragraph.text.trim();
    results.push(paragraph);
  };

  wordsWithSpeakers.forEach((word) => {
    const isNewSpeaker = paragraph === null || paragraph.speaker !== word.speaker;
    if (isNewSpeaker) {
      // save previous paragraph, if any
      if (paragraph !== null) {
        closeParagraph();
      }
      // the new paragraph takes the speaker of its first word straight away,
      // so that a paragraph made of a single word is labelled correctly
      paragraph = { words: [], text: '', speaker: word.speaker };
    }
    paragraph.words.push(word);
    paragraph.text += `${ word.punct } `;
  });

  // add last paragraph; there is none when the words list is empty
  if (paragraph !== null) {
    closeParagraph();
  }

  return results;
}
71+
72+
/**
73+
* Helper functions
74+
*/
75+
76+
/**
 * Given a word's start and end time attributes,
 * looks for the segment whose time range contains that word.
 * A segment ranges from its `start` to its `start` plus its `duration`.
 * If no segment contains the word, or if there is no list of segments at all,
 * it returns a segment with `UKN` speaker attributes.
 * @param {object} word - word object
 * @param {array} segments - list of segments objects
 * @return {object} - a single segment whose range contains the word
 */
function findSegmentForWord(word, segments) {
  // a missing list of segments is treated like an empty one,
  // so that every word ends up as an orphan word instead of throwing
  const candidates = Array.isArray(segments) ? segments : [];

  // find returns the first element that matches the criteria
  // or undefined if there are no matches
  const match = candidates.find((seg) => {
    const segEnd = seg.start + seg.duration;

    return word.start >= seg.start && word.end <= segEnd;
  });

  if (match !== undefined) {
    return match;
  }

  // covering edge case orphan word not belonging to any segments
  // adding UKN speaker label
  return {
    '@type': 'Segment',
    // keeping both speaker id and gender as this is used later
    // to format speaker label combining the two
    speaker: { '@id': 'UKN', gender: 'U' }
  };
}
107+
108+
/**
 * Formats a kaldi speaker object into a string,
 * combining gender and speaker id as `<gender>_<id>`.
 * A missing speaker object, gender or id falls back to the unknown
 * values `U` and `UKN`, giving `U_UKN` when nothing is known.
 * @param {object} speaker - BBC kaldi speaker object, with `gender` and `@id` attributes
 * @return {string} - speaker label, eg `F_S1`
 */
function formatSpeakerName(speaker) {
  const isMissing = (value) => value === undefined || value === null;
  // a segment without speaker info is handled like an unknown speaker
  const details = isMissing(speaker) ? {} : speaker;
  const gender = isMissing(details.gender) ? 'U' : details.gender;
  const id = isMissing(details['@id']) ? 'UKN' : details['@id'];

  return `${ gender }_${ id }`;
}
117+
118+
export default groupWordsInParagraphsBySpeakers;
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import groupWordsInParagraphsBySpeakers from './group-words-by-speakers';
2+
3+
import kaldiTedTalkTranscript from './sample/bbcKaldiTranscriptWithSpeakerSegments.sample.json';
4+
5+
const { segmentation, words } = kaldiTedTalkTranscript.retval;

describe('groupWordsInParagraphsBySpeakers', () => {
  /**
   * It is hard to test whether the segmentation algorithm is working properly,
   * so for now one basic test checks that the results contain
   * the same number of words as the input.
   */
  it('Expect same word count in results', () => {
    const paragraphs = groupWordsInParagraphsBySpeakers(words, segmentation);
    // adds the number of words of a paragraph to the running total
    const countWords = (total, paragraph) => total + paragraph.words.length;

    expect(paragraphs.reduce(countWords, 0)).toBe(words.length);
  });
});

src/lib/Util/adapters/bbc-kaldi/index.js

Lines changed: 40 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,23 @@
55
*/
66

77
import generateEntitiesRanges from '../generate-entities-ranges/index.js';
8-
8+
import groupWordsInParagraphsBySpeakers from './group-words-by-speakers.js';
99
/**
1010
* groups words list from kaldi transcript based on punctuation.
1111
* @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
1212
* @param {array} words - array of word objects from kaldi transcript
1313
*/
1414

15-
const groupWordsInParagraphs = (words) => {
15+
const groupWordsInParagraphs = words => {
1616
const results = [];
1717
let paragraph = { words: [], text: [] };
1818

19-
words.forEach((word) => {
19+
words.forEach(word => {
2020
// if word contains punctuation
2121
if (/[.?!]/.test(word.punct)) {
2222
paragraph.words.push(word);
2323
paragraph.text.push(word.punct);
24+
paragraph.text = paragraph.text.join(' ');
2425
results.push(paragraph);
2526
// reset paragraph
2627
paragraph = { words: [], text: [] };
@@ -33,35 +34,55 @@ const groupWordsInParagraphs = (words) => {
3334
return results;
3435
};
3536

36-
const bbcKaldiToDraft = (bbcKaldiJson) => {
37+
const bbcKaldiToDraft = bbcKaldiJson => {
3738
const results = [];
3839
let tmpWords;
40+
let speakerSegmentation = null;
41+
let wordsByParagraphs = [];
3942

4043
// BBC Octo Labs API Response wraps Kaldi response around retval,
4144
// while kaldi contains word attribute at root
4245
if (bbcKaldiJson.retval !== undefined) {
4346
tmpWords = bbcKaldiJson.retval.words;
47+
if (bbcKaldiJson.retval.segmentation !== undefined) {
48+
speakerSegmentation = bbcKaldiJson.retval.segmentation;
49+
}
4450
} else {
4551
tmpWords = bbcKaldiJson.words;
52+
if (bbcKaldiJson.segmentation !== undefined) {
53+
speakerSegmentation = bbcKaldiJson.segmentation;
54+
}
4655
}
4756

48-
const wordsByParagraphs = groupWordsInParagraphs(tmpWords);
57+
if (speakerSegmentation === null) {
58+
wordsByParagraphs = groupWordsInParagraphs(tmpWords);
59+
} else {
60+
wordsByParagraphs = groupWordsInParagraphsBySpeakers(tmpWords, speakerSegmentation);
61+
}
4962

5063
wordsByParagraphs.forEach((paragraph, i) => {
51-
const draftJsContentBlockParagraph = {
52-
text: paragraph.text.join(' '),
53-
type: 'paragraph',
54-
data: {
55-
speaker: `TBC ${ i }`,
56-
words: paragraph.words,
57-
start: paragraph.words[0].start
58-
},
59-
// the entities as ranges are each word in the space-joined text,
60-
// so it needs to be compute for each the offset from the beginning of the paragraph and the length
61-
entityRanges: generateEntitiesRanges(paragraph.words, 'punct'), // wordAttributeName
62-
};
63-
// console.log(JSON.stringify(draftJsContentBlockParagraph,null,2))
64-
results.push(draftJsContentBlockParagraph);
64+
// if paragraph contain words
65+
// eg sometimes the speaker segmentation might not contain words :man-shrugging:
66+
if (paragraph.words[0] !== undefined) {
67+
let speakerLabel = `TBC ${ i }`;
68+
if (speakerSegmentation !== null) {
69+
speakerLabel = paragraph.speaker;
70+
}
71+
72+
const draftJsContentBlockParagraph = {
73+
text: paragraph.text,
74+
type: 'paragraph',
75+
data: {
76+
speaker: speakerLabel,
77+
words: paragraph.words,
78+
start: paragraph.words[0].start
79+
},
80+
// the entities as ranges are each word in the space-joined text,
81+
// so it needs to be compute for each the offset from the beginning of the paragraph and the length
82+
entityRanges: generateEntitiesRanges(paragraph.words, 'punct') // wordAttributeName
83+
};
84+
results.push(draftJsContentBlockParagraph);
85+
}
6586
});
6687

6788
return results;

0 commit comments

Comments
 (0)