55
66import generateEntitiesRanges from '../generate-entities-ranges/index.js' ;
77
/**
 * Groups the word list from a speechmatics transcript into paragraphs, closing a
 * paragraph after every word whose `punct` contains sentence-ending punctuation
 * (`.`, `?` or `!`).
 * @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
 * @todo As this function is also used in the bbc-kaldi adapter, should it be refactored into its own file?
 * @param {array} words - array of word objects from the speechmatics transcript; each
 * word's `punct` is tested for punctuation and collected as the paragraph text
 * @returns {array} list of `{ words, text }` paragraphs in transcript order; empty
 * when there are no words
 */
const groupWordsInParagraphs = (words) => {
  const sentenceEnd = /[.?!]/;
  const results = [];
  let paragraph = { words: [], text: [] };

  words.forEach((word) => {
    paragraph.words.push(word);
    paragraph.text.push(word.punct);

    // A word carrying sentence-ending punctuation closes the current paragraph.
    if (sentenceEnd.test(word.punct)) {
      results.push(paragraph);
      paragraph = { words: [], text: [] };
    }
  });

  // Keep trailing words that were not followed by sentence-ending punctuation;
  // without this they would be silently dropped from the output.
  if (paragraph.words.length > 0) {
    results.push(paragraph);
  }

  return results;
};
35-
368/**
379 * Determines the speaker of a paragraph by comparing the start time of the paragraph with
3810 * the speaker times.
@@ -51,6 +23,38 @@ const getSpeaker = (start, speakers) => {
5123 return 'UNK' ;
5224} ;
5325
/**
 * Groups the word list from a speechmatics transcript into paragraphs, starting a
 * new paragraph every time the speaker returned by `getSpeaker` changes.
 * @todo As this function is also used in the bbc-kaldi adapter, should it be refactored into its own file?
 * @param {array} words - array of word objects from the speechmatics transcript; each
 * word's `start` is used to look up its speaker and its `punct` is collected as the
 * paragraph text
 * @param {array} speakers - speaker data forwarded unchanged to `getSpeaker`
 * (presumably the transcript's speaker segments - confirm against `getSpeaker`)
 * @returns {array} list of `{ words, text, speaker }` paragraphs in transcript order;
 * empty when there are no words
 */
const groupWordsInParagraphs = (words, speakers) => {
  // Nothing to group: avoids dereferencing words[0] on an empty transcript.
  if (!words || words.length === 0) {
    return [];
  }

  const results = [];
  let paragraph = null;

  for (const word of words) {
    const speaker = getSpeaker(word.start, speakers);

    // Open a new paragraph on the first word and whenever the speaker changes.
    if (paragraph === null || paragraph.speaker !== speaker) {
      paragraph = { words: [], text: [], speaker };
      results.push(paragraph);
    }

    paragraph.words.push(word);
    paragraph.text.push(word.punct);
  }

  return results;
};
57+
5458/**
5559 * Speechmatics treats punctuation as own words. This function merges punctuations with
5660 * the previous word and adjusts the total duration of the word.
@@ -97,15 +101,15 @@ const speechmaticsToDraft = (speechmaticsJson) => {
97101 } ) ;
98102 } ) ;
99103
100- const wordsByParagraphs = groupWordsInParagraphs ( tmpWords ) ;
104+ const wordsByParagraphs = groupWordsInParagraphs ( tmpWords , tmpSpeakers ) ;
101105
102106 wordsByParagraphs . forEach ( ( paragraph ) => {
103107 const paragraphStart = paragraph . words [ 0 ] . start ;
104108 const draftJsContentBlockParagraph = {
105109 text : paragraph . text . join ( ' ' ) ,
106110 type : 'paragraph' ,
107111 data : {
108- speaker : getSpeaker ( paragraphStart , tmpSpeakers ) ,
112+ speaker : paragraph . speaker ,
109113 words : paragraph . words ,
110114 start : paragraphStart
111115 } ,
0 commit comments