5
5
6
6
import generateEntitiesRanges from '../generate-entities-ranges/index.js' ;
7
7
8
- /**
9
- * groups words list from speechmatics based on punctuation.
10
- * @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
11
- * @todo As this function is also used in the bbc-kaldi adapter, should it be refactored into its own file?
12
- * @param {array } words - array of words objects from speechmatics transcript
13
- */
14
- const groupWordsInParagraphs = ( words ) => {
15
- const results = [ ] ;
16
- let paragraph = { words : [ ] , text : [ ] } ;
17
- debugger ;
18
-
19
- words . forEach ( ( word ) => {
20
- // if word contains punctuation
21
- if ( / [ . ? ! ] / . test ( word . punct ) ) {
22
- paragraph . words . push ( word ) ;
23
- paragraph . text . push ( word . punct ) ;
24
- results . push ( paragraph ) ;
25
- // reset paragraph
26
- paragraph = { words : [ ] , text : [ ] } ;
27
- } else {
28
- paragraph . words . push ( word ) ;
29
- paragraph . text . push ( word . punct ) ;
30
- }
31
- } ) ;
32
-
33
- return results ;
34
- } ;
35
-
36
8
/**
37
9
* Determines the speaker of a paragraph by comparing the start time of the paragraph with
38
10
* the speaker times.
@@ -51,6 +23,38 @@ const getSpeaker = (start, speakers) => {
51
23
return 'UNK' ;
52
24
} ;
53
25
26
+ /**
27
+ * Groups the words list from speechmatics into paragraphs, starting a new paragraph whenever the speaker changes.
28
+ * @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
29
+ * @todo As this function is also used in the bbc-kaldi adapter, should it be refactored into its own file?
30
+ * @param {array } words - array of words objects from speechmatics transcript
31
+ */
32
+ const groupWordsInParagraphs = ( words , speakers ) => {
33
+ const results = [ ] ;
34
+ let paragraph = { words : [ ] , text : [ ] , speaker : '' } ;
35
+ let oldSpeaker = getSpeaker ( words [ 0 ] . start , speakers ) ;
36
+ let newSpeaker ;
37
+
38
+ words . forEach ( ( word ) => {
39
+ newSpeaker = getSpeaker ( word . start , speakers ) ;
40
+ // if speaker changes
41
+ if ( newSpeaker !== oldSpeaker ) {
42
+ paragraph . speaker = oldSpeaker ;
43
+ results . push ( paragraph ) ;
44
+ oldSpeaker = newSpeaker ;
45
+ // reset paragraph
46
+ paragraph = { words : [ ] , text : [ ] } ;
47
+ }
48
+ paragraph . words . push ( word ) ;
49
+ paragraph . text . push ( word . punct ) ;
50
+ } ) ;
51
+
52
+ paragraph . speaker = oldSpeaker ;
53
+ results . push ( paragraph ) ;
54
+
55
+ return results ;
56
+ } ;
57
+
54
58
/**
55
59
* Speechmatics treats punctuation as own words. This function merges punctuations with
56
60
* the previous word and adjusts the total duration of the word.
@@ -97,15 +101,15 @@ const speechmaticsToDraft = (speechmaticsJson) => {
97
101
} ) ;
98
102
} ) ;
99
103
100
- const wordsByParagraphs = groupWordsInParagraphs ( tmpWords ) ;
104
+ const wordsByParagraphs = groupWordsInParagraphs ( tmpWords , tmpSpeakers ) ;
101
105
102
106
wordsByParagraphs . forEach ( ( paragraph ) => {
103
107
const paragraphStart = paragraph . words [ 0 ] . start ;
104
108
const draftJsContentBlockParagraph = {
105
109
text : paragraph . text . join ( ' ' ) ,
106
110
type : 'paragraph' ,
107
111
data : {
108
- speaker : getSpeaker ( paragraphStart , tmpSpeakers ) ,
112
+ speaker : paragraph . speaker ,
109
113
words : paragraph . words ,
110
114
start : paragraphStart
111
115
} ,
0 commit comments