Skip to content

Commit 27bb7c0

Browse files
authored
Stt adapter IBM - andrew d anderson (#123)
* Add IBM Watson TTS Json import support. * Fix IBM TTS index.test.js * Update src/lib/Util/adapters/ibm/index.js Co-Authored-By: AndrewDAnderson <[email protected]> * major refactor - working cleaned up the code making it more in line with the logic in the other adapter, and better setup for accommodating change over time * fixed test by updating example was missing speaker info * mend * changes from PR review
1 parent 00f2832 commit 27bb7c0

File tree

13 files changed

+55585
-178
lines changed

13 files changed

+55585
-178
lines changed

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,11 @@ import { TranscriptEditor } from "@bbc/react-transcript-editor";
8888

8989
<!-- _High level overview of system architecture_ -->
9090

91-
Uses [`create-component-lib`](https://www.npmjs.com/package/create-component-lib) as explained in this [blog post](https://hackernoon.com/creating-a-library-of-react-components-using-create-react-app-without-ejecting-d182df690c6b) to set up the environment to develop this React component.
91+
Uses [`create-component-lib`](https://www.npmjs.com/package/create-component-lib) as explained in this [blog post](https://hackernoon.com/creating-a-library-of-react-components-using-create-react-app-without-ejecting-d182df690c6b) to set up the environment to develop this React component.
9292

9393
This uses [Create React App 2.0](https://reactjs.org/blog/2018/10/01/create-react-app-v2.html) so we are using [CSS Modules](https://github.com/css-modules/css-modules) to contain the scope of the css for this component.
94+
<!--
95+
Uses CSS grid-layout https://medium.com/samsung-internet-dev/common-responsive-layouts-with-css-grid-and-some-without-245a862f48df -->
9496

9597
> Place everything you want to publish to npm inside `src/lib`.
9698
@@ -175,7 +177,7 @@ See [CONTRIBUTING.md](./CONTRIBUTING.md) guidelines and [CODE_OF_CONDUCT.md](./C
175177

176178
## Licence
177179

178-
See [LICENCE.md](./LICENCE.md)
180+
<!-- mention MIT Licence -->
179181

180182
## Legal Disclaimer
181183

docs/features-list.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,10 @@ Import Transcript Json - Adapters
5656
- [x] News Labs API - BBC Kaldi
5757
- [x] autoEdit 2
5858
- [x] AWS Transcriber
59+
- [x] IBM Watson STT
60+
- [X] Speechmatics
5961
- [ ] Gentle Transcription
6062
- [ ] Gentle Alignment Json
61-
- [ ] IBM Watson STT
62-
- [X] Speechmatics
6363
- [ ] AssemblyAI
6464
- [ ] Rev
6565
- [ ] Srt

src/lib/Util/adapters/amazon-transcribe/index.js

Lines changed: 29 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,41 @@
1-
import generateEntitiesRanges from '../generate-entities-ranges/index.js';
2-
31
/**
4-
* Helper function to generate draft.js entities,
5-
* see unit test for example data structure
6-
* it adds offset and length to recognise word in draftjs
2+
* Converts AWS Transcribe Json to DraftJs
3+
* see `sample` folder for example of input and output as well as `example-usage.js`
74
*/
85

6+
import generateEntitiesRanges from '../generate-entities-ranges/index.js';
7+
8+
export const stripLeadingSpace = word => {
9+
return word.replace(/^\s/, '');
10+
};
11+
912
/**
1013
* @param {json} words - List of words
1114
* @param {string} wordAttributeName - eg 'punct' or 'text' or etc.
1215
* attribute for the word object containing the text, e.g. word = { punct: 'hello', ... }
1316
* or e.g. word = { text: 'hello', ... }
1417
*/
15-
16-
export const getBestAlternativeForWord = (word) => {
18+
export const getBestAlternativeForWord = word => {
1719
if (/punctuation/.test(word.type)) {
1820
return Object.assign(word.alternatives[0], { confidence: 1 }); //Transcribe doesn't provide a confidence for punctuation
1921
}
20-
const wordWithHighestConfidence = word.alternatives.reduce(function(prev, current) {
21-
return (parseFloat(prev.confidence) > parseFloat(current.confidence)) ? prev : current;
22+
const wordWithHighestConfidence = word.alternatives.reduce(function(
23+
prev,
24+
current
25+
) {
26+
return parseFloat(prev.confidence) > parseFloat(current.confidence)
27+
? prev
28+
: current;
2229
});
2330

2431
return wordWithHighestConfidence;
2532
};
2633

2734
/**
28-
Normalizes words so they can be used in
29-
the generic generateEntitiesRanges() method
30-
**/
31-
32-
const normalizeWord = (currentWord, previousWord) => {
35+
* Normalizes words so they can be used in
36+
* the generic generateEntitiesRanges() method
37+
**/
38+
const normalizeWord = currentWord => {
3339
const bestAlternative = getBestAlternativeForWord(currentWord);
3440

3541
return {
@@ -52,7 +58,7 @@ export const appendPunctuationToPreviousWord = (punctuation, previousWord) => {
5258
};
5359
};
5460

55-
export const mapPunctuationItemsToWords = (words) => {
61+
export const mapPunctuationItemsToWords = words => {
5662
const itemsToRemove = [];
5763
const dirtyArray = words.map((word, index) => {
5864
let previousWord = {};
@@ -61,8 +67,7 @@ export const mapPunctuationItemsToWords = (words) => {
6167
previousWord = words[index - 1];
6268

6369
return appendPunctuationToPreviousWord(word, previousWord);
64-
}
65-
else {
70+
} else {
6671
return word;
6772
}
6873
});
@@ -72,17 +77,12 @@ export const mapPunctuationItemsToWords = (words) => {
7277
});
7378
};
7479

75-
export const stripLeadingSpace = (word) => {
76-
return word.replace(/^\s/, '');
77-
};
78-
7980
/**
8081
* groups words list from amazon transcribe transcript based on punctuation.
8182
* @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
82-
* @param {array} words - array of words opbjects from kaldi transcript
83+
* @param {array} words - array of word objects from amazon transcribe transcript
8384
*/
84-
85-
const groupWordsInParagraphs = (words) => {
85+
const groupWordsInParagraphs = words => {
8686
const results = [];
8787
let paragraph = {
8888
words: [],
@@ -106,11 +106,13 @@ const groupWordsInParagraphs = (words) => {
106106
return results;
107107
};
108108

109-
const amazonTranscribeToDraft = (amazonTranscribeJson) => {
109+
const amazonTranscribeToDraft = amazonTranscribeJson => {
110110
const results = [];
111111
const tmpWords = amazonTranscribeJson.results.items;
112112
const wordsWithRemappedPunctuation = mapPunctuationItemsToWords(tmpWords);
113-
const wordsByParagraphs = groupWordsInParagraphs(wordsWithRemappedPunctuation);
113+
const wordsByParagraphs = groupWordsInParagraphs(
114+
wordsWithRemappedPunctuation
115+
);
114116
wordsByParagraphs.forEach((paragraph, i) => {
115117
const draftJsContentBlockParagraph = {
116118
text: paragraph.text.join(' '),
@@ -122,7 +124,7 @@ const amazonTranscribeToDraft = (amazonTranscribeJson) => {
122124
},
123125
// the entities as ranges are each word in the space-joined text,
124126
// so for each word we need to compute its offset from the beginning of the paragraph and its length
125-
entityRanges: generateEntitiesRanges(paragraph.words, 'text'), // wordAttributeName
127+
entityRanges: generateEntitiesRanges(paragraph.words, 'text') // wordAttributeName
126128
};
127129
results.push(draftJsContentBlockParagraph);
128130
});

src/lib/Util/adapters/autoEdit2/index.js

Lines changed: 2 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,6 @@
11
/**
2-
* Convert autoEdit2 Json
3-
*
4-
* into
5-
*
6-
```
7-
const blocks = [
8-
{
9-
text: 'Hello',
10-
type: 'paragraph',
11-
data: {
12-
speaker: 'Foo',
13-
},
14-
entityRanges: [],
15-
},
16-
{
17-
text: 'World',
18-
type: 'paragraph',
19-
data: {
20-
speaker: 'Bar',
21-
},
22-
entityRanges: [],
23-
},
24-
];
25-
```
26-
*
27-
* See samples folder and test file
28-
* for reference data structures
2+
* Convert autoEdit2 Json to draftJS
3+
* see `sample` folder for example of input and output as well as `example-usage.js`
294
*/
305

316
import generateEntitiesRanges from '../generate-entities-ranges/index';

src/lib/Util/adapters/bbc-kaldi/index.js

Lines changed: 2 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -1,118 +1,6 @@
11
/**
2-
* Convert BBC Kaldi json
3-
```
4-
{
5-
"action": "audio-transcribe",
6-
"retval": {
7-
"status": true,
8-
"wonid": "octo:2692ea33-d595-41d8-bfd5-aa7f2d2f89ee",
9-
"punct": "There is a day. About ten years ago when ...",
10-
"words": [
11-
{
12-
"start": 13.02,
13-
"confidence": 0.68,
14-
"end": 13.17,
15-
"word": "there",
16-
"punct": "There",
17-
"index": 0
18-
},
19-
{
20-
"start": 13.17,
21-
"confidence": 0.61,
22-
"end": 13.38,
23-
"word": "is",
24-
"punct": "is",
25-
"index": 1
26-
},
27-
...
28-
```
29-
*
30-
* into
31-
*
32-
```
33-
const blocks = [
34-
{
35-
"text": "There is a day.",
36-
"type": "paragraph",
37-
"data": {
38-
"speaker": "TBC 0",
39-
"words": [
40-
{
41-
"start": 13.02,
42-
"confidence": 0.68,
43-
"end": 13.17,
44-
"word": "there",
45-
"punct": "There",
46-
"index": 0
47-
},
48-
{
49-
"start": 13.17,
50-
"confidence": 0.61,
51-
"end": 13.38,
52-
"word": "is",
53-
"punct": "is",
54-
"index": 1
55-
},
56-
{
57-
"start": 13.38,
58-
"confidence": 0.99,
59-
"end": 13.44,
60-
"word": "a",
61-
"punct": "a",
62-
"index": 2
63-
},
64-
{
65-
"start": 13.44,
66-
"confidence": 1,
67-
"end": 13.86,
68-
"word": "day",
69-
"punct": "day.",
70-
"index": 3
71-
}
72-
],
73-
"start": 13.02
74-
},
75-
"entityRanges": [
76-
{
77-
"start": 13.02,
78-
"end": 13.17,
79-
"confidence": 0.68,
80-
"text": "There",
81-
"offset": 0,
82-
"length": 5,
83-
"key": "li6c6ld"
84-
},
85-
{
86-
"start": 13.17,
87-
"end": 13.38,
88-
"confidence": 0.61,
89-
"text": "is",
90-
"offset": 6,
91-
"length": 2,
92-
"key": "pcgzkp6"
93-
},
94-
{
95-
"start": 13.38,
96-
"end": 13.44,
97-
"confidence": 0.99,
98-
"text": "a",
99-
"offset": 9,
100-
"length": 1,
101-
"key": "ngomd9"
102-
},
103-
{
104-
"start": 13.44,
105-
"end": 13.86,
106-
"confidence": 1,
107-
"text": "day.",
108-
"offset": 11,
109-
"length": 4,
110-
"key": "sgmfl4f"
111-
}
112-
]
113-
},
114-
...
115-
```
2+
* Convert BBC Kaldi json to draftJs
3+
* see `sample` folder for example of input and output as well as `example-usage.js`
1164
*
1175
*/
1186

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
import ibmToDraft from './index.js';
2+
import ibmTedTalkTranscript from './sample/ibmTedTalkTranscript.sample.json';
3+
4+
const result = ibmToDraft(ibmTedTalkTranscript);
5+
6+
console.log(JSON.stringify(result, null, 2));

0 commit comments

Comments
 (0)