27
27
import com .google .cloud .speech .v1p1beta1 .RecognitionMetadata .MicrophoneDistance ;
28
28
import com .google .cloud .speech .v1p1beta1 .RecognitionMetadata .RecordingDeviceType ;
29
29
import com .google .cloud .speech .v1p1beta1 .RecognizeResponse ;
30
+ import com .google .cloud .speech .v1p1beta1 .SpeakerDiarizationConfig ;
30
31
import com .google .cloud .speech .v1p1beta1 .SpeechClient ;
32
+
31
33
import com .google .cloud .speech .v1p1beta1 .SpeechRecognitionAlternative ;
32
34
import com .google .cloud .speech .v1p1beta1 .SpeechRecognitionResult ;
35
+ import com .google .cloud .speech .v1p1beta1 .WordInfo ;
33
36
import com .google .protobuf .ByteString ;
34
37
35
38
import java .nio .file .Files ;
36
39
import java .nio .file .Path ;
37
40
import java .nio .file .Paths ;
38
41
import java .util .ArrayList ;
39
- import java .util .List ;
40
42
41
43
public class Recognize {
42
44
@@ -154,32 +156,52 @@ public static void transcribeDiarization(String fileName) throws Exception {
154
156
RecognitionAudio recognitionAudio =
155
157
RecognitionAudio .newBuilder ().setContent (ByteString .copyFrom (content )).build ();
156
158
159
+ SpeakerDiarizationConfig speakerDiarizationConfig = SpeakerDiarizationConfig .newBuilder ()
160
+ .setEnableSpeakerDiarization (true )
161
+ .setMinSpeakerCount (2 )
162
+ .setMaxSpeakerCount (2 )
163
+ .build ();
164
+
157
165
// Configure request to enable Speaker diarization
158
- RecognitionConfig config =
159
- RecognitionConfig .newBuilder ()
166
+ RecognitionConfig config = RecognitionConfig .newBuilder ()
160
167
.setEncoding (AudioEncoding .LINEAR16 )
161
168
.setLanguageCode ("en-US" )
162
169
.setSampleRateHertz (8000 )
163
- .setEnableSpeakerDiarization (true )
164
- .setDiarizationSpeakerCount (2 )
170
+ .setDiarizationConfig (speakerDiarizationConfig )
165
171
.build ();
166
172
167
173
// Perform the transcription request
168
174
RecognizeResponse recognizeResponse = speechClient .recognize (config , recognitionAudio );
169
175
170
- // Print out the results
171
- for (SpeechRecognitionResult result : recognizeResponse .getResultsList ()) {
172
- // There can be several alternative transcripts for a given chunk of speech. Just
173
- // use the first (most likely) one here.
174
- SpeechRecognitionAlternative alternative = result .getAlternatives (0 );
175
- System .out .format ("Transcript : %s\n " , alternative .getTranscript ());
176
- // The words array contains the entire transcript up until that point.
177
- // Referencing the last spoken word to get the associated Speaker tag
178
- System .out .format (
179
- "Speaker Tag %s: %s\n " ,
180
- alternative .getWords ((alternative .getWordsCount () - 1 )).getSpeakerTag (),
181
- alternative .getTranscript ());
176
+ // Speaker Tags are only included in the last result object, which has only one alternative.
177
+ SpeechRecognitionAlternative alternative =
178
+ recognizeResponse .getResults (
179
+ recognizeResponse .getResultsCount () - 1 ).getAlternatives (0 );
180
+
181
+ // The alternative is made up of WordInfo objects that contain the speaker_tag.
182
+ WordInfo wordInfo = alternative .getWords (0 );
183
+ int currentSpeakerTag = wordInfo .getSpeakerTag ();
184
+
185
+ // For each word, get all the words associated with one speaker; once the speaker changes,
186
+ // add a new line with the new speaker and their spoken words.
187
+ StringBuilder speakerWords = new StringBuilder (
188
+ String .format ("Speaker %d: %s" , wordInfo .getSpeakerTag (), wordInfo .getWord ()));
189
+
190
+ for (int i = 1 ; i < alternative .getWordsCount (); i ++) {
191
+ wordInfo = alternative .getWords (i );
192
+ if (currentSpeakerTag == wordInfo .getSpeakerTag ()) {
193
+ speakerWords .append (" " );
194
+ speakerWords .append (wordInfo .getWord ());
195
+ } else {
196
+ speakerWords .append (
197
+ String .format ("\n Speaker %d: %s" ,
198
+ wordInfo .getSpeakerTag (),
199
+ wordInfo .getWord ()));
200
+ currentSpeakerTag = wordInfo .getSpeakerTag ();
201
+ }
182
202
}
203
+
204
+ System .out .println (speakerWords .toString ());
183
205
}
184
206
}
185
207
// [END speech_transcribe_diarization_beta]
@@ -192,14 +214,19 @@ public static void transcribeDiarization(String fileName) throws Exception {
192
214
*/
193
215
public static void transcribeDiarizationGcs (String gcsUri ) throws Exception {
194
216
try (SpeechClient speechClient = SpeechClient .create ()) {
217
+ SpeakerDiarizationConfig speakerDiarizationConfig = SpeakerDiarizationConfig .newBuilder ()
218
+ .setEnableSpeakerDiarization (true )
219
+ .setMinSpeakerCount (2 )
220
+ .setMaxSpeakerCount (2 )
221
+ .build ();
222
+
195
223
// Configure request to enable Speaker diarization
196
224
RecognitionConfig config =
197
225
RecognitionConfig .newBuilder ()
198
226
.setEncoding (AudioEncoding .LINEAR16 )
199
227
.setLanguageCode ("en-US" )
200
228
.setSampleRateHertz (8000 )
201
- .setEnableSpeakerDiarization (true )
202
- .setDiarizationSpeakerCount (2 )
229
+ .setDiarizationConfig (speakerDiarizationConfig )
203
230
.build ();
204
231
205
232
// Set the remote path for the audio file
@@ -214,17 +241,37 @@ public static void transcribeDiarizationGcs(String gcsUri) throws Exception {
214
241
Thread .sleep (10000 );
215
242
}
216
243
217
- for (SpeechRecognitionResult result : response .get ().getResultsList ()) {
218
- // There can be several alternative transcripts for a given chunk of speech. Just
219
- // use the first (most likely) one here.
220
- SpeechRecognitionAlternative alternative = result .getAlternatives (0 );
221
- // The words array contains the entire transcript up until that point.
222
- // Referencing the last spoken word to get the associated Speaker tag
223
- System .out .format (
224
- "Speaker Tag %s:%s\n " ,
225
- alternative .getWords ((alternative .getWordsCount () - 1 )).getSpeakerTag (),
226
- alternative .getTranscript ());
244
+ // Speaker Tags are only included in the last result object, which has only one alternative.
245
+ LongRunningRecognizeResponse longRunningRecognizeResponse = response .get ();
246
+ SpeechRecognitionAlternative alternative =
247
+ longRunningRecognizeResponse .getResults (
248
+ longRunningRecognizeResponse .getResultsCount () - 1 )
249
+ .getAlternatives (0 );
250
+
251
+ // The alternative is made up of WordInfo objects that contain the speaker_tag.
252
+ WordInfo wordInfo = alternative .getWords (0 );
253
+ int currentSpeakerTag = wordInfo .getSpeakerTag ();
254
+
255
+ // For each word, get all the words associated with one speaker; once the speaker changes,
256
+ // add a new line with the new speaker and their spoken words.
257
+ StringBuilder speakerWords = new StringBuilder (
258
+ String .format ("Speaker %d: %s" , wordInfo .getSpeakerTag (), wordInfo .getWord ()));
259
+
260
+ for (int i = 1 ; i < alternative .getWordsCount (); i ++) {
261
+ wordInfo = alternative .getWords (i );
262
+ if (currentSpeakerTag == wordInfo .getSpeakerTag ()) {
263
+ speakerWords .append (" " );
264
+ speakerWords .append (wordInfo .getWord ());
265
+ } else {
266
+ speakerWords .append (
267
+ String .format ("\n Speaker %d: %s" ,
268
+ wordInfo .getSpeakerTag (),
269
+ wordInfo .getWord ()));
270
+ currentSpeakerTag = wordInfo .getSpeakerTag ();
271
+ }
227
272
}
273
+
274
+ System .out .println (speakerWords .toString ());
228
275
}
229
276
}
230
277
// [END speech_transcribe_diarization_gcs_beta]
@@ -454,7 +501,7 @@ public static void transcribeWordLevelConfidenceGcs(String gcsUri) throws Except
454
501
RecognitionConfig config =
455
502
RecognitionConfig .newBuilder ()
456
503
.setEncoding (AudioEncoding .FLAC )
457
- .setSampleRateHertz (16000 )
504
+ .setSampleRateHertz (44100 )
458
505
.setLanguageCode ("en-US" )
459
506
.setEnableWordConfidence (true )
460
507
.build ();
0 commit comments