@@ -138,6 +138,17 @@ class StreamingRecognitionConfig(proto.Message):
138
138
``END_OF_SINGLE_UTTERANCE`` event and cease recognition. It
139
139
will return no more than one ``StreamingRecognitionResult``
140
140
with the ``is_final`` flag set to ``true``.
141
+
142
+ The ``single_utterance`` field can only be used with
143
+ specified models; otherwise, an error is thrown. The
144
+ ``model`` field in ``RecognitionConfig`` must be set to:
145
+
146
+ - ``command_and_search``
147
+ - ``phone_call``, together with the additional field
148
+ ``useEnhanced``\ =\ ``true``
149
+ - The ``model`` field is left undefined. In this case the
150
+ API auto-selects a model based on any other parameters
151
+ that you set in ``RecognitionConfig``.
141
152
interim_results (bool):
142
153
If ``true``, interim results (tentative hypotheses) may be
143
154
returned as they become available (these interim results are
@@ -214,7 +225,7 @@ class RecognitionConfig(proto.Message):
214
225
[SpeechContext][google.cloud.speech.v1.SpeechContext]. A
215
226
means to provide context to assist the speech recognition.
216
227
For more information, see `speech
217
- adaptation <https://cloud.google.com/speech-to-text/docs/context-strength>`__.
228
+ adaptation <https://cloud.google.com/speech-to-text/docs/adaptation>`__.
218
229
enable_word_time_offsets (bool):
219
230
If ``true``, the top result includes a list of words and the
220
231
start and end time offsets (timestamps) for those words. If
@@ -226,11 +237,7 @@ class RecognitionConfig(proto.Message):
226
237
available in select languages. Setting this for
227
238
requests in other languages has no effect at
228
239
all. The default 'false' value does not add
229
- punctuation to result hypotheses. Note: This is
230
- currently offered as an experimental service,
231
- complimentary to all users. In the future this
232
- may be exclusively available as a premium
233
- feature.
240
+ punctuation to result hypotheses.
234
241
diarization_config (google.cloud.speech_v1.types.SpeakerDiarizationConfig):
235
242
Config to enable speaker diarization and set
236
243
additional parameters to make diarization better
@@ -270,7 +277,7 @@ class RecognitionConfig(proto.Message):
270
277
</tr>
271
278
<tr>
272
279
<td><code>video</code></td>
273
- <td>Best for audio that originated from from video or includes multiple
280
+ <td>Best for audio that originated from video or includes multiple
274
281
speakers. Ideally the audio is recorded at a 16 kHz or greater
275
282
sampling rate. This is a premium model that costs more than the
276
283
standard rate.</td>
@@ -306,7 +313,7 @@ class AudioEncoding(proto.Enum):
306
313
The accuracy of the speech recognition can be reduced if lossy
307
314
codecs are used to capture or transmit audio, particularly if
308
315
background noise is present. Lossy codecs include ``MULAW``,
309
- ``AMR``, ``AMR_WB``, ``OGG_OPUS``, ``SPEEX_WITH_HEADER_BYTE``, and
316
+ ``AMR``, ``AMR_WB``, ``OGG_OPUS``, ``SPEEX_WITH_HEADER_BYTE``, and
310
317
``MP3``.
311
318
312
319
The ``FLAC`` and ``WAV`` audio file formats include a header that
@@ -370,7 +377,7 @@ class SpeakerDiarizationConfig(proto.Message):
370
377
automatically determine the correct number of
371
378
speakers. If not set, the default value is 6.
372
379
speaker_tag (int):
373
- Unused.
380
+ Output only. Unused.
374
381
"""
375
382
376
383
enable_speaker_diarization = proto .Field (proto .BOOL , number = 1 ,)
@@ -531,11 +538,17 @@ class RecognizeResponse(proto.Message):
531
538
results (Sequence[google.cloud.speech_v1.types.SpeechRecognitionResult]):
532
539
Sequential list of transcription results
533
540
corresponding to sequential portions of audio.
541
+ total_billed_time (google.protobuf.duration_pb2.Duration):
542
+ When available, billed audio seconds for the
543
+ corresponding request.
534
544
"""
535
545
536
546
results = proto .RepeatedField (
537
547
proto .MESSAGE , number = 2 , message = "SpeechRecognitionResult" ,
538
548
)
549
+ total_billed_time = proto .Field (
550
+ proto .MESSAGE , number = 3 , message = duration_pb2 .Duration ,
551
+ )
539
552
540
553
541
554
class LongRunningRecognizeResponse (proto .Message ):
@@ -550,11 +563,17 @@ class LongRunningRecognizeResponse(proto.Message):
550
563
results (Sequence[google.cloud.speech_v1.types.SpeechRecognitionResult]):
551
564
Sequential list of transcription results
552
565
corresponding to sequential portions of audio.
566
+ total_billed_time (google.protobuf.duration_pb2.Duration):
567
+ When available, billed audio seconds for the
568
+ corresponding request.
553
569
"""
554
570
555
571
results = proto .RepeatedField (
556
572
proto .MESSAGE , number = 2 , message = "SpeechRecognitionResult" ,
557
573
)
574
+ total_billed_time = proto .Field (
575
+ proto .MESSAGE , number = 3 , message = duration_pb2 .Duration ,
576
+ )
558
577
559
578
560
579
class LongRunningRecognizeMetadata (proto .Message ):
@@ -572,13 +591,18 @@ class LongRunningRecognizeMetadata(proto.Message):
572
591
Time when the request was received.
573
592
last_update_time (google.protobuf.timestamp_pb2.Timestamp):
574
593
Time of the most recent processing update.
594
+ uri (str):
595
+ Output only. The URI of the audio file being
596
+ transcribed. Empty if the audio was sent as byte
597
+ content.
575
598
"""
576
599
577
600
progress_percent = proto .Field (proto .INT32 , number = 1 ,)
578
601
start_time = proto .Field (proto .MESSAGE , number = 2 , message = timestamp_pb2 .Timestamp ,)
579
602
last_update_time = proto .Field (
580
603
proto .MESSAGE , number = 3 , message = timestamp_pb2 .Timestamp ,
581
604
)
605
+ uri = proto .Field (proto .STRING , number = 4 ,)
582
606
583
607
584
608
class StreamingRecognizeResponse (proto .Message ):
@@ -588,9 +612,8 @@ class StreamingRecognizeResponse(proto.Message):
588
612
client. If there is no recognizable audio, and ``single_utterance``
589
613
is set to false, then no messages are streamed back to the client.
590
614
591
- Here's an example of a series of ten
592
- ``StreamingRecognizeResponse``\ s that might be returned while
593
- processing audio:
615
+ Here's an example of a series of ``StreamingRecognizeResponse``\ s
616
+ that might be returned while processing audio:
594
617
595
618
1. results { alternatives { transcript: "tube" } stability: 0.01 }
596
619
@@ -648,6 +671,10 @@ class StreamingRecognizeResponse(proto.Message):
648
671
``is_final=false`` results (the interim results).
649
672
speech_event_type (google.cloud.speech_v1.types.StreamingRecognizeResponse.SpeechEventType):
650
673
Indicates the type of speech event.
674
+ total_billed_time (google.protobuf.duration_pb2.Duration):
675
+ When available, billed audio seconds for the
676
+ stream. Set only if this is the last response in
677
+ the stream.
651
678
"""
652
679
653
680
class SpeechEventType (proto .Enum ):
@@ -660,6 +687,9 @@ class SpeechEventType(proto.Enum):
660
687
proto .MESSAGE , number = 2 , message = "StreamingRecognitionResult" ,
661
688
)
662
689
speech_event_type = proto .Field (proto .ENUM , number = 4 , enum = SpeechEventType ,)
690
+ total_billed_time = proto .Field (
691
+ proto .MESSAGE , number = 5 , message = duration_pb2 .Duration ,
692
+ )
663
693
664
694
665
695
class StreamingRecognitionResult (proto .Message ):
@@ -784,12 +814,12 @@ class WordInfo(proto.Message):
784
814
The word corresponding to this set of
785
815
information.
786
816
speaker_tag (int):
787
- A distinct integer value is assigned for every speaker
788
- within the audio. This field specifies which one of those
789
- speakers was detected to have spoken this word. Value ranges
790
- from '1' to diarization_speaker_count. speaker_tag is set if
791
- enable_speaker_diarization = 'true' and only in the top
792
- alternative.
817
+ Output only. A distinct integer value is assigned for every
818
+ speaker within the audio. This field specifies which one of
819
+ those speakers was detected to have spoken this word. Value
820
+ ranges from '1' to diarization_speaker_count. speaker_tag is
821
+ set if enable_speaker_diarization = 'true' and only in the
822
+ top alternative.
793
823
"""
794
824
795
825
start_time = proto .Field (proto .MESSAGE , number = 1 , message = duration_pb2 .Duration ,)
0 commit comments