|
| 1 | +package openai |
| 2 | + |
| 3 | +import ( |
| 4 | + "bytes" |
| 5 | + "context" |
| 6 | + "encoding/json" |
| 7 | + "io" |
| 8 | + "mime" |
| 9 | + "mime/multipart" |
| 10 | + "net/http" |
| 11 | + "net/textproto" |
| 12 | + "strings" |
| 13 | + |
| 14 | + "github.com/pkg/errors" |
| 15 | + |
| 16 | + "github.com/usememos/memos/internal/ai" |
| 17 | +) |
| 18 | + |
// transcriptionResponse is the JSON body decoded from a successful
// transcription call.
type transcriptionResponse struct {
	// Text is read from the "text" key.
	Text string `json:"text"`
	// Language is read from the "language" key.
	Language string `json:"language"`
	// Duration is read from the "duration" key.
	Duration float64 `json:"duration"`
}
| 24 | + |
// errorResponse is the JSON envelope decoded from a failed call: a single
// "error" object holding the message, type and code.
type errorResponse struct {
	Error struct {
		// Message is read from the "message" key.
		Message string `json:"message"`
		// Type is read from the "type" key.
		Type string `json:"type"`
		// Code is read from the "code" key.
		Code string `json:"code"`
	} `json:"error"`
}
| 32 | + |
| 33 | +// Transcribe transcribes audio with the /audio/transcriptions endpoint. |
| 34 | +func (t *Transcriber) Transcribe(ctx context.Context, request ai.TranscribeRequest) (*ai.TranscribeResponse, error) { |
| 35 | + if strings.TrimSpace(request.Model) == "" { |
| 36 | + return nil, errors.New("model is required") |
| 37 | + } |
| 38 | + if request.Audio == nil { |
| 39 | + return nil, errors.New("audio is required") |
| 40 | + } |
| 41 | + |
| 42 | + body := &bytes.Buffer{} |
| 43 | + writer := multipart.NewWriter(body) |
| 44 | + if err := writeAudioFilePart(writer, request); err != nil { |
| 45 | + return nil, err |
| 46 | + } |
| 47 | + if err := writer.WriteField("model", request.Model); err != nil { |
| 48 | + return nil, errors.Wrap(err, "failed to write model field") |
| 49 | + } |
| 50 | + if err := writer.WriteField("response_format", "json"); err != nil { |
| 51 | + return nil, errors.Wrap(err, "failed to write response format field") |
| 52 | + } |
| 53 | + if request.Prompt != "" { |
| 54 | + if err := writer.WriteField("prompt", request.Prompt); err != nil { |
| 55 | + return nil, errors.Wrap(err, "failed to write prompt field") |
| 56 | + } |
| 57 | + } |
| 58 | + if request.Language != "" { |
| 59 | + if err := writer.WriteField("language", request.Language); err != nil { |
| 60 | + return nil, errors.Wrap(err, "failed to write language field") |
| 61 | + } |
| 62 | + } |
| 63 | + if err := writer.Close(); err != nil { |
| 64 | + return nil, errors.Wrap(err, "failed to close multipart writer") |
| 65 | + } |
| 66 | + |
| 67 | + httpRequest, err := http.NewRequestWithContext(ctx, http.MethodPost, strings.TrimRight(t.endpoint, "/")+"/audio/transcriptions", body) |
| 68 | + if err != nil { |
| 69 | + return nil, errors.Wrap(err, "failed to create transcription request") |
| 70 | + } |
| 71 | + httpRequest.Header.Set("Authorization", "Bearer "+t.apiKey) |
| 72 | + httpRequest.Header.Set("Content-Type", writer.FormDataContentType()) |
| 73 | + |
| 74 | + httpResponse, err := t.httpClient.Do(httpRequest) |
| 75 | + if err != nil { |
| 76 | + return nil, errors.Wrap(err, "failed to send transcription request") |
| 77 | + } |
| 78 | + defer httpResponse.Body.Close() |
| 79 | + |
| 80 | + responseBody, err := io.ReadAll(httpResponse.Body) |
| 81 | + if err != nil { |
| 82 | + return nil, errors.Wrap(err, "failed to read transcription response") |
| 83 | + } |
| 84 | + if httpResponse.StatusCode < http.StatusOK || httpResponse.StatusCode >= http.StatusMultipleChoices { |
| 85 | + return nil, errors.Errorf("transcription request failed with status %d: %s", httpResponse.StatusCode, extractErrorMessage(responseBody)) |
| 86 | + } |
| 87 | + |
| 88 | + var response transcriptionResponse |
| 89 | + if err := json.Unmarshal(responseBody, &response); err != nil { |
| 90 | + return nil, errors.Wrap(err, "failed to unmarshal transcription response") |
| 91 | + } |
| 92 | + return &ai.TranscribeResponse{ |
| 93 | + Text: response.Text, |
| 94 | + Language: response.Language, |
| 95 | + Duration: response.Duration, |
| 96 | + }, nil |
| 97 | +} |
| 98 | + |
| 99 | +func writeAudioFilePart(writer *multipart.Writer, request ai.TranscribeRequest) error { |
| 100 | + filename := strings.TrimSpace(request.Filename) |
| 101 | + if filename == "" { |
| 102 | + filename = "audio" |
| 103 | + } |
| 104 | + contentType := strings.TrimSpace(request.ContentType) |
| 105 | + if contentType == "" { |
| 106 | + contentType = "application/octet-stream" |
| 107 | + } else { |
| 108 | + mediaType, _, err := mime.ParseMediaType(contentType) |
| 109 | + if err != nil { |
| 110 | + return errors.Wrap(err, "invalid audio content type") |
| 111 | + } |
| 112 | + contentType = mediaType |
| 113 | + } |
| 114 | + |
| 115 | + header := make(textproto.MIMEHeader) |
| 116 | + header.Set("Content-Disposition", mime.FormatMediaType("form-data", map[string]string{ |
| 117 | + "name": "file", |
| 118 | + "filename": sanitizeFilename(filename), |
| 119 | + })) |
| 120 | + header.Set("Content-Type", contentType) |
| 121 | + part, err := writer.CreatePart(header) |
| 122 | + if err != nil { |
| 123 | + return errors.Wrap(err, "failed to create audio file part") |
| 124 | + } |
| 125 | + if _, err := io.Copy(part, request.Audio); err != nil { |
| 126 | + return errors.Wrap(err, "failed to write audio file part") |
| 127 | + } |
| 128 | + return nil |
| 129 | +} |
| 130 | + |
| 131 | +func extractErrorMessage(responseBody []byte) string { |
| 132 | + var response errorResponse |
| 133 | + if err := json.Unmarshal(responseBody, &response); err == nil && response.Error.Message != "" { |
| 134 | + return response.Error.Message |
| 135 | + } |
| 136 | + return string(responseBody) |
| 137 | +} |
| 138 | + |
// sanitizeFilename makes filename safe to place in a single header line by
// replacing every carriage return and line feed with an underscore. If the
// result contains only whitespace, "audio" is returned instead; otherwise the
// result is returned as is, without trimming.
func sanitizeFilename(filename string) string {
	cleaned := strings.ReplaceAll(filename, "\r", "_")
	cleaned = strings.ReplaceAll(cleaned, "\n", "_")
	if strings.TrimSpace(cleaned) != "" {
		return cleaned
	}
	return "audio"
}
0 commit comments