asrc_flite.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2012 Stefano Sabatini
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 /**
22  * @file
23  * flite voice synth source
24  */
25 
26 #include <flite/flite.h>
28 #include "libavutil/file.h"
29 #include "libavutil/opt.h"
30 #include "avfilter.h"
31 #include "audio.h"
32 #include "formats.h"
33 #include "internal.h"
34 
35 typedef struct {
36  const AVClass *class;
37  char *voice_str;
38  char *textfile;
39  char *text;
40  cst_wave *wave;
41  int16_t *wave_samples;
44  cst_voice *voice;
46  int64_t pts;
47  int frame_nb_samples; ///< number of samples per frame
48 } FliteContext;
49 
50 #define OFFSET(x) offsetof(FliteContext, x)
51 #define FLAGS AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
52 
53 static const AVOption flite_options[] = {
54  { "list_voices", "list voices and exit", OFFSET(list_voices), AV_OPT_TYPE_INT, {.i64=0}, 0, 1, FLAGS },
55  { "nb_samples", "set number of samples per frame", OFFSET(frame_nb_samples), AV_OPT_TYPE_INT, {.i64=512}, 0, INT_MAX, FLAGS },
56  { "n", "set number of samples per frame", OFFSET(frame_nb_samples), AV_OPT_TYPE_INT, {.i64=512}, 0, INT_MAX, FLAGS },
57  { "text", "set text to speak", OFFSET(text), AV_OPT_TYPE_STRING, {.str=NULL}, CHAR_MIN, CHAR_MAX, FLAGS },
58  { "textfile", "set filename of the text to speak", OFFSET(textfile), AV_OPT_TYPE_STRING, {.str=NULL}, CHAR_MIN, CHAR_MAX, FLAGS },
59  { "v", "set voice", OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, CHAR_MIN, CHAR_MAX, FLAGS },
60  { "voice", "set voice", OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, CHAR_MIN, CHAR_MAX, FLAGS },
61  { NULL }
62 };
63 
65 
66 static volatile int flite_inited = 0;
67 
68 /* declare functions for all the supported voices */
69 #define DECLARE_REGISTER_VOICE_FN(name) \
70  cst_voice *register_cmu_us_## name(const char *); \
71  void unregister_cmu_us_## name(cst_voice *);
77 
78 struct voice_entry {
79  const char *name;
80  cst_voice * (*register_fn)(const char *);
81  void (*unregister_fn)(cst_voice *);
82  cst_voice *voice;
83  unsigned usage_count;
84 } voice_entry;
85 
86 #define MAKE_VOICE_STRUCTURE(voice_name) { \
87  .name = #voice_name, \
88  .register_fn = register_cmu_us_ ## voice_name, \
89  .unregister_fn = unregister_cmu_us_ ## voice_name, \
90 }
91 static struct voice_entry voice_entries[] = {
94  MAKE_VOICE_STRUCTURE(kal16),
97 };
98 
99 static void list_voices(void *log_ctx, const char *sep)
100 {
101  int i, n = FF_ARRAY_ELEMS(voice_entries);
102  for (i = 0; i < n; i++)
103  av_log(log_ctx, AV_LOG_INFO, "%s%s",
104  voice_entries[i].name, i < (n-1) ? sep : "\n");
105 }
106 
107 static int select_voice(struct voice_entry **entry_ret, const char *voice_name, void *log_ctx)
108 {
109  int i;
110 
111  for (i = 0; i < FF_ARRAY_ELEMS(voice_entries); i++) {
112  struct voice_entry *entry = &voice_entries[i];
113  if (!strcmp(entry->name, voice_name)) {
114  if (!entry->voice)
115  entry->voice = entry->register_fn(NULL);
116  if (!entry->voice) {
117  av_log(log_ctx, AV_LOG_ERROR,
118  "Could not register voice '%s'\n", voice_name);
119  return AVERROR_UNKNOWN;
120  }
121  entry->usage_count++;
122  *entry_ret = entry;
123  return 0;
124  }
125  }
126 
127  av_log(log_ctx, AV_LOG_ERROR, "Could not find voice '%s'\n", voice_name);
128  av_log(log_ctx, AV_LOG_INFO, "Choose between the voices: ");
129  list_voices(log_ctx, ", ");
130 
131  return AVERROR(EINVAL);
132 }
133 
134 static av_cold int init(AVFilterContext *ctx)
135 {
136  FliteContext *flite = ctx->priv;
137  int ret = 0;
138 
139  if (flite->list_voices) {
140  list_voices(ctx, "\n");
141  return AVERROR_EXIT;
142  }
143 
144  if (!flite_inited) {
145  if (flite_init() < 0) {
146  av_log(ctx, AV_LOG_ERROR, "flite initialization failed\n");
147  return AVERROR_UNKNOWN;
148  }
149  flite_inited++;
150  }
151 
152  if ((ret = select_voice(&flite->voice_entry, flite->voice_str, ctx)) < 0)
153  return ret;
154  flite->voice = flite->voice_entry->voice;
155 
156  if (flite->textfile && flite->text) {
157  av_log(ctx, AV_LOG_ERROR,
158  "Both text and textfile options set: only one must be specified\n");
159  return AVERROR(EINVAL);
160  }
161 
162  if (flite->textfile) {
163  uint8_t *textbuf;
164  size_t textbuf_size;
165 
166  if ((ret = av_file_map(flite->textfile, &textbuf, &textbuf_size, 0, ctx)) < 0) {
167  av_log(ctx, AV_LOG_ERROR,
168  "The text file '%s' could not be read: %s\n",
169  flite->textfile, av_err2str(ret));
170  return ret;
171  }
172 
173  if (!(flite->text = av_malloc(textbuf_size+1)))
174  return AVERROR(ENOMEM);
175  memcpy(flite->text, textbuf, textbuf_size);
176  flite->text[textbuf_size] = 0;
177  av_file_unmap(textbuf, textbuf_size);
178  }
179 
180  if (!flite->text) {
181  av_log(ctx, AV_LOG_ERROR,
182  "No speech text specified, specify the 'text' or 'textfile' option\n");
183  return AVERROR(EINVAL);
184  }
185 
186  /* synth all the file data in block */
187  flite->wave = flite_text_to_wave(flite->text, flite->voice);
188  flite->wave_samples = flite->wave->samples;
189  flite->wave_nb_samples = flite->wave->num_samples;
190  return 0;
191 }
192 
193 static av_cold void uninit(AVFilterContext *ctx)
194 {
195  FliteContext *flite = ctx->priv;
196 
197  if (!--flite->voice_entry->usage_count)
198  flite->voice_entry->unregister_fn(flite->voice);
199  flite->voice = NULL;
200  flite->voice_entry = NULL;
201  delete_wave(flite->wave);
202  flite->wave = NULL;
203 }
204 
206 {
207  FliteContext *flite = ctx->priv;
208 
209  AVFilterChannelLayouts *chlayouts = NULL;
210  int64_t chlayout = av_get_default_channel_layout(flite->wave->num_channels);
211  AVFilterFormats *sample_formats = NULL;
212  AVFilterFormats *sample_rates = NULL;
213 
214  ff_add_channel_layout(&chlayouts, chlayout);
215  ff_set_common_channel_layouts(ctx, chlayouts);
216  ff_add_format(&sample_formats, AV_SAMPLE_FMT_S16);
217  ff_set_common_formats(ctx, sample_formats);
218  ff_add_format(&sample_rates, flite->wave->sample_rate);
219  ff_set_common_samplerates (ctx, sample_rates);
220 
221  return 0;
222 }
223 
224 static int config_props(AVFilterLink *outlink)
225 {
226  AVFilterContext *ctx = outlink->src;
227  FliteContext *flite = ctx->priv;
228 
229  outlink->sample_rate = flite->wave->sample_rate;
230  outlink->time_base = (AVRational){1, flite->wave->sample_rate};
231 
232  av_log(ctx, AV_LOG_VERBOSE, "voice:%s fmt:%s sample_rate:%d\n",
233  flite->voice_str,
234  av_get_sample_fmt_name(outlink->format), outlink->sample_rate);
235  return 0;
236 }
237 
238 static int request_frame(AVFilterLink *outlink)
239 {
240  AVFrame *samplesref;
241  FliteContext *flite = outlink->src->priv;
242  int nb_samples = FFMIN(flite->wave_nb_samples, flite->frame_nb_samples);
243 
244  if (!nb_samples)
245  return AVERROR_EOF;
246 
247  samplesref = ff_get_audio_buffer(outlink, nb_samples);
248  if (!samplesref)
249  return AVERROR(ENOMEM);
250 
251  memcpy(samplesref->data[0], flite->wave_samples,
252  nb_samples * flite->wave->num_channels * 2);
253  samplesref->pts = flite->pts;
254  av_frame_set_pkt_pos(samplesref, -1);
255  av_frame_set_sample_rate(samplesref, flite->wave->sample_rate);
256  flite->pts += nb_samples;
257  flite->wave_samples += nb_samples * flite->wave->num_channels;
258  flite->wave_nb_samples -= nb_samples;
259 
260  return ff_filter_frame(outlink, samplesref);
261 }
262 
263 static const AVFilterPad flite_outputs[] = {
264  {
265  .name = "default",
266  .type = AVMEDIA_TYPE_AUDIO,
267  .config_props = config_props,
268  .request_frame = request_frame,
269  },
270  { NULL }
271 };
272 
274  .name = "flite",
275  .description = NULL_IF_CONFIG_SMALL("Synthesize voice from text using libflite."),
276  .query_formats = query_formats,
277  .init = init,
278  .uninit = uninit,
279  .priv_size = sizeof(FliteContext),
280  .inputs = NULL,
281  .outputs = flite_outputs,
282  .priv_class = &flite_class,
283 };
static const AVOption flite_options[]
Definition: asrc_flite.c:53
This structure describes decoded (raw) audio or video data.
Definition: frame.h:76
#define FLAGS
Definition: asrc_flite.c:51
AVOption.
Definition: opt.h:251
static const AVFilterPad outputs[]
Definition: af_ashowinfo.c:117
external API header
static av_cold void uninit(AVFilterContext *ctx)
Definition: asrc_flite.c:193
int wave_nb_samples
Definition: asrc_flite.c:42
AVFILTER_DEFINE_CLASS(flite)
static int request_frame(AVFilterLink *outlink)
Definition: asrc_flite.c:238
#define FF_ARRAY_ELEMS(a)
signed 16 bits
Definition: samplefmt.h:52
text(-8, 1,'a)')
const char * name
Definition: asrc_flite.c:79
const char * name
Pad name.
cst_wave * wave
Definition: asrc_flite.c:40
struct voice_entry * voice_entry
Definition: asrc_flite.c:45
uint8_t
it can be given away to ff_start_frame *A reference passed to ff_filter_frame(or the deprecated ff_start_frame) is given away and must no longer be used.*A reference created with avfilter_ref_buffer belongs to the code that created it.*A reference obtained with ff_get_video_buffer or ff_get_audio_buffer belongs to the code that requested it.*A reference given as return value by the get_video_buffer or get_audio_buffer method is given away and must no longer be used.Link reference fields---------------------The AVFilterLink structure has a few AVFilterBufferRef fields.The cur_buf and out_buf were used with the deprecated start_frame/draw_slice/end_frame API and should no longer be used.src_buf
#define av_cold
Definition: attributes.h:78
int list_voices
Definition: asrc_flite.c:43
AVOptions.
static int query_formats(AVFilterContext *ctx)
Definition: asrc_flite.c:205
int64_t pts
Presentation timestamp in time_base units (time when frame should be shown to user).
Definition: frame.h:159
Misc file utilities.
#define AVERROR_EOF
End of file.
Definition: error.h:55
static void list_voices(void *log_ctx, const char *sep)
Definition: asrc_flite.c:99
#define OFFSET(x)
Definition: asrc_flite.c:50
void ff_set_common_formats(AVFilterContext *ctx, AVFilterFormats *formats)
A helper for query_formats() which sets all links to the same list of formats.
Definition: formats.c:545
char * text
Definition: asrc_flite.c:39
unsigned usage_count
Definition: asrc_flite.c:83
int frame_nb_samples
number of samples per frame
Definition: asrc_flite.c:47
A filter pad used for either input or output.
void av_file_unmap(uint8_t *bufptr, size_t size)
Unmap or free the buffer bufptr created by av_file_map().
int av_file_map(const char *filename, uint8_t **bufptr, size_t *size, int log_offset, void *log_ctx)
Read the file with name filename, and put its content in a newly allocated buffer or map it with mmap...
int ff_add_channel_layout(AVFilterChannelLayouts **l, uint64_t channel_layout)
Definition: formats.c:350
AVFrame * ff_get_audio_buffer(AVFilterLink *link, int nb_samples)
Request an audio samples buffer with a specific set of permissions.
Definition: audio.c:84
#define NULL_IF_CONFIG_SMALL(x)
Return NULL if CONFIG_SMALL is true, otherwise the argument without modification. ...
void av_frame_set_pkt_pos(AVFrame *frame, int64_t val)
void * priv
private data for use by the filter
Definition: avfilter.h:545
static struct voice_entry voice_entries[]
Definition: asrc_flite.c:91
#define MAKE_VOICE_STRUCTURE(voice_name)
Definition: asrc_flite.c:86
void av_log(void *avcl, int level, const char *fmt,...)
Definition: log.c:246
int ff_add_format(AVFilterFormats **avff, int64_t fmt)
Add fmt to the list of media formats contained in *avff.
Definition: formats.c:344
Definition: asrc_flite.c:78
static int config_props(AVFilterLink *outlink)
Definition: asrc_flite.c:224
#define AV_LOG_VERBOSE
Definition: log.h:157
struct AVRational AVRational
rational number numerator/denominator
void(* unregister_fn)(cst_voice *)
Definition: asrc_flite.c:81
audio channel layout utility functions
#define FFMIN(a, b)
Definition: common.h:58
static volatile int flite_inited
Definition: asrc_flite.c:66
char * voice_str
Definition: asrc_flite.c:37
ret
Definition: avfilter.c:821
#define av_err2str(errnum)
Convenience macro, the return value should be used only directly in function arguments but never stan...
Definition: error.h:110
cst_voice * voice
Definition: asrc_flite.c:44
void av_frame_set_sample_rate(AVFrame *frame, int val)
static int select_voice(struct voice_entry **entry_ret, const char *voice_name, void *log_ctx)
Definition: asrc_flite.c:107
#define AVERROR_EXIT
Immediate exit was requested; the called function should not be restarted.
Definition: error.h:56
const char * av_get_sample_fmt_name(enum AVSampleFormat sample_fmt)
Return the name of sample_fmt, or NULL if sample_fmt is not recognized.
Definition: samplefmt.c:47
A list of supported channel layouts.
Definition: formats.h:85
NULL
Definition: eval.c:55
int64_t pts
Definition: asrc_flite.c:46
typedef void(RENAME(mix_any_func_type))
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:148
int16_t * wave_samples
Definition: asrc_flite.c:41
void * av_malloc(size_t size)
Allocate a block of size bytes with alignment suitable for all memory accesses (including vectors if ...
Definition: mem.c:73
Describe the class of an AVClass context structure.
Definition: log.h:50
Filter definition.
Definition: avfilter.h:436
synthesis window for stochastic i
AVFilter avfilter_asrc_flite
Definition: asrc_flite.c:273
cst_voice * voice
Definition: asrc_flite.c:82
const char * name
filter name
Definition: avfilter.h:437
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFilterBuffer structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later.That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another.Buffer references ownership and permissions
uint8_t * data[AV_NUM_DATA_POINTERS]
pointer to the picture/channel planes.
Definition: frame.h:87
static av_cold int init(AVFilterContext *ctx)
Definition: asrc_flite.c:134
void ff_set_common_samplerates(AVFilterContext *ctx, AVFilterFormats *samplerates)
Definition: formats.c:533
#define AVERROR_UNKNOWN
Unknown error, typically from an external library.
Definition: error.h:71
A list of supported formats for one end of a filter link.
Definition: formats.h:64
struct voice_entry voice_entry
An instance of a filter.
Definition: avfilter.h:524
#define AV_LOG_INFO
Definition: log.h:156
#define DECLARE_REGISTER_VOICE_FN(name)
Definition: asrc_flite.c:69
char * textfile
Definition: asrc_flite.c:38
void ff_set_common_channel_layouts(AVFilterContext *ctx, AVFilterChannelLayouts *layouts)
A helper for query_formats() which sets all links to the same list of channel layouts/sample rates...
Definition: formats.c:526
static const AVFilterPad flite_outputs[]
Definition: asrc_flite.c:263
internal API functions
These buffered frames must be flushed immediately if a new input produces new output. The filter must not call request_frame to get more; it must just process the frame or queue it. The task of requesting more frames is left to the filter's request_frame method or the application. If a filter has several inputs
cst_voice *(* register_fn)(const char *)
Definition: asrc_flite.c:80
int64_t av_get_default_channel_layout(int nb_channels)
Return default channel layout for a given number of channels.