cannam@197
|
1
|
cannam@197
|
2 # Piper audio feature extraction: schema for low-level operation
|
cannam@197
|
3 #
|
cannam@197
|
4 # This file is formatted to 130 characters width, in order to fit the
|
cannam@197
|
5 # comments next to the schema definitions.
|
cannam@197
|
6 #
|
cannam@197
|
7 # Copyright (c) 2015-2017 Queen Mary, University of London, provided
|
cannam@197
|
8 # under a BSD-style licence. See the file COPYING for details.
|
c@174
|
9
|
c@174
|
10 @0xc4b1c6c44c999206;
|
c@174
|
11
|
c@174
|
12 using Cxx = import "/capnp/c++.capnp";
|
c@174
|
13 $Cxx.namespace("piper");
|
c@174
|
14
|
c@174
|
struct Basic {
    # Metadata shared by many Piper structures.

    identifier @0 :Text;
    # Computer-readable string. Must match the regex /^[a-zA-Z0-9_-]+$/.

    name @1 :Text;
    # Short human-readable name or label. Must be present.

    description @2 :Text;
    # Optional human-readable descriptive text that may accompany the name.
}
|
c@174
|
22
|
c@174
|
struct ParameterDescriptor {
    # Describes one adjustable parameter. The value of a parameter is only a
    # single float; this descriptor explains how that value is to be interpreted
    # and presented. A Piper feature extractor has a static list of parameters,
    # and the properties of a given parameter never change. Output descriptors
    # differ in this respect: their properties may depend on the configuration
    # of the extractor.

    basic @0 :Basic;
    # Basic metadata about the parameter.

    unit @1 :Text;
    # Human-recognisable unit of the parameter (e.g. Hz). May be left empty.

    minValue @2 :Float32 = 0.0;
    # Minimum value. Must be provided.

    maxValue @3 :Float32 = 0.0;
    # Maximum value. Must be provided.

    defaultValue @4 :Float32 = 0.0;
    # Value used if the parameter is not set to anything else. Must be provided.

    isQuantized @5 :Bool = false;
    # True if parameter values are quantized to a particular resolution.

    quantizeStep @6 :Float32 = 0.0;
    # Quantization resolution. Relevant only if isQuantized.

    valueNames @7 :List(Text) = [];
    # Optional human-readable labels for the values. Relevant only if isQuantized.
}
|
c@174
|
39
|
c@174
|
enum SampleType {
    # Describes the spacing of returned features on the input timeline.

    oneSamplePerStep @0;
    # Every process input returns a feature aligned with that input's timestamp.

    fixedSampleRate @1;
    # Features are equally spaced at a given sample rate.

    variableSampleRate @2;
    # Each feature carries its own individual timestamp.
}
|
c@174
|
47
|
c@174
|
struct ConfiguredOutputDescriptor {
    # The non-static properties of an output, i.e. those that may depend on the
    # parameter values provided at configuration.
    #
    # An output is a single stream of features produced in response to process
    # and finish requests. A feature extractor may have any number of outputs,
    # and when processing it always calculates and returns features from all of
    # them; this is useful in cases where more than one feature can be easily
    # calculated using a single method.

    unit @0 :Text;
    # Human-recognisable unit of the bin values in output features. May be empty.

    hasFixedBinCount @1 :Bool = false;
    # True if every feature returned from this output has an equal number of values.

    binCount @2 :Int32 = 0;
    # Number of values per feature for this output. Relevant only if hasFixedBinCount.

    binNames @3 :List(Text) = [];
    # Optional human-readable labels for the value bins. Relevant only if
    # hasFixedBinCount.

    hasKnownExtents @4 :Bool = false;
    # True if all feature values fall within the same fixed min/max range.

    minValue @5 :Float32 = 0.0;
    # Lower bound of the range for any value from this output. Relevant only if
    # hasKnownExtents.

    maxValue @6 :Float32 = 0.0;
    # Upper bound of the range for any value from this output. Relevant only if
    # hasKnownExtents.

    isQuantized @7 :Bool = false;
    # True if feature values are quantized to a particular resolution.

    quantizeStep @8 :Float32 = 0.0;
    # Quantization resolution. Relevant only if isQuantized.

    sampleType @9 :SampleType;
    # How the features returned from this output are spaced on the input timeline.

    sampleRate @10 :Float32 = 0.0;
    # Sample rate (features per second). Relevant only if
    # sampleType == fixedSampleRate.

    hasDuration @11 :Bool = false;
    # True if features returned from this output will have a duration.
}
|
c@174
|
70
|
c@174
|
struct OutputDescriptor {
    # The complete set of properties of an output: the static part (the basic
    # metadata) together with the part that potentially depends on
    # configuration parameters (the configured descriptor).

    basic @0 :Basic;
    # Basic metadata about the output.

    configured @1 :ConfiguredOutputDescriptor;
    # Properties of the output that may depend on configuration parameters.
}
|
c@174
|
78
|
c@174
|
enum InputDomain {
    # Indicates which kind of audio input a feature extractor requires:
    # time-domain input (i.e. "normal" or "unprocessed" audio samples), or
    # frequency-domain input (i.e. the result of windowed, usually overlapping,
    # short-time Fourier transforms).

    timeDomain @0;
    # The plugin requires time-domain audio samples as input.

    frequencyDomain @1;
    # The plugin requires input to have been pre-processed using windowed STFTs.
}
|
c@174
|
88
|
c@174
|
struct ExtractorStaticData {
    # The static properties of a feature extractor: metadata about the extractor
    # that stay the same regardless of how it is configured or run.

    key @0 :Text;
    # String that "globally" identifies the extractor, used to load it (see docs).

    basic @1 :Basic;
    # Basic metadata about the extractor.

    maker @2 :Text;
    # Human-readable text naming the author or vendor of the extractor.

    rights @3 :Text;
    # Human-readable summary of copyright and/or licensing terms for the extractor.

    version @4 :Int32;
    # Version number of the extractor; must increase if a new algorithm changes
    # results.

    category @5 :List(Text);
    # List of general->specific category labels for this extractor (see docs).

    minChannelCount @6 :Int32;
    # Minimum number of input channels of audio this extractor can accept.

    maxChannelCount @7 :Int32;
    # Maximum number of input channels of audio this extractor can accept.

    parameters @8 :List(ParameterDescriptor);
    # List of configurable parameter properties for the feature extractor.

    programs @9 :List(Text);
    # List of predefined programs. For backward-compatibility, not recommended.

    inputDomain @10 :InputDomain;
    # Whether the extractor requires time-domain or frequency-domain input audio.

    basicOutputInfo @11 :List(Basic);
    # Basic metadata about all of the outputs of the extractor.
}
|
c@174
|
106
|
c@174
|
struct RealTime {
    # A time value. When it serves as a timestamp, it is relative to "start of
    # audio".

    sec @0 :Int32 = 0;
    # Number of seconds.

    nsec @1 :Int32 = 0;
    # Number of nanoseconds. Must have the same sign as sec unless sec == 0.
}
|
c@174
|
114
|
c@174
|
struct ProcessInput {
    # The audio and timing input data supplied to a process request.

    inputBuffers @0 :List(List(Float32));
    # A single block of audio data (time or frequency domain) for each channel.

    timestamp @1 :RealTime;
    # Time of the start of the block (time-domain) or of its "centre"
    # (frequency-domain).
}
|
c@174
|
121
|
c@174
|
struct Feature {
    # One feature, as calculated and returned from a process or finish request.

    hasTimestamp @0 :Bool = false;
    # True if the feature has a timestamp. Must be true for a variableSampleRate
    # output.

    timestamp @1 :RealTime;
    # Timestamp of the feature. Relevant only if hasTimestamp.

    hasDuration @2 :Bool = false;
    # True if the feature has a duration. Must be true if the output's
    # hasDuration is true.

    duration @3 :RealTime;
    # Duration of the feature. Relevant only if hasDuration.

    label @4 :Text;
    # Optional human-readable text attached to the feature.

    featureValues @5 :List(Float32) = [];
    # The feature values themselves (of size binCount, if the output
    # hasFixedBinCount).
}
|
c@174
|
132
|
c@174
|
struct FeatureSet {
    # All features, across all outputs, that were calculated and returned from a
    # single process or finish request.

    struct FSPair {
        # Associates an output identifier with the ordered list of features for
        # that output.

        output @0 :Text;
        # Output id, matching the output's descriptor's basic identifier.

        features @1 :List(Feature) = [];
        # Features calculated for that output during the current request, in
        # time order.
    }

    featurePairs @0 :List(FSPair);
    # The feature lists for all outputs for which any features have been
    # calculated.
}
|
c@174
|
147
|
cannam@191
|
struct Framing {
    # Specifies how audio should be split up into individual buffers for input.
    # For a feature extractor that accepts frequency-domain input, this applies
    # prior to the STFT transform.
    #
    # In some contexts these values are mandatory; in others, one or both may be
    # set to zero to mean "don't care". See the documentation for structures that
    # include a framing field for details.

    blockSize @0 :Int32;
    # Number of time-domain audio samples per buffer (on each channel).

    stepSize @1 :Int32;
    # Number of samples to advance between buffers: equals blockSize for no
    # overlap.
}
|
cannam@191
|
160
|
c@174
|
struct Configuration {
    # A bundle of parameter values and other configuration data for a
    # feature-extraction procedure.

    struct PVPair {
        # Associates a parameter identifier with a value.

        parameter @0 :Text;
        # Parameter id, matching the parameter's descriptor's basic identifier.

        value @1 :Float32;
        # Value to set the parameter to (within the constraints given in the
        # parameter descriptor).
    }

    parameterValues @0 :List(PVPair);
    # Values for all parameters, or at least any that are to change from defaults.

    currentProgram @1 :Text;
    # Selection of a predefined program. For backward-compatibility, not
    # recommended.

    channelCount @2 :Int32;
    # Number of audio channels of input.

    framing @3 :Framing;
    # Step and block size for framing the input.
}
|
c@174
|
177
|
c@174
|
enum AdapterFlag {
    # Flags that may accompany a request for a server to load a feature
    # extractor. They ask the server to do some of the work of framing and input
    # conversion, instead of leaving it to the client side. These flags affect
    # the apparent behaviour of the loaded extractor.

    adaptInputDomain @0;
    # Input-domain conversion, so the extractor always expects time-domain input.

    adaptChannelCount @1;
    # Channel mixing or duplication, so any number of input channels is
    # acceptable.

    adaptBufferSize @2;
    # Framing, so the extractor accepts any blockSize of non-overlapping buffers.
}
|
c@174
|
188
|
c@174
|
const adaptAllSafe :List(AdapterFlag) =
    [ adaptInputDomain, adaptChannelCount ];
# Adapter flags that can always be applied, leaving results unchanged.
|
c@174
|
192
|
c@174
|
const adaptAll :List(AdapterFlag) =
    [ adaptInputDomain, adaptChannelCount, adaptBufferSize ];
# Adapter flags that may cause "equivalent" results to be returned (see
# documentation).
|
c@174
|
196
|
c@174
|
struct ListRequest {
    # Asks a server to provide a list of available feature extractors.

    from @0 :List(Text);
    # If non-empty, provide only extractors found in the given list of
    # "libraries".
}
|
c@174
|
202
|
c@174
|
struct ListResponse {
    # Returned in response to a successful list request.

    available @0 :List(ExtractorStaticData);
    # List of static data about available feature extractors.
}
|
c@174
|
208
|
c@174
|
struct LoadRequest {
    # Asks a server to load a feature extractor and return a handle to it.

    key @0 :Text;
    # Key as found in the extractor's static data structure.

    inputSampleRate @1 :Float32;
    # Sample rate for input audio. Properties of the extractor may depend on this.

    adapterFlags @2 :List(AdapterFlag);
    # Set of optional flags to make any framing and input conversion requests.
}
|
c@174
|
216
|
c@174
|
struct LoadResponse {
    # Returned in response to a successful load request.

    handle @0 :Int32;
    # Handle to be used to refer to the loaded feature extractor in future
    # requests.

    staticData @1 :ExtractorStaticData;
    # Static data about this feature extractor, identical to that in the list
    # response.

    defaultConfiguration @2 :Configuration;
    # The extractor's default parameter values and preferred input framing.
}
|
c@174
|
224
|
c@174
|
struct ConfigurationRequest {
    # Asks a server to configure a loaded feature extractor and prepare it for
    # use. This request must be carried out on a feature extractor before any
    # process request can be made.

    handle @0 :Int32;
    # Handle as returned in the load response from the loading of this extractor.

    configuration @1 :Configuration;
    # Bundle of parameter values to set, and the client's preferred input framing.
}
|
c@174
|
233
|
c@174
|
struct ConfigurationResponse {
    # Returned in response to a successful configuration request.

    handle @0 :Int32;
    # Handle of the extractor, as passed in the configuration request.

    outputs @1 :List(OutputDescriptor);
    # Full set of properties of all outputs following configuration.

    framing @2 :Framing;
    # Input framing that must be used for subsequent process requests.
}
|
c@174
|
241
|
c@174
|
struct ProcessRequest {
    # Asks a server to process a buffer of audio using a loaded and configured
    # feature extractor.

    handle @0 :Int32;
    # Handle as returned in the load response from the loading of this extractor.

    processInput @1 :ProcessInput;
    # Audio in the input domain, with framing as in the configuration response.
}
|
c@174
|
249
|
c@174
|
struct ProcessResponse {
    # Returned in response to a successful process request.

    handle @0 :Int32;
    # Handle of the extractor, as passed in the process request.

    features @1 :FeatureSet;
    # All features across all outputs calculated during this process request.
}
|
c@174
|
256
|
c@174
|
struct FinishRequest {
    # Asks a server to finish processing and unload a loaded feature extractor.
    # This request may be made at any time -- the extractor does not have to
    # have been configured or used. Afterwards, the extractor handle cannot be
    # used again with this server.

    handle @0 :Int32;
    # Handle as returned in the load response from the loading of this extractor.
}
|
c@174
|
265
|
c@174
|
struct FinishResponse {
    # Returned in response to a successful finish request.

    handle @0 :Int32;
    # Handle of the extractor, as passed in the finish request. May not be used
    # again.

    features @1 :FeatureSet;
    # Features the extractor has calculated now that it knows all input has ended.
}
|
c@174
|
272
|
c@174
|
struct Error {
    # Returned in response to any request that fails.

    code @0 :Int32;
    # Error code.

    message @1 :Text;
    # Error message.
}
|
c@174
|
279
|
c@174
|
struct RpcRequest {
    # Bundle of request data, for use when Cap'n Proto serialisation is used
    # without the Cap'n Proto RPC layer. For Cap'n Proto RPC, see piper.rpc.capnp.

    id :union {
        # Identifier whose only purpose is to associate a response packet with
        # its originating request. The server does not examine the contents of
        # this; it just copies the request id into the response.

        number @0 :Int32;
        tag @1 :Text;
        none @2 :Void;
    }

    request :union {
        # See the documentation for the individual request structures for more
        # details.

        list @3 :ListRequest;
        # Provide a list of available feature extractors.

        load @4 :LoadRequest;
        # Load a feature extractor and return a handle to it.

        configure @5 :ConfigurationRequest;
        # Configure a loaded feature extractor, set parameters, and prepare it
        # for use.

        process @6 :ProcessRequest;
        # Process a single fixed-size buffer of audio and return calculated
        # features.

        finish @7 :FinishRequest;
        # Get any remaining features and unload the extractor.
    }
}
|
c@174
|
305
|
c@174
|
struct RpcResponse {
    # Bundle of response data, for use when Cap'n Proto serialisation is used
    # without the Cap'n Proto RPC layer. For Cap'n Proto RPC, see piper.rpc.capnp.

    id :union {
        # Identifier whose only purpose is to associate a response packet with
        # its originating request. The server does not examine the contents of
        # this; it just copies the request id into the response.

        number @0 :Int32;
        tag @1 :Text;
        none @2 :Void;
    }

    response :union {
        # See the documentation for the individual response structures for more
        # details.

        error @3 :Error;
        # The request (of whatever type) failed.

        list @4 :ListResponse;
        # List succeeded, here is static data about the requested extractors.

        load @5 :LoadResponse;
        # Load succeeded, here is a handle for the loaded extractor.

        configure @6 :ConfigurationResponse;
        # Configure succeeded, ready to process, here are values such as block
        # size.

        process @7 :ProcessResponse;
        # Process succeeded, here are all features calculated from this input
        # block.

        finish @8 :FinishResponse;
        # Finish succeeded, extractor unloaded, here are all remaining features.
    }
}
|
c@174
|
332
|