# Piper audio feature extraction: schema for low-level operation
#
# This file is formatted to 130 characters width, in order to fit the
# comments next to the schema definitions.
#
# Copyright (c) 2015-2017 Queen Mary, University of London, provided
# under a BSD-style licence. See the file COPYING for details.

@0xc4b1c6c44c999206;

using Cxx = import "/capnp/c++.capnp";
$Cxx.namespace("piper");

struct Basic {
    # Basic metadata common to many Piper structures.

    identifier         @0  :Text;                 # A computer-readable string. Must match the regex /^[a-zA-Z0-9_-]+$/.
    name               @1  :Text;                 # A short human-readable name or label. Must be present.
    description        @2  :Text;                 # An optional human-readable descriptive text that may accompany the name.
}

struct ParameterDescriptor {
    # Properties of an adjustable parameter. A parameter's value is just a single
    # float, but the descriptor explains how to interpret and present that value.
    # A Piper feature extractor has a static list of parameters. The properties of
    # a given parameter never change, in contrast to output descriptors, which
    # may have different properties depending on the configuration of the extractor.

    basic              @0  :Basic;                # Basic metadata about the parameter.
    unit               @1  :Text;                 # Human-recognisable unit of the parameter (e.g. Hz). May be left empty.
    minValue           @2  :Float32 = 0.0;        # Minimum value. Must be provided.
    maxValue           @3  :Float32 = 0.0;        # Maximum value. Must be provided.
    defaultValue       @4  :Float32 = 0.0;        # Default if the parameter is not set to anything else. Must be provided.
    isQuantized        @5  :Bool = false;         # True if parameter values are quantized to a particular resolution.
    quantizeStep       @6  :Float32 = 0.0;        # Quantization resolution, if isQuantized.
    valueNames         @7  :List(Text) = [];      # Optional human-readable labels for the values, if isQuantized.
}

enum SampleType {
    # How returned features are spaced on the input timeline.

    oneSamplePerStep   @0;                        # Each process input returns a feature aligned with that input's timestamp.
    fixedSampleRate    @1;                        # Features are equally spaced at a given sample rate.
    variableSampleRate @2;                        # Features have their own individual timestamps.
}

struct StaticOutputDescriptor {
    # Properties of an output, that is, a single stream of features
    # produced in response to process and finish requests. A feature
    # extractor may have any number of outputs, and it always
    # calculates and returns features from all of them when
    # processing; this is useful in cases where more than one feature
    # can be easily calculated using a single method.
    #
    # This structure contains the properties of an output that are
    # static, i.e. that do not depend on the parameter values provided
    # at configuration, excluding the Basic struct parameters like id
    # and description. The Basic struct properties are not included
    # for historical reasons: they were already referenced separately
    # in the OutputDescriptor and ExtractorStaticData before this
    # struct was introduced.

    typeURI            @0  :Text;                 # URI indicating the sort of feature that this output returns (see docs).
}

struct ConfiguredOutputDescriptor {
    # Properties of an output, that is, a single stream of features produced
    # in response to process and finish requests. A feature extractor may
    # have any number of outputs, and it always calculates and returns features
    # from all of them when processing; this is useful in cases where more
    # than one feature can be easily calculated using a single method.
    # This structure contains the properties of an output that are not static,
    # i.e. that may depend on the parameter values provided at configuration.

    unit               @0  :Text;                 # Human-recognisable unit of the bin values in output features. May be empty.
    hasFixedBinCount   @1  :Bool = false;         # True if this output has an equal number of values in each returned feature.
    binCount           @2  :Int32 = 0;            # Number of values per feature for this output, if hasFixedBinCount.
    binNames           @3  :List(Text) = [];      # Optional human-readable labels for the value bins, if hasFixedBinCount.
    hasKnownExtents    @4  :Bool = false;         # True if all feature values fall within the same fixed min/max range.
    minValue           @5  :Float32 = 0.0;        # Minimum value in range for any value from this output, if hasKnownExtents.
    maxValue           @6  :Float32 = 0.0;        # Maximum value in range for any value from this output, if hasKnownExtents.
    isQuantized        @7  :Bool = false;         # True if feature values are quantized to a particular resolution.
    quantizeStep       @8  :Float32 = 0.0;        # Quantization resolution, if isQuantized.
    sampleType         @9  :SampleType;           # How returned features from this output are spaced on the input timeline.
    sampleRate         @10 :Float32 = 0.0;        # Sample rate (features per second) if sampleType == fixedSampleRate.
    hasDuration        @11 :Bool = false;         # True if features returned from this output will have a duration.
}

struct OutputDescriptor {
    # All the properties of an output, both static (the basic metadata and static
    # descriptor) and potentially dependent on configuration parameters (the
    # configured descriptor).

    basic              @0  :Basic;                # Basic metadata about the output.
    configured         @1  :ConfiguredOutputDescriptor;  # Properties of the output that may depend on configuration parameters.
    static             @2  :StaticOutputDescriptor;      # Properties (other than Basic) that do not depend on parameters.
}

enum InputDomain {
    # Whether a feature extractor requires time-domain audio input (i.e.
    # "normal" or "unprocessed" audio samples) or frequency-domain input
    # (i.e. resulting from windowed, usually overlapping, short-time
    # Fourier transforms).

    timeDomain         @0;                        # The plugin requires time-domain audio samples as input.
    frequencyDomain    @1;                        # The plugin requires input to have been pre-processed using windowed STFTs.
}

struct ExtractorStaticData {
    # Static properties of a feature extractor. That is, metadata about the
    # extractor that are the same regardless of how you configure or run it.

    key                @0  :Text;                 # String that "globally" identifies the extractor, used to load it (see docs).
    basic              @1  :Basic;                # Basic metadata about the extractor.
    maker              @2  :Text;                 # Human-readable text naming the author or vendor of the extractor.
    rights             @3  :Text;                 # Human-readable summary of copyright and/or licensing terms for the extractor.
    version            @4  :Int32;                # Version number of extractor; must increase if new algorithm changes results.
    category           @5  :List(Text);           # List of general->specific category labels for this extractor (see docs).
    minChannelCount    @6  :Int32;                # Minimum number of input channels of audio this extractor can accept.
    maxChannelCount    @7  :Int32;                # Maximum number of input channels of audio this extractor can accept.
    parameters         @8  :List(ParameterDescriptor);  # List of configurable parameter properties for the feature extractor.
    programs           @9  :List(Text);           # List of predefined programs.
    inputDomain        @10 :InputDomain;          # Whether the extractor requires time-domain or frequency-domain input audio.
    basicOutputInfo    @11 :List(Basic);          # Basic metadata about all of the outputs of the extractor.

    struct SOPair {
        # A mapping between output identifier and static descriptor for
        # that output.

        output         @0  :Text;                 # Output id, matching the output's descriptor's basic identifier.
        static         @1  :StaticOutputDescriptor;  # Static descriptor for that output.
    }

    staticOutputInfo   @12 :List(SOPair);         # Static descriptors for all outputs that have any static metadata.
}

struct RealTime {
    # Time structure. When used as a timestamp, this is relative to "start
    # of audio".

    sec                @0  :Int32 = 0;            # Number of seconds.
    nsec               @1  :Int32 = 0;            # Number of nanoseconds. Must have same sign as sec unless sec == 0.
}

struct ProcessInput {
    # Audio and timing input data provided to a process request.

    inputBuffers       @0  :List(List(Float32));  # A single block of audio data (time or frequency domain) for each channel.
    timestamp          @1  :RealTime;             # Time of start of block (time-domain) or "centre" of it (frequency-domain).
}

struct Feature {
    # A single feature calculated and returned from a process or finish request.

    hasTimestamp       @0  :Bool = false;         # True if feature has a timestamp. Must be true for a variableSampleRate output.
    timestamp          @1  :RealTime;             # Timestamp of feature, if hasTimestamp.
    hasDuration        @2  :Bool = false;         # True if feature has a duration. Must be true if output's hasDuration is true.
    duration           @3  :RealTime;             # Duration of feature, if hasDuration.
    label              @4  :Text;                 # Optional human-readable text attached to feature.
    featureValues      @5  :List(Float32) = [];   # The feature values themselves (of size binCount, if output hasFixedBinCount).
}

struct FeatureSet {
    # The set of all features, across all outputs, calculated and returned from
    # a single process or finish request.

    struct FSPair {
        # A mapping between output identifier and ordered list of features for
        # that output.

        output         @0  :Text;                 # Output id, matching the output's descriptor's basic identifier.
        features       @1  :List(Feature) = [];   # Features calculated for that output during the current request, in time order.
    }

    featurePairs       @0  :List(FSPair);         # The feature lists for all outputs for which any features have been calculated.
}

struct Framing {
    # Determines how audio should be split up into individual buffers for input.
    # If the feature extractor accepts frequency-domain input, then this
    # applies prior to the STFT transform.
    #
    # These values are sometimes mandatory, but in other contexts one or both may
    # be set to zero to mean "don't care". See documentation for structures that
    # include a framing field for details.

    blockSize          @0  :Int32;                # Number of time-domain audio samples per buffer (on each channel).
    stepSize           @1  :Int32;                # Number of samples to advance between buffers: equals blockSize for no overlap.
}

struct Configuration {
    # Bundle of parameter values and other configuration data for a feature-
    # extraction procedure.

    struct PVPair {
        # A mapping between parameter identifier and value.

        parameter      @0  :Text;                 # Parameter id, matching the parameter's descriptor's basic identifier.
        value          @1  :Float32;              # Value to set parameter to (within constraints given in parameter descriptor).
    }

    parameterValues    @0  :List(PVPair);         # Values for all parameters, or at least any that are to change from defaults.
    currentProgram     @1  :Text;                 # Selection of predefined program. For backward-compatibility, not recommended.
    channelCount       @2  :Int32;                # Number of audio channels of input.
    framing            @3  :Framing;              # Step and block size for framing the input.
}

enum AdapterFlag {
    # Flags that may be used when requesting a server to load a feature
    # extractor, to ask the server to do some of the work of framing and input
    # conversion instead of leaving it to the client side. These affect the
    # apparent behaviour of the loaded extractor.

    adaptInputDomain   @0;                        # Input-domain conversion, so the extractor always expects time-domain input.
    adaptChannelCount  @1;                        # Channel mixing or duplication, so any number of input channels is acceptable.
    adaptBufferSize    @2;                        # Framing, so the extractor accepts any blockSize of non-overlapping buffers.
}

const adaptAllSafe :List(AdapterFlag) =
      [ adaptInputDomain, adaptChannelCount ];
    # The set of adapter flags that can always be applied, leaving results unchanged.

const adaptAll :List(AdapterFlag) =
      [ adaptInputDomain, adaptChannelCount, adaptBufferSize ];
    # The set of adapter flags that may cause "equivalent" results to be returned (see documentation).

struct ListRequest {
    # Request a server to provide a list of available feature extractors.

    from               @0  :List(Text);           # If non-empty, provide only extractors found in the given list of "libraries".
}

struct ListResponse {
    # Response to a successful list request.

    available          @0  :List(ExtractorStaticData);  # List of static data about available feature extractors.
}

struct LoadRequest {
    # Request a server to load a feature extractor and return a handle to it.

    key                @0  :Text;                 # Key as found in the extractor's static data structure.
    inputSampleRate    @1  :Float32;              # Sample rate for input audio. Properties of the extractor may depend on this.
    adapterFlags       @2  :List(AdapterFlag);    # Set of optional flags to make any framing and input conversion requests.
}

struct LoadResponse {
    # Response to a successful load request.

    handle             @0  :Int32;                # Handle to be used to refer to the loaded feature extractor in future requests.
    staticData         @1  :ExtractorStaticData;  # Static data about this feature extractor, identical to that in list response.
    defaultConfiguration @2 :Configuration;       # Extractor's default parameter values and preferred input framing.

    struct PPPair {
        # A mapping between program name and parameter values for that program.

        program        @0  :Text;                 # Program name, one of those listed in the static data.
        parameters     @1  :List(Configuration.PVPair);
        # Parameter values for all parameters changed from defaults by that program setting.
    }

    programParameters  @3  :List(PPPair);         # Program-name-to-parameter-values mappings (one PPPair per program).
}

struct ConfigurationRequest {
    # Request a server to configure a loaded feature extractor and prepare
    # it for use. This request must be carried out on a feature extractor
    # before any process request can be made.

    handle             @0  :Int32;                # Handle as returned in the load response from the loading of this extractor.
    configuration      @1  :Configuration;        # Bundle of parameter values to set, and client's preferred input framing.
}

struct ConfigurationResponse {
    # Response to a successful configuration request.

    handle             @0  :Int32;                # Handle of extractor, as passed in the configuration request.
    outputs            @1  :List(OutputDescriptor);  # Full set of properties of all outputs following configuration.
    framing            @2  :Framing;              # Input framing that must be used for subsequent process requests.
}

struct ProcessRequest {
    # Request a server to process a buffer of audio using a loaded and
    # configured feature extractor.

    handle             @0  :Int32;                # Handle as returned in the load response from the loading of this extractor.
    processInput       @1  :ProcessInput;         # Audio in the input domain, with framing as in the configuration response.
}

struct ProcessResponse {
    # Response to a successful process request.

    handle             @0  :Int32;                # Handle of extractor, as passed in the process request.
    features           @1  :FeatureSet;           # All features across all outputs calculated during this process request.
}

struct FinishRequest {
    # Request a server to finish processing and unload a loaded feature
    # extractor. This request may be made at any time -- the extractor does
    # not have to have been configured or used. The extractor handle cannot
    # be used again with this server afterwards.

    handle             @0  :Int32;                # Handle as returned in the load response from the loading of this extractor.
}

struct FinishResponse {
    # Response to a successful finish request.

    handle             @0  :Int32;                # Handle of extractor, as passed in the finish request. May not be used again.
    features           @1  :FeatureSet;           # Features the extractor has calculated now that it knows all input has ended.
}

struct Error {
    # Response to any request that fails.

    code               @0  :Int32;                # Error code.
    message            @1  :Text;                 # Error message.
}

struct RpcRequest {
    # Request bundle for use when using Cap'n Proto serialisation without
    # Cap'n Proto RPC layer. For Cap'n Proto RPC, see piper.rpc.capnp.

    id :union {
        # Identifier used solely to associate a response packet with its
        # originating request. Server does not examine the contents of this;
        # it just copies the request id structure into the response.

        number         @0  :Int32;                # Identifier given as an integer.
        tag            @1  :Text;                 # Identifier given as a text string.
        none           @2  :Void;                 # No identifier value carried.
    }

    request :union {
        # For more details, see the documentation for the individual
        # request structures.

        list           @3  :ListRequest;          # Provide a list of available feature extractors.
        load           @4  :LoadRequest;          # Load a feature extractor and return a handle to it.
        configure      @5  :ConfigurationRequest;  # Configure a loaded feature extractor, set parameters, and prepare it for use.
        process        @6  :ProcessRequest;       # Process a single fixed-size buffer of audio and return calculated features.
        finish         @7  :FinishRequest;        # Get any remaining features and unload the extractor.
    }
}

struct RpcResponse {
    # Response bundle for use when using Cap'n Proto serialisation without
    # Cap'n Proto RPC layer. For Cap'n Proto RPC, see piper.rpc.capnp.

    id :union {
        # Identifier used solely to associate a response packet with its
        # originating request. Server does not examine the contents of this;
        # it just copies the request id structure into the response.

        number         @0  :Int32;                # Identifier given as an integer.
        tag            @1  :Text;                 # Identifier given as a text string.
        none           @2  :Void;                 # No identifier value carried.
    }

    response :union {
        # For more details, see the documentation for the individual
        # response structures.

        error          @3  :Error;                # The request (of whatever type) failed.
        list           @4  :ListResponse;         # List succeeded: here is static data about the requested extractors.
        load           @5  :LoadResponse;         # Load succeeded: here is a handle for the loaded extractor.
        configure      @6  :ConfigurationResponse;  # Configure succeeded: ready to process, here are values such as block size.
        process        @7  :ProcessResponse;      # Process succeeded: here are all features calculated from this input block.
        finish         @8  :FinishResponse;       # Finish succeeded: extractor unloaded, here are all remaining features.
    }
}