FFmpeg filters 分析：af_silencedetect

一、概述

本文分析 FFmpeg af_silencedetect 的实现。

二、af_silencedetect 的作用及基本原理

af_silencedetect 的作用是检测音频中的静音段，并输出每段静音的开始时间、结束时间和持续时长。
它只支持 AV_SAMPLE_FMT_S16、AV_SAMPLE_FMT_S32、AV_SAMPLE_FMT_FLT 和 AV_SAMPLE_FMT_DBL 这四种采样格式——当然，如果输入不是这些格式，FFmpeg 能够自动转换。

多大音量认为是静音由参数 noise 确定，默认是 -60dB 或 0.001；多长的连续时长认为是静音由参数 duration 确定，默认是 2 秒。参数 mono 为非 0 表示各个声道分别检测，默认是合并在一起检测。

合并在一起检测：比如设定连续 2 秒无声（或小声）即为静音，那么即使其中一个声道达标，只要另一个声道在该时段内不达标，就不认为是静音。

三、在调用 ffmpeg 程序时使用 af_silencedetect

使用默认参数：

ffmpeg -i input.mp3 -af "silencedetect" -vn -sn -dn -f null /dev/null

在 Windows 中使用需将 /dev/null 替换为 NUL
-vn、-sn 和 -dn 告知 FFmpeg 忽略非音频流，从而在分析时避免不必要的操作、加快处理速度。

输出类似于:

[silencedetect @ 0x137f044d0] silence_start: 0
[silencedetect @ 0x137f044d0] silence_end: 4.0214 | silence_duration: 4.0214
[silencedetect @ 0x137f044d0] silence_start: 8.08879
[silencedetect @ 0x137f044d0] silence_end: 15.1732 | silence_duration: 7.08437
[silencedetect @ 0x137f044d0] silence_start: 64.6201

各个声道分别检测：

ffmpeg -i 0.mp3 -af "silencedetect=mono=1" -vn -sn -dn -f null /dev/null

输出类似于:

[silencedetect @ 0x152704190] channel: 0 | silence_start: 0
[silencedetect @ 0x152704190] channel: 1 | silence_start: 0
[silencedetect @ 0x152704190] channel: 0 | silence_end: 4.0214 | silence_duration: 4.0214
[silencedetect @ 0x152704190] channel: 1 | silence_end: 4.0214 | silence_duration: 4.0214
[silencedetect @ 0x152704190] channel: 0 | silence_start: 8.08879
[silencedetect @ 0x152704190] channel: 1 | silence_start: 8.08879
[silencedetect @ 0x152704190] channel: 0 | silence_end: 15.1732 | silence_duration: 7.08437
[silencedetect @ 0x152704190] channel: 1 | silence_end: 15.1732 | silence_duration: 7.08437
[silencedetect @ 0x152704190] channel: 0 | silence_start: 64.6201
[silencedetect @ 0x152704190] channel: 1 | silence_start: 64.6201
[silencedetect @ 0x152704190] channel: 0 | silence_end: 68.664 | silence_duration: 4.04385
[silencedetect @ 0x152704190] channel: 1 | silence_end: 68.664 | silence_duration: 4.04385

四、源码分析

af_silencedetect 源码位于 ffmpeg/libavfilter/af_silencedetect.c 中。

分析 filter 一般从 static int filter_frame(AVFilterLink *inlink, AVFrame *in) 函数入手。不过由于要支持多种采样格式，需要在 static int config_input(AVFilterLink *inlink) 根据采样格式设置检测函数。

/*
 * Input-link configuration: records the channel count, converts the
 * configured minimum silence duration into a sample count, allocates the
 * per-channel detection state and selects the detection routine that matches
 * the input sample format.
 *
 * Returns 0 on success or AVERROR(ENOMEM) if an allocation fails.
 */
static int config_input(AVFilterLink *inlink)
{
    AVFilterContext *ctx = inlink->dst;
    SilenceDetectContext *s = ctx->priv;
    int c;

    s->channels = inlink->channels;
    // The `duration` option is given in seconds and s->duration holds it in
    // microseconds (AV_TIME_BASE units). Convert it in place to a sample count.
    // For example, 2 seconds of 44100 Hz audio is 44100 * 2 = 88200 samples.
    s->duration = av_rescale(s->duration, inlink->sample_rate, AV_TIME_BASE);
    // Number of independently tracked channels: the stream's channel count
    // when the `mono` option is non-zero, otherwise always 1.
    // In practice, because the samples are interleaved, with mono == 0 the
    // stream is treated as a single channel no matter how many channels it has.
    s->independent_channels = s->mono ? s->channels : 1;
    // nb_null_samples counts the silent samples seen so far during detection.
    // It is an array so that each channel can be tracked separately; an entry
    // is reset to 0 before the next detection starts. Allocated zeroed.
    s->nb_null_samples = av_mallocz_array(sizeof(*s->nb_null_samples), s->independent_channels);
    if (!s->nb_null_samples)
        return AVERROR(ENOMEM);
    // start records where the current silent run began. It is an array for the
    // same per-channel reason; an entry is reset to INT64_MIN before the next
    // detection starts.
    s->start = av_malloc_array(sizeof(*s->start), s->independent_channels);
    if (!s->start)
        return AVERROR(ENOMEM);
    for (c = 0; c < s->independent_channels; c++)
        s->start[c] = INT64_MIN; // Magic value INT64_MIN means "no silent run has been detected yet".

    // Pick the detection routine that matches the input sample format.
    // For the integer formats the noise threshold is scaled in place to the
    // integer sample range.
    switch (inlink->format) {
    case AV_SAMPLE_FMT_DBL: s->silencedetect = silencedetect_dbl; break;
    case AV_SAMPLE_FMT_FLT: s->silencedetect = silencedetect_flt; break;
    case AV_SAMPLE_FMT_S32:
        s->noise *= INT32_MAX;
        s->silencedetect = silencedetect_s32;
        break;
    case AV_SAMPLE_FMT_S16:
        s->noise *= INT16_MAX;
        s->silencedetect = silencedetect_s16;
        break;
    }

    return 0;
}

nb_null_samples 用于累加连续达标的采样数，通过将其与 nb_samples_notify 比较来判断静音是否已经持续了足够长的时间。

silencedetect_dbl、silencedetect_flt、silencedetect_s32 和 silencedetect_s16 由宏定义：

/*
 * Generates one silence-detection routine per sample type.
 *
 * silencedetect_<name>() walks the nb_samples samples stored in
 * insamples->data[0] and calls update() for each one, passing whether the
 * sample lies strictly inside the open interval (-noise, noise).
 *
 * Every line of the macro body, including blank and comment lines, must end
 * with a backslash, and comments inside the body must use the block form.
 * A `//` comment that ends in a backslash would splice the following lines
 * into the comment and silently drop the loop.
 */
#define SILENCE_DETECT(name, type)                                               \
static void silencedetect_##name(SilenceDetectContext *s, AVFrame *insamples,    \
                                 int nb_samples, int64_t nb_samples_notify,      \
                                 AVRational time_base)                           \
{                                                                                \
    const type *p = (const type *)insamples->data[0];                            \
    const type noise = s->noise;                                                 \
    int i;                                                                       \
                                                                                 \
    /* Test every sample against the noise threshold. */                         \
    for (i = 0; i < nb_samples; i++, p++)                                        \
        update(s, insamples, *p < noise && *p > -noise, i,                       \
               nb_samples_notify, time_base);                                    \
}

SILENCE_DETECT(dbl, double)
SILENCE_DETECT(flt, float)
SILENCE_DETECT(s32, int32_t)
SILENCE_DETECT(s16, int16_t)

update 用于检测每一个采样：

/*
 * Feeds one sample into the silence state machine.
 *
 * s                  filter state (per-channel counters and start timestamps)
 * insamples          frame the sample belongs to; the non-silent branch also
 *                    accepts NULL, in which case s->frame_end is used as the
 *                    end timestamp and no frame metadata is written
 * is_silence         non-zero if the sample is below the noise threshold
 * current_sample     index of the sample within the frame's sample array
 * nb_samples_notify  number of silent samples required before a silence
 *                    start is reported
 * time_base          time base used for the reported timestamps
 */
static av_always_inline void update(SilenceDetectContext *s, AVFrame *insamples,
                                    int is_silence, int current_sample, int64_t nb_samples_notify,
                                    AVRational time_base)
{
    // Samples are interleaved, so with per-channel detection the sample index
    // modulo the number of independent channels tells which channel the sample
    // belongs to (always 0 when there is a single independent channel).
    int channel = current_sample % s->independent_channels;
    // The current sample satisfies the silence condition.
    if (is_silence) {
        if (s->start[channel] == INT64_MIN) { // No silence start has been reported yet.
            s->nb_null_samples[channel]++;
            // Once enough silent samples have accumulated, compute
            // `s->start[channel]` and report `silence_start`.
            if (s->nb_null_samples[channel] >= nb_samples_notify) {
                // The start is the frame pts plus an offset counted in
                // 1/last_sample_rate units and rescaled to time_base; the
                // offset subtracts the notify threshold from the current
                // position so the timestamp points back to where the run began.
                s->start[channel] = insamples->pts + av_rescale_q(current_sample / s->channels + 1 - nb_samples_notify * s->independent_channels / s->channels,
                        (AVRational){ 1, s->last_sample_rate }, time_base);
                // In mono mode set_meta receives a 1-based channel number, otherwise 0.
                set_meta(insamples, s->mono ? channel + 1 : 0, "silence_start",
                        av_ts2timestr(s->start[channel], &time_base));
                if (s->mono)
                    av_log(s, AV_LOG_INFO, "channel: %d | ", channel);
                av_log(s, AV_LOG_INFO, "silence_start: %s\n",
                        av_ts2timestr(s->start[channel], &time_base));
            }
        }
    } else {
        // The sample is not silent. If a silence start was reported earlier,
        // that silent period ends here: report `silence_end` and
        // `silence_duration`.
        if (s->start[channel] > INT64_MIN) {
            // Without a frame, fall back to the stored s->frame_end.
            int64_t end_pts = insamples ? insamples->pts + av_rescale_q(current_sample / s->channels,
                    (AVRational){ 1, s->last_sample_rate }, time_base)
                    : s->frame_end;
            int64_t duration_ts = end_pts - s->start[channel];
            if (insamples) {
                set_meta(insamples, s->mono ? channel + 1 : 0, "silence_end",
                        av_ts2timestr(end_pts, &time_base));
                set_meta(insamples, s->mono ? channel + 1 : 0, "silence_duration",
                        av_ts2timestr(duration_ts, &time_base));
            }
            if (s->mono)
                av_log(s, AV_LOG_INFO, "channel: %d | ", channel);
            av_log(s, AV_LOG_INFO, "silence_end: %s | silence_duration: %s\n",
                    av_ts2timestr(end_pts, &time_base),
                    av_ts2timestr(duration_ts, &time_base));
        }

        // Reset the per-channel helper state for the next run.
        s->nb_null_samples[channel] = 0;
        s->start[channel] = INT64_MIN;
    }
}

五、C# 简单实现

public class VolumeUtils
{
    /// <summary>
    /// Detects silent periods in raw 16-bit PCM data. Only the first channel
    /// of every block is examined; a silent run that lasts until the end of
    /// the data is reported as well.
    /// </summary>
    /// <param name="raw">PCM data in S16LE format, mono or stereo. Samples are read with <see cref="BitConverter.ToInt16(byte[], int)"/>.</param>
    /// <param name="offset">Byte index in <paramref name="raw"/> of the first sample to examine.</param>
    /// <param name="length">Number of bytes to examine, starting at <paramref name="offset"/>. The range is clamped to the end of <paramref name="raw"/>.</param>
    /// <param name="blockAlign">Block alignment in bytes. Because only the first channel is examined, this is the step between examined samples.</param>
    /// <param name="sampleRate">Sample rate. Used together with <paramref name="minDuration"/>.</param>
    /// <param name="noise">Noise threshold as a fraction of full scale, 0 ~ 1.</param>
    /// <param name="minDuration">Minimum duration of a silent period, in seconds. Used together with <paramref name="sampleRate"/>.</param>
    /// <param name="detectMax">Stop after this many periods have been found. 0 means detect all periods.</param>
    /// <returns>The detected silent periods, in order of appearance.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="raw"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="offset"/> is negative or <paramref name="blockAlign"/> is not positive.</exception>
    public static List<SilencePeriod> SilenceDetect(byte[] raw,
    int offset,
    int length,
    int blockAlign,
    double sampleRate,
    double noise,
    double minDuration,
    int detectMax = 0)
    {
        if (raw == null)
        {
            throw new ArgumentNullException(nameof(raw));
        }
        if (offset < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(offset), "offset must not be negative.");
        }
        if (blockAlign <= 0)
        {
            // A non-positive step would never advance through the data.
            throw new ArgumentOutOfRangeException(nameof(blockAlign), "blockAlign must be positive.");
        }

        var result = new List<SilencePeriod>();

        var threshold = noise * Int16.MaxValue;
        var numberOfSamplesNotify = (int)(minDuration * sampleRate);

        // Exclusive end of the examined range. `length` is a byte count, so the
        // range ends at offset + length; clamp it to the buffer and compute in
        // long so that the sum cannot overflow.
        var end = Math.Min((long)raw.Length, (long)offset + length);

        var numberOfSilenceSamples = 0;
        var startByte = -1; // -1 means "not inside a silent run".

        // `i + 1 < end` guarantees that both bytes of the sample are available.
        for (long i = offset; i + 1 < end; i += blockAlign)
        {
            var sample = BitConverter.ToInt16(raw, (int)i);
            var isSilence = sample < threshold && sample > -threshold;
            if (isSilence)
            {
                if (startByte < 0)
                {
                    // A silent run starts here.
                    startByte = (int)i;
                }
                numberOfSilenceSamples++;
                continue;
            }

            if (startByte >= 0 && numberOfSilenceSamples >= numberOfSamplesNotify)
            {
                // The silent run ended and was long enough to report.
                result.Add(CreatePeriod(startByte, numberOfSilenceSamples, blockAlign, sampleRate));

                if (detectMax > 0 && result.Count == detectMax)
                {
                    return result;
                }
            }
            numberOfSilenceSamples = 0;
            startByte = -1;
        }

        // Report a silent run that is still open when the data ends.
        if (startByte >= 0 && numberOfSilenceSamples >= numberOfSamplesNotify)
        {
            result.Add(CreatePeriod(startByte, numberOfSilenceSamples, blockAlign, sampleRate));
        }

        return result;
    }

    /// <summary>
    /// Builds the result entry for a silent run that starts at byte index
    /// <paramref name="startByte"/> and spans <paramref name="silentSamples"/> blocks.
    /// </summary>
    private static SilencePeriod CreatePeriod(int startByte, int silentSamples, int blockAlign, double sampleRate)
    {
        return new SilencePeriod
        {
            Start = startByte,
            Length = silentSamples * blockAlign,
            StartTS = (double)startByte / blockAlign / sampleRate,
            Duration = silentSamples / sampleRate
        };
    }

    /// <summary>
    /// One detected silent period.
    /// </summary>
    public class SilencePeriod
    {
        /// <summary>Byte index in the raw buffer where the period starts.</summary>
        public int Start { get; set; }

        /// <summary>Length of the period in bytes.</summary>
        public int Length { get; set; }

        /// <summary>Start time in seconds, derived from <see cref="Start"/>, the block alignment and the sample rate.</summary>
        public double StartTS { get; set; }

        /// <summary>Duration of the period in seconds.</summary>
        public double Duration { get; set; }

        public override string ToString()
        {
            return $"{{Start={Start},Length={Length},StartTS={StartTS:0.000},Duration={Duration:0.000}}}";
        }
    }
}

由于本人的需要，对于多声道音频，本方法也只检测第一个声道。在各声道音量变化基本同步（不是此起彼伏）的情况下，这样做有助于提升效率。

参考资料

FFmpeg filters 官网文档: silencedetect