
So I needed to speed up / slow down an audio stream I had (speech generated with Flite TTS) and naively I thought it would suffice to simply sample it at the right intervals and interpolate.
I quickly discovered that just re-sampling won’t do, because changing the frequency also changes the pitch proportionally. And then I discovered the world of Time Scaling in audio and its many algorithms and approaches for changing the tempo without changing the pitch.
To my surprise there were a number of ready-made free libraries that do it, but the first one I tried – RubberBand – did not work out: it had so many dependencies that I simply couldn’t be bothered to compile it for the Mac. SoundTouch, on the other hand, had a Homebrew formula, so it won by default.
I wrote a simple little wrapper around it that interfaces nicely with Qt.
Let’s see what’s going on there
SoundTouch is fairly simple. I based my code on their SoundStretch example: http://svn.code.sf.net/p/soundtouch/code/trunk/source/SoundStretch/main.cpp
I ended up with a singleton SoundUtils class that performs one very simple function – changing the tempo or sample rate.
That’s pretty self explanatory, nothing fancy…
So here’s a small usage example using Flite TTS (it’s not going to compile and run as-is; it’s only meant for illustration or copy/paste):
#include <QApplication>
#include "SoundUtils.h"
#include <flite/flite.h>
// Flite voice handle kept at file scope. main() assigns it from
// register_cmu_us_rms(), passes it to flite_text_to_wave(), and then
// unregisters it once the samples have been generated.
cst_voice *v;
int main(int argc, char *argv[]) {
// generate speech with Flite TTS
flite_init();
v = register_cmu_us_rms(NULL);
cst_wave* wave = flite_text_to_wave("hello sound touch", v);
unregister_cmu_us_rms(v);//done, we got the samples
// let's see what this wave looks like
qDebug() << "wave info \n\tsamples " << cst_wave_num_samples(wave) <<
"\n\tfreq " << cst_wave_sample_rate(wave) <<
"\n\tchannels " << cst_wave_num_channels(wave) <<
"\n\tsize of sample " << sizeof(typeof(*(wave->samples))) <<
"\n\ttype" << wave->type <<
"\n";
//some info about the WAV
int freq = cst_wave_sample_rate(wave);
int numchannels = cst_wave_num_channels(wave);
int samplesize_bytes = sizeof(typeof(*(wave->samples)));
int samplesize_bits = samplesize_bytes * 8;
short* buf = (short*)(cst_wave_samples(wave));
int numsamples = cst_wave_num_samples(wave);
int bufsize_bytes = numsamples * samplesize_bytes;
//output the available device
foreach (const QAudioDeviceInfo &deviceInfo, QAudioDeviceInfo::availableDevices(QAudio::AudioOutput))
qDebug() << deviceInfo.deviceName();
//pick the first device anyway... :)
QAudioDeviceInfo m_device = QAudioDeviceInfo::availableDevices(QAudio::AudioOutput).first();
QAudioFormat m_format;
m_format.setFrequency(freq);
m_format.setChannels(numchannels);
m_format.setSampleSize(samplesize_bits);
m_format.setCodec("audio/pcm");
m_format.setByteOrder(QAudioFormat::LittleEndian);
m_format.setSampleType(QAudioFormat::SignedInt);
//If the audio format of the wav doesn't play nice with our device - we may need to change the sample rate
QAudioDeviceInfo info(QAudioDeviceInfo::defaultOutputDevice());
if (!info.isFormatSupported(m_format)) {
qWarning() << "Default format not supported - trying to use nearest";
m_format = info.nearestFormat(m_format);
qDebug() << "freq " << m_format.frequency();
}
SoundUtils::Instance()->setup(freq,numchannels,10,m_format.frequency()); //10 percent speed increase
QBuffer outBuf;
outBuf.open(QIODevice::WriteOnly);
//process the sound
int new_numsamples = SoundUtils::Instance()->process(reinterpret_cast<const short*>(buf),numsamples,numchannels,samplesize_bytes, &outBuf);
QBuffer wavebuf;
//trim the start and end, which always carry some silence (using Flite TTS)
SoundUtils::Instance()->trim(&outBuf,newsamples_num,&wavebuf);
wavebuf.open(QIODevice::ReadOnly);
QApplication a(argc, argv);
QAudioOutput* m_audioOutput = new QAudioOutput(m_device, m_format, &a);
//play the sound out
m_audioOutput->start(&wavebuf);
//block until done
while(m_audioOutput->state() == QAudio::ActiveState)
a.processEvents();
return 0;