Learn about text to speech
Sunday, 22 April 2012
/// Classes for easy reading & writing of WAV sound files.
/// For big-endian CPUs, define _BIG_ENDIAN_ at compile time so that the WAV
/// files are parsed correctly on such processors.
/// Admittedly, more complete WAV reader routines may exist in public domain, but
/// the reason for 'yet another' one is that those generic WAV reader libraries are
/// exhaustingly large and cumbersome! Wanted to have something simpler here, i.e.
/// something that's not already larger than rest of the SoundTouch/SoundStretch program...
/// Author : Copyright (c) Olli Parviainen
/// Author e-mail : oparviai 'at' iki.fi
/// SoundTouch WWW: http://www.surina.net/soundtouch
// Last changed : $Date: 2006/02/05 16:44:06 $
// File revision : $Revision: 1.7 $
// $Id: WavFile.h,v 1.7 2006/02/05 16:44:06 Olli Exp $
// License :
// SoundTouch audio processing library
// Copyright (c) Olli Parviainen
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#ifndef WAVFILE_H
#define WAVFILE_H
#include <stdio.h>
// Convenience alias for 'unsigned int'.
// NOTE(review): '#ifndef' only tests for a preprocessor macro named 'uint', so it
// cannot detect an existing typedef; no matching '#endif' is visible in this listing.
#ifndef uint
typedef unsigned int uint;
/// WAV audio file 'riff' section header. WavInFile::readRIFFBlock() fills the
/// whole struct with a single fread().
typedef struct
/// Section tag; readRIFFBlock() requires it to equal "RIFF".
char riff_char[4];
/// Package length; WavOutFile::finishHeader() stores bytesWritten + 36 here.
int package_len;
/// Format tag; readRIFFBlock() requires it to equal "WAVE".
char wave[4];
} WavRiff;
/// WAV audio file 'format' section header
typedef struct
/// Section tag, "fmt ".
char fmt[4];
/// Length of the format data that follows this field; written as 0x10.
int format_len;
/// Encoding tag; the writer stores 1 and WavInFile rejects any other value
/// as an unsupported encoding.
short fixed;
/// Number of audio channels.
short channel_number;
/// Sample rate.
int sample_rate;
/// Written as byte_per_sample * sample rate.
int byte_rate;
/// Written as bits * channels / 8, i.e. bytes per sample over all channels.
short byte_per_sample;
/// Bits per single-channel sample; the read/write routines handle 8 and 16.
short bits_per_sample;
} WavFormat;
/// WAV audio file 'data' section header
typedef struct
/// Section tag, "data".
char data_field[4];
/// Size of the sample data in bytes; the reader never reads past this amount.
uint data_len;
} WavData;
/// WAV audio file header: the 'riff', 'format' and 'data' section headers in the
/// order WavOutFile::writeHeader() writes them, as one block, to the start of the file.
typedef struct
WavRiff riff;
WavFormat format;
WavData data;
} WavHeader;
/// Class for reading WAV audio files. The constructor opens the file and parses
/// its headers; the read() overloads then return the sample data as 8-bit,
/// 16-bit or floating point values.
/// NOTE(review): access specifiers, the destructor declaration and the closing of
/// several parameter lists are not visible in this listing.
class WavInFile
/// File pointer.
FILE *fptr;
/// Counter of how many bytes of sample data have been read from the file.
uint dataRead;
/// WAV header information
WavHeader header;
/// Read WAV file headers.
/// \return zero if all ok, nonzero if file format is invalid.
int readWavHeaders();
/// Checks WAV file header tags.
/// \return zero if all ok, nonzero if file format is invalid.
int checkCharTags();
/// Reads a single WAV file header block.
/// \return zero if a block other than 'data' was processed, 1 once the 'data'
/// block header has been read, negative if the file structure is invalid.
int readHeaderBlock();
/// Reads WAV file 'riff' block
/// \return zero if the 'RIFF' and 'WAVE' tags match, nonzero otherwise.
int readRIFFBlock();
/// Constructor: Opens the given WAV file. If the file can't be opened,
/// throws 'runtime_error' exception. The same exception type is thrown when the
/// headers are invalid or the encoding is not supported.
WavInFile(const char *filename);
/// Destructor: Closes the file.
/// NOTE(review): the destructor declaration itself is not visible in this listing.
/// Close the file. Notice that file is automatically closed also when the
/// class instance is deleted.
void close();
/// Rewind to beginning of the file
void rewind();
/// Get sample rate.
uint getSampleRate() const;
/// Get number of bits per sample, i.e. 8 or 16.
uint getNumBits() const;
/// Get sample data size in bytes, i.e. the length of the 'data' section as
/// stated in the file header.
uint getDataSizeInBytes() const;
/// Get total number of samples in file (data size divided by bytes per sample).
uint getNumSamples() const;
/// Get number of bytes per audio sample (e.g. 16bit stereo = 4 bytes/sample)
uint getBytesPerSample() const;
/// Get number of audio channels in the file (1=mono, 2=stereo)
uint getNumChannels() const;
/// Get the audio file length in milliseconds
uint getLengthMS() const;
/// Reads audio samples from the WAV file. This routine works only for 8 bit samples
/// and throws 'runtime_error' for any other sample size.
/// Reads given number of elements from the file or if end-of-file reached, as many
/// elements as are left in the file.
/// \return Number of 8-bit integers read from the file.
int read(char *buffer, int maxElems);
/// Reads audio samples from the WAV file to 16 bit integer format. Reads given number
/// of elements from the file or if end-of-file reached, as many elements as are
/// left in the file.
/// \return Number of 16-bit integers read from the file.
int read(short *buffer, ///< Pointer to buffer where to read data.
int maxElems ///< Size of 'buffer' array (number of array elements).
/// Reads audio samples from the WAV file to floating point format, converting
/// sample values to range [-1,1[. Reads given number of elements from the file
/// or if end-of-file reached, as many elements as are left in the file.
/// \return Number of elements read from the file.
int read(float *buffer, ///< Pointer to buffer where to read data.
int maxElems ///< Size of 'buffer' array (number of array elements).
/// Check end-of-file.
/// \return Nonzero if all sample data has been read or end-of-file reached.
int eof() const;
/// Class for writing WAV audio files. Samples are appended with the write()
/// overloads; the header lengths are derived from the number of bytes written.
/// NOTE(review): access specifiers, the destructor declaration and the closing of
/// several parameter lists are not visible in this listing.
class WavOutFile
/// Pointer to the WAV file
FILE *fptr;
/// WAV file header data.
WavHeader header;
/// Counter of how many bytes have been written to the file so far.
int bytesWritten;
/// Fills in WAV file header information. The length fields are left at zero
/// until finishHeader() is called.
void fillInHeader(const uint sampleRate, const uint bits, const uint channels);
/// Finishes the WAV file header by supplementing information of amount of
/// data written to file etc
void finishHeader();
/// Writes the WAV file header to the beginning of the file and moves the file
/// position back to the end of the file.
void writeHeader();
/// Constructor: Creates a new WAV file. Throws a 'runtime_error' exception
/// if file creation fails.
WavOutFile(const char *fileName, ///< Filename
int sampleRate, ///< Sample rate (e.g. 44100 etc)
int bits, ///< Bits per sample (8 or 16 bits)
int channels ///< Number of channels (1=mono, 2=stereo)
/// Destructor: Finalizes & closes the WAV file.
/// NOTE(review): the destructor declaration itself is not visible in this listing.
/// Write data to WAV file. This function works only with 8bit samples.
/// Throws a 'runtime_error' exception if writing to file fails or if the file
/// was not created with 8 bits per sample.
void write(const char *buffer, ///< Pointer to sample data buffer.
int numElems ///< How many array items are to be written to file.
/// Write data to WAV file. Throws a 'runtime_error' exception if writing to
/// file fails. For 8-bit files each sample is reduced to its upper byte.
void write(const short *buffer, ///< Pointer to sample data buffer.
int numElems ///< How many array items are to be written to file.
/// Write data to WAV file in floating point format, saturating sample values to range
/// [-1..+1[. Throws a 'runtime_error' exception if writing to file fails.
void write(const float *buffer, ///< Pointer to sample data buffer.
int numElems ///< How many array items are to be written to file.
/// Finalize & close the WAV file. Automatically supplements the WAV file header
/// information according to written data etc.
/// Notice that file is automatically closed also when the class instance is deleted.
void close();
Easy reading & writing of WAV sound files.
/// Classes for easy reading & writing of WAV sound files.
/// For big-endian CPU, define _BIG_ENDIAN_ during compile-time to correctly
/// parse the WAV files with such processors.
/// Admittedly, more complete WAV reader routines may exist in public domain,
/// but the reason for 'yet another' one is that those generic WAV reader
/// libraries are exhaustingly large and cumbersome! Wanted to have something
/// simpler here, i.e. something that's not already larger than rest of the
/// SoundTouch/SoundStretch program...
/// Author : Copyright (c) Olli Parviainen
/// Author e-mail : oparviai 'at' iki.fi
/// SoundTouch WWW: http://www.surina.net/soundtouch
// Last changed : $Date: 2006/02/05 16:44:06 $
// File revision : $Revision: 1.15 $
// $Id: WavFile.cpp,v 1.15 2006/02/05 16:44:06 Olli Exp $
// License :
// SoundTouch audio processing library
// Copyright (c) Olli Parviainen
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#include <stdio.h>
#include <stdexcept>
#include <string>
#include <assert.h>
#include <limits.h>
#include "WavFile.h"
using namespace std;
// Four-character section tags of a WAV file. They are compared against, and
// copied into, the header structs using only their first four characters.
const static char riffStr[] = "RIFF";
const static char waveStr[] = "WAVE";
const static char fmtStr[] = "fmt ";
const static char dataStr[] = "data";
// Helper functions for swapping byte order to correctly read/write WAV files
// on big-endian CPUs. The swaps must be active on big-endian hosts only:
// Intel x86 is little-endian and needs no conversion, while PowerPC Macs and
// many other RISC CPUs are big-endian.
//
// gcc-compatible compilers publish the target byte order through the
// predefined __BYTE_ORDER__ macro, so it is detected automatically there.
// With other compilers, define _BIG_ENDIAN_ at compile time when building for
// a big-endian platform.
#if !defined(_BIG_ENDIAN_) && defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)
    #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        // big-endian platform.
        #define _BIG_ENDIAN_
    #endif
#endif

#ifdef _BIG_ENDIAN_
    // big-endian CPU, swap bytes in 16 & 32 bit words

    // helper-function to swap byte-order of 32bit integer
    static inline void _swap32(unsigned int &dwData)
    {
        dwData = ((dwData >> 24) & 0x000000FF) |
                 ((dwData >> 8)  & 0x0000FF00) |
                 ((dwData << 8)  & 0x00FF0000) |
                 ((dwData << 24) & 0xFF000000);
    }

    // helper-function to swap byte-order of 16bit integer
    static inline void _swap16(unsigned short &wData)
    {
        wData = static_cast<unsigned short>(((wData >> 8) & 0x00FF) |
                                            ((wData << 8) & 0xFF00));
    }

    // helper-function to swap byte-order of buffer of 16bit integers
    static inline void _swap16Buffer(unsigned short *pData, unsigned int dwNumWords)
    {
        for (unsigned int i = 0; i < dwNumWords; i ++)
        {
            _swap16(pData[i]);
        }
    }

#else   // _BIG_ENDIAN_
    // little-endian CPU, WAV file is ok as such

    // dummy helper-function
    static inline void _swap32(unsigned int &)
    {
        // do nothing
    }

    // dummy helper-function
    static inline void _swap16(unsigned short &)
    {
        // do nothing
    }

    // dummy helper-function
    static inline void _swap16Buffer(unsigned short *, unsigned int)
    {
        // do nothing
    }

#endif  // _BIG_ENDIAN_
// Class WavInFile
/// Opens 'fileName' for reading and parses its WAV headers.
/// Throws 'runtime_error' if the file cannot be opened, if the headers are not
/// those of a WAV file, or if the encoding tag differs from 1.
/// The file is closed before throwing: the destructor of an object whose
/// constructor throws never runs, so the handle would otherwise leak.
WavInFile::WavInFile(const char *fileName)
{
    dataRead = 0;

    // Try to open the file for reading
    fptr = fopen(fileName, "rb");
    if (fptr == NULL)
    {
        // didn't succeed
        string msg = "Error : Unable to open file \"";
        msg += fileName;
        msg += "\" for reading.";
        throw runtime_error(msg);
    }

    // Read the file headers
    const int hdrsOk = readWavHeaders();
    if (hdrsOk != 0)
    {
        // Something didn't match in the wav file headers
        fclose(fptr);
        fptr = NULL;
        string msg = "File \"";
        msg += fileName;
        msg += "\" is corrupt or not a WAV file";
        throw runtime_error(msg);
    }

    if (header.format.fixed != 1)
    {
        fclose(fptr);
        fptr = NULL;
        string msg = "File \"";
        msg += fileName;
        msg += "\" uses unsupported encoding.";
        throw runtime_error(msg);
    }
}
void WavInFile::rewind()
int hdrsOk;
fseek(fptr, 0, SEEK_SET);
hdrsOk = readWavHeaders();
assert(hdrsOk == 0);
dataRead = 0;
int WavInFile::checkCharTags()
// header.format.fmt should equal to 'fmt '
if (memcmp(fmtStr, header.format.fmt, 4) != 0) return -1;
// header.data.data_field should equal to 'data'
if (memcmp(dataStr, header.data.data_field, 4) != 0) return -1;
return 0;
int WavInFile::read(char *buffer, int maxElems)
int numBytes;
uint afterDataRead;
// ensure it's 8 bit format
if (header.format.bits_per_sample != 8)
throw runtime_error("Error: WavInFile::read(char*, int) works only with 8bit samples.");
assert(sizeof(char) == 1);
numBytes = maxElems;
afterDataRead = dataRead + numBytes;
if (afterDataRead > header.data.data_len)
// Don't read more samples than are marked available in header
numBytes = header.data.data_len - dataRead;
assert(numBytes >= 0);
numBytes = fread(buffer, 1, numBytes, fptr);
dataRead += numBytes;
return numBytes;
int WavInFile::read(short *buffer, int maxElems)
unsigned int afterDataRead;
int numBytes;
int numElems;
if (header.format.bits_per_sample == 8)
// 8 bit format
char *temp = new char[maxElems];
int i;
numElems = read(temp, maxElems);
// convert from 8 to 16 bit
for (i = 0; i < numElems; i ++)
buffer[i] = temp[i] << 8;
delete[] temp;
// 16 bit format
assert(header.format.bits_per_sample == 16);
assert(sizeof(short) == 2);
numBytes = maxElems * 2;
afterDataRead = dataRead + numBytes;
if (afterDataRead > header.data.data_len)
// Don't read more samples than are marked available in header
numBytes = header.data.data_len - dataRead;
assert(numBytes >= 0);
numBytes = fread(buffer, 1, numBytes, fptr);
dataRead += numBytes;
numElems = numBytes / 2;
// 16bit samples, swap byte order if necessary
_swap16Buffer((unsigned short *)buffer, numElems);
return numElems;
int WavInFile::read(float *buffer, int maxElems)
short *temp = new short[maxElems];
int num;
int i;
double fscale;
num = read(temp, maxElems);
fscale = 1.0 / 32768.0;
// convert to floats, scale to range [-1..+1[
for (i = 0; i < num; i ++)
buffer[i] = (float)(fscale * (double)temp[i]);
delete[] temp;
return num;
int WavInFile::eof() const
// return true if all data has been read or file eof has reached
return (dataRead == header.data.data_len || feof(fptr));
void WavInFile::close()
fptr = NULL;
// test if character code is between a white space ' ' and little 'z'
static int isAlpha(char c)
{
    return (c >= ' ' && c <= 'z') ? 1 : 0;
}

// test if all characters are between a white space ' ' and little 'z'
// 'str' must be zero-terminated; it is only read, hence the const pointer.
// returns 1 if every character passes (also for an empty string), else 0
static int isAlphaStr(const char *str)
{
    for (; *str != 0; ++str)
    {
        if (isAlpha(*str) == 0) return 0;
    }
    return 1;
}
int WavInFile::readRIFFBlock()
fread(&(header.riff), sizeof(WavRiff), 1, fptr);
// swap 32bit data byte order if necessary
_swap32((unsigned int &)header.riff.package_len);
// header.riff.riff_char should equal to 'RIFF');
if (memcmp(riffStr, header.riff.riff_char, 4) != 0) return -1;
// header.riff.wave should equal to 'WAVE'
if (memcmp(waveStr, header.riff.wave, 4) != 0) return -1;
return 0;
int WavInFile::readHeaderBlock()
char label[5];
string sLabel;
// lead label string
fread(label, 1, 4, fptr);
label[4] = 0;
if (isAlphaStr(label) == 0) return -1; // not a valid label
// Decode blocks according to their label
if (strcmp(label, fmtStr) == 0)
int nLen, nDump;
// 'fmt ' block
memcpy(header.format.fmt, fmtStr, 4);
// read length of the format field
fread(&nLen, sizeof(int), 1, fptr);
// swap byte order if necessary
_swap32((unsigned int &)nLen); // int format_len;
header.format.format_len = nLen;
// calculate how much length differs from expected
nDump = nLen - (sizeof(header.format) - 8);
// if format_len is larger than expected, read only as much data as we've space for
if (nDump > 0)
nLen = sizeof(header.format) - 8;
// read data
fread(&(header.format.fixed), nLen, 1, fptr);
// swap byte order if necessary
_swap16((unsigned short &)header.format.fixed); // short int fixed;
_swap16((unsigned short &)header.format.channel_number); // short int channel_number;
_swap32((unsigned int &)header.format.sample_rate); // int sample_rate;
_swap32((unsigned int &)header.format.byte_rate); // int byte_rate;
_swap16((unsigned short &)header.format.byte_per_sample); // short int byte_per_sample;
_swap16((unsigned short &)header.format.bits_per_sample); // short int bits_per_sample;
// if format_len is larger than expected, skip the extra data
if (nDump > 0)
fseek(fptr, nDump, SEEK_CUR);
return 0;
else if (strcmp(label, dataStr) == 0)
// 'data' block
memcpy(header.data.data_field, dataStr, 4);
fread(&(header.data.data_len), sizeof(uint), 1, fptr);
// swap byte order if necessary
_swap32((unsigned int &)header.data.data_len);
return 1;
uint len, i;
uint temp;
// unknown block
// read length
fread(&len, sizeof(len), 1, fptr);
// scan through the block
for (i = 0; i < len; i ++)
fread(&temp, 1, 1, fptr);
if (feof(fptr)) return -1; // unexpected eof
return 0;
int WavInFile::readWavHeaders()
int res;
memset(&header, 0, sizeof(header));
res = readRIFFBlock();
if (res) return 1;
// read header blocks until data block is found
// read header blocks
res = readHeaderBlock();
if (res < 0) return 1; // error in file structure
} while (res == 0);
// check that all required tags are legal
return checkCharTags();
/// Returns the channel count stored in the 'format' header (channel_number).
uint WavInFile::getNumChannels() const
return header.format.channel_number;
/// Returns the bits-per-sample value stored in the 'format' header.
uint WavInFile::getNumBits() const
return header.format.bits_per_sample;
/// Returns the size of one sample over all channels in bytes, computed as
/// channels * bits / 8 with integer division.
uint WavInFile::getBytesPerSample() const
return getNumChannels() * getNumBits() / 8;
/// Returns the sample rate stored in the 'format' header.
uint WavInFile::getSampleRate() const
return header.format.sample_rate;
/// Returns the length of the sample data in bytes as stated in the 'data' header.
uint WavInFile::getDataSizeInBytes() const
return header.data.data_len;
/// Returns the number of samples in the file: the data length divided by the
/// header's bytes-per-sample value.
/// \return 0 when the header states a non-positive bytes-per-sample value, which
/// would otherwise cause a division by zero on a malformed file.
uint WavInFile::getNumSamples() const
{
    if (header.format.byte_per_sample <= 0) return 0;
    return header.data.data_len / header.format.byte_per_sample;
}
/// Returns the duration of the audio in milliseconds (1000 * samples / rate).
/// \return 0 when the header states a sample rate of zero.
uint WavInFile::getLengthMS() const
{
    const uint numSamples = getNumSamples();
    const uint sampleRate = getSampleRate();

    if (sampleRate == 0) return 0;

    // 64-bit intermediate: '1000 * numSamples' exceeds 32 bits as soon as the
    // file holds more than UINT_MAX / 1000 (about 4.29 million) samples.
    return (uint)((unsigned long long)numSamples * 1000u / sampleRate);
}
// Class WavOutFile
/// Creates (or truncates) 'fileName' for writing and writes a preliminary WAV
/// header for the given sample rate, bits per sample and channel count.
/// Throws 'runtime_error' if the file cannot be created.
WavOutFile::WavOutFile(const char *fileName, int sampleRate, int bits, int channels)
{
    bytesWritten = 0;
    fptr = fopen(fileName, "wb");
    if (fptr == NULL)
    {
        string msg = "Error : Unable to open file \"";
        msg += fileName;
        msg += "\" for writing.";
        throw runtime_error(msg);
    }

    fillInHeader(sampleRate, bits, channels);

    // Write the header (length fields still zero) right away so that the sample
    // data written later lands behind it. writeHeader() always writes at
    // offset 0 and then returns to the end of the file.
    writeHeader();
}
/// Initialises every header field for the given format. The two length fields
/// (package_len, data_len) are set to zero because the amount of data is not
/// known yet.
void WavOutFile::fillInHeader(uint sampleRate, uint bits, uint channels)
{
    // 'riff' part: tags 'RIFF' and 'WAVE', package_len unknown so far
    memcpy(&(header.riff.riff_char), riffStr, 4);
    memcpy(&(header.riff.wave), waveStr, 4);
    header.riff.package_len = 0;

    // 'format' part
    const short bytesPerSample = (short)(bits * channels / 8);

    memcpy(&(header.format.fmt), fmtStr, 4);
    header.format.format_len = 0x10;
    header.format.fixed = 1;
    header.format.channel_number = (short)channels;
    header.format.bits_per_sample = (short)bits;
    header.format.byte_per_sample = bytesPerSample;
    header.format.sample_rate = sampleRate;
    header.format.byte_rate = bytesPerSample * sampleRate;

    // 'data' part: tag 'data', data_len unknown so far
    memcpy(&(header.data.data_field), dataStr, 4);
    header.data.data_len = 0;
}
void WavOutFile::finishHeader()
// supplement the file length into the header structure
header.riff.package_len = bytesWritten + 36;
header.data.data_len = bytesWritten;
void WavOutFile::writeHeader()
WavHeader hdrTemp;
// swap byte order if necessary
hdrTemp = header;
_swap32((unsigned int &)hdrTemp.riff.package_len);
_swap32((unsigned int &)hdrTemp.format.format_len);
_swap16((unsigned short &)hdrTemp.format.fixed);
_swap16((unsigned short &)hdrTemp.format.channel_number);
_swap32((unsigned int &)hdrTemp.format.sample_rate);
_swap32((unsigned int &)hdrTemp.format.byte_rate);
_swap16((unsigned short &)hdrTemp.format.byte_per_sample);
_swap16((unsigned short &)hdrTemp.format.bits_per_sample);
_swap32((unsigned int &)hdrTemp.data.data_len);
// write the supplemented header in the beginning of the file
fseek(fptr, 0, SEEK_SET);
fwrite(&hdrTemp, sizeof(hdrTemp), 1, fptr);
// jump back to the end of the file
fseek(fptr, 0, SEEK_END);
void WavOutFile::close()
fptr = NULL;
void WavOutFile::write(const char *buffer, int numElems)
int res;
if (header.format.bits_per_sample != 8)
throw runtime_error("Error: WavOutFile::write(const char*, int) accepts only 8bit samples.");
assert(sizeof(char) == 1);
res = fwrite(buffer, 1, numElems, fptr);
if (res != numElems)
throw runtime_error("Error while writing to a wav file.");
bytesWritten += numElems;
void WavOutFile::write(const short *buffer, int numElems)
int res;
// 16 bit samples
if (numElems < 1) return; // nothing to do
if (header.format.bits_per_sample == 8)
int i;
char *temp = new char[numElems];
// convert from 16bit format to 8bit format
for (i = 0; i < numElems; i ++)
temp[i] = buffer[i] >> 8;
// write in 8bit format
write(temp, numElems);
delete[] temp;
// 16bit format
unsigned short *pTemp = new unsigned short[numElems];
assert(header.format.bits_per_sample == 16);
// allocate temp buffer to swap byte order if necessary
memcpy(pTemp, buffer, numElems * 2);
_swap16Buffer(pTemp, numElems);
res = fwrite(pTemp, 2, numElems, fptr);
delete[] pTemp;
if (res != numElems)
throw runtime_error("Error while writing to a wav file.");
bytesWritten += 2 * numElems;
void WavOutFile::write(const float *buffer, int numElems)
int i;
short *temp = new short[numElems];
int iTemp;
// convert to 16 bit integer
for (i = 0; i < numElems; i ++)
// convert to integer
iTemp = (int)(32768.0f * buffer[i]);
// saturate
if (iTemp < -32768) iTemp = -32768;
if (iTemp > 32767) iTemp = 32767;
temp[i] = (short)iTemp;
write(temp, numElems);
delete[] temp;
Simple SOLA algorithm Main.cpp
// Simple SOLA algorithm example. The example reads a .wav sound
// file with mono-16bit-44100Hz sample format, process it with SOLA
// and writes output into another .wav file.
// Copyright (c) Olli Parviainen 2006 <oparviai@iki.fi>
#include <stdexcept>
#include "wavfile.h"
using namespace std;
// Time scaling factor, values > 1.0 increase, values < 1.0 decrease tempo
#define TIME_SCALE 1 // 1 keeps the original tempo; e.g. 0.85 gives 15% slower tempo
// Processing sequence size in samples (the alternative 4410 is 100 msec with 44100Hz samplerate)
#define SEQUENCE 800//4410
// Overlapping size in samples (the alternative 882 is 20 msec with 44100Hz samplerate)
#define OVERLAP 160//882
// Best overlap offset seeking window in samples (the alternative 662 is 15 msec with 44100Hz samplerate)
#define SEEK_WINDOW 120//662
// Processing sequence flat mid-section duration: what remains of a sequence
// once the overlap areas at both of its ends are taken away
#define FLAT_DURATION (SEQUENCE - 2 * (OVERLAP))
// Theoretical interval between the processing sequences
#define SEQUENCE_SKIP ((int)((SEQUENCE - OVERLAP) * (TIME_SCALE)))
typedef short SAMPLE; // sample type, 16bit signed integer
// Use cross-correlation function to find best overlapping offset
// where input_prev and input_new match best with each other
int seek_best_overlap(const SAMPLE *input_prev, const SAMPLE *input_new)
int i;
int bestoffset = 0;
float bestcorr = -1e30f;
float temp[OVERLAP];
// Precalculate overlapping slopes with input_prev
for (i = 0; i < OVERLAP; i ++)
temp[i] = (float)(input_prev[i] * i * (OVERLAP - i));
// Find best overlap offset within [0..SEEK_WINDOW]
for (i = 0; i < SEEK_WINDOW; i ++)
int j;
float crosscorr = 0;
for (j = 0; j < OVERLAP; j ++)
crosscorr += (float)input_new[i + j] * temp[j];
if (crosscorr > bestcorr)
// found new best offset candidate
bestcorr = crosscorr;
bestoffset = i;
return bestoffset;
// Cross-fade from 'input_prev' to 'input_new' over OVERLAP samples: the weight
// of 'input_prev' falls linearly while the weight of 'input_new' rises.
// The result is stored to 'output'.
void overlap(SAMPLE *output, const SAMPLE *input_prev, const SAMPLE *input_new)
{
    for (int pos = 0; pos < OVERLAP; ++pos)
    {
        const int fadingOut = input_prev[pos] * (OVERLAP - pos);
        const int fadingIn = input_new[pos] * pos;

        output[pos] = (fadingOut + fadingIn) / OVERLAP;
    }
}
// SOLA algorithm. Performs time scaling for sample data given in 'input',
// write result to 'output'. Return number of output samples.
// Each round produces SEQUENCE - OVERLAP output samples (a flat mid-section
// followed by one cross-faded overlap) and consumes SEQUENCE_SKIP input samples.
// 'output' must be large enough for the result.
int sola(SAMPLE *output, const SAMPLE *input, int num_in_samples)
{
    int num_out_samples = 0;
    const SAMPLE *seq_offset = input;
    const SAMPLE *prev_offset;

    while (num_in_samples > SEQUENCE_SKIP + SEEK_WINDOW)
    {
        // copy flat mid-sequence from current processing sequence to output
        memcpy(output, seq_offset, FLAT_DURATION * sizeof(SAMPLE));

        // calculate a pointer to overlap at end of the processing sequence
        prev_offset = seq_offset + FLAT_DURATION;

        // update input pointer to theoretical next processing sequence begin
        input += SEQUENCE_SKIP - OVERLAP;

        // seek actual best matching offset using cross-correlation
        seq_offset = input + seek_best_overlap(prev_offset, input);

        // do overlapping between previous & new sequence, copy result to output
        overlap(output + FLAT_DURATION, prev_offset, seq_offset);

        // Update input & sequence pointers by overlapping amount
        seq_offset += OVERLAP;
        input += OVERLAP;

        // Update output pointer & sample counters; without advancing 'output'
        // every round would overwrite the start of the buffer
        output += SEQUENCE - OVERLAP;
        num_out_samples += SEQUENCE - OVERLAP;
        num_in_samples -= SEQUENCE_SKIP;
    }

    return num_out_samples;
}
// Buffers for input/output sample data. For sake of simplicity, these are
// just made 'big enough' for the example purpose: statically sized arrays of
// 10240000 input samples and 20240000 output samples.
SAMPLE inbuffer[10240000];
SAMPLE outbuffer[20240000];
int main(int numstr, char **pstr)
if (numstr < 3)
printf("usage: solatest input.wav output.wav\n");
return -1;
int insamples, outsamples;
// Open input file
WavInFile infile(pstr[1]);
if ((infile.getSampleRate() != 44100) || (infile.getNumChannels() != 1))
printf("Sorry, this example processes mono audio sampled at 44100Hz.\n");
return -1;
// Read data from input file
insamples = infile.read(inbuffer, 10240000);
// Process
outsamples = sola(outbuffer, inbuffer, insamples);
// Write result to output file
WavOutFile outfile(pstr[2], infile.getSampleRate(), infile.getNumBits(), infile.getNumChannels());
outfile.write(outbuffer, outsamples);
catch (exception &e)
printf("Error: %s\n", e.what());
return 0;
% NOTE(review): the original comments were mis-encoded Chinese; the English
% comments below are reconstructed translations and should be confirmed. Several
% statements (loop bodies, 'end' keywords) are not visible in this listing.
sa=585;ss=438; % these values are for lowering the pitch
%*********** time scaling **************
st=sa:sa:length(x); % at the start the speech segment is input from Sa; afterwards it moves on by Sa points each time
num=(length(x)-r)/sa; % num rounds of operation are needed in total
x=[x; zeros(w+kmax,1)];% the tail of the speech is zero-padded because analysis window W shifts, by at most kmax points
for j=1:(num)
y=x(xst:1:xst+w-1); % at the start the speech segment is taken from Sa
start=st(j):st(j)+kmax-1; % start points of the analysis window; it shifts by at most kmax points
cy=y(end:-1:end-wov+1); % take the last wov points of output sequence y
km_buf=zeros(1,kmax); % records the kmax cross-correlation coefficients
for i=1:kmax
xbuff=x(start(i):start(i)+w-1); % the w points captured by the analysis window
cx=xbuff(1:wov); % take the first wov points
if ( rxx_k==0) % zero means the zero-padded tail of the speech has been reached, so stop
km=find(km_buf==max(km_buf)); % find the position of the largest cross-correlation coefficient in km_buf
yout=[yout; x(start(km)+wov:start(km)+w+1)]; % take the Ss points of the best-matching sequence as the final output
for j=1:L;
for i=1:M;
if y_st<y_end
elseif y_st==y_end
invert=linspace(yout(y_st),0,L+1); % for the last point, interpolate L-1 points between it and zero
data=[data invert(1:end-1)]; % output L points to data each time
data_out=[data_out data(1:M:end)]; % take one point out of every M points in data and append it to data_out
data=[]; % clear the contents of data
Pitch Synchronous Overlap Add Method (PSOLA.CPP)
#include "../common/tdpsola.h"
#include "psola.h"
CPSOLA instance;
/// Exported switch for cosine smoothing.
/// NOTE(review): no function body is visible in this listing; presumably it
/// forwards 'enable' to the shared 'instance' -- confirm against the original file.
void PSOLA_EnableCosineSmooth(bool enable)
/// Forwards all arguments unchanged to setSpectralMapping() of the shared 'instance'.
void PSOLA_SetSpectralMapping(bool useBezier, int x1, int y1, int x2, int y2)
instance.setSpectralMapping(useBezier, x1, y1, x2, y2);
/// Returns the result of isCosineSmoothEnabled() of the shared 'instance'.
bool PSOLA_IsCosineSmoothEnabled()
return instance.isCosineSmoothEnabled();
/// Exported setter for the voiceless-extension method.
/// NOTE(review): no function body is visible in this listing; presumably it
/// forwards 'method' to the shared 'instance' -- confirm against the original file.
void PSOLA_EnableVoicelessExtension(int method)
/// Returns the result of getVoicelessExtension() of the shared 'instance'.
int PSOLA_GetVoicelessExtension()
return instance.getVoicelessExtension();
/// Forwards to modifyPitchContour() of the shared 'instance' and returns its result.
/// The argument order differs from the parameter order: the output buffer
/// (trgWave, trgBufferLength) is passed directly after the source tags, ahead of
/// trgPeriods, periodNumber, trgDuration and specRatio.
unsigned PSOLA_ModifyPitchContour(
const short * srcWave,
unsigned srcLength,
const unsigned *srcTags,
unsigned tagNumber,
const unsigned *trgPeriods,
unsigned periodNumber,
unsigned trgDuration,
float specRatio,
short * trgWave,
unsigned trgBufferLength,
unsigned sampleRate)
return instance.modifyPitchContour(srcWave, srcLength, srcTags, tagNumber, trgWave, trgBufferLength, trgPeriods, periodNumber, trgDuration, specRatio, sampleRate);
/// Forwards to modify() of the shared 'instance' and returns its result.
/// The argument order differs from the parameter order: the output buffer
/// (trgWave, trgBufferLength) is passed directly after the source tags, ahead of
/// trgPitch, trgDuration and specRatio.
unsigned PSOLA_Modify(
const short * srcWave,
unsigned srcLength,
const unsigned *srcTags,
unsigned tagNumber,
unsigned trgPitch,
unsigned trgDuration,
float specRatio,
short * trgWave,
unsigned trgBufferLength,
unsigned sampleRate)
return instance.modify(srcWave, srcLength, srcTags, tagNumber, trgWave, trgBufferLength, trgPitch, trgDuration, specRatio, sampleRate);
/// Forwards to modifyRatio() of the shared 'instance' and returns its result.
/// The argument order differs from the parameter order: the output buffer
/// (trgWave, trgBufferLength) is passed directly after the source tags, ahead of
/// pitchRatio, durationRatio and specRatio.
/// NOTE(review): the line closing the parameter list is not visible in this listing.
unsigned PSOLA_ModifyRatio(
const short * srcWave,
unsigned srcLength,
const unsigned * srcTags,
unsigned tagNumber,
float pitchRatio,
float durationRatio,
float specRatio,
short * trgWave,
unsigned trgBufferLength,
unsigned sampleRate
return instance.modifyRatio(srcWave, srcLength, srcTags, tagNumber, trgWave, trgBufferLength, pitchRatio, durationRatio, specRatio, sampleRate);
/// Modify wave using TP-PSOLA algorithm
/// @version 1.0.0
/// @author Jun Xu
/// @date 2007/07/18
// Import/export decoration for the functions declared below.
// NOTE(review): the conditional directives that choose between the 'dllimport'
// branch (which also links psolad.lib or psola.lib through #pragma) and the
// 'dllexport' branch are not visible in this listing -- confirm against the
// original header.
# define PSOLA_DLL_EXPORTS __declspec(dllimport)
# ifdef _DEBUG
# pragma comment(lib, "psolad.lib")
# pragma message("Linking with psolad.dll")
# else
# pragma comment(lib, "psola.lib")
# pragma message("Linking with psola.dll")
# endif
# define PSOLA_DLL_EXPORTS __declspec(dllexport)
// NOTE(review): the macro predefined by C++ compilers is '__cplusplus' (two
// leading underscores). As spelled here the 'extern "C"' block is presumably
// never enabled; confirm before changing, because that alters the linkage of the
// exported functions.
#ifdef _cplusplus
extern "C" {
#define PSOLA_VLPPMETHOD_NONE 0 ///< no pitch-period extension in unvoiced segments
#define PSOLA_VLPPMETHOD_PEAK 3 ///< extend periods in unvoiced segments based on the speech peak points
#define PSOLA_VLPPMETHOD_AUTO 4 ///< extend unvoiced segments automatically (decided by the duration modification ratio)
/// Set the spectral mapping mode used during modification. The results are
/// poor; use with caution.
/// @param useBezier true: use a Bezier curve, false: use a polyline
/// @param x1,y1 coordinates of the first control point
/// @param x2,y2 coordinates of the second control point
PSOLA_DLL_EXPORTS void PSOLA_SetSpectralMapping(bool useBezier, int x1, int y1, int x2, int y2);
/// Set the period-extension method for unvoiced segments.
/// @param method 0-3, see the macro definitions above
PSOLA_DLL_EXPORTS void PSOLA_EnableVoicelessExtension(int method);
/// Get the period-extension method for unvoiced segments.
PSOLA_DLL_EXPORTS int PSOLA_GetVoicelessExtension();
/// Enable smoothing of the concatenation boundaries with a cosine function.
/// @param enable true: enable, false: disable
PSOLA_DLL_EXPORTS void PSOLA_EnableCosineSmooth(bool enable);
/// Check whether cosine boundary smoothing is enabled.
PSOLA_DLL_EXPORTS bool PSOLA_IsCosineSmoothEnabled();
/// Modify wave using PSOLA model, specifying the target average pitch period and
/// the target speech length.
/// @param srcWave[in] wave buffer read from speech database;
/// the speech data must have 16-bit sample precision
/// @param srcLength[in] wave buffer length, in short count
/// (number of samples in the speech data)
/// @param srcTags[in] peak tags read from speech database;
/// each tag indicate the peak position offset to the first sample of wave
/// @param tagNumber[in] peak tag count of srcTags
/// @param trgPitch[in] predicted average pitch period;
/// trgPitch=0 means keeping pitch no change
/// @param trgDuration[in] predicted wave duration, in short (target number of
/// samples); 0 means the duration is not modified
/// @param specRatio [in] modification ratio of spectra; 0 means no modification
/// @param trgWave[out] modified wave, buffer should be allocated outside (by the
/// caller) and must be somewhat larger than trgDuration
/// @param sampleRate[in] Sample count per second, default is 16000
/// (16000 is the normally recommended value)
/// @return true if modified successfully
/// false if not, then the content of trgWave if un-defined
/// NOTE(review): the first line of this declaration (return type and function
/// name) is not visible in this listing, and 'trgBufferLength' is undocumented.
const short * srcWave,
unsigned srcLength,
const unsigned *srcTags,
unsigned tagNumber,
unsigned trgPitch,
unsigned trgDuration,
float specRatio,
short * trgWave,
unsigned trgBufferLength,
unsigned sampleRate);
/// Modify wave using PSOLA model, specifying the target pitch contour.
/// @param srcWave[in] wave buffer read from speech database;
/// the speech data must have 16-bit sample precision
/// @param srcLength[in] wave buffer length, in short count
/// (number of samples in the speech data)
/// @param srcTags[in] peak tags read from speech database;
/// each tag indicate the peak position offset to the first sample of wave
/// @param tagNumber[in] peak tag count of srcTags
/// @param trgPeriods[in] predicted pitch period (array of target pitch periods)
/// @param periodNumber[in] pitch period count of target
/// @param trgDuration[in] predicted wave duration, in short (target number of
/// samples); 0 means the duration is not modified
/// @param specRatio [in] modification ratio of spectra; 0 means no modification
/// @param trgWave[out] modified wave, buffer should be allocated outside (by the
/// caller) and must be somewhat larger than trgDuration
/// @param sampleRate[in] Sample count per second, default is 16000
/// (16000 is the normally recommended value)
/// @return true if modified successfully
/// false if not, then the content of trgWave if un-defined
/// NOTE(review): the declared return type is 'unsigned' although the text above
/// speaks of true/false, and 'trgBufferLength' is undocumented -- confirm.
PSOLA_DLL_EXPORTS unsigned PSOLA_ModifyPitchContour(
const short * srcWave,
unsigned srcLength,
const unsigned *srcTags,
unsigned tagNumber,
const unsigned *trgPeriods,
unsigned periodNumber,
unsigned trgDuration,
float specRatio,
short * trgWave,
unsigned trgBufferLength,
unsigned sampleRate);
/// Modify wave using PSOLA model, specifying modification ratios for the pitch
/// period and the duration.
/// @param srcWave[in] wave buffer read from speech database;
/// the speech data must have 16-bit sample precision
/// @param srcLength[in] wave buffer length, in short count
/// (number of samples in the speech data)
/// @param srcTags[in] peak tags read from speech database;
/// each tag indicate the peak position offset to the first sample of wave
/// @param tagNumber[in] peak tag count of srcTags
/// @param pitchRatio[in] modification ratio of pitch (target pitch period);
/// 0 means the pitch is not modified
/// @param durationRatio[in]modification ratio of duration;
/// 0 means the duration is not modified
/// @param specRatio [in] modification ratio of spectra; 0 means no modification
/// @param trgWave[out] modified wave, buffer should be allocated outside (by the
/// caller) and must be somewhat larger than trgDuration
/// @param sampleRate[in] Sample count per second, default is 16000
/// (16000 is the normally recommended value)
/// @return true if modified successfully
/// false if not, then the content of trgWave if un-defined
/// NOTE(review): the declared return type is 'unsigned' although the text above
/// speaks of true/false, and 'trgBufferLength' is undocumented -- confirm.
PSOLA_DLL_EXPORTS unsigned PSOLA_ModifyRatio(
const short * srcWave,
unsigned srcLength,
const unsigned *srcTags,
unsigned tagNumber,
float pitchRatio,
float durationRatio,
float specRatio,
short * trgWave,
unsigned trgBufferLength,
unsigned sampleRate);
#ifdef _cplusplus
PSOLA.h Header
#ifndef PSOLA_H_
#define PSOLA_H_
#include <vector>
#include "DSP.h"
using namespace std;
/// PSOLA processing class.
/// NOTE(review): only declarations are visible here; access specifiers,
/// constructors and the end of the class are not visible in this listing. The
/// group headings below are inferred from member names and signatures alone and
/// must be confirmed against the implementation.
class CPsola{
// Setters for the input data and processing parameters (presumably).
void SetData(short*,unsigned);
void SetAmplitudeMultiple(float);
void SetDuration(float);
void SetPitch(float*,unsigned,float);
void SetNewPitch(float*,unsigned);
void SetSampleFrequency(unsigned);
void SetFrameLength(float);
void SetX1(float);
// Processing entry points (presumably).
void Adjust();
void TD_PSOLA(float,float);
void PSOLA(float,float,bool);
// Accessors for the processed result (presumably).
unsigned GetNewLen();
short* GetNewData();
// Helper routines (presumably internal).
unsigned FindMax(unsigned,unsigned,short*);
int Approximate(float);
short Middle(unsigned,short*);
bool MarkPitch();
void MarkOneFrame(unsigned,unsigned);
void AdjustAmplitude();
void AdjustDuration();
void AdjustPitch();
void Smooth(short*,unsigned);
// Data members; the pointer members' ownership is not visible here.
unsigned m_uSamFre;
float m_dFrameLen;
float m_dX1;
float m_dAmpMul;
float m_dDuration;
unsigned m_uPitchLen;
float* m_dPitch;
float* m_dNewPitch;
unsigned m_uDataLen;
// unsigned m_uNewPitchLen;
short* m_Data;
bool* flag;
short* m_InData;
CDSP m_filter;
// Further helpers working on vectors of pitch marks (presumably).
void GetPitchMarks(vector<unsigned>&);
bool IsVowel(unsigned);
int GetAvgPitchLen(vector<unsigned>&,int&);
void GetFinal(vector<unsigned>&,vector<unsigned>&,
int,vector<int>&,vector<vector<unsigned> >&);
void GetUseds(int,int,int,vector<int>&);
// Note: differs from Smooth(short*,unsigned) above by the lower-case name and
// the extra vector parameter.
void smooth(short*,unsigned,vector<float>&);
// 'pBeta' is optional and defaults to NULL.
void OverlapAdd(vector<vector<unsigned> >& final, short* y, unsigned ylen,
vector<float>& w, float* pBeta = NULL);
// Overload of PSOLA taking a float array and an int.
void PSOLA(float,float*,int,float);
Subscribe to:
Posts (Atom)