Learn about text to speech

Sunday, 22 April 2012

////////////////////////////////////////////////////////////////////////////////
///
/// Classes for easy reading & writing of WAV sound files.
///
/// For big-endian CPU, define _BIG_ENDIAN_ during compile-time to correctly
/// parse the WAV files with such processors.
///
/// Admittedly, more complete WAV reader routines may exist in public domain,
/// but the reason for 'yet another' one is that those generic WAV reader
/// libraries are exhaustingly large and cumbersome! Wanted to have something
/// simpler here, i.e. something that's not already larger than rest of the
/// SoundTouch/SoundStretch program...
///
/// Author : Copyright (c) Olli Parviainen
/// Author e-mail : oparviai 'at' iki.fi
/// SoundTouch WWW: http://www.surina.net/soundtouch
///
////////////////////////////////////////////////////////////////////////////////
//
// Last changed : $Date: 2006/02/05 16:44:06 $
// File revision : $Revision: 1.15 $
//
// $Id: WavFile.cpp,v 1.15 2006/02/05 16:44:06 Olli Exp $
//
////////////////////////////////////////////////////////////////////////////////
//
// License :
//
// SoundTouch audio processing library
// Copyright (c) Olli Parviainen
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
////////////////////////////////////////////////////////////////////////////////

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>

#include <stdexcept>
#include <string>

#include "WavFile.h"

using namespace std;

// Four-character chunk tags used when parsing and writing WAV headers.
// Each array also holds a terminating NUL: the memcmp()/memcpy() calls in this
// file use only the first 4 bytes, while the strcmp() calls rely on the terminator.
const static char riffStr[] = "RIFF";
const static char waveStr[] = "WAVE";
const static char fmtStr[] = "fmt ";
const static char dataStr[] = "data";

//////////////////////////////////////////////////////////////////////////////
//
// Helper functions for swapping byte order to correctly read/write WAV files
// with big-endian CPU's: Define compile-time definition _BIG_ENDIAN_ to
// turn-on the conversion if it appears necessary.
//
// For example, Intel x86 is little-endian and doesn't require conversion,
// while PowerPC of Mac's and many other RISC cpu's are big-endian.

// When the included headers define BYTE_ORDER, derive _BIG_ENDIAN_ from it so the
// byte-swapping helpers below are enabled without passing a compiler flag.
#ifdef BYTE_ORDER
// In gcc compiler detect the byte order automatically
#if BYTE_ORDER == BIG_ENDIAN
// big-endian platform.
#define _BIG_ENDIAN_
#endif
#endif

#ifdef _BIG_ENDIAN_
// Big-endian build: multi-byte header fields and 16-bit samples are byte-reversed
// in place by the helpers below.

// Reverses the byte order of a 32-bit value in place.
static inline void _swap32(unsigned int &dwData)
{
    const unsigned int toByte0 = (dwData >> 24) & 0x000000FF;
    const unsigned int toByte1 = (dwData >> 8) & 0x0000FF00;
    const unsigned int toByte2 = (dwData << 8) & 0x00FF0000;
    const unsigned int toByte3 = (dwData << 24) & 0xFF000000;

    dwData = toByte0 | toByte1 | toByte2 | toByte3;
}

// Exchanges the two bytes of a 16-bit value in place.
static inline void _swap16(unsigned short &wData)
{
    const int lowFromHigh = (wData >> 8) & 0x00FF;
    const int highFromLow = (wData << 8) & 0xFF00;

    wData = lowFromHigh | highFromLow;
}

// Applies _swap16() to each of the 'dwNumWords' values starting at 'pData'.
static inline void _swap16Buffer(unsigned short *pData, unsigned int dwNumWords)
{
    unsigned short *const pEnd = pData + dwNumWords;

    for (unsigned short *pWord = pData; pWord != pEnd; ++pWord)
    {
        _swap16(*pWord);
    }
}

#else // BIG_ENDIAN
// Build without _BIG_ENDIAN_: the helpers are kept as no-ops so that callers
// do not need their own #ifdefs.

// No-op counterpart of the 32-bit swap.
static inline void _swap32(unsigned int &dwData)
{
    (void)dwData;
}

// No-op counterpart of the 16-bit swap.
static inline void _swap16(unsigned short &wData)
{
    (void)wData;
}

// No-op counterpart of the 16-bit buffer swap.
static inline void _swap16Buffer(unsigned short *pData, unsigned int dwNumBytes)
{
    (void)pData;
    (void)dwNumBytes;
}

#endif // BIG_ENDIAN

//////////////////////////////////////////////////////////////////////////////
//
// Class WavInFile
//

// Opens 'fileName' for binary reading and parses its WAV headers.
//
// Throws std::runtime_error when
//  - the file cannot be opened,
//  - readWavHeaders() reports a problem with the headers, or
//  - the format tag (header.format.fixed) is not 1 (the uncompressed PCM tag).
//
// If validation fails after fopen() succeeded, the handle is closed before the
// exception is thrown: the destructor does not run for an object whose
// constructor threw, so the FILE would otherwise leak.
WavInFile::WavInFile(const char *fileName)
{
    dataRead = 0;

    // Try to open the file for reading
    fptr = fopen(fileName, "rb");
    if (fptr == NULL)
    {
        string msg = "Error : Unable to open file \"";
        msg += fileName;
        msg += "\" for reading.";
        throw runtime_error(msg);
    }

    // Validate the headers; remember which message applies, if any
    const char *problem = NULL;
    if (readWavHeaders() != 0)
    {
        problem = "\" is corrupt or not a WAV file";
    }
    else if (header.format.fixed != 1)
    {
        problem = "\" uses unsupported encoding.";
    }

    if (problem != NULL)
    {
        // release the handle ourselves, see the note above
        fclose(fptr);
        fptr = NULL;

        string msg = "File \"";
        msg += fileName;
        msg += problem;
        throw runtime_error(msg);
    }
}

// Destructor: releases the input file by delegating to close().
WavInFile::~WavInFile()
{
close();
}

// Seeks back to the beginning of the file, re-reads the WAV headers and resets
// the count of data bytes read so far. Header re-parsing is only verified by an
// assert().
void WavInFile::rewind()
{
    fseek(fptr, 0, SEEK_SET);

    const int headerStatus = readWavHeaders();
    assert(headerStatus == 0);

    dataRead = 0;
}

// Verifies that the parsed header carries both required chunk tags.
// Returns 0 when header.format.fmt is 'fmt ' and header.data.data_field is
// 'data', otherwise -1.
int WavInFile::checkCharTags()
{
    const bool hasFmtTag = (memcmp(fmtStr, header.format.fmt, 4) == 0);
    const bool hasDataTag = (memcmp(dataStr, header.data.data_field, 4) == 0);

    return (hasFmtTag && hasDataTag) ? 0 : -1;
}

// Reads up to 'maxElems' 8-bit samples into 'buffer'.
//
// Never reads past the data length announced in the header. Returns the number
// of bytes actually read (0 when 'maxElems' is not positive or no data is left).
// Throws std::runtime_error if the file does not hold 8-bit samples.
int WavInFile::read(char *buffer, int maxElems)
{
    // ensure it's 8 bit format
    if (header.format.bits_per_sample != 8)
    {
        throw runtime_error("Error: WavInFile::read(char*, int) works only with 8bit samples.");
    }

    // A negative count would be converted to a huge size_t by fread()
    if (maxElems <= 0) return 0;

    // Bytes left in the data chunk. Computed as a difference so that the
    // comparison cannot wrap around the way 'dataRead + maxElems' could.
    uint remaining = 0;
    if (dataRead < header.data.data_len)
    {
        remaining = header.data.data_len - dataRead;
    }

    // Don't read more samples than are marked available in header
    uint numBytes = (uint)maxElems;
    if (numBytes > remaining) numBytes = remaining;

    const int bytesRead = (int)fread(buffer, 1, numBytes, fptr);
    dataRead += bytesRead;

    return bytesRead;
}

// Reads up to 'maxElems' samples into 'buffer' as 16-bit signed values.
//
// 8-bit files are converted on the fly: 8-bit WAV samples are unsigned with the
// zero level at 128, so each byte is re-centred before being scaled to 16 bits.
// 16-bit files are read directly and byte-swapped when required by the build.
// Returns the number of samples stored (0 when 'maxElems' is not positive).
int WavInFile::read(short *buffer, int maxElems)
{
    if (maxElems <= 0) return 0;

    if (header.format.bits_per_sample == 8)
    {
        // 8 bit format
        char *temp = new char[maxElems];

        // read(char*, int) throws only for non-8-bit files, which has just been
        // ruled out, so 'temp' cannot leak through an exception here
        const int numElems = read(temp, maxElems);

        // convert from unsigned 8 bit to signed 16 bit
        for (int i = 0; i < numElems; i ++)
        {
            const int unsignedSample = (unsigned char)temp[i];
            buffer[i] = (short)((unsignedSample - 128) * 256);
        }
        delete[] temp;

        return numElems;
    }

    // 16 bit format
    assert(header.format.bits_per_sample == 16);
    assert(sizeof(short) == 2);

    // Bytes left in the data chunk
    unsigned int remaining = 0;
    if (dataRead < header.data.data_len)
    {
        remaining = header.data.data_len - dataRead;
    }

    // Byte count in unsigned arithmetic: 'maxElems * 2' may not fit in an int
    unsigned int numBytes = (unsigned int)maxElems * 2u;
    if (numBytes > remaining)
    {
        // Don't read more samples than are marked available in header
        numBytes = remaining;
    }

    const size_t bytesRead = fread(buffer, 1, numBytes, fptr);
    dataRead += (unsigned int)bytesRead;
    const int numElems = (int)(bytesRead / 2);

    // 16bit samples, swap byte order if necessary
    _swap16Buffer((unsigned short *)buffer, numElems);

    return numElems;
}

// Reads up to 'maxElems' samples and stores them in 'buffer' as floats scaled
// by 1/32768, i.e. to the range [-1..+1[. Samples are first read as 16-bit
// integers through read(short*, int). Returns the number of samples stored.
int WavInFile::read(float *buffer, int maxElems)
{
    const double toUnitRange = 1.0 / 32768.0;
    short *samples = new short[maxElems];

    const int count = read(samples, maxElems);

    const short *src = samples;
    float *dst = buffer;
    for (int left = count; left > 0; --left)
    {
        *dst++ = (float)(toUnitRange * (double)(*src++));
    }

    delete[] samples;

    return count;
}

// Returns 1 once every data byte announced in the header has been read or the
// stream's end-of-file indicator is set, otherwise 0.
int WavInFile::eof() const
{
    if (dataRead == header.data.data_len) return 1;

    return feof(fptr) ? 1 : 0;
}

// Closes the input file. Safe to call more than once: the destructor also calls
// close(), and passing a null stream to fclose() is not allowed.
void WavInFile::close()
{
    if (fptr != NULL)
    {
        fclose(fptr);
        fptr = NULL;
    }
}

// Returns 1 when the character code lies in the inclusive range ' ' .. 'z',
// otherwise 0. (Despite the name this accepts digits and punctuation too.)
static int isAlpha(char c)
{
    if (c < ' ') return 0;
    if (c > 'z') return 0;
    return 1;
}

// Returns 1 when every character of the NUL-terminated string passes isAlpha()
// (an empty string passes as well), otherwise 0.
static int isAlphaStr(char *str)
{
    for (const char *p = str; *p != 0; ++p)
    {
        if (isAlpha(*p) == 0) return 0;
    }

    return 1;
}

// Reads the leading RIFF block into header.riff and validates its tags.
// Returns 0 when the block starts with 'RIFF' and carries the 'WAVE' form
// type, otherwise -1.
int WavInFile::readRIFFBlock()
{
    fread(&(header.riff), sizeof(WavRiff), 1, fptr);

    // swap 32bit data byte order if necessary
    _swap32((unsigned int &)header.riff.package_len);

    const bool isRiff = (memcmp(riffStr, header.riff.riff_char, 4) == 0);
    const bool isWave = (memcmp(waveStr, header.riff.wave, 4) == 0);

    return (isRiff && isWave) ? 0 : -1;
}

int WavInFile::readHeaderBlock()
{
char label[5];
string sLabel;

// lead label string
fread(label, 1, 4, fptr);
label[4] = 0;

if (isAlphaStr(label) == 0) return -1; // not a valid label

// Decode blocks according to their label
if (strcmp(label, fmtStr) == 0)
{
int nLen, nDump;

// 'fmt ' block
memcpy(header.format.fmt, fmtStr, 4);

// read length of the format field
fread(&nLen, sizeof(int), 1, fptr);
// swap byte order if necessary
_swap32((unsigned int &)nLen); // int format_len;
header.format.format_len = nLen;

// calculate how much length differs from expected
nDump = nLen - (sizeof(header.format) - 8);

// if format_len is larger than expected, read only as much data as we've space for
if (nDump > 0)
{
nLen = sizeof(header.format) - 8;
}

// read data
fread(&(header.format.fixed), nLen, 1, fptr);

// swap byte order if necessary
_swap16((unsigned short &)header.format.fixed); // short int fixed;
_swap16((unsigned short &)header.format.channel_number); // short int channel_number;
_swap32((unsigned int &)header.format.sample_rate); // int sample_rate;
_swap32((unsigned int &)header.format.byte_rate); // int byte_rate;
_swap16((unsigned short &)header.format.byte_per_sample); // short int byte_per_sample;
_swap16((unsigned short &)header.format.bits_per_sample); // short int bits_per_sample;

// if format_len is larger than expected, skip the extra data
if (nDump > 0)
{
fseek(fptr, nDump, SEEK_CUR);
}

return 0;
}
else if (strcmp(label, dataStr) == 0)
{
// 'data' block
memcpy(header.data.data_field, dataStr, 4);
fread(&(header.data.data_len), sizeof(uint), 1, fptr);

// swap byte order if necessary
_swap32((unsigned int &)header.data.data_len);

return 1;
}
else
{
uint len, i;
uint temp;
// unknown block

// read length
fread(&len, sizeof(len), 1, fptr);
// scan through the block
for (i = 0; i < len; i ++)
{
fread(&temp, 1, 1, fptr);
if (feof(fptr)) return -1; // unexpected eof
}
}
return 0;
}

// Clears the header structure and parses the file headers: first the RIFF
// block, then chunk after chunk until readHeaderBlock() reports the data chunk.
// Returns 0 on success; 1 when a block fails to parse; otherwise the result of
// checkCharTags().
int WavInFile::readWavHeaders()
{
    memset(&header, 0, sizeof(header));

    if (readRIFFBlock() != 0) return 1;

    for (;;)
    {
        const int status = readHeaderBlock();
        if (status < 0) return 1; // error in file structure
        if (status != 0) break;   // data block found
    }

    // check that all required tags are legal
    return checkCharTags();
}

// Returns the channel count stored in the parsed format header
// (header.format.channel_number).
uint WavInFile::getNumChannels() const
{
return header.format.channel_number;
}

// Returns the bits-per-sample value stored in the parsed format header
// (header.format.bits_per_sample).
uint WavInFile::getNumBits() const
{
return header.format.bits_per_sample;
}

// Returns channels * bits-per-sample / 8 (integer division), computed from the
// accessors rather than read from header.format.byte_per_sample.
uint WavInFile::getBytesPerSample() const
{
return getNumChannels() * getNumBits() / 8;
}

// Returns the sample rate stored in the parsed format header
// (header.format.sample_rate).
uint WavInFile::getSampleRate() const
{
return header.format.sample_rate;
}

// Returns the length of the data chunk in bytes as announced by the header
// (header.data.data_len).
uint WavInFile::getDataSizeInBytes() const
{
return header.data.data_len;
}

// Returns the data length divided by header.format.byte_per_sample.
// Returns 0 when the header carries a non-positive byte_per_sample value (for
// instance a truncated 'fmt ' chunk leaves the field at 0), instead of dividing
// by zero.
uint WavInFile::getNumSamples() const
{
    if (header.format.byte_per_sample <= 0) return 0;

    return header.data.data_len / header.format.byte_per_sample;
}

// Returns the duration of the audio data in milliseconds (truncated).
//
// The product 1000 * numSamples is formed in double precision: in unsigned
// 32-bit arithmetic it overflows as soon as the file holds more than
// UINT_MAX / 1000 samples. Returns 0 when the header's sample rate is 0 and
// saturates at UINT_MAX if the result does not fit.
uint WavInFile::getLengthMS() const
{
    const double numSamples = (double)getNumSamples();
    const double sampleRate = (double)getSampleRate();

    if (sampleRate <= 0) return 0;

    const double lengthMs = 1000.0 * numSamples / sampleRate;
    if (lengthMs >= (double)UINT_MAX) return UINT_MAX;

    return (uint)lengthMs;
}

//////////////////////////////////////////////////////////////////////////////
//
// Class WavOutFile
//

// Creates (or truncates) 'fileName' for binary writing, fills in the header
// from the given sample rate, bit depth and channel count, and writes a
// provisional header whose length fields are still zero.
// Throws std::runtime_error when the file cannot be opened.
WavOutFile::WavOutFile(const char *fileName, int sampleRate, int bits, int channels)
{
    bytesWritten = 0;

    fptr = fopen(fileName, "wb");
    if (fptr != NULL)
    {
        fillInHeader(sampleRate, bits, channels);
        writeHeader();
        return;
    }

    string msg("Error : Unable to open file \"");
    msg.append(fileName);
    msg.append("\" for writing.");
    throw runtime_error(msg);
}

// Destructor: delegates to close(), which rewrites the header with the final
// lengths and closes the file.
WavOutFile::~WavOutFile()
{
close();
}

// Initialises the in-memory header for PCM output with the given sample rate,
// bits per sample and channel count. Both length fields (riff.package_len and
// data.data_len) start at 0; finishHeader() sets them later.
void WavOutFile::fillInHeader(uint sampleRate, uint bits, uint channels)
{
    // 'riff' part: tags first, package length is not known yet
    memcpy(&(header.riff.riff_char), riffStr, 4);
    memcpy(&(header.riff.wave), waveStr, 4);
    header.riff.package_len = 0;

    // 'format' part
    memcpy(&(header.format.fmt), fmtStr, 4);
    header.format.format_len = 0x10;
    header.format.fixed = 1;
    header.format.channel_number = (short)channels;
    header.format.bits_per_sample = (short)bits;
    header.format.sample_rate = sampleRate;
    header.format.byte_per_sample = (short)(bits * channels / 8);
    // derived from the value just stored above
    header.format.byte_rate = header.format.byte_per_sample * sampleRate;

    // 'data' part: tag now, data length is not known yet
    memcpy(&(header.data.data_field), dataStr, 4);
    header.data.data_len = 0;
}

// Stores the final sizes in the header and writes it to the file again:
// data_len is the number of sample bytes written, package_len is that plus 36.
// NOTE(review): 36 presumably is the header size minus the 8 bytes of the RIFF
// tag and length field - confirm against sizeof(WavHeader).
void WavOutFile::finishHeader()
{
    header.data.data_len = bytesWritten;
    header.riff.package_len = bytesWritten + 36;

    writeHeader();
}

void WavOutFile::writeHeader()
{
WavHeader hdrTemp;

// swap byte order if necessary
hdrTemp = header;
_swap32((unsigned int &)hdrTemp.riff.package_len);
_swap32((unsigned int &)hdrTemp.format.format_len);
_swap16((unsigned short &)hdrTemp.format.fixed);
_swap16((unsigned short &)hdrTemp.format.channel_number);
_swap32((unsigned int &)hdrTemp.format.sample_rate);
_swap32((unsigned int &)hdrTemp.format.byte_rate);
_swap16((unsigned short &)hdrTemp.format.byte_per_sample);
_swap16((unsigned short &)hdrTemp.format.bits_per_sample);
_swap32((unsigned int &)hdrTemp.data.data_len);

// write the supplemented header in the beginning of the file
fseek(fptr, 0, SEEK_SET);
fwrite(&hdrTemp, sizeof(hdrTemp), 1, fptr);
// jump back to the end of the file
fseek(fptr, 0, SEEK_END);
}

// Finalises the header and closes the output file. Safe to call more than
// once: the destructor also calls close(), and after the first call there is no
// open stream left to seek, write or close.
void WavOutFile::close()
{
    if (fptr == NULL) return;

    finishHeader();
    fclose(fptr);
    fptr = NULL;
}

// Writes 'numElems' 8-bit samples from 'buffer' to the file.
//
// Does nothing when 'numElems' is less than 1, matching the 16-bit overload (a
// negative count would otherwise be converted to a huge size_t by fwrite()).
// Throws std::runtime_error if the file was not opened for 8-bit samples or if
// fewer bytes than requested could be written.
void WavOutFile::write(const char *buffer, int numElems)
{
    if (header.format.bits_per_sample != 8)
    {
        throw runtime_error("Error: WavOutFile::write(const char*, int) accepts only 8bit samples.");
    }

    if (numElems < 1) return; // nothing to do

    const size_t written = fwrite(buffer, 1, (size_t)numElems, fptr);
    if (written != (size_t)numElems)
    {
        throw runtime_error("Error while writing to a wav file.");
    }

    bytesWritten += numElems;
}

// Writes 'numElems' 16-bit signed samples from 'buffer' to the file.
//
// For 8-bit output files each sample is reduced to its high byte and shifted
// to the unsigned range (8-bit WAV samples are unsigned with the zero level at
// 128). For 16-bit files the samples are written through a temporary copy so
// that byte swapping never modifies the caller's data.
// Throws std::runtime_error on a short write; the temporary buffers are freed
// on every path, including when the nested 8-bit write throws.
void WavOutFile::write(const short *buffer, int numElems)
{
    if (numElems < 1) return; // nothing to do

    if (header.format.bits_per_sample == 8)
    {
        char *temp = new char[numElems];

        // convert from signed 16bit format to unsigned 8bit format
        for (int i = 0; i < numElems; i ++)
        {
            temp[i] = (char)(unsigned char)((buffer[i] >> 8) + 128);
        }

        // write in 8bit format; release the buffer even if the write throws
        try
        {
            write(temp, numElems);
        }
        catch (...)
        {
            delete[] temp;
            throw;
        }
        delete[] temp;
        return;
    }

    // 16bit format
    assert(header.format.bits_per_sample == 16);

    // temp buffer to swap byte order if necessary
    unsigned short *pTemp = new unsigned short[numElems];
    memcpy(pTemp, buffer, (size_t)numElems * 2);
    _swap16Buffer(pTemp, numElems);

    const size_t written = fwrite(pTemp, 2, (size_t)numElems, fptr);

    delete[] pTemp;

    if (written != (size_t)numElems)
    {
        throw runtime_error("Error while writing to a wav file.");
    }
    bytesWritten += 2 * numElems;
}

// Writes 'numElems' float samples, nominally in the range [-1..+1[, after
// converting them to 16-bit integers (scaled by 32768, truncated toward zero
// and saturated to [-32768, 32767]).
//
// Saturation is done before the float-to-integer conversion because converting
// an out-of-range or NaN float to int is undefined; NaN samples are written as
// 0. Does nothing when 'numElems' is less than 1. The temporary buffer is freed
// even when the underlying write throws.
void WavOutFile::write(const float *buffer, int numElems)
{
    if (numElems < 1) return; // nothing to do

    short *temp = new short[numElems];

    // convert to 16 bit integer
    for (int i = 0; i < numElems; i ++)
    {
        const float scaled = 32768.0f * buffer[i];

        if (scaled >= 32767.0f)
        {
            temp[i] = 32767;
        }
        else if (scaled <= -32768.0f)
        {
            temp[i] = -32768;
        }
        else if (scaled != scaled)
        {
            temp[i] = 0; // NaN
        }
        else
        {
            temp[i] = (short)(int)scaled;
        }
    }

    try
    {
        write(temp, numElems);
    }
    catch (...)
    {
        delete[] temp;
        throw;
    }

    delete[] temp;
}

Simple SOLA algorithm Main.cpp

/////////////////////////////////////////////////////////////////////
//
// Simple SOLA algorithm example. The example reads a .wav sound
// file with mono-16bit-44100Hz sample format, process it with SOLA
// and writes output into another .wav file.
//
// Copyright (c) Olli Parviainen 2006 <oparviai@iki.fi>
//
/////////////////////////////////////////////////////////////////////

#include <stdexcept>
#include "wavfile.h"

using namespace std;

// Tuning constants, all expressed in samples. The number after "//" on each
// line is the value previously used for 44100Hz input.
// NOTE(review): the active values equal 100 / 20 / 15 msec at 8000Hz, while
// main() only accepts 44100Hz files - confirm which sample rate is intended.

// Time scaling factor, values > 1.0 increase, values < 1.0 decrease tempo
#define TIME_SCALE 1 // 1 makes SEQUENCE_SKIP equal to SEQUENCE - OVERLAP, i.e. tempo unchanged
// Processing sequence size (was 4410 = 100 msec with 44100Hz samplerate)
#define SEQUENCE 800//4410
// Overlapping size (was 882 = 20 msec with 44100Hz samplerate)
#define OVERLAP 160//882
// Best overlap offset seeking window (was 662 = 15 msec with 44100Hz samplerate)
#define SEEK_WINDOW 120//662
// Processing sequence flat mid-section duration
#define FLAT_DURATION (SEQUENCE - 2 * (OVERLAP))
// Theoretical interval between the processing sequences
#define SEQUENCE_SKIP ((int)((SEQUENCE - OVERLAP) * (TIME_SCALE)))

typedef short SAMPLE; // sample type, 16bit signed integer

// Use cross-correlation function to find best overlapping offset
// where input_prev and input_new match best with each other
int seek_best_overlap(const SAMPLE *input_prev, const SAMPLE *input_new)
{
int i;
int bestoffset = 0;
float bestcorr = -1e30f;
float temp[OVERLAP];

// Precalculate overlapping slopes with input_prev
for (i = 0; i < OVERLAP; i ++)
{
temp[i] = (float)(input_prev[i] * i * (OVERLAP - i));
}

// Find best overlap offset within [0..SEEK_WINDOW]
for (i = 0; i < SEEK_WINDOW; i ++)
{
int j;
float crosscorr = 0;

for (j = 0; j < OVERLAP; j ++)
{
crosscorr += (float)input_new[i + j] * temp[j];
}
if (crosscorr > bestcorr)
{
// found new best offset candidate
bestcorr = crosscorr;
bestoffset = i;
}
}
return bestoffset;
}

// Cross-fades OVERLAP samples: the weight of 'input_prev' falls linearly from
// OVERLAP to 1 while the weight of 'input_new' rises from 0 to OVERLAP - 1; the
// weighted sum is divided by OVERLAP and stored to 'output'.
void overlap(SAMPLE *output, const SAMPLE *input_prev, const SAMPLE *input_new)
{
    for (int pos = 0; pos < OVERLAP; ++pos)
    {
        const int fadingOut = input_prev[pos] * (OVERLAP - pos);
        const int fadingIn = input_new[pos] * pos;

        output[pos] = (fadingOut + fadingIn) / OVERLAP;
    }
}

// SOLA time scaling. Consumes 'num_in_samples' samples from 'input' in steps of
// SEQUENCE_SKIP and writes SEQUENCE - OVERLAP samples to 'output' per step: the
// flat middle part of the current sequence followed by a cross-fade into the
// best matching start of the next one. Processing stops when no more than
// SEQUENCE_SKIP + SEEK_WINDOW input samples remain.
// Returns the number of samples written to 'output'.
int sola(SAMPLE *output, const SAMPLE *input, int num_in_samples)
{
    const int outStep = SEQUENCE - OVERLAP;
    const SAMPLE *sequence = input;
    int produced = 0;

    for (int remaining = num_in_samples;
         remaining > SEQUENCE_SKIP + SEEK_WINDOW;
         remaining -= SEQUENCE_SKIP)
    {
        // flat mid-section of the current sequence goes to the output unchanged
        memcpy(output, sequence, FLAT_DURATION * sizeof(SAMPLE));
        // tail of the current sequence, to be cross-faded
        const SAMPLE *tail = sequence + FLAT_DURATION;

        // theoretical start of the next sequence ...
        input += SEQUENCE_SKIP - OVERLAP;
        // ... refined by the cross-correlation search
        const SAMPLE *next = input + seek_best_overlap(tail, input);

        // cross-fade the tail into the next sequence right after the flat part
        overlap(output + FLAT_DURATION, tail, next);

        // the overlapped samples are consumed on both pointers
        sequence = next + OVERLAP;
        input += OVERLAP;

        output += outStep;
        produced += outStep;
    }

    return produced;
}

// Buffers for input/output sample data. For sake of simplicity, these are
// just made 'big enough' for the example purpose. They are globals (static
// storage) rather than locals of main(); the output buffer has roughly twice
// the capacity of the input buffer.
SAMPLE inbuffer[10240000];
SAMPLE outbuffer[20240000];

int main(int numstr, char **pstr)
{

if (numstr < 3)
{
printf("usage: solatest input.wav output.wav\n");
return -1;
}

try
{
int insamples, outsamples;

// Open input file
WavInFile infile(pstr[1]);

if ((infile.getSampleRate() != 44100) || (infile.getNumChannels() != 1))
{
printf("Sorry, this example processes mono audio sampled at 44100Hz.\n");
return -1;
}

// Read data from input file
insamples = infile.read(inbuffer, 10240000);

// Process
outsamples = sola(outbuffer, inbuffer, insamples);

// Write result to output file
WavOutFile outfile(pstr[2], infile.getSampleRate(), infile.getNumBits(), infile.getNumChannels());
outfile.write(outbuffer, outsamples);
}
catch (exception &e)
{
printf("Error: %s\n", e.what());
}

return 0;
}

SOLA.m (MATLAB)

% SOLA time scaling followed by linear-interpolation resampling.
% The original comments in this script were garbled by an encoding error; the
% comments below describe what the code does.
sa=585;ss=438; % sa: step between successive segment starts in the input; ss: w-ss samples overlap (presumably analysis / synthesis hop - confirm)
w=512;
wov=w-ss;kmax=500;
x=wavread('v017');
%*********** time scaling **************
xst=1;yout=[];
xbuff=x(sa:sa+w-1);
st=sa:sa:length(x); % segment start positions: sa, 2*sa, ... up to length(x)
r=mod(length(x),sa);
num=(length(x)-r)/sa; % number of whole steps of sa samples, i.e. outer loop iterations
x=[x; zeros(w+kmax,1)];% zero-pad the tail so that a window of w samples shifted by up to kmax stays in range
for j=1:(num)
y=x(xst:1:xst+w-1); % reference window of w samples starting at xst
start=st(j):st(j)+kmax-1; % the kmax candidate start positions of the search window
cy=y(end:-1:end-wov+1); % last wov samples of y, in reversed order ...
cy=cy(end:-1:1);
% ... reversed again above, so cy holds the last wov samples of y in original order
km_buf=zeros(1,kmax); % similarity score for each of the kmax candidate positions
for i=1:kmax
xbuff=x(start(i):start(i)+w-1); % w samples at candidate position i
cx=xbuff(1:wov); % its first wov samples
rxx_k=sum(cx.^2);
rxy_k=sum(cx.*cy);
if ( rxx_k==0) % candidate has zero energy (e.g. it lies in the zero padding): stop searching
% NOTE(review): the next line assigns to 'kmbuf', not 'km_buf' - looks like a
% typo; km_buf(i) is already 0 from zeros(), so the result is unaffected
kmbuf(i)=0;
break;
else
km_buf(i)=(rxy_k.^2)./rxx_k;
end
end
km=find(km_buf==max(km_buf)); % position(s) of the largest score in km_buf
% NOTE(review): the range below holds w-wov+2 = ss+2 samples, not ss - confirm
% the intended upper bound; km can also hold several indices on ties
yout=[yout; x(start(km)+wov:start(km)+w+1)]; % append the samples following the overlap of the best candidate
xst=xst+sa;
end
%************ resampling ************************
L=sa;M=ss;
data=[];
data_out=[];
y_end=length(yout);
y_st=0;
for j=1:L;
for i=1:M;
y_st=y_st+1;
if y_st<y_end
invert=linspace(yout(y_st),yout(y_st+1),L+1);% L+1 linearly spaced values from this sample to the next
elseif y_st==y_end
invert=linspace(yout(y_st),0,L+1); % last sample: interpolate towards zero instead
else
break;
end
data=[data invert(1:end-1)]; % append L interpolated values (the end point is dropped)
end
data_out=[data_out data(1:M:end)]; % keep every M-th interpolated value
data=[]; % clear the interpolation buffer
end

Pitch Synchronous Overlap Add Method (PSOLA.CPP)

PSOLA.CPP

#include "../common/tdpsola.h"
#include "psola.h"

// Single shared CPSOLA object that every PSOLA_* function in this file forwards
// to; settings changed through the setters below therefore apply to all later
// calls. NOTE(review): no synchronization is visible in this file - confirm
// whether concurrent use is expected.
CPSOLA instance;

// Forwards 'enable' to enableCosineSmooth() of the shared instance.
void PSOLA_EnableCosineSmooth(bool enable)
{
instance.enableCosineSmooth(enable);
}

// Forwards the mapping mode and the two control points, unchanged and in the
// same order, to setSpectralMapping() of the shared instance.
void PSOLA_SetSpectralMapping(bool useBezier, int x1, int y1, int x2, int y2)
{
instance.setSpectralMapping(useBezier, x1, y1, x2, y2);
}

// Returns the result of isCosineSmoothEnabled() on the shared instance.
bool PSOLA_IsCosineSmoothEnabled()
{
return instance.isCosineSmoothEnabled();
}

// Forwards 'method' to setVoicelessExtension() of the shared instance. The
// value is not range-checked here.
void PSOLA_EnableVoicelessExtension(int method)
{
instance.setVoicelessExtension(method);
}

// Returns the result of getVoicelessExtension() on the shared instance.
int PSOLA_GetVoicelessExtension()
{
return instance.getVoicelessExtension();
}

// Forwards to modifyPitchContour() of the shared instance and returns its result.
// Note the different argument order: the member function takes the output
// buffer (trgWave, trgBufferLength) directly after the source tags, before
// trgPeriods, periodNumber, trgDuration and specRatio.
unsigned PSOLA_ModifyPitchContour(
const short * srcWave,
unsigned srcLength,
const unsigned *srcTags,
unsigned tagNumber,
const unsigned *trgPeriods,
unsigned periodNumber,
unsigned trgDuration,
float specRatio,
short * trgWave,
unsigned trgBufferLength,
unsigned sampleRate)
{
return instance.modifyPitchContour(srcWave, srcLength, srcTags, tagNumber, trgWave, trgBufferLength, trgPeriods, periodNumber, trgDuration, specRatio, sampleRate);
}

// Forwards to modify() of the shared instance and returns its result.
// Note the different argument order: the member function takes the output
// buffer (trgWave, trgBufferLength) directly after the source tags, before
// trgPitch, trgDuration and specRatio.
unsigned PSOLA_Modify(
const short * srcWave,
unsigned srcLength,
const unsigned *srcTags,
unsigned tagNumber,
unsigned trgPitch,
unsigned trgDuration,
float specRatio,
short * trgWave,
unsigned trgBufferLength,
unsigned sampleRate)
{
return instance.modify(srcWave, srcLength, srcTags, tagNumber, trgWave, trgBufferLength, trgPitch, trgDuration, specRatio, sampleRate);
}

// Forwards to modifyRatio() of the shared instance and returns its result.
// Note the different argument order: the member function takes the output
// buffer (trgWave, trgBufferLength) directly after the source tags, before
// pitchRatio, durationRatio and specRatio.
unsigned PSOLA_ModifyRatio(
const short * srcWave,
unsigned srcLength,
const unsigned * srcTags,
unsigned tagNumber,
float pitchRatio,
float durationRatio,
float specRatio,
short * trgWave,
unsigned trgBufferLength,
unsigned sampleRate
)
{
return instance.modifyRatio(srcWave, srcLength, srcTags, tagNumber, trgWave, trgBufferLength, pitchRatio, durationRatio, specRatio, sampleRate);
}

PSOLA.H

///
/// Modify wave using TP-PSOLA algorithm
///
/// @version 1.0.0
/// @author Jun Xu
/// @date 2007/07/18
///
#ifndef _CST_PSOLA_PSOLA_H_
#define _CST_PSOLA_PSOLA_H_

// PSOLA_DLL_EXPORTS expands to dllimport for code that uses the library and to
// dllexport when PSOLA_EXPORTS is defined. In the import case the matching
// import library is also requested through #pragma comment: psolad.lib when
// _DEBUG is defined, psola.lib otherwise.
#ifndef PSOLA_EXPORTS
# define PSOLA_DLL_EXPORTS __declspec(dllimport)
# ifdef _DEBUG
# pragma comment(lib, "psolad.lib")
# pragma message("Linking with psolad.dll")
# else
# pragma comment(lib, "psola.lib")
# pragma message("Linking with psola.dll")
# endif
#else
# define PSOLA_DLL_EXPORTS __declspec(dllexport)
#endif

// NOTE(review): the macro predefined by C++ compilers is __cplusplus (two leading
// underscores). As written, _cplusplus is normally undefined, so this
// extern "C" block is never opened and the functions keep C++ linkage. The
// closing guard at the end of this header uses the same spelling; both must be
// changed together, and doing so changes the exported symbol names.
#ifdef _cplusplus
extern "C" {
#endif

#define PSOLA_VLPPMETHOD_NONE 0 ///< no pitch-period extension for unvoiced segments
#define PSOLA_VLPPMETHOD_FIXED 1 ///< extend unvoiced segments using a fixed pitch period
#define PSOLA_VLPPMETHOD_EQUAL 2 ///< extend unvoiced segments using equal periods, each equal to the first period
#define PSOLA_VLPPMETHOD_PEAK 3 ///< extend unvoiced segments according to the peak points of the speech
#define PSOLA_VLPPMETHOD_AUTO 4 ///< extend unvoiced segments automatically (decided by the duration modification ratio)
#define PSOLA_VLPPMETHOD_MAX 4

///
/// Set the spectral mapping method used during modification.
/// The results are poor; use with caution.
///
/// @param useBezier true: use a Bezier curve, false: use a polyline
/// @param x1,y1 coordinates of the first control point
/// @param x2,y2 coordinates of the second control point
///
PSOLA_DLL_EXPORTS void PSOLA_SetSpectralMapping(bool useBezier, int x1, int y1, int x2, int y2);

///
/// Set the period-extension method for unvoiced segments
///
/// @param method 0-3, see the PSOLA_VLPPMETHOD_* macro definitions above
/// NOTE(review): those macros also define the value 4
/// (PSOLA_VLPPMETHOD_AUTO) - confirm the accepted range
///
PSOLA_DLL_EXPORTS void PSOLA_EnableVoicelessExtension(int method);

/// Get the period-extension method for unvoiced segments
PSOLA_DLL_EXPORTS int PSOLA_GetVoicelessExtension();

///
/// Enable smoothing of concatenation boundaries with a cosine function
///
/// @param enable true: enabled, false: disabled
///
PSOLA_DLL_EXPORTS void PSOLA_EnableCosineSmooth(bool enable);

/// Check whether cosine boundary smoothing is enabled
PSOLA_DLL_EXPORTS bool PSOLA_IsCosineSmoothEnabled();

///
/// Modify wave using PSOLA model, specifying the target average pitch period
/// and the target speech length
///
/// @param srcWave[in] wave buffer read from speech database;
/// must use 16-bit sample precision
/// @param srcLength[in] wave buffer length, in short count
/// (number of samples in the speech data)
/// @param srcTags[in] peak tags read from speech database;
/// each tag indicates the peak position as an offset to the first sample of wave
/// @param tagNumber[in] peak tag count of srcTags
/// @param trgPitch[in] predicted average pitch period;
/// trgPitch=0 means keeping the pitch unchanged
/// @param trgDuration[in] predicted wave duration, in short count
/// (number of target samples); 0 means no modification
/// @param specRatio [in] modification ratio of spectra; 0 means no modification
/// @param trgWave[out] modified wave; the buffer is allocated by the caller
/// and must be somewhat larger than trgDuration
/// @param trgBufferLength[in] presumably the capacity of trgWave in samples - TODO confirm
/// @param sampleRate[in] sample count per second; 16000 is normally used
///
/// @return NOTE(review): the original text promised true/false, which does not
/// match the unsigned return type; presumably the value reports the
/// number of samples written to trgWave (0 on failure, in which case
/// the content of trgWave is undefined) - confirm against CPSOLA
///
PSOLA_DLL_EXPORTS unsigned PSOLA_Modify(
const short * srcWave,
unsigned srcLength,
const unsigned *srcTags,
unsigned tagNumber,
unsigned trgPitch,
unsigned trgDuration,
float specRatio,
short * trgWave,
unsigned trgBufferLength,
unsigned sampleRate);

///
/// Modify wave using PSOLA model, specifying the target pitch contour
///
/// @param srcWave[in] wave buffer read from speech database;
/// must use 16-bit sample precision
/// @param srcLength[in] wave buffer length, in short count
/// (number of samples in the speech data)
/// @param srcTags[in] peak tags read from speech database;
/// each tag indicates the peak position as an offset to the first sample of wave
/// @param tagNumber[in] peak tag count of srcTags
/// @param trgPeriods[in] predicted pitch periods (array of target pitch periods)
/// @param periodNumber[in] pitch period count of target
/// @param trgDuration[in] predicted wave duration, in short count
/// (number of target samples); 0 means no modification
/// @param specRatio [in] modification ratio of spectra; 0 means no modification
/// @param trgWave[out] modified wave; the buffer is allocated by the caller
/// and must be somewhat larger than trgDuration
/// @param trgBufferLength[in] presumably the capacity of trgWave in samples - TODO confirm
/// @param sampleRate[in] sample count per second; 16000 is normally used
///
/// @return NOTE(review): the original text promised true/false, which does not
/// match the unsigned return type; presumably the value reports the
/// number of samples written to trgWave (0 on failure, in which case
/// the content of trgWave is undefined) - confirm against CPSOLA
///
PSOLA_DLL_EXPORTS unsigned PSOLA_ModifyPitchContour(
const short * srcWave,
unsigned srcLength,
const unsigned *srcTags,
unsigned tagNumber,
const unsigned *trgPeriods,
unsigned periodNumber,
unsigned trgDuration,
float specRatio,
short * trgWave,
unsigned trgBufferLength,
unsigned sampleRate);

///
/// Modify wave using PSOLA model, specifying modification ratios for the
/// pitch period and the duration
///
/// @param srcWave[in] wave buffer read from speech database;
/// must use 16-bit sample precision
/// @param srcLength[in] wave buffer length, in short count
/// (number of samples in the speech data)
/// @param srcTags[in] peak tags read from speech database;
/// each tag indicates the peak position as an offset to the first sample of wave
/// @param tagNumber[in] peak tag count of srcTags
/// @param pitchRatio[in] modification ratio of the pitch period; 0 means no modification
/// @param durationRatio[in] modification ratio of the duration; 0 means no modification
/// @param specRatio [in] modification ratio of spectra; 0 means no modification
/// @param trgWave[out] modified wave; the buffer is allocated by the caller
/// and must be somewhat larger than the target duration
/// @param trgBufferLength[in] presumably the capacity of trgWave in samples - TODO confirm
/// @param sampleRate[in] sample count per second; 16000 is normally used
///
/// @return NOTE(review): the original text promised true/false, which does not
/// match the unsigned return type; presumably the value reports the
/// number of samples written to trgWave (0 on failure, in which case
/// the content of trgWave is undefined) - confirm against CPSOLA
///
PSOLA_DLL_EXPORTS unsigned PSOLA_ModifyRatio(
const short * srcWave,
unsigned srcLength,
const unsigned *srcTags,
unsigned tagNumber,
float pitchRatio,
float durationRatio,
float specRatio,
short * trgWave,
unsigned trgBufferLength,
unsigned sampleRate);

// NOTE(review): should test __cplusplus (two leading underscores); _cplusplus is
// normally undefined, so this closing brace is never emitted. It pairs with the
// extern "C" guard near the top of this header and must be changed together
// with it.
#ifdef _cplusplus
}
#endif

#endif

Example PSOLA Method

PSOLA.h Header

#ifndef PSOLA_H_
#define PSOLA_H_
#include <vector>
#include "DSP.h"

// NOTE(review): a using-directive in a header leaks namespace std into every
// file that includes it; the declarations below rely on it for 'vector'.
using namespace std;

// Declaration of a PSOLA (pitch-synchronous overlap-add) processor working on
// 16-bit sample buffers. Only the interface is visible here; the notes below
// are derived from the member names and signatures and should be confirmed
// against the implementation.
//
// NOTE(review): the class holds raw pointer members and declares a destructor,
// but no copy constructor or copy assignment is declared - if the destructor
// frees any of these buffers, copying a CPsola would be unsafe. Confirm
// ownership in the implementation.
class CPsola{
public:
CPsola();
// presumably (sample data, number of samples) - TODO confirm
CPsola(short*,unsigned);
void SetData(short*,unsigned);
void SetAmplitudeMultiple(float);
void SetDuration(float);
void SetPitch(float*,unsigned,float);
void SetNewPitch(float*,unsigned);
void SetSampleFrequency(unsigned);
void SetFrameLength(float);
void SetX1(float);
void Adjust();

void TD_PSOLA(float,float);
void PSOLA(float,float,bool);

// accessors for the processed result - presumably valid after one of the
// processing calls above; TODO confirm
unsigned GetNewLen();
short* GetNewData();
~CPsola();
private:
// internal helpers; parameter meanings are not visible from this header
unsigned FindMax(unsigned,unsigned,short*);
int Approximate(float);
short Middle(unsigned,short*);
bool MarkPitch();
void MarkOneFrame(unsigned,unsigned);
void AdjustAmplitude();
void AdjustDuration();
void AdjustPitch();
void Smooth(short*,unsigned);
private:
// state; the names suggest they mirror the Set* functions above (sample
// frequency, frame length, amplitude multiple, duration, pitch arrays, data)
unsigned m_uSamFre;
float m_dFrameLen;
float m_dX1;
float m_dAmpMul;
float m_dDuration;
unsigned m_uPitchLen;
float* m_dPitch;
float* m_dNewPitch;
unsigned m_uDataLen;
// the lone ';' below is an empty declaration
;
// unsigned m_uNewPitchLen;
short* m_Data;
bool* flag;
short* m_InData;
CDSP m_filter;

void GetPitchMarks(vector<unsigned>&);
bool IsVowel(unsigned);
int GetAvgPitchLen(vector<unsigned>&,int&);
void GetFinal(vector<unsigned>&,vector<unsigned>&,
int,vector<int>&,vector<vector<unsigned> >&);
void GetUseds(int,int,int,vector<int>&);
// note: differs from Smooth() above only by case and by the extra vector parameter
void smooth(short*,unsigned,vector<float>&);
void OverlapAdd(vector<vector<unsigned> >& final, short* y, unsigned ylen,
vector<float>& w, float* pBeta = NULL);
public:
// additional overload of PSOLA() taking an array argument
void PSOLA(float,float*,int,float);

};

#endif