FS#11913: Separate TTS correction expressions into separate file.

voice.pl now reads the TTS correction expressions from the file
tools/voice-corrections.txt, which contains the regular expressions used to
adjust the strings. This makes it easier to adjust the corrections and
allows integrating them into tools like Rockbox Utility.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@29500 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Dominik Riebeling 2011-03-02 18:29:38 +00:00
parent 1f77d091a5
commit 7ad78222c4
2 changed files with 129 additions and 99 deletions

View file

@ -0,0 +1,92 @@
             __________               __   ___.
   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
                     \/            \/     \/    \/            \/
$Id$
Voice string corrections for voice.pl to support TTS engines.
The fields of each entry are separated by a separator character, which is
defined by the first character on the line. If the first character is
whitespace, the line is treated as a comment.
Format:
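/language/engine/vendor/string/replacement/modifiers
Where / is the separator. The language, engine, vendor and string fields are
Perl regexes, the replacement field is the substitution text (captures such
as $1 may be used), and the optional modifiers field holds the flags for the
Perl substitution operator (e.g. ig).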
Empty lines and lines starting with whitespace are ignored; for all other
lines the first character becomes the separator.
General for all engines and languages
/.*/.*/.*/USB/U S B/g
/.*/.*/.*/ID3/I D 3/g
English
/english/(sapi|festival)/.*/plugin(s?)/plug-in$1/ig
/english/festival/.*/\ba\b/ay/ig
/english/festival/.*/$/./
German (deutsch)
/deutsch/.*/.*/alkaline/alkalein/ig
/deutsch/.*/.*/byte(s?)/beit$1/ig
/deutsch/.*/.*/clip(s?)/klipp$1/ig
/deutsch/.*/.*/\bcover/kawwer/ig
/deutsch/.*/.*/cuesheet/kjuschiet/ig
/deutsch/.*/.*/dither/didder/ig
/deutsch/.*/.*/equalizer/iquileiser/ig
/deutsch/.*/.*/\bflash\b/fläsh/ig
/deutsch/.*/.*/\bfirmware(s?)\b/firmwer$1/ig
/deutsch/.*/.*/\bI D 3 tag\b/I D 3 täg/ig
/deutsch/.*/.*/\bloudness\b/laudness/ig
/deutsch/.*/.*/\bunicode\b/unikod/ig
/deutsch/sapi/AT&T Labs/alphabet/alfabet/ig
/deutsch/sapi/AT&T Labs/ampere/amper/ig
/deutsch/sapi/AT&T Labs/\bdezibel\b/de-zibell/ig
/deutsch/sapi/AT&T Labs/diddering/didde-ring/ig
/deutsch/sapi/AT&T Labs/energie\b/ener-gie/ig
/deutsch/sapi/AT&T Labs/\Blauf\b/-lauf/ig
/deutsch/sapi/AT&T Labs/\bnumerisch\b/numehrisch/ig
Swedish (svenska)
for all swedish engines (e.g. for english words)
/svenska/.*/.*/kilobyte/kilobajt/ig
/svenska/.*/.*/megabyte/megabajt/ig
/svenska/.*/.*/gigabyte/gigabajt/ig
/svenska/.*/.*/\bloudness\b/laudness/ig
/svenska/espeak/.*/ampere/ampär/ig
/svenska/espeak/.*/bokmärken/bok-märken/ig
/svenska/espeak/.*/generella/schenerella/ig
/svenska/espeak/.*/dithering/diddering/ig
/svenska/espeak/.*/\bunicode\b/jynikod/ig
/svenska/espeak/.*/uttoning/utoning/ig
/svenska/espeak/.*/procent/pro-cent/ig
/svenska/espeak/.*/spellistor/spelistor/ig
/svenska/espeak/.*/cuesheet/qjyschiit/ig
Italian (italiano)
for all italian engines (e.g. for english words)
/italiano/.*/.*/Replaygain/Ripleyghein/ig
/italiano/.*/.*/Crossfade/Crossfeid/ig
/italiano/.*/.*/beep/Bip/ig
/italiano/.*/.*/cuesheet/chiushit/ig
/italiano/.*/.*/fade/feid/ig
/italiano/.*/.*/Crossfeed/crossfid/ig
/italiano/.*/.*/Cache/chash/ig
/italiano/.*/.*/\bfirmware(s?)\b/firmuer$1/ig
/italiano/.*/.*/\bFile(s?)\b/fail$1/ig
/italiano/.*/.*/\bloudness\b/laudness/ig
/italiano/.*/.*/\bunicode\b/unikod/ig
/italiano/.*/.*/Playlist/pleylist/ig
/italiano/.*/.*/WavPack/wave pak/ig
/italiano/.*/.*/BITRATE/bit reit/ig
/italiano/.*/.*/Codepage/cod page/ig
/italiano/.*/.*/PCM Wave/pcm Ue'iv/ig
/italiano/sapi/Loquendo/Inizializza/inizializa/ig
/italiano/sapi/ScanSoft, Inc/V/v/ig
/italiano/sapi/ScanSoft, Inc/X/x/ig
/italiano/sapi/ScanSoft, Inc/stop/stohp/ig

View file

@ -128,106 +128,12 @@ sub correct_string {
our $verbose;
my ($string, $language, $tts_object) = @_;
my $orig = $string;
switch($language) {
# General for all engines and languages
$string =~ s/USB/U S B/g;
$string =~ s/ID3/I D 3/g;
my $corrections = $tts_object->{"corrections"};
case "english" {
switch($$tts_object{"name"}) {
case ["sapi","festival"] {
$string =~ s/plugin(s?)/plug-in$1/ig; next
}
case "festival" {
$string =~ s/\ba\b/ay/ig;
$string =~ s/$/./;
}
}
}
case "deutsch" {
# for all german engines (e.g. for english words)
$string =~ s/alkaline/alkalein/ig;
$string =~ s/byte(s?)/beit$1/ig;
$string =~ s/clip(s?)/klipp$1/ig;
$string =~ s/\bcover/kawwer/ig;
$string =~ s/cuesheet/kjuschiet/ig;
$string =~ s/dither/didder/ig;
$string =~ s/equalizer/iquileiser/ig;
$string =~ s/\bflash\b/fläsh/ig;
$string =~ s/\bfirmware(s?)\b/firmwer$1/ig;
$string =~ s/\bI D 3 tag\b/I D 3 täg/ig; # can't just use "tag" here
$string =~ s/\bloudness\b/laudness/ig;
$string =~ s/\bunicode\b/unikod/ig;
switch($$tts_object{"name"}) {
case "sapi" { # just for SAPI
switch($$tts_object{"vendor"}) {
case "AT&T Labs" {
$string =~ s/alphabet/alfabet/ig;
$string =~ s/ampere/amper/ig;
$string =~ s/\bdezibel\b/de-zibell/ig;
$string =~ s/diddering/didde-ring/ig;
$string =~ s/energie\b/ener-gie/ig;
$string =~ s/\Blauf\b/-lauf/ig;
$string =~ s/\bnumerisch\b/numehrisch/ig;
}
}
}
}
}
case "svenska" {
# for all swedish engines (e.g. for english words)
$string =~ s/kilobyte/kilobajt/ig;
$string =~ s/megabyte/megabajt/ig;
$string =~ s/gigabyte/gigabajt/ig;
$string =~ s/\bloudness\b/laudness/ig;
switch($$tts_object{"name"}) {
case "espeak" { # just for eSpeak
$string =~ s/ampere/ampär/ig;
$string =~ s/bokmärken/bok-märken/ig;
$string =~ s/generella/schenerella/ig;
$string =~ s/dithering/diddering/ig;
$string =~ s/\bunicode\b/jynikod/ig;
$string =~ s/uttoning/utoning/ig;
$string =~ s/procent/pro-cent/ig;
$string =~ s/spellistor/spelistor/ig;
$string =~ s/cuesheet/qjyschiit/ig;
}
}
}
case "italiano" {
# for all italian engines (e.g. for english words)
$string =~ s/Replaygain/Ripleyghein/ig;
$string =~ s/Crossfade/Crossfeid/ig;
$string =~ s/beep/Bip/ig;
$string =~ s/cuesheet/chiushit/ig;
$string =~ s/fade/feid/ig;
$string =~ s/Crossfeed/crossfid/ig;
$string =~ s/Cache/chash/ig;
$string =~ s/\bfirmware(s?)\b/firmuer$1/ig;
$string =~ s/\bFile(s?)\b/fail$1/ig;
$string =~ s/\bloudness\b/laudness/ig;
$string =~ s/\bunicode\b/unikod/ig;
$string =~ s/Playlist/pleylist/ig;
$string =~ s/WavPack/wave pak/ig;
$string =~ s/BITRATE/bit reit/ig;
$string =~ s/Codepage/cod page/ig;
$string =~ s/PCM Wave/pcm Ue'iv/ig;
switch($$tts_object{"name"}) {
case "sapi" { # just for SAPI
switch($$tts_object{"vendor"}) {
case "Loquendo" {
$string =~ s/Inizializza/inizializa/ig;
}
case "ScanSoft, Inc" {
$string =~ s/V/v/ig;
$string =~ s/X/x/ig;
$string =~ s/stop/stohp/ig;
}
}
}
}
}
foreach (@$corrections) {
my $r = "s" . $_->{separator} . $_->{search} . $_->{separator}
. $_->{replace} . $_->{separator} . $_->{modifier};
eval ('$string =~' . "$r;");
}
if ($orig ne $string) {
printf("%s -> %s\n", $orig, $string) if $verbose;
@ -331,6 +237,7 @@ sub generateclips {
my ($language, $target, $encoder, $encoder_opts, $tts_engine, $tts_engine_opts) = @_;
my $english = dirname($0) . '/../apps/lang/english.lang';
my $langfile = dirname($0) . '/../apps/lang/' . $language . '.lang';
my $correctionsfile = dirname($0) . '/voice-corrections.txt';
my $id = '';
my $voice = '';
my $cmd = "genlang -o -t=$target -e=$english $langfile 2>/dev/null";
@ -340,6 +247,37 @@ sub generateclips {
local $| = 1; # make progress indicator work reliably
my $tts_object = init_tts($tts_engine, $tts_engine_opts, $language);
# add string corrections to tts_object.
my @corrects = ();
open(VOICEREGEXP, "<$correctionsfile") or die "Can't open corrections file!\n";
while(<VOICEREGEXP>) {
# get first character of line
my $line = $_;
my $separator = substr($_, 0, 1);
if($separator =~ m/\s+/) {
next;
}
chomp($line);
$line =~ s/^.//g; # remove separator at beginning
my ($lang, $engine, $vendor, $search, $replace, $modifier) = split(/$separator/, $line);
# does language match?
if($language !~ m/$lang/) {
next;
}
if($$tts_object{"name"} !~ m/$engine/) {
next;
}
my $v = $$tts_object{"vendor"} || ""; # vendor might be empty in $tts_object
if($v !~ m/$vendor/) {
next;
}
push @corrects, {separator => $separator, search => $search, replace => $replace, modifier => $modifier};
}
close(VOICEREGEXP);
$tts_object->{corrections} = [@corrects];
print("Generating voice clips");
print("\n") if $verbose;
for (`$cmd`) {