rockbox/tools/genlang
Dominik Riebeling d6ef5a0d80 Create binary voice string archive on target.
Extend genlang to create files holding the voice strings in a binary format
similar to lng. Create such files for all languages during a normal build, put
them in a zip file and add them to rockbox.zip so they are installed with a
main build.

Creating a voice file requires access to the voice strings, which are currently
not distributed with Rockbox. Therefore it is necessary to either have access
to the source code of the installed build or to use a build the genlang service
on rockbox.org can retrieve the strings for, making it impossible for normal
users to generate a voice file for custom builds, test builds and older builds
that aren't supported by the rockbox.org script. It also requires a network
connection when creating voice files with Rockbox Utility even if creating a
voice file by itself isn't network related.

Furthermore, putting the voice strings on the target is also a prerequisite for
eventually supporting a TTS on the player.

Strip voice strings path.

Change-Id: Ic2f86130909b3b257986a348e5cfcf618c4500c3
2012-06-09 22:44:56 +02:00

888 lines
25 KiB
Perl
Executable file

#!/usr/bin/perl -s
# __________ __ ___.
# Open \______ \ ____ ____ | | _\_ |__ _______ ___
# Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
# Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
# Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
# \/ \/ \/ \/ \/
# $Id$
#
# Copyright (C) 2006 - 2008 by Daniel Stenberg
#
# See apps/language.c (TODO: Use common include for both)
# Cookie and binary version for the binary lang file
my $LANGUAGE_COOKIE = 0x1a;
my $VOICE_COOKIE = 0x9a;
my $LANGUAGE_VERSION = 0x06;
my $LANGUAGE_FLAG_RTL = 0x01;
my $HEADER_SIZE = 4;
my $SUBHEADER_SIZE = 6;
# A note for future users and readers: The original v1 language system allowed
# the build to create and use a different language than english built-in. We
# removed that feature from our build-system, but the build scripts still had
# the ability. But, starting now, this ability is no longer provided since I
# figured it was boring and unnecessary to write support for now since we
# don't use it anymore.
if(!$ARGV[0]) {
print <<MOO
Usage: genlang [options] <langv2 file>
-p=<prefix>
Make the tool create a [prefix].c and [prefix].h file.
-b=<outfile>
Make the tool create a binary language (.lng) file named [outfile].
The use of this option requires that you also use -e, -t and -i.
-c=<outfile>
Create binary voicestring file named [outfile]. Works like -b and can be
used the same time.
-u
Update language file. Given the translated file and the most recent english
file, you\'ll get an updated version sent to stdout. Suitable action to do
when you intend to update a translation.
-e=<english lang file>
Point out the english (original source) file, to use that as master
language template. Used in combination with -b, -u or -s.
-s
Sort the Update language file in the same order as the strings in the
English file.
-t=<target>
Specify which target you want the translations/phrases for. Required when
-b or -p is used.
The target can in fact be specified as numerous different strings,
separated with colons. This will make genlang to use all the specified
strings when searching for a matching phrase.
-i=<target id>
The target id number, needed for -b.
-o
Voice mode output. Outputs all id: and voice: lines for the given target!
-v
Enables verbose (debug) output.
MOO
;
exit;
}
# How update works:
#
# 1) scan the english file, keep the whole <phrase> for each phrase.
# 2) read the translated file, for each end of phrase, compare:
# A) all source strings, if there's any change there should be a comment about
# it output
# B) the desc fields
#
# 3) output the phrase with the comments from above
# 4) check which phrases that the translated version didn't have, and spit out
# the english version of those
#
my $prefix = $p;
my $binary = $b;
my $update = $u;
my $sortfile = $s;
my $binvoice = $c;
my $english = $e;
my $voiceout = $o;
my $check = ($binary?.5:0) + ($prefix?1:0) + ($update?1:0) + ($voiceout?1:0) + ($sortfile?1:0) + ($binvoice?.5:0);
if($check > 1) {
print STDERR "Please use only one of -p, -u, -o, -b, -c and -s\n";
exit;
}
if(!$check) {
print STDERR "Please use at least one of -p, -u, -o, -c, -e and -s\n";
exit;
}
if(($binary || $update || $voiceout || $sortfile) && !$english) {
print STDERR "Please use -e too when you use -b, -o, -u or -s\n";
exit;
}
my $target_id = $i;
if($binary && !$target_id) {
print STDERR "Please specify a target id number (with -i)!\n";
exit;
}
my $target = $t;
if(!$target && !$update && !$sortfile) {
print STDERR "Please specify a target (with -t)!\n";
exit;
}
# Build up a regex which can be applied to target wildcard lists. We only need
# to support prefix matches, so a target parameter of foo:bar can be expanded
# to the regex "\*|f\*|fo\*|foo|b\*|ba\*|bar" and applied to the wildcard list
# (plus end-of-string or commas on either side). The regex engine should
# discard any duplicates generated for us in the process of constructing the
# state machine, so we don't bother to check.
my $target_regex = "(?:^|,) *(?:\\*";
foreach my $target_part (split ':', $target) {
for (my $c=1; $c<=length $target_part; $c++) {
my $partial = substr $target_part, 0, $c;
$target_regex .= "|$partial\\*";
}
$target_regex .= "|$target_part";
}
$target_regex .= ") *(?:,|\$)";
$target_regex = qr/$target_regex/;
my $binpath = "";
if ($binary =~ m|(.*)/[^/]+|) {
$binpath = $1;
}
my $verbose=$v;
my %id; # string to num hash
my @idnum; # num to string array
my %allphrases; # For sorting - an array of the <phrase> elements
my %source; # id string to source phrase hash
my %dest; # id string to dest phrase hash
my %voice; # id string to voice phrase hash
my %users =
('core' => 0);
my $input = $ARGV[0];
my @m;
my $m="blank";
sub trim {
my ($string) = @_;
$string =~ s/^\s+//;
$string =~ s/\s+$//;
return $string;
}
sub blank {
# nothing to do
}
my %head;
sub header {
my ($full, $n, $v)=@_;
$head{$n}=$v;
}
my %phrase;
sub phrase {
my ($full, $n, $v)=@_;
$phrase{$n}=$v;
}
my %options;
sub options {
my ($full, $n, $v)=@_;
$options{$n}=$v;
}
sub parsetarget {
my ($debug, $strref, $full, $n, $v)=@_;
my $string;
if ($n =~ $target_regex) {
$string = $v;
$$strref = $string;
return $string;
}
}
my $src;
sub source {
parsetarget("src", \$src, @_);
}
my $dest;
sub dest {
parsetarget("dest", \$dest, @_);
}
my $voice;
sub voice {
parsetarget("voice", \$voice, @_);
}
sub file_is_newer {
my ($file1, $file2) = @_;
my @s1 = stat $file1;
my @s2 = stat $file2;
return 1 if ($s1[9] > $s2[9]);
return 0;
}
my %idmap;
my %english;
if($english) {
readenglish();
}
sub readenglish {
# For the cases where the english file needs to be scanned/read, we do
# it before we read the translated file. For -b it isn't necessary, but for
# -u it is convenient.
my @idnum = ((0)); # start with a true number
my @vidnum = ((0x8000)); # first voice id
if ($binary and file_is_newer("$binpath/english.list", $english)) {
open(ENG, "<$binpath/english.list") ||
die "Error: can't open $binpath/english.list";
while (<ENG>) {
my ($user, $id, $value) = split ':', $_;
$idmap[$user]{$id} = $value;
$english{$id} = 1;
}
close ENG;
return;
}
open(ENG, "<$english") || die "Error: can't open $english";
my @phrase;
my $id;
my $maybeid;
my $user;
my $withindest;
my $numphrases = 0;
my $numusers = 1; # core is already in the users map
while(<ENG>) {
# get rid of DOS newlines
$_ =~ tr/\r//d;
if($_ =~ /^ *\<phrase\>/) {
# this is the start of a phrase
}
elsif($_ =~ /\<\/phrase\>/) {
# if id is something, when we count and store this phrase
if($id) {
# voice-only entries get a difference range
if($id =~ /^VOICE_/) {
# Assign an ID number to this entry
$idmap[$user]{$id}=$vidnum[$user];
$vidnum[$user]++;
}
else {
# Assign an ID number to this entry
$idmap[$user]{$id}=$idnum[$user];
$idnum[$user]++;
# print STDERR "DEST: bumped idnum to $idnum[$user]\n";
}
# this is the end of a phrase, add it to the english hash
$english{$id}=join("", @phrase);
}
undef @phrase;
$id="";
}
elsif($_ ne "\n") {
# gather everything related to this phrase
push @phrase, $_;
if($_ =~ /^ *\<dest\>/i) {
$withindest=1;
$deststr="";
}
elsif($withindest && ($_ =~ /^ *\<\/dest\>/i)) {
$withindest=0;
if($update || ($deststr && ($deststr !~ /^none\z/i))) {
# we unconditionally always use all IDs when the "update"
# feature is used
$id = $maybeid;
# print "DEST: use this id $id\n";
}
else {
# print "skip $maybeid for $name\n";
}
}
elsif($withindest && ($_ =~ / *([^:]+): *(.*)/)) {
my ($name, $val)=($1, $2);
$dest=""; # in case it is left untouched for when the
# model name isn't "our"
dest($_, $name, $val);
if($dest) {
# Store the current dest string. If this target matches
# multiple strings, it will get updated several times.
$deststr = $dest;
}
}
}
if($_ =~ /^ *id: ([^ \t\n]+)/i) {
$maybeid=$1;
$sortorder{$maybeid}=$numphrases++;
}
if($_ =~ /^ *user: ([^ \t\n]+)/i) {
$user = $users{$1};
if(!(defined $user)) {
$user = ++$numusers;
$users{$1} = $user;
}
}
}
close(ENG);
}
# a function that compares the english phrase with the translated one.
# compare source strings and desc
# Then output the updated version!
sub compare {
my ($idstr, $engref, $locref)=@_;
my ($edesc, $ldesc);
my ($esource, $lsource);
my $mode=0;
for my $l (@$engref) {
if($l =~ /^ *#/) {
# comment
next;
}
if($l =~ /^ *desc: (.*)/) {
$edesc=$1;
}
elsif($l =~ / *\<source\>/i) {
$mode=1;
}
elsif($mode) {
if($l =~ / *\<\/source\>/i) {
last;
}
$esource .= "$l\n";
}
}
my @show;
my @source;
$mode = 0;
for my $l (@$locref) {
if($l =~ /^ *desc: (.*)/) {
$ldesc=$1;
if(trim($edesc) ne trim($ldesc)) {
$l = "### The 'desc' field differs from the english!\n### the previously used desc is commented below:\n### desc: $ldesc\n desc: $edesc\n";
}
push @show, $l;
}
elsif($l =~ / *\<source\>/i) {
$mode=1;
push @show, $l;
}
elsif($mode) {
if($l =~ / *\<\/source\>/i) {
$mode = 0;
print @show;
if(trim($esource) ne trim($lsource)) {
print "### The <source> section differs from the english!\n",
"### the previously used one is commented below:\n";
for(split("\n", $lsource)) {
print "### $_\n";
}
print $esource;
}
else {
print $lsource;
}
undef @show; # start over
push @show, $l;
}
else {
$lsource .= "$l";
}
}
else {
push @show, $l;
}
}
print @show;
}
my @idcount; # counter for lang ID numbers
my @voiceid; # counter for voice-only ID numbers
for (keys %users) {
push @idcount, 0;
push @voiceid, 0x8001;
}
#
# Now start the scanning of the selected language string
#
open(LANG, "<$input") || die "Error: couldn't read language file named $input\n";
my @phrase;
my $header = 1;
my $langoptions = 0;
while(<LANG>) {
$line++;
# get rid of DOS newlines
$_ =~ tr/\r//d;
if($_ =~ /^( *\#|[ \t\n\r]*\z)/) {
# comment or empty line - output it if it's part of the header
if ($header and ($update || $sortfile)) {
print($_);
}
next;
}
$header = 0;
my $ll = $_;
# print "M: $m\n";
push @phrase, $ll;
# this is an XML-lookalike tag
if (/^(<|[^\"<]+<)([^>]*)>/) {
my $part = $2;
# print "P: $part\n";
if($part =~ /^\//) {
# this was a closing tag
if($part eq "/phrase") {
# closing the phrase
my $idstr = $phrase{'id'};
my $idnum;
if($binary && !$english{$idstr}) {
# $idstr doesn't exist for english, skip it\n";
}
elsif($dest =~ /^none\z/i) {
# "none" as dest (without quotes) means that this entire
# phrase is to be ignored
}
elsif($sortfile) {
$allphrases{$idstr}=join('',@phrase);
}
elsif(!$update) {
# we don't do the fully detailed analysis when we "update"
# since we don't do it for a particular target etc
# allow the keyword 'deprecated' to be used on dest and
# voice strings to mark that as deprecated. It will then
# be replaced with "".
$dest =~ s/^deprecate(|d)\z/\"\"/i;
$voice =~ s/^deprecate(|d)\z/\"\"/i;
# basic syntax error alerts, if there are no quotes we
# will assume an empty string was intended
if($dest !~ /^\"/) {
print STDERR "$input:$line:1: warning: dest before line lacks quotes ($dest)!\n";
$dest='""';
}
if($src !~ /^\"/) {
print STDERR "$input:$line:1: warning: source before line lacks quotes ($src)!\n";
$src='""';
}
if($voice !~ /^\"/ and $voice !~ /^none\z/i) {
print STDERR "$input:$line:1: warning: voice before line lacks quotes ($voice)!\n";
$voice='""';
}
if($dest eq '""' && $phrase{'desc'} !~ /deprecated/i && $idstr !~ /^VOICE/) {
print STDERR "$input:$line:1: warning: empty dest before line in non-deprecated phrase!\n";
}
my $userstr = trim($phrase{'user'});
my $user = $users{$userstr};
if ($userstr eq "") {
print STDERR "$input:$line:1: warning: missing user!\n";
$user = $users{"core"};
}
elsif(!(defined $user)) {
if($english) {
print STDERR "$input:$line:1: warning: user was not found in $english!\n";
$user = keys %users; # set to an invalid user so it won't be added
}
else {
# we found a new user, add it to the usermap
$user = ++$numusers;
$users{$userstr} = $user;
}
}
# Use the ID name to figure out which id number range we
# should use for this phrase. Voice-only strings are
# separated.
if($idstr =~ /^VOICE/) {
$idnum = $voiceid[$user]++;
}
else {
$idnum = $idcount[$user]++;
}
$id{$idstr} = $idnum;
$idnum[$user][$idnum]=$idstr;
$source{$idstr}=$src;
$dest{$idstr}=$dest;
$voice{$idstr}=$voice;
if($verbose) {
print "id: $phrase{id} ($idnum)\n";
print "source: $src\n";
print "dest: $dest\n";
print "voice: $voice\n";
print "user: $user\n";
}
undef $src;
undef $dest;
undef $voice;
undef $user;
undef %phrase;
}
if($update) {
my $e = $english{$idstr};
if($e) {
# compare original english with this!
my @eng = split("\n", $english{$idstr});
compare($idstr, \@eng, \@phrase);
$english{$idstr}=""; # clear it
}
else {
print "### $idstr: The phrase is not used. Skipped\n";
}
}
undef @phrase;
} # end of </phrase>
elsif($part eq "/options") {
# closing the options
if ($options{'rtl'}) {
$langoptions |= $LANGUAGE_FLAG_RTL;
}
} # end of </options>
# starts with a slash, this _ends_ this section
$m = pop @m; # get back old value, the previous level's tag
next;
} # end of tag close
# This is an opening (sub) tag
push @m, $m; # store old value
$m = $part;
next;
}
if(/^ *([^:]+): *(.*)/) {
my ($name, $val)=($1, $2);
&$m($_, $name, $val);
}
}
close(LANG);
if($update) {
my $any=0;
for(keys %english) {
if($english{$_}) {
print "###\n",
"### This phrase below was not present in the translated file\n",
"<phrase>\n";
print $english{$_};
print "</phrase>\n";
}
}
}
if ($sortfile) {
for(sort { $sortorder{$a} <=> $sortorder{$b} } keys %allphrases) {
print $allphrases{$_};
}
}
if($prefix) {
# We create a .c and .h file
open(HFILE_CORE, ">$prefix/lang.h") ||
die "Error: couldn't create file $prefix/lang.h\n";
open(CFILE_CORE, ">$prefix/lang_core.c") ||
die "Error: couldn't create file $prefix/lang_core.c\n";
# get header file name
$headername = "$prefix/lang.h";
$headername =~ s/(.*\/)*//;
print HFILE_CORE <<MOO
/* This file was automatically generated using genlang */
/*
* The str() macro/functions is how to access strings that might be
* translated. Use it like str(MACRO) and expect a string to be
* returned!
*/
#define str(x) language_strings[x]
/* this is the array for holding the string pointers.
It will be initialized at runtime. */
extern unsigned char *language_strings[];
/* this contains the concatenation of all strings, separated by \\0 chars */
extern const unsigned char core_language_builtin[];
/* The enum below contains all available strings */
enum \{
MOO
;
print CFILE_CORE <<MOO
/* This file was automatically generated using genlang, the strings come
from "$input" */
#include "$headername"
unsigned char *language_strings[LANG_LAST_INDEX_IN_ARRAY];
const unsigned char core_language_builtin[] =
MOO
;
# Output the ID names for the enum in the header file
my $i;
for $i (0 .. $idcount[$users{"core"}]-1) {
my $name=$idnum[$users{"core"}][$i]; # get the ID name
$name =~ tr/\"//d; # cut off the quotes
printf HFILE_CORE (" %s, /* %d */\n", $name, $i);
}
# Output separation marker for last string ID and the upcoming voice IDs
print HFILE_CORE <<MOO
LANG_LAST_INDEX_IN_ARRAY, /* this is not a string, this is a marker */
/* --- below this follows voice-only strings --- */
VOICEONLY_DELIMITER = 0x8000,
MOO
;
# Output the ID names for the enum in the header file
for $i (0x8001 .. ($voiceid[$users{"core"}]-1)) {
my $name=$idnum[$users{"core"}][$i]; # get the ID name
$name =~ tr/\"//d; # cut off the quotes
printf HFILE_CORE (" %s, /* 0x%x */\n", $name, $i);
}
# Output end of enum
print HFILE_CORE "\n};\n/* end of generated enum list */\n";
# Output the target phrases for the source file
for $i (0 .. $idcount[$users{"core"}]-1) {
my $name=$idnum[$users{"core"}][$i]; # get the ID
my $dest = $dest{$name}; # get the destination phrase
$dest =~ s:\"$:\\0\":; # insert a \0 before the second quote
if(!$dest) {
# this is just to be on the safe side
$dest = '"\0"';
}
printf CFILE_CORE (" %s\n", $dest);
}
# Output end of string chunk
print CFILE_CORE <<MOO
;
/* end of generated string list */
MOO
;
close(HFILE_CORE);
close(CFILE_CORE);
} # end of the c/h file generation
elsif($binary || $binvoice) {
# Creation of a binary lang file was requested
# We must first scan the english file to get the correct order of the id
# numbers used there, as that is what sets the id order for all language
# files. The english file is scanned before the translated file was
# scanned.
if($binary) {
open(OUTF, ">$binary") or die "Error: Can't create $binary";
binmode OUTF;
printf OUTF ("%c%c%c%c", $LANGUAGE_COOKIE, $LANGUAGE_VERSION, $target_id,
$langoptions); # magic lang file header
}
if($binvoice) {
open(OUTV, ">$binvoice") or die "Error: Can't create $binary";
binmode OUTV;
printf OUTV ("%c%c%c%c", $VOICE_COOKIE, $LANGUAGE_VERSION, $target_id,
$langoptions); # magic lang file header
}
# output the number of strings for each user
my $foffset = $HEADER_SIZE + $SUBHEADER_SIZE * keys(%users);
for (keys %users) {
my $size;
for $n (0 .. $idcount[$_]-1) {
$size += length(trim($dest{$idnum[$_][$n]})) + 1;
}
if($binary) {
printf OUTF ("%c%c%c%c%c%c", ($idcount[$_] >> 8), ($idcount[$_] & 0xff),
($size >> 8), ($size & 0xff), ($foffset >> 8), ($foffset & 0xff));
}
if($binvoice) {
printf OUTV ("%c%c%c%c%c%c", ($idcount[$_] >> 8), ($idcount[$_] & 0xff),
($size >> 8), ($size & 0xff), ($foffset >> 8), ($foffset & 0xff));
}
$foffset += $size;
}
for (keys %users) {
# loop over the target phrases
# This loops over the strings in the translated language file order
my @ids = ((0 .. ($idcount[$_]-1)));
push @ids, (0x8000 .. ($voiceid[$_]-1));
for $n (@ids) {
my $name=$idnum[$_][$n]; # get the ID
my $dest = $dest{$name}; # get the destination phrase
my $voice = $voice{$name}; # get the destination voice string
if($dest && $n < 0x8000 && $binary) {
$dest =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes
# Now, make sure we get the number from the english sort order:
$idnum = $idmap[$_]{$name};
printf OUTF ("%c%c%s\x00", ($idnum>>8), ($idnum&0xff), $dest);
}
if($voice && $binvoice) {
$voice =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes
# Now, make sure we get the number from the english sort order:
$idnum = $idmap[$_]{$name};
printf OUTV ("%c%c%s\x00", ($idnum>>8), ($idnum&0xff), $voice);
}
}
}
if($binary) {
close(OUTF);
}
if($binvoice) {
close(OUTV);
}
}
elsif($voiceout) {
# voice output requested, display id: and voice: strings in a v1-like
# fashion
my @engl;
# This loops over the strings in the translated language file order
my @ids = ((0 .. ($idcount[$users{"core"}]-1)));
push @ids, (0x8000 .. ($voiceid[$users{"core"}]-1));
#for my $id (@ids) {
# print "$id\n";
#}
for $i (@ids) {
my $name=$idnum[$users{"core"}][$i]; # get the ID
my $dest = $voice{$name}; # get the destination voice string
if($dest) {
$dest =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes
# Now, make sure we get the number from the english sort order:
$idnum = $idmap[$users{"core"}]{$name};
if(length($idnum)) {
$engl[$idnum] = $i;
#print "Input index $i output index $idnum\n";
}
else {
# not used, mark it so
$engl[$i] = -1
}
}
}
for my $i (@ids) {
my $o = $engl[$i];
if(($o < 0) || !length($o)) {
if($i < 0x8000) {
print "#$i\nid: NOT_USED_$i\nvoice: \"\"\n";
}
else {
print "#$i\nid: VOICE_NOT_USED_$i\nvoice: \"\"\n";
}
next;
}
my $name=$idnum[$users{"core"}][$o]; # get the ID
my $dest = $voice{$name}; # get the destination voice string
print "#$i ($o)\nid: $name\nvoice: $dest\n";
}
}
if($verbose) {
my $num_str = 0;
for (keys %users) {
$num_str += $idcount[$_];
}
printf("%d ID strings scanned\n", $num_str);
print "* head *\n";
for(keys %head) {
printf "$_: %s\n", $head{$_};
}
}
if ($binary and !file_is_newer("$binpath/english.list", $english)) {
open(ENGLIST, ">$binpath/english.list") ||
die "Failed creating $binpath/english.list";
for my $user (keys %users) {
for my $id (keys %{$idmap[$user]}) {
print ENGLIST "$user:$id:$idmap[$user]{$id}\n";
}
}
close ENGLIST;
}