rockbox/tools/genlang

717 lines
19 KiB
Text
Raw Normal View History

#!/usr/bin/perl -s
# __________ __ ___.
# Open \______ \ ____ ____ | | _\_ |__ _______ ___
# Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
# Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
# Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
# \/ \/ \/ \/ \/
# $Id$
#
# Copyright (C) 2006 - 2008 by Daniel Stenberg
#
# binary version for the binary lang file
my $langversion = 4; # 3 was the latest one used in the v1 format
# A note for future users and readers: The original v1 language system allowed
# the build to create and use a different language than english built-in. We
# removed that feature from our build-system, but the build scripts still had
# the ability. But, starting now, this ability is no longer provided since I
# figured it was boring and unnecessary to write support for now since we
# don't use it anymore.
if(!$ARGV[0]) {
print <<MOO
Usage: genlang [options] <langv2 file>
-p=<prefix>
Make the tool create a [prefix].c and [prefix].h file.
-b=<outfile>
Make the tool create a binary language (.lng) file named [outfile].
The use of this option requires that you also use -e, -t and -i.
-u
Update language file. Given the translated file and the most recent english
file, you\'ll get an updated version sent to stdout. Suitable action to do
when you intend to update a translation.
-e=<english lang file>
Point out the english (original source) file, to use that as master
language template. Used in combination with -b, -u or -s.
-s
Sort the Update language file in the same order as the strings in the
English file.
-t=<target>
Specify which target you want the translations/phrases for. Required when
-b or -p is used.
The target can in fact be specified as numerous different strings,
separated with colons. This will make genlang to use all the specified
strings when searching for a matching phrase.
-i=<target id>
The target id number, needed for -b.
-o
Voice mode output. Outputs all id: and voice: lines for the given target!
-v
Enables verbose (debug) output.
MOO
;
exit;
}
# How update works:
#
# 1) scan the english file, keep the whole <phrase> for each phrase.
# 2) read the translated file, for each end of phrase, compare:
# A) all source strings, if there's any change there should be a comment about
# it output
# B) the desc fields
#
# 3) output the phrase with the comments from above
# 4) check which phrases that the translated version didn't have, and spit out
# the english version of those
#
my $prefix = $p;
my $binary = $b;
my $update = $u;
my $sortfile = $s;
my $english = $e;
my $voiceout = $o;
my $check = ($binary?1:0) + ($prefix?1:0) + ($update?1:0) + ($voiceout?1:0) + ($sortfile?1:0);
if($check > 1) {
print "Please use only one of -p, -u, -o, -b and -s\n";
exit;
}
if(!$check) {
print "Please use at least one of -p, -u, -o, -b and -s\n";
exit;
}
if(($binary || $update || $voiceout || $sortfile) && !$english) {
print "Please use -e too when you use -b, -o, -u or -s\n";
exit;
}
my $target_id = $i;
if($binary && !$target_id) {
print "Please specify a target id number (with -i)!\n";
exit;
}
my $target = $t;
if(!$target && !$update && !$sortfile) {
print "Please specify a target (with -t)!\n";
exit;
}
my $verbose=$v;
my %id; # string to num hash
my @idnum; # num to string array
my %allphrases; # For sorting - an array of the <phrase> elements
my %source; # id string to source phrase hash
my %dest; # id string to dest phrase hash
my %voice; # id string to voice phrase hash
my $input = $ARGV[0];
my @m;
my $m="blank";
sub trim {
my ($string) = @_;
$string =~ s/^\s+//;
$string =~ s/\s+$//;
return $string;
}
sub match {
my ($string, $pattern)=@_;
$pattern =~ s/\*/.?*/g;
$pattern =~ s/\?/./g;
return ($string =~ /^$pattern\z/);
}
sub blank {
# nothing to do
}
my %head;
sub header {
my ($full, $n, $v)=@_;
$head{$n}=$v;
}
my %phrase;
sub phrase {
my ($full, $n, $v)=@_;
$phrase{$n}=$v;
}
sub parsetarget {
my ($debug, $strref, $full, $n, $v)=@_;
my $string;
my @all= split(" *, *", $n);
my $test;
for $test (@all) {
# print "TEST ($debug) $target for $test\n";
for my $part (split(":", $target)) {
if(match($part, $test)) {
$string = $v;
# print "MATCH: $test => $v\n";
$$strref = $string;
return $string;
}
}
}
}
my $src;
sub source {
parsetarget("src", \$src, @_);
}
my $dest;
sub dest {
parsetarget("dest", \$dest, @_);
}
my $voice;
sub voice {
parsetarget("voice", \$voice, @_);
}
my %idmap;
my %english;
if($english) {
# For the cases where the english file needs to be scanned/read, we do
# it before we read the translated file. For -b it isn't necessary, but for
# -u it is convenient.
my $idnum=0; # start with a true number
my $vidnum=0x8000; # first voice id
open(ENG, "<$english") || die "Error: can't open $english";
my @phrase;
my $id;
my $maybeid;
my $withindest;
my $numphrases = 0;
while(<ENG>) {
# get rid of DOS newlines
$_ =~ s/\r//g;
if($_ =~ /^ *\<phrase\>/) {
# this is the start of a phrase
}
elsif($_ =~ /^ *\<\/phrase\>/) {
# if id is something, when we count and store this phrase
if($id) {
# voice-only entries get a difference range
if($id =~ /^VOICE_/) {
# Assign an ID number to this entry
$idmap{$id}=$vidnum;
$vidnum++;
}
else {
# Assign an ID number to this entry
$idmap{$id}=$idnum;
$idnum++;
# print STDERR "DEST: bumped idnum to $idnum\n";
}
# this is the end of a phrase, add it to the english hash
$english{$id}=join("", @phrase);
}
undef @phrase;
$id="";
}
elsif($_ ne "\n") {
# gather everything related to this phrase
push @phrase, $_;
if($_ =~ /^ *\<dest\>/i) {
$withindest=1;
$deststr="";
}
elsif($withindest && ($_ =~ /^ *\<\/dest\>/i)) {
$withindest=0;
if($update || ($deststr && ($deststr !~ /^none\z/i))) {
# we unconditionally always use all IDs when the "update"
# feature is used
$id = $maybeid;
# print "DEST: use this id $id\n";
}
else {
# print "skip $maybeid for $name\n";
}
}
elsif($withindest && ($_ =~ / *([^:]+): *(.*)/)) {
my ($name, $val)=($1, $2);
$dest=""; # in case it is left untouched for when the
# model name isn't "our"
dest($_, $name, $val);
if($dest) {
# Store the current dest string. If this target matches
# multiple strings, it will get updated several times.
$deststr = $dest;
}
}
}
if($_ =~ /^ *id: ([^ \t\n]+)/i) {
$maybeid=$1;
$sortorder{$maybeid}=$numphrases++;
}
}
close(ENG);
}
# a function that compares the english phrase with the translated one.
# compare source strings and desc
# Then output the updated version!
sub compare {
my ($idstr, $engref, $locref)=@_;
my ($edesc, $ldesc);
my ($esource, $lsource);
my $mode=0;
for my $l (@$engref) {
if($l =~ /^ *#/) {
# comment
next;
}
if($l =~ /^ *desc: (.*)/) {
$edesc=$1;
}
elsif($l =~ / *\<source\>/i) {
$mode=1;
}
elsif($mode) {
if($l =~ / *\<\/source\>/i) {
last;
}
$esource .= "$l\n";
}
}
my @show;
my @source;
$mode = 0;
for my $l (@$locref) {
if($l =~ /^ *desc: (.*)/) {
$ldesc=$1;
if(trim($edesc) ne trim($ldesc)) {
$l = "### The 'desc' field differs from the english!\n### the previously used desc is commented below:\n### desc: $ldesc\n desc: $edesc\n";
}
push @show, $l;
}
elsif($l =~ / *\<source\>/i) {
$mode=1;
push @show, $l;
}
elsif($mode) {
if($l =~ / *\<\/source\>/i) {
$mode = 0;
print @show;
if(trim($esource) ne trim($lsource)) {
print "### The <source> section differs from the english!\n",
"### the previously used one is commented below:\n";
for(split("\n", $lsource)) {
print "### $_\n";
}
print $esource;
}
else {
print $lsource;
}
undef @show; # start over
push @show, $l;
}
else {
$lsource .= "$l";
}
}
else {
push @show, $l;
}
}
print @show;
}
my $idcount; # counter for lang ID numbers
my $voiceid=0x8000; # counter for voice-only ID numbers
#
# Now start the scanning of the selected language string
#
open(LANG, "<$input") || die "Error: couldn't read language file named $input\n";
my @phrase;
my $header = 1;
while(<LANG>) {
$line++;
# get rid of DOS newlines
$_ =~ s/\r//g;
if($_ =~ /^( *\#|[ \t\n\r]*\z)/) {
# comment or empty line - output it if it's part of the header
if ($header and ($update || $sortfile)) {
print($_);
}
next;
}
$header = 0;
my $ll = $_;
# print "M: $m\n";
push @phrase, $ll;
# this is an XML-lookalike tag
if (/^(<|[^\"<]+<)([^>]*)>/) {
my $part = $2;
# print "P: $part\n";
if($part =~ /^\//) {
# this was a closing tag
if($part eq "/phrase") {
# closing the phrase
my $idstr = $phrase{'id'};
my $idnum;
if($binary && !$english{$idstr}) {
# $idstr doesn't exist for english, skip it\n";
}
elsif($dest =~ /^none\z/i) {
# "none" as dest (without quotes) means that this entire
# phrase is to be ignored
}
elsif($sortfile) {
$allphrases{$idstr}=join('',@phrase);
}
elsif(!$update) {
# we don't do the fully detailed analysis when we "update"
# since we don't do it for a particular target etc
# allow the keyword 'deprecated' to be used on dest and
# voice strings to mark that as deprecated. It will then
# be replaced with "".
$dest =~ s/^deprecate(|d)\z/\"\"/i;
$voice =~ s/^deprecate(|d)\z/\"\"/i;
# basic syntax error alerts, if there are no quotes we
# will assume an empty string was intended
if($dest !~ /^\"/) {
print STDERR "$input:$line:1: warning: dest before line lacks quotes ($dest)!\n";
$dest='""';
}
if($src !~ /^\"/) {
print STDERR "$input:$line:1: warning: source before line lacks quotes ($src)!\n";
$src='""';
}
if($voice !~ /^\"/ and $voice !~ /^none\z/i) {
print STDERR "$input:$line:1: warning: voice before line lacks quotes ($voice)!\n";
$voice='""';
}
# Use the ID name to figure out which id number range we
# should use for this phrase. Voice-only strings are
# separated.
if($idstr =~ /^VOICE/) {
$idnum = $voiceid++;
}
else {
$idnum = $idcount++;
}
$id{$idstr} = $idnum;
$idnum[$idnum]=$idstr;
$source{$idstr}=$src;
$dest{$idstr}=$dest;
$voice{$idstr}=$voice;
if($verbose) {
print "id: $phrase{id} ($idnum)\n";
print "source: $src\n";
print "dest: $dest\n";
print "voice: $voice\n";
}
undef $src;
undef $dest;
undef $voice;
undef %phrase;
}
if($update) {
my $e = $english{$idstr};
if($e) {
# compare original english with this!
my @eng = split("\n", $english{$idstr});
compare($idstr, \@eng, \@phrase);
$english{$idstr}=""; # clear it
}
else {
print "### $idstr: The phrase is not used. Skipped\n";
}
}
undef @phrase;
} # end of </phrase>
# starts with a slash, this _ends_ this section
$m = pop @m; # get back old value, the previous level's tag
next;
} # end of tag close
# This is an opening (sub) tag
push @m, $m; # store old value
$m = $part;
next;
}
if(/^ *([^:]+): *(.*)/) {
my ($name, $val)=($1, $2);
&$m($_, $name, $val);
}
}
close(LANG);
if($update) {
my $any=0;
for(keys %english) {
if($english{$_}) {
print "###\n",
"### This phrase below was not present in the translated file\n",
"<phrase>\n";
print $english{$_};
print "</phrase>\n";
}
}
}
if ($sortfile) {
for(sort { $sortorder{$a} <=> $sortorder{$b} } keys %allphrases) {
print $allphrases{$_};
}
}
if($prefix) {
# We create a .c and .h file
open(HFILE, ">$prefix.h") ||
die "Error: couldn't create file $prefix.h\n";
open(CFILE, ">$prefix.c") ||
die "Error: couldn't create file $prefix.c\n";
print HFILE <<MOO
/* This file was automatically generated using genlang */
/*
* The str() macro/functions is how to access strings that might be
* translated. Use it like str(MACRO) and expect a string to be
* returned!
*/
#define str(x) language_strings[x]
/* this is the array for holding the string pointers.
It will be initialized at runtime. */
extern unsigned char *language_strings[];
/* this contains the concatenation of all strings, separated by \\0 chars */
extern const unsigned char language_builtin[];
/* The enum below contains all available strings */
enum \{
MOO
;
print CFILE <<MOO
/* This file was automaticly generated using genlang, the strings come
from "$input" */
#include "$prefix.h"
unsigned char *language_strings[LANG_LAST_INDEX_IN_ARRAY];
const unsigned char language_builtin[] =
MOO
;
# Output the ID names for the enum in the header file
my $i;
for $i (1 .. $idcount) {
my $name=$idnum[$i - 1]; # get the ID name
$name =~ s/\"//g; # cut off the quotes
printf HFILE (" %s, /* %d */\n", $name, $i-1);
}
# Output separation marker for last string ID and the upcoming voice IDs
print HFILE <<MOO
LANG_LAST_INDEX_IN_ARRAY, /* this is not a string, this is a marker */
/* --- below this follows voice-only strings --- */
VOICEONLY_DELIMITER = 0x8000,
MOO
;
# Output the ID names for the enum in the header file
for $i (0x8000 .. ($voiceid-1)) {
my $name=$idnum[$i]; # get the ID name
$name =~ s/\"//g; # cut off the quotes
printf HFILE (" %s,\n", $name);
}
# Output end of enum
print HFILE "\n};\n/* end of generated enum list */\n";
# Output the target phrases for the source file
for $i (1 .. $idcount) {
my $name=$idnum[$i - 1]; # get the ID
my $dest = $dest{$name}; # get the destination phrase
$dest =~ s:\"$:\\0\":; # insert a \0 before the second quote
if(!$dest) {
# this is just to be on the safe side
$dest = '"\0"';
}
printf CFILE (" %s\n", $dest);
}
# Output end of string chunk
print CFILE <<MOO
;
/* end of generated string list */
MOO
;
close(HFILE);
close(CFILE);
} # end of the c/h file generation
elsif($binary) {
# Creation of a binary lang file was requested
# We must first scan the english file to get the correct order of the id
# numbers used there, as that is what sets the id order for all language
# files. The english file is scanned before the translated file was
# scanned.
open(OUTF, ">$binary") or die "Error: Can't create $binary";
binmode OUTF;
printf OUTF ("\x1a%c%c", $langversion, $target_id); # magic lang file header
# loop over the target phrases
for $i (1 .. $idcount) {
my $name=$idnum[$i - 1]; # get the ID
my $dest = $dest{$name}; # get the destination phrase
if($dest) {
$dest =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes
# Now, make sure we get the number from the english sort order:
$idnum = $idmap{$name};
printf OUTF ("%c%c%s\x00", ($idnum>>8), ($idnum&0xff), $dest);
}
}
}
elsif($voiceout) {
# voice output requested, display id: and voice: strings in a v1-like
# fashion
my @engl;
# This loops over the strings in the translated language file order
my @ids = ((0 .. ($idcount-1)));
push @ids, (0x8000 .. ($voiceid-1));
#for my $id (@ids) {
# print "$id\n";
#}
for $i (@ids) {
my $name=$idnum[$i]; # get the ID
my $dest = $voice{$name}; # get the destination voice string
if($dest) {
$dest =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes
# Now, make sure we get the number from the english sort order:
$idnum = $idmap{$name};
if(length($idnum)) {
$engl[$idnum] = $i;
#print "Input index $i output index $idnum\n";
}
else {
# not used, mark it so
$engl[$i] = -1
}
}
}
for my $i (@ids) {
my $o = $engl[$i];
if(($o < 0) || !length($o)) {
print "#$i\nid: NOT_USED_$i\nvoice: \"\"\n";
next;
}
my $name=$idnum[$o]; # get the ID
my $dest = $voice{$name}; # get the destination voice string
print "#$i ($o)\nid: $name\nvoice: $dest\n";
}
}
if($verbose) {
printf("%d ID strings scanned\n", $idcount);
print "* head *\n";
for(keys %head) {
printf "$_: %s\n", $head{$_};
}
}