rockbox/tools/genlang

#!/usr/bin/perl -s
#             __________               __   ___.
#   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
#   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
#   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
#   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
#                     \/            \/     \/    \/            \/
# $Id$
#
# Copyright (C) 2006 - 2008 by Daniel Stenberg
#

# See apps/language.c (TODO: Use common include for both)
# Cookie and binary version for the binary lang file
my $LANGUAGE_COOKIE   = 0x1a;
my $LANGUAGE_VERSION  = 0x06;
my $LANGUAGE_FLAG_RTL = 0x01;

my $HEADER_SIZE       = 4;
my $SUBHEADER_SIZE    = 6;

# A note for future users and readers: The original v1 language system allowed
# the build to create and use a different language than english built-in. We
# removed that feature from our build-system, but the build scripts still had
# the ability. But, starting now, this ability is no longer provided since I
# figured it was boring and unnecessary to write support for now since we
# don't use it anymore.


if(!$ARGV[0]) {
    print <<MOO
Usage: genlang [options] <langv2 file>

 -p=<prefix>
    Make the tool create a [prefix].c and [prefix].h file.

 -b=<outfile>
    Make the tool create a binary language (.lng) file named [outfile].
    The use of this option requires that you also use -e, -t and -i.

 -u
    Update language file. Given the translated file and the most recent english
    file, you\'ll get an updated version sent to stdout. Suitable action to do
    when you intend to update a translation.

 -e=<english lang file>
    Point out the english (original source) file, to use that as master
    language template. Used in combination with -b, -u or -s.

 -s
    Sort the Update language file in the same order as the strings in the
    English file.

 -t=<target>
    Specify which target you want the translations/phrases for. Required when
    -b or -p is used.

    The target can in fact be specified as numerous different strings,
    separated with colons. This will make genlang to use all the specified
    strings when searching for a matching phrase.

 -i=<target id>
    The target id number, needed for -b.

 -o
    Voice mode output. Outputs all id: and voice: lines for the given target!

 -v
    Enables verbose (debug) output.
MOO
;
    exit;
}

# How update works:
#
# 1) scan the english file, keep the whole <phrase> for each phrase.
# 2) read the translated file, for each end of phrase, compare:
#  A) all source strings, if there's any change there should be a comment about
#     it output
#  B) the desc fields
#
# 3) output the phrase with the comments from above
# 4) check which phrases that the translated version didn't have, and spit out
#    the english version of those
#

my $prefix = $p;
my $binary = $b;
my $update = $u;
my $sortfile = $s;

my $english = $e;
my $voiceout = $o;

my $check = ($binary?1:0) + ($prefix?1:0) + ($update?1:0) + ($voiceout?1:0) + ($sortfile?1:0);

if($check > 1) {
    print STDERR "Please use only one of -p, -u, -o, -b and -s\n";
    exit;
}
if(!$check) {
    print STDERR "Please use at least one of -p, -u, -o, -b and -s\n";
    exit;
}


if(($binary || $update || $voiceout || $sortfile) && !$english) {
    print STDERR "Please use -e too when you use -b, -o, -u or -s\n";
    exit;
}

my $target_id = $i;
if($binary && !$target_id) {
    print STDERR "Please specify a target id number (with -i)!\n";
    exit;
}

my $target = $t;
if(!$target && !$update && !$sortfile) {
    print STDERR "Please specify a target (with -t)!\n";
    exit;
}

# Build up a regex which can be applied to target wildcard lists. We only need
# to support prefix matches, so a target parameter of foo:bar can be expanded
# to the regex "\*|f\*|fo\*|foo|b\*|ba\*|bar" and applied to the wildcard list
# (plus end-of-string or commas on either side). The regex engine should
# discard any duplicates generated for us in the process of constructing the
# state machine, so we don't bother to check.
my $target_regex = "(?:^|,) *(?:\\*";
foreach my $target_part (split ':', $target) {
    for (my $c=1; $c<=length $target_part; $c++) {
        my $partial = substr $target_part, 0, $c;
        $target_regex .= "|$partial\\*";
    }
    $target_regex .= "|$target_part";
}
$target_regex .= ") *(?:,|\$)";
$target_regex = qr/$target_regex/;

my $binpath = "";
if ($binary =~ m|(.*)/[^/]+|) {
    $binpath = $1;
}

my $verbose=$v;

my %id; # string to num hash
my @idnum; # num to string array

my %allphrases;  # For sorting - an array of the <phrase> elements
my %source; # id string to source phrase hash
my %dest; # id string to dest phrase hash
my %voice; # id string to voice phrase hash

my %users =
  ('core' => 0);

my $input = $ARGV[0];

my @m;
my $m="blank";

sub trim {
    my ($string) = @_;
    $string =~ s/^\s+//;
    $string =~ s/\s+$//;
    return $string;
}

sub blank {
    # nothing to do
}

my %head;
sub header {
    my ($full, $n, $v)=@_;
    $head{$n}=$v;
}

my %phrase;
sub phrase {
    my ($full, $n, $v)=@_;
    $phrase{$n}=$v;
}

my %options;
sub options {
    my ($full, $n, $v)=@_;
    $options{$n}=$v;
}

sub parsetarget {
    my ($debug, $strref, $full, $n, $v)=@_;
    my $string;
    if ($n =~ $target_regex) {
        $string = $v;
        $$strref = $string;
        return $string;
    }
}

my $src;
sub source {
    parsetarget("src", \$src, @_);
}

my $dest;
sub dest {
    parsetarget("dest", \$dest, @_);
}

my $voice;
sub voice {
    parsetarget("voice", \$voice, @_);
}

sub file_is_newer {
    my ($file1, $file2) = @_;

    my @s1 = stat $file1;
    my @s2 = stat $file2;

    return 1 if ($s1[9] > $s2[9]);
    return 0;
}

my %idmap;
my %english;
if($english) {
    readenglish();
}

sub readenglish {
    # For the cases where the english file needs to be scanned/read, we do
    # it before we read the translated file. For -b it isn't necessary, but for
    # -u it is convenient.

    my @idnum = ((0));       # start with a true number
    my @vidnum = ((0x8000)); # first voice id


    if ($binary and file_is_newer("$binpath/english.list", $english)) {
        open(ENG, "<$binpath/english.list") ||
            die "Error: can't open $binpath/english.list";
        while (<ENG>) {
            my ($user, $id, $value) = split ':', $_;
            $idmap[$user]{$id} = $value;
            $english{$id} = 1;
        }
        close ENG;

        return;
    }

    open(ENG, "<$english") || die "Error: can't open $english";
    my @phrase;
    my $id;
    my $maybeid;
    my $user;
    my $withindest;
    my $numphrases = 0;
    my $numusers = 1; # core is already in the users map

    while(<ENG>) {

        # get rid of DOS newlines
        $_ =~ tr/\r//d;

        if($_ =~ /^ *\<phrase\>/) {
            # this is the start of a phrase
        }
        elsif($_ =~ /\<\/phrase\>/) {

            # if id is something, when we count and store this phrase
            if($id) {
                # voice-only entries get a difference range
                if($id =~ /^VOICE_/) {
                    # Assign an ID number to this entry
                    $idmap[$user]{$id}=$vidnum[$user];
                    $vidnum[$user]++;
                }
                else {
                    # Assign an ID number to this entry
                    $idmap[$user]{$id}=$idnum[$user];
                    $idnum[$user]++;
   #                 print STDERR "DEST: bumped idnum to $idnum[$user]\n";
                }

                # this is the end of a phrase, add it to the english hash
                $english{$id}=join("", @phrase);
            }
            undef @phrase;
            $id="";
        }
        elsif($_ ne "\n") {
            # gather everything related to this phrase
            push @phrase, $_;
            if($_ =~ /^ *\<dest\>/i) {
                $withindest=1;
                $deststr="";
            }
            elsif($withindest && ($_ =~ /^ *\<\/dest\>/i)) {
                $withindest=0;

                if($update || ($deststr && ($deststr !~ /^none\z/i))) {
                    # we unconditionally always use all IDs when the "update"
                    # feature is used
                    $id = $maybeid;
    #                print "DEST: use this id $id\n";
                }
                else {
    #                print "skip $maybeid for $name\n";
                }
            }
            elsif($withindest && ($_ =~ / *([^:]+): *(.*)/)) {
                my ($name, $val)=($1, $2);
                $dest=""; # in case it is left untouched for when the
                # model name isn't "our"
                dest($_, $name, $val);

                if($dest) {
                    # Store the current dest string. If this target matches
                    # multiple strings, it will get updated several times.
                    $deststr = $dest;
                }
            }
        }

        if($_ =~ /^ *id: ([^ \t\n]+)/i) {
            $maybeid=$1;
            $sortorder{$maybeid}=$numphrases++;
        }
        if($_ =~ /^ *user: ([^ \t\n]+)/i) {
            $user = $users{$1};
            if(!(defined $user)) {
                $user = ++$numusers;
                $users{$1} = $user;
            }
        }
    }
    close(ENG);
}

# a function that compares the english phrase with the translated one.
# compare source strings and desc

# Then output the updated version!
sub compare {
    my ($idstr, $engref, $locref)=@_;
    my ($edesc, $ldesc);
    my ($esource, $lsource);
    my $mode=0;

    for my $l (@$engref) {
        if($l =~ /^ *#/) {
            # comment
            next;
        }
        if($l =~ /^ *desc: (.*)/) {
            $edesc=$1;
        }
        elsif($l =~ / *\<source\>/i) {
            $mode=1;
        }
        elsif($mode) {
            if($l =~ / *\<\/source\>/i) {
                last;
            }
            $esource .= "$l\n";
        }
    }

    my @show;
    my @source;

    $mode = 0;
    for my $l (@$locref) {
        if($l =~ /^ *desc: (.*)/) {
            $ldesc=$1;
            if(trim($edesc) ne trim($ldesc)) {
                $l = "### The 'desc' field differs from the english!\n### the previously used desc is commented below:\n### desc: $ldesc\n  desc: $edesc\n";
            }
            push @show, $l;
        }
        elsif($l =~ / *\<source\>/i) {
            $mode=1;
            push @show, $l;
        }
        elsif($mode) {
            if($l =~ / *\<\/source\>/i) {
                $mode = 0;
                print @show;
                if(trim($esource) ne trim($lsource)) {
                    print "### The <source> section differs from the english!\n",
                    "### the previously used one is commented below:\n";
                    for(split("\n", $lsource)) {
                        print "### $_\n";
                    }
                    print $esource;
                }
                else {
                    print $lsource;
                }
                undef @show; # start over

                push @show, $l;
            }
            else {
                $lsource .= "$l";
            }
        }
        else {
            push @show, $l;
        }
    }


    print @show;
}

my @idcount;           # counter for lang ID numbers
my @voiceid;           # counter for voice-only ID numbers

for (keys %users) {
    push @idcount, 0;
    push @voiceid, 0x8001;
}

#
# Now start the scanning of the selected language string
#

open(LANG, "<$input") || die "Error: couldn't read language file named $input\n";
my @phrase;
my $header = 1;
my $langoptions = 0;

while(<LANG>) {

    $line++;

    # get rid of DOS newlines
    $_ =~ tr/\r//d;

    if($_ =~ /^( *\#|[ \t\n\r]*\z)/) {
        # comment or empty line - output it if it's part of the header
        if ($header and ($update || $sortfile)) {
            print($_);
        }
        next;
    }
    $header = 0;

    my $ll = $_;

    # print "M: $m\n";

    push @phrase, $ll;

    # this is an XML-lookalike tag
    if (/^(<|[^\"<]+<)([^>]*)>/) {
        my $part = $2;
        # print "P: $part\n";

        if($part =~ /^\//) {
            # this was a closing tag

            if($part eq "/phrase") {
                # closing the phrase

                my $idstr = $phrase{'id'};
                my $idnum;

                if($binary && !$english{$idstr}) {
                    # $idstr doesn't exist for english, skip it\n";
                }
                elsif($dest =~ /^none\z/i) {
                    # "none" as dest (without quotes) means that this entire
                    # phrase is to be ignored
                }
                elsif($sortfile) {
                    $allphrases{$idstr}=join('',@phrase);
                }
                elsif(!$update) {
                    # we don't do the fully detailed analysis when we "update"
                    # since we don't do it for a particular target etc

                    # allow the keyword 'deprecated' to be used on dest and
                    # voice strings to mark that as deprecated. It will then
                    # be replaced with "".

                    $dest =~ s/^deprecate(|d)\z/\"\"/i;
                    $voice =~ s/^deprecate(|d)\z/\"\"/i;

                    # basic syntax error alerts, if there are no quotes we
                    # will assume an empty string was intended
                    if($dest !~ /^\"/) {
                        print STDERR "$input:$line:1: warning: dest before line lacks quotes ($dest)!\n";
                        $dest='""';
                    }
                    if($src !~ /^\"/) {
                        print STDERR "$input:$line:1: warning: source before line lacks quotes ($src)!\n";
                        $src='""';
                    }
                    if($voice !~ /^\"/ and $voice !~ /^none\z/i) {
                        print STDERR "$input:$line:1: warning: voice before line lacks quotes ($voice)!\n";
                        $voice='""';
                    }
                    if($dest eq '""' && $phrase{'desc'} !~ /deprecated/i && $idstr !~ /^VOICE/) {
                        print STDERR "$input:$line:1: warning: empty dest before line in non-deprecated phrase!\n";
                    }

                    my $userstr = trim($phrase{'user'});
                    my $user = $users{$userstr};
                    if ($userstr eq "") {
                        print STDERR "$input:$line:1: warning: missing user!\n";
                        $user = $users{"core"};
                    }
                    elsif(!(defined $user)) {
                        if($english) {
                           print STDERR "$input:$line:1: warning: user was not found in $english!\n";
                           $user = keys %users;  # set to an invalid user so it won't be added
                        }
                        else {
                            # we found a new user, add it to the usermap
                            $user = ++$numusers;
                            $users{$userstr} = $user;
                        }
                    }

                    # Use the ID name to figure out which id number range we
                    # should use for this phrase. Voice-only strings are
                    # separated.

                    if($idstr =~ /^VOICE/) {
                        $idnum = $voiceid[$user]++;
                    }
                    else {
                        $idnum = $idcount[$user]++;
                    }

                    $id{$idstr} = $idnum;
                    $idnum[$user][$idnum]=$idstr;

                    $source{$idstr}=$src;
                    $dest{$idstr}=$dest;
                    $voice{$idstr}=$voice;

                    if($verbose) {
                        print "id: $phrase{id} ($idnum)\n";
                        print "source: $src\n";
                        print "dest: $dest\n";
                        print "voice: $voice\n";
                        print "user: $user\n";
                    }

                    undef $src;
                    undef $dest;
                    undef $voice;
                    undef $user;
                    undef %phrase;
                }

                if($update) {
                    my $e = $english{$idstr};

                    if($e) {
                        # compare original english with this!
                        my @eng = split("\n", $english{$idstr});

                        compare($idstr, \@eng, \@phrase);

                        $english{$idstr}=""; # clear it
                    }
                    else {
                        print "### $idstr: The phrase is not used. Skipped\n";
                    }
                }
                undef @phrase;
            } # end of </phrase>
            elsif($part eq "/options") {
                # closing the options
                if ($options{'rtl'}) {
                    $langoptions |= $LANGUAGE_FLAG_RTL;
                }
            } # end of </options>

            # starts with a slash, this _ends_ this section
            $m = pop @m; # get back old value, the previous level's tag
            next;
        } # end of tag close

        # This is an opening (sub) tag

        push @m, $m; # store old value
        $m = $part;
        next;
    }

    if(/^ *([^:]+): *(.*)/) {
        my ($name, $val)=($1, $2);
        &$m($_, $name, $val);
    }
}
close(LANG);

if($update) {
    my $any=0;
    for(keys %english) {
        if($english{$_}) {
            print "###\n",
            "### This phrase below was not present in the translated file\n",
            "<phrase>\n";
            print $english{$_};
            print "</phrase>\n";
        }
    }
}

if ($sortfile) {
    for(sort { $sortorder{$a} <=> $sortorder{$b} } keys %allphrases) {
         print $allphrases{$_};
    }
}

if($prefix) {
    # We create a .c and .h file

    open(HFILE_CORE, ">$prefix/lang.h") ||
        die "Error: couldn't create file $prefix/lang.h\n";
    open(CFILE_CORE, ">$prefix/lang_core.c") ||
        die "Error: couldn't create file $prefix/lang_core.c\n";

   # get header file name
   $headername = "$prefix/lang.h";
   $headername =~ s/(.*\/)*//;

    print HFILE_CORE <<MOO
/* This file was automatically generated using genlang */
/*
 * The str() macro/functions is how to access strings that might be
 * translated. Use it like str(MACRO) and expect a string to be
 * returned!
 */
#define str(x) language_strings[x]

/* this is the array for holding the string pointers.
   It will be initialized at runtime. */
extern unsigned char *language_strings[];
/* this contains the concatenation of all strings, separated by \\0 chars */
extern const unsigned char core_language_builtin[];

/* The enum below contains all available strings */
enum \{
MOO
    ;

    print CFILE_CORE <<MOO
/* This file was automatically generated using genlang, the strings come
   from "$input" */

#include "$headername"

unsigned char *language_strings[LANG_LAST_INDEX_IN_ARRAY];
const unsigned char core_language_builtin[] =
MOO
;

    # Output the ID names for the enum in the header file
    my $i;
    for $i (0 .. $idcount[$users{"core"}]-1) {
        my $name=$idnum[$users{"core"}][$i]; # get the ID name

        $name =~ tr/\"//d; # cut off the quotes

        printf HFILE_CORE ("    %s, /* %d */\n", $name, $i);
    }

# Output separation marker for last string ID and the upcoming voice IDs

    print HFILE_CORE <<MOO
    LANG_LAST_INDEX_IN_ARRAY, /* this is not a string, this is a marker */
    /* --- below this follows voice-only strings --- */
    VOICEONLY_DELIMITER = 0x8000,
MOO
    ;

# Output the ID names for the enum in the header file
    for $i (0x8001 .. ($voiceid[$users{"core"}]-1)) {
        my $name=$idnum[$users{"core"}][$i]; # get the ID name

        $name =~ tr/\"//d; # cut off the quotes

        printf HFILE_CORE ("    %s, /* 0x%x */\n", $name, $i);
    }

    # Output end of enum
    print HFILE_CORE "\n};\n/* end of generated enum list */\n";

    # Output the target phrases for the source file
    for $i (0 .. $idcount[$users{"core"}]-1) {
        my $name=$idnum[$users{"core"}][$i]; # get the ID
        my $dest = $dest{$name}; # get the destination phrase

        $dest =~ s:\"$:\\0\":; # insert a \0 before the second quote

        if(!$dest) {
            # this is just to be on the safe side
            $dest = '"\0"';
        }

        printf CFILE_CORE ("    %s\n", $dest);
    }

# Output end of string chunk
    print CFILE_CORE <<MOO
;
/* end of generated string list */
MOO
;

    close(HFILE_CORE);
    close(CFILE_CORE);
} # end of the c/h file generation
elsif($binary) {
    # Creation of a binary lang file was requested

    # We must first scan the english file to get the correct order of the id
    # numbers used there, as that is what sets the id order for all language
    # files. The english file is scanned before the translated file was
    # scanned.

    open(OUTF, ">$binary") or die "Error: Can't create $binary";
    binmode OUTF;
    printf OUTF ("%c%c%c%c", $LANGUAGE_COOKIE, $LANGUAGE_VERSION, $target_id,
      $langoptions); # magic lang file header

    # output the number of strings for each user
    my $foffset = $HEADER_SIZE + $SUBHEADER_SIZE * keys(%users);
    for (keys %users) {
        my $size;
        for $n (0 .. $idcount[$_]-1) {
            $size += length(trim($dest{$idnum[$_][$n]})) + 1;
        }
        printf OUTF ("%c%c%c%c%c%c", ($idcount[$_] >> 8), ($idcount[$_] & 0xff),
                     ($size >> 8), ($size & 0xff), ($foffset >> 8), ($foffset & 0xff));
        $foffset += $size;
    }

    for (keys %users) {
        # loop over the target phrases
        for $n (0 .. $idcount[$_]-1) {
            my $name=$idnum[$_][$n]; # get the ID
            my $dest = $dest{$name}; # get the destination phrase

            if($dest) {
                $dest =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes

                # Now, make sure we get the number from the english sort order:
                $idnum = $idmap[$_]{$name};

                printf OUTF ("%c%c%s\x00", ($idnum>>8), ($idnum&0xff), $dest);
            }
        }
    }
}
elsif($voiceout) {
    # voice output requested, display id: and voice: strings in a v1-like
    # fashion

    my @engl;

    # This loops over the strings in the translated language file order
    my @ids = ((0 .. ($idcount[$users{"core"}]-1)));
    push @ids, (0x8000 .. ($voiceid[$users{"core"}]-1));

    #for my $id (@ids) {
    #    print "$id\n";
    #}

    for $i (@ids) {
        my $name=$idnum[$users{"core"}][$i]; # get the ID
        my $dest = $voice{$name}; # get the destination voice string

        if($dest) {
            $dest =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes

            # Now, make sure we get the number from the english sort order:
            $idnum = $idmap[$users{"core"}]{$name};

            if(length($idnum)) {
                $engl[$idnum] = $i;

                #print "Input index $i output index $idnum\n";
            }
            else {
                # not used, mark it so
                $engl[$i] = -1
            }

        }
    }
    for my $i (@ids) {

        my $o = $engl[$i];

        if(($o < 0) || !length($o)) {
            if($i < 0x8000) {
                print "#$i\nid: NOT_USED_$i\nvoice: \"\"\n";
            }
            else {
                print "#$i\nid: VOICE_NOT_USED_$i\nvoice: \"\"\n";
            }
            next;
        }

        my $name=$idnum[$users{"core"}][$o]; # get the ID
        my $dest = $voice{$name}; # get the destination voice string

        print "#$i ($o)\nid: $name\nvoice: $dest\n";
    }

}


if($verbose) {
    my $num_str = 0;

    for (keys %users) {
        $num_str += $idcount[$_];
    }

    printf("%d ID strings scanned\n", $num_str);

    print "* head *\n";
    for(keys %head) {
        printf "$_: %s\n", $head{$_};
    }
}

if ($binary and !file_is_newer("$binpath/english.list", $english)) {
    open(ENGLIST, ">$binpath/english.list") ||
        die "Failed creating $binpath/english.list";
    for my $user (keys %users) {
        for my $id (keys %{$idmap[$user]}) {
            print ENGLIST "$user:$id:$idmap[$user]{$id}\n";
        }
    }
    close ENGLIST;
}