rockbox/tools/genlang

#!/usr/bin/perl -s
#             __________               __   ___.
#   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
#   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
#   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
#   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
#                     \/            \/     \/    \/            \/
# $Id$
#
# Copyright (C) 2006 - 2008 by Daniel Stenberg
#

# Version byte written into the header of generated binary (.lng) files.
my $langversion = 4; # 3 was the latest one used in the v1 format

# A note for future users and readers: The original v1 language system allowed
# the build to create and use a different language than english built-in. We
# removed that feature from our build-system, but the build scripts still had
# the ability. But, starting now, this ability is no longer provided since I
# figured it was boring and unnecessary to write support for now since we
# don't use it anymore.

# No input file given on the command line: print the usage text and stop.
#
# The -e and -t descriptions mention -o as well, because the option checks
# further down in this script require both of them in voice (-o) mode too.
if(!$ARGV[0]) {
    print <<MOO
Usage: genlang [options] <langv2 file>

 -p=<prefix>
    Make the tool create a [prefix].c and [prefix].h file.

 -b=<outfile>
    Make the tool create a binary language (.lng) file named [outfile].
    The use of this option requires that you also use -e, -t and -i.

 -u
    Update language file. Given the translated file and the most recent english
    file, you\'ll get an updated version sent to stdout. Suitable action to do
    when you intend to update a translation.

 -e=<english lang file>
    Point out the english (original source) file, to use that as master
    language template. Used in combination with -b, -o, -u or -s.

 -s
    Sort the Update language file in the same order as the strings in the
    English file.

 -t=<target>
    Specify which target you want the translations/phrases for. Required when
    -b, -o or -p is used.

    The target can in fact be specified as numerous different strings,
    separated with colons. This will make genlang to use all the specified
    strings when searching for a matching phrase.

 -i=<target id>
    The target id number, needed for -b.

 -o
    Voice mode output. Outputs all id: and voice: lines for the given target!

 -v
    Enables verbose (debug) output.
MOO
;
    exit;
}

# How update works:
#
# 1) scan the english file, keep the whole <phrase> for each phrase.
# 2) read the translated file, for each end of phrase, compare:
#  A) all source strings, if there's any change there should be a comment about
#     it output
#  B) the desc fields
#
# 3) output the phrase with the comments from above
# 4) check which phrases the translated version didn't have, and spit out
#    the english version of those
#

# Command line switches. They arrive in single-letter globals ($p, $b, ...)
# because the script is run with perl's -s switch (see the #! line).
my $prefix = $p;     # -p=<prefix>
my $binary = $b;     # -b=<outfile>
my $update = $u;     # -u
my $sortfile = $s;   # -s

my $english = $e;    # -e=<english lang file>
my $voiceout = $o;   # -o

# Number of operation modes that were selected; exactly one is allowed.
my $check = grep { $_ } ($binary, $prefix, $update, $voiceout, $sortfile);

# All usage errors below are reported on stderr and end the script with a
# failing exit status. That way a calling build script notices the problem,
# and the message does not end up inside redirected -u / -s output.
if($check > 1) {
    print STDERR "Please use only one of -p, -u, -o, -b and -s\n";
    exit 1;
}
if(!$check) {
    print STDERR "Please use at least one of -p, -u, -o, -b and -s\n";
    exit 1;
}


# every mode except -p needs the english file
if(($binary || $update || $voiceout || $sortfile) && !$english) {
    print STDERR "Please use -e too when you use -b, -o, -u or -s\n";
    exit 1;
}

my $target_id = $i;  # -i=<target id>
if($binary && !$target_id) {
    print STDERR "Please specify a target id number (with -i)!\n";
    exit 1;
}

my $target = $t;     # -t=<target>, may be several names separated by colons
# -u and -s work on the whole file and are the only modes without a target
if(!$target && !$update && !$sortfile) {
    print STDERR "Please specify a target (with -t)!\n";
    exit 1;
}
my $verbose=$v;      # -v

my %id; # string to num hash
my @idnum; # num to string array

my %allphrases;  # For sorting - id string to complete <phrase> text hash
my %source; # id string to source phrase hash
my %dest; # id string to dest phrase hash
my %voice; # id string to voice phrase hash

# the language file to read, first non-switch command line argument
my $input = $ARGV[0];

# Tag tracking for the scan of the language file: $m is the name of the
# innermost open tag and, at the same time, the name of the sub that gets
# called for each "name: value" line inside it. @m is the stack of the
# enclosing tags' names. "blank" is what is in effect outside of all tags.
my @m;
my $m="blank";

# Return a copy of the given string with leading and trailing whitespace
# removed. The caller's string is not modified.
sub trim {
    my $text = shift;
    for ($text) {
        s/^\s+//;   # leading whitespace
        s/\s+$//;   # trailing whitespace
    }
    return $text;
}

# Shell-style wildcard match.
#
#   $string   the text to test (a target name)
#   $pattern  a pattern in which '?' stands for exactly one character and
#             '*' for any number of characters, including none
#
# Returns true if the pattern matches the whole string.
#
# Everything else in the pattern is handed to the regex engine as it is, so
# other regex metacharacters keep their regex meaning.
sub match {
    my ($string, $pattern)=@_;

    # '?' has to be translated before '*': the replacement for '*' must not
    # be touched by the '?' rule. (Doing it the other way round with ".?*"
    # ended up as "..*", which demanded at least one character for '*'.)
    $pattern =~ s/\?/./g;
    $pattern =~ s/\*/.*/g;

    return ($string =~ /^$pattern\z/);
}

# Handler for "name: value" lines found outside of any tag: ignores its
# arguments and returns nothing.
sub blank {
    return;
}

# name => value of the lines seen inside <header>
my %head;

# Handler for "name: value" lines inside <header>.
# Arguments: the full line (unused), the name and the value.
# Stores the value under its name in %head and returns it.
sub header {
    my (undef, $key, $value) = @_;
    return $head{$key} = $value;
}

# name => value of the lines seen directly inside the current <phrase>
my %phrase;

# Handler for "name: value" lines directly inside <phrase>.
# Arguments: the full line (unused), the name and the value.
# Stores the value under its name in %phrase and returns it.
sub phrase {
    my (undef, $key, $value) = @_;
    return $phrase{$key} = $value;
}

# Common worker for the source, dest and voice line handlers.
#
# Arguments:
#   $label   name of the calling section; not used by the code
#   $store   reference to the scalar that receives the value on a match
#   $line    the full input line; not used by the code
#   $names   the text before the colon: one or more target patterns,
#            separated by commas
#   $value   the text after the colon
#
# Each pattern is tried, in order, against each of the colon separated names
# in the global $target. On the first hit the value is written through
# $store and returned. Without a hit $store is left alone.
sub parsetarget {
    my ($label, $store, $line, $names, $value)=@_;

    my @wanted = split(":", $target);

    for my $pattern (split(" *, *", $names)) {
        for my $candidate (@wanted) {
            next unless match($candidate, $pattern);

            $$store = $value;
            return $value;
        }
    }
}

# source string picked for our target in the phrase being read
my $src;

# Handler for "name: value" lines inside <source>; on a target match the
# value ends up in $src.
sub source {
    return parsetarget("src", \$src, @_);
}

# dest string picked for our target in the phrase being read
my $dest;

# Handler for "name: value" lines inside <dest>; on a target match the
# value ends up in $dest.
sub dest {
    return parsetarget("dest", \$dest, @_);
}

# voice string picked for our target in the phrase being read
my $voice;

# Handler for "name: value" lines inside <voice>; on a target match the
# value ends up in $voice.
sub voice {
    return parsetarget("voice", \$voice, @_);
}

my %idmap;   # id string to the ID number it has in the english file
my %english; # id string to the lines of the english phrase, joined
if($english) {
    # For the cases where the english file needs to be scanned/read, we do
    # it before we read the translated file. For -b it isn't necessary, but for
    # -u it is convenient.
    #
    # Besides %idmap and %english this also fills the global %sortorder
    # (id string to position in the english file), which the -s output uses.

    my $idnum=0; # next ID number for ordinary phrases
    my $vidnum=0x8000; # first voice id

    # Three-argument open: the file name is taken literally and never
    # searched for mode characters.
    open(my $engfh, '<', $english)
        or die "Error: can't open $english: $!\n";

    my @phrase;      # the lines of the phrase being read
    my $id;          # id of the phrase, set only if the phrase is to be used
    my $maybeid;     # the most recent id: seen
    my $withindest;  # true while inside <dest> ... </dest>
    my $deststr;     # dest string matching our target in the current phrase
    my $numphrases = 0;
    while(<$engfh>) {

        # get rid of DOS newlines
        $_ =~ s/\r//g;

        if($_ =~ /^ *\<phrase\>/) {
            # this is the start of a phrase
        }
        elsif($_ =~ /^ *\<\/phrase\>/) {

            # if id is something, then we count and store this phrase
            if($id) {
                # voice-only entries get a different range
                if($id =~ /^VOICE_/) {
                    # Assign an ID number to this entry
                    $idmap{$id}=$vidnum;
                    $vidnum++;
                }
                else {
                    # Assign an ID number to this entry
                    $idmap{$id}=$idnum;
                    $idnum++;
   #                 print STDERR "DEST: bumped idnum to $idnum\n";
                }

                # this is the end of a phrase, add it to the english hash
                $english{$id}=join("", @phrase);
            }
            undef @phrase;
            $id="";
        }
        elsif($_ ne "\n") {
            # gather everything related to this phrase
            push @phrase, $_;
            if($_ =~ /^ *\<dest\>/i) {
                $withindest=1;
                $deststr="";
            }
            elsif($withindest && ($_ =~ /^ *\<\/dest\>/i)) {
                $withindest=0;

                if($update || ($deststr && ($deststr !~ /^none\z/i))) {
                    # we unconditionally always use all IDs when the "update"
                    # feature is used
                    $id = $maybeid;
    #                print "DEST: use this id $id\n";
                }
                else {
    #                print "skip $maybeid for $name\n";
                }
            }
            elsif($withindest && ($_ =~ / *([^:]+): *(.*)/)) {
                my ($name, $val)=($1, $2);
                $dest=""; # in case it is left untouched for when the
                # model name isn't "our"
                dest($_, $name, $val);

                if($dest) {
                    # Store the current dest string. If this target matches
                    # multiple strings, it will get updated several times.
                    $deststr = $dest;
                }
            }
        }

        # remember every id and the order in which the ids appear
        if($_ =~ /^ *id: ([^ \t\n]+)/i) {
            $maybeid=$1;
            $sortorder{$maybeid}=$numphrases++;
        }
    }
    close($engfh);
}

# Compare an english phrase with its translated counterpart and print the
# updated version of the translated phrase to stdout.
#
#   $idstr   id of the phrase; accepted but not used
#   $engref  array ref with the lines of the english phrase; a newline is
#            appended to each <source> line that is taken from it
#   $locref  array ref with the lines of the translated phrase; they are
#            printed as they are, so they are expected to carry their newline
#
# Two things are checked, both with surrounding whitespace ignored:
#  - the desc: line. If it differs, the english one is used and the old one
#    is kept as a ### comment.
#  - the contents of <source>. If they differ, the english lines are used and
#    the old ones are kept as ### comments.
# All other lines of the translated phrase are printed unchanged.
sub compare {
    my ($idstr, $engref, $locref)=@_;
    my ($eng_desc, $eng_source);
    my ($loc_desc, $loc_source);

    # Pass 1: get the desc and the <source> contents of the english phrase
    my $in_source = 0;
    for my $eline (@$engref) {
        # comment
        next if($eline =~ /^ *#/);

        if($eline =~ /^ *desc: (.*)/) {
            $eng_desc = $1;
            next;
        }
        if($eline =~ / *\<source\>/i) {
            $in_source = 1;
            next;
        }
        next unless $in_source;

        # nothing after the end of <source> is of interest
        last if($eline =~ / *\<\/source\>/i);

        $eng_source .= "$eline\n";
    }

    # Pass 2: walk the translated phrase. @pending holds the lines that are
    # ready for output but not printed yet.
    my @pending;

    $in_source = 0;
    for my $lline (@$locref) {
        if($lline =~ /^ *desc: (.*)/) {
            $loc_desc = $1;
            if(trim($eng_desc) ne trim($loc_desc)) {
                $lline = "### The 'desc' field differs from the english!\n### the previously used desc is commented below:\n### desc: $loc_desc\n  desc: $eng_desc\n";
            }
            push @pending, $lline;
            next;
        }
        if($lline =~ / *\<source\>/i) {
            $in_source = 1;
            push @pending, $lline;
            next;
        }
        if(!$in_source) {
            push @pending, $lline;
            next;
        }
        if($lline !~ / *\<\/source\>/i) {
            # a line within <source>, collect it for the comparison
            $loc_source .= "$lline";
            next;
        }

        # This is </source>: flush what we have so far, then the source
        # lines - either the translated file's own or the english ones.
        $in_source = 0;
        print @pending;
        if(trim($eng_source) eq trim($loc_source)) {
            print $loc_source;
        }
        else {
            print "### The <source> section differs from the english!\n",
            "### the previously used one is commented below:\n";
            for(split("\n", $loc_source)) {
                print "### $_\n";
            }
            print $eng_source;
        }

        # start over, with the </source> line itself
        @pending = ($lline);
    }


    print @pending;
}

my $idcount;        # counter for lang ID numbers
my $voiceid=0x8000; # counter for voice-only ID numbers

#
# Now start the scanning of the selected language string
#
# The file is read line by line. Tags are tracked with $m/@m, "name: value"
# lines are handed to the sub named like the innermost open tag, and all the
# work for a phrase is done when its </phrase> is reached.
#

# Three-argument open: the file name is taken literally and never searched
# for mode characters.
open(my $langfh, '<', $input)
    or die "Error: couldn't read language file named $input: $!\n";
my @phrase;      # the lines of the phrase being read
my $header = 1;  # true until the first line that is not comment/empty

# The scan of the english file sends its <dest> lines through dest() as
# well, which can leave a value in $dest. It must not be taken for a dest
# string of the first phrase of this file.
undef $dest;

while(<$langfh>) {

    $line++;

    # get rid of DOS newlines
    $_ =~ s/\r//g;

    if($_ =~ /^( *\#|[ \t\n\r]*\z)/) {
        # comment or empty line - output it if it's part of the header
        if ($header and ($update || $sortfile)) {
            print($_);
        }
        next;
    }
    $header = 0;

    my $ll = $_;

    # print "M: $m\n";

    push @phrase, $ll;

    # this is an XML-lookalike tag
    if (/^(<|[^\"<]+<)([^>]*)>/) {
        my $part = $2;
        # print "P: $part\n";

        if($part =~ /^\//) {
            # this was a closing tag

            if($part eq "/phrase") {
                # closing the phrase

                my $idstr = $phrase{'id'};
                my $idnum;

                if($binary && !$english{$idstr}) {
                    # $idstr doesn't exist for english, skip it\n";
                }
                elsif($dest =~ /^none\z/i) {
                    # "none" as dest (without quotes) means that this entire
                    # phrase is to be ignored
                }
                elsif($sortfile) {
                    $allphrases{$idstr}=join('',@phrase);
                }
                elsif(!$update) {
                    # we don't do the fully detailed analysis when we "update"
                    # since we don't do it for a particular target etc

                    # allow the keyword 'deprecated' to be used on dest and
                    # voice strings to mark that as deprecated. It will then
                    # be replaced with "".

                    $dest =~ s/^deprecate(|d)\z/\"\"/i;
                    $voice =~ s/^deprecate(|d)\z/\"\"/i;

                    # basic syntax error alerts, if there are no quotes we
                    # will assume an empty string was intended
                    if($dest !~ /^\"/) {
                        print STDERR "$input:$line:1: warning: dest before line lacks quotes ($dest)!\n";
                        $dest='""';
                    }
                    if($src !~ /^\"/) {
                        print STDERR "$input:$line:1: warning: source before line lacks quotes ($src)!\n";
                        $src='""';
                    }
                    if($voice !~ /^\"/ and $voice !~ /^none\z/i) {
                        print STDERR "$input:$line:1: warning: voice before line lacks quotes ($voice)!\n";
                        $voice='""';
                    }

                    # Use the ID name to figure out which id number range we
                    # should use for this phrase. Voice-only strings are
                    # separated.

                    if($idstr =~ /^VOICE/) {
                        $idnum = $voiceid++;
                    }
                    else {
                        $idnum = $idcount++;
                    }

                    $id{$idstr} = $idnum;
                    $idnum[$idnum]=$idstr;

                    $source{$idstr}=$src;
                    $dest{$idstr}=$dest;
                    $voice{$idstr}=$voice;

                    if($verbose) {
                        print "id: $phrase{id} ($idnum)\n";
                        print "source: $src\n";
                        print "dest: $dest\n";
                        print "voice: $voice\n";
                    }
                }

                if($update) {
                    my $e = $english{$idstr};

                    if($e) {
                        # compare original english with this!
                        my @eng = split("\n", $english{$idstr});

                        compare($idstr, \@eng, \@phrase);

                        $english{$idstr}=""; # clear it
                    }
                    else {
                        print "### $idstr: The phrase is not used. Skipped\n";
                    }
                }

                # The phrase is done, whatever was done with it. Forget
                # everything about it, also when it was skipped above, so
                # that none of its strings can be taken for strings of the
                # next phrase.
                undef $src;
                undef $dest;
                undef $voice;
                undef %phrase;
                undef @phrase;

            } # end of </phrase>

            # starts with a slash, this _ends_ this section
            $m = pop @m; # get back old value, the previous level's tag
            next;
        } # end of tag close

        # This is an opening (sub) tag

        push @m, $m; # store old value
        $m = $part;
        next;
    }

    if(/^ *([^:]+): *(.*)/) {
        my ($name, $val)=($1, $2);
        # call the sub that is named like the tag we are in
        &$m($_, $name, $val);
    }
}
close($langfh);

# Update mode: every phrase found in the translated file has had its entry
# in %english cleared by now, so what still has contents is missing from the
# translation. Append the english version of those.
if($update) {
    # Go through them in the order of the english file (falling back to the
    # id string). Walking the hash directly would give an order that changes
    # from one run to the next.
    my @missing = sort {
        $sortorder{$a} <=> $sortorder{$b} or $a cmp $b
    } keys %english;

    for my $idstr (@missing) {
        if($english{$idstr}) {
            print "###\n",
            "### This phrase below was not present in the translated file\n",
            "<phrase>\n";
            print $english{$idstr};
            print "</phrase>\n";
        }
    }
}

# Sort mode: print the collected phrases, ordered by the position their id
# has in the english file.
if ($sortfile) {
    my @ordered = sort { $sortorder{$a} <=> $sortorder{$b} } keys %allphrases;
    print $allphrases{$_} for @ordered;
}

# Produce the output of the -p, -b and -o modes from the tables filled while
# scanning the language file.
#
# All files are opened with the three-argument open, and every close of a
# written file is checked, so that a failed write ends the script with an
# error instead of silently leaving a damaged file behind.
if($prefix) {
    # We create a .c and .h file

    open(my $hfile, '>', "$prefix.h")
        or die "Error: couldn't create file $prefix.h: $!\n";
    open(my $cfile, '>', "$prefix.c")
        or die "Error: couldn't create file $prefix.c: $!\n";

    print {$hfile} <<MOO
/* This file was automatically generated using genlang */
/*
 * The str() macro/functions is how to access strings that might be
 * translated. Use it like str(MACRO) and expect a string to be
 * returned!
 */
#define str(x) language_strings[x]

/* this is the array for holding the string pointers.
   It will be initialized at runtime. */
extern unsigned char *language_strings[];
/* this contains the concatenation of all strings, separated by \\0 chars */
extern const unsigned char language_builtin[];

/* The enum below contains all available strings */
enum \{
MOO
    ;

    print {$cfile} <<MOO
/* This file was automaticly generated using genlang, the strings come
   from "$input" */
   
#include "$prefix.h"

unsigned char *language_strings[LANG_LAST_INDEX_IN_ARRAY];
const unsigned char language_builtin[] =
MOO
;

    # Output the ID names for the enum in the header file
    for my $i (1 .. $idcount) {
        my $name=$idnum[$i - 1]; # get the ID name

        $name =~ s/\"//g; # cut off the quotes

        printf {$hfile} ("    %s, /* %d */\n", $name, $i-1);
    }

    # Output separation marker for last string ID and the upcoming voice IDs

    print {$hfile} <<MOO
    LANG_LAST_INDEX_IN_ARRAY, /* this is not a string, this is a marker */
    /* --- below this follows voice-only strings --- */
    VOICEONLY_DELIMITER = 0x8000,
MOO
    ;

    # Output the voice-only ID names for the enum in the header file
    for my $i (0x8000 .. ($voiceid-1)) {
        my $name=$idnum[$i]; # get the ID name

        $name =~ s/\"//g; # cut off the quotes

        printf {$hfile} ("    %s,\n", $name);
    }

    # Output end of enum
    print {$hfile} "\n};\n/* end of generated enum list */\n";

    # Output the target phrases for the source file
    for my $i (1 .. $idcount) {
        my $name=$idnum[$i - 1]; # get the ID
        my $dest = $dest{$name}; # get the destination phrase

        $dest =~ s:\"$:\\0\":; # insert a \0 before the second quote

        if(!$dest) {
            # this is just to be on the safe side
            $dest = '"\0"';
        }

        printf {$cfile} ("    %s\n", $dest);
    }

    # Output end of string chunk
    print {$cfile} <<MOO
;
/* end of generated string list */
MOO
;

    close($hfile) or die "Error: couldn't write file $prefix.h: $!\n";
    close($cfile) or die "Error: couldn't write file $prefix.c: $!\n";
} # end of the c/h file generation
elsif($binary) {
    # Creation of a binary lang file was requested

    # We must first scan the english file to get the correct order of the id
    # numbers used there, as that is what sets the id order for all language
    # files. The english file is scanned before the translated file was
    # scanned.

    open(my $outf, '>', $binary)
        or die "Error: Can't create $binary: $!\n";
    binmode $outf;
    printf {$outf} ("\x1a%c%c", $langversion, $target_id); # magic lang file header

    # loop over the target phrases
    for my $i (1 .. $idcount) {
        my $name=$idnum[$i - 1]; # get the ID
        my $dest = $dest{$name}; # get the destination phrase

        if($dest) {
            $dest =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes

            # Now, make sure we get the number from the english sort order:
            my $engnum = $idmap{$name};

            # one record: the id as two bytes, high byte first, followed by
            # the string and a terminating zero byte
            printf {$outf} ("%c%c%s\x00", ($engnum>>8), ($engnum&0xff), $dest);
        }
    }

    close($outf) or die "Error: Can't write $binary: $!\n";
}
elsif($voiceout) {
    # voice output requested, display id: and voice: strings in a v1-like
    # fashion

    # Indexed by ID number. Holds the ID number (in the translated file's
    # numbering) of the phrase to print at that position, or -1 for "not
    # used".
    my @engl;

    # This loops over the strings in the translated language file order
    my @ids = ((0 .. ($idcount-1)));
    push @ids, (0x8000 .. ($voiceid-1));

    #for my $id (@ids) {
    #    print "$id\n";
    #}

    for my $i (@ids) {
        my $name=$idnum[$i]; # get the ID
        my $dest = $voice{$name}; # get the destination voice string

        if($dest) {
            # Now, make sure we get the number from the english sort order:
            my $engnum = $idmap{$name};

            if(length($engnum)) {
                $engl[$engnum] = $i;

                #print "Input index $i output index $engnum\n";
            }
            else {
                # not used, mark it so
                $engl[$i] = -1;
            }

        }
    }
    for my $i (@ids) {

        my $o = $engl[$i];

        if(($o < 0) || !length($o)) {
            print "#$i\nid: NOT_USED_$i\nvoice: \"\"\n";
            next;
        }

        my $name=$idnum[$o]; # get the ID
        my $dest = $voice{$name}; # get the destination voice string

        print "#$i ($o)\nid: $name\nvoice: $dest\n";
    }

}


# Verbose mode: finish with the number of IDs that were assigned and with
# the name/value pairs collected from the <header> section.
if($verbose) {
    printf("%d ID strings scanned\n", $idcount);

    print "* head *\n";
    for my $name (keys %head) {
        # The name comes from the input file. It is passed as an argument
        # and kept out of the format string, so that a '%' in it is printed
        # as it is.
        printf("%s: %s\n", $name, $head{$name});
    }
}