Dictionary conversion tools.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6395 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
parent
a810a67db7
commit
52abc68b11
4 changed files with 216 additions and 1 deletions
|
@ -10,6 +10,7 @@ rockbox-style.el
|
||||||
sample.emacs
|
sample.emacs
|
||||||
buildzip.pl
|
buildzip.pl
|
||||||
romsizetest.pl
|
romsizetest.pl
|
||||||
|
wn2rdf.pl
|
||||||
make.inc
|
make.inc
|
||||||
makesrc.inc
|
makesrc.inc
|
||||||
fwpatcher/*.[ch]
|
fwpatcher/*.[ch]
|
||||||
|
|
|
@ -9,7 +9,7 @@
|
||||||
CFLAGS := -O -ansi -g
|
CFLAGS := -O -ansi -g
|
||||||
LDFLAGS := -g
|
LDFLAGS := -g
|
||||||
|
|
||||||
TARGETS := scramble descramble sh2d bmp2rb convbdf generate_rocklatin mkboot
|
TARGETS := scramble descramble sh2d bmp2rb rdf2binary convbdf generate_rocklatin mkboot
|
||||||
|
|
||||||
all: $(TARGETS)
|
all: $(TARGETS)
|
||||||
@echo "tools done"
|
@echo "tools done"
|
||||||
|
@ -26,6 +26,9 @@ sh2d: sh2d.c
|
||||||
bmp2rb: bmp2rb.c
|
bmp2rb: bmp2rb.c
|
||||||
$(CC) -DAPPLICATION_NAME=\"$@\" -g $+ -o $@
|
$(CC) -DAPPLICATION_NAME=\"$@\" -g $+ -o $@
|
||||||
|
|
||||||
|
rdf2binary: rdf2binary.c
|
||||||
|
$(CC) -g $+ -o $@
|
||||||
|
|
||||||
mkboot: mkboot.c
|
mkboot: mkboot.c
|
||||||
$(CC) -g $+ -o $@
|
$(CC) -g $+ -o $@
|
||||||
|
|
||||||
|
|
89
tools/rdf2binary.c
Normal file
89
tools/rdf2binary.c
Normal file
|
@ -0,0 +1,89 @@
|
||||||
|
/***************************************************************************
|
||||||
|
* __________ __ ___.
|
||||||
|
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
|
||||||
|
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
|
||||||
|
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
|
||||||
|
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
|
||||||
|
* \/ \/ \/ \/ \/
|
||||||
|
* $Id$
|
||||||
|
*
|
||||||
|
* Copyright (C) 2005 Miika Pekkarinen
|
||||||
|
*
|
||||||
|
* All files in this archive are subject to the GNU General Public License.
|
||||||
|
* See the file COPYING in the source tree root for full license agreement.
|
||||||
|
*
|
||||||
|
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
|
||||||
|
* KIND, either express or implied.
|
||||||
|
*
|
||||||
|
****************************************************************************/
|
||||||
|
|
||||||
|
/*
|
||||||
|
This tool converts the rdf file to the binary data used in the dict plugin.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
|
||||||
|
|
||||||
|
/* maximum word length, has to be the same in dict.c */
#define WORDLEN 32

/*
 * One fixed-size record of dict.index: the NUL-padded word and the byte
 * offset of its description inside dict.desc.  Records are written to disk
 * exactly as they sit in memory.
 */
struct word {
    char word[WORDLEN];
    long offset;
};

/*
 * Write exactly len bytes from data to fd, continuing after short writes.
 * Returns 0 on success, -1 if write() fails or makes no progress.
 */
static int write_all(int fd, const void *data, size_t len)
{
    const char *p = data;

    while (len > 0) {
        ssize_t n = write(fd, p, len);

        if (n <= 0)
            return -1;
        p += n;
        len -= (size_t)n;
    }
    return 0;
}

/*
 * Write the description that starts at desc, then every remaining
 * tab-separated field of the line currently being tokenised by strtok(),
 * putting a newline between consecutive fields.
 * Returns the number of bytes written, or -1 on a write error.
 */
static long write_description(int fd, const char *desc)
{
    long written = 0;

    while (desc != NULL) {
        size_t len = strlen(desc);

        if (write_all(fd, desc, len) < 0)
            return -1;
        written += (long)len;

        desc = strtok(NULL, "\t");
        if (desc != NULL) {
            /* Separator only between fields, never after the last one. */
            if (write_all(fd, "\n", 1) < 0)
                return -1;
            written++;
        }
    }
    return written;
}

/*
 * Convert dict.preparsed (one "word<TAB>desc[<TAB>desc...]" entry per line)
 * into dict.index (an array of struct word) and dict.desc (the description
 * text the index offsets point into).
 *
 * Exit status: 0 on success, 1 if a file could not be opened, 2 on a
 * malformed input line, 3 on a write error.
 */
int main()
{
    FILE *in;
    int idx_out, desc_out;
    struct word w;
    char buf[10000];
    long cur_offset = 0;
    int status = 0;

    in = fopen("dict.preparsed", "r");
    /* O_CREAT requires an explicit mode; O_TRUNC discards output left over
       from an earlier, longer run. */
    idx_out = open("dict.index", O_WRONLY | O_CREAT | O_TRUNC, 0666);
    desc_out = open("dict.desc", O_WRONLY | O_CREAT | O_TRUNC, 0666);

    if (in == NULL || idx_out < 0 || desc_out < 0) {
        fprintf(stderr, "Error: Some files couldn't be opened\n");
        status = 1;
    }

    while (status == 0 && fgets(buf, sizeof buf, in) != NULL) {
        /* It is safe to use strtok here */
        const char *word = strtok(buf, "\t");
        const char *desc = strtok(NULL, "\t");
        long desc_len;

        if (word == NULL || desc == NULL) {
            fprintf(stderr, "Parse error!\n");
            /* Passing NULL to %s is undefined, so substitute a marker. */
            fprintf(stderr, "word: %s\ndesc: %s\n",
                    word != NULL ? word : "(null)",
                    desc != NULL ? desc : "(null)");
            status = 2;
            break;
        }

        /* Zero the whole record first: this guarantees the word is
           NUL-terminated and keeps stale stack bytes out of the file. */
        memset(&w, 0, sizeof w);
        strncpy(w.word, word, WORDLEN - 1);
        w.offset = cur_offset;

        if (write_all(idx_out, &w, sizeof w) < 0) {
            status = 3;
            break;
        }

        desc_len = write_description(desc_out, desc);
        if (desc_len < 0) {
            status = 3;
            break;
        }
        cur_offset += desc_len;
    }

    if (status == 3)
        fprintf(stderr, "Error: Writing the output failed\n");

    if (in != NULL)
        fclose(in);
    if (idx_out >= 0)
        close(idx_out);
    if (desc_out >= 0)
        close(desc_out);

    return status;
}
|
||||||
|
|
122
tools/wn2rdf.pl
Normal file
122
tools/wn2rdf.pl
Normal file
|
@ -0,0 +1,122 @@
|
||||||
|
#! /usr/bin/perl -w
|
||||||
|
|
||||||
|
# Wordnet dictionary database converter
|
||||||
|
#
|
||||||
|
# Converts the Wordnet prolog data to rockbox dictionary format.
|
||||||
|
#
|
||||||
|
# Written by Miika Pekkarinen <slasher@ihme.org>
|
||||||
|
#
|
||||||
|
# $Id$
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
|
||||||
|
# Lookup tables, filled while the input files are read:
#   %words        - lower-cased word => { synset id => category digit }
#   %descriptions - synset id => description text
my (%words, %descriptions);
|
||||||
|
|
||||||
|
# Map a numeric category id to the one-letter tag used in the output.
# Ids 1..4 yield 'N', 'V', 'A' and 'A' respectively; anything else yields '?'.
sub getcatname {
    my $category_id = shift;

    # Tags indexed by (category id - 1).
    my @tags = ('N', 'V', 'A', 'A');

    foreach my $candidate (1 .. scalar @tags) {
        return $tags[$candidate - 1] if $category_id == $candidate;
    }

    return '?';
}
|
||||||
|
|
||||||
|
# Open the two WordNet prolog inputs and the intermediate output file.
# Three-argument open with lexical handles keeps the mode apart from the
# file name and lets the handles go out of scope with the script.
open my $in_word, '<', 'wn_s.pl'        or die "Open fail(#1): $!";
open my $in_desc, '<', 'wn_g.pl'        or die "Open fail(#2): $!";
open my $output,  '>', 'dict.preparsed' or die "Open fail(#3): $!";

print "Reading word file...\n";

# Read everything into memory
while (my $line = <$in_word>) {
    chomp $line;

    # s(100001740,1,'entity',n,1,11). => 100001740,1,'entity',n,1,11
    $line =~ s/(^s\()(.*)(\)\.$)/$2/;

    # Only the synset id (field 1) and the word (field 3) are needed.
    my ($seqid, undef, $word) = split /,/, $line, 6;

    # Blank or truncated lines carry no word; skipping them avoids creating
    # an empty entry that could never be linked to a description.
    next unless defined $seqid and defined $word;

    # 'entity' => entity
    $word =~ s/(^\')(.*)(\'$)/$2/;

    # Embedded quotes are doubled in the input; /g restores every one of
    # them, not just the first.
    $word =~ s/\'\'/\'/g;

    # The leading digit of the synset id is the category number.
    my $category = substr $seqid, 0, 1;

    $words{lc $word}{$seqid} = $category;
}

close $in_word;

print "Reading description file...\n";
while (my $line = <$in_desc>) {
    chomp $line;

    # g(100002056,'(a separate and self-contained entity)').
    # => 100002056,'(a separate and self-contained entity)'
    $line =~ s/(^g\()(.*)(\)\.$)/$2/;

    my ($seqid, $desc) = split /,/, $line, 2;

    # Skip blank or truncated lines instead of storing an undefined entry.
    next unless defined $seqid and defined $desc;

    # '(text)' => text
    $desc =~ s/(^\'\()(.*)(\)\'$)/$2/;
    $desc =~ s/\'\'/\'/g;

    $descriptions{$seqid} = $desc;
}

close $in_desc;

print "Sorting and writing output...\n";

# Now sort and find correct descriptions
foreach my $word (sort keys %words) {
    my %categories;

    # Find all definitions of the word.  The ids are visited in sorted
    # order so that repeated runs number the senses identically.
    foreach my $id (sort keys %{ $words{$word} }) {
        my $catid       = $words{$word}{$id};
        my $description = $descriptions{$id};

        if (!defined($description) or $description eq '') {
            # Report the synset id that has no description.
            print STDERR "Error: Failed to link word: $word / $id\n";
            exit 1;
        }

        push @{ $categories{$catid} }, $description;
    }

    # One "<tag>: 1. ... 2. ..." section per category, in this order:
    # 1 = noun
    # 2 = verb
    # 3 = adjective
    # 4 = adverb
    my @sections;
    for my $catid (1 .. 4) {
        my $descs = $categories{$catid} or next;

        my $n = 0;
        my @numbered = map { $n++; "$n. $_" } @{$descs};

        push @sections, getcatname($catid) . ': ' . join(' ', @numbered);
    }

    # Every word must end up with at least one section.
    die "Internal error" unless @sections;

    # Sections are tab-separated; the word itself is the first field.
    print {$output} "$word\t", join("\t", @sections), "\n";
}

# Buffered write errors only surface at close time, so check it.
close $output or die "Close fail: $!";

print "Done, output was successfully written!\n";
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue