Dictionary conversion tools.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6395 a1c6a512-1295-4272-9138-f99709370657
2005-05-02 15:05:07 +00:00 · 2005-05-02 15:05:07 +00:00 · 52abc68b11
commit 52abc68b11
parent a810a67db7
4 changed files with 216 additions and 1 deletions
--- a/tools/FILES
+++ b/tools/FILES
@ -10,6 +10,7 @@ rockbox-style.el
 sample.emacs
 buildzip.pl
 romsizetest.pl
 wn2rdf.pl
 make.inc
 makesrc.inc
 fwpatcher/*.[ch]
--- a/tools/Makefile
+++ b/tools/Makefile
@ -9,7 +9,7 @@
 CFLAGS := -O -ansi -g
 LDFLAGS := -g
-TARGETS := scramble descramble sh2d bmp2rb convbdf generate_rocklatin mkboot
+TARGETS := scramble descramble sh2d bmp2rb rdf2binary convbdf generate_rocklatin mkboot
 all: $(TARGETS)
 	@echo "tools done"
@ -26,6 +26,9 @@ sh2d: sh2d.c
 bmp2rb:	bmp2rb.c
 	$(CC) -DAPPLICATION_NAME=\"$@\" -g $+ -o $@
 rdf2binary:	rdf2binary.c
 	$(CC) -g $+ -o $@
 mkboot:	mkboot.c
 	$(CC) -g $+ -o $@
--- a/tools/rdf2binary.c
+++ b/tools/rdf2binary.c
@ -0,0 +1,89 @@
 /***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
 * Copyright (C) 2005 Miika Pekkarinen
 *
 * All files in this archive are subject to the GNU General Public License.
 * See the file COPYING in the source tree root for full license agreement.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/
 /*
 This tool converts the rdf file to the binary data used in the dict plugin.
 */
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <string.h>
 #include <unistd.h>
 /* maximum word length, has to be the same in dict.c */
 #define WORDLEN 32
 /*
  * One record of dict.index; main() writes this struct to the file as raw
  * bytes, so its in-memory layout is the on-disk format.
  * NOTE(review): sizeof(long), padding and byte order are those of the
  * machine running this tool - confirm they match what dict.c expects.
  */
 struct word {
 	/* the word, copied with strncpy() limited to WORDLEN - 1 bytes */
 	char word[WORDLEN];
 	/* byte offset of this word's description inside dict.desc */
 	long offset;
 };
 /* Write exactly len bytes to fd; returns 0 on success, -1 on failure. */
 static int write_all(int fd, const void *data, size_t len)
 {
 	const char *p = data;
 	while (len > 0) {
 		ssize_t n = write(fd, p, len);
 		/* a zero-byte write would loop forever, so treat it as an error */
 		if (n <= 0)
 			return -1;
 		p += n;
 		len -= (size_t)n;
 	}
 	return 0;
 }
 /* Report a failed write to the named output file; returns the exit code. */
 static int write_failed(const char *name)
 {
 	fprintf(stderr, "Error: Write to %s failed\n", name);
 	return 3;
 }
 /*
  * Convert "dict.preparsed" into "dict.index" and "dict.desc", all in the
  * current directory.
  *
  * Each input line is a word, a tab, and one or more tab-separated
  * description parts.  For every line one struct word (the word plus the
  * current byte offset into dict.desc) is appended to dict.index, and the
  * description parts are appended to dict.desc joined by '\n'.  The line's
  * own trailing newline stays attached to the last part.
  *
  * Returns 0 on success, 1 if a file cannot be opened or read, 2 on a
  * malformed or over-long input line, 3 if writing an output file fails.
  */
 int main(void)
 {
 	FILE *in;
 	int idx_out, desc_out;
 	struct word w;
 	char buf[10000];
 	long cur_offset = 0;
 	in = fopen("dict.preparsed", "r");
 	/* O_CREAT requires an explicit mode; O_TRUNC drops stale data left
 	 * over from an earlier, longer run */
 	idx_out = open("dict.index", O_WRONLY | O_CREAT | O_TRUNC, 0666);
 	desc_out = open("dict.desc", O_WRONLY | O_CREAT | O_TRUNC, 0666);
 	if (in == NULL || idx_out < 0 || desc_out < 0) {
 		fprintf(stderr, "Error: Some files couldn't be opened\n");
 		return 1;
 	}
 	while (fgets(buf, sizeof buf, in) != NULL) {
 		const char *word;
 		const char *desc;
 		/* No newline means either the last line of the file or a line
 		 * that did not fit in buf; the latter would otherwise be split
 		 * into bogus entries */
 		if (strchr(buf, '\n') == NULL) {
 			int c = getc(in);
 			if (c != EOF) {
 				fprintf(stderr, "Parse error!\n");
 				fprintf(stderr, "line longer than %lu bytes\n",
 					(unsigned long)(sizeof buf - 1));
 				return 2;
 			}
 		}
 		/* It is safe to use strtok here */
 		word = strtok(buf, "\t");
 		desc = strtok(NULL, "\t");
 		if (word == NULL || desc == NULL) {
 			fprintf(stderr, "Parse error!\n");
 			/* never hand a NULL pointer to %s */
 			fprintf(stderr, "word: %s\ndesc: %s\n",
 				word != NULL ? word : "(null)",
 				desc != NULL ? desc : "(null)");
 			return 2;
 		}
 		/* Zero the whole record first: this null-terminates words of
 		 * WORDLEN - 1 bytes or more and keeps uninitialised stack
 		 * bytes (including padding) out of the index file */
 		memset(&w, 0, sizeof w);
 		strncpy(w.word, word, WORDLEN - 1);
 		w.offset = cur_offset;
 		if (write_all(idx_out, &w, sizeof w) < 0)
 			return write_failed("dict.index");
 		while (1) {
 			size_t len = strlen(desc);
 			cur_offset += (long)len;
 			if (write_all(desc_out, desc, len) < 0)
 				return write_failed("dict.desc");
 			desc = strtok(NULL, "\t");
 			if (desc == NULL)
 				break;
 			/* tabs between description parts become newlines */
 			cur_offset++;
 			if (write_all(desc_out, "\n", 1) < 0)
 				return write_failed("dict.desc");
 		}
 	}
 	if (ferror(in)) {
 		fprintf(stderr, "Error: Reading dict.preparsed failed\n");
 		return 1;
 	}
 	fclose(in);
 	/* close() can report a deferred write error */
 	if (close(idx_out) < 0)
 		return write_failed("dict.index");
 	if (close(desc_out) < 0)
 		return write_failed("dict.desc");
 	return 0;
 }
--- a/tools/wn2rdf.pl
+++ b/tools/wn2rdf.pl
@ -0,0 +1,122 @@
 #! /usr/bin/perl -w
 # Wordnet dictionary database converter
 #
 # Converts the Wordnet prolog data to rockbox dictionary format.
 #
 # Written by Miika Pekkarinen <slasher@ihme.org>
 #
 # $Id$
 use strict;
 # Lookup tables
 # %words:        lower-cased word => { id => first character of that id }
 # %descriptions: id => description text
 my %words;
 my %descriptions;
 # Translate a numeric category id into the one-letter tag that is put in
 # front of that category's definitions.  Ids 1 and 2 give 'N' and 'V',
 # ids 3 and 4 both give 'A', and every other value gives '?'.
 # The id is compared numerically.
 sub getcatname {
 	my ($catid) = @_;
 	if ($catid == 1) {
 		return 'N';
 	}
 	elsif ($catid == 2) {
 		return 'V';
 	}
 	elsif ($catid == 3 or $catid == 4) {
 		return 'A';
 	}
 	return '?';
 }
 # Main conversion: read wn_s.pl (words) and wn_g.pl (descriptions) from the
 # current directory and write dict.preparsed, one line per word:
 #   word<TAB>N: 1. desc 2. desc<TAB>V: 1. desc ...
 # Lexical handles and three-argument open keep the file names from being
 # interpreted as open modes.
 open my $in_word, '<', 'wn_s.pl' or die "Open fail(#1): $!";
 open my $in_desc, '<', 'wn_g.pl' or die "Open fail(#2): $!";
 open my $output, '>', 'dict.preparsed' or die "Open fail(#3): $!";
 print "Reading word file...\n";
 # Read everything into memory
 while (my $line = <$in_word>) {
 	# Drop the line ending (LF or CRLF) and skip empty lines
 	$line =~ s/\s+\z//;
 	next if $line eq '';
 	# s(100001740,1,'entity',n,1,11). => 100001740,1,'entity',n,1,11
 	unless ($line =~ s/^s\((.*)\)\.$/$1/) {
 		warn "Skipping unparseable line $. of wn_s.pl\n";
 		next;
 	}
 	# Only the id and the word are used; the other fields are ignored
 	my ($seqid, undef, $word) = split /,/, $line, 6;
 	unless (defined $word) {
 		warn "Skipping incomplete line $. of wn_s.pl\n";
 		next;
 	}
 	# 'entity' => entity
 	$word =~ s/^'(.*)'$/$1/;
 	# A quote inside the quoted word is doubled; undo every occurrence,
 	# not just the first one
 	$word =~ s/''/'/g;
 	# The first digit of the id selects the category (see getcatname)
 	my $category = substr $seqid, 0, 1;
 	$words{lc $word}{$seqid} = $category;
 }
 close $in_word;
 print "Reading description file...\n";
 while (my $line = <$in_desc>) {
 	$line =~ s/\s+\z//;
 	next if $line eq '';
 	# g(100002056,'(a separate and self-contained entity)').
 	# => 100002056,'(a separate and self-contained entity)'
 	unless ($line =~ s/^g\((.*)\)\.$/$1/) {
 		warn "Skipping unparseable line $. of wn_g.pl\n";
 		next;
 	}
 	my ($seqid, $desc) = split /,/, $line, 2;
 	unless (defined $desc) {
 		warn "Skipping incomplete line $. of wn_g.pl\n";
 		next;
 	}
 	# Strip the surrounding quotes first and the enclosing parentheses
 	# second, so a description that is not wrapped in parentheses still
 	# loses its quotes
 	$desc =~ s/^'(.*)'$/$1/;
 	$desc =~ s/^\((.*)\)$/$1/;
 	$desc =~ s/''/'/g;
 	$descriptions{$seqid} = $desc;
 }
 close $in_desc;
 print "Sorting and writing output...\n";
 # Now sort and find correct descriptions
 foreach my $word (sort keys %words) {
 	my %categories;
 	# Find all definitions of the word.  The ids are sorted so that the
 	# numbering of the definitions is the same on every run.
 	foreach my $id (sort keys %{$words{$word}}) {
 		my $catid = $words{$word}{$id};
 		my $description = $descriptions{$id};
 		if (!defined($description) or $description eq '') {
 			# Report the id itself rather than the hash reference
 			print STDERR "Error: Failed to link word: $word / $id\n";
 			exit 1;
 		}
 		push @{$categories{$catid}}, $description;
 	}
 	my @parts;
 	# 1 = noun
 	# 2 = verb
 	# 3 = adjective
 	# 4 = adverb
 	for my $catid (1 .. 4) {
 		my $descs = $categories{$catid} or next;
 		my $n = 0;
 		# "1. first 2. second ..."
 		my $catdesc = join ' ', map { ++$n . ". $_" } @{$descs};
 		push @parts, getcatname($catid) . ": $catdesc";
 	}
 	die "Internal error" unless @parts;
 	print {$output} "$word\t", join("\t", @parts), "\n"
 	  or die "Write fail: $!";
 }
 # Buffered write errors only show up when the handle is closed
 close $output or die "Close fail: $!";
 print "Done, output was successfully written!\n";