Dictionary conversion tools.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6395 a1c6a512-1295-4272-9138-f99709370657
2005-05-02 15:05:07 +00:00 · 2005-05-02 15:05:07 +00:00 · 52abc68b11
commit 52abc68b11
parent a810a67db7
4 changed files with 216 additions and 1 deletions
--- a/tools/FILES
+++ b/tools/FILES
@ -10,6 +10,7 @@ rockbox-style.el
 sample.emacs
 buildzip.pl
 romsizetest.pl
+wn2rdf.pl
 make.inc
 makesrc.inc
 fwpatcher/*.[ch]
--- a/tools/Makefile
+++ b/tools/Makefile
@ -9,7 +9,7 @@
 CFLAGS := -O -ansi -g
 LDFLAGS := -g

-TARGETS := scramble descramble sh2d bmp2rb convbdf generate_rocklatin mkboot
+TARGETS := scramble descramble sh2d bmp2rb rdf2binary convbdf generate_rocklatin mkboot

 all: $(TARGETS)
 	@echo "tools done"
@ -26,6 +26,9 @@ sh2d: sh2d.c
 bmp2rb:	bmp2rb.c
 	$(CC) -DAPPLICATION_NAME=\"$@\" -g $+ -o $@

+rdf2binary:	rdf2binary.c
+	$(CC) -g $+ -o $@
+
 mkboot:	mkboot.c
 	$(CC) -g $+ -o $@

--- a/tools/rdf2binary.c
+++ b/tools/rdf2binary.c
@ -0,0 +1,89 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2005 Miika Pekkarinen
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+/*
+This tool converts the rdf file to the binary data used in the dict plugin.
+*/
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdio.h>
+
+/* maximum word length, has to be the same in dict.c */
+#define WORDLEN 32
+
+/* One record of dict.index: main() writes one of these per input line. */
+/* NOTE(review): the record is written as raw memory, so sizeof(long) and
+   struct padding on the build host presumably have to match what dict.c
+   expects - confirm against the plugin. */
+struct word {
+	char word[WORDLEN]; /* the word; main() copies at most WORDLEN - 1 chars */
+	long offset;        /* byte offset of the word's description in dict.desc */
+};
+
+/* Writes all len bytes of data to fd, retrying after short writes.
+   Returns 0 on success and -1 when write() fails. */
+static int write_all(int fd, const void *data, size_t len)
+{
+	const char *pos = data;
+	
+	while (len > 0) {
+		ssize_t done = write(fd, pos, len);
+		
+		if (done <= 0)
+			return -1;
+		
+		pos += done;
+		len -= (size_t)done;
+	}
+	
+	return 0;
+}
+
+/* Reports a failed write to the named output file; returns exit code 3. */
+static int write_failed(const char *name)
+{
+	fprintf(stderr, "Error: Writing %s failed\n", name);
+	return 3;
+}
+
+/*
+ * Converts "dict.preparsed" into the two files "dict.index" and "dict.desc".
+ *
+ * Every input line holds a word, a tab and one or more tab-separated
+ * description parts. For each line one struct word (the word plus the
+ * current offset into dict.desc) is appended to dict.index, and the
+ * description parts are appended to dict.desc joined by '\n'.
+ *
+ * Returns 0 on success, 1 if a file can't be opened or read, 2 on a
+ * malformed input line and 3 if writing an output file fails.
+ */
+int main(void)
+{
+	FILE *in;
+	int idx_out, desc_out;
+	struct word w;
+	char buf[10000];
+	long cur_offset = 0;
+	
+	in = fopen("dict.preparsed", "r");
+	/* O_CREAT needs an explicit mode argument. O_TRUNC keeps the tail of
+	   an older, longer output file from surviving a re-run. */
+	idx_out = open("dict.index", O_WRONLY | O_CREAT | O_TRUNC, 0666);
+	desc_out = open("dict.desc", O_WRONLY | O_CREAT | O_TRUNC, 0666);
+	
+	if (in == NULL || idx_out < 0 || desc_out < 0) {
+		fprintf(stderr, "Error: Some files couldn't be opened\n");
+		return 1;
+	}
+	
+	while (fgets(buf, sizeof buf, in) != NULL) {
+		const char *word;
+		const char *desc;
+		
+		/* A line that doesn't fit into buf would be continued by the next
+		   fgets() call and be parsed as a separate, bogus entry. */
+		if (strchr(buf, '\n') == NULL && !feof(in)) {
+			fprintf(stderr, "Error: Input line longer than %lu bytes\n",
+			        (unsigned long)(sizeof buf - 1));
+			return 2;
+		}
+		
+		/* It is safe to use strtok here */
+		word = strtok(buf, "\t");
+		desc = strtok(NULL, "\t");
+		
+		if (word == NULL || desc == NULL) {
+			fprintf(stderr, "Parse error!\n");
+			/* Passing NULL for %s is undefined, so substitute a marker */
+			fprintf(stderr, "word: %s\ndesc: %s\n",
+			        word != NULL ? word : "(null)",
+			        desc != NULL ? desc : "(null)");
+			
+			return 2;
+		}
+		
+		/* Zero the whole record first: strncpy() doesn't terminate a word
+		   of WORDLEN - 1 or more chars, and padding bytes would otherwise
+		   be written uninitialised. */
+		memset(&w, 0, sizeof w);
+		strncpy(w.word, word, WORDLEN - 1);
+		w.offset = cur_offset;
+		if (write_all(idx_out, &w, sizeof w) < 0)
+			return write_failed("dict.index");
+		
+		/* Only tabs are delimiters, so the newline kept by fgets() stays
+		   at the end of the last part and also ends the entry in dict.desc */
+		for (;;) {
+			size_t len = strlen(desc);
+			
+			if (write_all(desc_out, desc, len) < 0)
+				return write_failed("dict.desc");
+			cur_offset += (long)len;
+			
+			desc = strtok(NULL, "\t");
+			if (desc == NULL)
+				break;
+			
+			/* Separate the parts of one entry with a newline */
+			if (write_all(desc_out, "\n", 1) < 0)
+				return write_failed("dict.desc");
+			cur_offset++;
+		}
+	}
+	
+	if (ferror(in)) {
+		fprintf(stderr, "Error: Reading dict.preparsed failed\n");
+		return 1;
+	}
+	fclose(in);
+	
+	/* close() can still report a delayed write error */
+	if (close(idx_out) < 0)
+		return write_failed("dict.index");
+	if (close(desc_out) < 0)
+		return write_failed("dict.desc");
+	
+	return 0;
+}
+
--- a/tools/wn2rdf.pl
+++ b/tools/wn2rdf.pl
@ -0,0 +1,122 @@
+#! /usr/bin/perl -w
+
+# Wordnet dictionary database converter
+#
+# Converts the Wordnet prolog data to rockbox dictionary format.
+#
+# Written by Miika Pekkarinen <slasher@ihme.org>
+#
+# $Id$
+
+use strict;
+
+# Lookup tables
+my %words;
+my %descriptions;
+
+# Returns the one-letter tag for a numeric category id:
+# 1 gives 'N', 2 gives 'V', 3 and 4 both give 'A', anything else gives '?'.
+sub getcatname {
+	my ($catid) = @_;
+	
+	return $catid == 1 ? 'N'
+	     : $catid == 2 ? 'V'
+	     : $catid == 3 ? 'A'
+	     : $catid == 4 ? 'A'
+	     :               '?';
+}
+
+# Open both input files and the output file up front so that a missing
+# file is reported before any parsing starts. The three-argument form of
+# open keeps the mode separate from the file name.
+open IN_WORD, '<', "wn_s.pl" or die "Open fail(#1): $!";
+open IN_DESC, '<', "wn_g.pl" or die "Open fail(#2): $!";
+open OUTPUT, '>', "dict.preparsed" or die "Open fail(#3): $!";
+
+print "Reading word file...\n";
+
+# Read everything into memory:
+# $words{lowercased word}{synset id} = first digit of the synset id
+while (<IN_WORD>) {
+	chomp ;
+	
+	# s(100001740,1,'entity',n,1,11). => 100001740,1,'entity',n,1,11
+	s/^s\((.*)\)\.$/$1/;
+	
+	# Only the synset id (field 1) and the word (field 3) are used.
+	# NOTE(review): a word that itself contains a comma would be cut
+	# short by this split - confirm whether wn_s.pl has such entries.
+	my ($seqid, $word) = (split /,/, $_, 6)[0, 2];
+	
+	# 'entity' => entity
+	$word =~ s/^'(.*)'$/$1/;
+	# Undo the quote doubling; /g is needed because a word may contain
+	# more than one quote
+	$word =~ s/''/'/g;
+	
+	# The first digit of the synset id is the category of the word
+	my $category = substr $seqid, 0, 1;
+	
+	$words{lc $word}{$seqid} = $category;
+}
+
+close IN_WORD;
+
+print "Reading description file...\n";
+
+# Read every description into memory: $descriptions{synset id} = text
+while (<IN_DESC>) {
+	chomp ;
+	
+	# g(100002056,'(a separate and self-contained entity)').
+	# => 100002056,'(a separate and self-contained entity)'
+	s/^g\((.*)\)\.$/$1/;
+	
+	# Split at the first comma only, the description may contain more
+	my ($seqid, $desc) = split /,/, $_, 2;
+	
+	# '(a separate and self-contained entity)'
+	# => a separate and self-contained entity
+	$desc =~ s/^'\((.*)\)'$/$1/;
+	# Undo the quote doubling; /g is needed because a description may
+	# contain more than one quote
+	$desc =~ s/''/'/g;
+	
+	$descriptions{$seqid} = $desc;
+}
+
+close IN_DESC;
+
+print "Sorting and writing output...\n";
+
+# Write one line per word, in sorted order: the word, a tab and then one
+# tab-separated section per category, e.g. "N: 1. first 2. second".
+foreach my $word (sort keys %words) {
+	my %categories;
+	
+	# Find all definitions of the word, grouped by category. The ids are
+	# sorted so that every run numbers the definitions in the same order.
+	foreach my $id (sort keys %{$words{$word}}) {
+		my $catid = $words{$word}{$id};
+		my $description = $descriptions{$id};
+		
+		if (!defined($description) or $description eq '') {
+			# Name the id that has no description
+			print "Error: Failed to link word: $word / $id\n";
+			exit 1;
+		}
+		
+		push @{$categories{$catid}}, $description;
+	}
+	
+	my @sections;
+	
+	# 1 = noun
+	# 2 = verb
+	# 3 = adjective
+	# 4 = adverb
+	for my $catid (1 .. 4) {
+		my $descs = $categories{$catid} or next;
+		
+		# Number the definitions within each category starting from 1
+		my $n = 1;
+		my $catdesc = join ' ', map { sprintf '%d. %s', $n++, $_ } @$descs;
+		
+		push @sections, getcatname($catid) . ": $catdesc";
+	}
+	
+	# Every word must have a definition in at least one known category
+	die "Internal error" unless @sections;
+	
+	print OUTPUT "$word\t", join("\t", @sections), "\n";
+}
+
+# Buffered write errors (a full disk, for example) only show up on close
+close OUTPUT or die "Close fail: $!";
+
+print "Done, output was successfully written!\n";
+
+