CVS update: julius4/mkbingram (Julius-cvs 781) - Julius

Index: julius4/mkbingram/Makefile.in
diff -u julius4/mkbingram/Makefile.in:1.7 julius4/mkbingram/Makefile.in:1.8

--- julius4/mkbingram/Makefile.in:1.7	Fri Jul 27 17:44:57 2012
+++ julius4/mkbingram/Makefile.in	Sat Aug 11 19:44:02 2012
@@ -3,7 +3,7 @@
 # Copyright (c) 2005-2012 Julius project team, Nagoya Institute of Technology
 # All rights reserved
 #
-# $Id: Makefile.in,v 1.7 2012/07/27 08:44:57 sumomo Exp $
+# $Id: Makefile.in,v 1.8 2012/08/11 10:44:02 sumomo Exp $
 #
 SHELL=/bin/sh
 .SUFFIXES:
@@ -21,7 +21,7 @@
 exec_prefix=@exec_prefix@
 INSTALL=@INSTALL@
 
-OBJ=mkbingram.o
+OBJ=mkbingram.o charconv.o
 TARGET=mkbingram@EXEEXT@
 
 all: $(TARGET)
Index: julius4/mkbingram/charconv.c
diff -u /dev/null julius4/mkbingram/charconv.c:1.1
--- /dev/null	Sat Aug 11 19:44:02 2012
+++ julius4/mkbingram/charconv.c	Sat Aug 11 19:44:02 2012
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 1991-2012 Kawahara Lab., Kyoto University
+ * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
+ * Copyright (c) 2005-2012 Julius project team, Nagoya Institute of Technology
+ * All rights reserved
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#define strmatch !strcmp /* strmatch(a, b) is true when strcmp() reports equality */
+
+#if defined(_WIN32)
+
+/* winnls */
+#include <windows.h>
+#include <winnls.h>
+#define UNICODE_BUFFER_SIZE 4096 ///< Buffer length for unicode conversion
+static unsigned int from_cp;	///< Source codepage
+static unsigned int to_cp;	///< Target codepage
+static wchar_t unibuf[UNICODE_BUFFER_SIZE]; ///< buffer for unicode conversion
+
+#else
+
+/* iconv */
+#include <iconv.h>
+#include <errno.h>
+#include <stdlib.h>
+static iconv_t cd = (iconv_t)-1; ///< Conversion descriptor
+
+#endif
+
+static int convert_enabled = 0; ///< 1 if charset conversion is enabled
+
+
+#if defined(_WIN32)
+
+/**
+ * Translate a charset name or a decimal codepage number into a Windows codepage.
+ * @param codestr [in] charset name or codepage number string
+ * @param code [out] receives the codepage identifier
+ * @return 0 on success, -1 if the name is unknown or the codepage is not valid.
+ */
+static int
+str2code(char *codestr, unsigned int *code)
+{
+  if (strmatch(codestr, "euc-jp")
+      || strmatch(codestr, "euc")
+      || strmatch(codestr, "eucjp")) {
+    /* EUC-JP: Windows codepage 20932 */
+    *code = 20932;
+  } else if (strmatch(codestr, "ansi")) {
+    /* ANSI codepage (MBCS) ex. shift-jis in Windows XP Japanese edition.*/
+    *code = CP_ACP;
+  } else if (strmatch(codestr, "mac")) {
+    *code = CP_MACCP;	/* Macintosh codepage */
+  } else if (strmatch(codestr, "oem")) {
+    *code = CP_OEMCP;	/* OEM localized default codepage */
+  } else if (strmatch(codestr, "utf-7")) {
+    *code = CP_UTF7;
+  } else if (strmatch(codestr, "utf-8")) {
+    *code = CP_UTF8;
+  } else if (strmatch(codestr, "sjis")
+	     || strmatch(codestr, "sjis-win")
+	     || strmatch(codestr, "shift-jis")
+	     || strmatch(codestr, "shift_jis")) {
+    *code = 932;	/* sjis codepage */
+  } else if (codestr[0] >= '0' && codestr[0] <= '9') {
+    /* numeric argument: use it as the codepage number if Windows knows it */
+    *code = atoi(codestr);
+    if (! IsValidCodePage(*code)) {
+      fprintf(stderr, "Error: str2code: codepage \"%s\" not found\n", codestr);
+      return -1;
+    }
+  } else {
+    fprintf(stderr, "Error: str2code: unknown source codepage \"%s\"\n", codestr);
+    fprintf(stderr, "Error: str2code: valids are \"euc-jp\", \"ansi\", \"mac\", \"oem\", \"utf-7\", \"utf-8\", \"sjis\" and codepage number\n");
+    return -1;
+  }
+  return 0;
+}
+#endif
+
+
+/** 
+ * Set up charset conversion from one character encoding to another.
+ * Conversion is disabled on entry and re-enabled only when setup succeeds.
+ * 
+ * @param fromcode [in] input charset name (must not be NULL)
+ * @param tocode [in] output charset name (must not be NULL)
+ * 
+ * @return 0 on success, -1 on failure.
+ */
+int
+charconv_setup(char *fromcode, char *tocode)
+{
+  convert_enabled = 0;
+
+  if (fromcode == NULL || tocode == NULL) {
+    fprintf(stderr, "Error: charconv_setup: input code or output code not specified\n");
+    return -1;
+  }
+
+#if defined(_WIN32)
+  /* resolve both charset names to Windows codepages */
+  if (str2code(fromcode, &from_cp) == -1) {
+    fprintf(stderr, "Error: charconv_setup: unknown codepage specified\n");
+    return -1;
+  }
+  if (str2code(tocode, &to_cp) == -1) {
+    fprintf(stderr, "Error: charconv_setup: unknown codepage specified\n");
+    return -1;
+  }
+#else
+  /* clear already allocated descriptor */
+  if (cd != (iconv_t)-1) {
+    if (iconv_close(cd) < 0) {
+      fprintf(stderr, "Error: charconv_setup: failed to close iconv\n");
+      return -1;
+    }
+    cd = (iconv_t)-1;
+  }
+  /* allocate conversion descriptor */
+  cd = iconv_open(tocode, fromcode);
+  if (cd == (iconv_t)-1) {
+    /* allocation failed */
+    fprintf(stderr, "Error: charconv_setup: unknown charset name in \"%s\" or \"%s\"\n", fromcode, tocode);
+    fprintf(stderr, "Error: charconv_setup: do \"iconv --list\" to get the list of available charset names.\n");
+    return -1;
+  }
+#endif
+
+  convert_enabled = 1;
+  return(0);
+}
+
+/** 
+ * Apply charset conversion to a NUL-terminated string.
+ * The iconv build terminates the process on a conversion error.
+ * 
+ * @param instr [in] source string
+ * @param outstr [out] destination buffer
+ * @param maxoutlen [in] allocated length of outstr in bytes.
+ *
+ * @return outstr holding the converted string, or instr unchanged when
+ * conversion is disabled or cannot be performed.
+ */
+char *
+charconv(char *instr, char *outstr, int maxoutlen)
+{
+#if defined(_WIN32)
+
+  int unilen, newlen;
+
+  /* if disabled return instr itself */
+  if (convert_enabled == 0) return(instr); /* no conversion */
+
+  /* get length of unicode string */
+  unilen = MultiByteToWideChar(from_cp, 0, instr, -1, NULL, 0);
+  if (unilen <= 0) {
+    fprintf(stderr, "Error: charconv: conversion error?\n");
+    return(instr);
+  }
+  if (unilen > UNICODE_BUFFER_SIZE) {
+    fprintf(stderr, "Error: charconv: unicode buffer size exceeded (%d > %d)!\n", unilen, UNICODE_BUFFER_SIZE);
+    return(instr);
+  }
+  /* convert source string to unicode */
+  if (MultiByteToWideChar(from_cp, 0, instr, -1, unibuf, unilen) <= 0) {
+    fprintf(stderr, "Error: charconv: conversion error?\n");
+    return(instr);
+  }
+  /* get length of target string */
+  newlen = WideCharToMultiByte(to_cp, 0, unibuf, -1, NULL, 0, NULL, NULL);
+  if (newlen <= 0) {
+    fprintf(stderr, "Error: charconv: conversion error?\n");
+    return(instr);
+  }
+  if (newlen > maxoutlen) {
+    fprintf(stderr, "Error: charconv: target buffer size exceeded (%d > %d)!\n", newlen, maxoutlen);
+    return(instr);
+  }
+  /* convert unicode to target string */
+  if (WideCharToMultiByte(to_cp, 0, unibuf, -1, outstr, newlen, NULL, NULL) <= 0) {
+    fprintf(stderr, "Error: charconv: conversion error?\n");
+    return(instr);
+  }
+  return(outstr);
+
+#else
+
+  char *src, *dst;
+  size_t srclen, dstlen;
+
+  /* if disabled return instr itself */
+  if (convert_enabled == 0) return(instr); /* no conversion */
+
+  if (cd == (iconv_t)-1) {
+    fprintf(stderr, "Error: charconv: conversion descriptor not allocated\n");
+    return(instr);
+  }
+
+  /* start each string from the initial conversion state */
+  iconv(cd, NULL, NULL, NULL, NULL);
+  srclen = strlen(instr)+1;	/* +1 so the terminating NUL is converted too */
+  dstlen = (maxoutlen > 0) ? (size_t)maxoutlen : 0;
+  src = instr;
+  dst = outstr;
+  if (iconv(cd, &src, &srclen, &dst, &dstlen) == (size_t)-1) {
+    switch(errno) {
+    case EILSEQ:
+      fprintf(stderr, "Error: charconv: invalid multibyte sequence in the input\n"); exit(-1);
+    case EINVAL:
+      fprintf(stderr, "Error: charconv: incomplete multibyte sequence in the input\n"); exit(-1);
+    case E2BIG:
+      fprintf(stderr, "Error: charconv: converted string size exceeded buffer (>%d)\n", maxoutlen); exit(-1);
+    default:
+      fprintf(stderr, "Error: charconv: conversion failed (errno=%d)\n", errno); exit(-1);
+    }
+  }
+
+  return(outstr);
+
+#endif
+}
Index: julius4/mkbingram/charconv.h
diff -u /dev/null julius4/mkbingram/charconv.h:1.1
--- /dev/null	Sat Aug 11 19:44:02 2012
+++ julius4/mkbingram/charconv.h	Sat Aug 11 19:44:02 2012
@@ -0,0 +1,7 @@
+#ifndef __CHARCONV_H__
+#define __CHARCONV_H__
+/* Charset conversion interface; the definitions are in charconv.c. */
+int charconv_setup(char *fromcode, char *tocode); /* 0 on success, -1 on failure */
+char *charconv(char *instr, char *outstr, int maxoutlen); /* returns instr or outstr, whichever holds the result */
+
+#endif /* __CHARCONV_H__ */
Index: julius4/mkbingram/mkbingram.c
diff -u julius4/mkbingram/mkbingram.c:1.5 julius4/mkbingram/mkbingram.c:1.6
--- julius4/mkbingram/mkbingram.c:1.5	Fri Jul 27 17:44:57 2012
+++ julius4/mkbingram/mkbingram.c	Sat Aug 11 19:44:02 2012
@@ -18,7 +18,7 @@
  * @author Akinobu LEE
  * @date   Thu Mar 24 12:22:27 2005
  *
- * $Revision: 1.5 $
+ * $Revision: 1.6 $
  * 
  */
 /*
@@ -30,13 +30,15 @@
 
 /* mkbingram --- make binary n-gram for JULIUS from ARPA standard format */
 
-/* $Id: mkbingram.c,v 1.5 2012/07/27 08:44:57 sumomo Exp $ */
+/* $Id: mkbingram.c,v 1.6 2012/08/11 10:44:02 sumomo Exp $ */
 
 #include <sent/stddefs.h>
 #include <sent/ngram2.h>
 #include <sys/stat.h>
 #include <time.h>
 
+#include "charconv.h"
+
 static NGRAM_INFO *ngram;
 
 void
@@ -48,6 +50,7 @@
   printf("    -nlr file       forward  N-gram in ARPA format\n");
   printf("    -nrl file       backward N-gram in ARPA format\n");
   printf("    -d bingramfile  Julius binary N-gram file input\n");
+  printf("    -c from to      convert character code\n");
   printf("    -swap           swap \"%s\" and \"%s\"\n", BEGIN_WORD_DEFAULT, END_WORD_DEFAULT);
   printf("\n      When both \"-nlr\" and \"-nrl\" are specified, \n");
   printf("      Julius will use the BACKWARD N-gram as main LM\n");
@@ -66,9 +69,13 @@
   time_t now;
   char *binfile, *lrfile, *rlfile, *outfile;
   int i;
+  char *from_code, *to_code, *buf;
+  boolean charconv_enabled = FALSE;
   boolean force_swap = FALSE;
+  WORD_ID w;
 
   binfile = lrfile = rlfile = outfile = NULL;
+  from_code = to_code = NULL;
   if (argc <= 1) {
     usage(argv[0]);
     return -1;
@@ -106,6 +113,21 @@
 	  usage(argv[0]);
 	  return -1;
 	}
+      } else if (argv[i][1] == 'c') {
+	if (++i >= argc) {
+	  printf("Error: no argument for option \"%s\"\n", argv[i]);
+	  usage(argv[0]);
+	  return -1;
+	}
+	from_code = strcpy((char*)mymalloc(strlen(argv[i])+1), argv[i]);
+	if (++i >= argc) {
+	  printf("Error: no argument for option \"%s\"\n", argv[i]);
+	  usage(argv[0]);
+	  free(from_code);
+	  return -1;
+	}
+	to_code = strcpy((char*)mymalloc(strlen(argv[i])+1),argv[i]);
+	charconv_enabled = TRUE;
       } else if (argv[i][1] == 's') {
 	force_swap = TRUE;
       }
@@ -184,6 +206,21 @@
   }
 
   print_ngram_info(stdout, ngram);
+  
+  if (charconv_enabled == TRUE) {
+    /* do character conversion */
+    if (charconv_setup(from_code, to_code) == -1) {
+      fprintf(stderr, "failed to setup character convertsion\n");
+      return -1;
+    }
+    buf = (char *)mymalloc(4096);
+    for (w = 0; w < ngram->max_word_num; w++) {
+      charconv(ngram->wname[w], buf, 4096);
+      ngram->wname[w] = mybmalloc2(strlen(buf)+1, &(ngram->mroot));
+      strcpy(ngram->wname[w], buf);
+    }
+    free(buf);
+  }
 
   /* write in JULIUS binary format */
   if ((fp = fopen_writefile(outfile)) == NULL) {


Julius

[Julius-cvs 781] CVS update: julius4/mkbingram