From sumomo ＠ users.sourceforge.jp Fri Jun 24 14:07:56 2011 From: sumomo ＠ users.sourceforge.jp (sumomo ＠ users.sourceforge.jp) Date: Fri, 24 Jun 2011 14:07:56 +0900 Subject: [Julius-cvs 699] CVS update: julius4/libjulius/src Message-ID: <1308892076.647095.32747.nullmailer@users.sourceforge.jp> Index: julius4/libjulius/src/m_fusion.c diff -u julius4/libjulius/src/m_fusion.c:1.19 julius4/libjulius/src/m_fusion.c:1.20 --- julius4/libjulius/src/m_fusion.c:1.19 Fri Apr 29 14:09:15 2011 +++ julius4/libjulius/src/m_fusion.c Fri Jun 24 14:07:56 2011 @@ -20,7 +20,7 @@ * @author Akinobu Lee * @date Thu May 12 13:31:47 2005 * - * $Revision: 1.19 $ + * $Revision: 1.20 $ * */ /* @@ -457,7 +457,9 @@ } /* set unknown (=OOV) word id */ - set_unknown_id(ngram, lmconf->unknown_name); + if (strcmp(lmconf->unknown_name, UNK_WORD_DEFAULT)) { + set_unknown_id(ngram, lmconf->unknown_name); + } /* map dict item to N-gram entry */ if (make_voca_ref(ngram, winfo) == FALSE) { From sumomo ＠ users.sourceforge.jp Fri Jun 24 14:07:56 2011 From: sumomo ＠ users.sourceforge.jp (sumomo ＠ users.sourceforge.jp) Date: Fri, 24 Jun 2011 14:07:56 +0900 Subject: [Julius-cvs 700] CVS update: julius4/libsent/src/ngram Message-ID: <1308892076.819160.32764.nullmailer@users.sourceforge.jp> Index: julius4/libsent/src/ngram/init_ngram.c diff -u julius4/libsent/src/ngram/init_ngram.c:1.7 julius4/libsent/src/ngram/init_ngram.c:1.8 --- julius4/libsent/src/ngram/init_ngram.c:1.7 Fri Apr 29 14:09:17 2011 +++ julius4/libsent/src/ngram/init_ngram.c Fri Jun 24 14:07:56 2011 @@ -12,7 +12,7 @@ * @author Akinobu LEE * @date Wed Feb 16 07:40:53 2005 * - * $Revision: 1.7 $ + * $Revision: 1.8 $ * */ /* @@ -50,6 +50,10 @@ jlog("Error: init_ngram: failed to close \"%s\"\n", bin_ngram_file); return FALSE; } + + /* set default unknown (=OOV) word id */ + set_default_unknown_id(ndata); + jlog("Stat: init_ngram: finished reading n-gram\n"); return TRUE; } @@ -83,8 +87,11 @@ jlog("Error: init_ngram: failed to close \"%s\"\n", 
ngram_file); return FALSE; } - jlog("Stat: init_ngram: finished reading n-gram\n"); + /* set default unknown (=OOV) word id */ + set_default_unknown_id(ndata); + + jlog("Stat: init_ngram: finished reading n-gram\n"); return TRUE; } @@ -159,32 +166,47 @@ } /** - * @brief Set unknown word ID to the N-gram data. - * + * @brief Set default unknown word ID to the N-gram data. + * If default "<unk>" is not found, also try "<UNK>". * * @param ndata [out] N-gram data to set unknown word ID. - * @param str [in] word name string of unknown word */ void -set_unknown_id(NGRAM_INFO *ndata, char *str) +set_default_unknown_id(NGRAM_INFO *ndata) { - ndata->unk_id = ngram_lookup_word(ndata, str); - if (ndata->unk_id == WORD_INVALID) { - if (strmatch(str, UNK_WORD_DEFAULT)) { - /* if default "<unk>" is not found, also try "<UNK>" */ - ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT2); - if (ndata->unk_id == WORD_INVALID) { - jlog("Stat: init_ngram: either \"%s\" and \"%s\" not found, assuming close vocabulary LM\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2); - ndata->isopen = FALSE; - return; - } + ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT); + if (ndata->unk_id != WORD_INVALID) { + jlog("Stat: init_ngram: found unknown word entry \"%s\"\n", UNK_WORD_DEFAULT); + ndata->isopen = TRUE; + } else { + ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT2); + if (ndata->unk_id != WORD_INVALID) { + jlog("Stat: init_ngram: found unknown word entry \"%s\"\n", UNK_WORD_DEFAULT2); + ndata->isopen = TRUE; + } else{ + jlog("Stat: init_ngram: neither \"%s\" nor \"%s\" was found, assuming close vocabulary LM\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2); + ndata->isopen = FALSE; } } - if (ndata->unk_id == WORD_INVALID) { - jlog("Stat: init_ngram: \"%s\" not found, assuming close vocabulary LM\n", str); - ndata->isopen = FALSE; + ndata->unk_num = 0; +} + +/** + * @brief Set user-specified word ID to the N-gram data. + * + * @param ndata [out] N-gram data to set unknown word ID. 
+ * @param str [in] word name string of unknown word + */ +void +set_unknown_id(NGRAM_INFO *ndata, char *str) +{ + WORD_ID w; + w = ngram_lookup_word(ndata, str); + if (w == WORD_INVALID) { + jlog("Stat: init_ngram: \"%s\" not found", str); } else { - jlog("Stat: init_ngram: unknown words will be mapped to \"%s\"\n", str); + jlog("Stat: init_ngram: unknown word entry was set to \"%s\"\n", str); + ndata->unk_id = w; ndata->isopen = TRUE; } } Index: julius4/libsent/src/ngram/ngram_util.c diff -u julius4/libsent/src/ngram/ngram_util.c:1.6 julius4/libsent/src/ngram/ngram_util.c:1.7 --- julius4/libsent/src/ngram/ngram_util.c:1.6 Fri Apr 29 14:09:17 2011 +++ julius4/libsent/src/ngram/ngram_util.c Fri Jun 24 14:07:56 2011 @@ -12,7 +12,7 @@ * @author Akinobu LEE * @date Wed Feb 16 17:18:55 2005 * - * $Revision: 1.6 $ + * $Revision: 1.7 $ * */ /* @@ -90,7 +90,9 @@ } if (ndata->isopen) { fprintf(fp, "\t OOV word = %s(id=%d)\n", ndata->wname[ndata->unk_id],ndata->unk_id); - fprintf(fp, "\t OOV size = %d words in dict\n", ndata->unk_num); + if (ndata->unk_num != 0) { + fprintf(fp, "\t OOV size = %d words in dict\n", ndata->unk_num); + } } else { fprintf(fp, "\t OOV word = none (assume close vocabulary)\n"); } From sumomo ＠ users.sourceforge.jp Fri Jun 24 14:07:56 2011 From: sumomo ＠ users.sourceforge.jp (sumomo ＠ users.sourceforge.jp) Date: Fri, 24 Jun 2011 14:07:56 +0900 Subject: [Julius-cvs 701] CVS update: julius4/libsent/include/sent Message-ID: <1308892076.720316.32755.nullmailer@users.sourceforge.jp> Index: julius4/libsent/include/sent/ngram2.h diff -u julius4/libsent/include/sent/ngram2.h:1.10 julius4/libsent/include/sent/ngram2.h:1.11 --- julius4/libsent/include/sent/ngram2.h:1.10 Fri Apr 29 14:09:16 2011 +++ julius4/libsent/include/sent/ngram2.h Fri Jun 24 14:07:56 2011 @@ -95,7 +95,7 @@ * @author Akinobu LEE * @date Fri Feb 11 15:04:02 2005 * - * $Revision: 1.10 $ + * $Revision: 1.11 $ * */ /* @@ -245,6 +245,7 @@ boolean init_ngram_bin(NGRAM_INFO *ndata, char 
*ngram_file); boolean init_ngram_arpa(NGRAM_INFO *ndata, char *ngram_file, int dir); boolean init_ngram_arpa_additional(NGRAM_INFO *ndata, char *bigram_file); +void set_default_unknown_id(NGRAM_INFO *ndata); void set_unknown_id(NGRAM_INFO *ndata, char *str); void print_ngram_info(FILE *fp, NGRAM_INFO *ndata);