/*************************************************************************************************
 * Implementation of common features
 *                                                      Copyright (C) 2003-2004 Mikio Hirabayashi
 * This file is part of Estraier, a personal full-text search system.
 * Estraier is free software; you can redistribute it and/or modify it under the terms of the GNU
 * General Public License as published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 * Estraier is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with Estraier;
 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA.
 *************************************************************************************************/


#include "estcommon.h"
#include "estregex.h"
#include "estdlfunc.h"
#include "estcjkuni.h"
#include "estchasen.h"
#include "estmecab.h"
#include "estkakasi.h"


/* private function prototypes
   (file-local helpers; their definitions are expected in the "private objects" section
   further down in this file) */
static char *esttextcatlines(const char *text, int qb, int *sp);
static char *esthtmlrawtext(const char *html);
static char *estmimerawtext(const char *mime);
static void estnormalizetext(char *text, int size, int *sp);
static int estistrustedenc(const char *name);
static void estdocaddcjktext(ODDOC *doc, const char *text, int size, int mode);
static void estdocaddlatintext(ODDOC *doc, const char *text, int size, int mode);
static int estisstopworden(const char *word);
static const char *estwildtoregex(const char *word);



/*************************************************************************************************
 * public objects
 *************************************************************************************************/


/* Make a document handle from a plain text.
   `uri' specifies the URI handed to `oddocopen'.
   `text' specifies the pointer to the region of the text.
   `size' specifies the size of the region.
   `code' specifies the name of the character encoding.  If it is `NULL', the name returned
   by `cbencname' is used.
   The return value is the new document handle, with the attributes "type", "encoding", and
   "size" set and the words of the text registered. */
ODDOC *estdocplain(const char *uri, const char *text, int size, const char *code){
  ODDOC *doc;
  char sizestr[ESTNUMBUFSIZ], *utf, *body;
  int bsiz;
  assert(uri && text && size >= 0);
  doc = oddocopen(uri);
  /* fall back on detection when the caller gives no encoding */
  if(!code) code = cbencname(text, size);
  /* register the attributes */
  sprintf(sizestr, "%d", size);
  oddocaddattr(doc, "type", "text/plain");
  oddocaddattr(doc, "encoding", code);
  oddocaddattr(doc, "size", sizestr);
  /* convert into UTF-8, or keep the raw bytes if the conversion fails */
  utf = cbiconv(text, size, code, "UTF-8", NULL, NULL);
  if(!utf) utf = cbmemdup(text, size);
  /* join the lines and register the words */
  body = esttextcatlines(utf, FALSE, &bsiz);
  estdocaddtext(doc, body, bsiz, "UTF-8", ESTDOCBOTH);
  free(body);
  free(utf);
  return doc;
}


/* Make a document handle from a HTML.
   `uri' specifies the URI handed to `oddocopen'.
   `text' specifies the pointer to the region of the HTML source.
   `size' specifies the size of the region.
   `code' specifies the name of the character encoding.  If it is `NULL', the name returned
   by `cbencname' is used first, and a charset declared in a
   `<meta http-equiv="content-type" ...>' element before `</head>' may then replace it.
   The return value is the new document handle.  The attributes "type", "encoding", and
   "size" are always set; "title" and "author" are set when the source provides them. */
ODDOC *estdochtml(const char *uri, const char *text, int size, const char *code){
  ODDOC *doc;
  CBLIST *elems;
  char numbuf[ESTNUMBUFSIZ], *sel, *menc, *tmp, *raw, *rp;
  const char *pcode, *elem;
  int i, j, ssiz, miss, body, ign, rsiz;
  assert(uri && text && size >= 0);
  /* create the handle */
  doc = oddocopen(uri);
  /* detect the encoding */
  pcode = code;
  if(!code) code = cbencname(text, size);
  /* normalize the encoding */
  if(!(sel = cbiconv(text, size, code, "UTF-8", &ssiz, NULL))){
    /* the conversion failed: scan the raw bytes, whose length is `size' */
    sel = cbmemdup(text, size);
    ssiz = size;
  }
  /* detect the encoding and normalize it using a meta element */
  menc = NULL;
  if(!pcode){
    for(i = 0; i < ssiz; i++){
      if(sel[i] == '<'){
        if(cbstrfwimatch(sel + i, "<meta")){
          i += 5;
          /* strchr() also matches the terminating '\0', so stop there explicitly */
          while(i < ssiz && sel[i] != '\0' && strchr(" \t\n\r", sel[i])){
            i++;
          }
          if(cbstrfwimatch(sel + i, "http-equiv=\"content-type\"")){
            while(i < ssiz && sel[i] != '>'){
              if(cbstrfwimatch(sel + i, "charset=")){
                i += 8;
                j = i;
                while(j < ssiz && sel[j] != '"' && sel[j] != '\'' &&
                      sel[j] != '>' && sel[j] != '\n'){
                  j++;
                }
                menc = cbmemdup(sel + i, j - i);
                break;
              }
              i++;
            }
          }
        } else if(cbstrfwimatch(sel + i, "</head>")){
          break;
        }
      }
      if(menc) break;
    }
  }
  if(menc){
    /* reconvert only when the declared charset differs from the guessed one */
    if(cbstricmp(code, menc)){
      if((tmp = cbiconv(text, size, menc, "UTF-8", NULL, &miss)) != NULL){
        /* accept the declared charset if few characters were missed or it is trusted */
        if(miss <= ESTENCMISSMAX || estistrustedenc(menc)){
          free(sel);
          sel = tmp;
          code = menc;
        } else {
          free(tmp);
        }
      }
    }
  }
  /* process each element */
  elems = cbxmlbreak(sel, TRUE);
  body = FALSE;
  ign = FALSE;
  for(i = 0; i < cblistnum(elems); i++){
    elem = cblistval(elems, i, NULL);
    if(cbstrfwimatch(elem, "<title")){
      /* process the title: the element following the tag is taken as its text */
      i++;
      if(i < cblistnum(elems)){
        elem = cblistval(elems, i, NULL);
        if(elem[0] == '<') elem = "";
        raw = esthtmlrawtext(elem);
        rsiz = strlen(raw);
        /* replace control characters with spaces */
        for(j = 0; j < rsiz; j++){
          if(((unsigned char *)raw)[j] < 0x20) raw[j] = 0x20;
        }
        estdocaddtext(doc, raw, rsiz, "UTF-8", ESTDOCNONLY);
        /* trim trailing spaces, then skip leading ones, for the attribute value */
        for(j = rsiz - 1; j >= 0; j--){
          if(((unsigned char *)raw)[j] > 0x20) break;
          raw[j] = '\0';
        }
        rp = raw;
        while(*rp != '\0'){
          if(*(unsigned char *)rp > 0x20) break;
          rp++;
        }
        rsiz = strlen(rp);
        if(rsiz > 0) oddocaddattr(doc, "title", rp);
        free(raw);
      }
    } else if(cbstrfwimatch(elem, "<meta")){
      /* process the author */
      elem += 5;
      /* every whitespace skip below checks for '\0' because strchr() matches it too */
      while(*elem != '\0' && strchr(" \t\n\r", *elem)){
        elem++;
      }
      if(cbstrfwimatch(elem, "name=\"author\"")){
        elem += 13;
        while(*elem != '\0' && strchr(" \t\n\r", *elem)){
          elem++;
        }
        if(cbstrfwimatch(elem, "content=\"")){
          elem += 9;
          while(*elem != '\0' && strchr(" \t\n\r", *elem)){
            elem++;
          }
          j = 0;
          while(elem[j] != '\0' && !strchr("\"\n\r", elem[j])){
            j++;
          }
          if(j > 0){
            tmp = cbmemdup(elem, j);
            raw = esthtmlrawtext(tmp);
            oddocaddattr(doc, "author", raw);
            free(raw);
            free(tmp);
          }
        }
      }
    } else if(cbstrfwimatch(elem, "<body")){
      body = TRUE;
    } else if(cbstrfwimatch(elem, "<script") || cbstrfwimatch(elem, "<style")){
      /* the element right after a script or style tag is not indexed */
      ign = TRUE;
    } else if(body && !ign && elem[0] != '<'){
      raw = esthtmlrawtext(elem);
      rsiz = strlen(raw);
      estdocaddtext(doc, raw, rsiz, "UTF-8", ESTDOCBOTH);
      free(raw);
    } else {
      ign = FALSE;
    }
  }
  if(!body){
    /* no body tag was found: index every text element except the one after the title tag */
    for(i = 0; i < cblistnum(elems); i++){
      elem = cblistval(elems, i, NULL);
      if(cbstrfwimatch(elem, "<title")){
        i++;
        continue;
      }
      if(elem[0] == '<') continue;
      raw = esthtmlrawtext(elem);
      rsiz = strlen(raw);
      estdocaddtext(doc, raw, rsiz, "UTF-8", ESTDOCBOTH);
      free(raw);
    }
  }
  /* set attributes */
  oddocaddattr(doc, "type", "text/html");
  oddocaddattr(doc, "encoding", code);
  sprintf(numbuf, "%d", size);
  oddocaddattr(doc, "size", numbuf);
  /* release resources (`code' may point into `menc', so it is freed only here) */
  cblistclose(elems);
  free(menc);
  free(sel);
  return doc;
}


/* Make a document handle from a MIME.
   `uri' specifies the URI handed to `oddocopen'.
   `text' specifies the NUL-terminated MIME message (headers, an empty line, then the body).
   `code' specifies the name of the character encoding.  If it is `NULL', the charset of the
   "content-type" header is used, or else the name returned by `cbencname'.
   `nude' specifies whether the attributes of the decoded body (the first part of a multipart
   message, a HTML body, or a plain text body) are registered again over those of the
   envelope.
   The return value is the new document handle.  For a multipart message only the region up
   to the second boundary is parsed, recursively, as a MIME entity. */
ODDOC *estdocmime(const char *uri, const char *text, const char *code, int nude){
  ODDOC *doc, *tdoc;
  CBMAP *map;
  const CBLIST *nwords, *awords;
  const char *cbuf, *ep, *tval;
  char sizebuf[ESTNUMBUFSIZ], *line, *raw, *type, *enc, *bound, *dtext, *sel, *naked;
  int i, j, size, pv, len, csiz, hit, ssiz, blen, nsiz, miss;
  assert(uri && text);
  size = strlen(text);
  sprintf(sizebuf, "%d", size);
  doc = oddocopen(uri);
  map = cbmapopenex(ESTPETITBNUM);
  /* parse headers: a line ends at a newline not followed by a space or a tab (folding) */
  pv = 0;
  for(i = 0; text[i] != '\0'; i++){
    if(text[i] == '\n' && text[i+1] != ' ' && text[i+1] != '\t'){
      if(i > pv){
        len = i - pv;
        line = cbmemdup(text + pv, len);
        /* strip trailing line terminators */
        for(j = len - 1; j >= 0; j--){
          if(line[j] == '\n' || line[j] == '\r'){
            line[j] = '\0';
            len--;
          } else {
            break;
          }
        }
        /* an empty line ends the header section */
        if(len < 1){
          free(line);
          break;
        }
        if((cbuf = strchr(line, ':')) != NULL){
          /* header names are stored in lower case */
          csiz = cbuf - line;
          for(j = 0; j < csiz; j++){
            if(line[j] >= 'A' && line[j] <= 'Z') line[j] += 'a' - 'A';
          }
          cbuf++;
          while(*cbuf != '\0' && (*cbuf == ' ' || *cbuf == '\t')){
            cbuf++;
          }
          cbmapput(map, line, csiz, cbuf, -1, DP_DOVER);
        }
        free(line);
      } else {
        break;
      }
      pv = i + 1;
    }
  }
  /* move to the body, skipping the newline where the header scan stopped */
  text += i;
  size -= i;
  if(size > 0){
    text++;
    size--;
  }
  /* get type and encoding */
  type = NULL;
  enc = NULL;
  bound = NULL;
  if((cbuf = cbmapget(map, "content-type", -1, &csiz)) != NULL){
    if((ep = strchr(cbuf, ';')) != NULL){
      type = cbmemdup(cbuf, ep - cbuf);
      ep++;
      while(*ep != '\0' && strchr(" \t\r\n", *ep)){
        ep++;
      }
      /* walk the `;'-separated parameters */
      while(TRUE){
        if(cbstrfwimatch(ep, "charset=")){
          ep += 8;
          if(*ep == '"') ep++;
          cbuf = ep;
          while(*ep != '\0' && !strchr("; \t\r\n\"", *ep)){
            ep++;
          }
          free(enc);
          enc = cbmemdup(cbuf, ep - cbuf);
        } else if(cbstrfwimatch(ep, "boundary=")){
          ep += 9;
          if(*ep == '"') ep++;
          cbuf = ep;
          while(*ep != '\0' && !strchr(" \t\r\n\"", *ep)){
            ep++;
          }
          free(bound);
          bound = cbmemdup(cbuf, ep - cbuf);
        }
        if(!(ep = strchr(ep, ';'))) break;
        ep++;
        /* strchr() also matches the terminating '\0', so a trailing `;' must stop here */
        while(*ep != '\0' && strchr(" \t\r\n", *ep)){
          ep++;
        }
      }
    } else {
      type = cbmemdup(cbuf, csiz);
    }
  }
  if(!type) type = cbmemdup("text/plain", -1);
  /* decode the transfer encoding */
  dtext = NULL;
  if((cbuf = cbmapget(map, "content-transfer-encoding", -1, NULL)) != NULL){
    if(!cbstricmp(cbuf, "quoted-printable")){
      /* decode only if no byte outside 0x09-0x7e appears before the first "--" */
      hit = FALSE;
      for(i = 0; text[i] != '\0'; i++){
        if(text[i] < 0x9 || text[i] > 0x7e){
          hit = TRUE;
          break;
        }
        if(text[i] == '-' && text[i+1] == '-') break;
      }
      if(!hit){
        dtext = cbquotedecode(text, NULL);
        text = dtext;
        size = strlen(dtext);
      }
    } else if(!cbstricmp(cbuf, "base64")){
      /* same sanity check as for quoted-printable */
      hit = FALSE;
      for(i = 0; text[i] != '\0'; i++){
        if(text[i] < 0x9 || text[i] > 0x7e){
          hit = TRUE;
          break;
        }
        if(text[i] == '-' && text[i+1] == '-') break;
      }
      if(!hit){
        dtext = cbbasedecode(text, NULL);
        text = dtext;
        size = strlen(dtext);
      }
    }
  }
  /* an explicit `code' overrides the header charset; detection is the last resort */
  if(code){
    free(enc);
    enc = cbmemdup(code, -1);
  } else if(!enc){
    enc = cbmemdup(cbencname(text, size), -1);
  }
  /* set attributes */
  if((cbuf = cbmapget(map, "subject", -1, NULL)) != NULL){
    raw = estmimerawtext(cbuf);
    oddocaddattr(doc, "title", raw);
    estdocaddtext(doc, raw, strlen(raw), "UTF-8", ESTDOCNONLY);
    free(raw);
  }
  if((cbuf = cbmapget(map, "from", -1, NULL)) != NULL){
    raw = estmimerawtext(cbuf);
    oddocaddattr(doc, "author", raw);
    free(raw);
  }
  if((cbuf = cbmapget(map, "to", -1, NULL)) != NULL){
    raw = estmimerawtext(cbuf);
    oddocaddattr(doc, "recipient", raw);
    free(raw);
  }
  if((cbuf = cbmapget(map, "cc", -1, NULL)) != NULL){
    raw = estmimerawtext(cbuf);
    oddocaddattr(doc, "multicast", raw);
    free(raw);
  }
  if((cbuf = cbmapget(map, "date", -1, NULL)) != NULL){
    raw = estmimerawtext(cbuf);
    oddocaddattr(doc, "date", raw);
    free(raw);
  }
  oddocaddattr(doc, "type", "message/rfc822");
  oddocaddattr(doc, "encoding", enc);
  oddocaddattr(doc, "size", sizebuf);
  /* read body */
  if(cbstrfwimatch(type, "multipart/") &&  bound){
    /* multipart: skip the first boundary line */
    blen = strlen(bound);
    if((ep = strstr(text, "--")) != NULL && cbstrfwmatch(ep + 2, bound)) text = ep + 2 + blen;
    if(*text == '\r') text++;
    if(*text == '\n') text++;
    /* search for the next boundary after the header section of the first part */
    cbuf = text;
    if((ep = strstr(text, "\r\n\r\n")) != NULL){
      cbuf = ep + 4;
    } else if((ep = strstr(text, "\n\n")) != NULL){
      cbuf = ep + 2;
    }
    for(ep = cbuf; *ep != '\0'; ep++){
      if(ep[0] == '-' && ep[1] == '-' && cbstrfwmatch(ep + 2, bound)) break;
    }
    if(*ep != '\0'){
      /* parse the first part as a MIME entity of its own and take over its words */
      sel = cbmemdup(text, ep - text);
      tdoc = estdocmime(uri, sel, code, nude);
      nwords = oddocnwords(tdoc);
      awords = oddocawords(tdoc);
      for(i = 0; i < cblistnum(nwords); i++){
        oddocaddword(doc, cblistval(nwords, i, NULL), cblistval(awords, i, NULL));
      }
      if(nude){
        sprintf(sizebuf, "%d", size);
        oddocaddattr(doc, "size", sizebuf);
        if(!(tval = oddocgetattr(tdoc, "title"))) tval = "";
        oddocaddattr(doc, "title", tval);
        if(!(tval = oddocgetattr(tdoc, "author"))) tval = "";
        oddocaddattr(doc, "author", tval);
        if(!(tval = oddocgetattr(tdoc, "recipient"))) tval = "";
        oddocaddattr(doc, "recipient", tval);
        if(!(tval = oddocgetattr(tdoc, "multicast"))) tval = "";
        oddocaddattr(doc, "multicast", tval);
        if(!(tval = oddocgetattr(tdoc, "date"))) tval = "";
        oddocaddattr(doc, "date", tval);
        if(!(tval = oddocgetattr(tdoc, "type"))) tval = "";
        oddocaddattr(doc, "type", tval);
        if(!(tval = oddocgetattr(tdoc, "encoding"))) tval = "";
        oddocaddattr(doc, "encoding", tval);
      }
      oddocclose(tdoc);
      free(sel);
    }
  } else if(cbstrfwimatch(type, "text/html")){
    /* HTML: let the HTML parser detect the encoding by itself */
    tdoc = estdochtml(uri, text, size, NULL);
    nwords = oddocnwords(tdoc);
    awords = oddocawords(tdoc);
    for(i = 0; i < cblistnum(nwords); i++){
      oddocaddword(doc, cblistval(nwords, i, NULL), cblistval(awords, i, NULL));
    }
    if(nude){
      oddocaddattr(doc, "type", "text/html");
      sprintf(sizebuf, "%d", size);
      oddocaddattr(doc, "size", sizebuf);
      if(!(tval = oddocgetattr(tdoc, "title"))) tval = "";
      oddocaddattr(doc, "title", tval);
      if(!(tval = oddocgetattr(tdoc, "author"))) tval = "";
      oddocaddattr(doc, "author", tval);
      if(!(tval = oddocgetattr(tdoc, "encoding"))) tval = "";
      oddocaddattr(doc, "encoding", tval);
    }
    oddocclose(tdoc);
  } else if(cbstrfwimatch(type, "text")){
    /* plain text */
    miss = 0;
    if(!(sel = cbiconv(text, size, enc, "UTF-8", &ssiz, &miss))) sel = cbmemdup(text, size);
    /* too many missed characters with an untrusted header charset: detect it instead */
    if(!code && miss > ESTENCMISSMAX && !estistrustedenc(enc)){
      free(sel);
      cbuf = cbencname(text, size);
      if((sel = cbiconv(text, size, cbuf, "UTF-8", &ssiz, NULL)) != NULL){
        oddocaddattr(doc, "encoding", cbuf);
      } else {
        sel = cbmemdup(text, size);
      }
    }
    naked = esttextcatlines(sel, TRUE, &nsiz);
    estdocaddtext(doc, naked, nsiz, "UTF-8", ESTDOCBOTH);
    free(naked);
    free(sel);
    if(nude){
      oddocaddattr(doc, "type", "text/plain");
      sprintf(sizebuf, "%d", size);
      oddocaddattr(doc, "size", sizebuf);
    }
  }
  /* release resources */
  free(dtext);
  free(bound);
  free(enc);
  free(type);
  cbmapclose(map);
  return doc;
}


/* Break a text into words and register them to a document handle.
   `doc' specifies the document handle.
   `text' specifies the pointer to the region of the text.
   `size' specifies the size of the region.
   `code' specifies the name of the character encoding of the text.
   `mode' is handed unchanged to the CJK and Latin tokenizers.
   Nothing is registered if the text cannot be converted into UTF-16BE. */
void estdocaddtext(ODDOC *doc, const char *text, int size, const char *code, int mode){
  char *wide;
  const unsigned char *ucs;
  int pos, end, wsiz, cjk;
  assert(doc && text && size >= 0 && code);
  wide = cbiconv(text, size, code, "UTF-16BE", &wsiz, NULL);
  if(!wide) return;
  estnormalizetext(wide, wsiz, &wsiz);
  ucs = (unsigned char *)wide;
  /* split into maximal runs of two-byte units of the same class, judged by the high byte */
  pos = 0;
  while(pos < wsiz - 1){
    cjk = ucs[pos] >= ESTCJKPMIN;
    end = pos;
    while(end < wsiz - 1 && (ucs[end] >= ESTCJKPMIN) == cjk){
      end += 2;
    }
    if(cjk){
      estdocaddcjktext(doc, (char *)(ucs + pos), end - pos, mode);
    } else {
      estdocaddlatintext(doc, (char *)(ucs + pos), end - pos, mode);
    }
    pos = end;
  }
  free(wide);
}


/* Get the handle of the filter function in a dynamic linking library.
   `file' specifies the path of the library file.
   The return value is the result of looking up `ESTFILTERFUNC' with `estgetdlfunc', or
   `NULL' if `estgetdlfunc' itself is not available. */
ESTFILTER estfilterget(const char *file){
  assert(file);
  return estgetdlfunc ? estgetdlfunc(file, ESTFILTERFUNC) : NULL;
}


/* Make search words and their conditions from a search phrase.
   `phrase' specifies the search phrase in UTF-8.  Words are separated by a space or an
   ideographic space, and "[AND]", "[OR]", and "[NOT]" (case-insensitive) set the condition
   of the following word.
   `np' specifies the pointer to a variable to which the number of elements is assigned.
   `norm' specifies whether each word is broken into normalized words with `estdocaddtext'.
   If it is false, words are only converted into lower case for ASCII alphabets.
   The return value is an allocated array which should be released with `estfreewords'. */
ESTWORD *estsearchwords(const char *phrase, int *np, int norm){
  ESTWORD *result;
  const CBLIST *atoms;
  const char *start, *cur, *word, *tname, *atom;
  char *lower;
  int i, j, cond, wsiz, tsiz, asiz;
  CBLIST *rawwords, *rawtypes, *outwords, *outtypes;
  ODDOC *doc;
  assert(phrase && np);
  outwords = cblistopen();
  outtypes = cblistopen();
  rawwords = cblistopen();
  rawtypes = cblistopen();
  /* first pass: cut the phrase into raw words, each paired with a condition name */
  start = phrase;
  cond = ESTCONDAND;
  for(cur = phrase; *cur != '\0'; cur++){
    if(cbstrfwmatch(cur, " ") || cbstrfwmatch(cur, "\xe3\x80\x80")){
      if(cur > start){
        cblistpush(rawwords, start, cur - start);
        if(cond == ESTCONDAND){
          tname = "and";
        } else if(cond == ESTCONDOR){
          tname = "or";
        } else {
          tname = "not";
        }
        cblistpush(rawtypes, tname, -1);
        cond = ESTCONDAND;
      }
      /* the next word begins after the separator (1 byte, or 3 bytes for U+3000) */
      start = cur + (*cur == ' ' ? 1 : 3);
    } else if(cbstrfwimatch(cur, "[AND]")){
      cond = ESTCONDAND;
      cur += 4;
      start = cur + 1;
    } else if(cbstrfwimatch(cur, "[OR]")){
      /* the very first word can not be an alternative */
      if(cblistnum(rawwords) > 0) cond = ESTCONDOR;
      cur += 3;
      start = cur + 1;
    } else if(cbstrfwimatch(cur, "[NOT]")){
      /* the very first word can not be negated */
      if(cblistnum(rawwords) > 0) cond = ESTCONDNOT;
      cur += 4;
      start = cur + 1;
    }
  }
  if(cur > start){
    cblistpush(rawwords, start, cur - start);
    if(cond == ESTCONDAND){
      tname = "and";
    } else if(cond == ESTCONDOR){
      tname = "or";
    } else {
      tname = "not";
    }
    cblistpush(rawtypes, tname, -1);
  }
  /* second pass: normalize or lower-case each raw word */
  for(i = 0; i < cblistnum(rawwords); i++){
    word = cblistval(rawwords, i, &wsiz);
    tname = cblistval(rawtypes, i, &tsiz);
    if(!norm){
      lower = cbmemdup(word, wsiz);
      for(j = 0; j < wsiz; j++){
        if(lower[j] >= 'A' && lower[j] <= 'Z') lower[j] += 'a' - 'A';
      }
      cblistpush(outwords, lower, wsiz);
      cblistpush(outtypes, tname, tsiz);
      free(lower);
      continue;
    }
    /* a temporary document is used only to run the tokenizer over the word */
    doc = oddocopen("");
    estdocaddtext(doc, word, wsiz, "UTF-8", ESTDOCNONLY);
    atoms = oddocnwords(doc);
    for(j = 0; j < cblistnum(atoms); j++){
      atom = cblistval(atoms, j, &asiz);
      if(asiz < 1) continue;
      cblistpush(outwords, atom, asiz);
      cblistpush(outtypes, tname, tsiz);
    }
    oddocclose(doc);
  }
  cblistclose(rawtypes);
  cblistclose(rawwords);
  /* build the array; each word string is taken out of the list with `cblistshift' */
  *np = cblistnum(outwords);
  result = cbmalloc(*np * sizeof(ESTWORD) + 1);
  for(i = 0; i < *np; i++){
    result[i].word = cblistshift(outwords, &wsiz);
    tname = cblistval(outtypes, i, &tsiz);
    if(!strcmp(tname, "and")){
      result[i].type = ESTCONDAND;
    } else if(!strcmp(tname, "or")){
      result[i].type = ESTCONDOR;
    } else {
      result[i].type = ESTCONDNOT;
    }
    result[i].dnum = 0;
    result[i].evwords = NULL;
  }
  cblistclose(outtypes);
  cblistclose(outwords);
  return result;
}


/* Release regions of an array of search words.
   `words' specifies the array of search words.
   `num' specifies the number of elements of the array.
   The word string and the list of evaluated words of each element are released, and then
   the array itself. */
void estfreewords(ESTWORD *words, int num){
  ESTWORD *wp, *end;
  assert(words && num >= 0);
  end = words + num;
  for(wp = words; wp < end; wp++){
    if(wp->evwords) cblistclose(wp->evwords);
    free(wp->word);
  }
  free(words);
}


/* Get search result with search words made with `estsearchwords'.
   `odeum' specifies the database handle.
   `words' specifies the array of search words.  The `dnum' member of each element is
   updated, and `evwords' is set to a new list for words evaluated as patterns.
   `wnum' specifies the number of elements of the array.
   `unit' specifies the maximum number handed to `odsearch' for each word, except for
   words of the NOT condition, which are searched without the limit.
   `tfidf' specifies whether scores are tuned by the document frequency of each word.
   `np' specifies the pointer to a variable to which the number of results is assigned.
   `lp' specifies the pointer to a variable to which the sum of `dnum - cnum' of the words
   is assigned.  It is reset to 0 by every word evaluated as a pattern.
   `regex' specifies whether every word is a regular expression.
   `wild' specifies whether words including `*' are wild card patterns.
   `reevmax' specifies the maximum number of index words matched by one pattern.
   The return value is an allocated array of results which should be released with `free'. */
ODPAIR *estsearch(ODEUM *odeum, ESTWORD *words, int wnum, int unit, int tfidf, int *np, int *lp,
                  int regex, int wild, int reevmax){
  ODPAIR *last, *cur, *rgx, *tmp;
  CURIA *indexdb;
  FILE *ifp;
  char *name, *path, *iword, siword[ESTPATHBUFSIZ];
  const char *rword;
  int i, j, k, c, lnum, cnum, rnum, tnum, rhit, dnum;
  double dval, ival;
  assert(words && wnum >= 0 && unit >= 0 && np && lp);
  last = NULL;
  lnum = 0;
  *lp = 0;
  dval = odlogarithm(oddnum(odeum));
  if(dval <= 2.0) dval = 2.0;
  for(i = 0; i < wnum; i++){
    if(estregexmatch && (regex || (wild && strchr(words[i].word, '*')))){
      /* pattern search: test every index word against the pattern */
      words[i].evwords = cblistopen();
      if(wild){
        rword = estwildtoregex(words[i].word);
      } else {
        rword = words[i].word;
      }
      cur = NULL;
      indexdb = odidbindex(odeum);
      criterinit(indexdb);
      /* prefer the word list file in the database directory; the path is built on the
         heap because the database name has no length limit */
      name = odname(odeum);
      path = cbsprintf("%s%c%s", name, ESTPATHCHR, ESTWDLSNAME);
      free(name);
      ifp = fopen(path, "rb");
      free(path);
      rhit = 0;
      for(j = 0; rhit < reevmax; j++){
        if(ifp){
          /* read one line of the word list; an empty line or EOF ends the scan */
          for(k = 0; k < ESTPATHBUFSIZ - 1 && (c = fgetc(ifp)) != EOF && c != '\n'; k++){
            siword[k] = c;
          }
          siword[k] = '\0';
          iword = siword;
          if(k < 1) break;
        } else {
          /* no word list file: iterate the keys of the index database instead */
          if(!(iword = criternext(indexdb, NULL))) break;
        }
        /* the pattern is handed over only on the first call, and `NULL' afterwards */
        if(!estregexmatch(iword, j < 1 ? rword : NULL) ||
           !(rgx = odsearch(odeum, iword, -1, &rnum))){
          if(!ifp) free(iword);
          continue;
        }
        rhit++;
        cblistpush(words[i].evwords, iword, -1);
        if(!cur){
          cur = rgx;
          cnum = rnum;
        } else {
          tmp = odpairsor(cur, cnum, rgx, rnum, &tnum);
          free(rgx);
          free(cur);
          cur = tmp;
          cnum = tnum;
        }
        if(!ifp) free(iword);
      }
      if(!cur){
        cur = cbmalloc(1);
        cnum = 0;
      }
      if(ifp) fclose(ifp);
    } else {
      /* plain search */
      if(!(cur = odsearch(odeum, words[i].word, words[i].type == ESTCONDNOT ? -1 : unit, &cnum))){
        cur = cbmalloc(1);
        cnum = 0;
      }
    }
    if(estregexmatch && (regex || (wild && strchr(words[i].word, '*')))){
      dnum = cnum;
      *lp = 0;
    } else {
      if((dnum = odsearchdnum(odeum, words[i].word)) < 0) dnum = 0;
      *lp += dnum - cnum;
    }
    if(tfidf){
      /* weight each score by log(total documents) / (log(documents with the word)^3 / 8) */
      ival = odlogarithm(dnum);
      if(ival < 2.0) ival = 2.0;
      ival = (ival * ival * ival) / 8.0;
      for(j = 0; j < cnum; j++){
        cur[j].score = (cur[j].score * dval) / ival;
      }
    }
    words[i].dnum = dnum;
    /* combine with the result so far according to the condition of the word */
    if(!last){
      last = cur;
      lnum = cnum;
    } else {
      switch(words[i].type){
      case ESTCONDAND:
        tmp = odpairsand(last, lnum, cur, cnum, &tnum);
        break;
      case ESTCONDOR:
        tmp = odpairsor(last, lnum, cur, cnum, &tnum);
        break;
      default:
        tmp = odpairsnotand(last, lnum, cur, cnum, &tnum);
        break;
      }
      free(last);
      free(cur);
      last = tmp;
      lnum = tnum;
    }
  }
  if(!last){
    last = cbmalloc(1);
    lnum = 0;
  }
  *np = lnum;
  return last;
}


/* Initialize the iterator of a database.
   `odeum' specifies the database handle.
   `prefix' specifies the prefix of the URIs to be iterated.
   The return value is true on success, including the case that no record is at or after
   the prefix, or false if the document database is unavailable or the cursor can not be
   moved for another reason. */
int estiterinit(ODEUM *odeum, const char *prefix){
  VILLA *docs;
  assert(odeum && prefix);
  docs = odidbrdocs(odeum);
  if(!docs) return FALSE;
  if(vlcurjump(docs, prefix, -1, VL_JFORWARD)) return TRUE;
  /* having nothing to iterate is not an error */
  return dpecode == DP_ENOITEM ? TRUE : FALSE;
}


/* Get the URI of the next document whose URI begins with a prefix.
   `odeum' specifies the database handle.
   `prefix' specifies the prefix of the URIs to be iterated.
   The return value is the allocated key of the next matching record, which should be
   released with `free', or `NULL' if there is no more.  When a key not beginning with the
   prefix is reached, `dpecode' is set to `DP_ENOITEM'.  Keys whose last byte is zero are
   skipped. */
char *estiternext(ODEUM *odeum, const char *prefix){
  VILLA *villa;
  char *kbuf;
  int ksiz;
  assert(odeum && prefix);
  if(!(villa = odidbrdocs(odeum))) return NULL;
  while((kbuf = vlcurkey(villa, &ksiz)) != NULL){
    /* the size is checked first so that an empty key never indexes before the buffer */
    if(ksiz > 0 && kbuf[ksiz-1] == '\0'){
      free(kbuf);
      vlcurnext(villa);
      continue;
    }
    if(!cbstrfwmatch(kbuf, prefix)){
      /* the cursor went past the range of the prefix */
      free(kbuf);
      dpecode = DP_ENOITEM;
      return NULL;
    }
    /* step forward for the next call before handing over the key */
    vlcurnext(villa);
    return kbuf;
  }
  return NULL;
}


/* Resurge the cursor to the next document of the last deleted document.
   `odeum' specifies the database handle.
   `uri' specifies the URI to which the cursor jumps forward.
   The return value is true on success, including the case that no record is at or after
   the URI, or false if the document database is unavailable or the cursor can not be
   moved for another reason. */
int estiterresurge(ODEUM *odeum, const char *uri){
  VILLA *docs;
  assert(odeum && uri);
  docs = odidbrdocs(odeum);
  if(!docs) return FALSE;
  if(vlcurjump(docs, uri, -1, VL_JFORWARD)) return TRUE;
  /* having no following record is not an error */
  return dpecode == DP_ENOITEM ? TRUE : FALSE;
}


/* Set an environment variable.
   `name' specifies the name of the variable.
   `value' specifies the value of the variable.
   The "name=value" string is stored in a function-local map, created on the first call and
   registered with `cbglobalgc', and the pointer to the stored copy is what is handed to
   `putenv'. */
void estputenv(const char *name, const char *value){
  static CBMAP *envmap = NULL;
  char *pair;
  const char *kept;
  assert(name && value);
  if(envmap == NULL){
    envmap = cbmapopenex(ESTPETITBNUM);
    cbglobalgc(envmap, (void (*)(void *))cbmapclose);
  }
  /* the temporary string is copied into the map and can be released right away */
  pair = cbsprintf("%s=%s", name, value);
  cbmapput(envmap, name, -1, pair, -1, TRUE);
  free(pair);
  /* hand over the copy owned by the map, which outlives this call */
  kept = cbmapget(envmap, name, -1, NULL);
  putenv((char *)kept);
}


/* Execute a command and get the result.
   `command' specifies the command line handed to `popen'.
   `sp' specifies the pointer handed to `cbdatumtomalloc' to receive the size of the result.
   The return value is the allocated region holding everything the command wrote to its
   standard output, or `NULL' if the command could not be started.  The exit status of the
   command is not reported. */
char *estreadexec(const char *command, int *sp){
  FILE *pfp;
  CBDATUM *out;
  int ch;
  char byte;
  assert(command);
  pfp = popen(command, "r");
  if(!pfp) return NULL;
  out = cbdatumopen("", 0);
  /* collect the output byte by byte until the end of the stream */
  for(;;){
    ch = fgetc(pfp);
    if(ch == EOF) break;
    byte = ch;
    cbdatumcat(out, &byte, 1);
  }
  pclose(pfp);
  return cbdatumtomalloc(out, sp);
}


/* Make time data from a string of RFC822, RFC850, or ASCTIME.
   `str' specifies the date string.  The format is chosen by the fourth character:
   `,' selects "Www, DD Mon YYYY HH:MM:SS zone", a space selects
   "Www Mon DD HH:MM:SS YYYY", and an alphabet selects "Weekday, DD-Mon-YY HH:MM:SS zone".
   The return value is the result of `mktime' for the parsed date, adjusted by the numeric
   zone offset of the string and the offset of the local time zone, or -1 if the string is
   too short, too long, not in a supported format, or names an unknown month. */
int eststrmktime(const char *str){
  static const char *months[] = {
    "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"
  };
  char monthbuf[ESTDATEBUFSIZ], lagbuf[ESTDATEBUFSIZ];
  struct tm tse, *tp, gt, lt;
  time_t tt;
  int i, len, lag, mylag;
  assert(str);
  /* the length limit also guarantees that no token overflows `monthbuf' or `lagbuf' */
  len = strlen(str);
  if(len >= ESTDATEBUFSIZ || len < 4) return -1;
  lag = 0;
  if(str[3] == ','){
    if(sscanf(str, "%*s %d %s %d %d:%d:%d %s",
              &(tse.tm_mday), monthbuf, &(tse.tm_year), &(tse.tm_hour), &(tse.tm_min),
              &(tse.tm_sec), lagbuf) != 7) return -1;
    lag = atoi(lagbuf);
    tse.tm_year -= 1900;
  } else if(str[3] == ' '){
    if(sscanf(str, "%*s %s %d %d:%d:%d %d",
              monthbuf, &(tse.tm_mday), &(tse.tm_hour), &(tse.tm_min), &(tse.tm_sec),
              &(tse.tm_year)) != 6) return -1;
    tse.tm_year -= 1900;
  } else if((str[3] >= 'A' && str[3] <= 'Z') || (str[3] >= 'a' && str[3] <= 'z')){
    /* the year of this format is stored as it is, without subtracting 1900 */
    if(sscanf(str, "%*s %d-%3s-%d %d:%d:%d %s",
              &(tse.tm_mday), monthbuf, &(tse.tm_year), &(tse.tm_hour), &(tse.tm_min),
              &(tse.tm_sec), lagbuf) != 7) return -1;
    lag = atoi(lagbuf);
  } else {
    return -1;
  }
  /* look up the month; an unknown name is an error rather than an unset `tm_mon' */
  tse.tm_mon = -1;
  for(i = 0; i < (int)(sizeof(months) / sizeof(*months)); i++){
    if(!cbstricmp(monthbuf, months[i])){
      tse.tm_mon = i;
      break;
    }
  }
  if(tse.tm_mon < 0) return -1;
  /* calculate the offset of the local time zone from UTC in minutes */
  mylag = 0;
  if((tt = time(NULL)) > 0){
    if((tp = gmtime(&tt)) != NULL){
      gt = *tp;
      if((tp = localtime(&tt)) != NULL){
        lt = *tp;
        mylag = (lt.tm_hour * 60 + lt.tm_min) - (gt.tm_hour * 60 + gt.tm_min);
        /* compensate one day when the local date and the UTC date differ */
        if(lt.tm_year > gt.tm_year){
          mylag += 24 * 60;
        } else if(lt.tm_year < gt.tm_year){
          mylag -= 24 * 60;
        } else if(lt.tm_mon > gt.tm_mon){
          mylag += 24 * 60;
        } else if(lt.tm_mon < gt.tm_mon){
          mylag -= 24 * 60;
        } else if(lt.tm_mday > gt.tm_mday){
          mylag += 24 * 60;
        } else if(lt.tm_mday < gt.tm_mday){
          mylag -= 24 * 60;
        }
      }
    }
  }
  /* `lag' is read as [+-]HHMM; `mktime' normalizes the out-of-range minutes */
  tse.tm_min += mylag - ((lag / 100) * 60 + (lag % 100));
  tse.tm_isdst = -1;
  return(mktime(&tse));
}



/*************************************************************************************************
 * private objects
 *************************************************************************************************/


/* Strip a MIME text of quoting characters.
   `text' specifies a text whose encoding is UTF-8.
   `qb' specifies whether to delete quoting marks.  If it is true, a space, a tab, `>', `|',
   `}', and `#' at the head of each line are deleted.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.
   The return value is allocated string of result text.
   Carriage returns are dropped.  A newline ending a line of `ESTMIMEFOLD' bytes or more is
   treated as folding: it is removed, and replaced with a space unless the bytes around it
   look like the tail and the head of multibyte characters. */
static char *esttextcatlines(const char *text, int qb, int *sp){
  CBDATUM *naked, *lines;
  unsigned char *utext;
  int i, head, llen;
  assert(text && sp);
  naked = cbdatumopen("", 0);
  lines = cbdatumopen("", 0);
  /* first pass: drop carriage returns and, if requested, the quoting marks */
  head = TRUE;
  for(i = 0; text[i] != '\0'; i++){
    if(text[i] == '\r'){
      i++;
      /* a carriage return at the very end must not push the index past the terminator */
      if(text[i] == '\0') break;
    }
    if(text[i] == '\n'){
      cbdatumcat(lines, text + i, 1);
      head = TRUE;
    } else if(!qb || !head || !strchr(" \t>|}#", text[i])){
      cbdatumcat(lines, text + i, 1);
      head = FALSE;
    }
  }
  /* second pass: join folded lines (relies on the datum region ending with a zero code) */
  utext = (unsigned char *)cbdatumptr(lines);
  llen = 0;
  for(i = 0; utext[i] != '\0'; i++){
    if(llen >= ESTMIMEFOLD && utext[i] == '\n'){
      /* put a space unless a byte of 0xe0 or more lies three bytes back and a byte of
         0x80 or more follows; the index is checked so that `i - 3' is never negative */
      if(i < 3 || utext[i-3] < 0xe0 || utext[i+1] < 0x80) cbdatumcat(naked, " ", 1);
      llen = 0;
      continue;
    }
    cbdatumcat(naked, (char *)utext + i, 1);
    if(utext[i] == '\n'){
      llen = 0;
    } else {
      llen++;
    }
  }
  cbdatumclose(lines);
  return cbdatumtomalloc(naked, sp);
}


/* Unescape entity references of HTML.
   `html' specifies a text whose encoding is UTF-8.
   The return value is allocated string of result text.
   Named references listed in the table below and numeric references (`&#N;' and `&#xH;') are
   replaced by their UTF-8 sequences.  Numeric references which are zero, negative, surrogates,
   or beyond U+10FFFF are discarded.  An unknown named reference is copied as it is. */
static char *esthtmlrawtext(const char *html){
  /* pairs of a reference and its replacement; static so that it is not rebuilt on each call */
  static const char *const pairs[] = {
    /* basic symbols */
    "&amp;", "&", "&lt;", "<", "&gt;", ">", "&quot;", "\"", "&apos;", "'",
    /* ISO-8859-1 */
    "&nbsp;", "\xc2\xa0", "&iexcl;", "\xc2\xa1", "&cent;", "\xc2\xa2",
    "&pound;", "\xc2\xa3", "&curren;", "\xc2\xa4", "&yen;", "\xc2\xa5",
    "&brvbar;", "\xc2\xa6", "&sect;", "\xc2\xa7", "&uml;", "\xc2\xa8",
    "&copy;", "\xc2\xa9", "&ordf;", "\xc2\xaa", "&laquo;", "\xc2\xab",
    "&not;", "\xc2\xac", "&shy;", "\xc2\xad", "&reg;", "\xc2\xae",
    "&macr;", "\xc2\xaf", "&deg;", "\xc2\xb0", "&plusmn;", "\xc2\xb1",
    "&sup2;", "\xc2\xb2", "&sup3;", "\xc2\xb3", "&acute;", "\xc2\xb4",
    "&micro;", "\xc2\xb5", "&para;", "\xc2\xb6", "&middot;", "\xc2\xb7",
    "&cedil;", "\xc2\xb8", "&sup1;", "\xc2\xb9", "&ordm;", "\xc2\xba",
    "&raquo;", "\xc2\xbb", "&frac14;", "\xc2\xbc", "&frac12;", "\xc2\xbd",
    "&frac34;", "\xc2\xbe", "&iquest;", "\xc2\xbf", "&Agrave;", "\xc3\x80",
    "&Aacute;", "\xc3\x81", "&Acirc;", "\xc3\x82", "&Atilde;", "\xc3\x83",
    "&Auml;", "\xc3\x84", "&Aring;", "\xc3\x85", "&AElig;", "\xc3\x86",
    "&Ccedil;", "\xc3\x87", "&Egrave;", "\xc3\x88", "&Eacute;", "\xc3\x89",
    "&Ecirc;", "\xc3\x8a", "&Euml;", "\xc3\x8b", "&Igrave;", "\xc3\x8c",
    "&Iacute;", "\xc3\x8d", "&Icirc;", "\xc3\x8e", "&Iuml;", "\xc3\x8f",
    "&ETH;", "\xc3\x90", "&Ntilde;", "\xc3\x91", "&Ograve;", "\xc3\x92",
    "&Oacute;", "\xc3\x93", "&Ocirc;", "\xc3\x94", "&Otilde;", "\xc3\x95",
    "&Ouml;", "\xc3\x96", "&times;", "\xc3\x97", "&Oslash;", "\xc3\x98",
    "&Ugrave;", "\xc3\x99", "&Uacute;", "\xc3\x9a", "&Ucirc;", "\xc3\x9b",
    "&Uuml;", "\xc3\x9c", "&Yacute;", "\xc3\x9d", "&THORN;", "\xc3\x9e",
    "&szlig;", "\xc3\x9f", "&agrave;", "\xc3\xa0", "&aacute;", "\xc3\xa1",
    "&acirc;", "\xc3\xa2", "&atilde;", "\xc3\xa3", "&auml;", "\xc3\xa4",
    "&aring;", "\xc3\xa5", "&aelig;", "\xc3\xa6", "&ccedil;", "\xc3\xa7",
    "&egrave;", "\xc3\xa8", "&eacute;", "\xc3\xa9", "&ecirc;", "\xc3\xaa",
    "&euml;", "\xc3\xab", "&igrave;", "\xc3\xac", "&iacute;", "\xc3\xad",
    "&icirc;", "\xc3\xae", "&iuml;", "\xc3\xaf", "&eth;", "\xc3\xb0",
    "&ntilde;", "\xc3\xb1", "&ograve;", "\xc3\xb2", "&oacute;", "\xc3\xb3",
    "&ocirc;", "\xc3\xb4", "&otilde;", "\xc3\xb5", "&ouml;", "\xc3\xb6",
    "&divide;", "\xc3\xb7", "&oslash;", "\xc3\xb8", "&ugrave;", "\xc3\xb9",
    "&uacute;", "\xc3\xba", "&ucirc;", "\xc3\xbb", "&uuml;", "\xc3\xbc",
    "&yacute;", "\xc3\xbd", "&thorn;", "\xc3\xbe", "&yuml;", "\xc3\xbf",
    /* ISO-10646 */
    "&fnof;", "\xc6\x92", "&Alpha;", "\xce\x91", "&Beta;", "\xce\x92",
    "&Gamma;", "\xce\x93", "&Delta;", "\xce\x94", "&Epsilon;", "\xce\x95",
    "&Zeta;", "\xce\x96", "&Eta;", "\xce\x97", "&Theta;", "\xce\x98",
    "&Iota;", "\xce\x99", "&Kappa;", "\xce\x9a", "&Lambda;", "\xce\x9b",
    "&Mu;", "\xce\x9c", "&Nu;", "\xce\x9d", "&Xi;", "\xce\x9e",
    "&Omicron;", "\xce\x9f", "&Pi;", "\xce\xa0", "&Rho;", "\xce\xa1",
    "&Sigma;", "\xce\xa3", "&Tau;", "\xce\xa4", "&Upsilon;", "\xce\xa5",
    "&Phi;", "\xce\xa6", "&Chi;", "\xce\xa7", "&Psi;", "\xce\xa8",
    "&Omega;", "\xce\xa9", "&alpha;", "\xce\xb1", "&beta;", "\xce\xb2",
    "&gamma;", "\xce\xb3", "&delta;", "\xce\xb4", "&epsilon;", "\xce\xb5",
    "&zeta;", "\xce\xb6", "&eta;", "\xce\xb7", "&theta;", "\xce\xb8",
    "&iota;", "\xce\xb9", "&kappa;", "\xce\xba", "&lambda;", "\xce\xbb",
    "&mu;", "\xce\xbc", "&nu;", "\xce\xbd", "&xi;", "\xce\xbe",
    "&omicron;", "\xce\xbf", "&pi;", "\xcf\x80", "&rho;", "\xcf\x81",
    "&sigmaf;", "\xcf\x82", "&sigma;", "\xcf\x83", "&tau;", "\xcf\x84",
    "&upsilon;", "\xcf\x85", "&phi;", "\xcf\x86", "&chi;", "\xcf\x87",
    "&psi;", "\xcf\x88", "&omega;", "\xcf\x89", "&thetasym;", "\xcf\x91",
    "&upsih;", "\xcf\x92", "&piv;", "\xcf\x96", "&bull;", "\xe2\x80\xa2",
    "&hellip;", "\xe2\x80\xa6", "&prime;", "\xe2\x80\xb2", "&Prime;", "\xe2\x80\xb3",
    "&oline;", "\xe2\x80\xbe", "&frasl;", "\xe2\x81\x84", "&weierp;", "\xe2\x84\x98",
    "&image;", "\xe2\x84\x91", "&real;", "\xe2\x84\x9c", "&trade;", "\xe2\x84\xa2",
    "&alefsym;", "\xe2\x84\xb5", "&larr;", "\xe2\x86\x90", "&uarr;", "\xe2\x86\x91",
    "&rarr;", "\xe2\x86\x92", "&darr;", "\xe2\x86\x93", "&harr;", "\xe2\x86\x94",
    "&crarr;", "\xe2\x86\xb5", "&lArr;", "\xe2\x87\x90", "&uArr;", "\xe2\x87\x91",
    "&rArr;", "\xe2\x87\x92", "&dArr;", "\xe2\x87\x93", "&hArr;", "\xe2\x87\x94",
    "&forall;", "\xe2\x88\x80", "&part;", "\xe2\x88\x82", "&exist;", "\xe2\x88\x83",
    "&empty;", "\xe2\x88\x85", "&nabla;", "\xe2\x88\x87", "&isin;", "\xe2\x88\x88",
    "&notin;", "\xe2\x88\x89", "&ni;", "\xe2\x88\x8b", "&prod;", "\xe2\x88\x8f",
    "&sum;", "\xe2\x88\x91", "&minus;", "\xe2\x88\x92", "&lowast;", "\xe2\x88\x97",
    "&radic;", "\xe2\x88\x9a", "&prop;", "\xe2\x88\x9d", "&infin;", "\xe2\x88\x9e",
    "&ang;", "\xe2\x88\xa0", "&and;", "\xe2\x88\xa7", "&or;", "\xe2\x88\xa8",
    "&cap;", "\xe2\x88\xa9", "&cup;", "\xe2\x88\xaa", "&int;", "\xe2\x88\xab",
    "&there4;", "\xe2\x88\xb4", "&sim;", "\xe2\x88\xbc", "&cong;", "\xe2\x89\x85",
    "&asymp;", "\xe2\x89\x88", "&ne;", "\xe2\x89\xa0", "&equiv;", "\xe2\x89\xa1",
    "&le;", "\xe2\x89\xa4", "&ge;", "\xe2\x89\xa5", "&sub;", "\xe2\x8a\x82",
    "&sup;", "\xe2\x8a\x83", "&nsub;", "\xe2\x8a\x84", "&sube;", "\xe2\x8a\x86",
    "&supe;", "\xe2\x8a\x87", "&oplus;", "\xe2\x8a\x95", "&otimes;", "\xe2\x8a\x97",
    "&perp;", "\xe2\x8a\xa5", "&sdot;", "\xe2\x8b\x85", "&lceil;", "\xe2\x8c\x88",
    "&rceil;", "\xe2\x8c\x89", "&lfloor;", "\xe2\x8c\x8a", "&rfloor;", "\xe2\x8c\x8b",
    "&lang;", "\xe2\x8c\xa9", "&rang;", "\xe2\x8c\xaa", "&loz;", "\xe2\x97\x8a",
    "&spades;", "\xe2\x99\xa0", "&clubs;", "\xe2\x99\xa3", "&hearts;", "\xe2\x99\xa5",
    "&diams;", "\xe2\x99\xa6", "&OElig;", "\xc5\x92", "&oelig;", "\xc5\x93",
    "&Scaron;", "\xc5\xa0", "&scaron;", "\xc5\xa1", "&Yuml;", "\xc5\xb8",
    "&circ;", "\xcb\x86", "&tilde;", "\xcb\x9c", "&ensp;", "\xe2\x80\x82",
    "&emsp;", "\xe2\x80\x83", "&thinsp;", "\xe2\x80\x89", "&zwnj;", "\xe2\x80\x8c",
    "&zwj;", "\xe2\x80\x8d", "&lrm;", "\xe2\x80\x8e", "&rlm;", "\xe2\x80\x8f",
    "&ndash;", "\xe2\x80\x93", "&mdash;", "\xe2\x80\x94", "&lsquo;", "\xe2\x80\x98",
    "&rsquo;", "\xe2\x80\x99", "&sbquo;", "\xe2\x80\x9a", "&ldquo;", "\xe2\x80\x9c",
    "&rdquo;", "\xe2\x80\x9d", "&bdquo;", "\xe2\x80\x9e", "&dagger;", "\xe2\x80\xa0",
    "&Dagger;", "\xe2\x80\xa1", "&permil;", "\xe2\x80\xb0", "&lsaquo;", "\xe2\x80\xb9",
    "&rsaquo;", "\xe2\x80\xba", "&euro;", "\xe2\x82\xac",
    NULL
  };
  char *raw, *wp;
  long num;
  int i, hit;
  assert(html);
  /* no replacement is longer than three times the input it consumes: a numeric reference
     consumes at least "&#" and yields at most four bytes, and every named reference is longer
     than its replacement */
  raw = cbmalloc(strlen(html) * 3 + 1);
  wp = raw;
  while(*html != '\0'){
    if(*html == '&'){
      if(*(html + 1) == '#'){
        /* numeric character reference, hexadecimal or decimal */
        if(*(html + 2) == 'x' || *(html + 2) == 'X'){
          num = strtol(html + 3, NULL, 16);
        } else {
          num = strtol(html + 2, NULL, 10);
        }
        /* surrogates are not characters; treat them like any other invalid value */
        if(num >= 0xd800 && num <= 0xdfff) num = 0;
        /* encode the code point as UTF-8; zero and out-of-range values emit nothing so that
           the result never carries an embedded terminator */
        if(num > 0 && num < 0x80){
          *(wp++) = (char)num;
        } else if(num >= 0x80 && num < 0x800){
          *(wp++) = (char)(0xc0 | (num >> 6));
          *(wp++) = (char)(0x80 | (num & 0x3f));
        } else if(num >= 0x800 && num < 0x10000){
          *(wp++) = (char)(0xe0 | (num >> 12));
          *(wp++) = (char)(0x80 | ((num >> 6) & 0x3f));
          *(wp++) = (char)(0x80 | (num & 0x3f));
        } else if(num >= 0x10000 && num <= 0x10ffff){
          *(wp++) = (char)(0xf0 | (num >> 18));
          *(wp++) = (char)(0x80 | ((num >> 12) & 0x3f));
          *(wp++) = (char)(0x80 | ((num >> 6) & 0x3f));
          *(wp++) = (char)(0x80 | (num & 0x3f));
        }
        /* skip the reference up to its terminator, or to the next blank if it has none */
        while(*html != ';' && *html != ' ' && *html != '\n' && *html != '\0'){
          html++;
        }
        if(*html == ';') html++;
      } else {
        /* named reference: the first table entry which is a prefix of the input wins */
        hit = FALSE;
        for(i = 0; pairs[i] != NULL; i += 2){
          if(cbstrfwmatch(html, pairs[i])){
            wp += sprintf(wp, "%s", pairs[i+1]);
            html += strlen(pairs[i]);
            hit = TRUE;
            break;
          }
        }
        if(!hit){
          *wp = *html;
          wp++;
          html++;
        }
      }
    } else {
      *wp = *html;
      wp++;
      html++;
    }
  }
  *wp = '\0';
  return raw;
}


/* Decode and normalize the value of a MIME header.
   `mime' specifies a text whose encoding is US-ASCII.
   The return value is allocated string of result text whose encoding is UTF-8.
   Encoded words of the form `=?charset?B?...?=' and `=?charset?Q?...?=' are decoded and
   converted into UTF-8.  Line breaks and tabs in the result become spaces and trailing spaces
   are removed. */
static char *estmimerawtext(const char *mime){
  CBDATUM *datum;
  CBLIST *list;
  const char *enc, *ep, *cenc, *benc, *body;
  char *ebuf, *tmp, *sel;
  int i, len, tsiz, ssiz;
  assert(mime);
  ebuf = NULL;
  len = strlen(mime);
  /* a header which is not reported as US-ASCII is converted into UTF-8 beforehand */
  enc = cbencname(mime, len);
  if(cbstricmp(enc, "US-ASCII")){
    if((ebuf = cbiconv(mime, len, enc, "UTF-8", NULL, NULL)) != NULL){
      mime = ebuf;
    }
  }
  datum = cbdatumopen("", 0);
  for(i = 0; mime[i] != '\0'; i++){
    if(mime[i] == '=' && mime[i+1] == '?' && (ep = strstr(mime + i, "?=")) != NULL){
      /* if the "?=" found is the delimiter before a body starting with '=' (as in "?Q?=41"),
         look for the next one; the distance check keeps `ep - 2' inside the buffer when the
         header begins with "=?=" */
      if(ep - mime >= 2 && *(ep - 2) == '?'){
        ep++;
        if(!(ep = strstr(ep, "?="))) ep = strstr(mime + i, "?=");
      }
      /* a well-formed encoded word splits into "=", charset, encoding, body, and "=" */
      list = cbsplit(mime + i, ep - mime - i + 2, "?");
      if(cblistnum(list) == 5){
        cenc = cblistval(list, 1, NULL);
        benc = cblistval(list, 2, NULL);
        body = cblistval(list, 3, NULL);
        if(!cbstricmp(benc, "B")){
          tmp = cbbasedecode(body, &tsiz);
        } else if(!cbstricmp(benc, "Q")){
          tmp = cbquotedecode(body, &tsiz);
        } else {
          tsiz = strlen(body);
          tmp = cbmemdup(body, tsiz);
        }
        /* a body which cannot be converted from its charset is dropped */
        if((sel = cbiconv(tmp, tsiz, cenc, "UTF-8", &ssiz, NULL)) != NULL){
          cbdatumcat(datum, sel, ssiz);
          free(sel);
        }
        free(tmp);
      }
      cblistclose(list);
      /* resume after the closing "?=", swallowing a following line break and one blank */
      i = ep - mime + 1;
      while(mime[i+1] != '\0' && strchr("\r\n", mime[i+1])){
        i++;
      }
      if(mime[i+1] == ' ' || mime[i+1] == '\t') i++;
    } else {
      cbdatumcat(datum, mime + i, 1);
    }
  }
  tmp = cbdatumtomalloc(datum, &tsiz);
  for(i = 0; i < tsiz; i++){
    if(tmp[i] == '\r' || tmp[i] == '\n' || tmp[i] == '\t') tmp[i] = ' ';
  }
  for(i = tsiz - 1; i >= 0; i--){
    if(tmp[i] == ' '){
      tmp[i] = '\0';
    } else {
      break;
    }
  }
  free(ebuf);
  return tmp;
}


/* Normalize a text in place.
   Control characters and special spaces become the ASCII space, fullwidth alphabets and
   numbers become their ASCII counterparts, and halfwidth katakana and punctuation become their
   fullwidth counterparts.  A halfwidth katakana followed by a halfwidth voiced or semi-voiced
   sound mark is merged into one character, so the result may be shorter than the input.
   `text' specifies a text whose encoding is UTF-16BE.
   `size' specifies the size of the text.
   `sp' specifies the pointer to a variable to which the size of the result is assigned. */
static void estnormalizetext(char *text, int size, int *sp){
  unsigned char *utext;
  int i, wi, hi, lo, oh, ol, voiced, semivoiced;
  assert(text && size >= 0 && sp);
  utext = (unsigned char *)text;
  wi = 0;
  for(i = 0; i < size - 1; i += 2){
    /* read the source unit and the look-ahead before anything is written, because the write
       position `wi' may coincide with the read position `i' */
    hi = utext[i];
    lo = utext[i+1];
    voiced = i + 3 < size && utext[i+2] == 0xff && utext[i+3] == 0x9e;
    semivoiced = i + 3 < size && utext[i+2] == 0xff && utext[i+3] == 0x9f;
    oh = hi;
    ol = lo;
    if(hi == 0x0 && (lo <= 0x8 || (lo >= 0x0e && lo <= 0x1f))){
      /* control characters */
      oh = 0x0;
      ol = 0x20;
    } else if(hi == 0x0 && lo == 0xa0){
      /* no-break space */
      oh = 0x0;
      ol = 0x20;
    } else if(hi == 0x20 && (lo == 0x2 || lo == 0x3 || lo == 0x9)){
      /* en space, em space, thin space */
      oh = 0x0;
      ol = 0x20;
    } else if(hi == 0x30 && lo == 0x0){
      /* fullwidth space */
      oh = 0x0;
      ol = 0x20;
    } else if(hi == 0xff){
      if(lo >= 0x21 && lo <= 0x3a){
        /* fullwidth alphabets */
        oh = 0x0;
        ol = lo - 0x21 + 0x41;
      } else if(lo >= 0x41 && lo <= 0x5a){
        /* fullwidth small alphabets */
        oh = 0x0;
        ol = lo - 0x41 + 0x61;
      } else if(lo >= 0x10 && lo <= 0x19){
        /* fullwidth numbers */
        oh = 0x0;
        ol = lo - 0x10 + 0x30;
      } else if(lo == 0x61){
        /* halfwidth full stop */
        oh = 0x30;
        ol = 0x2;
      } else if(lo == 0x62){
        /* halfwidth left corner */
        oh = 0x30;
        ol = 0xc;
      } else if(lo == 0x63){
        /* halfwidth right corner */
        oh = 0x30;
        ol = 0xd;
      } else if(lo == 0x64){
        /* halfwidth comma: the ideographic comma is U+3001 */
        oh = 0x30;
        ol = 0x1;
      } else if(lo == 0x65){
        /* halfwidth middle dot */
        oh = 0x30;
        ol = 0xfb;
      } else if(lo == 0x66){
        /* halfwidth wo */
        oh = 0x30;
        ol = 0xf2;
      } else if(lo >= 0x67 && lo <= 0x6b){
        /* halfwidth small a-o */
        oh = 0x30;
        ol = (lo - 0x67) * 2 + 0xa1;
      } else if(lo >= 0x6c && lo <= 0x6e){
        /* halfwidth small ya-yo */
        oh = 0x30;
        ol = (lo - 0x6c) * 2 + 0xe3;
      } else if(lo == 0x6f){
        /* halfwidth small tu */
        oh = 0x30;
        ol = 0xc3;
      } else if(lo == 0x70){
        /* halfwidth prolonged mark */
        oh = 0x30;
        ol = 0xfc;
      } else if(lo >= 0x71 && lo <= 0x75){
        /* halfwidth a-o; only `u' takes the voiced mark, becoming `vu' */
        oh = 0x30;
        ol = (lo - 0x71) * 2 + 0xa2;
        if(voiced && lo == 0x73){
          ol = 0xf4;
          i += 2;
        }
      } else if(lo >= 0x76 && lo <= 0x7a){
        /* halfwidth ka-ko */
        oh = 0x30;
        ol = (lo - 0x76) * 2 + 0xab;
        if(voiced){
          ol += 1;
          i += 2;
        }
      } else if(lo >= 0x7b && lo <= 0x7f){
        /* halfwidth sa-so */
        oh = 0x30;
        ol = (lo - 0x7b) * 2 + 0xb5;
        if(voiced){
          ol += 1;
          i += 2;
        }
      } else if(lo >= 0x80 && lo <= 0x84){
        /* halfwidth ta-to; small tu sits between ti and tu in the fullwidth block */
        oh = 0x30;
        ol = (lo - 0x80) * 2 + 0xbf + (lo >= 0x82 ? 1 : 0);
        if(voiced){
          ol += 1;
          i += 2;
        }
      } else if(lo >= 0x85 && lo <= 0x89){
        /* halfwidth na-no */
        oh = 0x30;
        ol = lo - 0x85 + 0xca;
      } else if(lo >= 0x8a && lo <= 0x8e){
        /* halfwidth ha-ho */
        oh = 0x30;
        ol = (lo - 0x8a) * 3 + 0xcf;
        if(voiced){
          ol += 1;
          i += 2;
        } else if(semivoiced){
          ol += 2;
          i += 2;
        }
      } else if(lo >= 0x8f && lo <= 0x93){
        /* halfwidth ma-mo */
        oh = 0x30;
        ol = lo - 0x8f + 0xde;
      } else if(lo >= 0x94 && lo <= 0x96){
        /* halfwidth ya-yo */
        oh = 0x30;
        ol = (lo - 0x94) * 2 + 0xe4;
      } else if(lo >= 0x97 && lo <= 0x9b){
        /* halfwidth ra-ro */
        oh = 0x30;
        ol = lo - 0x97 + 0xe9;
      } else if(lo == 0x9c){
        /* halfwidth wa */
        oh = 0x30;
        ol = 0xef;
      } else if(lo == 0x9d){
        /* halfwidth n */
        oh = 0x30;
        ol = 0xf3;
      }
    }
    /* always store at the write position, which lags behind `i' once marks were merged */
    utext[wi] = oh;
    utext[wi+1] = ol;
    wi += 2;
  }
  *sp = wi;
}


/* Check whether an encoding is to be trusted or not.
   `name' specifies the name of an encoding.
   The return value is true if it is to be trusted, else, it is false.
   The name is compared case-insensitively with each entry of a fixed list. */
static int estistrustedenc(const char *name){
  static const char *const trusted[] = {
    "GB2312", "Big5", "EUC-CN", "EUC-TW", "ISO-2022-CN", "EUC-KR", "ISO-2022-KR",
    "ISO-8859-2", "ISO-8859-3", "ISO-8859-4", "ISO-8859-5", "ISO-8859-6", "ISO-8859-7",
    "ISO-8859-8", "ISO-8859-9", "ISO-8859-13", "ISO-8859-15",
    NULL
  };
  int idx;
  for(idx = 0; trusted[idx] != NULL; idx++){
    if(cbstricmp(trusted[idx], name) == 0) return TRUE;
  }
  return FALSE;
}


/* Register a run of UTF-16BE text as one word of a document handle.
   `doc' specifies a document handle.
   `run' specifies the head of the run.
   `rsiz' specifies the size of the run in bytes.
   `mode' specifies which of the normalized form and the appearance form to register.
   `indexed' specifies whether the run may be registered as a normalized form; if false, only
   the appearance form is registered.
   A run which cannot be converted into UTF-8 is ignored. */
static void estcjkaddrun(ODDOC *doc, const char *run, int rsiz, int mode, int indexed){
  char *sel;
  if(!(sel = cbiconv(run, rsiz, "UTF-16BE", "UTF-8", NULL, NULL))) return;
  switch(mode){
  case ESTDOCBOTH:
    oddocaddword(doc, indexed ? sel : "", sel);
    break;
  case ESTDOCNONLY:
    if(indexed) oddocaddword(doc, sel, "");
    break;
  case ESTDOCAONLY:
    oddocaddword(doc, "", sel);
    break;
  }
  free(sel);
}


/* Break a CJK text into words and register them to a document handle.
   `doc' specifies a document handle.
   `text' specifies a text whose encoding is UTF-16BE.
   `size' specifies the size of the text.
   `mode' specifies detailed behavior.
   If an external analyzer is available it is used; otherwise, or if the text cannot be
   converted into EUC-JP without loss, the text is broken into runs of the same character
   class: hiragana, katakana, hangul, and CJK ideographs.  Any other character is registered
   alone and as an appearance form only. */
static void estdocaddcjktext(ODDOC *doc, const char *text, int size, int mode){
  const unsigned char *ucs;
  char *euc;
  int i, j, miss;
  assert(doc && text && size >= 0);
  if(estdocaddtextcjkuni){
    estdocaddtextcjkuni(doc, text, size, mode);
    return;
  }
  if(estdocaddtextchasen || estdocaddtextmecab || estdocaddtextkakasi){
    /* the analyzers take EUC-JP; the first available of chasen, mecab, and kakasi is used */
    euc = cbiconv(text, size, "UTF-16BE", "EUC-JP", NULL, &miss);
    if(euc && miss < 1){
      if(estdocaddtextchasen){
        estdocaddtextchasen(doc, euc, mode);
      } else if(estdocaddtextmecab){
        estdocaddtextmecab(doc, euc, mode);
      } else {
        estdocaddtextkakasi(doc, euc, mode);
      }
      free(euc);
      return;
    }
    free(euc);
  }
  /* default implementation */
  ucs = (const unsigned char *)text;
  i = 0;
  while(i < size - 1){
    if(ucs[i] == 0x30 && ucs[i+1] >= 0x41 && ucs[i+1] <= 0x9e){
      /* hiragana; the prolonged mark may continue a run but not start it */
      for(j = i; j < size - 1; j += 2){
        if(ucs[j] != 0x30 || ucs[j+1] < 0x41 || (ucs[j+1] > 0x9e && ucs[j+1] != 0xfc)) break;
      }
      /* hiragana runs are normalized forms only when stop words are disabled */
      estcjkaddrun(doc, text + i, j - i, mode, ESTISNOSTOPW ? TRUE : FALSE);
      i = j;
    } else if(ucs[i] == 0x30 && ucs[i+1] >= 0xa1 && ucs[i+1] <= 0xfe && ucs[i+1] != 0xfb){
      /* katakana, except for the middle dot */
      for(j = i; j < size - 1; j += 2){
        if(ucs[j] != 0x30 || ucs[j+1] < 0xa1 || ucs[j+1] == 0xfb || ucs[j+1] > 0xfe) break;
      }
      estcjkaddrun(doc, text + i, j - i, mode, TRUE);
      i = j;
    } else if(ucs[i] >= 0xac && (ucs[i] < 0xd7 || (ucs[i] == 0xd7 && ucs[i+1] <= 0xa3))){
      /* hangul syllables, U+AC00 to U+D7A3, including those of the row 0xd6 */
      for(j = i; j < size - 1; j += 2){
        if(ucs[j] < 0xac || ucs[j] > 0xd7 || (ucs[j] == 0xd7 && ucs[j+1] > 0xa3)) break;
      }
      estcjkaddrun(doc, text + i, j - i, mode, TRUE);
      i = j;
    } else if(ucs[i] >= 0x4e && ucs[i] < 0xa0){
      /* cjk ideograph */
      for(j = i; j < size - 1; j += 2){
        if(ucs[j] < 0x4e || ucs[j] >= 0xa0) break;
      }
      estcjkaddrun(doc, text + i, j - i, mode, TRUE);
      i = j;
    } else {
      /* others */
      j = i + 2;
      estcjkaddrun(doc, text + i, j - i, mode, FALSE);
      i = j;
    }
  }
}


/* Break a Latin text into words and register them to a document handle.
   `doc' specifies a document handle.
   `text' specifies a text whose encoding is UTF-16BE.
   `size' specifies the size of the text.
   `mode' specifies detailed behavior.
   Each token is registered with its normalized form, its appearance form, or both, according
   to the mode.  A normalized form which is a stop word is registered as an empty string. */
static void estdocaddlatintext(ODDOC *doc, const char *text, int size, int mode){
  CBLIST *tokens;
  const char *raw;
  char *utf, *nbuf, *nword;
  int idx, tail, usiz, rsiz;
  assert(doc && text && size >= 0);
  utf = cbiconv(text, size, "UTF-16BE", "UTF-8", &usiz, NULL);
  if(!utf) return;
  /* the strict mode splits at white spaces only */
  tokens = ESTISSTRICT ? cbsplit(utf, usiz, " \t\f\v\r\n") : odbreaktext(utf);
  for(idx = 0; idx < cblistnum(tokens); idx++){
    raw = cblistval(tokens, idx, &rsiz);
    if(rsiz < 1) continue;
    if(mode == ESTDOCAONLY){
      /* the appearance form needs no normalization */
      if(raw[0] != '\0') oddocaddword(doc, "", raw);
      continue;
    }
    if(mode != ESTDOCBOTH && mode != ESTDOCNONLY) continue;
    nbuf = odnormalizeword(raw);
    nword = nbuf;
    if(ESTISSTRICT){
      /* trim delimiters off both ends of the normalized form */
      while(*nword != '\0' && strchr(ESTDELIMCHARS, *nword)){
        nword++;
      }
      tail = strlen(nword) - 1;
      while(tail > 0 && strchr(ESTDELIMCHARS, nword[tail])){
        nword[tail] = '\0';
        tail--;
      }
    }
    if(estisstopworden(nword)) nword = "";
    if(mode == ESTDOCBOTH){
      if(nword[0] != '\0' || raw[0] != '\0') oddocaddword(doc, nword, raw);
    } else if(nword[0] != '\0'){
      oddocaddword(doc, nword, "");
    }
    free(nbuf);
  }
  cblistclose(tokens);
  free(utf);
}


/* Check whether a word is an English stop word or not.
   `word' specifies a word in normalized form.
   The return value is true if the word is a stop word, else, it is false.
   If stop words are disabled, the return value is always false.  Otherwise a word shorter
   than ESTWMINLEN or longer than ESTWMAXLEN is treated as a stop word as well. */
static int estisstopworden(const char *word){
  static CBMAP *map = NULL;
  /* static so that the table is not rebuilt on the stack for every word checked; it is read
     only once, when the map is filled */
  static const char *const stopwords[] = {
    "a", "an", "the", "there", "any", "no", "each", "all", "most", "more",
    "i", "my", "me", "mine", "you", "your", "yours", "we", "our", "us", "ours",
    "he", "his", "him", "she", "her", "hers", "it", "its", "they", "their", "them", "theirs",
    "this", "these", "that", "those",
    "what", "who", "which", "where", "when", "why", "whose", "how",
    "or", "and", "but", "not", "as", "if", "so", "because", "as", "unless", "until", "while",
    "to", "on", "of", "in", "for", "from", "by", "with", "at", "against", "via", "under", "over",
    "be", "am", "are", "is", "was", "were", "being", "wanna", "gonna", "gotta", "going",
    "do", "does", "did", "have", "has", "had", "get", "got", "go", "went", "come", "came",
    "will", "would", "ll", "can", "could", "may", "might", "shall", "should", "must",
    "i'm", "you're", "re", "it's", "aren't", "isn", "isn't",
    "don", "don't", "doesn", "doesn't", "didn", "didn't", "won", "won't", "wouldn", "wouldn't",
    "shouldn", "shouldn't", "mayn", "mayn't", "mustn", "mustn't",
    "ll", "i'll", "you'll", "we'll", "he'll", "she'll", "they'll",
    "ve", "i've", "you've", "we've", "he've", "she've", "they've",
    "i'd", "you'd", "he'd", "she'd", "they'd", "ain't", "haven't", "hasn't", "can't",
    NULL
  };
  int i, len;
  assert(word);
  if(ESTISNOSTOPW) return FALSE;
  if(!map){
    /* the map is built on the first use and handed to cbglobalgc together with its
       destructor */
    map = cbmapopenex(sizeof(stopwords) / sizeof(stopwords[0]) * 2);
    cbglobalgc(map, (void (*)(void *))cbmapclose);
    for(i = 0; stopwords[i]; i++){
      cbmapput(map, stopwords[i], -1, "", 0, TRUE);
    }
  }
  len = strlen(word);
  if(len < ESTWMINLEN || len > ESTWMAXLEN) return TRUE;
  return cbmapget(map, word, len, NULL) != NULL;
}


/* Make the regular expression of a string with the wild cards.
   `word' specifies a string with the wild cards.
   The return value is the string of the regular expression.
   Each `*' becomes `.*', the other meta characters of regular expressions are escaped, and
   the whole is anchored with `^' and `$'.  The result lives in a static buffer which is
   overwritten by the next call.  A word too long for the buffer is cut off rather than
   written beyond the end of the buffer. */
static const char *estwildtoregex(const char *word){
  static char regex[ESTWMAXLEN*3+3];
  const char *limit;
  char *wp;
  int i, escaped;
  assert(word);
  wp = regex;
  /* keep room for the closing `$' and the terminator */
  limit = regex + sizeof(regex) - 2;
  *(wp++) = '^';
  for(i = 0; word[i] != '\0'; i++){
    escaped = strchr("|.*+?{}()[]^$\\", word[i]) != NULL;
    /* a meta character, including the wild card, takes two bytes in the output */
    if(wp + (escaped ? 2 : 1) > limit) break;
    if(word[i] == '*'){
      *(wp++) = '.';
      *(wp++) = '*';
    } else if(escaped){
      *(wp++) = '\\';
      *(wp++) = word[i];
    } else {
      *(wp++) = word[i];
    }
  }
  *(wp++) = '$';
  *wp = '\0';
  return regex;
}



/* END OF FILE */
