/*
  MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
 
  $Id: writer.cpp,v 1.7 2005/03/20 13:18:57 taku-ku Exp $;

  Copyright (C) 2001-2004 Taku Kudo <taku-ku@is.aist-nara.ac.jp>
  This is free software with ABSOLUTELY NO WARRANTY.
  
  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.
  
  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.
  
  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/  
#include <stdexcept>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include "common.h"
#include "stringbuffer.h"
#include "writer.h"
#include "param.h"

namespace MeCab {

  // Decode the character that follows a backslash in a format string.
  // Recognised codes are 0 a b t n v f r s (space) and the backslash itself;
  // any other character raises std::runtime_error.
  inline static char getEscapedChar (const char p) 
  {
    // { code, decoded character } pairs
    static const char escapes [][2] = {
      { '0',  '\0' },
      { 'a',  '\a' },
      { 'b',  '\b' },
      { 't',  '\t' },
      { 'n',  '\n' },
      { 'v',  '\v' },
      { 'f',  '\f' },
      { 'r',  '\r' },
      { 's',  ' '  }, // space
      { '\\', '\\' }
    };
    const unsigned int count = sizeof (escapes) / sizeof (escapes[0]);

    for (unsigned int i = 0; i < count; ++i) {
      if (escapes[i][0] == p) return escapes[i][1];
    }

    throw std::runtime_error ("format error \\");
  }

// Member-initializer list shared by the Writer constructors: all four format
// string pointers start out null.
#define WRITER_INITILIZE  node_format (0), bos_format (0), eos_format (0), unk_format(0)

  // Construct a writer and configure it from `param` via open().
  // Throws std::runtime_error carrying _what when open() reports failure.
  Writer::Writer (Param &param): WRITER_INITILIZE
  {
    const bool opened = open (param);
    if (opened) return;
    throw std::runtime_error (_what);
  }

  // Construct a writer that has not been configured yet.
  Writer::Writer (): WRITER_INITILIZE
  {
    // write() calls through _write unconditionally, so it must never be left
    // indeterminate. Use the same default that open() and close() establish,
    // so that write() before open() produces lattice output instead of
    // jumping through an uninitialized member pointer.
    _write = &Writer::writeLattice;
  }
  // Release the format strings held by this writer.
  Writer::~Writer()
  {
    close ();
  }

  // Free the four format strings, null their pointers and fall back to the
  // lattice output routine. Always returns true.
  bool Writer::close ()
  {
    // delete [] on a null pointer is a no-op, so this may be called repeatedly
    delete [] node_format;
    delete [] eos_format;
    delete [] bos_format;
    delete [] unk_format;

    node_format = 0;
    eos_format  = 0;
    bos_format  = 0;
    unk_format  = 0;

    _write = &Writer::writeLattice;
    return true;
  }

  // Configure the writer from `param`.
  //
  // The "output-format-type" profile string selects the output routine:
  //   "wakati"  -> writeWakati
  //   "none"    -> writeNone
  //   "normal"  -> writeLattice
  //   otherwise -> user-defined formats read from "node-format", "bos-format",
  //                "eos-format" and "unk-format"; when the type is non-empty,
  //                "-<type>" is appended to each of those keys.
  //
  // For user-defined formats, an empty eos format defaults to "EOS\n" and an
  // empty unk format defaults to the node format. writeUser is selected only
  // when the node format is non-empty; otherwise lattice output stays active.
  //
  // Returns false, with the reason stored in _what, when a non-empty type has
  // no corresponding node format; returns true in every other case.
  bool Writer::open (Param &param)
  {
    // output format style
    const std::string ostyle = param.getProfileString ("output-format-type");
    _write = &Writer::writeLattice;

    if (ostyle == "wakati") {
      _write = &Writer::writeWakati;
    } else if (ostyle == "none") {
      _write = &Writer::writeNone;
    } else if (ostyle == "normal") {
      _write = &Writer::writeLattice;
    } else {
      std::string nfk = "node-format";
      std::string bfk = "bos-format";
      std::string efk = "eos-format";
      std::string ufk = "unk-format";

      if (! ostyle.empty ()) {
        const std::string suffix = "-" + ostyle;
        nfk += suffix;
        bfk += suffix;
        efk += suffix;
        ufk += suffix;
      }

      // Looked up once; it serves the validity check, node_format itself and
      // the fallback for unk_format.
      const std::string nf = param.getProfileString (nfk.c_str());

      if (! ostyle.empty () && nf.empty ()) {
        _what = std::string("Writer::open(): Unknown format type [") + ostyle + "]";
        return false;
      }

      // Release strings left over from an earlier open(); overwriting the
      // pointers below would otherwise leak them. close() also leaves _write
      // at writeLattice, which is the state established above.
      this->close ();

      node_format = mystrdup (nf.c_str());
      bos_format  = mystrdup (param.getProfileString (bfk.c_str()).c_str());

      std::string ef = param.getProfileString (efk.c_str());
      if (ef.empty()) ef = "EOS\n";
      eos_format = mystrdup (ef.c_str());

      std::string uf = param.getProfileString (ufk.c_str());
      if (uf.empty()) uf = nf;
      unk_format = mystrdup (uf.c_str());

      if (*node_format != '\0') _write = &Writer::writeUser;
    }

    return true;
  }

  // Write the analysis result starting at `bosNode` to `os`, using whichever
  // output routine _write currently points at.
  void Writer::write (StringBuffer &os, const char* str, Node *bosNode)
  {
    (this->*_write) (os, str, bosNode);
  }

  // Default output: one "<surface>\t<feature>\n" line per node, followed by
  // "EOS\n". bosNode itself and the last node of the chain (the one whose
  // next pointer is null) are not printed. `str` is not used.
  void Writer::writeLattice (StringBuffer &os, const char* str, Node *bosNode)
  {
    Node *cur = bosNode->next;
    while (cur->next) {
      os.write (cur->surface, cur->length);
      os << '\t' << cur->feature << '\n';
      cur = cur->next;
    }
    os << "EOS\n";
  }
   
  // Word-segmented output: every surface followed by a single space, then a
  // newline. bosNode itself and the last node of the chain (the one whose
  // next pointer is null) are not printed. `str` is not used.
  void Writer::writeWakati (StringBuffer &os, const char* str, Node *bosNode)
  {
    Node *cur = bosNode->next;
    while (cur->next) {
      os.write (cur->surface, cur->length);
      os << ' ';
      cur = cur->next;
    }
    os << '\n';
  }

  // Output routine for the "none" format type: intentionally writes nothing.
  void Writer::writeNone (StringBuffer &os, const char* str, Node *)
  {
  }

  // User-defined output: bos_format is applied to bosNode, then one format
  // per intermediate node, and finally eos_format to the last node of the
  // chain (the one whose next pointer is null).
  void Writer::writeUser (StringBuffer &os, const char* str, Node *bosNode)
  {
    writeNode (os, bos_format, str, bosNode);

    Node *cur = bosNode->next;
    while (cur->next) {
      // stat == 1 selects the unknown-word format
      // NOTE(review): presumably the "unknown word" status value -- confirm
      // against the Node definition
      if (cur->stat == 1) {
        writeNode (os, unk_format, str, cur);
      } else {
        writeNode (os, node_format, str, cur);
      }
      cur = cur->next;
    }

    writeNode (os, eos_format, str, cur);
  }

  // Expand the format string `p` for `node` and append the result to `os`.
  //
  //   os    destination buffer
  //   p     NUL-terminated format string
  //   ibuf  NUL-terminated input sentence (used by %S, %L, %ps, %pe)
  //   node  node whose fields are substituted
  //
  // Format syntax:
  //   \X              escape sequence, decoded by getEscapedChar()
  //   %S  %L          input sentence / its strlen()
  //   %m  %M          surface / surface preceded by its leading spaces
  //   %h  %c  %H      token posid / token cost / feature string
  //   %%              literal '%'
  //   %pX             node property, X one of i S s e C w c n b l L
  //   %ph0 %ph1 %ph2  token lcAttr / rcAttr1 / rcAttr2
  //   %pp<i|c><sep>   node id or cost of every entry of node->path, joined
  //                   by <sep> (which may itself be a \X escape)
  //   %f[i,j,...]     selected comma-separated feature fields, tab-joined
  //   %F<sep>[i,...]  same, joined by <sep> (which may be a \X escape)
  //   Feature fields whose text starts with '*' are skipped by %f and %F.
  //
  // A feature string longer than the internal buffer is truncated, and
  // fields beyond the capacity of the field table are not addressable.
  //
  // Throws std::runtime_error on a malformed format string, on an
  // out-of-range field index, and on %pp when the node has no path list.
  void Writer::writeNode (StringBuffer&os, const char *p, const char *ibuf, Node *node) 
  {
    char buf  [1024];        // private copy of node->feature, split in place
    char *ptr [64];          // start of each comma-separated field inside buf
    unsigned int psize = 0;  // number of valid entries in ptr; 0 = not split yet
    const unsigned int max_fields = sizeof (ptr) / sizeof (ptr[0]);

    for (; *p; p++) {
      switch (*p) {

      default: os << *p; break;

      // a trailing backslash hands '\0' to getEscapedChar, which throws
      case '\\': os << getEscapedChar (*++p); break;

      case '%': { // macros
        switch (*++p) {
        default: throw std::runtime_error (std::string ("unkonwn meta char ") + *p);
        case 'S': os.write (ibuf, strlen(ibuf)); break;  // input sentence
        case 'L': os << strlen(ibuf); break;  // sentence length
        case 'm': os.write (node->surface, node->length); break; // morph
        case 'M': os.write ((char*)(node->surface - node->end + node->length), node->end); break; // morph including the spaces
        case 'h': os << node->token->posid; break; // Part-Of-Speech ID
        case '%': os << '%'; break;           // %
        case 'c': os << (int)(node->token->cost); break; // word cost
        case 'H': os << node->feature; break; // feature
        case 'p': {
          switch (*++p) {
          default: throw std::runtime_error ("[iseSCwcnblLh] is required after %p");
          case 'i': os << node->id; break; // node id
          case 'S': os.write ((char*)(node->surface - node->end + node->length), node->end - node->length); break; // space
          case 's': os << (int)(node->surface - ibuf); break; // start position
          case 'e': os << (int)(node->surface - ibuf + node->length); break; // end position
          case 'C': os << (int)(node->cost - node->prev->cost - node->token->cost); break; // connection cost
          case 'w': os << (int)node->token->cost; break; // word cost
          case 'c': os << (int)(node->cost); break; // best cost
          case 'n': os << (int)(node->cost - node->prev->cost); break; // node cost
          case 'b': os << (node->next ? '*' : ' '); break; // * if best path, ' ' otherwise
          case 'l': os << node->length; break; // length of morph
          case 'L': os << node->end;    break; // length of morph including the spaces
          case 'h': { // Hidden Layer ID
            switch (*++p) {
            default: throw std::runtime_error ("[012] is required after %ph");
            case '0': os << node->token->lcAttr;  break;  // current
            case '1': os << node->token->rcAttr1; break;  // prev
            case '2': os << node->token->rcAttr2; break;  // prev-prev
            }
          } break;

          case 'p': {
            // Both the mode and the separator must be present. Checking each
            // before advancing keeps p from stepping over the terminating
            // NUL, which would make this function (and the enclosing loop)
            // read past the end of the format string.
            const char mode = *++p;
            if (mode == '\0') throw std::runtime_error ("[ic] is required after %pp");
            char sep = *++p;
            if (sep == '\0') throw std::runtime_error ("separator is required after %pp[ic]");
            if (sep == '\\') sep = getEscapedChar (*++p);
            if (!node->path) throw std::runtime_error ("no path information, use -a option");
            for (Path *path = node->path; path; path = path->next) {
              if (path != node->path) os << sep;
              switch (mode) {
              case 'i': os << path->node->id; break;
              case 'c': os << path->cost; break; // connection cost
              default: throw std::runtime_error ("[ic] is required after %pp");
              }
            }
          } break;

          }
        } break;

        case 'F':
        case 'f': {
          if (! psize) { // split the feature string on first use only
            // strncpy does not terminate the destination when the source
            // fills it, so copy one byte less and terminate explicitly.
            strncpy (buf, node->feature, sizeof (buf) - 1);
            buf[sizeof (buf) - 1] = '\0';
            ptr[psize++] = buf;
            for (char *q = buf; *q; ++q) {
              if (',' != *q) continue;
              *q = '\0';                       // terminate the current field
              if (psize == max_fields) break;  // table full: drop the rest
              ptr[psize++] = q + 1;
            }
          }

          // separator
          char separator = '\t'; // default separator
          if (*p == 'F') { // change separator
            ++p;
            // "%F" at the very end of the format: fail here rather than read
            // beyond the terminating NUL while looking for '['.
            if (*p == '\0') throw std::runtime_error ("cannot find '['");
            if (*p == '\\') separator = getEscapedChar (*++p);
            else separator = *p;
          }

          if (*++p !='[') throw std::runtime_error ("cannot find '['");

          unsigned int n = 0; // field index being accumulated
          bool sep = false;   // true when the previously listed field was printed

          // Parse "i,j,...]". The loop exits with p on the closing ']', so
          // the enclosing loop's p++ resumes right after it.
          for (++p; ; ++p) {
            const char c = *p;
            if (c >= '0' && c <= '9') {
              n = 10 * n + (c - '0');
              continue;
            }
            // anything else, including the terminating NUL, is malformed
            if (c != ',' && c != ']') throw std::runtime_error ("cannot find ']'");
            if (n >= psize) throw std::runtime_error ("given index is out of range");
            const bool isfil = (ptr[n][0] != '*'); // '*' marks a field to skip
            if (isfil) { if (sep) os << separator; os << ptr[n]; }
            if (c == ']') break;
            sep = isfil;
            n = 0;
          }
        } break;
        } // end switch
      } break; // end case '%'
      } // end switch
    }
  }
}
