/*---------------------------------------------------------------------------------------------------------------------
- File      : unifier.cc                                                                  Project ELSE, EPFL - DI/LIA -
-                                                                       Evaluation in Language and Speech Engineering -
- Author    : Seydoux Florian   Creation date : 07 Sept 1999                                                          -
- Eulogist  : -                 Approval date : -                  Version: 0.1                                       -
-                                                                                                                     -
- Descript. : Translate system tagged corpus to reference format (using the reference's configuration file).          -
-             Also translate all the forms prohibited for future processing (files back in html format).              -
-             Input:                                                                                                  -
-                    .) reference configuration file                                                                  -
-                    .) system configuration file (perhaps the same)                                                  -
-                    .) tagged corpus (system). Checking of this corpus (with ' checker') should not raise errors.    -
-             Output:                                                                                                 -
-                    .) A tagged corpus unified with the reference format, and with html marks translated.            -
-                                                                                                                     -
- Requested : -                                                                                                       -
-                                                                                                                     -
- Gaps      :                                                                                                         -
-                                                                                                                     -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- Rev. date | Reviser               | Revise's description                                                            -
- - - - - - + - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- ../../....| ........              | ...                                                                             -
---------------------------------------------------------------------------------------------------------------------*/

#ifndef UNIFIER
#define UNIFIER

#include <fstream>
#include <iostream>
#include "globaldef.h"
#include "untagger.h"
#include "grace_tools.h"
#include "flow_basic_operators.h"

#ifdef _USE_NAMESPACES
    namespace Else { 
#endif // _USE_NAMESPACES

// Forward declarations of the functions defined in this file.
void dumpHelp();                  // Writes the usage text on cerr, then exits.
void checkParams(Parameters&);    // Parses the command line into the file-level settings.
int main(int argc, char* argv[]);

typedef vector<string> StringCollection; // Redundant...

// File-level settings, filled in by checkParams() and read by main().
bool   prudent;              // '-prudent': translate '&' only in front of a few given characters (default: every '&').
bool   removeCmts;           // '-cmt' / '-nocomment': do not copy comments to the output.
string prefix, head, tail;   // Filename parts: prefix for inputs (incl. config files), head/tail around output names.
string inCfg, outCfg;        // Configuration file names for the input format and for the output format.
StringCollection inputs;     // Names of the corpus files to process.

/*
 * Write the usage text on cerr, followed by the description of the switches
 * handled by 'msg', then terminate the program with exit status 0.
 */
void dumpHelp()
{
    // The whole help text as one literal (adjacent string literals are concatenated).
    static const char usageText[] =
        "Description:  (Evaluation part)\n"
        "  Use 'unifier' to\n"
        "  (a) Translate system tagged corpus to reference format,\n"
        "  (b) Translate prohibited forms for future processing (that produce html output).\n"
        "  (c) Optionel: remove comments for alignement process.\n"
        "\n"
        "Usage: unifier <out config file> <in config file> {switches <in corpus>}\n\n"
        "<out config file> : Configuration file defining the format of the output.\n"
        "<in config file>  : Configuration file providing the format of the (input) tagged corpus.\n"
        "<in corpus>       : The corpus having to be unified (should not contain errors.\n"
        "\n"
        "Switches:\n"
        "\n"
        "  -prefix in_prefix : <in_prefix> is used to prefix all input file's name (incl. config).\n"
        "  -head   out_head  : <out_head> used to prefix all output file.\n"
        "  -tail   out_tail  : <out_tail> used to postfix all output file.\n"
        "  -cmt | -nocomment : remove comments from the input corpus.\n"
        "  -prudent          : translate only isolated occurence of '&' (default: all).\n"
        "\n"
        "  Name(s) of the output files are: <out_head><in corpus><out_tail>\n"
        "\n";

    cerr << usageText;
    msg.switchesFormat(cerr);
    cerr << "\n\n";
    exit(0);
}

/*
 * Parse the command line and fill the file-level settings
 * (inCfg, outCfg, prefix, head, tail, removeCmts, prudent, inputs).
 *
 * params : the program parameters; the first two are removed from it, and it is
 *          also handed to msg.checkParams() before the remaining switches are scanned.
 *
 * The help is shown (dumpHelp() terminates the program) when fewer than three
 * parameters are given or when the first one is a help request.
 * Every remaining parameter that does not start with '-' is recorded as an input corpus.
 * An unknown switch is reported on cerr and skipped.
 * A switch expecting a value (-prefix, -head, -tail) given as the very last parameter
 * is reported on cerr and ends the scan.
 */
void checkParams(Parameters& params)
{
    if ((params.size() < 3)
        || !strcmp(params.front(), "?")
        || !strcmp(params.front(), "-?")
        || !strcmp(params.front(), "-h")
        || !strcmp(params.front(), "-help")
        || !strcmp(params.front(), "--help"))
        dumpHelp();

    removeCmts = prudent = false;

    // NOTE(review): the first parameter is taken as the INPUT configuration and the second
    // as the OUTPUT one, whereas the usage text printed by dumpHelp() lists
    // <out config file> first. Confirm which order is intended before relying on either.
    inCfg = params.front(); params.erase(params.begin());
    outCfg = params.front(); params.erase(params.begin());
    msg.checkParams(params);

    for (Parameters::const_iterator currentArg = params.begin(); currentArg != params.end(); ++currentArg)
    {
        // Switches that consume the following parameter as their value.
        string* valueTarget = 0;
        if      (!strcmp(*currentArg, "-prefix")) valueTarget = &prefix;
        else if (!strcmp(*currentArg, "-head"))   valueTarget = &head;
        else if (!strcmp(*currentArg, "-tail"))   valueTarget = &tail;

        if (valueTarget)
        {
            Parameters::const_iterator switchArg = currentArg;
            if (++currentArg == params.end())
            {
                // Leave the loop here: letting the for-statement increment the iterator
                // once more would move it past end(), which is undefined behaviour.
                cerr << "Missing value for: " << *switchArg << "\nTry 'unifier --help' for more informations!\n";
                break;
            }
            *valueTarget = *currentArg;
            continue;
        }

        // Plain flags.
        if (!strcmp(*currentArg, "-cmt") || !strcmp(*currentArg, "-nocomment")) { removeCmts = true; continue; }
        if (!strcmp(*currentArg, "-prudent")) { prudent = true; continue; }

        // Anything else not starting with '-' is the name of a corpus to process.
        if ((*currentArg)[0] != '-') { inputs.push_back(string(*currentArg)); continue; }

        cerr << "Invalid argument: " << *currentArg << "\nTry 'unifier --help' for more informations!\n";
    }
}


/*
 * Program entry point.
 *
 * 1. Parses the command line (checkParams) and reads the input and output configuration files.
 * 2. Builds the output chain: StringToCharFlow -> FiltersPool (html translations) -> TextFlowToStream.
 * 3. For every input corpus, reads its entries with the input configuration and writes them
 *    with the output configuration to the file <head><corpus><tail>.
 *
 * A corpus whose input cannot be opened, or whose output cannot be created, is reported
 * through msg.error() and skipped. The program always terminates with exit(0).
 */
int main(int argc, char* argv[]) {

    Parameters params(argc, argv);
    UserConfiguration inputConfiguration, outputConfiguration;

    checkParams(params);

    if ((head == prefix) && tail.empty())
        msg.seriousError("Unifier", "Oops, input and output are the same.");

    if (!inputConfiguration.readFrom(prefix + inCfg, UserConfiguration::Unknown, true))
        msg.seriousError("Unifier",
                         "Configuration file for the input format is not readeable, or it has too serious error(s)");
    if (!outputConfiguration.readFrom(prefix + outCfg, UserConfiguration::Unknown, true))
        msg.seriousError("Unifier",
                         "Configuration file of the targeted format is not readeable, or it has too serious error(s)");


    TextFlowToStream   endFlow(cout);
    FiltersPool        htmlAdjust(endFlow);
    StringToCharFlow   flowAdapter(htmlAdjust);

/*
 * Prudent translation: translate only an '&' followed by one of a few given characters.
 * (For example, if the source admits composite forms like &oelig;, these are left unchanged.)
 *
 * The raw (default) version translates all occurrences.
 */

    if (prudent)
    {
        htmlAdjust.addFilter(new QuietTranslateFilter(string("& "), string("&amp ")), true);
        htmlAdjust.addFilter(new QuietTranslateFilter(string("&'"), string("&amp;'")), true);
        htmlAdjust.addFilter(new QuietTranslateFilter(string("&\""), string("&amp;\"")), true);
        htmlAdjust.addFilter(new QuietTranslateFilter(string("&."), string("&amp;.")), true);
        htmlAdjust.addFilter(new QuietTranslateFilter(string("&\n"), string("&amp;\n")), true);
        htmlAdjust.addFilter(new QuietTranslateFilter(string("&\t"), string("&amp;\t")), true);
        htmlAdjust.addFilter(new QuietTranslateFilter(string("&;"), string("&amp;;")), true);
    } else {
        htmlAdjust.addFilter(new QuietTranslateFilter(string("&"), string("&amp;")), true);  // exclude '& '
//      htmlAdjust.addFilter(new QuietTranslateFilter(string("& "), string("&amp ")), true);
    }

    // '<', '>' and '"': the variant followed by a space is registered before the bare character.
    htmlAdjust.addFilter(new QuietTranslateFilter(string("< "), string("&lt ")), true);
    htmlAdjust.addFilter(new QuietTranslateFilter(string("<"), string("&lt;")), true);
    htmlAdjust.addFilter(new QuietTranslateFilter(string("> "), string("&gt ")), true);
    htmlAdjust.addFilter(new QuietTranslateFilter(string(">"), string("&gt;")), true);
    htmlAdjust.addFilter(new QuietTranslateFilter(string("\" "), string("&quot ")), true);
    htmlAdjust.addFilter(new QuietTranslateFilter(string("\""), string("&quot;")), true);


    for (StringCollection::const_iterator i = inputs.begin(); i != inputs.end(); ++i)
    {
        const string inputName  = prefix + *i;
        const string outputName = head + *i + tail;

        msg.message(string("Unifier"),
                    string("Compute '") + inputName + string("' -> '") + outputName + string("'..."),
                    STANDARD_MSG);

        // Open and check the input first, so that an unreadable input does not
        // create (or truncate) the corresponding output file.
        TaggedCorpusReader reader(inputConfiguration, inputName);
        if (!reader)
        {
            msg.error('U', 1, 1,
                      string("Input error"),
                      string("Unable to open the input file '") + inputName + string("'."));
            continue;
        }

        ofstream output(outputName.c_str(), ios::out|ios::trunc);
        if (!output)
        {
            msg.error('U', 1, 0,
                      string("Output error"),
                      string("Unable to create the output file '") + outputName
                          + string("'; Skiping '") + inputName + string("'."));
            continue;
        }

        // The writer is declared after 'output' so that it is destroyed before the stream.
        TaggedCorpusWriter writer(outputConfiguration, flowAdapter);
        LemmeBaseAccess lba;

        // NOTE(review): 'output' is local to this iteration; presumably endFlow keeps
        // referring to it until the next setOutputStream() call - confirm nothing is
        // written through the flow in between.
        endFlow.setOutputStream(output);
        while ((lba = reader.readNextEntry()))
        {
            if (const SingleCommentAccess singleCmtPtr = dynamic_cast<SingleCommentAccess>(lba))
            {
                // Stand-alone comment: copied only when comments are kept.
                if (!removeCmts) writer.write(singleCmtPtr->cmt);
            }
            else if (const LemmeAccess lemmePtr = dynamic_cast<LemmeAccess>(lba))
            {
                // Token/tag entry, with its attached comment when there is one and comments are kept.
                if (removeCmts || lemmePtr->cmt.empty())
                    writer.write(lemmePtr->tok, lemmePtr->tag);
                else
                    writer.write(lemmePtr->tok, lemmePtr->tag, lemmePtr->cmt);
            }
            else THROW(domain_error(INVALID_RESULT));
        }
        htmlAdjust.flush();
        output.close();
    }

    // exit() is kept on purpose (instead of return): it ends the program without
    // destroying the local flow objects declared above.
    exit(0);
}


// Closes 'namespace Else'; must test the same macro as the opening guard near the top of the file.
#ifdef _USE_NAMESPACES
}
#endif // _USE_NAMESPACES

#endif // defined UNIFIER
        

