/*---------------------------------------------------------------------------------------------------------------------
- File      : recomposer.cc                                                               Project ELSE, EPFL - DI/LIA -
-                                                                       Evaluation in Language and Speech Engineering -
- Author    : Seydoux Florian   Creation date : 07 July 1999                                                          -
- Eulogist  : -                 Approval date : -                  Version: 0.1                                       -
-                                                                                                                     -
- Descript. : Recomposer un corpus complet avec l'etiquetage de reference (partiel)                                   -
-             Le fichier doit avoir passer avec succes la verification de format                                      -
-             Des etiquettes 'vides' sont attribuées aux tokens exclus de l'évaluation                                -
-                                                                                                                     -
-             Les erreurs détectées sont: mauvaise tokenisation, transformation des tokens                            -
-             (ainsi que séquencement incohérent des tokens/tags, mais cela n'est pas toujours détecté)               -
-             Tags invalides (redondant avec 'checker', mais evite la recomposition du corpus lors de chaque          -
-             correction d'etiquettes)                                                                                -
-             Toutes les erreurs signalées sont accompagnées de la localisation de la référence et de l'erreur        -
-             (positionnement en caractères) dans les fichiers sources.                                               -
-                                                                                                                     -
- Requested : -                                                                                                       -
-                                                                                                                     -
- Gaps      : o) Reunir dans un seul fichier (objet) la chaine de traitements comune à extract.cc et recomposer.cc    -
-             o) Ajouter l'etiquette vide au lieu de '??' (corriger grace_tools.h, et au besoin transf. l'etiquette   -
-                vide en '??' dans le 
-                                                                                                                     -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- Rev. date | Reviser               | Revise's description                                                            -
- - - - - - + - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- ../../....| ........              | ...                                                                             -
---------------------------------------------------------------------------------------------------------------------*/

#ifndef RECOMPOSER
#define RECOMPOSER

#include <fstream>
#include <iostream>
#include "untagger.h"
#include "grace_tools.h"
#include "messages_manager.h"
#include "tags_mapping_table.h"

// When _USE_NAMESPACES is defined, everything that follows in this file is placed in namespace Else.
// NOTE(review): main() is declared and defined after this point, so with _USE_NAMESPACES defined it
// becomes Else::main rather than the global entry point -- confirm this is intended.
#ifdef _USE_NAMESPACES
    namespace Else { 
#endif // _USE_NAMESPACES

// Forward declarations of the functions defined in this file.
// dumpHelp    : writes the usage text to cerr, then calls exit(0).
// checkParams : parses the command line into the file-level settings declared below.
// main        : program entry point.
void dumpHelp();
void checkParams(Parameters&);
int main(int argc, char* argv[]);

// File-level settings. They are filled in by checkParams() from the command line and read by main().
bool chktags;                  // Set by '-chktags': enables the tags checking (the tags table is then loaded in main()).
bool skipCmt;                  // Set by '-cmt': passed to the TaggedCorpusWriter so that comments are not written at output.
conform_string prefix;         // Prefix prepended to every file name (configuration file included).
conform_string config;         // Name of the configuration file (default: "reference.cfg").
conform_string output;         // Name of the output (recomposed) file (default: <corpus>.rcp).
conform_string reftags;        // Name of the reference tags table file (empty: main() uses the name from the configuration).
conform_string refcorpus;      // Name of the partially marked out corpus file (reference for the check; default: <corpus>.raw).
conform_string tstcorpus;      // Name of the corpus's chunk tagged by the experts (subject of the check; default: <corpus>.ref).

/*
 * Write the command-line help of the recomposer to cerr and terminate the program.
 *
 * The text describes the purpose of the tool, the optional configuration file argument and
 * every switch recognised by checkParams(). msg.switchesFormat(cerr) is then called so that
 * the messages manager can append its own text (presumably the description of its switches).
 *
 * This function does not return: it ends with exit(0).
 */
void dumpHelp()
{
    cerr << "Description:  (Reference part)\n"
         << "  Use 'recompose' to\n"
         << "  (a) complete a partially tagged corpus (corpus chunk) with empty tag;\n"
         << "  (b) verify the tokenization of the corpus (must be conform to the Grace's tokenization specification);\n"
         << "  (c) verify the matching between flow of tagged tokens and tokens from the original corpus;\n"
         << "  (d) Optional: verify the tags (syntax of composite expression and existence of elementary tags).\n"
         << "\n"
         // The program is called 'recompose' everywhere else (description above, error hint in
         // checkParams()); the usage line used to advertise 'check'.
         << "Usage: recompose [<config file>] {switches}\n\n"
         << "<config file>     : The reference configuration file. (default:reference.cfg)\n"
         << "\n"
         << "Switches:\n"
         << "\n"
         << "  -prefix prefix  : <prefix> is used to prefix ALL file's name (incl. config)\n"
         << "  -corpus Name    : set <corpus> to Name, default: 'corpus'\n"
         << "  -rc fileName    : original raw corpus, with reference part enclosed by spcl tags, default: <corpus>.raw\n"
         << "  -xc fileName    : corpus's chunk tagged by the experts, default: <corpus>.ref\n"
         << "  -o  fileName    : output file (recomposed file); by default, output are made to <corpus>.rcp\n"
         << "  -rt fileName    : reference tags table.\n"
         << "  -cmt            : comments are not writed at output\n"
         << "  -chktags        : enable the tags checking (check also the table).\n"
         << "\n";
    msg.switchesFormat(cerr);
    cerr << "\n\n";
    exit (0);
}

/*
 * Parse the command line and fill in the file-level settings (chktags, skipCmt, prefix, config,
 * output, reftags, refcorpus, tstcorpus).
 *
 * params : the command-line arguments. The leading configuration file name, when present, is
 *          removed from it, and it is also handed to msg.checkParams() before the switches
 *          are scanned.
 *
 * Behaviour:
 *  - an empty argument list or a help request ("?", "-?", "-h", "-help", "--help") as first
 *    argument prints the usage text through dumpHelp(), which exits;
 *  - a first argument that does not start with '-' is taken as the configuration file name,
 *    otherwise "reference.cfg" is used;
 *  - unknown switches are reported on cerr and skipped;
 *  - a switch that expects a value but is the last argument is reported on cerr and ends the
 *    scan (the setting keeps its default).
 *  - once the scan is over, the prefix is prepended to config, output, refcorpus and tstcorpus;
 *    reftags is left as given on the command line.
 */
void checkParams(Parameters& params)
{
    // dumpHelp() terminates the program, so params is known to be non-empty past this test.
    if ((params.empty())
        || !(strcmp(params.front(), "?"))
        || !(strcmp(params.front(), "-?"))
        || !(strcmp(params.front(), "-h"))
        || !(strcmp(params.front(), "-help"))
        || !(strcmp(params.front(), "--help"))
       ) dumpHelp();

    conform_string _corpus, _output, _refcorpus, _tstcorpus;

    // Defaults.
    chktags = false;
    _corpus = "corpus";
    prefix.clear();
    reftags.clear();
    skipCmt = false;

    // Optional leading configuration file name.
    if (params.front()[0] != '-' ) { config = params.front(); params.erase(params.begin()); }
    else config = "reference.cfg";

    msg.checkParams(params);

    for (Parameters::const_iterator currentArg = params.begin(); currentArg != params.end(); ++currentArg) {
        // Switches without a value.
        if (!strcmp(*currentArg, "-chktags")) { chktags = true; continue; }
        if (!strcmp(*currentArg, "-cmt"))     { skipCmt = true; continue; }

        // Switches followed by a value: select the variable that receives it.
        conform_string* target = 0;
        if      (!strcmp(*currentArg, "-prefix")) target = &prefix;
        else if (!strcmp(*currentArg, "-corpus")) target = &_corpus;
        else if (!strcmp(*currentArg, "-rc"))     target = &_refcorpus;
        else if (!strcmp(*currentArg, "-rt"))     target = &reftags;
        else if (!strcmp(*currentArg, "-xc"))     target = &_tstcorpus;
        else if (!strcmp(*currentArg, "-o"))      target = &_output;

        if (!target) {
            cerr << "Invalid argument: " << *currentArg << "\nTry 'recompose --help' for more informations!\n";
            continue;
        }

        // Look ahead with a separate iterator: advancing currentArg itself onto end() and then
        // letting the loop increment it again would step past the end of the container.
        Parameters::const_iterator valueArg = currentArg;
        ++valueArg;
        if (valueArg == params.end()) {
            cerr << "Missing value for argument: " << *currentArg
                 << "\nTry 'recompose --help' for more informations!\n";
            break;
        }
        *target = *valueArg;
        currentArg = valueArg;   // The loop increment then moves on to the argument after the value.
    }

    // Build the final file names: explicit names win over the <corpus>-derived defaults.
    config.insert(0, prefix);
    output     = prefix + (_output.empty() ? _corpus + string(".rcp") : _output);
    refcorpus  = prefix + (_refcorpus.empty()  ? _corpus + string(".raw") : _refcorpus);
    tstcorpus  = prefix + (_tstcorpus.empty()  ? _corpus + string(".ref") : _tstcorpus);
}


int main(int argc, char* argv[]) {
    enum State { Padding, Merging };

    bool padding(true);
    Parameters params(argc, argv);
    TagsMappingTable  tagsTable;
    UserConfiguration userConfiguration;

    checkParams(params);
    if (!userConfiguration.readFrom(config, UserConfiguration::Reference, false))
        msg.seriousError("Recomposer", "Configuration file is not readeable, or it has too serious error(s)");
    ifstream refStream(refcorpus.c_str(), ios::in);
    if (!refStream)
        msg.seriousError(string("Recomposer"), 
                         string("Unable to open the partially marked out corpus ('") + output + string("')."));
    TaggedCorpusReader tagsReader(userConfiguration, tstcorpus);
    if (!tagsReader)
        msg.seriousError(string("Recomposer"),
                         string("Unable to open the corpus's chunk with reference tagging ('") + tstcorpus + string("')."));
    ofstream outputStream(output.c_str(), ios::out|ios::trunc);
    if (!outputStream)
        msg.seriousError(string("Recomposer"), string("Unable to create the output file ('") + output + string("')."));

	if (reftags.empty()) reftags = prefix + userConfiguration.mapName;
	else reftags = prefix + reftags;
    if (chktags) tagsTable.readReference(userConfiguration, reftags, userConfiguration.mapMode); 

    msg.message(string("Recomposer"), string("Translation Chain creating..."), DEBUG_MSG);

    State              state;
    TextFlowToStream   endFlow(outputStream);
    GraceTokenizer     tokenizer; 
    FiltersPool        translateReference(tokenizer);
    PatternTracker     startPadding(STOP_EVALUATED_PART, translateReference);
    PatternTracker     stopPadding(START_EVALUATED_PART, translateReference);
    TaggedCorpusWriter tagsWriter(userConfiguration, endFlow, skipCmt);

    Merger merger(tagsReader, tagsWriter, tokenizer, tagsTable, refcorpus, chktags, padding);
    tokenizer.setNextAgent(merger);

    //
    // The operators added to 'reference flow' must be the same as for the human expert tagger (untagger)
    // 
    translateReference.addFilter(new BasicQuietFilter(string("<"), string(">"), false, false), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&oelig;"), string("oe")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&oelig "), string("oe ")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&OElig;"), string("OE")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&OElig "), string("OE ")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&aelig;"), string("ae")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&aelig "), string("ae ")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&AElig;"), string("AE")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&AElig "), string("AE ")), true);
    
    translateReference.addFilter(new QuietTranslateFilter(string("&lt;"), string("<")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&lt "), string("< ")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&gt;"), string(">")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&gt "), string("> ")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&quot;"), string("\"")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&quot "), string("\" ")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&nbsp;"), string(" ")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&nbsp "), string("  ")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&reg;"), string("[Registered trademark]")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&reg "), string("[Registered trademark] ")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&copy;"), string("[Copyright]")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&copy "), string("[Copyright] ")), true);
    translateReference.addFilter(new QuietTranslateFilter(string("&amp;"), string("&")), true);
//  translateReference.addFilter(new QuietTranslateFilter(string("&amp "), string("& ")), true);

    //
    // Now we can proceed...
    // 
    msg.message(string("Recomposer"), string("Recomposition start..."), STANDARD_MSG);
    char chr;
    long long counter(0);
    while (refStream.get(chr)) {
        ++counter;
         if (padding) {
             stopPadding <<= IndexedChar(chr,counter);
            if (stopPadding.patternFound()) {
                 stopPadding.clear();
                 translateReference <<= ' '; // Assume that the </E> can't break a token
                 translateReference.flush();
                 merger.setPaddingMode(false);
                 padding = false;
             }
        } else {
             startPadding <<= IndexedChar(chr,counter);
            if (startPadding.patternFound()) {
                 startPadding.clear();
                 translateReference <<= ' ';
                 translateReference.flush();
                 merger.setPaddingMode(true);
                 padding = true;
            }
         }
    }
            
    // Manual flush...
    if (padding) stopPadding.flush();
    else startPadding.flush();
    translateReference.flush();
    tokenizer.flush();

    refStream.close();
    outputStream.close();
    exit(0);
}

// Close namespace Else. The macro tested here must be the one that guards the opening
// 'namespace Else {' (_USE_NAMESPACES, single leading underscore); otherwise the namespace
// is left unterminated whenever that macro is defined.
#ifdef _USE_NAMESPACES
}
#endif // _USE_NAMESPACES

#endif // RECOMPOSER

