/*---------------------------------------------------------------------------------------------------------------------
- File      : untagger.cc                                                                 Project ELSE, EPFL - DI/LIA -
-                                                                       Evaluation in Language and Speech Engineering -
- Author    : Seydoux Florian   Creation date : 24 Sept 1999                                                          -
- Eulogist  : -                 Approval date : -                  Version: 0.1                                       -
-                                                                                                                     -
- Descript. : Untagger (main):                                                                                        -
-             Prepare a corpus for tagging (-> HTML -> TXT + Anchored Tags).                                          -
-             Invoke it to create the file for the experts (extracts the delimited part to be evaluated, without      -
-             anchoring) and to create the files for the participants, by removing all the tags and anchoring         -
-             them in an ad hoc file.                                                                                 -
-                                                                                                                     -
- Requested : -                                                                                                       -
-                                                                                                                     -
- Gaps      :                                                                                                         -
-                                                                                                                     -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- Rev. date | Reviser               | Revise's description                                                            -
- - - - - - + - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- ../../....| ........              | ...                                                                             -
---------------------------------------------------------------------------------------------------------------------*/

#ifndef UNTAGGER
#define UNTAGGER

#include <fstream>
#include <iostream>
#include "globaldef.h"
#include "untagger.h"
#include "messages_manager.h"
#include "grace_tools.h"

#ifdef _USE_NAMESPACES
    namespace Else { 
#endif // _USE_NAMESPACES

// Free functions defined further down in this file.
void dumpHelp();
void checkParams(const Parameters&);
int main(int argc, char* argv[]);

// Command-line settings: written by checkParams(), read by main().
bool expertOnly;          // set by the '-xonly' switch
bool grace_tokens;        // set by the '-tk' switch
conform_string input;     // value of '-in'   (empty: read stdin)
conform_string p_output;  // value of '-pout' (empty: write stdout)
conform_string x_output;  // value of '-xout'
conform_string anchored;  // value of '-anch' (defaults to "extracted.anch")

/*
 * ParticipantChain: character-flow agent that produces the participants' corpus.
 *
 * Characters received through computeFlowItem() are handed to 'flowEntry', which is
 * constructed on top of 'secondStep', itself constructed on top of 'untagger', whose
 * end flow is 'endFlow' (wrapping the 'out' stream). 'untagger' also receives the
 * 'anch' stream.
 *
 *   out  : stream wrapped by 'endFlow'.
 *   anch : stream passed to the anchored filters pool
 *          (presumably where the anchored/extracted sequences are written - confirm
 *          in AnchoredFiltersPool).
 */
class ParticipantChain : public CharFlowAgent {
public:
    ParticipantChain(ostream& out, ostream& anch)
    : endFlow(out),
      untagger(endFlow, anch),
      // Members are initialised in declaration order, so each stage must be declared
      // (and therefore built) after the stage it refers to: 'secondStep' before 'flowEntry'.
      secondStep(string("</E>"), string(), string(), untagger),
      flowEntry(string("<E>"), string(), string(), secondStep)
    {
        //
        // 'flowEntry' and 'secondStep' are the quiet extractors for the '<E>' and '</E>' tags.
        // Every other tag is extracted and anchored by the pool.
        // NOTE(review): the meaning of the trailing 95 is defined by ExtractAnchoredFilter
        // and is not visible from this file.
        //
        untagger.addFilter(new ExtractAnchoredFilter(string("<"), string(">"), string(), 95), true);

        //
        // Composite forms to translate, registered in this exact order. Each entity is listed
        // twice: terminated by ';' and terminated by a blank (the blank is kept in the
        // replacement). '&amp' followed by a blank is deliberately excluded.
        //
        // Translation of accented characters (&eacute; -> e-acute, &agrave; -> a-grave, with
        // the same ';' and ' ' terminators) is deliberately disabled.
        //
        // NOTE(review): '&endsp', '&endash' and '&emdash' are not the standard HTML entity
        // names (&ensp;, &ndash;, &mdash;); they are kept as found - confirm against the corpus.
        //
        static const char* const translations[][2] = {
            // Ligatures
            { "&oelig;",  "oe" },                     { "&oelig ",  "oe " },
            { "&OElig;",  "OE" },                     { "&OElig ",  "OE " },
            { "&aelig;",  "ae" },                     { "&aelig ",  "ae " },
            { "&AElig;",  "AE" },                     { "&AElig ",  "AE " },
            // Forms needed by HTML
            { "&lt;",     "<" },                      { "&lt ",     "< " },
            { "&gt;",     ">" },                      { "&gt ",     "> " },
            { "&quot;",   "\"" },                     { "&quot ",   "\" " },
            { "&nbsp;",   " " },                      { "&nbsp ",   "  " },
            { "&reg;",    "[Registered trademark]" }, { "&reg ",    "[Registered trademark] " },
            { "&copy;",   "[Copyright]" },            { "&copy ",   "[Copyright] " },
            { "&endsp;",  "?" },                      { "&endsp ",  "? " },
            { "&emsp;",   "?" },                      { "&emsp ",   "? " },
            { "&endash;", "?" },                      { "&endash ", "? " },
            { "&emdash;", "?" },                      { "&emdash ", "? " },
            // Ampersand last, ';' form only
            { "&amp;",    "&" }
        };
        const unsigned translationCount = sizeof(translations) / sizeof(translations[0]);

        for (unsigned i = 0; i < translationCount; ++i) {
            untagger.addFilter(
                new TranslateAnchoredFilter(string(translations[i][0]), string(translations[i][1])),
                true);
        }

        untagger.initialize();
    }

    // Flushes every stage, from the entry of the chain to its end.
    void flush() { flowEntry.flush(); secondStep.flush(); untagger.flush(); }

    // Feeds one character to the first stage of the chain.
    void computeFlowItem(const ichar& c) { flowEntry <<= c; }

private:
    // Declaration order is significant: it is the construction order, and the reverse of
    // the destruction order. Every member only refers to members declared before it.
    CharFlowToStream     endFlow;
    AnchoredFiltersPool  untagger;
    QuietTranslateFilter secondStep;  // handles "</E>", built on 'untagger'
    QuietTranslateFilter flowEntry;   // handles "<E>", built on 'secondStep'
};

/*
 * ExpertChain: character-flow agent that produces the experts' corpus.
 *
 * Characters received through computeFlowItem() go to the 'filters' pool. The pool's end
 * flow is either 'tokenizer' (which is built on 'mixer', itself built on 'endFlow') or
 * 'endFlow' directly, depending on the 'tokenize' constructor argument.
 *
 *   out      : stream wrapped by 'endFlow'.
 *   tokenize : true to route the filtered text through the GraceTokenizer.
 */
class ExpertChain : public CharFlowAgent {
public:
    ExpertChain(ostream& out, bool tokenize)
    : filters(),
      endFlow(out),
      mixer(string(1,'\n'), After, endFlow),
      tokenizer(mixer)
    {
        // Choose where the filtered text goes.
        if (tokenize) {
            filters.setEndFlow(tokenizer);
        } else {
            filters.setEndFlow(endFlow);
        }

        // Tag filters: first the evaluated-part delimiters, then every other '<...>' sequence.
        // NOTE(review): the meaning of the two boolean arguments is defined by BasicQuietFilter
        // and is not visible from this file.
        filters.addFilter(new BasicQuietFilter(START_EVALUATED_PART, STOP_EVALUATED_PART, true, true), true);
        filters.addFilter(new BasicQuietFilter(string("<"), string(">"), false, false), true);

        // Composite forms to translate, registered in this exact order. Each entity is listed
        // terminated by ';' and terminated by a blank, except '&amp' (';' form only).
        static const char* const entityTable[][2] = {
            { "&oelig;", "oe" },                     { "&oelig ", "oe " },
            { "&OElig;", "OE" },                     { "&OElig ", "OE " },
            { "&aelig;", "ae" },                     { "&aelig ", "ae " },
            { "&AElig;", "AE" },                     { "&AElig ", "AE " },
            { "&lt;",    "<" },                      { "&lt ",    "< " },
            { "&gt;",    ">" },                      { "&gt ",    "> " },
            { "&quot;",  "\"" },                     { "&quot ",  "\" " },
            { "&nbsp;",  " " },                      { "&nbsp ",  "  " },
            { "&reg;",   "[Registered trademark]" }, { "&reg ",   "[Registered trademark] " },
            { "&copy;",  "[Copyright]" },            { "&copy ",  "[Copyright] " },
            { "&amp;",   "&" }
        };
        const unsigned entityCount = sizeof(entityTable) / sizeof(entityTable[0]);

        for (unsigned k = 0; k < entityCount; ++k) {
            filters.addFilter(
                new QuietTranslateFilter(string(entityTable[k][0]), string(entityTable[k][1])),
                true);
        }
    }

    // Flushes the filters pool, then the tokenizer (whether or not it is in use).
    void flush()
    {
        filters.flush();
        tokenizer.flush();
    }

    // Feeds one character to the filters pool.
    void computeFlowItem(const ichar& c)
    {
        filters <<= c;
    }

private:
    FiltersPool      filters;
    TextFlowToStream endFlow;
    TokenOutput      mixer;
    GraceTokenizer   tokenizer;
};

// Prints the usage text on the standard error stream, then terminates the program
// with exit status 0. This function does not return.
void dumpHelp()
{
    static const char usageText[] =
        "Description: (Evaluation task)\n"
        "  Use 'untagger' to to create the corpus file for the experts (remove HTML tags except those delimited\n"
        "  the evaluated part, without anchoring), and to create the files for the participants, by withdrawing\n"
        "  all the tags, and by anchoring them in an ad hoc file. ...\n\n"
        "Usage: untagger {switches}\n\n"
        "Switches:\n\n"
        "  -in input      : untag 'input' file (default: stdin).\n"
        "  -pout output   : save participants's corpus to 'output' (default: stdout).\n"
        "  -xout output   : save experts's corpus to 'outputName' (default: none or stdout).\n"
        "  -anch anchored : save extracted & transformed sequence to 'anchored' (default: extracted.anch).\n"
        "  -xonly         : compute only experts corpus (don't output entrants & extracted files).\n"
        "  -tk            : tokenize the experts's file with Grace's rules.\n"
        "\n";

    cerr << usageText;
    exit(0);
}

void checkParams(const Parameters& params)
{
    if (! params.empty() &&
        ( !strcmp(params.front(), "?")  || !strcmp(params.front(), "-?") || 
          !strcmp(params.front(), "-h") || !strcmp(params.front(), "-help") || !strcmp(params.front(), "--help")))
        dumpHelp();

    // Init with default
    expertOnly = grace_tokens = false;
    input.clear();    // input.empty() -> stdin
    p_output.clear(); // p_output.empty() -> stdout
    x_output.clear(); // x_output.empty() -> (expertOnly=true ->no experts's output, false -> stdout)
    anchored = "extracted.anch";

    for (Parameters::const_iterator currentArg = params.begin(); currentArg != params.end(); ++currentArg) {
        if (!strcmp(*currentArg, "-in"))   { if (++currentArg != params.end()) input    = *currentArg; continue; }
        if (!strcmp(*currentArg, "-pout")) { if (++currentArg != params.end()) p_output = *currentArg; continue; }
        if (!strcmp(*currentArg, "-xout")) { if (++currentArg != params.end()) x_output = *currentArg; continue; }
        if (!strcmp(*currentArg, "-anch")) { if (++currentArg != params.end()) anchored = *currentArg; continue; }
        if (!strcmp(*currentArg, "-xonly")) { expertOnly = true; continue; }
        if (!strcmp(*currentArg, "-tk"))    { grace_tokens = true; continue; }
        if (currentArg == params.end()) --currentArg;
        cerr << "Invalid/Incomplete argument: " << *currentArg << "\nTry 'check --help' for more informations!\n";
    }
}

int main(int argc, char* argv[]) {

    // True instances ........
    ifstream* f1(0);
    ofstream* f2(0);
    ofstream* f3(0);
    ofstream* f4(0);

    ExpertChain*      chain1(0);
    ParticipantChain* chain2(0);

    Parameters params(argc, argv);


    // Interface ..............
    istream* in_str;
    ostream* pout_str;
    ostream* xout_str;
    ostream* anch_str;

    CharFlowAgent* partc;
    CharFlowAgent* exprt;
    

    // Checking parameters ............
    checkParams(params);

    if (input.empty()) in_str = &cin;
    else {
        f1 = new ifstream(input.c_str(), ios::in);
        if (f1->fail())
            msg.seriousError("Untagger", string("Unable to open input file '") + input + string("'."));
        in_str = f1;
    }

    if (! expertOnly) {
        if (p_output.empty()) pout_str = &cout;
        else {
            f2 = new ofstream(p_output.c_str(), ios::out | ios::trunc);
            if (f2->fail())
                msg.seriousError("Untagger", string("Unable to create participants's file '") + p_output + string("'."));
            pout_str = f2;
        }
        if (!x_output.empty()) {
            f3 = new ofstream(x_output.c_str(), ios::out | ios::trunc);
            if (f3->fail())
                msg.seriousError("Untagger", string("Unable to create experts's file '") + x_output + string("'."));
            xout_str = f3;
        }
        f4 = new ofstream(anchored.c_str(), ios::out | ios::trunc);
        if (f4->fail())
            msg.seriousError("Untagger", string("Unable to create ad hoc file '") + anchored + string("'."));
        anch_str = f4;
    } else {
        if (x_output.empty()) xout_str = &cout;
        else {
            f3 = new ofstream(x_output.c_str(), ios::out | ios::trunc);
            if (f3->fail())
                msg.seriousError("Untagger", string("Unable to create experts's file '") + x_output + string("'."));
            xout_str = f3;
        }
    }

    if (expertOnly) {
        partc = &nullCharFlowAgent;
        exprt = chain1 = new ExpertChain(*xout_str, grace_tokens);
    } else {
        partc = chain2 = new ParticipantChain(*pout_str, *anch_str);
        if (!x_output.empty()) exprt = chain1 = new ExpertChain(*xout_str, grace_tokens);
        else exprt = &nullCharFlowAgent;
    }

    char c;
    while (in_str->get(c)) {
        *partc <<= c;
        *exprt <<= c;
    }

    partc->flush();
    exprt->flush();
    
    delete chain1;
    delete chain2;
    delete f1;
    delete f2;
    delete f3;
    delete f4;
    
    return 0;
}

// Closes 'namespace Else'. The guard must be the same macro as the one that opens the
// namespace near the top of the file (_USE_NAMESPACES, single leading underscore);
// otherwise the braces are unbalanced.
#ifdef _USE_NAMESPACES
} // namespace Else
#endif // _USE_NAMESPACES

#endif // defined UNTAGGER

