/*---------------------------------------------------------------------------------------------------------------------
- File      : tags_checker.cc                                                             Project ELSE, EPFL - DI/LIA -
-                                                                       Evaluation in Language and Speech Engineering -
- Author    : Seydoux Florian   Creation date : 07 July 1999                                                          -
- Eulogist  : -                 Approval date : -                  Version: 0.1                                       -
-                                                                                                                     -
- Descript. : Entry point of the 'untag' tool: read a tagged corpus, write the untagged text                          -
-             and save the extracted tags to a separate file; HTML entities are translated on the way.                -
-             NOTE(review): the file name above and the TAGS_CHECKER include guard look inherited from                -
-             tags_checker.cc; please confirm.                                                                        -
-                                                                                                                     -
- Requested : -                                                                                                       -
-                                                                                                                     -
- Gaps      : o)                                                                                                      -
-                                                                                                                     -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- Rev. date | Reviser               | Revise's description                                                            -
- - - - - - + - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- ../../....| ........              | ...                                                                             -
---------------------------------------------------------------------------------------------------------------------*/

#ifndef TAGS_CHECKER
#define TAGS_CHECKER

#include <fstream>
#include <iostream>
#include "untagger.h"

#ifdef _USE_NAMESPACES
    namespace Else { 
#endif // _USE_NAMESPACES

int main(int argc, char* argv[]);

int main(int argc, char* argv[]) {
    char c;  // Character read from input corpus...

    if (argc < 4) {
        cerr << "Usage:   untag <corpus input> <untagged corpus output> <anchored tags>\n"
             << "Warning: \n"
             << "         on some system (win32 with CodeWarrior), output file must be\n"
             << "         erased before processing \n"
             << "         (mode 'out|trunc' is incorrect: file not truncated on closing)\n";
        exit (1);
    }
    ifstream* corpus = new ifstream(argv[1],ios::in);
    if (!(*corpus)) {
        cerr << "ERROR: unable to open corpus file: " << argv[1] << '\n';
        exit (2);
    }
    ofstream* target = new ofstream(argv[2],ios::out|ios::trunc);
    if (!(*target)) {
        cerr << "ERROR: unable to create untagged file: " << argv[2] << '\n';
        exit (3);
    }
    ofstream* extract = new ofstream(argv[3], ios::out|ios::trunc);
    if (!(*extract)) {
        cerr << "ERROR: unable to create extracted datas file: " << argv[3] << '\n';
        exit (4);
    }
    cerr << "Untag " << argv[1] << " to " << argv[2] << " and save extracted datas to " << argv[3] << ".\n";

    CharFlowToStream endFlow(*target);
    AnchoredFiltersPool untagger(endFlow, *extract);

    //
    // First of all, we need an quiet extractor for the '<E>' and '</E>' tag
    // 
    QuietTranslateFilter secondStep(string("</E>"), string(), string(), untagger);
    QuietTranslateFilter flowEntry(string("<E>"), string(), string(), secondStep);

    //
    // Then, we can now extract and anchor all other tags
    // 
    untagger.addFilter(new ExtractAnchoredFilter(string("<"), string(">"), string(), 95 ), true);

    //
    // We can now translate '&', '<', ... needed by HTML
    // 

    untagger.addFilter(new TranslateAnchoredFilter(string("&amp;"), string("&")), true);
    untagger.addFilter(new TranslateAnchoredFilter(string("&amp "), string("& ")), true);  // il faudrait specialiser la classe....
    
    untagger.addFilter(new TranslateAnchoredFilter(string("&lt;"), string("<")), true);
    untagger.addFilter(new TranslateAnchoredFilter(string("&lt "), string("< ")), true);

    untagger.addFilter(new TranslateAnchoredFilter(string("&gt;"), string(">")), true);
    untagger.addFilter(new TranslateAnchoredFilter(string("&gt "), string("> ")), true);

    untagger.addFilter(new TranslateAnchoredFilter(string("&quot;"), string("\"")), true);
    untagger.addFilter(new TranslateAnchoredFilter(string("&quot "), string("\" ")), true);

    untagger.addFilter(new TranslateAnchoredFilter(string("&nbsp;"), string(" ")), true);
    untagger.addFilter(new TranslateAnchoredFilter(string("&nbsp "), string("  ")), true);

    untagger.addFilter(new TranslateAnchoredFilter(string("&reg;"), string("[Registered trademark]")), true);
    untagger.addFilter(new TranslateAnchoredFilter(string("&reg "), string("[Registered trademark] ")), true);

    untagger.addFilter(new TranslateAnchoredFilter(string("&copy;"), string("[Copyright]")), true);
    untagger.addFilter(new TranslateAnchoredFilter(string("&copy "), string("[Copyright] ")), true);

    untagger.addFilter(new TranslateAnchoredFilter(string("&endsp;"), string(">")), true);
    untagger.addFilter(new TranslateAnchoredFilter(string("&endsp "), string("> ")), true);

    untagger.addFilter(new TranslateAnchoredFilter(string("&emsp;"), string(">")), true);
    untagger.addFilter(new TranslateAnchoredFilter(string("&emsp "), string("> ")), true);

    untagger.addFilter(new TranslateAnchoredFilter(string("&endash;"), string(">")), true);
    untagger.addFilter(new TranslateAnchoredFilter(string("&endash "), string("> ")), true);

    untagger.addFilter(new TranslateAnchoredFilter(string("&emdash;"), string(">")), true);
    untagger.addFilter(new TranslateAnchoredFilter(string("&emdash "), string("> ")), true);

    //
    // Finally, we can transwlate all unrecognifed forms
    // 
    untagger.addFilter(new TranslateAnchoredFilter(string("&eacute;"), string("é")), true);
    untagger.addFilter(new TranslateAnchoredFilter(string("&eacute "), string("é ")), true);

    untagger.addFilter(new TranslateAnchoredFilter(string("&agrave;"), string("ŕ")), true);
    untagger.addFilter(new TranslateAnchoredFilter(string("&agrave "), string("ŕ ")), true);

    untagger.addFilter(new TranslateAnchoredFilter(string("&oelig;"), string("oe")), true);
    untagger.addFilter(new TranslateAnchoredFilter(string("&oelig "), string("oe ")), true);

    untagger.addFilter(new TranslateAnchoredFilter(string("&OElig;"), string("OE")), true);
    untagger.addFilter(new TranslateAnchoredFilter(string("&OElig "), string("OE ")), true);

    untagger.addFilter(new TranslateAnchoredFilter(string("&aelig;"), string("ae")), true);
    untagger.addFilter(new TranslateAnchoredFilter(string("&aelig "), string("ae ")), true);

    untagger.addFilter(new TranslateAnchoredFilter(string("&AElig;"), string("AE")), true);
    untagger.addFilter(new TranslateAnchoredFilter(string("&AElig "), string("AE ")), true);


    //
    // Now we can proceed...
    // 
    untagger.initialize();
    while (corpus->get(c)) flowEntry <<= c;

    // Manual flush...
    flowEntry.flush();
    secondStep.flush();
    untagger.flush();

    corpus->close();
    target->close();
    extract->close();
    delete corpus;
    delete target;
    delete extract;
    
    exit(0);
}

#ifdef __USE_NAMESPACES
}
#endif // _USE_NAMESPACES

#endif // TAGS_CHECKER

