/*---------------------------------------------------------------------------------------------------------------------
- File      : retokenizer.cc                                                              Project ELSE, EPFL - DI/LIA -
-                                                                       Evaluation in Language and Speech Engineering -
- Author    : Seydoux Florian   Creation date : 03 August 1999                                                        -
- Eulogist  : -                 Approval date : -                  Version: 0.1                                       -
-                                                                                                                     -
- Descript. : Retokenize (with Grace's rules) the tagged input stream/file.                                           -
-             (This work seems to have been done already by LIMSI, but was never communicated to me... so...)         -
-             Multi-tokens (from Grace's tokenization point of view) are split, and the tag is duplicated,            -
-             with an indication of the retokenization.                                                               -
-             Retokenization is not trivial, because it must be consistent with the existing tokenization:            -
-             Tok1  A/1.2 | B/1.3  ->  Tok1   A/1.3 | B/1.4   ---> Tok1   A/1.3|B/1.4//1.2/1.3(1)                     -
-             Tok2  A/2.2 | B/2.3  ->  Tok2a  A/2.3 | B/2.4   ---> Tok2a  A/2.3|B/2.4//2.2/2.3(2)                     -
-                                  ->  Tok2b  A/3.3 | B/3.4   ---> Tok2b  A/3.3|B/3.4//2.2/2.3(2)                     -
-             Tok3  B/3.3          ->  Tok3   B/4.4           ---> Tok3   B/4.4//3.3(3)                               -
-                                                                                                                     -
-             To allow reconstruction of the old tokenization, original segmentations are kept in an extended field,  -
-             marked by the '//' symbol at the end of the tag. In 'patch' mode, locations are also saved at the end   -
-             of the composite tag field, wrapped in parentheses.                                                     -
-                                                                                                                     -
- Requested : -                                                                                                       -
-                                                                                                                     -
- Gaps      : o)                                                                                                      -
-                                                                                                                     -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- Rev. date | Reviser               | Revise's description                                                            -
- - - - - - + - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- ../../....| ........              | ...                                                                             -
---------------------------------------------------------------------------------------------------------------------*/
//
// Version 0.02 -> comments are always removed, and the output is patched for the LIMSI realigner (not configurable).
// 
#ifndef RETOKENIZER
#define RETOKENIZER

#include <deque>
#include <vector>
#include <fstream>
#include <iostream>
#include <algorithm>
#include "globaldef.h"
#include "untagger.h"
#include "grace_tools.h"
#include "tags_mapping_table.h"

#ifdef __STL_USE_NAMESPACES
    namespace Else { 
#endif // __STL_USE_NAMESPACES

// Forward declarations of the free functions defined at the end of this file.
void dumpHelp();
void checkParams(Parameters&);
int main(int argc, char* argv[]);

// File names taken from the command line; they are assigned by checkParams().
conform_string configFile, inputFile, outputFile;

// Ordered list of the pieces obtained when one input token is re-split.
typedef vector<conform_string> StringCollection;
//
// One buffered input token, together with everything needed to rewrite its
// segmentation marks once enough neighbouring tokens have been read.
//
struct TokenInfos 
{
    // Value of the token counter at which this entry can be processed.
    // It is compared against a VeryLongNatural counter and initialised from a
    // VeryLongNatural, so it is stored with that same width (storing it in a
    // ShortNatural silently narrowed the value).
    VeryLongNatural    jobComputeTime;  // schedule computation
    ShortNatural       forward;         // Fwd window size
    ShortNatural       backward;        // bck window size (needed history)
    StringCollection   split;           // Retokenization
    AlternatedTagsSet  tag;             // tag with tokenization
    VeryLongNatural    location;        // token rank.

    // Plain member-wise constructor: every argument is copied into the member
    // of the same name ('time' goes into jobComputeTime).
    TokenInfos(const VeryLongNatural time,
               const ShortNatural forward,
               const ShortNatural backward,
               const StringCollection& split,
               const AlternatedTagsSet& tag,
               const VeryLongNatural location)
    : jobComputeTime(time),
      forward(forward),
      backward(backward),
      split(split),
      tag(tag),
      location(location)
    { }
};

//
// Re-splits every token of a tagged flow with the Grace tokenizer and rewrites
// the segmentation marks of its tags so that they stay consistent with the new
// token count.
//
// Each incoming token is buffered until all the tokens referenced by its
// segmentation marks (backward and forward) are available; the rewritten
// pieces are then sent to the LemmesFlowListener given at construction.
// The pieces themselves are collected through computeFlowItem(const string&)
// (presumably called back by the GraceTokenizer built on *this -- to confirm).
//
class GraceReTokenizer: public GraceTools, public StringFlowListener {
public:
    //
    // writer    : receiver of the retokenized (token, tag) pairs.
    // removeCmt : stored, but not read by the code currently compiled in
    //             (the comment handling in computeLemme() is commented out).
    //
    GraceReTokenizer(LemmesFlowListener& writer, const bool removeCmt = true)
    : removeCmt(removeCmt),
      tokenizer(*this),
      writer(writer),
      tokenSplit(),
      tokenId(0),
      location(0),
      buffer(),
      uncomputed(0)
    { }

    // Name used as the sender of the messages emitted by this tool.
    const string& nickname() const
    {
        return TOOLS_NICKNAME;
    }
    
    // One-character identifier used when reporting errors.
    const unsigned char& id() const
    {
        return TOOLS_ID;
    }

    //
    // Handles one item read from the tagged corpus. Only items that are
    // lemmes (dynamic_cast to LemmeAccess succeeds) are processed; any other
    // item only increments the location counter.
    //
    void computeLemme(const LemmeBaseAccess readedLba)
    {
        // Not ideal: the count is off if a comment spans several lines, or if
        // the reference SEP2 separator is not a line break.
        ++location;
// 
// Comments: they should be inserted into the queue as well.
// 
//        if (const SingleCommentAccess singleCmtPtr = dynamic_cast<SingleCommentAccess>(readedLba)) {
//            if (!removeCmt) writer.computeFlowItem(singleCmtPtr->cmt);
//        } else

        if (const LemmeAccess lemmePtr = dynamic_cast<LemmeAccess>(readedLba)) {
            int maxFwd(0);
            int maxBck(0);
            
            ++tokenId;

            // 
            // (1) Analysing the token, and split it with Grace's rules.
            //     The resulting pieces end up in tokenSplit.
            // 
            tokenizer.computeString(lemmePtr->tok);
            tokenizer <<= ' ';    // reinsert a token separator

            //
            // (2) Creating the structured tag representation
            // 
            AlternatedTagsSet tag(lemmePtr->tag);

            //
            // (3) Analysing tokenization: largest number of following and of
            //     preceding tokens referenced by any alternative of the tag.
            // 
            for (AlternatedConstIterator atom = tag.begin(); atom != tag.end(); ++atom)
            {
                maxFwd = max(maxFwd, atom->segmentation.numberOfSegments - atom->segmentation.segmentRank);
                maxBck = max(maxBck, atom->segmentation.segmentRank - 1);
            }

            //
            // (4) Some tests of validity (monotonic growing).
            //     The backward test is written as 'maxBck > backward + 1' so that
            //     no negative intermediate value is ever compared with an
            //     unsigned one. Errors are reported but processing continues.
            // 
            if ((buffer.empty() && (maxBck > 0)) || ( ! buffer.empty() && (maxBck > buffer.back().backward + 1)))
            {
                msg.error(id(), 1, 1,
                          string("Invalid sequence"),
                          string("The tag of '") + lemmePtr->tok + string("' [") + lemmePtr->tag
                              + string("] has tokenization mark not in monotonic growth order."),
                          Location(&inputFile, lemmePtr->tag.from, lemmePtr->tag.to));
                msg.message(nickname(),
                            "Due to invalid segmentation, it's possible than Retokenizer 'go to the stawberry'",
                            STANDARD_MSG);
            }
            if ( !buffer.empty() && (maxFwd+tokenId < buffer.back().jobComputeTime))
            {
                msg.error(id(), 1, 2,
                          string("Invalid sequence"),
                          string("Gap detected in tokenization marks of '") + lemmePtr->tok + string("' [") 
                              + lemmePtr->tag + string("]."),
                          Location(&inputFile, lemmePtr->tag.from, lemmePtr->tag.to));
                msg.message(nickname(),
                            "Due to invalid segmentation, it's possible than Retokenizer 'go to the stawberry'",
                            STANDARD_MSG);
            }

            //
            // (5) Bufferize the case: the entry becomes computable once maxFwd
            //     further tokens have been read.
            // 
            buffer.push_back(TokenInfos(maxFwd+tokenId, maxFwd, maxBck, tokenSplit, tag, location));

            //
            // (6) Compute computable token (for which window is sufficiently big)
            //
            while ((uncomputed < buffer.size()) && (buffer[uncomputed].jobComputeTime <= tokenId))
            {
                for (ShortNatural piece = 1; piece <= buffer[uncomputed].split.size(); ++piece)
                {
                    ostringstream patch;
                    AlternatedTagsSet newtag;
                    TokenInfos& item(buffer[uncomputed]);
                    assert(item.jobComputeTime == tokenId);

                    bool spclMark(false);

                    patch << SEGMENTATION_MARK;
                    for (AlternatedConstIterator atom = item.tag.begin(); atom != item.tag.end(); ++atom)
                    {
                        const Segmentation& seg(atom->segmentation);

                        ContractedTag newAtom(*atom);
                        newAtom.segmentation.numberOfSegments = 0;
                        newAtom.segmentation.segmentRank = piece;

                        // Range of buffer entries covered by this alternative.
                        // It is computed with signed arithmetic and clamped to
                        // the buffer: with an invalid segmentation (reported
                        // above, not fatal) the raw bounds can fall outside
                        // the buffer, and an unsigned index would wrap around.
                        long first = static_cast<long>(uncomputed) - (static_cast<long>(seg.segmentRank) - 1);
                        long last  = static_cast<long>(uncomputed)
                                   + (static_cast<long>(seg.numberOfSegments) - static_cast<long>(seg.segmentRank));
                        const long here = static_cast<long>(uncomputed);
                        const long size = static_cast<long>(buffer.size());
                        if (first < 0)    first = 0;
                        if (last >= size) last = size - 1;

                        // (a) calculate new splitting infos
                        for (long i = first; i <= last; ++i)
                            newAtom.segmentation.numberOfSegments += buffer[i].split.size();
                        for (long i = first; i < here; ++i)
                            newAtom.segmentation.segmentRank += buffer[i].split.size();

                        // (b) saving the original segmentation.
                        if (newAtom.segmentation != seg)
                        {
                            patch << SEGMENTATION_MARK << seg.segmentRank 
                                  << SEGMENTATION_SUBMARK << seg.numberOfSegments;
                            spclMark = true;
                        }

                        // (c) finalizing the new tag
                        newAtom.segmentation.specific = (newAtom.segmentation.numberOfSegments > 1);
                        newtag.insert(newAtom);
                    }
                    if (!spclMark) patch << SEGMENTATION_MARK;
                    patch << '(' << item.location << ')';
                    writer.computeFlowItem(item.split[piece-1], newtag.str()+patch.str());
                }
                ++uncomputed;
            }

            //
            // (7) Remove old entries, keeping in front of the first uncomputed
            //     entry the history it asks for. The amount kept is capped by
            //     what is actually available, so that the counter never wraps
            //     and pop_front() is never called on an empty buffer.
            //
            ShortNatural nbRemove(uncomputed);
            if (uncomputed < buffer.size())
            {
                const ShortNatural keep(buffer[uncomputed].backward);
                nbRemove = (keep < uncomputed) ? (uncomputed - keep) : 0;
            }
            uncomputed -= nbRemove;
            for (ShortNatural i = 0; i < nbRemove; ++i) buffer.pop_front();
            tokenSplit.clear();
        }
    }

    //
    // Writes out the entries still waiting for following tokens, unchanged and
    // flagged with an error text, then empties the buffer.
    //
    void flush()
    {
        if (!buffer.empty())
        {
            msg.message(nickname(), "Segmentation indicate that some token are missing. "
                        "Writing with error message in comment field the uncorrectly retokenized forms.");
            for (Natural i = uncomputed; i < buffer.size(); ++i)
            for (ShortNatural piece = 0; piece < buffer[i].split.size(); ++piece)
                writer.computeFlowItem(buffer[i].split[piece], buffer[i].tag.str(),
                    string("ERROR(") +nickname() + string("): Due to missed token, segmentation is wrong !"));
        }
        buffer.clear();
        // The index must follow the buffer: left as is, it would point past the
        // end of the emptied buffer on the next computeLemme() call.
        uncomputed = 0;
    }
    

    // Collects one piece of the token currently being split.
    void computeFlowItem(const string& data)
    {
        tokenSplit.push_back(data);
    }

private:
    typedef deque<TokenInfos> HistoryBuffer;

    static const string         TOOLS_NICKNAME;
    static const unsigned char  TOOLS_ID;
    
    const bool               removeCmt;     // see constructor
    GraceTokenizer           tokenizer;     // built on *this
    LemmesFlowListener&      writer;        // receiver of the output
    StringCollection         tokenSplit;    // pieces of the token being read
    VeryLongNatural          tokenId;       // number of lemmes seen so far
    VeryLongNatural          location;      // number of items seen so far
    HistoryBuffer            buffer;        // history + tokens not yet written
    ShortNatural             uncomputed;    // index in buffer of the first token not yet written
};
// Values returned by GraceReTokenizer::nickname() and GraceReTokenizer::id().
const string GraceReTokenizer::TOOLS_NICKNAME("Retokenizer");
const unsigned char GraceReTokenizer::TOOLS_ID('R');

//
// Prints the usage text on cerr, followed by the description of the message
// switches (msg.switchesFormat), then terminates the program with status 0.
//
// The usage line lists '-d' so that it agrees with the option list below and
// with the switches accepted by checkParams() (it used to advertise '-o').
//
void dumpHelp()
{
    cerr << "Description: (Evaluation task grp)\n"
            "  Retokenize (with Grace's rules) the tagged input stream/file.\n\n"
            "Usage: retokenize <config> <input> <output> [-c|-d|-patch]\n"
            "  <config>  : the configuration file where the format of the tagged file is define (RefConfiguration).\n"
            "  <input>   : the tagged file who must been retokenized.\n"
            "  <output>  : the retokenized tagged file.\n\n"
            "Options:\n"
            "  -c             : remove all comments (not currently working)\n"
            "  -d             : dont keep old tokenization (not currently working)\n"
            "  -patch         : remove comments & add original location (always)\n";
    msg.switchesFormat(cerr);
    exit(0);
}

//
// Validates the command line and stores the three leading parameters into
// configFile, inputFile and outputFile (in that order), removing them from
// 'params'.
//
// dumpHelp() is called when fewer than three parameters are present or when
// the first one is a help switch. The remaining parameters are handed to
// msg.checkParams(); whatever is left afterwards must be one of -c, -d or
// -patch, otherwise a diagnostic is printed on cerr (processing goes on).
//
void checkParams(Parameters& params)
{
    static const char* const HELP_SWITCHES[] = { "?", "-?", "-h", "-help", "--help" };
    const unsigned nbHelpSwitches = sizeof(HELP_SWITCHES) / sizeof(HELP_SWITCHES[0]);

    // The first parameter is only inspected when there are enough parameters.
    bool helpWanted = (params.size() < 3);
    for (unsigned k = 0; !helpWanted && (k < nbHelpSwitches); ++k)
        helpWanted = (strcmp(params.front(), HELP_SWITCHES[k]) == 0);
    if (helpWanted)
        dumpHelp();

    // Positional parameters, consumed one after the other.
    configFile = params.front();
    params.erase(params.begin());
    inputFile = params.front();
    params.erase(params.begin());
    outputFile = params.front();
    params.erase(params.begin());

    msg.checkParams(params);

    // The known switches are accepted and have no effect here.
    for (Parameters::const_iterator arg = params.begin(); arg != params.end(); ++arg)
    {
        const bool known = (strcmp(*arg, "-c") == 0)
                        || (strcmp(*arg, "-d") == 0)
                        || (strcmp(*arg, "-patch") == 0);
        if (!known)
            cerr << "Invalid argument: " << *arg << "\nTry 'retokenize --help' for more informations!\n";
    }
}


int main(int argc, char* argv[])
{
    try {
    Parameters params(argc, argv);
    UserConfiguration config;

    checkParams(params);
    if (!config.readFrom(configFile, config.Unknown, false))
        msg.seriousError("Retokenizer",
                         "Configuration file for the input format is not readeable, or it has too serious error(s)");

    TaggedCorpusReader tagsReader(config, inputFile);
    if (!tagsReader)
        msg.seriousError("Retokenizer", string("Unable to open the input file '") + inputFile + string("'."));
        
    ofstream output(outputFile.c_str());
    if (!output)
        msg.seriousError("Retokenizer", string("Unable to create the output file '") + outputFile + string("'."));

    TextFlowToStream   endFlow(output);
    TaggedCorpusWriter writer(config, endFlow);
    GraceReTokenizer   retok(writer, true);
  
    while (tagsReader)
        retok.computeLemme(tagsReader.readNextLemme());
    retok.flush();
    output.close();
    } catch (exception& e) { cerr << "Exception occurs: " << e.what() << endl; abort(); }
}

#ifdef __STL_USE_NAMESPACES
}
#endif // __STL_USE_NAMESPACES

#endif // RETOKENIZER

