/*---------------------------------------------------------------------------------------------------------------------
- File      : chk_tagged_corpus.h                                                         Project ELSE, EPFL - DI/LIA -
-                                                                       Evaluation in Language and Speech Engineering -
- Author    : Seydoux Florian   Creation date : 25 Aug 1999                                                           -
- Eulogist  : -                 Approval date : -                  Version: 0.1                                       -
-                                                                                                                     -
- Descript. : Check the conformity of a system's tagged corpus:                                                       -
-             a) Check the file format (strict)                                                                       -
-             b) Check the validity of the tags (must be in tags table)                                               -
-             c) Check the conformity of the stream of tagged tokens with the raw corpus (warn if errors are found).  -
-                                                                                                                     -
- Requested : -                                                                                                       -
-                                                                                                                     -
- Gaps      : o)                                                                                                      -
-                                                                                                                     -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- Rev. date | Reviser               | Revise's description                                                            -
- - - - - - + - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- ../../....| ........              | ...                                                                             -
---------------------------------------------------------------------------------------------------------------------*/

#ifndef CHK_TAGGED_CORPUS_H
#define CHK_TAGGED_CORPUS_H

#include <iostream>
#include <fstream>
#include <algorithm>

#include "globaldef.h"
#include "messages_manager.h"

#include "grace_tools.h"
#include "tags_mapping_table.h"

#ifdef _USE_NAMESPACES
    namespace Else { 
#endif // _USE_NAMESPACES

//
// Checks a system's tagged corpus against the raw corpus it is supposed to annotate.
//
// The two inputs are walked character by character, ignoring every character that 'categorize'
// classifies as a Separator.  Each tag met in the tagged corpus is handed to the tags mapping
// table ('checkSys'); every divergence between the two character streams is reported through
// 'msg' as a warning, after which the checker tries to resynchronize the streams.
//
// The configuration and the mapping table are stored by reference: both must outlive this object.
//
class CheckTaggedCorpus : public GraceTools {
public:
    CheckTaggedCorpus(const UserConfiguration& config, const TagsMappingTable& table)
    : userConfiguration(config),
      translationTable(table),
      startRead(0),
      rawBuffer(),
      rawStream(0)
    { }

    // Name under which this tool identifies itself in messages.
    const string& nickname() const
    {
        return TOOLS_NICKNAME;
    }

    // One-character identifier of this tool, used when emitting warnings.
    const unsigned char& id() const
    {
        return TOOLS_ID;
    }

    //
    // Compare the tagged corpus 'taggedCorpusName' with the raw corpus 'rawCorpusName'.
    //
    // The method is a small automaton whose state is a bit set:
    //   _AllOk            both streams still deliver data, characters are compared one by one;
    //   _TaggedEndReached the tagged corpus is exhausted, the remaining raw data is reported as untagged;
    //   _RawEndReached    the raw corpus is exhausted, the remaining tagged tokens are reported as lost;
    //   _AllEnd           both are exhausted, the loop stops.
    //
    // Throws domain_error if the automaton reaches a state outside of this set.
    //
    void checkSystem(const string& rawCorpusName, const string& taggedCorpusName)
    {
        enum _State { _AllOk = 0, _TaggedEndReached = 1, _RawEndReached = 2, _AllEnd = 3 };

        _State          state(_AllOk);                 // Current state of the automaton
        LemmeAccess     lemmePtr;                      // Entry currently read from the tagged corpus
        conform_string::iterator rawIterator;          // Iterators on single characters: the two streams are
        conform_string::const_iterator taggedIterator; // read in chunks of different sizes.

        ifstream rawfStream(rawCorpusName.c_str(), ios::in);
        rawStream = &rawfStream;
        startRead = 0;

        TaggedCorpusReader taggedReader(userConfiguration, taggedCorpusName);

        if (!(rawfStream))
           msg.seriousError(nickname(), string("Raw corpus file: " + rawCorpusName + " not found!"));
        if (!(taggedReader))
            msg.seriousError(nickname(), string("Tagged corpus file: " + taggedCorpusName + " not found!"));

        // Prime both streams; an input without any data puts the automaton directly in an 'end' state.
        if (lemmePtr = taggedReader.readNextLemme()) taggedIterator = lemmePtr->tok.begin();
        else {
            msg.warning(id(), 0, 1,
                        string("Empty file"),
                        string("The tagged corpus file does not contain relevant data."),
                        Location(&taggedCorpusName, taggedReader.location()));
            state = static_cast<_State>(state | _TaggedEndReached);
        }

        if (readNextRaw()) rawIterator = rawBuffer.begin();
        else {
           msg.warning(id(), 0, 2,
                        string("Empty file"),
                        string("The raw corpus file does not contain relevant data."),
                        Location(&rawCorpusName, rawStream->tellg()));
           state = static_cast<_State>(state | _RawEndReached);
        }

     // while (state != _AllEnd)                // -> Strange error with gcc 2.8.1:
                                                //    "not implemented: initializer contains unrecognized tree code"
        while ( ! (state == _AllEnd)) {
            switch (state) {
                case _AllOk :
                    //
                    // Skip the separators of the tagged token.  The end test comes first, so that an
                    // empty token (or a token made only of separators) never gets dereferenced at end().
                    //
                    while ((taggedIterator != lemmePtr->tok.end())
                           && (categorize(*taggedIterator) == Separator))
                        ++taggedIterator;

                    if (taggedIterator == lemmePtr->tok.end()) {
                        translationTable.checkSys(*this, taggedCorpusName, lemmePtr->tag);
                        if (lemmePtr = taggedReader.readNextLemme()) taggedIterator = lemmePtr->tok.begin();
                        else {
                            //
                            // Drop the part of the raw chunk that was already matched.  'erase'
                            // invalidates the iterator, so it is re-anchored on the new beginning.
                            //
                            rawBuffer.erase(rawBuffer.begin(), rawIterator);
                            rawIterator = rawBuffer.begin();
                            state = static_cast<_State>(state | _TaggedEndReached);
                        }
                        break;
                    }
                    if ( *rawIterator != *taggedIterator ) {
                        //
                        // Error: the token is not correctly extracted from the corpus.
                        //
                        msg.warning(id(), 1, 1,
                                    string("Invalid token"),
                                    string("Waited part of '") + string(rawBuffer.begin(), rawIterator)
                                        + string("[") + string(rawIterator, rawBuffer.end())
                                        + string("]' Optained '") + string(lemmePtr->tok.begin(), taggedIterator)
                                        + string("[") + string(taggedIterator, lemmePtr->tok.end()) + string("]'."),
                                    Location(&taggedCorpusName, lemmePtr->tok.from, lemmePtr->tok.to),
                                    Location(&rawCorpusName, startRead, rawStream->tellg()));

                        /*
                         * Now the resynchronization point must be found...
                         *
                         * First the rest of the (wrong) tagged token is searched near the current position
                         * of the raw corpus (crude criterion: less than 15 characters away); if it is not
                         * found, the next tagged entry is read and searched in the raw corpus, and so on.
                         * Every new attempt restarts from the raw position saved below.
                         */
                        conform_string  bufferClone(rawBuffer);
                        conform_string::size_type rawIteratorDelta(rawIterator-rawBuffer.begin());
                        const VeryLongNatural savedStartRead(startRead);
                        VeryLongNatural mistakeLink(startRead + (rawIterator - rawBuffer.begin()));
                        VeryLongNatural rawLocation(rawStream->tellg());
                        VeryLongNatural mistakePosition(lemmePtr->tok.from + (taggedIterator - lemmePtr->tok.begin()));
                        VeryLongNatural rawSkipped(0);
                        VeryLongNatural taggedSkipped(0);
                        bool            synchroFail(true);

                        if ( (synchroFail = ! resynchronize(string(taggedIterator, lemmePtr->tok.end()),
                                                            rawIterator,
                                                            rawSkipped,
                                                            15)) ) {
                            taggedSkipped += (lemmePtr->tok.end() - taggedIterator);
                            while (synchroFail && (lemmePtr = taggedReader.readNextLemme())) {
                                //
                                // Rewind the raw side.  The error flags are cleared BEFORE seeking:
                                // seekg() does nothing on a stream whose failbit is set, which is the
                                // case when the previous attempt ran into the end of the file.
                                //
                                rawStream->clear();
                                rawStream->seekg(rawLocation);
                                startRead = savedStartRead;
                                rawBuffer = bufferClone;
                                rawIterator = rawBuffer.begin() + rawIteratorDelta;
                                taggedIterator = lemmePtr->tok.begin();
                                if ( (synchroFail = ! resynchronize(lemmePtr->tok, rawIterator, rawSkipped)) )
                                    taggedSkipped += lemmePtr->tok.size();
                            }
                        }
                        if (synchroFail) {
                           //
                           // No tagged token could be found again: the tagged corpus is exhausted.
                           // Restore the raw side so that the remaining raw data gets reported.
                           //
                           rawBuffer = bufferClone;
                           rawIterator = rawBuffer.begin() + rawIteratorDelta;
                           rawStream->clear();
                           rawStream->seekg(rawLocation);
                           startRead = savedStartRead;
                           state = static_cast<_State>(state | _TaggedEndReached);
                           break;
                        }
                        ostringstream skippedStr;
                        skippedStr << "A chunk of " << taggedSkipped << '/' << rawSkipped
                                   << " characters have been skip to resynchronize the streams.";
                        msg.warning(id(), 1, 2,
                                    string("Skipping chunk of data"),
                                    skippedStr.str(),
                                    Location(&taggedCorpusName,
                                             mistakePosition,
                                             lemmePtr->tok.from + (taggedIterator - lemmePtr->tok.begin())),
                                    Location(&rawCorpusName,
                                             mistakeLink,
                                             startRead));
                        //
                        // Move both iterators onto the last character of the synchronized piece; the
                        // common code below then steps over it as over any other matching character.
                        //
                        if (taggedSkipped) {
                            rawIterator += lemmePtr->tok.size() - 1;
                            taggedIterator += lemmePtr->tok.size() - 1;
                        } else {
                            rawIterator += (lemmePtr->tok.end() - taggedIterator) - 1;
                            taggedIterator += (lemmePtr->tok.end() - taggedIterator) - 1;
                        }
                    }

                    //
                    // (A first version of the resynchronization only searched the next tagged token in
                    //  the rest of the raw corpus; it failed when that next token was wrong as well.)
                    //

                    if (++taggedIterator == lemmePtr->tok.end()) {
                        //
                        // The entire token was checked, the tag must now be checked...
                        //
                        translationTable.checkSys(*this, taggedCorpusName, lemmePtr->tag);

                        if (lemmePtr = taggedReader.readNextLemme()) taggedIterator = lemmePtr->tok.begin();
                        else state = static_cast<_State>(state | _TaggedEndReached);
                    }
                    if (++rawIterator == rawBuffer.end()) {
                        if (readNextRaw()) rawIterator = rawBuffer.begin();
                        else state = static_cast<_State>(state | _RawEndReached);
                    }
                    break;


                case _TaggedEndReached :
                    // Tagged corpus exhausted: report each remaining raw chunk as untagged.
                    msg.warning(id(), 1, 2,
                                string("Untagged data"),
                                string("A piece of raw corpus '") + string(rawBuffer.begin(), rawIterator)
                                        + string("[") + string(rawIterator, rawBuffer.end())
                                        + string("]' seems not to be tagged."),
                                Location(&taggedCorpusName, taggedReader.location()),
                                Location(&rawCorpusName, startRead, rawStream->tellg()));

                    if (!readNextRaw())
                        state = static_cast<_State>(state | _RawEndReached);
                    rawIterator = rawBuffer.begin();
                    break;

                case _RawEndReached :
                    // Raw corpus exhausted: report each remaining tagged token as not found.
                    msg.warning(id(), 1, 3,
                                string("Synchro Lost or Invalid token"),
                                string("Can't found the tagged token '")
                                    + string(lemmePtr->tok.begin(), taggedIterator)
                                    + string("[") + string(taggedIterator, lemmePtr->tok.end())
                                    + string("]' in the raw corpus."),
                                Location(&taggedCorpusName, lemmePtr->tok.from, lemmePtr->tok.to),
                                Location(&rawCorpusName, rawStream->tellg()));
                    // The iterator is only re-anchored when an entry was actually read: 'lemmePtr'
                    // is null once the tagged corpus is exhausted and must not be dereferenced.
                    if (lemmePtr = taggedReader.readNextLemme())
                        taggedIterator = lemmePtr->tok.begin();
                    else
                        state = static_cast<_State>(state | _TaggedEndReached);
                    break;

                case _AllEnd : // never entered: the loop condition stops before
                    msg.message(nickname(), "Corpus checker: end of all flow reached.", STANDARD_MSG);
                    break;

                default :
                    throw domain_error(nickname() + string(": invalid internal state"));
                    break;
            }
        }
        rawfStream.close();
        rawStream = 0;
    }

private:

    //
    // Read the next chunk of the raw corpus into 'rawBuffer': the leading separators are skipped,
    // then characters are accumulated up to the next separator (which is consumed) or the end of
    // the stream.  'startRead' receives the stream position before the read.
    // Returns false when no non-separator character is left in the stream.
    //
    bool readNextRaw()
    {
        char chr;
        startRead = rawStream->tellg();
        rawBuffer.clear();
        while (rawStream->get(chr)) {
            if (categorize(chr) != Separator) {
               do rawBuffer.push_back(chr);
               while (rawStream->get(chr) && (categorize(chr) != Separator));
               msg.message(nickname(),
                           string("Reading from raw corpus:") + rawBuffer,
                           DEBUG_MSG);
               return true;
            }
        }
        msg.message(nickname(), "End of raw corpus reached.", DETAIL_MSG);
        return false;
    }

    //
    // Try to resynchronize the two streams after an error occurred, by searching 'searched' in the
    // raw corpus from 'pos' (an iterator into 'rawBuffer') onwards.  The streams are assumed to be
    // synchronized as soon as 'searched' is found, so 'searched' itself must be right for this to work.
    //
    //   searched  characters to find; separators of the raw corpus are ignored while matching
    //   pos       in: where to start in 'rawBuffer'; out: beginning of the match on success,
    //             beginning of 'rawBuffer' on failure
    //   skip      out: count of raw characters accumulated while searching
    //   maxSkip   when not 0, give up once 'skip' exceeds this value
    //
    // Returns true if 'searched' was found.  'rawBuffer', 'startRead' and the position of the raw
    // stream are modified in both cases; the caller restores them if it wants to retry.
    //
    bool resynchronize(const string& searched, conform_string::iterator& pos, VeryLongNatural& skip,
                       const VeryLongNatural& maxSkip = 0)
    {
        conform_string::size_type confirmedLocation;
        conform_string::size_type possibleLocation(pos-rawBuffer.begin());
        Natural initialDelta(possibleLocation);
        skip = 0;
        for (;;) {
            // Look for the first searched character, reading further raw chunks as needed.
            while ((possibleLocation = rawBuffer.find(searched[0], possibleLocation)) == rawBuffer.npos) {
                skip += rawBuffer.size();
                if ( (! readNextRaw()) || ((maxSkip) && (skip > maxSkip)) ) {
                    pos = rawBuffer.begin();
                    msg.message(nickname(), string("Synchronization with '") + searched + ("' failed !"), DETAIL_MSG);
                    return false;
                } else possibleLocation = 0;
            }
            // Extend the buffer (separators dropped) until it can hold 'searched' from the candidate on.
            int d = searched.size() - (rawBuffer.size() - possibleLocation);
            while (d > 0) {
                char chr;
                // Stop at the end of the raw corpus: the comparison below then fails on the too
                // short buffer and the search ends through the failing readNextRaw() above.
                if (!rawStream->get(chr)) break;
                if (categorize(chr) != Separator) {
                    rawBuffer.push_back(chr);
                    --d;
                }
            }
            if ((confirmedLocation = rawBuffer.find(searched, possibleLocation)) != rawBuffer.npos) {
                skip += confirmedLocation;
                skip -= initialDelta;
                pos = rawBuffer.begin() + confirmedLocation;
                msg.message(nickname(), string("Synchronization with '") + searched + ("' succeeded !"), DETAIL_MSG);
                return true;
            }
            // Wrong candidate: drop everything up to and including it, then search again.
            rawBuffer.erase(0, possibleLocation+1);
            possibleLocation = 0;
        }
    }

private:
    static const string         TOOLS_NICKNAME;
    static const unsigned char  TOOLS_ID;

    const UserConfiguration&  userConfiguration;  // referenced, not copied
    const TagsMappingTable&   translationTable;   // referenced, not copied

    VeryLongNatural    startRead;   // raw stream position before the last readNextRaw()
    conform_string     rawBuffer;   // current chunk of the raw corpus, separators excluded
    istream*           rawStream;   // raw corpus being read; only set while checkSystem() runs
};


// Identification of the tool in the messages it emits.
// NOTE(review): these are non-inline definitions located in a header; if the header is included
// from more than one translation unit the linker will see duplicates - confirm how it is used.
const string CheckTaggedCorpus::TOOLS_NICKNAME("TaggedCorpusChecker");
const unsigned char CheckTaggedCorpus::TOOLS_ID('C');

// Closes the namespace opened under the same macro before the class declaration
// (the macro name was misspelled here, which left the namespace unclosed).
#ifdef _USE_NAMESPACES
}
#endif // _USE_NAMESPACES

#endif // CHK_TAGGED_CORPUS_H



