
#include "chk_tagged_corpus.h"

// When _USE_NAMESPACES is defined, the definitions that follow are placed in namespace Else.
#ifdef _USE_NAMESPACES
    namespace Else { 
#endif // _USE_NAMESPACES

//
// Build a checker from the user configuration and the tags mapping table.
//
// The read offset starts at 0, the raw buffer starts empty and no raw stream
// is attached: rawStream stays null until checkSystem() points it at the raw
// corpus file it opens.
//
CheckTaggedCorpus::CheckTaggedCorpus(const UserConfiguration& config, const TagsMappingTable& table)
: userConfiguration(config),
  translationTable(table),
  startRead(0),
  rawBuffer(),
  rawStream(0)
{ }
    
//
// Short name of this tool, used as the sender of messages and errors.
//
// NOTE(review): the result is returned by reference, so TOOLS_NICKNAME is
// presumably a string object with static lifetime -- confirm in the header.
//
const string&
CheckTaggedCorpus::nickname() const
{
	return TOOLS_NICKNAME;
}
    
//
// Identifier of this tool, passed as first argument of msg.warning().
//
// NOTE(review): the result is returned by reference, so TOOLS_ID is presumably
// an unsigned char object with static lifetime -- confirm in the header.
//
const unsigned char&
CheckTaggedCorpus::id() const
{
	return TOOLS_ID;
}

void
CheckTaggedCorpus::checkSystem(const string& rawCorpusName, const string& taggedCorpusName)
{
	enum _State { _AllOk = 0, _TaggedEndReached = 1, _RawEndReached = 2, _AllEnd = 3 };

	_State          state(_AllOk);                 // AFD current state
	LemmeAccess     lemmePtr;                      // Readed lemme in tagged corpus
	conform_string::iterator rawIterator;          // Iterator on each characters, to test the equality
	conform_string::const_iterator taggedIterator; // of two streams, read with different chunk's size.

	ifstream rawfStream(rawCorpusName.c_str(), ios::in);
	rawStream = &rawfStream;
	startRead = 0;
        
	TaggedCorpusReader taggedReader(userConfiguration, taggedCorpusName);

	if (!(rawfStream))
		msg.seriousError(nickname(), string("Raw corpus file: " + rawCorpusName + " not found!"));
	if (!(taggedReader))
		msg.seriousError(nickname(), string("Tagged corpus file: " + taggedCorpusName + " not found!"));

	if (lemmePtr = taggedReader.readNextLemme()) taggedIterator = lemmePtr->tok.begin();
	else
	{
		msg.warning(id(), 0, 1,
		            string("Empty file"),
		            string("The tagged corpus file does not contain relevant data."),
		            Location(&taggedCorpusName, taggedReader.location()));
		state = static_cast<_State>(state | _TaggedEndReached);
    }

	if (readNextRaw()) rawIterator = rawBuffer.begin();
	else
	{
		msg.warning(id(), 0, 2,
		            string("Empty file"),
		            string("The raw corpus file does not contain relevant data."),
		            Location(&rawCorpusName, rawStream->tellg()));
		state = static_cast<_State>(state | _RawEndReached);
	}
        
	// while (state != _AllEnd)                // -> Strange error with gcc 2.8.1:
                                               //    "not implemented: initializer contains unrecognized tree code"
	while ( ! (state == _AllEnd)) 
	{
		switch (state)
		{
			case _AllOk :
			{
				while ((categorize(*taggedIterator) == Separator) && (++taggedIterator != lemmePtr->tok.end()));
				if (taggedIterator == lemmePtr->tok.end()) 
				{
					translationTable.checkSys(*this, taggedCorpusName, lemmePtr->tag);
					if (lemmePtr = taggedReader.readNextLemme()) taggedIterator = lemmePtr->tok.begin();
					else 
					{
						rawBuffer.erase(rawBuffer.begin(), rawIterator);
						state = static_cast<_State>(state | _TaggedEndReached);
					}
					break;
				}
				if ( *rawIterator != *taggedIterator ) 
				{
					// 
					// Error: the token is not correctly extracted from the corpus.
					// 
					msg.warning(id(), 1, 1,
					            string("Invalid token"),
					            string("Waited part of '") + string(rawBuffer.begin(), rawIterator) 
					                    + string("[") + string(rawIterator, rawBuffer.end())
					                    + string("]' Optained '") + string(lemmePtr->tok.begin(), taggedIterator)
					                    + string("[") + string(taggedIterator, lemmePtr->tok.end()) + string("]'."),
					            Location(&taggedCorpusName, lemmePtr->tok.from, lemmePtr->tok.to),
					            Location(&rawCorpusName, startRead, rawStream->tellg()));

					//
					// Now we must found the resynchronisation point...
					//
					// We search if the rest of the (wrong) tagged token exist nearly (criterion (bad): < 15 chars)
					// the current position in the raw corpus; if not, we take the next tagged entry, and search 
					// the token into the raw corpus, and so on.
					//  
					conform_string  bufferClone(rawBuffer);
					conform_string::size_type rawIteratorDelta(rawIterator-rawBuffer.begin());
					VeryLongNatural mistakeLink(startRead + (rawIterator - rawBuffer.begin()));
					VeryLongNatural rawLocation(rawStream->tellg());
					VeryLongNatural mistakePosition(lemmePtr->tok.from + (taggedIterator - lemmePtr->tok.begin()));
					VeryLongNatural rawSkipped(0);
					VeryLongNatural taggedSkipped(0);
					bool            synchroFail(true);

					if ( (synchroFail = ! resynchronize(string(taggedIterator, lemmePtr->tok.end()),
					                                    rawIterator,
					                                    rawSkipped,
                                                        15)) )
					{
						taggedSkipped += (lemmePtr->tok.end() - taggedIterator);
						while (synchroFail && (lemmePtr = taggedReader.readNextLemme())) 
						{
							rawStream->seekg(rawLocation);
							rawStream->clear();
							rawBuffer = bufferClone;
							rawIterator = rawBuffer.begin() + rawIteratorDelta;
							taggedIterator = lemmePtr->tok.begin();
							if ( (synchroFail = ! resynchronize(lemmePtr->tok, rawIterator, rawSkipped)) )
								taggedSkipped += lemmePtr->tok.size();
						}
					}
					if (synchroFail) 
					{
						rawBuffer = bufferClone;
						rawIterator = rawBuffer.begin() + rawIteratorDelta;
						rawStream->seekg(rawLocation);
						rawStream->clear();
						state = static_cast<_State>(state | _TaggedEndReached);
						break;
					} // else {
					ostringstream skippedStr;
					skippedStr << "A chunk of " << taggedSkipped << '/' << rawSkipped
					           << " characters have been skip to resynchronize the streams.";
					msg.warning(id(), 1, 2,
					            string("Skipping chunk of data"),
					            skippedStr.str(),
					            Location(&taggedCorpusName,
					                     mistakePosition,
					                     lemmePtr->tok.from + (taggedIterator - lemmePtr->tok.begin())),
					            Location(&rawCorpusName, mistakeLink, startRead));
					// 
					// Now we can move the iterators to the end of synchronized piece.
					//
					if (taggedSkipped) 
					{ 
						rawIterator += lemmePtr->tok.size() - 1;
						taggedIterator += lemmePtr->tok.size() - 1;
					}
					else 
					{
						rawIterator += (lemmePtr->tok.end() - taggedIterator) - 1;
						taggedIterator += (lemmePtr->tok.end() - taggedIterator) - 1; 
					}
				}
				if (++taggedIterator == lemmePtr->tok.end()) 
				{
					//
					// The entire token was checked, we must now check the tag...
					// 
					translationTable.checkSys(*this, taggedCorpusName, lemmePtr->tag);
					if (lemmePtr = taggedReader.readNextLemme()) taggedIterator = lemmePtr->tok.begin();
					else state = static_cast<_State>(state | _TaggedEndReached);
				}
				if (++rawIterator == rawBuffer.end()) 
				{
					if (readNextRaw()) rawIterator = rawBuffer.begin();
					else state = static_cast<_State>(state | _RawEndReached);
				}
				break;
			}

			case _TaggedEndReached :
			{
				msg.warning(id(), 1, 2,
				            string("Untagged data"),
				            sring("A piece of raw corpus '") + string(rawBuffer.begin(), rawIterator) 
				                  + string("[") + string(rawIterator, rawBuffer.end())
				                  + string("]' seems not to be tagged."),
				            Location(&taggedCorpusName, taggedReader.location()),
				            Location(&rawCorpusName, startRead, rawStream->tellg()));

				if (!readNextRaw()) static_cast<_State>(state | _RawEndReached);
				rawIterator = rawBuffer.begin();
				break;
			}

			case _RawEndReached :
			{
				msg.warning(id(), 1, 3,
				            string("Synchro Lost or Invalid token"),
				            string("Can't found the tagged token '")
				                    + string(lemmePtr->tok.begin(), taggedIterator)
				                    + string("[") + string(taggedIterator, lemmePtr->tok.end())
				                    + string("]' in the raw corpus."),
				            Location(&taggedCorpusName, lemmePtr->tok.from, lemmePtr->tok.to),
				            Location(&rawCorpusName, rawStream->tellg()));
				if (! (lemmePtr = taggedReader.readNextLemme()))
					state = static_cast<_State>(state | _TaggedEndReached);
				taggedIterator = lemmePtr->tok.begin();
				break;
			}

			case _AllEnd : // must be skipped...
			{
				msg.message(nickname(), "Corpus checker: end of all flow reached.", STANDARD_MSG);
				break;
			}

			default :
			{
				throw domain_error(nickname() + string(": invalid internal state"));
				break;
			}
		}
	}
	rawfStream.close();
	rawStream = 0;
}

//
// Load the next chunk of the raw corpus into rawBuffer.
//
// startRead receives the stream position found on entry and rawBuffer is
// emptied. Separator characters are then skipped, and the following run of
// non-separator characters is stored in rawBuffer; the character that ends
// the run (if any) is consumed as well.
//
// Returns true when a chunk was read, false when the stream ran out before
// any non-separator character was found (rawBuffer is then left empty).
//
bool
CheckTaggedCorpus::readNextRaw()
{
	startRead = rawStream->tellg();
	rawBuffer.clear();

	char current;

	// Phase 1: look for the first character that is not a separator.
	for (;;)
	{
		if (!rawStream->get(current))
		{
			msg.message(nickname(), "End of raw corpus reached.", DETAIL_MSG);
			return false;
		}
		if (categorize(current) != Separator)
			break;
	}

	// Phase 2: accumulate characters up to the next separator or the end of the stream.
	rawBuffer.push_back(current);
	while (rawStream->get(current) && (categorize(current) != Separator))
		rawBuffer.push_back(current);

	msg.message(nickname(),
	            string("Reading from raw corpus:") + rawBuffer,
	            DEBUG_MSG);
	return true;
}


bool
CheckTaggedCorpus::resynchronize(const string& searched,
                                 conform_string::const_iterator& pos,
                                 VeryLongNatural& skip,
                                 const VeryLongNatural& maxSkip)
{
	conform_string::size_type confirmedLocation;
	conform_string::size_type possibleLocation(pos-rawBuffer.begin());
	Natural initialDelta(possibleLocation);
	skip = 0;
	for (;;) 
	{
		while ((possibleLocation = rawBuffer.find(searched[0], possibleLocation)) == rawBuffer.npos) 
		{
			skip += rawBuffer.size();   
			if ( (! readNextRaw()) || ((maxSkip) && (skip > maxSkip)) ) 
			{
				pos = rawBuffer.begin();
				msg.message(nickname(), string("Synchronization with '") + searched + ("' failed !"), DETAIL_MSG);
				return false;
			} else possibleLocation = 0;
		}
		int d = searched.size() - (rawBuffer.size() - possibleLocation);
		while (d > 0) 
		{
			char chr;
			rawStream->get(chr);
			if (categorize(chr) != Separator) 
			{
				rawBuffer.push_back(chr);
				--d;
			}
		}
		if ((confirmedLocation = rawBuffer.find(searched, possibleLocation)) != rawBuffer.npos) 
		{
			skip += confirmedLocation;
			skip -= initialDelta;
			pos = rawBuffer.begin() + confirmedLocation;
			msg.message(nickname(), string("Synchronization with '") + searched + ("' succeeded !"), DETAIL_MSG);
			return true;
		}
		rawBuffer.erase(0, possibleLocation+1);
		possibleLocation = 0;
	}
}

// Close namespace Else. The guard must test the same macro as the one that
// opens the namespace, otherwise the closing brace is never emitted.
#ifdef _USE_NAMESPACES
}
#endif // _USE_NAMESPACES

