#include "frequency_distribution.h"

using namespace std;

namespace papi
{

    /**
     * Creates a frequency-distribution module.
     *
     * The word bookkeeping starts out empty (no distinct words counted, word
     * search armed) and the alphabet size is 0 until init() is called.
     *
     * @param name  identifier stored in module_id and later returned by getId().
     */
    FrequencyDistribution::FrequencyDistribution( char const* name):
        count_word_sum(0), 
        last_word_begin(0),size_alphabet(0), search_next_word(true)
    {
        module_id = name;

        // The count tables are only allocated by init(), and only for the
        // distributions that are enabled. The destructor tests these pointers
        // before freeing them, so they must start out null; otherwise it would
        // read (and possibly delete[]) an indeterminate pointer value.
        count_character = 0;
        count_bigram = 0;
    }

    /**
     * Frees the character count array and the bigram count table.
     *
     * The bigram table is released row by row (size_alphabet rows) before the
     * array of row pointers itself is released.
     */
    FrequencyDistribution::~FrequencyDistribution()
    {
        // delete[] on a null pointer does nothing, so no explicit test is needed.
        delete [] count_character;

        if(!count_bigram)
            return;

        int row = 0;
        while(row < size_alphabet)
        {
            delete [] count_bigram[row];
            ++row;
        }
        delete [] count_bigram;
    }

    /**
     * Prepares the module for a run.
     *
     * Stores the index map, takes the alphabet size from it, reads the three
     * disable_* switches from the settings, and for every enabled distribution
     * sets up its working data:
     *  - words:      flags each alphabet index whose character matches the
     *                "word_delimiter" regular expression (default "\\W");
     *  - characters: allocates a zeroed count array of size_alphabet entries;
     *  - bigrams:    allocates a zeroed size_alphabet x size_alphabet table.
     *
     * @param settings     provides the options "disable_word_distribution",
     *                     "disable_bigram_distribution",
     *                     "disable_character_distribution" (all default false)
     *                     and "word_delimiter".
     * @param file_length  not used by this function.
     * @param ind_map      index map; ind_map->size becomes the alphabet size and
     *                     ind_map->getValue(i) yields the character of index i.
     */
    void FrequencyDistribution::init(AnalyzeSetting & settings,long long file_length, const IndexMap<char> *ind_map)
    {
        this->ind_map = ind_map;
        size_alphabet = ind_map->size;

        disable_word_distribution = settings.getBool("disable_word_distribution",false);
        disable_bigram_distribution = settings.getBool("disable_bigram_distribution",false);
        disable_character_distribution = settings.getBool("disable_character_distribution",false);

        if(!disable_word_distribution)
        {
            // Start with "nothing is a delimiter", then flag the matching indices.
            memset(word_delimiters,false,sizeof(word_delimiters));

            string delimiter_pattern = settings.getString("word_delimiter","\\W");
            boost::regex delimiter_regex(delimiter_pattern);

            for(int index = 0; index < size_alphabet; ++index)
            {
                // Match the pattern against the one-character range [symbol, symbol+1).
                char symbol = ind_map->getValue(index);
                char *symbol_begin = &symbol;
                char *symbol_end = symbol_begin + 1;
                if(boost::regex_match(symbol_begin, symbol_end, delimiter_regex))
                {
                    word_delimiters[index] = true;
                }
            }
        }

        if(!disable_character_distribution)
        {
            // The trailing () value-initializes, i.e. zeroes, every counter.
            count_character = new long long[size_alphabet]();
        }

        if(!disable_bigram_distribution)
        {
            count_bigram = new long long*[size_alphabet];
            for(int row = 0; row < size_alphabet; ++row)
            {
                count_bigram[row] = new long long[size_alphabet]();
            }
        }
    }

    /**
     * Scans one buffer, updates the enabled frequency counts and writes the
     * resulting distributions through the printCsv*Distribution helpers.
     *
     * Every buffer element is used directly as an index into count_character,
     * count_bigram and word_delimiters.
     * NOTE(review): this presumably means the buffer already holds alphabet
     * indices smaller than size_alphabet - confirm against the caller.
     *
     * A word is a maximal run of non-delimiter elements. Words are recorded in
     * count_word under the key (pointer to the word's first element inside
     * buffer, length), so those keys refer to memory owned by the caller.
     *
     * @param bufSize      number of elements in buffer.
     * @param buffer       data to analyse.
     * @param directory    forwarded to the printCsv* helpers.
     * @param write_stdout forwarded to the printCsv* helpers.
     * @param file_id      forwarded to the printCsv* helpers.
     * @param file_path    forwarded to the printCsv* helpers.
     * @param file_length  forwarded to the printCsv* helpers.
     */
    void FrequencyDistribution::process(long long bufSize,unsigned char const* buffer,std::string directory,bool write_stdout, long file_id,std::string file_path,long long file_length)
    {
        cerr << "Frequency Distribution started"<<endl;

        // Number of bigrams whose counter was still zero when first seen here.
        long long num_bigrams = 0;
        
        for(long long i=0;i<bufSize;++i)
        {
            const unsigned char current = buffer[i];

            if(!disable_character_distribution)
                ++count_character[current];

            // A bigram needs a predecessor, so the first element is skipped.
            if(!disable_bigram_distribution && i>0)
            {
                long long &bigram_count = count_bigram[buffer[i-1]][current];
                if(bigram_count == 0) {
                    ++num_bigrams;
                }
                ++bigram_count;
            }

            if(!disable_word_distribution)
            {
                const bool isDelim = word_delimiters[current];
                if(search_next_word && !isDelim)
                {
                    // First non-delimiter after a delimiter run: a word starts here.
                    last_word_begin=i;
                    search_next_word = false;
                }
                else if((!search_next_word) && isDelim) 
                {
                    // Delimiter after a word: the word spans [last_word_begin, i).
                    long long &count = count_word[pair<unsigned char const *,long long>(buffer+last_word_begin,i-last_word_begin)];
                    if(count == 0) {
                        // count_word_sum tracks the number of distinct words.
                        ++count_word_sum;                        
                    }
                    ++count;
                    search_next_word = true;
                }
            }
        }

        // A word that runs up to the end of the buffer has no closing delimiter.
        if(!disable_word_distribution && !search_next_word)
        {
            long long &count = count_word[pair<unsigned char const *,long long>(buffer+last_word_begin,bufSize-last_word_begin)];
            if(count == 0) {
                ++count_word_sum;
            }
            ++count;

            // The trailing word has been accounted for, so re-arm the word
            // search. Without this a later call would combine the stale
            // last_word_begin offset with a different buffer.
            search_next_word = true;
        }
        
        if(!disable_character_distribution)
        {
            long double *distribution = new long double[size_alphabet];
            for(int i=0; i<size_alphabet; ++i) {
                // An empty buffer would otherwise give 0/0 (NaN); report 0 instead.
                distribution[i] = (bufSize > 0)
                                ? count_character[i]/(long double)bufSize
                                : 0.0L;
            }
            printCsvCharacterDistribution(directory, write_stdout, getId(), 
                                         file_id, file_path, file_length,
                                         ind_map, distribution);
            delete [] distribution;
        }
        
        if(!disable_bigram_distribution)
        {
            // bufSize-1 is the number of adjacent pairs in the buffer.
            printCsvBigramDistribution(directory, write_stdout, getId(),
                                       file_id, file_path, file_length, 
                                       ind_map, count_bigram, num_bigrams, bufSize-1);
        }
        
        if(!disable_word_distribution)
        {
            printCsvWordDistribution(directory, write_stdout, getId(), 
                                     file_id, file_path, file_length,
                                     ind_map, word_delimiters, count_word, count_word_sum);
        }
        cerr << "Frequency Distribution finished"<<endl;

    }

    /**
     * Returns the module identifier held in module_id, which the constructor
     * sets from its name argument.
     */
    const string FrequencyDistribution::getId()
    {
        return this->module_id;
    }

}

