#include "entropy.h"

using namespace std;

namespace papi
{

    /**
     * Creates an entropy module that is not yet configured.
     *
     * All owned buffers start out as NULL and all counters/flags start out
     * as zero/false; init() has to be called before process().
     *
     * @param name identifier stored in module_id and reported by getId()
     */
    Entropy::Entropy( char const* name):
        qgram_length_actual(0), size_alphabet(0),disable_entropy(false),count_char(NULL),
        count_word(NULL), prob(NULL), block_entropy(NULL), cond_entropy(NULL),
        tail(NULL)

    {
        module_id = name;

        // init() assigns entropy_entries and qgram_length_setting only when at
        // least one submodule is enabled, and none of the members below are
        // set before init() runs. Give them defined values so that no later
        // read ever sees an indeterminate value.
        entropy_entries = 0;
        qgram_length_setting = 0;
        disable_qgram_distribution = false;
        disable_character_distribution = false;
        ind_map = NULL;
    }

    /**
     * Releases every buffer owned by the module.
     *
     * Deleting a null pointer is a no-op, so no explicit null checks are
     * needed for members that were never allocated.
     */
    Entropy::~Entropy()
    {
        // Arrays allocated with new[] ...
        delete [] count_char;
        delete [] block_entropy;
        delete [] cond_entropy;
        delete [] prob;
        // tail is allocated with new unsigned char[...] in init(), so it has
        // to be released with the array form as well.
        delete [] tail;

        // ... and the single hash map object allocated with scalar new.
        delete count_word;
    }

    /**
     * Reads the module configuration and allocates the counting buffers.
     *
     * Settings read: "disable_entropy" (falling back to the legacy key
     * "disable_entropytropy"), "disable_qgram_distribution",
     * "disable_character_distribution", "entropy_entries" and "qgram_length".
     *
     * When all three submodules are disabled nothing is allocated and
     * count_char, count_word and tail stay NULL.
     *
     * @param settings    configuration source queried through getBool()/getInt()
     * @param file_length not used by this function
     * @param ind_map     index map; its size becomes size_alphabet and the
     *                    pointer is stored for later use (must not be NULL)
     */
    void Entropy::init(AnalyzeSetting & settings, long long file_length, const IndexMap<char> *ind_map)
    {
        this->size_alphabet = ind_map->size;
        this->ind_map = ind_map;

        // Drop buffers left over from an earlier init() so that calling
        // init() again neither leaks them nor leaves stale pointers behind.
        delete [] count_char;
        count_char = NULL;
        delete count_word;
        count_word = NULL;
        delete [] tail;
        tail = NULL;

        // "disable_entropytropy" looks like a search-and-replace artefact.
        // Read the intended key and fall back to the legacy spelling so that
        // existing configurations keep working.
        disable_entropy = settings.getBool("disable_entropy",
                                           settings.getBool("disable_entropytropy",false));
        disable_qgram_distribution = settings.getBool("disable_qgram_distribution",false);
        disable_character_distribution = settings.getBool("disable_character_distribution",false);


        if(disable_entropy && disable_qgram_distribution &&disable_character_distribution)
        {
            cerr<<"Entropy Module: All submodules disabled"<<endl;

            // Keep the length members well defined even though nothing is allocated.
            entropy_entries = 0;
            qgram_length_setting = 0;
            qgram_length_actual = 0;
        }
        else
        {
            entropy_entries = settings.getInt("entropy_entries",1);
            qgram_length_setting = settings.getInt("qgram_length",1);

            // A disabled submodule contributes no q-gram length requirement.
            if(disable_entropy)
                entropy_entries = 0;
            if(disable_qgram_distribution)
                qgram_length_setting = 0;

            // The longest q-gram that has to be counted for any enabled submodule.
            qgram_length_actual = max(entropy_entries,qgram_length_setting);

            // Value-initialisation zeroes every counter.
            count_char =  new int[size_alphabet]();

            count_word = new unsigned_cstring_fixed_length_long_long_hash_map(0,hash_djb2::fixed_length(qgram_length_actual),eq::fixed_length(qgram_length_actual));

            // The sum below is negative when the q-gram distribution is
            // disabled and at most one entropy entry is requested; a negative
            // array size would make the allocation fail, so clamp it to zero.
            long long tail_length = (long long)qgram_length_setting+qgram_length_actual-2;
            if(tail_length < 0)
                tail_length = 0;
            tail = new unsigned char[tail_length];
        }
    }




    void Entropy::process(long long bufSize,unsigned char const* buffer,string dir, bool write_stdout, long file_id,string file_path,long long file_length)
    {
        cerr<<"Entropy module started"<<endl;
        for(int i=0;i<qgram_length_setting-1;++i)
            tail[i] = buffer[bufSize-qgram_length_setting+1];
        for(int i=0;i<qgram_length_actual-1;++i)
            tail[qgram_length_setting-1+i] = buffer[i];
        
        for(long long i=0; i<bufSize;++i)
        {
            int c_index = buffer[i];

            
            ++count_char[c_index];
            
            if(i + qgram_length_actual <=bufSize)
            {
                ++(*count_word)[buffer+i];
            }
        }
            
        prob = new long double[size_alphabet];
        for(int i=0;i<size_alphabet;++i){
            prob[i] = (long double)count_char[i]/bufSize;
        }

        
        ofstream oFile_res;
        CsvOutStream csv_qp(POINT);
        
        if(!disable_qgram_distribution)
        {
            printCsvQgramDistributionHeader(csv_qp, oFile_res, dir, write_stdout,
                                            getId(), file_id, file_path, file_length,
                                            qgram_length_setting, size_alphabet);
        }    

        
        
        unsigned_cstring_fixed_length_long_long_hash_map *count_word_new;
        long double *log_n = new long double[qgram_length_actual];

        cond_entropy = new long double[qgram_length_actual];
        block_entropy = new long double[qgram_length_actual];
        
        memset(cond_entropy,0,sizeof(long double)*qgram_length_actual);
        memset(block_entropy, 0, sizeof(long double)*qgram_length_actual);
        
        //Precalculation of logarithmic values for the correction term
        for(int i=0;i<qgram_length_actual;++i)
            log_n[i] = log((long double)bufSize-i);
        
        /*
         * Idea: 
         * While processing qgrams of length k, count qgrams of length k-1
         */
        for(int order=qgram_length_actual-1;order>0;--order)
        {
            //qgrams of length k-1
            count_word_new = new unsigned_cstring_fixed_length_long_long_hash_map(0,hash_djb2::fixed_length(order),eq::fixed_length(order));
            
            for(unsigned_cstring_fixed_length_long_long_hash_map::iterator it=count_word->begin();it!=count_word->end();++it)
            {
                long long count = it->second;
                unsigned char const* cstring = it->first;

                //count qgrams of length k-1
                (*count_word_new)[cstring]+=count;
                
                /*
                 * block entropy
                 */
                if(order<entropy_entries)
                {
                    block_entropy[order]+=(long double)count*(log_n[order]-log((long double)count));
                }
                
                
            }
            
            
            if(order == qgram_length_setting-1)
            {
                for(int i=0;i<qgram_length_setting-1;++i)
                    ++(*count_word)[tail+i];

                for(unsigned_cstring_fixed_length_long_long_hash_map::iterator it=count_word->begin();it!=count_word->end();++it)
                {
                    /*
                     * Block entropy
                     */
                    long long count = it->second;
                    unsigned char const* cstring = it->first;

                    if(!disable_qgram_distribution)
                    {
                        string s;
                        for(int k=0;k<qgram_length_setting;++k)
                            s.push_back(ind_map->getValue(cstring[k]));
                        csv_qp.addCell(s);
                        csv_qp.addCell( (long double)count/(bufSize) );
                        csv_qp.newline();
                    }
                }
            }
            
            /*
             * Conditional Entropie
             */
            if(order<entropy_entries)
            {
                cond_entropy[order]=block_entropy[order];
                
                for(unsigned_cstring_fixed_length_long_long_hash_map::iterator it=count_word_new->begin();it!=count_word_new->end();++it)
                {
                    long long count = it->second;
                    cond_entropy[order]-=(long double)count*(log_n[order]- log((long double)count));
                }
            }
            
            delete count_word;
            count_word = count_word_new;
            ++(*count_word)[buffer+bufSize-order];
        }
        
        if(count_word)
        {
            delete count_word;
            count_word = 0;
        }
        
        /*
         * Finish calculations
         */
        if(entropy_entries>0)
        {
            
            for(int i=0;i<size_alphabet;++i)
            {
                if(count_char[i]>0)
                {
                    block_entropy[0]+=(long double)count_char[i]*(log_n[0]-log((long double)count_char[i]));
                }    
            }
            cond_entropy[0] = block_entropy[0];
            for(int i=0;i<qgram_length_actual;++i)
            {
                block_entropy[i]/=log((long double)2)*(bufSize-i);
                cond_entropy[i]/=log((long double)2)*(bufSize-i);
            }
        }
        
        if(!disable_qgram_distribution) {
            csv_qp.newline();
        }
        
        
        
        
        if(oFile_res.is_open())
            oFile_res.close();
        
        if(log_n)
            delete [] log_n;
        
       
        if(!disable_entropy)
        {
            printCsvEntropy(dir, write_stdout, getId(), file_id, file_path,
                            file_length, block_entropy, cond_entropy, entropy_entries);
        }
        
        if(!disable_character_distribution)
        {
            printCsvCharacterDistribution(dir, write_stdout, getId(), 
                                         file_id, file_path, file_length, 
                                         ind_map, prob);

        }

        cerr<<"Entropy module finished"<<endl;

    }

    const string Entropy::getId()
    {
        return module_id;
    }

}

