/*******************************************************************************
 * This program was created in the context of the PAPI-project:                *
 * "Practical Approximate Pattern Matching with Index Structures"              *
 * Project website: http://www14.in.tum.de/spp1307/                            *
 * Author: Andre Dau <dau#in.tum.de>                                           *
 ******************************************************************************/

/**
 *******************************************************************************
 * @file tt-generate.cpp
 *
 * <tt>tt-generate</tt> generates random texts and allows the user to choose 
 * between different (probability) models (such as discrete autoregressive process,
 * approximate repeats model by Allison et al (1998), markov chain, uniform 
 * distribution, fibonacci word). <br>
 * The more complex models need parameters which can be estimated from texts
 * using <tt>tt-analyze</tt>. 
 *
 * The <tt>tt-generate</tt> and <tt>tt-analyze</tt> tools are intended to be
 * used together. More details on the tools can be found in <i> Andre Dau (2010):
 * Analysis of the structure and statistical properties of texts and generation 
 * of random texts</i>.
 *
 * The required format of the parameter files is best described by the examples 
 * in the sample output folder which come bundled with the source code. 
 *
 * All input files have to be in csv format and must have a header. The last line
 * of the header must contain the field <tt>content_type</tt> which specifies the 
 * type of the file. Different probability models need different parameter file 
 * types as input. 
 *
 * Input files can also be read directly from <tt>stdin</tt> making it possible 
 * to pipe the output of <tt>tt-analyze</tt> to <tt>tt-generate</tt>. To pass 
 * multiple files via <tt>stdin</tt> the files have to be concatenated in an arbitrary order.
 * If a parameter file is passed both via <tt>stdin</tt> and the command line
 * the file specified in the command line is always preferred. 
 * 
 *******************************************************************************
 * @b Usage:
 *   @code
 *     tt-generate <file_length> <model> [arguments]
 *   @endcode
 *
 *******************************************************************************
 * @param file_length       The length of the file to be generated.
 * @param model             A probability model (and its parameters) from
 *                          the list below.
 * @param arguments         Zero or more arguments from the list below.
 *******************************************************************************
 * @b Arguments:
 *  @code
 *      -o <file>                    -  output_file (default: stdout)
 *      -p <parameter_type> <file>   -  specify parameter file (default: read from stdin)
 *      --stdout                     -  print to stdout (default: only print to stdout if -o <file> 
 *                                      is not specified)
 *  @endcode
 *
 *******************************************************************************
 * @b Models:
 *  @code
 *      markov                                  --  Markov chain
 *      dar                                     --  Discrete Autoregressive Process dar(p)
 *                                                  (see: Jacobs, P. A. & Lewis, P. A. W.: 
 *                                                        Stationary Discrete Autoregressive-Moving 
 *                                                        Average Time Series Generated By Mixtures
 *                                                        In: Journal of Time Series Analysis 4 (1983), 
 *                                                        Nr. 1, pp. 19-36
 *                                                        http://dx.doi.org/10.1111/j.1467-9892.1983.tb00354.x)
 *                                                  (see: Dehnert, M. & Helm, W. E. & Huett, M.-Th.: 
 *                                                        A Discrete Autoregressive Process as a model 
 *                                                        for short-range correlations in DNA sequences
 *                                                        In: Physica A 327 (2003), pp. 535-553
 *                                                        http://dx.doi.org/10.1016/S0378-4371(03)00399-6)
 *                                                  (see: Huett, M.-Th. & Dehnert, M. : 
 *                                                        Methoden der Bioinformatik: Eine Einfuehrung
 *                                                        Springer, 2006)
 *      repeats                                 --  Repeat machine 
 *                                                  (see: Allison, L. & Edgoose, T. & Dix, T. I.: 
 *                                                        Compression of Strings with Approximate Repeats
 *                                                        In: Intelligent Systems in Mol. Biol. (1998), 
 *                                                        pp. 8-16)
 *      uniform <alphabet>                      --  Uniform distribution of characters
 *                                                  <alphabet> is either a string containing all symbols 
 *                                                  of the alphabet or one of the following presets:
 *                                                      --dna           -> ACGT
 *                                                      --dna5          -> ACGTN
 *                                                      --rna           -> ACGU
 *                                                      --rna5          -> ACGUN
 *                                                      --amino         -> ARNDCEQGHILKMFPSTWYV
 *                                                      --amino23       -> ARNDCEQGHILKMFPSTWYVBZX
 *      fibonacci [alphabet] [--use-tmp-file]   --  Fibonacci word
 *                                                  All arguments are optional.
 *                                                  Alphabet is a string consisting of two characters. 
 *                                                  Default alphabet: "ab" 
 *                                                  If --use-tmp-file is specified the fibonacci word 
 *                                                  is created using a (temporary) file as a buffer.
 *                                                  This is usually much slower but can save memory space.
 *  @endcode
 *
 *******************************************************************************
 * <b> Required parameter file types: </b>
 *  @code
 *      markov          --  character_distribution  (required)
 *                          qgram_distribution      (optional; for higher order markov chains)
 *      dar             --  character_distribution  (required)
 *                          autocorrelation_dar     (required)
 *      repeats         --  character_distribution  (required)
 *                          qgram_distribution      (optional; for higher order markov chains)
 *                          direct_repeat           (optional; for direct repeats)
 *                          mirror_repeat           (optional; for mirror repeats)
 *                          inverted_repeat         (optional; for inverted repeats)
 *      fibonacci       --  none
 *      uniform         --  none
 *  @endcode
 *******************************************************************************
 * @b Examples:
 * - The first 12 letters of the infinite fibonacci word using 0 and 1 as the alphabet:
 *   @code
 *     $ tt-generate 12 fibonacci "01"
 *     010010100100
 *   @endcode
 *
 * - Uniform distribution over the dna alphabet 'ACGT':
 *   @code
 *     $ tt-generate 50 uniform --dna
 *     TTGATCTATGTCAGAATGCCTAAGAGTGTTGTGATCTGATGAACGCTCGT
 *   @endcode
 *
 * - Uniform distribution over the alphabet '<>+-.,[]':
 *   @code
 *     $ tt-generate 90 uniform "<>+-.,[]"
 *     >>[[]<-<[<+->>..,,>.+++>.[<+<,,.-->,][>+<+,,]][][<--.,<>+>]<[+-.->+--.,<[[,]].,>>+<-<.>+->
 *   @endcode
 *
 * - Markov chain of order 5. Pipe estimated parameters directly from tt-analyze to tt-generate:
 *   @code
 *     $ tt-analyze markov 5 -i input.fasta | tt-generate 100 markov > output.fasta
 *   @endcode
 * 
 * - Complex example. Repeats model, print to file and stdout, parameter files given explicitly on the command line:
 *   @code
 *     $ tt-generate 30000000 repeats \
 *              -o sample_output.txt --stdout \
 *              -p qgram_distribution ApproximateRepeats_qgram_distribution.csv \
 *              -p character_distribution ApproximateRepeats_character_distribution.csv \
 *              -p direct_repeat ApproximateRepeats_direct_repeat.csv
 *   @endcode
 * 
 **************************************************************************************************
 * @return 0 on success, something else on error
 *
 **************************************************************************************************
 * @b Download:
 * - The newest version of this tool can be downloaded from http://www14.in.tum.de/spp1307/downloads.html
 *
 **************************************************************************************************
 */

#include <cerrno>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <map>
#include <string>
#include <vector>

#include "dar/dar.h"
#include "generator.h"
#include "linear_equation.h"
#include "markov/markov.h"
#include "repeats/approximate_repeats.h"
#include "fibonacci/fibonacci.h"

using namespace std;
using namespace papi;


/**
 * Prints the usage message to stderr.
 *
 * The text is emitted section by section (introduction, usage, arguments,
 * models, examples, note). Every section is a single string literal so that
 * the column alignment of the help text can be checked at a glance in the
 * source.
 */
void printUsage()
{
    // Introduction and synopsis.
    cerr << "\n"
            "INTRODUCTION\n"
            "   The generator is intended to be used with tt-analyze which provides\n"
            "   the necessary parameter files. The header 'content-type' of the files\n"
            "   generated by tt-analyze identifies the parameter type for tt-generate.\n"
            "\n"
            "USAGE\n"
            "   tt-generate <file_length> <model> [arguments] \n"
            "\n";

    // Optional command line arguments.
    cerr << "ARGUMENTS (all optional): \n"
            "   -o <file>                   -   output_file (default: stdout)\n"
            "   -p <parameter_type> <file>  -   specify parameter file\n"
            "                                   (default: read from stdin)\n"
            "   --stdout                    -   print to stdout\n"
            "                                   (default: only print to stdout if\n"
            "                                   -o [file] not specified)\n"
            "\n";

    // Supported models and the parameter file types each of them reads.
    cerr << "MODELS:\n"
            "   uniform <alphabet>\n"
            "       Uniform distribution\n"
            "           <alphabet> is either a string containing all symbols or one of:\n"
            "               --dna           -> ACGT\n"
            "               --dna5          -> ACGTN\n"
            "               --rna           -> ACGU\n"
            "               --rna5          -> ACGUN\n"
            "               --amino         -> ARNDCEQGHILKMFPSTWYV\n"
            "               --amino23       -> ARNDCEQGHILKMFPSTWYVBZX\n"
            "   dar\n"
            "       Discrete Autoregressive Process of order p\n"
            "           Parameter file types:\n"
            "               character_distribution  (required)\n"
            "               autocorrelation_dar     (required)\n"
            "   markov\n"
            "       Markov chain\n"
            "           Parameter file types:\n"
            "               character_distribution  (required)\n"
            "               qgram_distribution      (optional for higher order chains)\n"
            "   repeats\n"
            "       Approximate repeats model by Allison et al.\n"
            "           Parameter file types:\n"
            "               character_distribution  (required)\n"
            "               qgram_distribution      (optional for higher order markov)\n"
            "               direct_repeat           (optional)\n"
            "               inverted_repeat         (optional)\n"
            // Indented like its siblings; it used to sit four columns too far left.
            "               mirror_repeat           (optional)\n"
            "   fibonacci [alphabet] [--use-tmp-file]\n"
            "       Fibonacci word\n"
            "           All arguments are optional.\n"
            "           Alphabet is a string consisting of two characters. Default: \"ab\"\n"
            "           If --use-tmp-file is specified the fibonacci word is created\n"
            "           using a (temporary) file as a buffer. This is usually much slower\n"
            "           than using a buffer in the main memory but can save memory space.\n"
            "\n";

    // Examples; each one is followed by a blank separator line.
    cerr << "EXAMPLE 1:\n"
            "   tt-generate 12 fibonacci \"01\"\n"
            "\n"
            "EXAMPLE 2 & 3:\n"
            "   tt-generate 1000 uniform --amino\n"
            "   tt-generate 1000 uniform \"abcdefg\"\n"
            "\n"
            "EXAMPLE 4:\n"
            "   tt-analyze markov 5 -i input.fasta | tt-generate 100 markov > output.fasta\n"
            "\n"
            "EXAMPLE 5:\n"
            "   tt-generate 30000000 repeats \\\n"
            "       -o sample_output.txt    --stdout \\\n"
            "       -p qgram_distribution ApproximateRepeats_qgram_distribution.csv \\\n"
            "       -p character_distribution ApproximateRepeats_character_distribution.csv \\\n"
            "       -p direct_repeat ApproximateRepeats_direct_repeat.csv\n"
            "\n";

    // Closing note on piping parameter files through stdin.
    cerr << "NOTE:  It is possible to pass one or more concatenated parameter files to stdin.\n"
            "       This allows direct piping of the results from tt-analyze to tt-generate.\n"
            "       If a parameter file is explicitly given in the command line it is always\n"
            "       preferred to stdin.\n"
         << endl;
}

/**
 * Reports an unsupported command line parameter on stderr and terminates
 * the program with EXIT_FAILURE. This function does not return.
 *
 * @param parameter  The offending parameter, echoed verbatim in the message.
 */
void invalidParameter(string parameter) {
    const string message = "Invalid parameter: " + parameter;
    cerr << message << endl;
    exit(EXIT_FAILURE);
}


/**
 * Prints an error message to stderr, optionally followed by the usage
 * message, and terminates the program with EXIT_FAILURE.
 *
 * @param message     Text of the error, printed on its own line.
 * @param show_usage  If true, printUsage() is called before exiting.
 */
static void exitWithError(const string &message, bool show_usage = false)
{
    cerr << message << endl;
    if(show_usage) {
        printUsage();
    }
    exit(EXIT_FAILURE);
}

/**
 * Parses the <file_length> command line argument.
 *
 * The whole argument has to be a strictly positive integer that fits into a
 * long long. As before, base 0 is passed to strtoll so decimal, octal ("0...")
 * and hexadecimal ("0x...") notation are accepted.
 *
 * @param text    The argument as given on the command line.
 * @param length  Receives the parsed value; left untouched on failure.
 * @return true if text is a valid file length, false otherwise.
 */
static bool parseFileLength(const char *text, long long &length)
{
    char *end = NULL;
    errno = 0;
    const long long value = strtoll(text, &end, 0);
    // end == text: no digits at all; *end != '\0': trailing garbage.
    if(end == text || *end != '\0' || errno == ERANGE || value <= 0) {
        return false;
    }
    length = value;
    return true;
}

/**
 * Translates an alphabet preset switch of the uniform model into its symbols.
 *
 * @param spec  Either a preset switch (e.g. "--dna") or a literal alphabet.
 * @return The symbols of the preset, or spec itself if it is not a preset.
 */
static string alphabetFromPreset(const string &spec)
{
    if(spec == "--dna")     return "ACGT";
    if(spec == "--dna5")    return "ACGTN";
    if(spec == "--rna")     return "ACGU";
    if(spec == "--rna5")    return "ACGUN";
    if(spec == "--amino")   return "ARNDCEQGHILKMFPSTWYV";
    if(spec == "--amino23") return "ARNDCEQGHILKMFPSTWYVBZX";
    return spec;
}

/**
 * Main method.
 * Reads parameters, starts model initialization and text generation.
 *
 * The first argument is always the file length and the second one the model.
 * All further arguments are either options (-o, -p, --stdout) or model
 * specific positional parameters. Malformed command lines (missing option
 * values, invalid file length, unknown model, wrong model parameters) are
 * reported on stderr and end the program with EXIT_FAILURE.
 *
 * @param argc  Number of command line arguments.
 * @param argv  The command line arguments.
 * @return 0 on success; on error the process exits with EXIT_FAILURE.
 */
int main (int argc, char * const argv[]) {

    Generator* gen = NULL;

    string model;
    const char *output_file = NULL;
    long long file_length = 0;
    bool file_length_valid = false;
    bool write_stdout = false;
    map<string,string> formal_parameters;
    vector<string> positional_parameters;

    for(int i = 1; i < argc; ++i)
    {
        const string argument(argv[i]);
        if(i == 1) {
            file_length_valid = parseFileLength(argv[i], file_length);
        } else if(i == 2) {
            model = argument;
        } else if(argument == "-o") {
            // The file name must follow; otherwise argv[argc] (NULL) would be used.
            if(i + 1 >= argc) {
                exitWithError("Missing file name after -o", true);
            }
            output_file = argv[++i];
        } else if(argument == "-p") {
            // Both <parameter_type> and <file> must follow, otherwise the
            // accesses below would run past the end of argv.
            if(i + 2 >= argc) {
                exitWithError("Option -p requires <parameter_type> and <file>", true);
            }
            formal_parameters[argv[i+1]] = argv[i+2];
            i += 2;
        } else if(argument == "--stdout") {
            write_stdout = true;
        } else {
            positional_parameters.push_back(argument);
        }
    }

    if(model.empty()) {
        exitWithError("No model specified", true);
    }
    if(!file_length_valid) {
        exitWithError("Invalid file length specified", true);
    }
    // Without an output file stdout is the only place the text can go.
    if(output_file == NULL) {
        write_stdout = true;
    }

    if(model == "fibonacci") {
        // Accepted forms: (none) | --use-tmp-file | <ab> | <ab> --use-tmp-file
        const size_t count = positional_parameters.size();
        if(count == 0) {
            gen = new Fibonacci();
        } else if(count == 1) {
            const string &only = positional_parameters[0];
            if(only == "--use-tmp-file") {
                gen = new Fibonacci(true);
            } else if(only.size() == 2) {
                gen = new Fibonacci(only[0], only[1]);
            } else {
                exitWithError("Invalid argument: " + only);
            }
        } else if(count == 2) {
            const string &alphabet = positional_parameters[0];
            const string &flag = positional_parameters[1];
            if(alphabet.size() != 2) {
                exitWithError("Invalid argument: " + alphabet);
            }
            if(flag != "--use-tmp-file") {
                exitWithError("Invalid argument: " + flag);
            }
            gen = new Fibonacci(alphabet[0], alphabet[1], true);
        } else {
            exitWithError("Wrong number of parameters for fibonacci.");
        }
    }
    else if(model == "uniform") {
        if(positional_parameters.size() < 1) {
            exitWithError("Alphabet parameter missing");
        }
        if(positional_parameters.size() > 1) {
            exitWithError("Too many parameters");
        }
        const string alphabet = alphabetFromPreset(positional_parameters[0]);
        if(alphabet.empty()) {
            exitWithError("Alphabet must not be empty");
        }
        // Allocate only after the arguments have been validated.
        Markov *markov = new Markov();
        markov->initUniform(alphabet);
        gen = markov;
    }
    else if(model == "dar" || model == "markov" || model == "repeats") {
        // These models take all their input from parameter files.
        if(!positional_parameters.empty()) {
            invalidParameter(positional_parameters[0]);
        }
        if(model == "dar") {
            gen = new Dar();
        } else if(model == "markov") {
            gen = new Markov();
        } else {
            gen = new ApproximateRepeats();
        }
    }
    else {
        exitWithError("Unknown model", true);
    }

    cerr << "Initialize generator" << endl;
    gen->initOutput(output_file, write_stdout);
    cerr << "Start generator" << endl;
    gen->generate(file_length, formal_parameters);
    cerr << endl <<"Finished!" << endl;

    delete gen;

    return 0;
}


