/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                         Copyright (c) 1996                            */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission to use, copy, modify, distribute this software and its    */
/*  documentation for research, educational and individual use only, is  */
/*  hereby granted without fee, subject to the following conditions:     */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*  This software may not be used for commercial purposes without        */
/*  specific prior written permission from the authors.                  */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                     Author :  Simon King & Alan W Black               */
/*                     Date   :  February 1997                           */
/*-----------------------------------------------------------------------*/
/*                                                                       */
/* A general class for backoff ngrams (bi-gram, tri-gram etc.)           */
/*                                                                       */
/*=======================================================================*/
#ifndef __EST_BACKOFFNGRAMMAR_H__
#define __EST_BACKOFFNGRAMMAR_H__

#include <stdarg.h>
#include <stdlib.h>

#include "EST_rw_status.h"
#include "EST_PST.h"
#include "EST_StringTrie.h"
#include "EST_types.h"
#include "EST_FMatrix.h"
#include "EST_TList.h"
#include "EST_Ngrammar.h"


#if 0

class EST_BackoffNgrammar {

private:


protected:

    // an order N backoff ngram has a set of ngrams from order N down to 1
    int order;
    EST_Ngrammar *ngrammar;

    // and backoff weights for grammars 1 to N-1
    // stored as Ngrammars, which they aren't really

    // bw[1] is of order 1 and contains weights used to
    // back bigrams off to unigrams

    // e.g. p(a,b) = bw[1](a) * p(b);
    // e.g. p(a,b,c) = bw[2](a,b) * p(b,c);
    // e.g. p(a,b,d) = bw[2](a,b) * p(b,d);
    // e.g. p(a,b,c,d) = bw[3](a,b,c) * p(b,c,d);

    // bw is set so that sum{ p(a,b,x) } over x  = 1

    // there is no bw[0] or bw[order]
    EST_Ngrammar **bw; // actually pointers to Ngrammars ....

    // vocabulary (just a list of words, stored in a handy way)
    EST_Discrete *vocab;
    bool init_vocab(const EST_StrList &wordlist);

    // are we allowing out-of-voculary words, or is the vocabulary closed ?
    bool allow_oov;

    double min_frequency; // for backing off
    const bool backoff(double threshold_freq);

    inline void iterate(EST_Ngrammar *n,EST_StrVector &ngram,
			void (*function)(EST_Ngrammar *n,
					 EST_StrVector &ngram,
					 void *params),
			void *params){
	n->iterate(ngram,function,params);
    };

    inline void const_iterate(const EST_Ngrammar *const n,EST_StrVector &ngram,
			void (*function)(const EST_Ngrammar *n,
					 EST_StrVector &ngram, 
					 void *params),
			void *params) const {
	n->const_iterate(ngram,function,params);
    };

    // go through all matching ngrams ( *(ngram[i])="" matches anything )
    void iterate(const EST_StrVector &ngram,void (*function)(EST_BackoffNgrammar *n,EST_StrVector &ngram, void *params),void *params);

    void const_iterate(EST_StrVector &ngram,void (*function)(const EST_BackoffNgrammar *const n,EST_StrVector &ngram, void *params),void *params) const;

public:
    
    EST_BackoffNgrammar();
    ~EST_BackoffNgrammar();

    // need to specify order and representation to initialise
    const bool init(int o, const EST_StrList &word_list);

    // prepare input - in place (?)
    /*
    const bool prepare_data(EST_StrList &input,
		      const EST_String sentence_break_marker = EST_String("."), 
		      char *delete_these="\"'`,:;!?(){}[] ");
		      */

    // build - can't set prev,prev_prev and last since these
    // are fixed to make backing off work - 
    const bool build(const EST_String filename, double min_count=1);


    // load - automatically determine format
    EST_read_status load(const EST_String filename);

    // save
    EST_write_status save(const EST_String filename, const EST_String type="arpa");

    // access
    const int wordlist_index(const EST_String &word) const;
    inline const int get_order() const { return order; };
    inline const int vocab_size() const { return vocab->size(); };
    inline const double get_min_frequency() const {return min_frequency; };
    inline const bool closed_vocab() const {return !allow_oov; };
    inline const double backoff_weight(int ord, const EST_StrVector &ngram) const
    { return bw[ord]->p(ngram); };

    // backed-off probability of an N-gram, given N items
    const double probability(const EST_StrVector &words) const;
    // backed off prob, given order
    const double probability(const EST_StrVector &words, int ord) const;

    // distribution, where s[] has 'order' entries
    // and the unknown (to be predicted) words are left blank
    const EST_TProbDistribution *const prob_dist(const EST_StrVector &words) const;

    void set_prob(const EST_StrVector &ngram, double p);


    // entropy of test data
    const double entropy(const EST_String filename) const;


    // can only use ARPA file format for now
    friend EST_read_status load_arpa(const EST_String filename, 
				     EST_BackoffNgrammar &n);
    friend EST_write_status save_arpa(const EST_String filename,
				      EST_BackoffNgrammar &n);

    // does this make sense for backoff grammar ??
    //friend bool Good_Turing_smooth(EST_BackoffNgrammar &n, int maxcount=10);

    friend void backoff_sub1(EST_Ngrammar *n,const EST_StrVector &ngram, 
			     void *bngrammar);
    friend void backoff_sub2(EST_Ngrammar *n, const EST_StrVector &ngram, 
			     void *bngrammar);

};
#endif

#endif // __EST_BACKOFFNGRAMMAR_H__

