// Copyright 1998 by Kevin Atkinson under the terms of the LGPL

#include "editdist.hh"
#include <cstring>
#include <vector>

namespace autil {

  // edit_distance finds the shortest edit distance.  The edit distance is 
  // (cost of swap)(# of swaps) + (cost of deletion)(# of deletions) 
  //   + (cost of insertion)(# of insertions) 
  //   + (cost of substitutions)(# of substitutions)
  //   + (cost of "similar" match)(# of similar match)
  //
  // It does this by creating a graph of all possible edits and their
  // associated costs (based on the given weights) to go from (0,0),
  // neither string shifted, to (size_a,size_b), both strings shifted
  // to the end, and then uses a modified version of Dijkstra's
  // algorithm to find the shortest path.  The trick is that it does
  // not actually create the graph; instead it calculates the needed
  // information on the fly.  Also, unlike Dijkstra's, it does not push
  // all the vertices onto the priority queue.  Instead it only pushes
  // the nodes visited onto the heap.  This creates a HUGE performance
  // win when the distance is relatively low because most of the nodes
  // will not be visited at all.
  //
  // Let |a| = size of the first string and |b| = size of the second string.
  // The memory usage is on the order of |a||b|.
  // The worst case complexity is |a||b|lg(|a||b|).
  // However, it does much better when the edit distance is relatively low.

  // notes to self: consider making the push_down lazy.

  // Binary min-heap of (i, j, score) nodes ordered by score, combined with
  // a matrix that records, for every (i, j) coordinate, where its node
  // currently sits in the heap.  That index is what allows the score of an
  // already queued node to be lowered, or the node to be removed, without
  // searching the heap.
  //
  // Each lookup cell holds one of:
  //   NIL     the coordinate has never been queued,
  //   POPED   the coordinate has been taken off the heap for good,
  //   >= 0    the heap slot currently holding the coordinate's node.
  struct RandomAccessPriorityQueue {
    // Dense 2-D table of ints; cell (x, y) lives at data[x + y*size_x].
    class Matrix {
      int              size_x;
      std::vector<int> data;
    public:
      Matrix(int x, int y, int initial_value)
        : size_x(x), data(size_x*y, initial_value) {}
      int & operator() (int x, int y) {
        return data[x+y*size_x];
      }
    };

    // One heap node.  The fields are deliberately small because nodes are
    // copied a lot while sifting.  The coordinates are unsigned chars, so
    // they can only represent positions 0..255.
    struct D {
      short score;
      unsigned char i;
      unsigned char j;
      D() : score(0), i(0), j(0) {}
      D(unsigned char i0, unsigned char j0, short s) : score(s), i(i0), j(j0) {}
    };

    static const int NIL   = -1;
    static const int POPED = NIL - 1;

    Matrix         lookup;
    std::vector<D> heap;

    // Index arithmetic for the implicit binary tree stored in `heap`.
    static int left  (int i) {return (i*2)+1;}
    static int right (int i) {return (i+1)*2;}
    static int parent(int i) {return (i-1)/2;}

    // Creates a queue able to hold coordinates 0..x by 0..y inclusive.
    RandomAccessPriorityQueue(int x, int y)
      : lookup(x+1,y+1,NIL) {heap.reserve(16);}

    // Stores d in heap slot i and records that slot in the lookup matrix.
    void assign(int i, const D & d) {
      heap[i] = d;
      lookup(d.i,d.j) = i;
    }

    // Sifts the node in slot i towards the leaves until neither child has
    // a smaller score, keeping the lookup matrix in step.
    void push_down(int i) {
      const D d = heap[i];
      const int size = static_cast<int>(heap.size());
      while (1) {
        const int l = left(i);
        const int r = right(i);
        int min;

        if (r >= size) {
          if (l >= size) break;   // slot i is a leaf
          min = l;                // only a left child exists
        } else if (heap[l].score > heap[r].score) {
          min = r;
        } else {
          min = l;
        }

        if (d.score > heap[min].score) {
          assign(i, heap[min]);
          i = min;
        } else {
          break;
        }
      }
      assign(i, d);
    }

    // Sifts the node in slot i towards the root while its parent has a
    // strictly larger score, keeping the lookup matrix in step.
    void push_up(int i) {
      const D d = heap[i];
      while (i != 0) {
        const int p = parent(i);
        if (heap[p].score <= d.score) break;
        assign(i, heap[p]);
        i = p;
      }
      assign(i,d);
    }

    // Node with the smallest score.  The heap must not be empty.
    const D & minimum() {return heap[0];}

    // Deletes the node in slot i from the heap.  The lookup cell of the
    // deleted node is NOT touched; callers set it to whatever they need.
    void remove(int i) {
      const int last = static_cast<int>(heap.size()) - 1;
      if (i < 0 || i > last) return;   // no node lives in that slot

      if (i == last) {
        // Removing the tail: nothing has to move, and slot i no longer
        // exists afterwards, so it must not be read or written again.
        heap.pop_back();
        return;
      }

      heap[i] = heap[last];
      heap.pop_back();

      // The node taken from the tail may come from a different subtree, so
      // it can be smaller than its new parent as well as larger than its
      // new children.  Sift in whichever direction is needed.
      if (i > 0 && heap[i].score < heap[parent(i)].score)
        push_up(i);
      else
        push_down(i);
    }

    // Takes the minimum off the heap and marks its coordinate as POPED.
    void pop_minimum() {
      lookup(heap[0].i, heap[0].j) = POPED;
      remove(0);
    }

    // Unconditionally queues a new node for (i, j).
    void push_on(int i, int j, short score) {
      heap.push_back(D(i,j,score));
      push_up(heap.size()-1);
    }

    // Offers `score` for coordinate (i, j):
    //   - ignored if the coordinate was already popped,
    //   - queued if the coordinate has never been seen,
    //   - otherwise the queued node's score is lowered if `score` is smaller.
    void possibly_push(int i, int j, short score) {
      const int p = lookup(i,j);
      if (p == POPED) return;
      if (p == NIL) {
        push_on(i,j,score);
        return;
      }
      if (heap[p].score > score) {
        heap[p].score = score;
        push_up(p);
      }
    }

    // Moves the minimum node by (i, j) without changing its score (a free
    // step).  The old coordinate becomes POPED.  If the new coordinate was
    // already popped the node is simply dropped; if it is already queued,
    // that queued node cannot have a smaller score than the current
    // minimum, so it is removed and superseded by the moved node.
    void adjust_top(int i, int j) {
      D moved = heap[0];
      lookup(moved.i, moved.j) = POPED;
      moved.i += i;
      moved.j += j;

      const int p = lookup(moved.i, moved.j);
      if (p == POPED) {
        remove(0);
        return;
      }
      if (p != NIL)
        remove(p);   // never disturbs slot 0: the root holds the minimum

      // Record the node under its new coordinate so that later
      // possibly_push() calls find it instead of a NIL or stale cell.
      assign(0, moved);
    }

    // Pops the minimum and offers the coordinate shifted by (i, j) at the
    // popped node's score plus `score`.
    void adjust_top(int i, int j, short score) {
      const D d = heap[0];
      pop_minimum();
      possibly_push(d.i + i, d.j + j, d.score + score);
    }

#if 0 // check_heap & print_heap only needed for debugging
    void check_heap(int i) {
      if (i >= heap.size()) return;
      if (!(heap[parent(i)].score <= heap[i].score)) {
	cout << "invalid heap:\n";
	print_heap();
	abort();
      }
      check_heap(right(i));
      check_heap(left(i));
    }

    void print_heap() {
      for (int i=0; i!=heap.size(); ++i) {
	cout << " " << heap[i].i << " " << heap[i].j << " " 
	     << heap[i].score << endl;
      }
    }
#endif
  };


  short edit_distance(const char * a, const char * b, 
		      const char * a2, const char * b2,
		      const EditDistanceWeights & weight) {

    int a_size = strlen(a);
    int b_size = strlen(b);

    RandomAccessPriorityQueue    q(a_size, b_size);
    RandomAccessPriorityQueue::D d;
  
    q.possibly_push(0,0,0.0);

    while ( ! (q.minimum().i == a_size && q.minimum().j == b_size)) {
      if (a[q.minimum().i] == b[q.minimum().j]) {

	q.adjust_top(1, 1);

      } else if (a2[q.minimum().i] == b2[q.minimum().j]) {

	q.adjust_top(1, 1, weight.similar);

      } else if (q.minimum().i == a_size) {

	q.adjust_top(0, 1, weight.del2);

      } else if (q.minimum().j == b_size) {

	q.adjust_top(1, 0, weight.del1);

      } else {

	d = q.minimum();
	q.pop_minimum();
      
	if (a2[d.i] == b2[d.j+1] && b2[d.j] == a2[d.i+1]) {
	  int w = d.score + weight.swap;
	  if (weight.similar != 0) {
	    w += 2*weight.similar;
	    if (a[d.i] == b[d.j+1]) w -= weight.similar;
	    if (b[d.j] == a[d.i+1]) w -= weight.similar;
	  }
	  q.possibly_push(d.i+2,d.j+2,w);
	}

	q.possibly_push(d.i+1, d.j  , d.score + weight.del1);
	q.possibly_push(d.i  , d.j+1, d.score + weight.del2);
	q.possibly_push(d.i+1, d.j+1, d.score + weight.sub);

      }	  
    }

    return q.minimum().score;
  }

}
