
/******************************************************************************
* MODULE     : parsehtml.gen.cc
* DESCRIPTION: conversion of html strings into logical html trees
* COPYRIGHT  : (C) 2000  Joris van der Hoeven
*******************************************************************************
* This software falls under the GNU general public license and comes WITHOUT
* ANY WARRANTY WHATSOEVER. See the file $TEXMACS_PATH/LICENSE for more details.
* If you don't have this file, write to the Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
******************************************************************************/

#include <analyze.gen.h>
#include <convert.gen.h>
#include <hashmap.gen.h>

#module code_parsehtml
#import analyze
#import convert
#import hashmap (string, string)

// Debugging aid: write tree t to cout, one node per line, indented by
// `tab` spaces; the children of a compound node are indented two more.
void
print_tree (tree t, int tab=0) {
  int col;
  for (col=0; col<tab; col++) cout << " ";
  if (!is_atomic (t)) {
    // compound node: print the constructor name, then recurse on children
    cout << CONSTRUCTOR_NAME [L(t)] << "\n";
    int child;
    for (child=0; child<N(t); child++)
      print_tree (t[child], tab+2);
    return;
  }
  // leaf: print its label
  cout << t->label << "\n";
}

/******************************************************************************
* Example of logical html tree:
* hopsakee <b x y> hola <x> hopsa </b> pok     yields
* (hopsakee ((b x y) hola (x) hopsa) pok)
******************************************************************************/

// Table from html entity names (without the surrounding '&' and ';') to
// their replacement strings. It is filled by the html_parser constructor
// and consulted by html_parser::parse_symbol.
// NOTE(review): the "" constructor argument is presumably the value used
// for missing keys -- confirm against the hashmap implementation.
static hashmap<string,string> html_symbol ("");

// Parser which turns an html string into a logical html tree.
// In all parsing methods, s is the complete input and i is the current
// read position in s; i is passed by reference and advanced as input
// gets consumed.
struct html_parser {
  // fills the html_symbol table if it is still empty
  html_parser ();

  // parse the whole input, starting at position i
  tree   parse_doc      (string s, int& i);
  // parse text and nested tags; caller is the label of the enclosing tag
  tree   parse          (string s, int& i, string caller);
  // parse the inside of a tag, i being just after the '<'
  tree   parse_tag      (string s, int& i);
  // parse the body of a pre/code element, keeping line breaks
  tree   parse_verbatim (string s, int& i);
  // scan a double-quoted string starting at i
  string parse_string   (string s, int& i);
  // decode the entity starting at the '&' at position i; in normal mode
  // a resulting "<" or ">" is rewritten as "<less>" or "<gtr>"
  string parse_symbol   (string s, int& i, bool normal_mode= TRUE);
};

/******************************************************************************
* Initialization
******************************************************************************/

// Fill the shared html_symbol table with the supported entity names.
// The table is a file-level static, so it is only filled by the first
// parser that gets constructed; later constructions return immediately.
// NOTE(review): the replacement strings for "nbsp" and for all accented
// letters below show up as empty literals. Presumably each one held a
// single 8-bit character which got lost when the file was re-encoded;
// confirm against a pristine copy of the file before relying on them.
html_parser::html_parser () {
  // already initialized by a previous parser
  if (N (html_symbol) != 0) return;
  // characters with a special meaning in html, and the non breaking space
  html_symbol ("quot")= "\"";
  html_symbol ("amp")= "&";
  html_symbol ("lt")= "<";
  html_symbol ("gt")= ">";
  html_symbol ("nbsp")= "";

  // entities whose name starts with a capital letter, and szlig
  html_symbol ("Agrave")= "";
  html_symbol ("Aacute")= "";
  html_symbol ("Acirc")= "";
  html_symbol ("Atilde")= "";
  html_symbol ("Auml")= "";
  html_symbol ("Aring")= "";
  html_symbol ("AElig")= "";
  html_symbol ("Ccedil")= "";
  html_symbol ("Egrave")= "";
  html_symbol ("Eacute")= "";
  html_symbol ("Ecirc")= "";
  html_symbol ("Euml")= "";
  html_symbol ("Igrave")= "";
  html_symbol ("Iacute")= "";
  html_symbol ("Icirc")= "";
  html_symbol ("Iuml")= "";
  html_symbol ("Ntilde")= "";
  html_symbol ("Oslash")= "";
  html_symbol ("Ograve")= "";
  html_symbol ("Oacute")= "";
  html_symbol ("Ocirc")= "";
  html_symbol ("Otilde")= "";
  html_symbol ("Ouml")= "";
  html_symbol ("Ugrave")= "";
  html_symbol ("Uacute")= "";
  html_symbol ("Ucirc")= "";
  html_symbol ("Uuml")= "";
  html_symbol ("Yacute")= "";
  html_symbol ("szlig")= "";

  // entities whose name starts with a lower case letter
  html_symbol ("agrave")= "";
  html_symbol ("aacute")= "";
  html_symbol ("acirc")= "";
  html_symbol ("atilde")= "";
  html_symbol ("auml")= "";
  html_symbol ("aring")= "";
  html_symbol ("aelig")= "";
  html_symbol ("ccedil")= "";
  html_symbol ("egrave")= "";
  html_symbol ("eacute")= "";
  html_symbol ("ecirc")= "";
  html_symbol ("euml")= "";
  html_symbol ("igrave")= "";
  html_symbol ("iacute")= "";
  html_symbol ("icirc")= "";
  html_symbol ("iuml")= "";
  html_symbol ("ntilde")= "";
  html_symbol ("oslash")= "";
  html_symbol ("ograve")= "";
  html_symbol ("oacute")= "";
  html_symbol ("ocirc")= "";
  html_symbol ("otilde")= "";
  html_symbol ("ouml")= "";
  html_symbol ("ugrave")= "";
  html_symbol ("uacute")= "";
  html_symbol ("ucirc")= "";
  html_symbol ("uuml")= "";
  html_symbol ("yacute")= "";
  html_symbol ("yuml")= "";
}

/******************************************************************************
* Subroutines for badly structured markup
******************************************************************************/

// Peek at the tag which starts at position i of s, without consuming it.
// Returns the text strictly between '<' and the next '>' (attributes
// included), or "" when s[i] is not '<' or no closing '>' follows.
static string
compute_tag (string s, int i) {
  int len= N(s);
  if ((i<len) && (s[i]=='<')) {
    int from= i+1;
    int to  = from;
    while ((to<len) && (s[to]!='>')) to++;
    if (to<len) return s (from, to);
  }
  return "";
}

// Tells whether tag2, when met inside an open tag1, implicitly ends tag1.
// This holds for a DT followed by DD or /DL, and for a DD followed by
// DT or /DL.
bool
special_sequence (string tag1, string tag2) {
  if ((tag1 == "DT") && ((tag2 == "DD") || (tag2 == "/DL"))) return TRUE;
  if ((tag1 == "DD") && ((tag2 == "DT") || (tag2 == "/DL"))) return TRUE;
  return FALSE;
}

// Tells whether tag is one of the list tags (UL, OL, DL, DT, DD) which
// the parser groups with their contents even if no closing tag is found.
bool
must_close (string tag) {
  if (tag == "UL") return TRUE;
  if (tag == "OL") return TRUE;
  if (tag == "DL") return TRUE;
  if (tag == "DT") return TRUE;
  if (tag == "DD") return TRUE;
  return FALSE;
}

/******************************************************************************
* Main parsing routine
******************************************************************************/

// Parse the complete input s from position i on and return one tuple
// with all top level items. Whenever parse gives up before the end of
// the input (it returns at a closing tag), the input up to and including
// the next '>' is skipped and parsing is resumed.
tree
html_parser::parse_doc (string s, int& i) {
  int len= N(s);
  tree doc (TUPLE);
  while (i<len) {
    tree part= parse (s, i, "");
    doc << A(part);
    // skip the remainder of the tag at which parse stopped
    for (; i<len; i++)
      if (s[i]=='>') break;
    if (i<len) i++;
  }
  return doc;
}

// Parse text and nested tags from position i of s on, until the end of
// the input, until a closing tag "</..." (which is left unconsumed), or
// until a tag which implicitly ends the enclosing tag `caller` (see
// special_sequence; i is then reset to the '<' of that tag).
// The result is a tuple whose items are text strings and tag tuples.
// A tag with a matching closing tag, or one for which must_close or
// special_sequence holds, yields one tuple (tag args...); any other tag
// is followed by its contents at the same level.
tree
html_parser::parse (string s, int& i, string caller) {
  int n= N(s);
  string r;        // pending text which has not yet been added to t
  tree t (TUPLE);
  while (i<n) {
    if (s[i]=='&') r << parse_symbol (s, i);
    else if (s[i]=='<') {
      // Comments are dropped and do not interrupt the pending text.
      // NOTE(review): read presumably advances i past the pattern when
      // it matches -- confirm against its definition.
      if (read (s, i, "<!--")) {
        while ((i<n) && (!read (s, i, "-->"))) i++;
        continue;
      }
      if (N(r)!=0) t << r;
      // a closing tag ends this level; the caller checks whether it matches
      if (((i+1)<n) && (s[i+1]=='/')) return t;
      else {
        int    start= i++;
        tree   tag  = parse_tag (s, i);
        // parse_tag may yield an empty tuple when the input ends right
        // after '<'; there is no label to look at in that case
        if (N(tag)==0) return t;
        string lab  = tag[0]->label;
        tree   args;

        if (special_sequence (caller, lab)) {
          // this tag implicitly closes the caller: let the caller see it
          i= start;
          return t;
        }
        else if ((lab == "pre") || (lab == "code") ||
                 (lab == "PRE") || (lab == "CODE"))
          args= parse_verbatim (s, i);
        else args= parse (s, i, lab);

        // The closing tag "</lab>" takes N(lab)+3 characters. It may end
        // exactly at the end of the input, hence the <= in the test.
        int close_end= i+N(lab)+3;
        if ((close_end <= n) && (s(i,close_end) == ("</" * lab * ">"))) {
          tree f= tree (TUPLE, tag);
          f << A(args);
          i= close_end;
          t << f;
          // cout << "CLS= " << f << "\n";
        }
        else if (special_sequence (lab, compute_tag (s, i)) ||
                 must_close (lab))
          {
            // implicitly closed tag: group it with its contents anyway
            tree f= tree (TUPLE, tag);
            f << A(args);
            t << f;
            // cout << "CLS= " << f << "\n";
          }
        else {
          // unclosed tag: keep the tag and its contents at the same level
          t << tag;
          t << A(args);
        }
      }
      r="";
    }
    else if (s[i]=='\"') // "
      r << parse_string (s, i);
    else if ((s[i]==' ') || (s[i]=='\t') ||
             (s[i]==((char) 10)) || (s[i]==((char) 13))) {
      // collapse each run of white space into a single space
      if ((N(r)==0) || (r[N(r)-1]!=' ')) r << ' ';
      i++;
    }
    else r << s[i++];
  }
  if (N(r)!=0) t << r;
  return t;
}

// Parse the inside of a tag, i being just after the opening '<'.
// The white space separated words up to the closing '>' (which is
// consumed) are returned as the items of a tuple; the first item is the
// tag name. The result always has at least one item, possibly the empty
// string, so that callers may safely look at the first item, also when
// the input ends before the closing '>'.
tree
html_parser::parse_tag (string s, int& i) {
  int n= N(s);
  string r;        // word which is currently being read
  tree t (TUPLE);
  while (i<n) {
    if (s[i]=='>') {
      if ((N(r)!=0) || (N(t)==0)) t << r;
      i++;
      // cout << "TAG= " << t << "\n";
      return t;
    }
    else if (s[i]=='\"') // "
      r << parse_string (s, i);
    else if ((s[i]==' ') || (s[i]=='\t') ||
             (s[i]==((char) 10)) || (s[i]==((char) 13))) {
      // white space ends the current word
      if (N(r)!=0) t << r;
      r="";
      i++;
    }
    else r << s[i++];
  }
  // Unterminated tag at the end of the input: apply the same rule as for
  // '>' above, so that an empty tuple is never returned.
  if ((N(r)!=0) || (N(t)==0)) t << r;
  return t;
}

// Scan s from position i on, up to and including the first double quote
// or up to the end of the input; a backslash makes the scan skip the
// character which follows it. Returns the scanned text and leaves i
// just after it.
// NOTE(review): the callers in this file call this routine with i on an
// opening quote. The first character examined is then that quote, so
// the scan stops right after it and the result is just that one
// character. This behaviour is kept as it is.
string
html_parser::parse_string (string s, int& i) {
  int from= i;
  int len = N(s);
  for (;;) {
    if (i>=len) break;
    char c= s[i];
    if ((c=='\\') && ((i+1)<len)) {
      i+=2;
      continue;
    }
    i++;
    if (c=='\"') break; // "
  }
  return s (from, i);
}

// Decode the character reference which starts at the '&' at position i.
// - If no name made of letters, digits and '#' followed by ';' is found,
//   only the '&' is consumed and "&" is returned.
// - Numeric references "&#ddd;" (decimal) and "&#xhh;" (hexadecimal)
//   with a code between 1 and 255 yield the character with that code.
// - Names found in html_symbol yield a copy of their replacement.
// - Anything else, including malformed numeric references and codes
//   which do not fit in one char, is returned unchanged as "&name;".
// In normal mode, a resulting "<" or ">" is rewritten as "<less>" or
// "<gtr>".
string
html_parser::parse_symbol (string s, int& i, bool normal_mode) {
  int start=++i, n=N(s);
  while ((i<n) && (is_alpha (s[i]) || is_digit (s[i]) || (s[i]=='#'))) i++;
  if ((i==n) || (s[i]!=';')) { i= start; return "&"; }
  string r= s (start, i);
  i++; // skip the ';'
  if ((r != "") && (r[0] == '#')) {
    int  k= 1, m= N(r), code= 0;
    bool hex= (k<m) && ((r[k]=='x') || (r[k]=='X'));
    if (hex) k++;
    bool ok= (k<m); // at least one digit is required
    while (ok && (k<m)) {
      char d= r[k++];
      int  v= -1;
      if ((d>='0') && (d<='9')) v= d-'0';
      else if (hex && (d>='a') && (d<='f')) v= d-'a'+10;
      else if (hex && (d>='A') && (d<='F')) v= d-'A'+10;
      if (v<0) ok= FALSE;
      else {
        code= code * (hex? 16: 10) + v;
        // stop early: the code no longer fits in one char (and this
        // also keeps code from overflowing on long digit strings)
        if (code>255) ok= FALSE;
      }
    }
    if (ok && (code>0)) {
      char c= (char) code;
      r= string (c);
    }
    else r= "&" * r * ";";
  }
  else if (html_symbol->contains (r))
    r= copy (html_symbol [r]);
  else r= "&" * r * ";";
  if (normal_mode) {
    if (r == "<") r= "<less>";
    if (r == ">") r= "<gtr>";
  }
  return r;
}

// Parse the body of a pre or code element from position i on. Parsing
// stops, without consuming anything, in front of "</pre>", "</code>",
// "</PRE>" or "</CODE>", or at the end of the input. The result is a
// tuple of text lines, each newline or carriage return character being
// replaced by a (BR) tuple. Entities are decoded by parse_symbol with
// normal_mode switched off; all other characters are copied verbatim.
tree
html_parser::parse_verbatim (string s, int& i) {
  int len= N(s);
  string line;      // text of the line which is being read
  tree result (TUPLE);
  while (i<len) {
    if (test (s, i, "</pre>") || test (s, i, "</code>") ||
        test (s, i, "</PRE>") || test (s, i, "</CODE>")) break;
    char c= s[i];
    if (c == '&') {
      // parse_symbol advances i itself
      line << parse_symbol (s, i, FALSE);
      continue;
    }
    if ((c==((char) 10)) || (c==((char) 13))) {
      if (N(line)!=0) result << line;
      result << tree (TUPLE, "BR");
      line= "";
    }
    else line << c;
    i++;
  }
  if (N(line)!=0) result << line;
  return result;
}

/******************************************************************************
* Interface
******************************************************************************/

// Entry point: convert the html string s into a logical html tree.
// For debugging, print s before parsing and call print_tree on the
// result.
tree
parse_html (string s) {
  html_parser parser;
  int pos= 0;
  tree doc= parser.parse_doc (s, pos);
  return doc;
}

#endmodule // code_parsehtml
