/*
 * regexp.c	-- STklos Regexps
 * 
 * Copyright  2000-2001 Erick Gallesio - I3S-CNRS/ESSI <eg@unice.fr>
 * 
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 
 * USA.
 * 
 *           Author: Erick Gallesio [eg@unice.fr]
 *    Creation date: 24-Nov-2000 10:35 (eg)
 * Last file update: 16-Jan-2001 22:06 (eg)
 */

#include "stklos.h"
#include "../pcre/pcreposix.h"


struct regexp_obj {
  stk_header header;
  regex_t buffer;
};

#define REGEXPP(p) 		(BOXED_TYPE_EQ((p), tc_regexp))
#define REGEXP_BUFFER(p)	(((struct regexp_obj *) (p))->buffer)

static void error_bad_string(SCM obj)
{
  STk_error("bad string ~S", obj);
}



static void signal_regexp_error(int code, regex_t *buffer)
{
  size_t len;
  char *msg;

  /* First call regerror with a null buffer to get the message length */
  len = regerror(code, buffer, NULL, 0);

  /* Call again regerror with a freshly allocated buffer */
  msg = STk_must_malloc_atomic(len+2); /* ??? */
  regerror(code, buffer, msg, len+1);

  STk_error("%s", msg);
}

static void regexp_finalizer(SCM re)
{
  regfree(&REGEXP_BUFFER(re));
}

       
/*
<doc EXT string->regexp
 * (string->regexp string)
 *
 * |String->regexp| takes a string representation of a regular
 * expression and compiles it into a regexp value. Other regular
 * expression procedures accept either a string or a regexp value as
 * the matching pattern. If a regular expression string is used
 * multiple times, it is faster to compile the string once to a regexp
 * value and use it for repeated matches instead of using the string
 * each time.  
doc> 
 */
DEFINE_PRIMITIVE("string->regexp", regcomp, subr1, (SCM re))
{
  SCM z;
  int ret;

  ENTER_PRIMITIVE(regcomp);
  if (!STRINGP(re)) error_bad_string(re);
  NEWCELL_ATOMIC(z, regexp, sizeof(struct regexp_obj) ); 

  ret = regcomp(&REGEXP_BUFFER(z), STRING_CHARS(re), 0);

  if (ret) signal_regexp_error(ret, &REGEXP_BUFFER(z));
  STk_register_finalizer(z, regexp_finalizer);

  return z;
}

/*
<doc EXT regexp?
 * (regexp? obj) 
 *
 * |Regexp| returns |#t| if |obj| is a regexp value created by the |regexp|, 
 * otherwise |regexp| returns |#f|.
doc>
 */
DEFINE_PRIMITIVE("regexp?", regexpp, subr1, (SCM obj))
{
  return MAKE_BOOLEAN(REGEXPP(obj));
}


/*
<doc EXT regexp-match regexp-match-positions
 * (regexp-match pattern str)
 * (regexp-match-positions pattern str)
 *
 * These functions attempt to match |pattern| (a string or a regexp value) 
 * to |str|. If the match fails, |#f| is returned. If the match succeeds, 
 * a list (containing strings for |regexp-match| and positions for 
 * |regexp-match-positions| is returned. The first string (or positions) in
 * this list is the portion of string that matched pattern. If two portions 
 * of string can match pattern, then the earliest and longest match is found,
 * by default. 
 * 
 * Additional strings or positions are returned in the list if pattern contains
 * parenthesized sub-expressions; matches for the sub-expressions are provided
 * in the order of the opening parentheses in pattern. 
 * @lisp
 * (regexp-match-positions "ca" "abracadabra")
 *                  => ((4 6))
 * (regexp-match-positions "CA" "abracadabra")
 *                  => #f
 * (regexp-match-positions "(?i)CA" "abracadabra")
 *                  => ((4 6))
 * (regexp-match "(a*)(b*)(c*)" "abc")
 *                  => ("abc" "a" "b" "c")
 * (regexp-match-positions "(a*)(b*)(c*)" "abc")
 *                  => ((0 3) (0 1) (1 2) (2 3))
 * (regexp-match-positions "(a*)(b*)(c*)" "c")
 *                  => ((0 1) (0 0) (0 0) (0 1))
 * (regexp-match "(?<=\\d@{3@})(?<!999)foo" "999foo and 123foo")
 *      => ((14 17)) 
 * @end lisp
doc>
*/
static SCM regexec_helper(SCM re, SCM str, int pos_only)
{
  regmatch_t pmatch[10];
  int i, ret;
  SCM result;

  /* RE can be a string or a already compiled regexp */
  if (STRINGP(re)) re = STk_regcomp(re);
  else if (!REGEXPP(re)) STk_error("bad compiled regexp ~S", re);

  if (!STRINGP(str)) error_bad_string(str);

  ret = regexec(&REGEXP_BUFFER(re), STRING_CHARS(str), 10, pmatch, 0);
  if (ret) {
    if (ret == REG_NOMATCH) /* No match ==> #f */ return STk_false;
    signal_regexp_error(ret, &REGEXP_BUFFER(re));
  }

  result = STk_nil;
  for(i=0; i <10; i++) {
    int from = pmatch[i].rm_so;
    int to   = pmatch[i].rm_eo;

    if (from < 0) break;
    
    result = STk_cons((pos_only)? 
		         LIST2(STk_long2integer(from), STk_long2integer(to)) :
		         STk_makestring(to-from, STRING_CHARS(str)+from),
		      result);
  }
  return STk_dreverse(result);
}


DEFINE_PRIMITIVE("regexp-match", regexec, subr2, (SCM re, SCM str))
{
  ENTER_PRIMITIVE(regexec);
  return regexec_helper(re, str, FALSE);
}

DEFINE_PRIMITIVE("regexp-match-positions", regexec_pos, subr2, (SCM re, SCM str))
{
  ENTER_PRIMITIVE(regexec_pos);
  return regexec_helper(re, str, TRUE);
}

	  
/*===========================================================================*\
 * 
 * 				Initialization
 * 
\*===========================================================================*/

/* The stucture which describes the regexp type */
struct extended_type_descr xtype_regexp = {
  "regexp",			/* name */
  NULL				/* print function */
};



int STk_init_regexp(void)
{
  DEFINE_XTYPE(regexp,  &xtype_regexp);

  ADD_PRIMITIVE(regcomp);
  ADD_PRIMITIVE(regexec);
  ADD_PRIMITIVE(regexec_pos);
  ADD_PRIMITIVE(regexpp);
  return TRUE;
}
