/*   spattern.c
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
* File Name:  spattern.c
*
* Author:  Epstein
*
* Version Creation Date:   1/4/94
*
* $Revision: 6.0 $
*
* File Description:
*       Sequence-specific pattern matching
*
* Modifications:
* --------------------------------------------------------------------------
* Date     Name        Description of modification
* -------  ----------  -----------------------------------------------------
*
* $Log: spattern.c,v $
* Revision 6.0  1997/08/25 18:54:44  madden
* Revision changed to 6.0
*
* Revision 1.2  1996/06/21 14:10:44  epstein
* add boilerplate and run 'indent'
*
*
* ==========================================================================
*/

#include <ncbi.h>
#include <sequtil.h>
#include <seqport.h>
#include <spattern.h>
#include <gpattern.h>

/*
   Reads the residues covered by a SeqLoc into a newly allocated,
   NUL-terminated string.
   Where:
     [I] slp is the location whose residues are read through a SeqPort
     [I] is_na selects the alphabet requested from the SeqPort:
         Seq_code_iupacna when TRUE, Seq_code_ncbieaa otherwise
   Returns the buffer, which the caller releases with MemFree, or NULL when
   SeqLocLen reports a negative length, the SeqPort cannot be opened or the
   buffer cannot be allocated.  SEQPORT_EOS and SEQPORT_VIRT markers are not
   copied, so the string may be shorter than SeqLocLen (slp).
*/
static CharPtr 
load_seq_data (SeqLocPtr slp, Boolean is_na)
{
    CharPtr         seq;
    Int4            length;
    Int4            i = 0;
    SeqPortPtr      spp;
    Uint1           res;
    Uint1           code;

    length = SeqLocLen (slp);
    if (length < 0)
	return NULL;		/* no usable length, nothing to size the buffer with */

    code = is_na ? Seq_code_iupacna : Seq_code_ncbieaa;

    /* open the port first so that a failure leaves nothing to release */
    spp = SeqPortNewByLoc (slp, code);
    if (spp == NULL)
	return NULL;

    seq = MemNew ((size_t) (length + 1) * sizeof (Char));
    if (seq == NULL) {
	SeqPortFree (spp);
	return NULL;
    }

    while ((res = SeqPortGetResidue (spp)) != SEQPORT_EOF) {
	if (res == SEQPORT_EOS || res == SEQPORT_VIRT)
	    continue;
	/* never write past the length+1 bytes allocated above */
	if (i >= length)
	    break;
	seq[i++] = res;
    }
    seq[i] = '\0';
    SeqPortFree (spp);

    return seq;
}


/*
   returns the number of times the pattern was matched in the SeqLoc, or
   -1 when slp is NULL or not a SEQLOC_INT, when seqLocType is neither
   SEQLOC_INT nor SEQLOC_PNT, when compPat or seqLocs is NULL, or when the
   sequence data cannot be loaded.
   Where:
     [I] compPat is the compiled pattern to search for
     [I] slp is the SEQLOC_INT whose residues are searched
     [I] is_na is TRUE for nucleotide data, FALSE for protein data
     [I] maxMatches is the maximum number of matches which we're interested
         in hearing about
     [I] maxRange is the longest match of interest.  This is useful to
         avoid excessive computation when trying to match a pattern of
         interest (it is handed to re_search as its range argument)
     [I] seqLocType is the type of the resulting seqLocs
     [I] offset is a value which is added to the resulting start position,
	 e.g., for use as a restriction-enzyme cutsite; it is only applied
	 when seqLocType is SEQLOC_PNT
     [O] seqLocs is an array of size >= maxMatches which is populated by the
             function with one newly created SeqLoc per match
*/
Int4 LIBCALL 
PatternMatchSeqLoc (CompiledPattern compPat, SeqLocPtr slp, Boolean is_na,
		    Int4 maxMatches, Int4 maxRange, Uint1 seqLocType, Int4 offset, SeqLocPtr PNTR seqLocs)
{
    CharPtr         buf;

    SeqLocPtr       slp2;

    Int4            count = 0;

    Int4            len;

    int             pos;

    struct patstuff *pts = (struct patstuff *) compPat;

    Int4            start, stp;

    Int4            matchEnd;

    Int1            strand;

    SeqIdPtr        sip;

    if (slp == NULL || slp->choice != SEQLOC_INT) {
	ErrPostEx (SEV_ERROR, 0, 0, "Only SEQLOC_INT is supported by PatternMatchSeqLoc");
	return -1;
    }
    if (seqLocType != SEQLOC_INT && seqLocType != SEQLOC_PNT) {
	return -1;
    }
    /* both are dereferenced below; reject them before loading any data */
    if (pts == NULL || seqLocs == NULL) {
	return -1;
    }
    if ((buf = load_seq_data (slp, is_na)) == NULL) {
	return -1;
    }
    start = SeqLocStart (slp);
    stp = SeqLocStop (slp);
    strand = SeqLocStrand (slp);
    sip = SeqLocId (slp);
    len = StrLen (buf);

    pos = 0;
    while (count < maxMatches && (pos = re_search (&pts->patbuf, buf, len, pos, maxRange, &pts->regs)) >= 0) {
	/* buffer index one past the last matched residue */
	matchEnd = pts->regs.end[0];
	slp2 = NULL;

	switch (seqLocType) {
	case SEQLOC_INT:
	    if (strand == Seq_strand_minus) {
		/*
		 * buf[k] is taken to be sequence position stp - k on the
		 * minus strand, so buf[pos .. matchEnd-1] covers
		 * stp-(matchEnd-1) .. stp-pos.
		 */
		slp2 = SeqLocIntNew (stp - matchEnd + 1, stp - pos, strand, sip);
	    } else {
		/* both ends are buffer indices and must be shifted by start */
		slp2 = SeqLocIntNew (start + pos, start + matchEnd - 1, strand, sip);
	    }
	    break;

	case SEQLOC_PNT:
	    /*
	     * NOTE(review): the minus-strand point is still computed as
	     * stp - matchEnd - 1 - offset, which does not line up with the
	     * interval computed above -- confirm the intended position.
	     */
	    slp2 = SeqLocPntNew (strand == Seq_strand_minus ? stp - matchEnd - 1 - offset
				 : start + pos + offset, strand, sip, FALSE);
	    break;

	default:
	    /* unreachable: seqLocType was validated above */
	    break;
	}
	/* only report locations that were actually created */
	if (slp2 != NULL) {
	    seqLocs[count++] = slp2;
	}
	/* resume one residue further on so overlapping matches are found */
	pos++;
    }

    MemFree (buf);

    return count;
}



/*
   One row of the ambiguity table: a character paired with the
   regular-expression bracket expression it is expanded to.
*/
typedef struct {
    Char            unambig;	/* character looked up in the table */
    CharPtr         ambig;	/* bracket expression substituted for it */
}               Ambiguity, *AmbiguityPtr;

#define NUM_DNA_AMB_CHARS 15

/*
   Expansion table for nucleotide codes.  Every entry has to be a complete
   bracket expression, because the entries are concatenated verbatim into
   the regular expression that is later compiled.

   NOTE(review): the 'H' and 'K' rows do not list the code itself, unlike
   the other rows -- confirm whether that is intentional.
*/
Ambiguity       DNAAmbiguity[NUM_DNA_AMB_CHARS] = {
    {'A', "[ADHMNRVW]"},
    {'B', "[GTCBN]"},
    {'C', "[BCHMNSVY]"},
    {'D', "[GATDN]"},
    {'G', "[BDGKNRSV]"},
    {'H', "[ACTN]"},
    {'K', "[GTBDN]"},
    {'M', "[ACHVMN]"},		/* was an unterminated "[ACH" */
    {'N', "[ABCDGHKMNRSTVWY]"},
    {'R', "[AGDRN]"},
    {'S', "[CGBVSN]"},
    {'T', "[BDHKNTWY]"},
    {'V', "[ACGVN]"},		/* was empty, which dropped V from the pattern */
    {'W', "[ATDHWN]"},
    {'Y', "[CTBHYN]"}
};

/*
   Looks c up in DNAAmbiguity and hands back the bracket expression stored
   in the first row whose character equals c.  A character without a row
   yields the empty string.
*/
static CharPtr 
GenerousDNAAmbiguity (Char c)
{
    Int2 row = 0;

    while (row < NUM_DNA_AMB_CHARS) {
	if (DNAAmbiguity[row].unambig == c) {
	    return DNAAmbiguity[row].ambig;
	}
	row++;
    }

    /* not in the table */
    return "";
}


/*
   Filter applied to each match found for a recognition sequence.  At
   present this is a placeholder: none of the arguments is examined and
   every match is accepted.
*/
static Boolean
CleanMatch (SeqLocPtr slp,
	    CharPtr recog,
	    Int2 numPermittedPartialMatches)
{
    /* accept unconditionally */
    return TRUE;
}

#define MAX_EXPANDED_RENZYME 256
#define MAX_MATCHES 512

SeqAnnotPtr LIBCALL 
FindCutSitesAsSeqAnnot (SeqLocPtr querySlp, CutsiteInfoPtr cutsites, CharPtr cutDbName, Int2 numPermittedPartialMatches)
{
    SeqAnnotPtr     annot;

    Char            regexp[MAX_EXPANDED_RENZYME];

    CharPtr         charp;

    CompiledPattern pat;

    Int4            matches;

    Int4            match;

    SeqLocPtr       retvals[MAX_MATCHES];

    SeqLocPtr       slp, slptail;

    SeqFeatPtr      sfp = NULL;

    SeqFeatPtr      sfptail;

    SeqFeatPtr      newsfp = NULL;

    ObjectIdPtr     oip;

    DbtagPtr        dbtag;

    ValNodePtr      rrp;

    ValNodePtr      desc;

    if (querySlp == NULL || querySlp->choice != SEQLOC_INT) {
	ErrPostEx (SEV_ERROR, 0, 0, "Only SEQLOC_INT is supported by FindCutSitesAsSeqAnnot");
	return NULL;
    }
    /* for ( each restriction enzyme ) do */
    for (; cutsites->enzymeName != NULL; cutsites++) {
	for (regexp[0] = '\0', charp = cutsites->recognitionSeq; *charp; charp++) {
	    StrCat (regexp, GenerousDNAAmbiguity (*charp));
	}
	pat = CompilePattern (regexp);
	matches = PatternMatchSeqLoc (pat, querySlp, TRUE, MAX_MATCHES, INT2_MAX, SEQLOC_INT, 0, retvals);
	slp = NULL;
	for (match = 0; match < matches; match++) {
	    if (CleanMatch (retvals[match], cutsites->recognitionSeq,
			    numPermittedPartialMatches)) {
		if (slp == NULL) {
		    slp = ValNodeNew (NULL);
		    slp->choice = SEQLOC_EQUIV;
		    slp->data.ptrvalue = retvals[match];
		} else {
		    slptail->next = retvals[match];
		}
		retvals[match]->next = NULL;
		slptail = retvals[match];
	    } else {
		SeqLocFree (retvals[match]);
	    }
	}
	if (slp != NULL) {
	    sfp = SeqFeatNew ();
	    sfp->data.choice = SEQFEAT_RSITE;
	    sfp->qual = NULL;
	    sfp->id.choice = 3;	/* local */
	    oip = ObjectIdNew ();
	    oip->str = StringSave (cutsites->recognitionSeq);
	    sfp->id.value.ptrvalue = oip;
	    dbtag = DbtagNew ();
	    dbtag->db = StringSave (cutDbName);
	    oip = ObjectIdNew ();
	    oip->str = StringSave (cutsites->enzymeName);
	    dbtag->tag = oip;
	    rrp = ValNodeNew (NULL);
	    rrp->choice = 2;
	    rrp->data.ptrvalue = dbtag;
	    sfp->data.value.ptrvalue = rrp;
	    sfp->comment = NULL;
	    sfp->location = slp;
	    sfp->product = NULL;
	    sfp->next = NULL;
	    if (newsfp == NULL) {
		newsfp = sfp;
	    } else {
		sfptail->next = sfp;
	    }
	    sfptail = sfp;
	}
    }

    desc = ValNodeNew (NULL);
    desc->choice = Annot_descr_name;
    desc->data.ptrvalue = StringSave ("cutsites");

    annot = SeqAnnotNew ();
    annot->type = 1;
    annot->data = newsfp;
    annot->desc = desc;

    return annot;
}
