/**********************************************************************
*
*  projet   : DilibPro
*  module   : SgmlText
*  commande : SgmlText
*  fichier  : SgmlText.c
*  Auteur   : Ducloy
*  Date     : Avril 95
*  Mise a jour Sept 97 (option toUpper)
*              Dec  97 (option reIndex)
*
****************************************************************************
*
* Copyright (c) 1995 CNRS/CRIN & INRIA Lorraine
* 
***********************************************************************/

#include "SgmlText.h"
#include "Except.h"
#include "SgmlCharSet.h"
#include <ctype.h>


SgmlTextIterator *SgmlTextIteratorCreate()
{
  SgmlTextIterator *i1;
  if(!(i1=(SgmlTextIterator *)malloc(sizeof(SgmlTextIterator))))
    {
      ExceptSetError ("SgmlText","00", "malloc failed","","",1);
    }
   i1->buffer=BufferCreate(100,100);
   i1->bufTrans=NULL;
   i1->string=NULL;
   i1->begin=NULL;
   i1->stopWordList=NULL;
   i1->minLenWord=0;
   i1->transco='S';
   return i1;
}

/*
 * Select the transcoding mode of the iterator.
 *
 * i1 : iterator to configure.
 * c1 : mode letter; 'S', 's', 'A', 'a' and 'l' are accepted.
 *
 * The letter is stored before it is validated, so a rejected letter is
 * still left in i1->transco. Mode 'l' needs a transcoding buffer, which is
 * created on first use. Any other letter is reported through
 * ExceptSetError.
 *
 * Returns i1.
 */
SgmlTextIterator *SgmlTextSetTransco(i1,c1)
SgmlTextIterator *i1;
char c1;
{
  i1->transco=c1;

  if (c1=='S' || c1=='s' || c1=='A' || c1=='a')
    {
      return i1;
    }

  if (c1=='l')
    {
      if (i1->bufTrans==NULL)
        {
          i1->bufTrans=BufferCreate(100,100);
        }
      return i1;
    }

  {
    /* One-character string holding the offending letter for the message. */
    char optionText[2];
    optionText[0]=c1;
    optionText[1]='\0';
    ExceptSetError ("SgmlText/Word","00", " transco option ",optionText," does not fit",1);
  }
  return i1;
}

/*
 * Record l1 as the minimum word length (minLenWord) of iterator i1.
 */
void SgmlTextWordSetMinLen(i1,l1)
SgmlTextIterator *i1;
int l1;
{
  (*i1).minLenWord = l1;
}

/*
 * Attach the text s1 to the iterator and rewind the read position.
 *
 * In modes 'S', 's', 'A' and 'a' the iterator reads s1 directly. In mode
 * 'l' the result of SgmlCharSetToLC(s1) is copied into the transcoding
 * buffer and the iterator reads that copy instead. For any other stored
 * mode only i1->string is updated and the read position is left untouched.
 *
 * Returns i1.
 */
SgmlTextIterator *SgmlTextIteratorInit(i1,s1)
SgmlTextIterator *i1;
char *s1;
{
  i1->string=s1;

  if (i1->transco=='l')
    {
      BufferStrcpy(i1->bufTrans, SgmlCharSetToLC(s1));
      i1->begin=BufferString(i1->bufTrans);
    }
  else if (i1->transco=='S' || i1->transco=='s'
           || i1->transco=='A' || i1->transco=='a')
    {
      i1->begin=s1;
    }

  return i1;
}

/*
 * Hand out the text lying between the current read position and pt1.
 *
 * The span [i1->begin, pt1) is copied into the iterator's output buffer,
 * then the read position is moved to pt1 and past any spaces that follow.
 *
 * Returns the string held by the output buffer.
 */
char *SgmlTextReturn(i1,pt1)
  SgmlTextIterator *i1;
  char *pt1;
{
  char *cursor;

  BufferStrncpy(i1->buffer, i1->begin, pt1-i1->begin);

  /* Skip the blanks separating this piece from the next one. */
  for (cursor=pt1; *cursor==' '; cursor++)
    ;
  i1->begin=cursor;

  return BufferString(i1->buffer);
}

/*
 * Register the stop-word table named o1 with the iterator.
 *
 * When the iterator already has a stop-word list, o1 is added to it with
 * StrSearchAddTable. Otherwise the list is obtained from
 * StrSearchGetTable(o1,100); if that yields nothing the failure is
 * reported through ExceptSetError.
 */
void SgmlTextAddStopWordTable(i1,o1)
  SgmlTextIterator *i1;
char *o1;
{
  if (i1->stopWordList)
    {
      StrSearchAddTable(i1->stopWordList, o1);
      return;
    }

  i1->stopWordList=StrSearchGetTable(o1,100);
  if (!i1->stopWordList)
    {
      ExceptSetError ("SgmlText/Word stopWordTable","00", " file ",o1," not founded",1);
    }
}

/*
 * Return the next sentence of the text attached to the iterator.
 *
 * A sentence ends on ';', '!' or '?', or on a '.' that is followed by a
 * space and an upper-case letter. Text from an '&' through the next ';'
 * is stepped over, so that semicolon is not taken for a sentence end.
 *
 * A sentence closed by one of those marks is returned through
 * SgmlTextReturn, i.e. as a copy held in the iterator's output buffer.
 * The remaining text after the last mark is returned as a pointer into the
 * scanned text itself, and the iterator is then exhausted (begin is NULL).
 *
 * Returns NULL when there is no text left.
 */
char *SgmlTextNextSent(i1)
  SgmlTextIterator *i1;
{
  char *nextPos;
  char *beginString;

  beginString=i1->begin;
  nextPos=i1->begin;
  if (!(i1->begin && i1->begin[0]))
    {
      return NULL;
    }

  while ((nextPos=strpbrk(nextPos,".&;!?")))
    {
      if (nextPos[0]=='&')
        {
          nextPos=strchr(nextPos,';');
          if (!nextPos)
            {
              /* '&' with no closing ';': nothing more to split; the rest
                 of the text is the last sentence. Leaving the loop here
                 keeps a null pointer from being passed to strpbrk. */
              break;
            }
          nextPos++;
          continue;
        }

      if (nextPos[0]=='.'
          && !((nextPos[1]==' ') && isupper((unsigned char)nextPos[2])))
        {
          /* A dot that does not close a sentence, keep scanning. */
          nextPos++;
          continue;
        }

      /* ';', '!', '?' or a sentence-closing '.'. */
      return SgmlTextReturn(i1,nextPos+1);
    }

  i1->begin=NULL;
  return beginString;
}

/*
 * Find the first alphabetic character of s1.
 *
 * i1 : iterator; not used by this function.
 * s1 : NUL-terminated string to scan.
 *
 * Returns a pointer to the first character for which isalpha is true, or
 * NULL when the string holds none.
 */
char *SgmlTextNextAlpha(i1,s1)
  SgmlTextIterator *i1;
char *s1;
{
  char *s2;

  for (s2=s1; s2[0]; s2++)
    {
      /* <ctype.h> functions need a value representable as unsigned char;
         a plain char may be negative for 8-bit text. */
      if (isalpha((unsigned char)s2[0])) return s2;
    }
  return NULL;
}

/*
 * Tell whether s1 starts with the name of an accent, including its closing
 * semicolon: "acute;", "cedil;", "circ;", "grave;" or "uml;".
 *
 * Returns s1 when it does, NULL otherwise.
 */
char *SgmlEntityIsAccent(s1)
char *s1;
{
  static const struct
  {
    const char *suffix;
    int length;
  } accents[] =
    {
      { "acute;", 6 },
      { "cedil;", 6 },
      { "circ;",  5 },
      { "grave;", 6 },
      { "uml;",   4 }
    };
  int k;

  for (k=0; k<(int)(sizeof accents / sizeof accents[0]); k++)
    {
      if (strncmp(s1, accents[k].suffix, accents[k].length)==0)
        {
          return s1;
        }
    }
  return NULL;
}

/*
 * Extract the next word from the text attached to the iterator, without
 * any stop-word or length filtering.
 *
 * The scan starts at the first alphabetic character found by
 * SgmlTextNextAlpha and collects the word into the iterator's output
 * buffer:
 *   - lower-case letters are copied as they are;
 *   - upper-case letters are lowered in modes 's' and 'a' and copied
 *     unchanged in modes 'l', 'S' and 'A';
 *   - in modes 's' and 'S', an entity of the form "&<char><accent>;" (see
 *     SgmlEntityIsAccent) counts as one letter of the word; mode 's'
 *     lowers the character that follows '&';
 *   - any other entity (modes 's'/'S') ends the word and is skipped
 *     through its ';';
 *   - any other character ends the word and is skipped.
 *
 * i1->lastLenWord receives the number of letters collected, an accent
 * entity counting for one.
 *
 * Returns the string held by the output buffer, or NULL when no alphabetic
 * character is left.
 *
 * NOTE(review): SgmlTextNextAlpha steps over '&', so an accent entity at
 * the very start of a word is not recognised as such; left unchanged here.
 */
char *SgmlTextNextWordCandidate(i1)
  SgmlTextIterator *i1;
{
  char c1;

  i1->lastLenWord=0;
  if (!(i1->begin && i1->begin[0]))
    {
      return NULL;
    }
  if (!(i1->begin=SgmlTextNextAlpha(i1,i1->begin)))
    {
      return NULL;
    }

  BufferReset(i1->buffer);
  while ((c1=i1->begin[0]))
    {
      /* <ctype.h> functions need a value representable as unsigned char. */
      if (islower((unsigned char)c1))
        {
          BufferCharCat(i1->buffer,c1);
          i1->begin++;
          i1->lastLenWord++;
          continue;
        }

      if (isupper((unsigned char)c1))
        {
          i1->lastLenWord++;
          switch(i1->transco)
            {
            case 's':
            case 'a':
              BufferCharCat(i1->buffer,tolower((unsigned char)c1));
              break;
            case 'l':
            case 'S':
            case 'A':
              BufferCharCat(i1->buffer,c1);
              break;
            }
          i1->begin++;
          continue;
        }

      /* Compare the mode, do not assign it: the former '=' overwrote
         i1->transco with 's' and made this test always true. */
      if ((c1=='&')&&((i1->transco=='s')||(i1->transco=='S')))
        {
          char *semiColon;
          semiColon=strchr(i1->begin,';');

          if (!semiColon)
            {
              /* '&' that opens no entity: treat it as a plain separator
                 instead of doing arithmetic on a null pointer. */
              i1->begin++;
              return BufferString(i1->buffer);
            }

          /* The accent name starts two characters after '&'; require the
             ';' to lie beyond that point so "&;" is never read as one. */
          if ((semiColon > i1->begin+1) && SgmlEntityIsAccent(i1->begin+2))
            {
              i1->lastLenWord++;
              if (i1->transco=='s')
                {
                  BufferCharCat(i1->buffer, '&');
                  BufferCharCat(i1->buffer, tolower((unsigned char)i1->begin[1]));
                  /* Accent name together with its closing ';'. */
                  BufferStrncat(i1->buffer, i1->begin+2, semiColon-i1->begin-1);
                }
              else
                {
                  /* Mode 'S': the whole entity, '&' through ';'. */
                  BufferStrncat(i1->buffer, i1->begin, semiColon-i1->begin+1);
                }
              i1->begin=semiColon+1;
              continue;
            }

          /* Some other entity: it ends the word. */
          i1->begin=semiColon+1;
          return BufferString(i1->buffer);
        }

      /* Any other character ends the word. */
      i1->begin++;
      return BufferString(i1->buffer);
    }
  return BufferString(i1->buffer);
}


/*
 * Return the next word that passes the iterator's filters.
 *
 * Candidates come from SgmlTextNextWordCandidate. A candidate is dropped
 * when its length (i1->lastLenWord) is below i1->minLenWord, or when a
 * stop-word list is present and StrSearch finds the word in it.
 *
 * The minimum length is applied whether or not a stop-word list has been
 * loaded; with the default minimum of 0 every candidate passes that test.
 *
 * Returns the word, or NULL when the text is exhausted.
 */
char *SgmlTextNextWord(i1)
  SgmlTextIterator *i1;
{
  char *nw1;

  while ((nw1=SgmlTextNextWordCandidate(i1)))
    {
      if (i1->lastLenWord < i1->minLenWord)
        {
          continue;
        }
      if (i1->stopWordList && StrSearch(i1->stopWordList,nw1))
        {
          continue;
        }
      return nw1;
    }
  return NULL;
}
