/*   -*- coding: utf-8 -*-  */

/***********************************************************************
*
*        Module   : Tei
*        Fichier  : TeiSplitUsual.c
*        Auteur   : J. DUCLOY
*        Date     : octobre 2016
*
************************************************************************/

#include "Utf8Text.h"
#include "Utf8Converter.h"
#include "TeiHandler.h"
#include "SxmlNode.h"
#include "StrDict.h"
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

/* Stop-word dictionary: a word found in it is skipped by splitTextSplitUnit().
   NULL (the default) disables the filter; filled by options -s / -S. */
StrDict *Utf8SplitStopWordDict=NULL;
/* Replacement dictionary: when a word is found, the associated value is used
   in its place. NULL disables replacement; filled by options -r / -R. */
StrDict *Utf8SplitReplaceDict=NULL;
/* Maps an input record key to the key printed on output lines.
   NULL means the input record key is printed as is; filled by -k / -K. */
StrDict *TeiKeyDict=NULL;
/* Non-zero: words are passed through Utf8StringToLower() before the
   dictionary lookups (option -l). */
int      Utf8LowerCaseMode;
/* Per-record table: word -> pointer to a malloc'd double holding the
   accumulated weight. Re-created by main() for every record. */
StrDict *wordTable;
/* Verbosity of the trace output on stdout (option -D); 0 is silent. */
int      debugLevel;
/* Words whose Utf8Length() is below this value are ignored (option -m). */
int      minLength;

/*
 * Return the number of space characters (' ') contained in the text nodes
 * of the subtree rooted at textNode.
 *
 * A text node contributes the number of spaces in its value; any other node
 * contributes the sum over the children delivered by SxmlReset() /
 * SxmlNextNode().  The result is used by the caller as the word count of
 * the subtree.
 */
int splitTextComputeNumberWords(SxmlNode *textNode)
{
  int spaceCount = 0;
  SxmlNode *child;

  if (SxmlIsText(textNode))
    {
      char *cursor;

      /* Hop from one space to the next until strchr() finds no more. */
      for (cursor = strchr(SxmlNodeValue(textNode), ' ');
	   cursor;
	   cursor = strchr(cursor + 1, ' '))
	{
	  spaceCount++;
	}
      return spaceCount;
    }

  /* Element node: accumulate the counts of all its children. */
  SxmlReset (textNode);
  for (child = SxmlNextNode(textNode); child; child = SxmlNextNode(textNode))
    {
      spaceCount += splitTextComputeNumberWords(child);
    }
  return spaceCount;
}

/*
 * Walk the subtree rooted at textNode and credit every retained word found
 * in its text nodes in the global wordTable.
 *
 *   textNode     node to process (text node, or element whose children are
 *                visited recursively)
 *   w, len       weighting factors supplied by the caller
 *   numberWords  normalising count supplied by the caller
 *
 * Each occurrence of a word adds  w * len / numberWords  to its entry.
 * A word is dropped when it is shorter than minLength or is found in
 * Utf8SplitStopWordDict; it is substituted when Utf8SplitReplaceDict has an
 * entry for it.  Lower-casing (Utf8LowerCaseMode) is applied before both
 * lookups.
 *
 * New entries store a strdup'd key and a malloc'd double in wordTable.
 * The process is terminated with EXIT_FAILURE if either allocation fails.
 */
void splitTextSplitUnit(SxmlNode *textNode, double w, double len, double numberWords)
{
  SxmlNode *textUnit;
  if (SxmlIsText(textNode))
    {
      char *toParse;
      char *word;
      char *targetWord;
      /* Contribution of one occurrence; identical for every word of the call. */
      const double increment=w*len/numberWords;

      toParse=SxmlNodeValue(textNode);
      while ((word=Utf8GetAlphaString(toParse, &toParse, 0)))
	{
	  double *pw1;
	  if (Utf8Length(word)<minLength) continue;
	  /* NOTE(review): if Utf8StringToLower() allocates its result, it is
	     never released here - confirm against its implementation. */
	  if (Utf8LowerCaseMode) targetWord=Utf8StringToLower(word);
	  else targetWord=word;
	  if (Utf8SplitStopWordDict)
	    {
	      if ( StrDictSearch(Utf8SplitStopWordDict, targetWord)) continue;
	    }
	  if (Utf8SplitReplaceDict)
	    {
	      char *newWord;
	      if ((newWord=StrDictSearch(Utf8SplitReplaceDict, targetWord)) )
		{
		  targetWord=newWord;
		}
	    }
	  if ((pw1=(double *)StrDictSearch(wordTable, targetWord)))
	    {
	      /* Word already known for this record: accumulate. */
	      *pw1+=increment;
	      if (debugLevel>0)printf("%s\t%s\t%f\n", SxmlInputRecordKey, targetWord, *pw1);
	    }
	  else
	    {
	      /* First occurrence: the table gets its own copy of the key and a
		 heap cell for the running weight. */
	      char *key;
	      pw1=malloc(sizeof *pw1);
	      key=strdup(targetWord);
	      if (!pw1 || !key)
		{
		  fprintf(stderr, "splitTextSplitUnit: out of memory\n");
		  exit(EXIT_FAILURE);
		}
	      *pw1=increment;
	      if (debugLevel>0)printf("%s\t%s\t%f\n", SxmlInputRecordKey, targetWord, *pw1);
	      StrDictAddNewDatum(wordTable, key, (char *) pw1);
	    }
	}
      return;
    }
  /* Not a text node: recurse into every child. */
  SxmlReset (textNode);
  while ((textUnit=SxmlNextNode(textNode)))
    {
      splitTextSplitUnit(textUnit, w, len, numberWords);
    }
  return;
}

/*
 * Weight every word of the subtree rooted at textNode into wordTable.
 *
 *   textNode  root of the subtree; a NULL root is ignored
 *   w         weight factor forwarded to splitTextSplitUnit()
 *   len       length factor forwarded to splitTextSplitUnit()
 *
 * The normalising count comes from splitTextComputeNumberWords(), which
 * counts space characters, so a text without any space yields 0.  The count
 * is clamped to 1 before use so that the per-word weight
 * w*len/numberWords never becomes a division by zero.
 */
void splitTextRootNode(SxmlNode *textNode, double w, int len)
{
  double numberWords;

  /* main() passes the result of SxmlFirstChild(); presumably NULL when the
     element has no child - confirm. Nothing to split in that case. */
  if (!textNode) return;

  numberWords=(double)splitTextComputeNumberWords(textNode);
  /* The trace shows the raw count, before clamping. */
  if (debugLevel>0)printf("%s\t%f\n", SxmlInputRecordKey, numberWords);
  if (numberWords<1.0) numberWords=1.0;
  splitTextSplitUnit(textNode,  w, (double) len, numberWords);
}

/* Old-style (unprototyped) declaration of the option parser used by main(). */
int getopt();
/* Argument of the option most recently returned by getopt(). */
extern char *optarg;
/* Index in argv of the next element getopt() will examine. */
extern int optind;

/* Not referenced anywhere else in the visible part of this file. */
SxmlNode *inputDoc;

int main(int argc, char **argv)
{
  char cOption;
  debugLevel=0;
  minLength=2;
  Utf8LowerCaseMode=0;

  while((cOption=getopt(argc,argv,"k:lm:r:s:D:K:R:S:"))!=EOF)
    {
      switch(cOption)
	{
	case 'D':
	  debugLevel=atoi(optarg);
	  break;
	case 'k':
	  if (!TeiKeyDict) TeiKeyDict=NewStrDict();
	  StrDictAddNewDatum(TeiKeyDict, optarg, argv[++optind-1]);
	  break;
	case 'K':
	  if (!TeiKeyDict) TeiKeyDict=StrDictFromFile(optarg);
	  else StrDictAddFromFile(TeiKeyDict, optarg);
	case 'l':
	  Utf8LowerCaseMode=1;
	  break;
	case 'm':
	  minLength=atoi(optarg);
	  break;
	case 's':
	  if (!Utf8SplitStopWordDict) Utf8SplitStopWordDict=NewStrDict();
	  StrDictAddNewDatum(Utf8SplitStopWordDict, optarg, optarg);
	  break;
	case 'S':
	  if (!Utf8SplitStopWordDict) Utf8SplitStopWordDict=StrDictFromFile(optarg);
	  else StrDictAddFromFile(Utf8SplitStopWordDict, optarg);
	  break;
	case 'r':
	  if (!Utf8SplitReplaceDict) Utf8SplitReplaceDict=NewStrDict();
	  StrDictAddNewDatum(Utf8SplitReplaceDict, optarg, argv[++optind-1]);
	  break;
	case 'R':
	  if (!Utf8SplitReplaceDict) Utf8SplitReplaceDict=StrDictFromFile(optarg);
	  else StrDictAddFromFile(Utf8SplitReplaceDict, optarg);
	  break;
	}
    }

  if (debugLevel>1)
    {
      if(TeiKeyDict)
	{
	  char *k1;
	  printf ("Key Dict :\n");
	  StrDictIteratorReset(TeiKeyDict);
	  while ((k1=StrDictNext(TeiKeyDict)))
	    {
	      printf("%s : %s\n", k1, StrDictValue(TeiKeyDict));
	    }
	}
    }
  while(TeiInputNextIstexRecord())
    {
      char *outputKey;
      if (TeiKeyDict)
	{
	  char *k1;
	  if ((k1=StrDictSearch(TeiKeyDict, SxmlInputRecordKey)))
	    outputKey=k1;
	  else outputKey=SxmlInputRecordKey;
	}
      else outputKey=SxmlInputRecordKey;
      wordTable = NewStrDict();
      if (Tei_titleStmt)
	{
	   if (debugLevel>0) 
	     {
	       SxmlPrint(Tei_titleStmt);
	       printf("\n");
	     }
	  splitTextRootNode(SxmlFirstChild(Tei_titleStmt),4.0, 10);
	}
      if (Tei_profileDesc)
	{
	  SxmlNode *absNode;
	  absNode=SxmlGetFirstChildByTagName(Tei_profileDesc, "abstract");
	  if (absNode)
	    {
	      if (debugLevel>0) 
		{
		  SxmlPrint(absNode);
		  printf("\n");
		}
	      splitTextRootNode(SxmlFirstChild(absNode),2.0, 100);
	    }
	}
      if (Tei_body)
	{
	  if (debugLevel>1) 
	    {
	      SxmlPrint(Tei_body);
	      printf("\n");
	    }
	  splitTextRootNode(SxmlFirstChild(Tei_body),1.0, 1000);
	}
      StrDictIteratorReset(wordTable);
      while (StrDictNext(wordTable))
	{
	  double *pw1;
	  pw1=(double *) StrDictValue(wordTable);
	  printf("%s\t%08.3f\t%s\n", StrDictKey(wordTable), *pw1, outputKey);
	}
      StrDictFreeStr(wordTable);
    }
  exit (EXIT_SUCCESS);
}
