/*******************************************************************************
*
* Copyright (c) 2000, Connex, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this program; if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* See 'License' file for License information.
*
* Filename:    xml2text.c
*
* Description: xml2text.c compiles to a program which takes an XML file as
*              input and produces a more easily read version of the file.
*              The output is NOT the same as the original text, in that
*              the program adds newlines and tabs for layout, so whitespace
*              that looks meaningful can be generated. The program does,
*              however, convert pre-existing tabs and newlines to &tab; and
*              &nl; respectively. This allows for the reversal of the
*              conversion with the '-r' option.
*
* Notations:   This is by no means complete or fully tested. I use it
*              for XML files which do not contain a DTD. I have added
*              code that will attempt to ignore DTDs; however, it may not
*              work very well, if at all. If you add to the functionality,
*              please send me changes so I can incorporate them if
*              appropriate.
*
*              And, another thing, I know that the code is not optimal,
*              I know that stdio isn't exactly the fastest way of doing
*              things, and I know that some will want to see these
*              changed. I might do that, but please, try to follow the
*              coding standards that this program is written to. I hope
*              to release these standards soon for the public domain.
*
*
* Author:      Adam Potolsky (speaker@potolsky.com)
*
* Revision History:
*   12/03/00   Adam -- Creation -- Version 0.1
*     This is the first released version. Hope it is useful.
*
*******************************************************************************/
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <sys/types.h>
#include <limits.h>

/*******************************************************************************
*
* void skipTag( FILE *fin )
*
* Description:  This routine copies characters from the FILE to stdout up to
*               and including the next '>'. If the end of the file (or a
*               read error) is reached before a '>' is seen, the routine
*               stops there and no '>' is written.
*
* Inputs:       fin   FILE pointer for the file to work on
*
* Requirements: fin   A valid FILE pointer
*
* Outputs:      NONE
*
* Effects:      FILE pointer moved, and output generated.
*
*******************************************************************************/
void skipTag(FILE *fin)
{
   int iChar = EOF;   /* int, not char, so EOF stays distinguishable */

   while( (iChar = fgetc(fin)) != EOF )
   {
      fputc(iChar,stdout);
      if(iChar == '>')
      {
         break;       /* the closing '>' has been written, tag is done */
      }
   }
}
/*******************************************************************************
*
* int convertAmpTag( FILE *fin )
*
* Description:  Called with the FILE pointer positioned just after an '&'.
*               If the following characters are "nl;" a newline is written
*               to stdout, if they are "tab;" a tab is written, and in both
*               cases the FILE pointer is left on the character immediately
*               following the ';'. Otherwise nothing is written and the
*               FILE pointer is put back to where it was on entry, i.e.
*               directly after the '&'.
*
* Inputs:       fin   FILE pointer for the file to work on
*
* Requirements: fin   A valid FILE pointer
*
* Outputs:      int   0 if no conversion made
*                     1 if tag is converted
*                    -1 if no conversion was made and the FILE pointer
*                       could not be restored
*
* Effects:      FILE pointer moved, and output generated.
*
*******************************************************************************/
int convertAmpTag(FILE *fin)
{
   char    sBuf[4]  = "";
   long    lStart   = 0;
   size_t  uRead    = 0;
   int     iReturn  = 0;

/* Remember the entry position so a failed match can be undone exactly,
   no matter how many characters were actually available to read. */

   lStart = ftell(fin);

   uRead = fread(sBuf,1,3,fin);
   if( (uRead == 3) && (memcmp(sBuf,"nl;",3) == 0) )
   {
      fprintf( stdout , "\n" );
      iReturn = 1;
   }
   else
   {

/* "tab;" needs a fourth character; only ask for it if the first three
   were really there. */

      if(uRead == 3)
      {
         uRead += fread(&sBuf[3],1,1,fin);
      }

      if( (uRead == 4) && (memcmp(sBuf,"tab;",4) == 0) )
      {
         fprintf( stdout , "\t" );
         iReturn = 1;
      }
      else if( (lStart < 0) || (fseek(fin,lStart,SEEK_SET) != 0) )
      {
         iReturn = -1;               /* could not put the characters back */
      }
      else
      {
         iReturn = 0;
      }
   }
   return( iReturn );
}
/*******************************************************************************
*
* void  addtab( char *sString )
*
* Description:  Appends a single tab character to the end of the given
*               string. A NULL pointer is accepted and ignored.
*
* Inputs:       sString   String to append a tab to
*
* Requirements: sString   NULL, or a valid 'c' string whose buffer has room
*                         for one more character plus the terminator. The
*                         routine cannot check this itself.
*
* Outputs:      NONE
*
* Effects:      String is modified
*
*******************************************************************************/
void addtab( char *sString )
{
   size_t uEnd = 0;

   if(sString == NULL)
   {
      return;
   }

   uEnd = strlen(sString);
   sString[uEnd]     = '\t';
   sString[uEnd + 1] = '\0';
}

/*******************************************************************************
*
* void  deltab( char *sString )
*
* Description:  This routine counts the tabs at the start of the given
*               string and terminates the string on the last of them. For
*               a string made up only of tabs (the indent string) this
*               removes exactly one tab. A string that does not start with
*               a tab, including the empty string, is left untouched.
*
* Inputs:       sString   String to remove a tab from
*
* Requirements: sString   NULL, or a valid 'c' string.
*
* Outputs:      NONE
*
* Effects:      String is modified
*
*******************************************************************************/
void deltab( char *sString )
{
   size_t uTabs = 0;

   if(sString == NULL) return;

   while(sString[uTabs] == '\t' )
   {
      uTabs++;
   }

/* With no leading tab there is nothing to remove; writing at index -1
   would land outside the string. */

   if(uTabs > 0)
   {
      sString[uTabs-1] = '\0';
   }
}

/*******************************************************************************
*
* void  useage( char *sName )
*
* Description:  Prints the three line command line summary for the
*               application.
*
* Inputs:       sName   Program name to show in the first line.
*
* Requirements: sName   A valid 'c' string.
*
* Outputs:      NONE
*
* Effects:      Output is written to stderr.
*
*******************************************************************************/
void useage( char *sName )
{
   /* Only the first line carries the program name. */
   fprintf(stderr,"Useage:%s [-r] <input file>\n",sName);

   /* The remaining lines are fixed text. */
   fputs("       Where <input file> is the file to read.\n",stderr);
   fputs("       '-r' reverses the conversion.\n",stderr);
}

/*******************************************************************************
*
* int  main( int argc, char *argv[] )
*
* Description:  This is the root process routine. It reads the input file
*               one character at a time and writes the converted text to
*               stdout. Without '-r' the XML is laid out one tag per line
*               with tab indention, and newlines and tabs already present
*               in the input are written as &nl; and &tab;. With '-r' the
*               layout newlines and tabs are removed again and &nl; and
*               &tab; are turned back into real newlines and tabs.
*
* Inputs:       argc   Number of arguements to command
*               argv   Array of strings which are the arguements
*
* Requirements: argc   must be 2, or 3 when '-r' is given
*               argv   argv[0] = name of executable from call to routine
*                      argv[1] = input filename, or "-r"
*                      argv[2] = input filename when argv[1] is "-r"
*
* Outputs:      exit value   0 on success
*                            1 on a useage error
*                            the errno value from fopen if the input file
*                            could not be opened
*
* Effects:      Standard output is generated.
*
*******************************************************************************/
int main( int argc, char *argv[])
{
   FILE       *fin                      = NULL;  /* FILE pointer for input */
   int         iUseNL                   = 1;     /* Newline state keeper   */
   int         iReturnCode              = 1;     /* generic return code    */
   int         iRead                    = 0;     /* fscanf item count      */
   int         iSavedErrno              = 0;     /* errno kept from fopen  */
   int         iMultiClose              = 1;     /* cnt of multiple closes */
   int         iFirstFound              = 1;     /* First Tag in file?     */
   int         iEmptyFound              = 0;     /* Empty Tag found        */
   int         iReverseRun              = 0;     /* which conversion?      */
   char        cFirst                   = '\0';  /* char read              */
   char        cSecond                  = '\0';  /* next char read         */
   char       *sFileName                = NULL;  /* Input file name        */
   char       *sErr                     = NULL;  /* ptr to strerror output */
   char        sTab[128]                = "";    /* array of indent tabs   */

/* Check the useage. If a -r is given, then reverse the direction of
   conversion */

   switch(argc)
   {
      case(2):
         iReverseRun = 0;
         sFileName = argv[1];
         break;
      case(3):
         if( strcmp(argv[1],"-r") == 0)
         {
            iReverseRun = 1;
            sFileName = argv[2];
         }
         else
         {
            useage(argv[0]);
            exit(1);
         }
         break;
      default:
         useage(argv[0]);
         exit(1);
   }

/* open the file, if there is an error, exit. errno is copied first because
   the calls that build and print the message are allowed to change it. */

   fin = fopen(sFileName,"r");
   if(fin == NULL)
   {
      iSavedErrno = errno;
      sErr = strerror(iSavedErrno);
      fprintf(stderr,"Could not open file %s due to error %s\n",sFileName,sErr);
      exit(iSavedErrno);
   }

/* Conversion from human to real */

   if(iReverseRun == 1)
   {

/* Strip all leading newlines, tabs and spaces. The read result is checked
   so an empty or all-whitespace file ends the run instead of spinning. */

      iRead = fscanf(fin,"%c",&cFirst);
      while( (iRead == 1) &&
             (cFirst == '\n' || cFirst == '\t' || cFirst == ' ') )
      {
         iRead = fscanf(fin,"%c",&cFirst);
      }
      if(iRead != 1)
      {
         fclose(fin);                /* nothing but whitespace: no output */
         exit(0);
      }
      fprintf(stdout,"%c",cFirst);

/* while we're not at the end of the file, strip all newlines and tabs
   which are not part of an XML Tag. Exception below. */

      while( fscanf(fin,"%c",&cFirst) == 1 )
      {

/* If an '&' is found, then there might be an &nl; or &tab; in the file.
   These are to be converted back to newlines and tabs in the XML file. They
   were part of the content */

         if(cFirst == '&')
         {
            iReturnCode = convertAmpTag(fin);
            if( iReturnCode == 0 )
            {
               fprintf(stdout,"%c",cFirst);
            }
            else if( iReturnCode == 1 )
            {
               iUseNL = 0;
            }
         }
         else if(cFirst != '\n' && cFirst != '\t')
         {
            fprintf(stdout,"%c",cFirst);
         }

/* If we had found a special tag, (see below,) put in the newline, and reset
   the special tag condition */

         if( (cFirst == '\n') && (iUseNL == 1) )
         {
            fprintf(stdout,"\n");
            iUseNL = 0;
         }

/* This is an exception, if there is a tag which begins with a '<?' or '<!',
   it's special. Return to a new line once the whole TAG is finished. */

         if(cFirst == '<')
         {
            if( fscanf(fin,"%c",&cFirst) != 1 )
            {
               break;                /* '<' was the last char, already out */
            }
            if( (cFirst == '?') || (cFirst == '!') )
            {
               fprintf(stdout,"%c",cFirst);
               skipTag(fin);
               iUseNL = 1;
            }
            else
            {

/* The '<' has already been printed, now we're outputing the next char we
   just read into cFirst. */

               fprintf(stdout,"%c",cFirst);
            }
         }
      }

      fclose(fin);
      exit(0);
   } /* This is the end of the conversion from human to real */

/* Convert an XML file to a human readable file */

/* There are a few variables that get set as a state. These are iUseNL,
   iMultiClose, and iFirstFound. iUseNL is set whenever a NewLine should
   be output, iMultiClose means that there is more than one closing tag in
   a row, and iFirstFound is used as a special case for the first tag in
   the file. */

   while( fscanf(fin,"%c",&cFirst) == 1 )
   {

/* We have found the start of a tag. Depending on the next character read,
   there are various actions to perform. */

      if(cFirst == '<')
      {
         if( fscanf(fin,"%c",&cSecond) != 1 )
         {
            fprintf(stdout,"%c",cFirst);   /* lone '<' at end of file */
            break;
         }

         if(cSecond == '/')         /* Close of a tag.               */
         {
            iUseNL = 0;             /* Not ready for a newline       */
            iMultiClose++;          /* Start counting closing tags   */
         }
         else if(cSecond == '?')    /* Special tag type              */
         {
            iUseNL = 1;             /* Special tag, newline after it */
            iMultiClose = 0;        /* reset the multiclose count    */
         }
         else if(iFirstFound == 1)  /* 1st tag in file, special case */
         {
            iUseNL = 1;             /* use newline after <?xml> tag  */
            iMultiClose = 0;        /* set the multiclose to zero    */
            iFirstFound = 0;        /* set the first found to zero   */
         }
         else
         {
            iUseNL = 1;             /* use a newline after this tag  */

/* add a tab to the indent str, but only while it still fits; deeper
   nesting keeps the widest indent the array can hold */

            if( strlen(sTab) < (sizeof(sTab) - 1) )
            {
               addtab(sTab);
            }
            iMultiClose = 0;        /* not a multiple close case     */
         }

/* This is where output is generated when a '<' was read first. For empty
   tags, newline cases and multiple closures, use a newline. */

         if( (iEmptyFound == 1) || (iUseNL == 1) || (iMultiClose >= 2) )
         {
            fprintf(stdout,"\n%s%c%c",sTab,cFirst,cSecond);
            iUseNL = 0;
         }
/* ...otherwise just output the characters as we have read them */
         else
         {
            fprintf(stdout,"%c%c",cFirst,cSecond);
         }
         iEmptyFound = 0;

/* Here is part of the trick. if we're in closure case (multiclose > 0),
   remove a tab from the indention string. As we climb our way out, we keep
   deleting tabs. Nothing is removed once the string is already empty. */

         if( (iMultiClose >= 1) && (sTab[0] != '\0') )
         {
            deltab(sTab);
         }
      }

/* There is the potential for an empty tag when a '/' is found without
   a leading < */

      else if(cFirst == '/')
      {
         if( fscanf(fin,"%c",&cSecond) != 1 )
         {
            fprintf(stdout,"%c",cFirst);   /* lone '/' at end of file */
            break;
         }

/* if the next character is a '>', then we have a self closing tag:
   <thing stuff="foo"/>   In this case, it's an empty tag, and we want to
   newline for the next tag. It also means that we don't need to worry
   about a multiple closure case. */

         if(cSecond == '>')
         {
            iEmptyFound = 1;
            iUseNL = 1;
            iMultiClose = 0;
         }
/* ...otherwise, nothing special, not an empty tag, don't automatically use
   a newline, and add to the number of closing tags */
         else
         {
            iEmptyFound = 0;
            iUseNL = 0;
            iMultiClose++;
         }
         if(iEmptyFound == 1)         /* If it's an empty tag, no '\n' yet */
         {
            fprintf(stdout,"%c%c",cFirst,cSecond);
         }
         else                         /* otherwise, use a newline          */
         {
            fprintf(stdout,"%c%c\n",cFirst,cSecond);
         }
/* remove an indention, if there is one left to remove */
         if(sTab[0] != '\0')
         {
            deltab(sTab);
         }
      }

/* Newlines and tabs from the input are written as &nl; and &tab; so the
   '-r' run can put them back. */

      else if(cFirst == '\n')
      {
         fprintf(stdout,"&nl;");
      }
      else if(cFirst == '\t')
      {
         fprintf(stdout,"&tab;");
      }
      else
      {
         fprintf(stdout,"%c",cFirst);
      }
   }
   fclose(fin);
   return(0);
}
