/*
 * Splits an HTML file into several files and updates
 * hypertext links accordingly.
 *
 * Restrictions:
 *
 *   1. The ``<A NAME=...>'' and ``<A HREF=...>''
 *      anchors MUST be found verbatim, i.e. without
 *      excess whitespace and *not* split between 2
 *      adjacent lines.  If the HTML file was produced
 *      by Makeinfo, you should use the @w{} directive
 *      judiciously to prevent line-filling mechanism
 *      from splitting the anchors between lines.
 *   2. Currently only supports splitting the file one
 *      node per file; you cannot split the file by
 *      chapters.  The string which signals the beginning
 *      of a new node is hard-wired into the program and
 *      cannot be changed without recompiling.
 *      
 *
 * Author: Eli Zaretskii <eliz@is.elta.co.il>
 *
 * Version: 1.1
 *
 * Last updated: 22 June, 1996
 *
 * ----------------------------------------------------------
 *
 * You can do whatever you like with this program, except:
 * (1) preventing other people (including the author) do
 * whatever they like, and (2) removing the author and
 * version info above.
 *
 * ----------------------------------------------------------
 *
 */

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>

#ifdef  __DJGPP__

#include <io.h>

/* Keep our start-up code minimal: disable filename
   globbing, and don't load the environment file.  */
#include <crt0.h>

/* Globbing hook: ignores ARG and always hands back a null pointer,
   i.e. it never produces a list of expanded file names.  */
char **
__crt0_glob_function(char *arg)
{
  char **no_expansion = (char **)0;

  (void)arg;
  return no_expansion;
}
/* Environment-file hook: deliberately does nothing; APP_NAME is
   neither read nor modified.  */
void
__crt0_load_environment_file(char *app_name)
{
  (void)app_name;
  return;
}

#else    /* not __DJGPP__ */

/* Some Unix boxes don't have function prototypes in the header files.
   -Wall will complain about this, so here are the prototypes:  */

void perror (const char *);
int  fprintf(FILE *, const char *, ...);

/* Non-DJGPP libraries might not have these two functions.  */

#include <ctype.h>

/* Compare at most N characters of S1 and S2, ignoring letter case.
   The comparison also stops at the first NUL found in S1.

   Returns 0 when the compared parts are equal (or N is 0); otherwise
   the difference between the lower-cased characters at the first
   mismatch, each taken as an unsigned char, so the sign tells which
   string sorts first.  */
int
strnicmp(const char *s1, const char *s2, size_t n)
{
  for (; n != 0; s1++, s2++, n--)
    {
      /* <ctype.h> functions are only defined for values representable
         as unsigned char (or EOF); a plain char holding a byte >= 0x80
         may be negative, so convert before calling tolower().  */
      int c1 = tolower((unsigned char)*s1);
      int c2 = tolower((unsigned char)*s2);

      if (c1 != c2)
        return c1 - c2;
      if (c1 == 0)      /* both strings ended here */
        break;
    }

  return 0;
}

#include <sys/types.h>
#include <sys/stat.h>

/* Size, in bytes, of the file open on descriptor FD as reported by
   fstat(); -1 if fstat() fails.  */
long
filelength(int fd)
{
  struct stat info;
  long size = -1;

  if (fstat(fd, &info) == 0)
    size = info.st_size;

  return size;
}

#endif  /* not __DJGPP__ */

#ifndef O_BINARY
#define O_BINARY    0
#endif

/* Text which, when found right after a newline, marks the place where
   the file is split; its length excludes the terminating NUL.  */
static const char split_marker[] = "<P> | <A HREF=\"#";
static size_t split_marker_len = sizeof split_marker - 1;

/* Text that opens a destination anchor; the anchor name follows it.  */
static const char dest_marker[] = "<A NAME=\"";
static size_t dest_marker_len = sizeof dest_marker - 1;

/* Text that opens a link to an anchor; the anchor name follows the `#'.  */
static const char link_marker[] = "<A HREF=\"#";
static size_t link_marker_len = sizeof link_marker - 1;

/* Nonzero if the LEN characters starting at POINT are STRING, compared
   without regard to letter case (via strnicmp); zero otherwise.  */
int
looking_at(const char string[], size_t len, char *point)
{
  if (strnicmp(string, point, len) != 0)
    return 0;

  return 1;
}

/* Table of the file positions where the file will be split.  The
   elements are size_t, the same type as the positions stored in them,
   so that offsets are not truncated and the allocation size matches
   the element size.  */
static size_t *split_pos_table;        /* table of split positions */
static size_t  split_pos_table_size;   /* the size of the table, in slots */
static size_t  split_pos_idx;          /* index of next free slot */

/* Record POS as a position where we'll split the file.

   Returns the length of the split marker, which a caller standing on
   the marker can add to its pointer to move past it.

   The table starts with 100 slots and doubles whenever it is full.  If
   it cannot be (re)allocated, prints a message and exits with status 2.  */
size_t
remember_split_pos(size_t pos)
{
  if (split_pos_idx >= split_pos_table_size)
    {
      size_t new_size = split_pos_table_size ? split_pos_table_size * 2 : 100;
      size_t *new_table = (size_t *)0;

      /* Refuse sizes whose byte count would overflow size_t.  */
      if (new_size <= (size_t)-1 / sizeof *new_table)
        /* realloc() of a null pointer behaves like malloc().  */
        new_table = (size_t *)realloc(split_pos_table,
                                      new_size * sizeof *new_table);

      if (new_table == (size_t *)0)
        {
          errno = ENOMEM;
          perror("split_pos table");
          exit(2);
        }

      split_pos_table = new_table;
      split_pos_table_size = new_size;
    }

  split_pos_table[split_pos_idx++] = pos;

  return split_marker_len;
}

/* Look up, in the table of split positions, the file position where
   subfile FILENO ends.  FILENO is used as an index without any range
   check, so it must refer to a slot that was already recorded.  */
size_t
get_split_pos(int fileno)
{
  size_t end_of_subfile = split_pos_table[fileno];

  return end_of_subfile;
}

/* An anchor name and the number of the subfile it was found in.  */
struct _dest_pos {
  char *name;       /* malloc'ed, NUL-terminated copy of the anchor name */
  int   fileno;     /* number of the subfile which holds the anchor */
};
static struct _dest_pos *dest_pos_table;        /* table of anchors */
static int               dest_pos_table_size;   /* the size of the table */
static int               dest_pos_idx;          /* index of next free slot */

/* Record the anchor whose destination marker starts at P as belonging
   to subfile FILENO.

   The anchor name is the text between the marker and the next `"'.  If
   the buffer's terminating NUL comes first, the name ends there instead
   of the scan running off the end of the buffer.

   Returns the distance from P to the character that ended the name, so
   the caller can add it to its pointer to move past the anchor name.

   The table starts with 100 slots and doubles whenever it is full.  If
   memory cannot be allocated, prints a message and exits with status 2.  */
int
remember_dest_pos(char *p, int fileno)
{
  char *save_point = p;
  char *name_start;
  char *name;
  size_t name_len;

  if (dest_pos_idx >= dest_pos_table_size)
    {
      int new_size = dest_pos_table_size ? dest_pos_table_size * 2 : 100;
      /* realloc() of a null pointer behaves like malloc().  Keep the
         old pointer until we know the call succeeded.  */
      struct _dest_pos *new_table = (struct _dest_pos *)
        realloc(dest_pos_table, (size_t)new_size * sizeof *new_table);

      if (new_table == (struct _dest_pos *)0)
        {
          errno = ENOMEM;
          perror("dest_pos table");
          exit(2);
        }

      dest_pos_table = new_table;
      dest_pos_table_size = new_size;
    }

  p += dest_marker_len;
  name_start = p;
  while (*p != '"' && *p != '\0')
    p++;
  name_len = (size_t)(p - name_start);

  name = (char *)malloc(name_len + 1);
  if (name == (char *)0)
    {
      errno = ENOMEM;
      perror("name in dest_pos table");
      exit(2);
    }
  memcpy(name, name_start, name_len);
  name[name_len] = '\0';

  dest_pos_table[dest_pos_idx].name = name;
  dest_pos_table[dest_pos_idx].fileno = fileno;
  dest_pos_idx++;

  return (int)(p - save_point);
}

/* POINT stands on a link marker (``<A HREF="#''); step over the whole
   marker, `#' included, and return the address of the first character
   of the anchor name that follows it.  */
char *
skip_until_anchor_name(char *point)
{
  char *anchor_name = &point[link_marker_len];

  return anchor_name;
}

/* Which subfile is this anchor in?

   POINT addresses the first character of an anchor name, which extends
   up to the next `"'.  If the buffer's terminating NUL comes first, the
   name ends there instead of the scan running off the end of the buffer.

   Returns the subfile number recorded for the first table entry with
   exactly that name.  If there is no such entry, prints a message and
   exits with status 2.  */
int
subfile_num_for_anchor_at_point(char *point)
{
  char *name_start = point;
  size_t name_len;
  int idx;

  while (*point != '"' && *point != '\0')
    point++;
  name_len = (size_t)(point - name_start);

  for (idx = 0; idx < dest_pos_idx; idx++)
    {
      const char *anchor = dest_pos_table[idx].name;

      /* Cheap filter: only look closer when the first characters agree.  */
      if (anchor[0] != *name_start)
        continue;

      /* Be careful not to catch possible substrings: the lengths must
         be equal, not just the leading characters.  */
      if (strlen(anchor) == name_len
          && strncmp(anchor, name_start, name_len) == 0)
        return dest_pos_table[idx].fileno;
    }

  fprintf(stderr, "%.*s: not found in table of anchors\n",
          (int)name_len, name_start);
  exit(2);
}

/* Write the LEN bytes at BUF to descriptor FD, retrying after partial
   writes.  A zero-length request succeeds without touching FD.
   Returns 0 on success, -1 if write() fails or makes no progress.  */
static int
write_fully(int fd, const char *buf, size_t len)
{
  while (len > 0)
    {
      long written = write(fd, buf, len);

      if (written <= 0)
        return -1;
      buf += written;
      len -= (size_t)written;
    }

  return 0;
}

/* Store in BUF the name of subfile NUM of BASE: ``BASE.html'' for
   subfile 0, otherwise BASE followed by NUM, zero-padded to DIGITS
   digits, and ``.html''.  The caller guarantees that BUF is large
   enough.  */
static void
format_subfile_name(char *buf, const char *base, int digits, int num)
{
  if (num == 0)
    sprintf(buf, "%s.html", base);
  else
    sprintf(buf, "%s%.*d.html", base, digits, num);
}

/* Create (or truncate) the output file NAME for writing; returns the
   descriptor, or -1 on failure.  */
static int
open_subfile(const char *name)
{
  return open(name, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0666);
}

/* Usage: PROGRAM inputfile outbase

   Reads INPUTFILE into memory, splits it at every split marker that
   starts a line, and writes the pieces to ``OUTBASE.html'',
   ``OUTBASE1.html'' and so on, rewriting each ``<A HREF="#name">''
   link so that it names the subfile which holds the anchor.

   Exit status: 0 on success, 1 on bad usage, 2 on I/O or memory
   errors, 3 if the input is smaller than 2048 bytes.  */
int
main(int argc, char *argv[])
{
  int in_fd, out_fd;
  long fsize, actual_size;
  char *in_file;
  char *p, *last_p, *from;
  int subfile = 0;
  int count;
  char subfile_name[FILENAME_MAX];
  int max_digits = 1;
  size_t split_pos;

  if (argc != 3)
    {
      fprintf(stderr, "Usage: %s inputfile outbase\n", *argv);
      return 1;
    }

  /* First, read the file. */

  in_fd = open(argv[1], O_RDONLY | O_BINARY);
  if (in_fd < 0)
    {
      perror(argv[1]);
      return 2;
    }

  fsize = filelength(in_fd);
  if (fsize < 0)
    {
      perror(argv[1]);
      return 2;
    }

  in_file = (char *)malloc((size_t)fsize + 1);  /* leave place for `\0' */
  if (in_file == (char *)0)
    {
      errno = ENOMEM;
      perror(argv[1]);
      return 2;
    }

  if ((actual_size = read(in_fd, in_file, fsize)) != fsize)
    {
      if (actual_size <= 0)
        {
          perror(argv[1]);
          return 2;
        }
      fprintf(stderr, "%s: size is %ld, but only %ld bytes read\n",
                      argv[1], fsize, actual_size);
      fsize = actual_size;
    }

  close (in_fd);

  /* Drop trailing ^Z characters.  Testing FSIZE first keeps us from
     looking in front of the buffer when the file is empty.  */
  while (fsize > 0 && in_file[fsize - 1] == 0x1a)
    fsize--;

  if (fsize < 2048)
    {
      fprintf(stderr, "%s: too small to bother\n", argv[1]);
      return 3;
    }

  /* LAST_P is one past the last character of the text, so that both
     passes below see, and pass 2 writes, the final character too.  */
  last_p = in_file + fsize;
  *last_p = '\0';

  /* Pass 1: Determine the file positions where the file
             will be split, and remember positions of the
             <A NAME="#dest"> destination anchors.  */

  for (p = in_file; p < last_p; )
    {
      /* Note that P is bumped past the newline even when no split
         marker follows it.  */
      if (*p == '\n' && looking_at(split_marker, split_marker_len, ++p))
        {
          p += remember_split_pos(p - in_file);
          subfile++;
        }
      else if (looking_at(dest_marker, dest_marker_len, p))
        {
          p += remember_dest_pos(p, subfile);
        }
      else
        ++p;
    }

  /* Last subfile ends at EOF.  */
  remember_split_pos(p - in_file);
  subfile++;

  /* How many digits does the number of subfiles have?  */
  for (count = subfile; (count /= 10) != 0; )
    max_digits++;

  /* Make sure every name we are going to build fits the buffer:
     the base, MAX_DIGITS digits, ".html" and the terminating NUL.  */
  if (strlen(argv[2]) + max_digits + sizeof ".html" > sizeof subfile_name)
    {
      fprintf(stderr, "%s: output file name is too long\n", argv[2]);
      return 2;
    }

  /* Pass 2: Generate the subfiles with updated links.  */

  subfile = 0;
  format_subfile_name(subfile_name, argv[2], max_digits, subfile);
  if ((out_fd = open_subfile(subfile_name)) == -1)
    {
      perror(subfile_name);
      return 2;
    }
  split_pos = get_split_pos(subfile);

  for (p = in_file, from = p; p < last_p; ++p)
    {
      if ((size_t)(p - in_file) >= split_pos)   /* time to start another file */
        {
          if (write_fully(out_fd, from,
                          split_pos - (size_t)(from - in_file)) != 0)
            {
              perror("write at split position");
              return 2;
            }
          close(out_fd);
          from = in_file + split_pos;
          split_pos = get_split_pos(++subfile);
          format_subfile_name(subfile_name, argv[2], max_digits, subfile);
          if ((out_fd = open_subfile(subfile_name)) == -1)
            {
              perror(subfile_name);
              return 2;
            }
        }
      else if (looking_at(link_marker, link_marker_len, p))
        {
          int which_file;

          p = skip_until_anchor_name(p);
          which_file = subfile_num_for_anchor_at_point(p);

          --p;  /* the `#' character goes AFTER the file */

          /* Emit everything up to the `#', then the name of the
             subfile which holds the anchor; the `#' and the anchor
             name themselves are written with the following text.  */
          format_subfile_name(subfile_name, argv[2], max_digits, which_file);
          if (write_fully(out_fd, from, (size_t)(p - from)) != 0 ||
              write_fully(out_fd, subfile_name, strlen(subfile_name)) != 0)
            {
              perror("write at anchor name");
              return 2;
            }
          from = p;
        }
    }

  if (write_fully(out_fd, from, (size_t)(p - from)) != 0)
    {
      perror("write at EOF");
      return 2;
    }

  /* A failed close may mean that the data did not reach the file.  */
  if (close(out_fd) != 0)
    {
      perror("close at EOF");
      return 2;
    }
  free(in_file);

  fprintf(stderr, "%s was split into %d file%s\n",
          argv[1], subfile + 1, subfile ? "s" : "");

  return 0;
}
