/* tspell: spell checker adapted for TeX.
Version 2024-11-05.
Copyright 2016 Dmitri Pavlov.
Distributed under the terms of the GNU General Public License, version 3.
Synopsis: tspell [ -h ] [ -d ] input-file output-file [ personal-dictionary-file ]
Compile with -laspell and -lreadline.
The personal dictionary file is simply a list of words, one on each line.
The -h option allows one to see how tspell splits the text into individual words;
it can be used to track down problems that arise when a new aspect of TeX syntax is implemented.
The -d option prints a trace of the scanner's decisions to standard error. */

#define _GNU_SOURCE /* for memrchr */
#include <ctype.h>
#include <locale.h>
#include <wctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <aspell.h>
#include <readline/readline.h>
#include <readline/history.h>

/* Report a diagnostic on stderr.
   If errno is set, its description is printed first and errno is cleared.
   Then the source line and function name (sline, func) are printed,
   followed by the printf-style message and a newline. */
void deprintf(int sline, const char *func, const char *format, ...) {
  va_list args;

  if (errno != 0) {
    fprintf(stderr, "system error: %s\n", strerror(errno));
  }
  errno = 0;

  fprintf(stderr, "line %d, function %s\n", sline, func);

  va_start(args, format);
  vfprintf(stderr, format, args);
  va_end(args);
  fprintf(stderr, "\n");
}

/* Print the description of a pending errno (if any) on stderr and clear errno. */
void err(void) {
  const int code = errno;

  if (code != 0) {
    fprintf(stderr, "system error: %s\n", strerror(code));
  }
  errno = 0;
}

/* Non-zero enables the trace output of debugprintf; main sets it for the -d option. */
int debug = 0;

/* printf-style trace message on stderr.
   Does nothing unless debug is non-zero; otherwise any pending errno is
   reported and cleared (via err) before the message is written. */
void debugprintf(const char *format, ...) {
  if (debug != 0) {
    va_list args;

    err();
    va_start(args, format);
    vfprintf(stderr, format, args);
    va_end(args);
  }
}

/* Error-reporting shorthands built on deprintf.
   perr: report a message together with the current line and function.
   serr: report like perr, then terminate the program with exit status 1.
         The expansion is parenthesized so that the comma expression stays
         a single operand wherever the macro is used.
   aerr: report the last aspell error; expects a variable named `speller`
         (an AspellSpeller *) to be in scope at the point of use. */
#define perr(...) deprintf(__LINE__, __func__, __VA_ARGS__)
#define serr(...) (perr(__VA_ARGS__), exit(1))
#define aerr() perr("aspell error: %s", aspell_speller_error_message(speller))

/* Text that set_prompt places into the readline edit buffer;
   main fills it with the word under review before calling readline. */
char *prompt;

/* readline startup hook (main assigns it to rl_startup_hook):
   pre-loads the edit line with the current prompt text. Always returns 0. */
static int set_prompt(void) {
  (void)rl_insert_text(prompt);
  return 0;
}

/* Tables used by has_accent; each describes what may follow a backslash:
   paccents -- punctuation characters treated as accent commands (one character);
   laccents -- letters treated as accent commands when followed by a space;
   sletters -- two-letter commands, each listed together with its trailing space. */
#define paccents "`'^\"~=.-"
#define laccents "uvHtcdb" "oOlL"
#define sletters "oe OE ae AE aa AA ss "

/* Prefix comparison: returns 0 exactly when s1 starts with s2,
   otherwise the (non-zero) result of strncmp over strlen(s2) bytes. */
int str2cmp(const char *s1, const char *s2) {
  return strncmp(s1, s2, strlen(s2));
}

/* Returns 1 if the NUL-terminated string s starts with an accent command
   (backslash followed by one of the forms described by paccents, laccents
   and sletters), and 0 otherwise.
   A backslash that is the last character of the string is never an accent:
   strchr() would otherwise match the terminating NUL of the tables, and the
   bytes after the terminator would be inspected. */
int has_accent(char *s) {
  if (s[0] != '\\' || s[1] == 0)
    return 0;
  if (strchr(paccents, s[1]) != NULL)
    return 1;
  if (strchr(laccents, s[1]) != NULL && s[2] == ' ')
    return 1;
  /* s[3] is only read when s[2] is not the terminator */
  if (s[2] != 0 && s[3] == ' ') {
    const char seq[4] = {s[1], s[2], s[3], 0};
    return strstr(sletters, seq) != NULL;
  }
  return 0;
}

/* Tests whether an alphabetic character starts at s, where t points one past
   the last valid byte of the buffer.
   ASCII bytes are classified with isalpha(); two- and three-byte UTF-8
   sequences are decoded and classified with iswalpha() (locale dependent).
   Continuation bytes, truncated or malformed sequences, and lead bytes of
   longer sequences yield 0. An empty range (s >= t) yields 0 without reading
   any byte.
   Returns non-zero for an alphabetic character, 0 otherwise. */
int has_alpha(char *s, char *t) {
  if (s >= t)
    return 0; /* nothing to examine; do not touch the byte at t */
  const unsigned char *u = (const unsigned char *)s;
  int p = u[0];
  if (p < 0x80)
    return isalpha(p);
  if (p < 0xC0)
    return 0; /* intermediate UTF-8 bytes */
  if (p < 0xE0) {
    /* two-byte sequence: 110xxxxx 10xxxxxx */
    if (s + 1 < t && (u[1] & 0xC0) == 0x80)
      return iswalpha(((p & 0x3F) << 6) + (u[1] & 0x3F));
    return 0;
  }
  if (p < 0xF0) {
    /* three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx */
    if (s + 2 < t && (u[1] & 0xC0) == 0x80 && (u[2] & 0xC0) == 0x80)
      return iswalpha(((p & 0x1F) << 12) + ((u[1] & 0x3F) << 6) + (u[2] & 0x3F));
    return 0;
  }
  return 0; /* four-byte sequences are not classified */
}

/* Returns a pointer to the first byte at or after s that cannot belong to a
   word. A word consists of apostrophes, alphabetic characters (see has_alpha)
   and UTF-8 continuation bytes. t points one past the last valid byte; the
   scan never reads at or beyond t, so the buffer need not be NUL-terminated. */
char *scan_word(char *s, char *t) {
  while (s < t && (*s == '\'' || ((unsigned char)(*s) & 0xC0) == 0x80 || has_alpha(s, t)))
    s++;
  return s;
}

/* Scans the body of a reference starting at s and returns a pointer to the
   first byte that cannot belong to it. Besides word characters (see
   scan_word) the body may contain digits, spaces and the characters
   - ( , ) [ | ] as well as formulas delimited by $...$, which are skipped
   as a whole.
   t points one past the last valid byte. The scan stops at t and at a NUL
   byte: strchr() matches the terminator of its search set, so a NUL must be
   excluded explicitly or the scan would run past the end of the text. */
char *scan_ref(char *s, char *t) {
  while (s < t
         && (*s == '\'' || ((unsigned char)(*s) & 0xC0) == 0x80 || has_alpha(s, t)
             || (*s != 0 && strchr("0123456789 -(,)[|]$", *s) != NULL)))
    if (*s == '$') { /* skip formulas inside ^={...} */
      debugprintf("skipping a formula inside a reference\n");
      s++;
      while (s < t && *s != '$') {
        s++;
      }
      if (s < t && *s == '$')
        s++;
      debugprintf("skipping complete\n");
    } else
      s++;
  return s;
}

/* qsort comparator for an array of C strings: p and q point to the array
   elements (char pointers); the strings they refer to are ordered by strcmp. */
int pcmp(const void *p, const void *q) {
  char *const *lhs = p;
  char *const *rhs = q;

  return strcmp(*lhs, *rhs);
}

int main(int argc, char *argv[]) {
  int arg = 1, hilite = 0;
  while (arg < argc)
    if (strcmp(argv[arg], "-h") == 0) {
      hilite = 1;
      arg++;
    } else if (strcmp(argv[arg], "-d") == 0) {
      debug = 1;
      arg++;
    } else
      break;
  if (argc - arg != 2 && argc - arg != 3) {
    fprintf(stderr, "Synopsis: %s [ -h ] input-file output-file [ personal-dictionary-file ]\n", argv[0]);
    return 1;
  }
  setlocale(LC_CTYPE, "");
  AspellConfig *config = new_aspell_config();
  aspell_config_replace(config, "lang", "en_US");
  err();
  AspellCanHaveError *ret = new_aspell_speller(config);
  if (errno == ENOENT) /* e.g., missing ~/.aspell.conf */
    errno = 0;
  err();
  delete_aspell_config(config);
  if (aspell_error(ret) != 0)
    serr("aspell error: %s", aspell_error_message(ret));
  AspellSpeller *speller = to_aspell_speller(ret);

  int fd = open(argv[arg], O_RDONLY);
  if (fd == -1)
    serr("open(%s)", argv[arg]);
  struct stat sb;
  if (fstat(fd, &sb) != 0)
    serr("fstat(%s)", argv[arg]);
  int n = sb.st_size;
  char *inp = mmap(0, n + 1, PROT_READ, MAP_PRIVATE, fd, 0); /* add one zero byte at the end */
  if (inp == MAP_FAILED)
    serr("mmap(%s, %d+1)", argv[arg], n);
  
  FILE *output = fopen(argv[arg + 1], "w");
  if (output == NULL)
    serr("fopen(%s, w)", argv[arg + 1]);
  if (arg + 2 < argc) {
    FILE *words = fopen(argv[arg + 2], "r");
    if (words == NULL) {
      if (errno != ENOENT)
        serr("fopen(%s, r)", argv[arg + 2]);
      else
        errno = 0;
    } else {
      char buf[1024];
      while (fgets(buf, sizeof(buf), words) == buf) {
        buf[strlen(buf) - 1] = 0;
        aspell_speller_add_to_personal(speller, buf, strlen(buf));
        err();
        if (aspell_speller_error(speller) != 0) {
          aerr();
          return 1;
        }
      }
    }
  }
  
  int last = 0;
  int k = 0;
  err();
  while (k < n)
    if (has_alpha(inp + k, inp + n)) {
      debugprintf("%05d: %c (alpha)\n", k, inp[k]);
      int l;
      l = scan_word(inp + k, inp + n) - inp;
      if (l - 2 >= k && inp[l - 2] == '\'' && (inp[l - 1] == 's' || inp[l - 1] == '\''))
        l -= 2; /* ignore possessives and closing double quotes */
      if (l - 1 >= k && inp[l - 1] == '\'')
        l--; /* ignore plural possessives */
      if (has_accent(inp + l)) {
        /* ignore words with accents */
        k = l;
        continue;
      }

      char *word = inp + k;
      int wlen = l - k;
      debugprintf("aspell_speller_check(speller, %.*s)\n", wlen, word);
      /*
      int onlyL = 1, jj;
      for (jj = 0; jj < wlen; jj++)
        if (word[jj] != 'L')
          break;
      if (jj == wlen)
        debugprintf("Just L, length %d\n", wlen);
      */
      int ck = aspell_speller_check(speller, word, wlen);
      if (hilite) {
        fprintf(output, "%.*s\x1B[7m%.*s\x1B[m", k - last, inp + last, wlen, word);
        last = l;
      } else if (ck == 1)
        ;
      else if (ck == 0) {
        char *s = memrchr(inp, '\n', k), *t = memchr(inp + l, '\n', n - l);
        int lstart, lend;
        if (s == NULL)
          lstart = 0;
        else
          lstart = s - inp + 1;
        if (t == NULL)
          lend = n;
        else
          lend = t - inp + 1;
      restart:
        fprintf(stderr, "%.*s\x1B[7m%.*s\x1B[m%.*s", k - lstart, inp + lstart, wlen, word, lend - l, inp + l);
        const AspellWordList *wl = aspell_speller_suggest(speller, word, wlen);
        int cnt = 0;
        if (wl == NULL)
          aerr();
        else {
          AspellStringEnumeration *els = aspell_word_list_elements(wl);
          const char *w;
          for (cnt = 0; (w = aspell_string_enumeration_next(els)) != 0; cnt++)
            fprintf(stderr, "%d:%s ", cnt, w);
          delete_aspell_string_enumeration(els);
        }
        fprintf(stderr, "=:add -:lower *:exit\n");
        prompt = malloc(wlen + 1);
        if (prompt == NULL)
          serr("malloc(%d+1)", wlen);
        memcpy(prompt, word, wlen);
        prompt[wlen] = 0;
      reread:
        rl_startup_hook = set_prompt;
        char *ans = readline("");
        err();
        free(prompt);
        int len = strlen(ans);
        if (len >= wlen && memcmp(ans, word, wlen) == 0) {
          if (len == wlen) {
            debugprintf("aspell_speller_add_to_session(speller, %.*s)\n", wlen, word);
            aspell_speller_add_to_session(speller, word, wlen);
            if (aspell_speller_error(speller) != 0)
              aerr();
            if (word != inp + k)
              goto print_corr;
            k = l;
            continue;
          } else if (len == wlen + 1)
            switch (ans[len - 1]) {
            case '-':
              for (int i = 0; i < wlen; i++)
                ans[i] = tolower(ans[i]);
            case '=':
              len--;
              ans[len] = 0;
              debugprintf("aspell_speller_add_to_personal(speller, %.*s, %d)\n", wlen, ans, wlen);
              aspell_speller_add_to_personal(speller, ans, wlen);
              if (aspell_speller_error(speller) != 0)
                aerr();
              k = l;
              continue;
            case '*':
              goto finish;
            }
          int i;
          for (i = wlen; i < len; i++)
            if (!isdigit(ans[i]))
              break;
          if (i == len) {
            int ind;
            if (sscanf(ans + wlen, "%d", &ind) == 1 && ind >= 0 && ind < cnt) {
              AspellStringEnumeration *els = aspell_word_list_elements(wl);
              const char *w;
              for (int i = 0; i <= ind; i++)
                w = aspell_string_enumeration_next(els);
              strcpy(ans, w);
              len = strlen(ans);
              delete_aspell_string_enumeration(els);
              goto print_corr;
            }
            fprintf(stderr, "invalid index %d, repeat input\n", ind);
            goto reread;
          }
        }
        len = strlen(ans);
        int ck = aspell_speller_check(speller, ans, len);
        if (ck == 1) {
        print_corr:
          fprintf(output, "%.*s", k - last, inp + last); /* output what we read so far */
          fprintf(output, "%.*s", len, ans); /* output the corrected word */
          last = l;
          k = l; /* advance to the next character */
        } else if (ck == 0) {
          word = ans;
          wlen = len;
          goto restart;
        } else
          aerr();
      } else
        aerr();
      k = l; /* discard the word anyway */
      if (k + 2 < n && inp[k] == '\'' && inp[k + 1] == 's' && !isalpha(inp[k + 2]))
        k += 2; /* discard possessives */
    } else if (inp[k] == '\\') {
      debugprintf("%05d: %c (backslash)\n", k, inp[k]);
      int l = k;
      k++;
      if (k == n)
        break;
      if (has_accent(inp + k - 1)) {
        if (isalpha(inp[k])) {
          k++;
          if (isalpha(inp[k]))
            k++;
          if (inp[k] == ' ')
            k++;
        } else
          k++;
        for (; isalpha(inp[k]) /* || inp[k] == '-' */; k++)
          ;
      } else if (has_alpha(inp + k, inp + n)) {
        int pk = k;
        k = scan_word(inp + k, inp + n) - inp;
        debugprintf("%05d: command \\%.*s\n", k, k - pk, inp + pk);
        if (k == n)
          break;
        if (inp[k] == ':') {
          debugprintf("%05d: colon after command \\%.*s\n", k, k - pk, inp + pk);
          for (; isgraph(inp[k]); k++)
            ;
        } else if (str2cmp(inp + l, "\\begin{tikzpicture}") == 0) {
          debugprintf("%05d: tikzpicture environment\n", k);
          for (; k < n; k++)
            if (str2cmp(inp + k, "\\end{tikzpicture}") == 0) 
              break;
        } else if (str2cmp(inp + l, "\\begin{align}") == 0) {
          debugprintf("%05d: align environment\n", k);
          for (; k < n; k++)
            if (str2cmp(inp + k, "\\end{align}") == 0) 
              break;
        } else if (str2cmp(inp + l, "\\begin{align*}") == 0) {
          debugprintf("%05d: align* environment\n", k);
          for (; k < n; k++)
            if (str2cmp(inp + k, "\\end{align*}") == 0) 
              break;
        } else if (str2cmp(inp + l, "\\begin{equation}") == 0) {
          debugprintf("%05d: equation environment\n", k);
          for (; k < n; k++)
            if (str2cmp(inp + k, "\\end{equation}") == 0) 
              break;
        } else if ((str2cmp(inp + l, "\\def") == 0 && !isalpha(inp[l + 4]))
                 || str2cmp(inp + l, "\\hyphenation{") == 0
                 || str2cmp(inp + l, "\\cite{") == 0
                 || str2cmp(inp + l, "\\cite[") == 0
                 || str2cmp(inp + l, "\\cref{") == 0
                 || str2cmp(inp + l, "\\scref{") == 0
                 || str2cmp(inp + l, "\\gcref{") == 0
                 || str2cmp(inp + l, "\\input{") == 0
                 || str2cmp(inp + l, "\\label{") == 0
                 || str2cmp(inp + l, "\\bibitem{") == 0
                 || str2cmp(inp + l, "\\tikzset{") == 0
                 || str2cmp(inp + l, "\\begin{") == 0
                 || str2cmp(inp + l, "\\end{") == 0
                 || str2cmp(inp + l, "\\eqref{") == 0
                 || str2cmp(inp + l, "\\ref{") == 0) {
          debugprintf("%05d: LaTeX command \\%.*s\n", k, k - l, inp + l);
          int bal = 0;
          for (; k < n; k++)
            if (inp[k] == '{')
              bal++;
            else if (inp[k] == '}') {
              bal--;
              if (bal == 0)
                break;
            }
        } else if (str2cmp(inp + l, "\\input ") == 0) {
          debugprintf("%05d: input command\n", k);
          for (; k < n; k++)
            if (inp[k] == '\n')
              break;
        } else if (str2cmp(inp + l, "\\font\\") == 0 || str2cmp(inp + l, "\\chardef\\") == 0 || str2cmp(inp + l, "\\mathchardef\\") == 0) {
          debugprintf("%05d: font/chardef/mathchardef command \\%.*s\n", k, k - pk, inp + pk);
          for (; k < n; k++)
            if (inp[k] == ' ' || inp[k] == '\n')
              break;
        }
        debugprintf("%05d: finished handling at %.*s\n", k, k - pk, inp + pk);
      } else if (inp[k] == '<') {
        debugprintf("%05d: \\<...\\>\n", k);
        for (; k < n; k++)
          if (str2cmp(inp + k, "\\>") == 0)
            break;
      } else if (inp[k] == '[') {
        debugprintf("%05d: \\[...\\]\n", k);
        for (; k < n; k++)
          if (str2cmp(inp + k, "\\]") == 0)
            break;
      } else
        k++;
    } else if (inp[k] == '$') {
      debugprintf("%05d: %c (formula)\n", k, inp[k]);
      k++;
      if (k == n)
        break;
      if (inp[k] == '$') {
        debugprintf("%05d: displayed formula\n", k);
        int pk = k;
        for (; k + 1 < n && (inp[k] != '$' || inp[k + 1] != '$'); k++)
          ;
        k += 2;
        debugprintf("%05d: content of displayed formula: %.*s\n", k, k - (pk - 1), inp + pk - 1);
        if (k >= n)
          break;
      } else {
        int pk = k;
        for (; k < n && inp[k] != '$'; k++)
          ;
        k++;
        debugprintf("%05d: content of formula: %.*s\n", k, k - (pk - 1), inp + pk - 1);
        if (k >= n)
          break;
      }
    } else if (inp[k] == '^') {
      debugprintf("%05d: %c (reference)\n", k, inp[k]);
      /* special handling of dpmac references of the form ^^=:{some word[s|] possibly with plural[s|]} */
      k++;
      if (inp[k] == '^')
        k++;
      if (inp[k] == '=')
        k++;
      if (inp[k] == ':')
        k++;
      if (inp[k] == '{') {
        k++;
        debugprintf("processing the reference: {%c\n", inp[k]);
        int ll;
        ll = scan_ref(inp + k, inp + n) - inp; 
        if (inp[ll] != '}') {
          debugprintf("exceptional reference 1: %.*s\n", ll - k, inp + k);
          while (ll < n && inp[ll] != '}')
            ll++;
          if (inp[ll] == '}')
            ll++;
          debugprintf("exceptional reference 2: %.*s\n", ll - k, inp + k);
        } else if (inp[ll] == '}') {
          /* parse plurals inside {...} */
          debugprintf("*:%.*s\n", ll - k, inp + k);
          char *buf[2];
          int pos[2] = {0, 0};
          buf[0] = malloc(ll - k + 1);
          buf[1] = malloc(ll - k + 1);
          if (buf[0] == NULL || buf[1] == NULL)
            serr("malloc(%d - %d + 1)", ll, k);
          int state = 0;
          for (int r = k; r < ll; r++)
            switch (inp[r]) {
            case '[':
              state = 1;
              break;
            case '|':
              state = -1;
              break;
            case ']':
              state = 0;
              break;
            default:
              if (state >= 0)
                buf[0][pos[0]++] = inp[r];
              if (state <= 0)
                buf[1][pos[1]++] = inp[r];
              break;
            }
          buf[0][pos[0]] = 0;
          buf[1][pos[1]] = 0;
          for (int z = 0; z < 2; z++) {
            int k = 0, last = 0;
            while (k < pos[z])
              if (buf[z][k] == ' ' || buf[z][k] == '-')
                k++;
              else if (buf[z][k] == '$') {
                k++;
                while (k < pos[z] && buf[z][k] != '$')
                  k++;
                if (k < pos[z] && buf[z][k] == '$')
                  k++;
              } else {
                int l;
                l = scan_word(buf[z] + k, buf[z] + pos[z]) - buf[z];
                if (l == k) {
                  k++;
                  continue;
                }
                char *word = buf[z] + k;
                int wlen = l - k;
                int ck = aspell_speller_check(speller, word, wlen);
                if (hilite) {
                  fprintf(output, "%.*s\x1B[7m%.*s\x1B[m", k - last, buf[z] + last, wlen, word);
                  last = l;
                } else if (ck == 1)
                  ;
                else if (ck == 0) {
                restart2:
                  fprintf(stderr, "{[%.*s\x1B[7m%.*s\x1B[m%.*s]}\n", k, buf[z], wlen, word, pos[z] - l, buf[z] + l);
                  const AspellWordList *wl = aspell_speller_suggest(speller, word, wlen);
                  int cnt = 0;
                  if (wl == NULL)
                    aerr();
                  else {
                    AspellStringEnumeration *els = aspell_word_list_elements(wl);
                    const char *w;
                    for (cnt = 0; (w = aspell_string_enumeration_next(els)) != 0; cnt++)
                      fprintf(stderr, "%d:%s ", cnt, w);
                    delete_aspell_string_enumeration(els);
                  }
                  fprintf(stderr, "=:add -:lower *:exit\n");
                  prompt = malloc(wlen + 1);
                  if (prompt == NULL)
                    serr("malloc(%d+1)", wlen);
                  memcpy(prompt, word, wlen);
                  prompt[wlen] = 0;
                reread2:
                  rl_startup_hook = set_prompt;
                  char *ans = readline("");
                  err();
                  free(prompt);
                  int len = strlen(ans);
                  if (len >= wlen && memcmp(ans, word, wlen) == 0) {
                    if (len == wlen) {
                      aspell_speller_add_to_session(speller, word, wlen);
                      if (aspell_speller_error(speller) != 0)
                        aerr();
                      if (word != buf[z] + k)
                        goto print_corr2;
                      k = l;
                      continue;
                    } else if (len == wlen + 1)
                      switch (ans[len - 1]) {
                      case '-':
                        for (int i = 0; i < wlen; i++)
                          ans[i] = tolower(ans[i]);
                      case '=':
                        len--;
                        ans[len] = 0;
                        aspell_speller_add_to_personal(speller, ans, wlen);
                        if (aspell_speller_error(speller) != 0)
                          aerr();
                        k = l;
                        continue;
                      case '*':
                        goto finish;
                      }
                    int i;
                    for (i = wlen; i < len; i++)
                      if (!isdigit(ans[i]))
                        break;
                    if (i == len) {
                      int ind;
                      if (sscanf(ans + wlen, "%d", &ind) == 1 && ind >= 0 && ind < cnt) {
                        AspellStringEnumeration *els = aspell_word_list_elements(wl);
                        const char *w;
                        for (int i = 0; i <= ind; i++)
                          w = aspell_string_enumeration_next(els);
                        strcpy(ans, w);
                        len = strlen(ans);
                        delete_aspell_string_enumeration(els);
                        goto print_corr2;
                      }
                      fprintf(stderr, "invalid index %d, repeat input\n", ind);
                      goto reread2;
                    }
                  }
                  len = strlen(ans);
                  int ck = aspell_speller_check(speller, ans, len);
                  if (ck == 1) {
                  print_corr2:
                    fprintf(stderr, "\x1B[7mmanual correction:\x1B[m %.*s\n", len, ans); /* output the corrected word */
                    last = l;
                    k = l; /* advance to the next character */
                  } else if (ck == 0) {
                    word = ans;
                    wlen = len;
                    goto restart2;
                  } else
                    aerr();
                } else
                  aerr();
                k = l; /* discard the word anyway */
              }
            }
          k = ll;
          free(buf[0]);
          free(buf[1]);
        }
      }
    } else {
      debugprintf("%05d: %c (other)\n", k, inp[k]);
      k++;
    }
finish:
  fprintf(output, "%.*s", n - last, inp + last); /* output the remainder */
  if (hilite)
    return 0;
  last = n;
  if (argc == 4 + hilite) {
    FILE *pwords = fopen(argv[3 + hilite], "w");
    if (pwords == NULL)
      serr("fopen(%s, w)", argv[3 + hilite]);
    const AspellWordList *wl = aspell_speller_personal_word_list(speller);
    if (wl == NULL)
      aerr();
    else {
      int cnt = 0, tsize = 0;
      AspellStringEnumeration *els = aspell_word_list_elements(wl);
      const char *pw;
      while ((pw = aspell_string_enumeration_next(els)) != 0) {
        tsize += strlen(pw) + 1;
        cnt++;
      }
      char *buf = malloc(tsize);
      if (buf == NULL)
        serr("malloc(%d)", tsize);
      char **ptr = malloc(cnt * sizeof(char *));
      if (ptr == NULL)
        serr("malloc(%d * sizeof(char *))", cnt);
      int k = 0;
      char *p = buf;
      els = aspell_word_list_elements(wl);
      while ((pw = aspell_string_enumeration_next(els)) != 0) {
        ptr[k] = p;
        strcpy(p, pw);
        p += strlen(pw) + 1;
        k++;
      }
      qsort(ptr, cnt, sizeof(char *), pcmp);
      for (k = 0; k < cnt; k++)
        fprintf(pwords, "%s\n", ptr[k]);
      free(buf);
      free(ptr);
    }
  }
  return 0;
}