/* tspell: spell checker adapted for TeX. Version 2023-11-21. Copyright 2016 Dmitri Pavlov. Distributed under the terms of the GNU General Public License, version 3. Synopsis: tspell [ -h ] input-file output-file [ personal-dictionary-file ] Compile with -laspell and -lreadline. The personal dictionary file is simply a list of words, one on each line. The -h option allows one to see how tspell splits the text into individual words; it can be used to track down problems that arise when a new aspect of TeX syntax is implemented. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void deprintf(int sline, const char *func, const char *format, ...) { if (errno != 0) fprintf(stderr, "system error: %s\n", strerror(errno)); errno = 0; fprintf(stderr, "line %d, function %s\n", sline, func); va_list ap; va_start(ap, format); vfprintf(stderr, format, ap); fprintf(stderr, "\n"); va_end(ap); } void err(void) { if (errno != 0) fprintf(stderr, "system error: %s\n", strerror(errno)); errno = 0; } int debug = 0; void debugprintf(const char *format, ...) { if (debug == 0) return; err(); va_list ap; va_start(ap, format); vfprintf(stderr, format, ap); va_end(ap); } #define perr(...) deprintf(__LINE__, __func__, __VA_ARGS__) #define serr(...) perr(__VA_ARGS__), exit(1) #define aerr() perr("aspell error: %s", aspell_speller_error_message(speller)) char *prompt; static int set_prompt(void) { rl_insert_text(prompt); return 0; } #define paccents "`'^\"~=.-" #define laccents "uvHtcdb" "oOlL" #define sletters "oe OE ae AE aa AA ss " int str2cmp(const char *s1, const char *s2) { return strncmp(s1, s2, strlen(s2)); } int has_accent(char *s) { return s[0] == '\\' && (strchr(paccents, s[1]) != NULL || (strchr(laccents, s[1]) != NULL && s[2] == ' ') || (s[1] != 0 && s[2] != 0 && s[3] == ' ' && strstr(sletters, (char[4]){s[1], s[2], s[3], 0}) != NULL)); } int has_alpha(char *s, char *t) { int p = (unsigned char)*s; if (p < 0x80) return isalpha(p); else if (p < 0xC0) return 0; /* intermediate UTF-8 bytes */ else if (p < 0xE0 && s + 1 < t && ((unsigned char)s[1] & 0xC0) == 0x80) return iswalpha(((p & ~0xC0) << 6) + ((unsigned char)s[1] & ~0x80)); else if (p >= 0xE0 && p < 0xF0 && s + 2 < t && ((unsigned char)s[1] & 0xC0) == 0x80 && ((unsigned char)s[2] & 0xC0) == 0x80) return iswalpha(((p & ~0xE0) << 12) + (((unsigned char)s[1] & ~0x80) << 6) + ((unsigned char)s[2] & ~0x80)); else return 0; } char *scan_word(char *s, char *t) { for (; *s == '\'' || ((unsigned char)(*s) & 0xC0) == 0x80 || has_alpha(s, t); s++) ; return s; } char *scan_ref(char *s, char *t) { while (*s == '\'' || ((unsigned char)(*s) & 0xC0) == 0x80 || has_alpha(s, t) || strchr("0123456789 -(,)[|]$", *s) != NULL) if (*s == '$') { /* skip formulas inside ^={...} */ debugprintf("skipping a formula inside a reference\n"); s++; while (s < t && *s != '$') { s++; } if (s < t && *s == '$') s++; debugprintf("skipping complete\n"); } else s++; return s; } int pcmp(const void *p, const void *q) { return strcmp(*(char *const *)p, *(char *const *)q); } int main(int argc, char *argv[]) { int arg = 1, hilite = 0; while (arg < argc) if (strcmp(argv[arg], "-h") == 0) { hilite = 1; arg++; } else if (strcmp(argv[arg], "-d") == 0) { debug = 1; arg++; } else break; if (argc - arg != 2 && argc - arg != 3) { fprintf(stderr, "Synopsis: %s [ -h ] input-file output-file [ personal-dictionary-file ]\n", argv[0]); return 1; } setlocale(LC_CTYPE, ""); AspellConfig *config = new_aspell_config(); aspell_config_replace(config, "lang", "en_US"); AspellCanHaveError *ret = new_aspell_speller(config); delete_aspell_config(config); if (aspell_error(ret) != 0) serr("aspell error: %s", aspell_error_message(ret)); AspellSpeller *speller = to_aspell_speller(ret); int fd = open(argv[arg], O_RDONLY); if (fd == -1) serr("open(%s)", argv[arg]); struct stat sb; if (fstat(fd, &sb) != 0) serr("fstat(%s)", argv[arg]); int n = sb.st_size; char *inp = mmap(0, n + 1, PROT_READ, MAP_PRIVATE, fd, 0); /* add one zero byte at the end */ if (inp == MAP_FAILED) serr("mmap(%s, %d+1)", argv[arg], n); FILE *output = fopen(argv[arg + 1], "w"); if (output == NULL) serr("fopen(%s, w)", argv[arg + 1]); if (arg + 2 < argc) { FILE *words = fopen(argv[arg + 2], "r"); if (words == NULL) { if (errno != ENOENT) serr("fopen(%s, r)", argv[arg + 2]); else errno = 0; } else { char buf[1024]; while (fgets(buf, sizeof(buf), words) == buf) { buf[strlen(buf) - 1] = 0; aspell_speller_add_to_personal(speller, buf, strlen(buf)); err(); if (aspell_speller_error(speller) != 0) { aerr(); return 1; } } } } int last = 0; int k = 0; err(); while (k < n) if (has_alpha(inp + k, inp + n)) { debugprintf("%05d: %c (alpha)\n", k, inp[k]); int l; l = scan_word(inp + k, inp + n) - inp; if (l - 2 >= k && inp[l - 2] == '\'' && (inp[l - 1] == 's' || inp[l - 1] == '\'')) l -= 2; /* ignore possessives and closing double quotes */ if (l - 1 >= k && inp[l - 1] == '\'') l--; /* ignore plural possessives */ if (has_accent(inp + l)) { /* ignore words with accents */ k = l; continue; } char *word = inp + k; int wlen = l - k; debugprintf("aspell_speller_check(speller, %.*s)\n", wlen, word); /* int onlyL = 1, jj; for (jj = 0; jj < wlen; jj++) if (word[jj] != 'L') break; if (jj == wlen) debugprintf("Just L, length %d\n", wlen); */ int ck = aspell_speller_check(speller, word, wlen); if (hilite) { fprintf(output, "%.*s\x1B[7m%.*s\x1B[m", k - last, inp + last, wlen, word); last = l; } else if (ck == 1) ; else if (ck == 0) { char *s = memrchr(inp, '\n', k), *t = memchr(inp + l, '\n', n - l); int lstart, lend; if (s == NULL) lstart = 0; else lstart = s - inp + 1; if (t == NULL) lend = n; else lend = t - inp + 1; restart: fprintf(stderr, "%.*s\x1B[7m%.*s\x1B[m%.*s", k - lstart, inp + lstart, wlen, word, lend - l, inp + l); const AspellWordList *wl = aspell_speller_suggest(speller, word, wlen); int cnt = 0; if (wl == NULL) aerr(); else { AspellStringEnumeration *els = aspell_word_list_elements(wl); const char *w; for (cnt = 0; (w = aspell_string_enumeration_next(els)) != 0; cnt++) fprintf(stderr, "%d:%s ", cnt, w); delete_aspell_string_enumeration(els); } fprintf(stderr, "=:add -:lower *:exit\n"); prompt = malloc(wlen + 1); if (prompt == NULL) serr("malloc(%d+1)", wlen); memcpy(prompt, word, wlen); prompt[wlen] = 0; reread: rl_startup_hook = set_prompt; char *ans = readline(""); err(); free(prompt); int len = strlen(ans); if (len >= wlen && memcmp(ans, word, wlen) == 0) { if (len == wlen) { debugprintf("aspell_speller_add_to_session(speller, %.*s)\n", wlen, word); aspell_speller_add_to_session(speller, word, wlen); if (aspell_speller_error(speller) != 0) aerr(); if (word != inp + k) goto print_corr; k = l; continue; } else if (len == wlen + 1) switch (ans[len - 1]) { case '-': for (int i = 0; i < wlen; i++) ans[i] = tolower(ans[i]); case '=': len--; ans[len] = 0; debugprintf("aspell_speller_add_to_personal(speller, %.*s, %d)\n", wlen, ans, wlen); aspell_speller_add_to_personal(speller, ans, wlen); if (aspell_speller_error(speller) != 0) aerr(); k = l; continue; case '*': goto finish; } int i; for (i = wlen; i < len; i++) if (!isdigit(ans[i])) break; if (i == len) { int ind; if (sscanf(ans + wlen, "%d", &ind) == 1 && ind >= 0 && ind < cnt) { AspellStringEnumeration *els = aspell_word_list_elements(wl); const char *w; for (int i = 0; i <= ind; i++) w = aspell_string_enumeration_next(els); strcpy(ans, w); len = strlen(ans); delete_aspell_string_enumeration(els); goto print_corr; } fprintf(stderr, "invalid index %d, repeat input\n", ind); goto reread; } } len = strlen(ans); int ck = aspell_speller_check(speller, ans, len); if (ck == 1) { print_corr: fprintf(output, "%.*s", k - last, inp + last); /* output what we read so far */ fprintf(output, "%.*s", len, ans); /* output the corrected word */ last = l; k = l; /* advance to the next character */ } else if (ck == 0) { word = ans; wlen = len; goto restart; } else aerr(); } else aerr(); k = l; /* discard the word anyway */ if (k + 2 < n && inp[k] == '\'' && inp[k + 1] == 's' && !isalpha(inp[k + 2])) k += 2; /* discard possessives */ } else if (inp[k] == '\\') { debugprintf("%05d: %c (backslash)\n", k, inp[k]); int l = k; k++; if (k == n) break; if (has_accent(inp + k - 1)) { if (isalpha(inp[k])) { k++; if (isalpha(inp[k])) k++; if (inp[k] == ' ') k++; } else k++; for (; isalpha(inp[k]) /* || inp[k] == '-' */; k++) ; } else if (has_alpha(inp + k, inp + n)) { int pk = k; k = scan_word(inp + k, inp + n) - inp; debugprintf("%05d: command \\%.*s\n", k, k - pk, inp + pk); if (k == n) break; if (inp[k] == ':') { debugprintf("%05d: colon after command \\%.*s\n", k, k - pk, inp + pk); for (; isgraph(inp[k]); k++) ; } else if (str2cmp(inp + l, "\\begin{tikzpicture}") == 0) { debugprintf("%05d: tikzpicture environment\n", k); for (; k < n; k++) if (str2cmp(inp + k, "\\end{tikzpicture}") == 0) break; } else if (str2cmp(inp + l, "\\begin{align}") == 0) { debugprintf("%05d: align environment\n", k); for (; k < n; k++) if (str2cmp(inp + k, "\\end{align}") == 0) break; } else if (str2cmp(inp + l, "\\begin{align*}") == 0) { debugprintf("%05d: align* environment\n", k); for (; k < n; k++) if (str2cmp(inp + k, "\\end{align*}") == 0) break; } else if (str2cmp(inp + l, "\\begin{equation}") == 0) { debugprintf("%05d: equation environment\n", k); for (; k < n; k++) if (str2cmp(inp + k, "\\end{equation}") == 0) break; } else if ((str2cmp(inp + l, "\\def") == 0 && !isalpha(inp[l + 4])) || str2cmp(inp + l, "\\hyphenation{") == 0 || str2cmp(inp + l, "\\cite{") == 0 || str2cmp(inp + l, "\\cite[") == 0 || str2cmp(inp + l, "\\cref{") == 0 || str2cmp(inp + l, "\\label{") == 0 || str2cmp(inp + l, "\\bibitem{") == 0 || str2cmp(inp + l, "\\tikzset{") == 0 || str2cmp(inp + l, "\\begin{") == 0 || str2cmp(inp + l, "\\end{") == 0 || str2cmp(inp + l, "\\eqref{") == 0 || str2cmp(inp + l, "\\ref{") == 0) { debugprintf("%05d: LaTeX command \\%.*s\n", k, k - l, inp + l); int bal = 0; for (; k < n; k++) if (inp[k] == '{') bal++; else if (inp[k] == '}') { bal--; if (bal == 0) break; } } else if (str2cmp(inp + l, "\\input ") == 0) { debugprintf("%05d: input command\n", k); for (; k < n; k++) if (inp[k] == '\n') break; } else if (str2cmp(inp + l, "\\font\\") == 0 || str2cmp(inp + l, "\\chardef\\") == 0 || str2cmp(inp + l, "\\mathchardef\\") == 0) { debugprintf("%05d: font/chardef/mathchardef command \\%.*s\n", k, k - pk, inp + pk); for (; k < n; k++) if (inp[k] == ' ' || inp[k] == '\n') break; } debugprintf("%05d: finished handling at %.*s\n", k, k - pk, inp + pk); } else if (inp[k] == '<') { debugprintf("%05d: \\<...\\>\n", k); for (; k < n; k++) if (str2cmp(inp + k, "\\>") == 0) break; } else if (inp[k] == '[') { debugprintf("%05d: \\[...\\]\n", k); for (; k < n; k++) if (str2cmp(inp + k, "\\]") == 0) break; } else k++; } else if (inp[k] == '$') { debugprintf("%05d: %c (formula)\n", k, inp[k]); k++; if (k == n) break; if (inp[k] == '$') { debugprintf("%05d: displayed formula\n", k); int pk = k; for (; k + 1 < n && (inp[k] != '$' || inp[k + 1] != '$'); k++) ; k += 2; debugprintf("%05d: content of displayed formula: %.*s\n", k, k - (pk - 1), inp + pk - 1); if (k >= n) break; } else { int pk = k; for (; k < n && inp[k] != '$'; k++) ; k++; debugprintf("%05d: content of formula: %.*s\n", k, k - (pk - 1), inp + pk - 1); if (k >= n) break; } } else if (inp[k] == '^') { debugprintf("%05d: %c (reference)\n", k, inp[k]); /* special handling of dpmac references of the form ^^=:{some word[s|] possibly with plural[s|]} */ k++; if (inp[k] == '^') k++; if (inp[k] == '=') k++; if (inp[k] == ':') k++; if (inp[k] == '{') { k++; debugprintf("processing the reference: {%c\n", inp[k]); int ll; ll = scan_ref(inp + k, inp + n) - inp; if (inp[ll] != '}') { debugprintf("exceptional reference 1: %.*s\n", ll - k, inp + k); while (ll < n && inp[ll] != '}') ll++; if (inp[ll] == '}') ll++; debugprintf("exceptional reference 2: %.*s\n", ll - k, inp + k); } else if (inp[ll] == '}') { /* parse plurals inside {...} */ debugprintf("*:%.*s\n", ll - k, inp + k); char *buf[2]; int pos[2] = {0, 0}; buf[0] = malloc(ll - k + 1); buf[1] = malloc(ll - k + 1); if (buf[0] == NULL || buf[1] == NULL) serr("malloc(%d - %d + 1)", ll, k); int state = 0; for (int r = k; r < ll; r++) switch (inp[r]) { case '[': state = 1; break; case '|': state = -1; break; case ']': state = 0; break; default: if (state >= 0) buf[0][pos[0]++] = inp[r]; if (state <= 0) buf[1][pos[1]++] = inp[r]; break; } buf[0][pos[0]] = 0; buf[1][pos[1]] = 0; for (int z = 0; z < 2; z++) { int k = 0, last = 0; while (k < pos[z]) if (buf[z][k] == ' ' || buf[z][k] == '-') k++; else if (buf[z][k] == '$') { k++; while (k < pos[z] && buf[z][k] != '$') k++; if (k < pos[z] && buf[z][k] == '$') k++; } else { int l; l = scan_word(buf[z] + k, buf[z] + pos[z]) - buf[z]; if (l == k) { k++; continue; } char *word = buf[z] + k; int wlen = l - k; int ck = aspell_speller_check(speller, word, wlen); if (hilite) { fprintf(output, "%.*s\x1B[7m%.*s\x1B[m", k - last, buf[z] + last, wlen, word); last = l; } else if (ck == 1) ; else if (ck == 0) { restart2: fprintf(stderr, "{[%.*s\x1B[7m%.*s\x1B[m%.*s]}\n", k, buf[z], wlen, word, pos[z] - l, buf[z] + l); const AspellWordList *wl = aspell_speller_suggest(speller, word, wlen); int cnt = 0; if (wl == NULL) aerr(); else { AspellStringEnumeration *els = aspell_word_list_elements(wl); const char *w; for (cnt = 0; (w = aspell_string_enumeration_next(els)) != 0; cnt++) fprintf(stderr, "%d:%s ", cnt, w); delete_aspell_string_enumeration(els); } fprintf(stderr, "=:add -:lower *:exit\n"); prompt = malloc(wlen + 1); if (prompt == NULL) serr("malloc(%d+1)", wlen); memcpy(prompt, word, wlen); prompt[wlen] = 0; reread2: rl_startup_hook = set_prompt; char *ans = readline(""); err(); free(prompt); int len = strlen(ans); if (len >= wlen && memcmp(ans, word, wlen) == 0) { if (len == wlen) { aspell_speller_add_to_session(speller, word, wlen); if (aspell_speller_error(speller) != 0) aerr(); if (word != buf[z] + k) goto print_corr2; k = l; continue; } else if (len == wlen + 1) switch (ans[len - 1]) { case '-': for (int i = 0; i < wlen; i++) ans[i] = tolower(ans[i]); case '=': len--; ans[len] = 0; aspell_speller_add_to_personal(speller, ans, wlen); if (aspell_speller_error(speller) != 0) aerr(); k = l; continue; case '*': goto finish; } int i; for (i = wlen; i < len; i++) if (!isdigit(ans[i])) break; if (i == len) { int ind; if (sscanf(ans + wlen, "%d", &ind) == 1 && ind >= 0 && ind < cnt) { AspellStringEnumeration *els = aspell_word_list_elements(wl); const char *w; for (int i = 0; i <= ind; i++) w = aspell_string_enumeration_next(els); strcpy(ans, w); len = strlen(ans); delete_aspell_string_enumeration(els); goto print_corr2; } fprintf(stderr, "invalid index %d, repeat input\n", ind); goto reread2; } } len = strlen(ans); int ck = aspell_speller_check(speller, ans, len); if (ck == 1) { print_corr2: fprintf(stderr, "\x1B[7mmanual correction:\x1B[m %.*s\n", len, ans); /* output the corrected word */ last = l; k = l; /* advance to the next character */ } else if (ck == 0) { word = ans; wlen = len; goto restart2; } else aerr(); } else aerr(); k = l; /* discard the word anyway */ } } k = ll; free(buf[0]); free(buf[1]); } } } else { debugprintf("%05d: %c (other)\n", k, inp[k]); k++; } finish: fprintf(output, "%.*s", n - last, inp + last); /* output the remainder */ if (hilite) return 0; last = n; if (argc == 4 + hilite) { FILE *pwords = fopen(argv[3 + hilite], "w"); if (pwords == NULL) serr("fopen(%s, w)", argv[3 + hilite]); const AspellWordList *wl = aspell_speller_personal_word_list(speller); if (wl == NULL) aerr(); else { int cnt = 0, tsize = 0; AspellStringEnumeration *els = aspell_word_list_elements(wl); const char *pw; while ((pw = aspell_string_enumeration_next(els)) != 0) { tsize += strlen(pw) + 1; cnt++; } char *buf = malloc(tsize); if (buf == NULL) serr("malloc(%d)", tsize); char **ptr = malloc(cnt * sizeof(char *)); if (ptr == NULL) serr("malloc(%d * sizeof(char *))", cnt); int k = 0; char *p = buf; els = aspell_word_list_elements(wl); while ((pw = aspell_string_enumeration_next(els)) != 0) { ptr[k] = p; strcpy(p, pw); p += strlen(pw) + 1; k++; } qsort(ptr, cnt, sizeof(char *), pcmp); for (k = 0; k < cnt; k++) fprintf(pwords, "%s\n", ptr[k]); free(buf); free(ptr); } } return 0; }