Monday, 28 January 2013

Spell checker and corrector in BASH

Why? Because it can be done.


I have used nothing but BASH.

Sample run:




I tried to make it feature-rich while keeping the code to about 100 lines; this version is about 80.

Can you make it smaller?

Features:
* Auto-corrects when exactly one suggested correction is found.
* Lists all dictionary words that could be corrections and lets the user choose one or type a replacement.
* Suggestions only consider words that differ by a single-letter deletion, addition or change.
* Spelling errors are highlighted in yellow.
* Corrections are highlighted in green.

Todo:
* Save result to a file (Currently just displays result on terminal).

I did go a little overboard with processing word case.


#!/bin/bash
#
# Interactive spell checker and corrector.
# Usage: <this script> <file>
#
# Globals initialised here and used by the functions below:
#   ERR, COR - ANSI colour codes for misspelt (33) and corrected (32) words
#   PS3      - prompt shown by the `select` menu
#   L        - bracket-expression body listing the characters that form words
#   FINAL    - accumulates the corrected text
#   WORD     - associative array keyed by every lowercased dictionary word

if [[ -z "$1" ]]; then
    echo "Usage: $0 <file>" >&2                  # diagnostics belong on stderr
    exit 1
fi
ERR=33; COR=32                                   # error and corrected colours
PS3="Select or manually enter a correction: "
L="a-zA-Z'"                                      # these letters are deemed part of words
FINAL=
shopt -s extglob
shopt -u nocasematch
declare -A WORD
DICT=/usr/share/dict/words
if [[ ! -r $DICT ]]; then                        # without a dictionary every word would look misspelt
    echo "$0: cannot read dictionary $DICT" >&2
    exit 1
fi
# -r keeps backslashes literal; the || clause picks up a final line with no newline.
while read -r W || [[ -n $W ]]; do
    if [[ -n $W ]]; then                         # an empty key is an invalid subscript
        WORD["${W,,}"]=${W,,}
    fi
done < "$DICT"

# Validate one candidate spelling and, when it is a dictionary word, add it to
# the suggestion list using the capitalisation style of the word being fixed.
# Globals:   WORD (read), CASE (read), SUGGEST (updated; entries are stored
#            as " word " and a repeated entry is moved to the end)
# Arguments: $1 - lowercase candidate word
# Returns:   1 when the candidate is empty or not in WORD, otherwise 0
function checkSuggestion() {
    local p=$1 S=$1 C STYLE=$CASE                # uses CASE and SUGGEST from caller
    # An empty candidate (e.g. deleting the only letter) is never a word and
    # must not be used as an associative-array subscript.
    [[ -n $p && -n ${WORD[$p]} ]] || return 1    # matches word in dict!

    # The apostrophe styles split the candidate at its apostrophe. A candidate
    # without one (the edit removed it) would be mangled, e.g. "as" -> "AS'as",
    # so fall back to the closest apostrophe-free style.
    if [[ $p != *\'* ]]; then
        case $STYLE in
            CAPs)  STYLE=CAP   ;;
            OPROP) STYLE=PROP  ;;
            LAT)   STYLE=LOWER ;;
        esac
    fi

    case $STYLE in                               # change case to original word
        PROP)  S=${p^} ;;
        CAP)   S=${p^^} ;;
        CAPs)  S=${p%%\'*};             S="${S^^}'${p#*\'}" ;;
        LAT)   S=${p##*\'};             S="${p%\'*}'${S^^}" ;;
        OPROP) S=${p%%\'*}; C=${p#*\'}; S="${S^^}'${C^}" ;;
    esac
    SUGGEST="${SUGGEST/ $S /} $S "               # add to suggestion list
}

# Collect every dictionary word that is one edit away from the given word.
# Globals:   SUGGEST (reset, then filled through checkSuggestion),
#            CASE (set to the capitalisation style detected in $1)
# Arguments: $1 - the word to find suggestions for, in its original case
function getSuggestions() {
    local word=$1 pos letter head tail
    local -a alphabet=({a..z} "'")

    SUGGEST=
    # Classify the capitalisation; the most specific shapes are tested first
    # so that the first match decides.
    if   [[ $word =~ ^[A-Z]\'[A-Z][a-z] ]]; then CASE=OPROP  # O'Prop
    elif [[ $word =~ ^[A-Z]\'[A-Z][A-Z] ]]; then CASE=CAP    # C'AP
    elif [[ $word =~ ^[a-z]\'[A-Z]      ]]; then CASE=LAT    # l'At
    elif [[ $word =~  [A-Z]\'[a-z]$     ]]; then CASE=CAPs   # CAP's
    elif [[ $word =~ ^[A-Z]\'[a-z]      ]]; then CASE=PROP   # P'rop
    elif [[ $word =~ ^[A-Z][A-Z]        ]]; then CASE=CAP    # CAP
    elif [[ $word =~ ^[A-Z][a-z]        ]]; then CASE=PROP   # Prop
    else                                         CASE=LOWER
    fi

    word=${word,,}                               # candidates are built in lowercase
    # Walk every position, including the one just past the last letter.
    for (( pos = 0; pos <= ${#word}; pos++ )); do
        head=${word:0:pos}                       # letters before the position
        tail=${word:pos+1}                       # letters after the position
        checkSuggestion "$head$tail"             # drop the letter at pos
        for letter in "${alphabet[@]}"; do
            checkSuggestion "$head$letter$tail"          # replace the letter at pos
            checkSuggestion "$head$letter${word:pos}"    # insert a letter before pos
        done
    done
}

# Correct one misspelt word inside RESULT.
# With no suggestions the user is asked to type a replacement, with exactly one
# it is applied automatically, and with several the user picks from a menu.
# Globals:   L, ERR, COR (read); LINE (read, error message only);
#            RESULT (read and rewritten with the word wrapped in colour codes);
#            RET (set to the chosen word); PAT (set, kept for compatibility)
# Arguments: $1 - the misspelt word exactly as it appears in RESULT
# Input:     interactive answers are read from file descriptor 3
# Exits:     with status 1 when the word cannot be located in RESULT
function correctWord() {                         # correct word and return in RET as well as pattern in PAT
    local W=$1 D1 D2 SUGGEST                     # uses L RESULT
    local HIT PRE POST COL
    local -a CHOICES

    getSuggestions "$W"
    if ! [[ $RESULT =~ ([^$L]|^)$W([^$L]|$) ]]; then
        printf "ERROR: Could not find [%s] in %s\n" "$W" "$LINE" >&2
        exit 1
    fi
    HIT=${BASH_REMATCH[0]}                       # delimiter + word + delimiter
    D1=${BASH_REMATCH[1]}; D2=${BASH_REMATCH[2]} # get delimiters around word
    PAT="${D1:-#}$W${D2:-%}"                     # still exported; no longer used for the replacement

    # Cut RESULT around the match with literal (quoted) string operations.
    # A glob pattern cannot do this: '%' anchors only as the FIRST pattern
    # character, so a word at the end of the line was never matched, and
    # delimiters such as '*' or '?' acted as wildcards.
    if [[ -z $D1 ]]; then                        # match starts the line
        PRE=
        POST=${RESULT#"$HIT"}
    elif [[ -z $D2 ]]; then                      # match ends the line
        PRE=${RESULT%"$HIT"}
        POST=
    else                                         # first occurrence inside the line
        PRE=${RESULT%%"$HIT"*}
        POST=${RESULT#*"$HIT"}
    fi

    read -r -a CHOICES <<< "$SUGGEST"            # split the " a  b " list into words
    case ${#CHOICES[@]} in
        0)  # no suggestions - show the line with the mistake highlighted and let user correct
            printf "%b\n" "${PRE}${D1}\e[${ERR}m${W}\e[0m${D2}${POST}"
            read -r -p "Enter correction [$W]: " RET < /dev/fd/3
            RET=${RET:-$W}
            ;;
        1)  # single suggestion - apply it and show the struck-out original
            RET=${CHOICES[0]}
            printf "%b\n" "\e[31mAuto:\e[0m ${PRE}${D1}\e[9;${ERR}m${W}\e[0;${COR}m ${RET}\e[0m${D2}${POST}"
            ;;
        *)  # multiple suggestions - menu; entry 1 keeps the word unchanged
            printf "%b\n" "${PRE}${D1}\e[${ERR}m${W}\e[0m${D2}${POST}"
            RET=
            select RET in "(IGNORE)" "${CHOICES[@]}"; do
                RET=${RET:-$REPLY}               # user entered a word instead
                [[ $REPLY == 1 ]] && RET=$W      # no change
                break
            done < /dev/fd/3
            RET=${RET:-$W}                       # end of input: keep the original word
            ;;
    esac

    COL=$COR
    [[ $RET == "$W" ]] && COL=$ERR               # if unchanged, highlight in error colour
    RESULT="${PRE}${D1}\e[${COL}m${RET}\e[0m${D2}${POST}"   # correct word
}

# Spell-check every space-separated word of one line.
# Words found in WORD (compared in lowercase) are skipped; every other word is
# handed to correctWord and the updated RESULT is printed.
# Globals:   WORD (read), RESULT (printed; rewritten by correctWord)
# Arguments: $1 - the line with punctuation already replaced by spaces
# Note:      LINE is local here but visible to correctWord, which prints it in
#            its error message.
function correctLine(){
    local LINE="$1" W                            # uses RESULT
    # -r keeps backslashes literal. The || clause also processes a last word
    # that is not followed by a space instead of silently dropping it.
    while read -r -d ' ' W || [[ -n $W ]]; do    # for each word in line
        [[ -z $W ]] && continue                  # consecutive spaces give empty words
        [[ -n ${WORD[${W,,}]} ]] && continue     # word in dict so ignore it
        correctWord "$W"
        printf ">> %b\n\n" "$RESULT"
    done <<< "$LINE"
}

# Main loop: check the file named by $1 line by line, then print the result.
# The script's original stdin is duplicated to fd 3 so that correctWord can
# still ask the user questions while stdin itself is reading the file.
# IFS= and -r keep leading whitespace and backslashes; the || clause handles a
# final line that has no trailing newline.
while IFS= read -r LINE || [[ -n $LINE ]]; do    # for each line
    # RESULT is printed with %b later, so double the backslashes of the
    # original text to make them come out literally.
    RESULT=${LINE//\\/\\\\}
    correctLine " ${LINE//[^$L ]/ } "            # replace punctuation with spaces
    FINAL+="$RESULT\n"                           # append corrected line to result
done 3<&0 < "$1"                                 # redirect stdin to /dev/fd/3 for select and read

printf "\nCORRECTED TEXT\n\n%b\n" "$FINAL"