Showing posts with label tiny. Show all posts

Monday, 28 January 2013

Spell checker and corrector in BASH

Why? Because it can be done.


I have used nothing but BASH.

Sample run:




I tried to make it very feature-rich while keeping the code to about 100 lines; this version is about 80.

Can you make it smaller?

Features:
* Auto-corrects if it can only find 1 suggested correction.
* Lists all words in dictionary that could be corrections and lets user choose or replace word.
* Suggestions only consider dictionary words that differ from the misspelled word by a single-letter deletion, addition or change.
* Spelling errors are highlighted in yellow.
* Corrections are highlighted in green.

Todo:
* Save result to a file (Currently just displays result on terminal).

I did go a little overboard with processing word case.


#!/bin/bash
#
# Interactive spell checker and corrector written in BASH only.
# Usage: <script> <file>
# Every word of <file> is looked up in the system word list; unknown words are
# corrected automatically, chosen from a menu, or typed in by the user.

if [[ -z "$1" ]]; then
    echo "Usage: $0 <file>" >&2
    exit 1
fi
ERR=33; COR=32                                   # ANSI colours: error (yellow) and corrected (green)
PS3="Select or manually enter a correction: "    # prompt shown by the 'select' menu
L="a-zA-Z'"                                      # these letters are deemed part of words
FINAL=                                           # accumulates the corrected text
shopt -s extglob                                 # enable extended patterns such as +(...)
shopt -u nocasematch                             # matching must stay case sensitive
declare -A WORD                                  # dictionary: lowercase word -> lowercase word
DICT=/usr/share/dict/words
[[ -r $DICT ]] || { echo "$0: cannot read dictionary $DICT" >&2; exit 1; }
while read -r W; do                              # -r: take dictionary entries literally
    [[ -n $W ]] && WORD[${W,,}]=${W,,}           # skip blank lines (empty subscripts are invalid)
done < "$DICT"

# Add candidate word $1 to the suggestion list if it is a dictionary word.
# Globals:   WORD    (read)    associative array of known lowercase words
#            CASE    (read)    capitalisation style of the misspelled word
#            SUGGEST (written) list of suggestions, each wrapped in spaces
# Arguments: $1 - lowercase candidate word
# Returns:   1 if the candidate is empty or not in the dictionary, otherwise 0
function checkSuggestion() {
    local p=$1 S=$1 C
    # An empty candidate is never a word; testing it first also avoids an
    # invalid empty subscript on the associative array.
    [[ -n $p && -n ${WORD[$p]} ]] || return 1
    if [[ $p == *\'* ]]; then                    # candidate has an apostrophe: restore case per part
        case $CASE in
            PROP)  S=${p^} ;;
            CAP)   S=${p^^} ;;
            CAPs)  S=${p%%\'*};             S="${S^^}'${p#*\'}" ;;   # CAP's
            LAT)   S=${p##*\'};             S="${p%\'*}'${S^^}" ;;   # l'AT
            OPROP) S=${p%%\'*}; C=${p#*\'}; S="${S^^}'${C^}"    ;;   # O'Prop
        esac
    else                                         # no apostrophe: there are no parts to split,
        case $CASE in                            # so fall back to whole-word casing
            PROP|OPROP) S=${p^}  ;;
            CAP|CAPs)   S=${p^^} ;;
        esac
    fi
    SUGGEST="${SUGGEST/ $S /} $S "               # drop any earlier copy, then append
}

# Build the list of dictionary words that are one edit away from word $1.
# Globals:   CASE    (written) capitalisation style detected for $1
#            SUGGEST (written) reset here, then filled in by checkSuggestion
# Arguments: $1 - the misspelled word in its original case
function getSuggestions() {
    local W=$1 t l cand
    SUGGEST=; CASE=LOWER
    # The tests run in order and a later match overrides an earlier one.
    [[ $W =~ ^[A-Z][a-z]        ]] && CASE=PROP   # Prop
    [[ $W =~ ^[A-Z][A-Z]        ]] && CASE=CAP    # CAP
    [[ $W =~ ^[A-Z]\'[a-z]      ]] && CASE=PROP   # P'rop
    [[ $W =~  [A-Z]\'[a-z]$     ]] && CASE=CAPs   # CAP's
    [[ $W =~ ^[a-z]\'[A-Z]      ]] && CASE=LAT    # l'At
    [[ $W =~ ^[A-Z]\'[A-Z][A-Z] ]] && CASE=CAP    # C'AP
    [[ $W =~ ^[A-Z]\'[A-Z][a-z] ]] && CASE=OPROP  # O'Prop
    W=${W,,}                                      # candidates are generated in lowercase
    for (( t = 0; t <= ${#W}; t++ )); do          # for each letter position of the word
        cand=${W:0:t}${W:t+1}                     # delete the letter at position t
        # Deleting the only letter of a one-letter word leaves nothing to look up.
        [[ -n $cand ]] && checkSuggestion "$cand"
        for l in {a..z} "'"; do
            checkSuggestion "${W:0:t}$l${W:t+1}"  # change the letter at position t
            checkSuggestion "${W:0:t}$l${W:t}"    # insert a letter before position t
        done
    done
}

# Correct the first stand-alone occurrence of word $1 inside RESULT.
# Zero suggestions: the user is asked to type a correction.
# One suggestion:   it is applied automatically.
# Several:          the user picks from a menu or types a replacement.
# Globals:   RESULT (read/written) line being corrected, carrying \e colour escapes
#            L, ERR, COR (read)    word-letter class and highlight colours
#            LINE (read)           only quoted in the "could not find" error
#            RET (written)         the replacement word (equals $1 if unchanged)
#            PAT (written)         delimiter-anchored match pattern, kept for callers
#            COL (written)         colour used for the replacement
# Input:     user answers are read from file descriptor 3
# Arguments: $1 - the misspelled word exactly as it appears in RESULT
# Exits with status 1 if the word cannot be located in RESULT.
function correctWord() {
    local W=$1 D1 D2 M HEAD TAIL SUGGEST
    local -a CHOICES
    getSuggestions "$W"
    [[ $RESULT =~ ([^$L]|^)$W([^$L]|$) ]] || {
        printf "ERROR: Could not find [%s] in %s\n" "$W" "$LINE" >&2
        exit 1
    }
    M=${BASH_REMATCH[0]}                          # delimiter + word + delimiter
    D1=${BASH_REMATCH[1]}; D2=${BASH_REMATCH[2]}  # delimiters around the word (may be empty)
    PAT="${D1:-#}$W${D2:-%}"                      # kept only because callers may read PAT

    # Split RESULT around the match by position rather than with a glob pattern,
    # so delimiters such as [ ] * ? \ and words at either end of the line work.
    if [[ -z $D1 ]]; then                         # matched via ^ : word starts the line
        HEAD=; TAIL=${RESULT:${#M}}
    elif [[ -z $D2 ]]; then                       # matched via $ : word ends the line
        HEAD=${RESULT:0:${#RESULT}-${#M}}; TAIL=
    else                                          # both delimiters present: the literal text M
        HEAD=${RESULT%%"$M"*}                     # first occurs exactly at the regex match
        TAIL=${RESULT#*"$M"}
    fi
    HEAD+=$D1; TAIL=$D2$TAIL                      # keep the delimiters with their side

    read -ra CHOICES <<< "$SUGGEST"               # SUGGEST is a space-separated list
    RET=                                          # never reuse the previous word's answer
    case ${#CHOICES[@]} in
        0)  printf "%b\n" "$HEAD\e[${ERR}m$W\e[0m$TAIL"       # show line with mistake highlighted
            read -r -p "Enter correction [$W]: " RET < /dev/fd/3
            ;;
        1)  RET=${CHOICES[0]}                                  # single suggestion: auto-correct
            printf "%b\n" "\e[31mAuto:\e[0m $HEAD\e[9;${ERR}m$W\e[0;${COR}m $RET\e[0m$TAIL"
            ;;
        *)  printf "%b\n" "$HEAD\e[${ERR}m$W\e[0m$TAIL"       # show line with mistake highlighted
            select RET in "(IGNORE)" "${CHOICES[@]}"; do
                RET=${RET:-$REPLY}                             # user typed a word instead of a number
                [[ $REPLY == 1 ]] && RET=$W                    # "(IGNORE)": no change
                break
            done < /dev/fd/3
            ;;
    esac
    RET=${RET:-$W}                                # empty answer or EOF: leave the word as is
    COL=$COR; [[ $RET == "$W" ]] && COL=$ERR      # if unchanged, highlight in error colour
    RESULT="$HEAD\e[${COL}m${RET}\e[0m$TAIL"      # splice the replacement into the line
}

# Check every space-separated word of $1 and correct those not in the dictionary.
# Globals:   WORD (read)            dictionary of known lowercase words
#            RESULT (read/written)  updated by correctWord, echoed after each fix
# Arguments: $1 - the line with everything but word letters replaced by spaces
function correctLine(){
    local LINE="$1" W
    # '|| [[ -n $W ]]' keeps a final word that is not followed by a space;
    # -r stops read from interpreting backslashes.
    while read -r -d' ' W || [[ -n $W ]]; do
        # Empty token or known word: nothing to do. Lookup errors are silenced.
        [[ -z "$W" || -n ${WORD[${W,,}]} ]] 2> /dev/null && continue
        correctWord "$W"
        printf ">> %b\n\n" "$RESULT"
    done <<< "$LINE"
}

# Main loop: correct the input file line by line, then print the whole result.
[[ -r "$1" ]] || { echo "$0: cannot read input file: $1" >&2; exit 1; }

BS='\'                                           # a single literal backslash
# IFS= and -r keep leading whitespace and backslashes exactly as written;
# '|| [[ -n $LINE ]]' also processes a final line that lacks a newline.
while IFS= read -r LINE || [[ -n $LINE ]]; do    # for each line
    # RESULT is printed with %b later, so double every backslash of the text
    # to have it come out literally instead of as an escape sequence.
    RESULT=${LINE//"$BS"/"$BS$BS"}
    correctLine " ${LINE//[^$L ]/ } "            # replace punctuation with spaces
    FINAL+="$RESULT\n"                           # append corrected line to result
done 3<&0 < "$1"                                 # keep the original stdin on fd 3 for select and read

printf "\nCORRECTED TEXT\n\n%b\n" "$FINAL"