aboutsummaryrefslogtreecommitdiff
path: root/misc/.local/bin/scan2pdf
blob: 247f1f22b173707447f55d3da789be314a027cc1 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/bin/bash

# ANSI colour sequences used by the msg_* helpers. They are kept in their
# backslash-escaped form and only turned into real escape characters when
# they are printed with `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
# "No colour": resets the terminal attributes after a coloured message.
NC='\033[0m'

# Print an informational message in green on stdout.
# Globals:   GREEN, NC (read)
# Arguments: $1 - text to print; backslash escapes in it are expanded
#                 because the line is emitted with `echo -e`
msg_info() {
    local text=$1
    echo -e "${GREEN}${text}${NC}"
}

# Print a warning in yellow.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - text to print; backslash escapes in it are expanded
#                 because the line is emitted with `echo -e`
# Outputs:   the coloured message on stderr, so that warnings do not end
#            up in captured or piped stdout
msg_warn() {
    echo -e "${YELLOW}$1${NC}" >&2
}

# Print an error message in red.
# Globals:   RED, NC (read)
# Arguments: $1 - text to print; backslash escapes in it are expanded
#                 because the line is emitted with `echo -e`
# Outputs:   the coloured message on stderr, so that errors do not end up
#            in captured or piped stdout
msg_error() {
    echo -e "${RED}$1${NC}" >&2
}

# Print the usage text on stdout.
# Arguments: none
# The text is a quoted here-document, so nothing inside it is expanded.
# The USAGE line names this script (scan2pdf); it used to name unpaper.
print_help() {
    cat <<'EOF'

scan2pdf - turn document pictures into searchable PDFs

    scan2pdf uses unpaper to clean the input image and
    tesseract to extract the text and bundle everything in a
    single PDF. imagemagick is used to prepare the input for
    unpaper, and sxiv to selectively repeat the cleaning
    process, when needed. Results may vary, depending on the
    quality of the input image and the options used with
    unpaper.

USAGE:
    scan2pdf [OPTION ...] FILE [...]

OPTIONs are:
    -o | --output <name>
        provide a name for the output file (without extension)
    -h | --help
        print this help

EOF
}

# Convert every input picture to PNM inside the temporary directory.
# Globals:
#   INPUT      (read)     - array with the paths of the source pictures
#   TEMP       (read)     - directory that receives the PNM files
#   TO_PROCESS (appended) - gets the path of every PNM file that was written
# Arguments: none (anything passed in is ignored)
# Outputs:   progress messages on stdout, a warning on stderr for every
#            picture that could not be converted
# NOTE: the PNM name is derived from the base name only, so two inputs
#       that share a base name (a/x.jpg, b/x.jpg) write the same file.
batch_convert() {
    local IMG IMG_BASE IMG_BASE_PNM
    echo
    echo "Batch converting scans to PNM."
    for IMG in "${INPUT[@]}"; do
        # Quoted, so paths containing blanks keep working.
        IMG_BASE=$(basename -- "$IMG")
        IMG_BASE_PNM="${IMG_BASE%.*}.pnm"
        echo "Converting $IMG -> $TEMP/$IMG_BASE_PNM"
        # Only queue files that really exist; a failed conversion would
        # otherwise make the later unpaper step fail on a missing file.
        if convert "$IMG" "$TEMP/$IMG_BASE_PNM"; then
            TO_PROCESS+=( "$TEMP/$IMG_BASE_PNM" )
        else
            msg_warn "Could not convert $IMG, skipping it." >&2
        fi
    done
}

# Run unpaper on a batch of PNM files and let the user pick, in sxiv, the
# results that need another pass.
# Globals:
#   TEMP                 (read)    - working directory holding the PNM files
#   UNPAPER_OPTIONS      (read)    - array of default unpaper options
#   UNPAPER_MORE_OPTIONS (written) - array of extra options typed by the user
#   TO_PROCESS           (written) - replaced by the PNM files whose cleaned
#                                    version was marked in sxiv
# Arguments: the PNM files to clean; only their base names are used, the
#            files themselves are looked up in $TEMP
# Input:     reads the extra options from stdin; end of input counts as
#            "quit"
# Returns:   1 when no cleaned file exists to review, otherwise the status
#            of the last command
batch_unpaper() {
    local OPTION FILE IMG_PNM IMG_BASE_PNM UNPAPER_IMG_BASE_PNM
    local DONE=0
    local -a CLEANED=()

    echo
    echo "Batch unpaper scans."
    # Prompt user to get additional unpaper options. It is possible to
    # access the man page as well.
    UNPAPER_MORE_OPTIONS=()
    echo "Using the following options: ${UNPAPER_OPTIONS[*]}"
    echo "Specify additional options (quit|man|<option value>):"
    while (( ! DONE )); do
        echo -n "> "
        # -r keeps backslashes literal. A failed read with nothing in the
        # buffer means end of input (Ctrl-D, closed stdin); without this
        # check the prompt would spin forever.
        if ! read -r OPTION && [[ -z "$OPTION" ]]; then
            echo
            OPTION=quit
        fi
        case "$OPTION" in
            q|quit)
                DONE=1
                ;;
            m|man)
                man unpaper
                ;;
            *)
                # Each entered line REPLACES the previous extra options
                # (an empty line clears them). Splitting with read -a
                # separates on whitespace without expanding globs.
                read -r -a UNPAPER_MORE_OPTIONS <<< "$OPTION"
                ;;
        esac
        echo "Using the following options: ${UNPAPER_OPTIONS[*]} ${UNPAPER_MORE_OPTIONS[*]}"
    done

    for IMG_PNM in "$@"; do
        IMG_BASE_PNM=$(basename -- "$IMG_PNM")
        UNPAPER_IMG_BASE_PNM="unpaper_$IMG_BASE_PNM"
        # Possibly remove temp files from previous iterations
        rm -f -- "$TEMP/$UNPAPER_IMG_BASE_PNM"
        unpaper "${UNPAPER_OPTIONS[@]}" "${UNPAPER_MORE_OPTIONS[@]}" \
            "$TEMP/$IMG_BASE_PNM" "$TEMP/$UNPAPER_IMG_BASE_PNM" \
            || msg_warn "unpaper failed on $TEMP/$IMG_BASE_PNM" >&2
    done

    TO_PROCESS=()
    # Collect the cleaned files with a glob instead of parsing ls; the -e
    # test drops the literal pattern left behind when nothing matches.
    for FILE in "$TEMP"/unpaper_*; do
        [[ -e "$FILE" ]] && CLEANED+=( "$FILE" )
    done
    if (( ${#CLEANED[@]} == 0 )); then
        msg_warn "No cleaned scans to review." >&2
        return 1
    fi

    # sxiv gets the file list on stdin and prints the marked files on exit.
    # Every marked unpaper_<name> is mapped back to its source <name> and
    # queued again. Reading line by line keeps names with blanks intact.
    while IFS= read -r FILE; do
        [[ -n "$FILE" ]] || continue
        FILE=${FILE##*/}
        TO_PROCESS+=( "$TEMP/${FILE#unpaper_}" )
    done < <(printf '%s\n' "${CLEANED[@]}" | sxiv -iopqt)
}

# Default unpaper options; batch_unpaper lets the user add more.
UNPAPER_OPTIONS=(--grayfilter-size 10,10 --grayfilter-step 5,5 --post-border 100,100,100,100 --border-align top --border-margin 100,100 --post-size a4)

INPUT=()        # pictures given on the command line
TO_PROCESS=()   # PNM files that still need an unpaper pass
OUTPUT=out      # name of the resulting PDF, without extension

# Parse the command line: -h/--help, -o/--output <name>; everything else
# is taken as an input picture.
while [[ $# -gt 0 ]]; do
    case "$1" in
        -h|--help)
            print_help
            exit 0
            ;;
        -o|--output)
            if [[ $# -ge 2 ]]; then
                OUTPUT=$2
                shift 2
            else
                msg_error "No value found for flag $1!"
                print_help
                exit 2
            fi
            ;;
        *)
            INPUT+=( "$1" )
            shift
            ;;
    esac
done

# Without pictures there is nothing to do; stop before any tool is run.
if (( ${#INPUT[@]} == 0 )); then
    msg_error "No input files given!"
    print_help
    exit 2
fi

# The working directory is created only once it is needed, and the trap
# removes it on every exit path (success, error, Ctrl-C).
TEMP=$(mktemp -d) || { msg_error "Could not create a temporary directory!"; exit 1; }
trap 'rm -rf -- "$TEMP"' EXIT

# Convert all images to PNM
batch_convert
if (( ${#TO_PROCESS[@]} == 0 )); then
    msg_error "None of the input files could be converted!"
    exit 1
fi

# Batch unpaper scans until user is ok with the result
while (( ${#TO_PROCESS[@]} > 0 )); do
    batch_unpaper "${TO_PROCESS[@]}"
done

echo
echo "Run tesseract on processed scans."
# Runs in a subshell so the caller's working directory is left alone.
# tesseract reads the list of page files (one per line) from stdin.
if ! (
    cd "$TEMP" || exit 1
    shopt -s nullglob
    PAGES=( unpaper_* )
    (( ${#PAGES[@]} > 0 )) || exit 1
    printf '%s\n' "${PAGES[@]}" | tesseract - output --dpi 300 pdf
); then
    msg_error "tesseract could not build the PDF!"
    exit 1
fi

# Honour -o/--output: the PDF is stored as <name>.pdf (out.pdf by default).
# The flag used to be parsed but ignored, the file always being output.pdf.
mv -- "$TEMP/output.pdf" "$OUTPUT.pdf" || exit 1
echo "Wrote $OUTPUT.pdf"

echo
echo "Removing temporary files."
# The EXIT trap installed above deletes "$TEMP".