blob: 247f1f22b173707447f55d3da789be314a027cc1 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
|
#!/bin/bash
# ANSI colour sequences used by the msg_* helpers. They are stored as
# literal backslash text (not raw ESC bytes); the helpers expand them
# with `echo -e` at print time. NC switches the colour back off.
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[0;33m"
NC="\033[0m"
# Print an informational message in green on stdout.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text (backslash escapes are interpreted)
msg_info() {
  local text=$1
  printf '%b\n' "${GREEN}${text}${NC}"
}
# Print a warning in yellow on stdout.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text (backslash escapes are interpreted)
msg_warn() {
  local colored="${YELLOW}$1${NC}"
  printf '%b\n' "$colored"
}
# Print an error message in red.
# The message goes to stderr so that it is not mixed into regular output
# (and stays visible when stdout is redirected or piped).
# Globals:   RED, NC (read)
# Arguments: $1 - message text (backslash escapes are interpreted)
msg_error() {
  printf '%b\n' "${RED}$1${NC}" >&2
}
# Print the usage text of scan2pdf on stdout.
# The USAGE line names this script (scan2pdf); it used to advertise
# "unpaper", which is only one of the tools called internally.
# Arguments: none
print_help() {
  printf '%s\n' \
    '' \
    'scan2pdf - turn document pictures into searchable PDFs' \
    '' \
    ' scan2pdf uses unpaper to clean the input image and' \
    ' tesseract to extract the text and bundle everything in a' \
    ' single PDF. imagemagick is used to prepare the input for' \
    ' unpaper, and sxiv to selectively repeat the cleaning' \
    ' process, when needed. Results may vary, depending on the' \
    ' quality of the input image and the options used with' \
    ' unpaper.' \
    '' \
    'USAGE:' \
    ' scan2pdf [OPTION ...] FILE [...]' \
    '' \
    'OPTIONs are:' \
    ' -o | --output <name>' \
    ' provide a name for the output file (without extension)' \
    ' -h | --help' \
    ' print this help' \
    ''
}
# Convert every input image to PNM inside the scratch directory and queue
# the results for cleaning.
# Globals:   INPUT (read)         - paths of the images to convert
#            TEMP (read)          - scratch directory receiving the PNM files
#            TO_PROCESS (written) - each converted PNM path is appended
# Arguments: none (any arguments are ignored; the list comes from INPUT)
# Outputs:   progress on stdout; a message via msg_error when convert fails
batch_convert() {
  local img img_base img_pnm

  echo
  echo "Batch converting scans to PNM."
  for img in "${INPUT[@]}"; do
    # Quoted so that paths containing spaces survive; -- protects names
    # that start with a dash.
    img_base=$(basename -- "$img")
    img_pnm="$TEMP/${img_base%.*}.pnm"
    echo "Converting $img -> $img_pnm"
    # Only queue files that were actually produced, otherwise the later
    # steps would be handed a non-existent PNM.
    if convert "$img" "$img_pnm"; then
      TO_PROCESS+=( "$img_pnm" )
    else
      msg_error "Could not convert $img, skipping it."
    fi
  done
}
# Run unpaper over the given PNM files, then let the user mark in sxiv
# which results need another round.
# Globals:   TEMP (read)                    - scratch directory
#            UNPAPER_OPTIONS (read)         - default unpaper options
#            UNPAPER_MORE_OPTIONS (written) - extra options typed by the user
#            TO_PROCESS (written)           - reset, then filled with the PNM
#                                             inputs of the marked results
# Arguments: $@ - PNM files to clean; only their base names are used, the
#                 files themselves are looked up in $TEMP
# Outputs:   prompts and progress on stdout
batch_unpaper() {
  local line img_pnm img_base_pnm cleaned marked
  local finished=0
  local prefix="$TEMP/unpaper_"
  local -a extra candidates

  echo
  echo "Batch unpaper scans."

  # Prompt user to get additional unpaper options. It is possible to
  # access the man page as well.
  UNPAPER_MORE_OPTIONS=()
  echo "Using the following options: ${UNPAPER_OPTIONS[*]}"
  echo "Specify additional options (quit|man|<option value>):"
  while (( finished == 0 )); do
    echo -n "> "
    if ! read -r line && [[ -z "$line" ]]; then
      # End of input (e.g. Ctrl-D or a closed pipe): behave like "quit"
      # instead of prompting forever.
      echo
      finished=1
    else
      case "$line" in
        q|quit)
          finished=1
          ;;
        m|man)
          man unpaper
          ;;
        *)
          # Split the line into words without glob expansion and add them
          # to the options collected so far, so several "<option value>"
          # entries can be given one after the other.
          read -r -a extra <<<"$line"
          UNPAPER_MORE_OPTIONS+=( "${extra[@]}" )
          ;;
      esac
    fi
    echo "Using the following options: ${UNPAPER_OPTIONS[*]} ${UNPAPER_MORE_OPTIONS[*]}"
  done

  for img_pnm in "$@"; do
    img_base_pnm=$(basename -- "$img_pnm")
    cleaned="${prefix}${img_base_pnm}"
    # Possibly remove temp files from previous iterations
    rm -f -- "$cleaned"
    unpaper "${UNPAPER_OPTIONS[@]}" "${UNPAPER_MORE_OPTIONS[@]}" \
      "$TEMP/$img_base_pnm" "$cleaned" \
      || msg_error "unpaper failed for $TEMP/$img_base_pnm"
  done

  # Show every cleaned scan; the paths sxiv prints back are mapped to their
  # un-cleaned PNM counterparts and queued for another pass.
  TO_PROCESS=()
  candidates=( "$TEMP"/unpaper_* )
  # An unmatched glob stays literal; nothing to review in that case.
  [[ -e "${candidates[0]}" ]] || return 0
  # Read line by line so that names containing spaces stay intact.
  while IFS= read -r marked; do
    [[ -n "$marked" ]] || continue
    if [[ "$marked" == "$prefix"* ]]; then
      marked="$TEMP/${marked#"$prefix"}"
    fi
    TO_PROCESS+=( "$marked" )
  done < <(printf '%s\n' "${candidates[@]}" | sxiv -iopqt)
}
# Scratch directory for the intermediate PNM files and the generated PDF.
TEMP=$(mktemp -d) || {
  echo "scan2pdf: could not create a temporary directory" >&2
  exit 1
}
# Remove the scratch directory on every exit path (including --help and
# usage errors), not only at the regular end of the script.
trap 'rm -rf -- "$TEMP"' EXIT

# Options always passed to unpaper; the user can add more interactively.
UNPAPER_OPTIONS=(
  --grayfilter-size 10,10
  --grayfilter-step 5,5
  --post-border 100,100,100,100
  --border-align top
  --border-margin 100,100
  --post-size a4
)
# Image files given on the command line.
INPUT=()
# PNM files that still have to go through unpaper.
TO_PROCESS=()
# Name of the output file, without extension (set with -o/--output).
OUTPUT=out
# Parse the command line: -h/--help, -o/--output <name>, "--" to end the
# options; every other argument is an input image.
# The scratch directory $TEMP already exists at this point, so each early
# exit below removes it first.
while [[ $# -gt 0 ]]; do
  case "$1" in
    -h|--help)
      print_help
      rm -rf -- "$TEMP"
      exit 0
      ;;
    -o|--output)
      if [[ $# -ge 2 ]]; then
        OUTPUT=$2
        shift 2
      else
        msg_error "No value found for flag $1!"
        print_help
        rm -rf -- "$TEMP"
        exit 2
      fi
      ;;
    --)
      # Everything after "--" is a file name, even if it starts with a dash.
      shift
      INPUT+=( "$@" )
      shift $#
      ;;
    -?*)
      # Reject unknown options instead of treating them as image files.
      msg_error "Unknown option $1!"
      print_help
      rm -rf -- "$TEMP"
      exit 2
      ;;
    *)
      INPUT+=( "$1" )
      shift
      ;;
  esac
done

# Without any input image there is nothing to convert.
if [[ ${#INPUT[@]} -eq 0 ]]; then
  msg_error "No input file given!"
  print_help
  rm -rf -- "$TEMP"
  exit 2
fi
# Convert all images to PNM. batch_convert takes its file list from the
# INPUT array, so no arguments are passed.
batch_convert

# Batch unpaper scans until user is ok with the result
while (( ${#TO_PROCESS[@]} > 0 )); do
  batch_unpaper "${TO_PROCESS[@]}"
done

echo
echo "Run tesseract on processed scans."
STATUS=0
# The subshell keeps the directory change local. The names of the cleaned
# scans are piped to tesseract on stdin, which writes output.pdf into $TEMP.
if (
  cd "$TEMP" || exit 1
  PAGES=( unpaper_* )
  # An unmatched glob stays literal: no cleaned scans, nothing to OCR.
  [[ -e "${PAGES[0]}" ]] || exit 1
  printf '%s\n' "${PAGES[@]}" | tesseract - output --dpi 300 pdf
); then
  # Honour -o/--output: the PDF is stored as <name>.pdf (default: out.pdf).
  mv -- "$TEMP/output.pdf" "${OUTPUT}.pdf" || STATUS=1
else
  msg_error "tesseract did not produce a PDF."
  STATUS=1
fi

echo
echo "Removing temporary files."
rm -rf -- "$TEMP"
exit "$STATUS"
|