From 182c503fdbbc1ce241cd9966f9402ed9ea0b2143 Mon Sep 17 00:00:00 2001
From: Bastian Dehn
Date: Sun, 22 Dec 2024 10:49:51 +0100
Subject: [PATCH] add scan to pdf scripts

---
 scantopdf     | 252 ++++++++++++++++++++++++++++++++++++++++++++++++++
 scantopdfbw   |  25 +++++
 scantopdfgray |  25 +++++
 3 files changed, 302 insertions(+)
 create mode 100755 scantopdf
 create mode 100755 scantopdfbw
 create mode 100755 scantopdfgray

diff --git a/scantopdf b/scantopdf
new file mode 100755
--- /dev/null
+++ b/scantopdf
@@ -0,0 +1,252 @@
+#!/bin/bash
+#
+# scantopdf - scan a batch of pages into PDF files with an OCR text layer.
+#
+# Usage: scantopdf PAGECOUNT OUTPUT
+#
+#   PAGECOUNT  number of pages to scan (scanimage --batch-count)
+#   OUTPUT     name of the single PDF that receives all pages, or the
+#              word "multi" to split the pages into outNN.pdf documents
+#              at every separator sheet, i.e. a page whose bar code or
+#              Data Matrix code reads "Trennblatt"; NN is the number of
+#              the first page of the document
+#
+# Environment:
+#   SCANTOPDF_MODE  optional value for scanimage --mode; when unset or
+#                   empty the option is not passed to scanimage
+#
+# Intermediate files (scannedNN.tiff, scannedNN.pdf) are created in the
+# current directory.
+
+# Text encoded on a separator sheet.
+readonly SEPARATOR_TEXT="Trennblatt"
+
+# Linearize a PDF in place with qpdf.
+# Arguments: $1 - PDF file
+optimize()
+{
+    local input=$1
+    local output=$input-out
+    local status=0
+
+    qpdf --linearize "$input" "$output" || status=$?
+
+    # qpdf exits with 3 when it only issued warnings; the output file is
+    # still written in that case.
+    if [[ ($status -eq 0 || $status -eq 3) && -f $output ]]; then
+        mv -- "$output" "$input"
+    else
+        echo "WARNING: qpdf failed for $input, file left unoptimized" >&2
+        rm -f -- "$output"
+    fi
+}
+
+# Add a German OCR text layer to a PDF in place and linearize the result.
+# ocrmypdf output that mentions "offset" is appended to
+# $HOME/addocr-error.log. The PDF is left untouched when ocrmypdf does not
+# produce an output file.
+# Arguments: $1 - PDF file
+# Outputs:   the file name on stdout
+addocr()
+{
+    local input=$1
+    local output=$input-out
+    local error=""
+
+    printf '%s\n' "$input"
+    error=$(ocrmypdf -l deu "$input" "$output" 2>&1)
+
+    if [[ $error == *offset* ]]; then
+        printf '%s: %s\n' "$input" "$error" >> "$HOME/addocr-error.log"
+    fi
+
+    if [[ -f $output ]]; then
+        mv -- "$output" "$input"
+        optimize "$input"
+    else
+        echo "WARNING: ocrmypdf produced no output for $input" >&2
+    fi
+}
+
+# Convert one scanned TIFF page into a PDF with the same base name and
+# delete the TIFF. The TIFF is kept when the conversion fails.
+# Arguments: $1 - TIFF file ending in .tiff
+# Returns:   0 on success, 1 when tiff2pdf failed
+tifftopdf()
+{
+    local tiff=$1
+    local pdf=${tiff%.tiff}.pdf
+
+    if ! tiff2pdf -o "$pdf" "$tiff"; then
+        echo "ERROR: could not convert $tiff" >&2
+        return 1
+    fi
+    rm -- "$tiff"
+}
+
+# Merge all scanned pages into one PDF, add OCR and remove the page files.
+# The page files are kept when merging fails.
+# Arguments: $1 - name of the PDF to create
+# Returns:   0 on success, 1 when there are no pages or a step failed
+createonepdf()
+{
+    local output=$1
+    local tiff
+    local pdfs=()
+
+    for tiff in scanned*.tiff; do
+        # Without any match the glob stays a literal pattern; skip it.
+        [[ -e $tiff ]] || continue
+        tifftopdf "$tiff" || return 1
+        pdfs+=("${tiff%.tiff}.pdf")
+    done
+
+    if (( ${#pdfs[@]} == 0 )); then
+        echo "ERROR: no scanned pages found" >&2
+        return 1
+    fi
+
+    if ! pdftk "${pdfs[@]}" output "$output"; then
+        echo "ERROR: could not create $output" >&2
+        return 1
+    fi
+    rm -- "${pdfs[@]}"
+    addocr "$output"
+}
+
+# Merge the pages scannedSTART.pdf .. scannedEND.pdf into outSTART.pdf
+# (numbers zero-padded to two digits), add OCR and remove the page files.
+# Does nothing when the range is empty.
+# Arguments: $1 - first page number
+#            $2 - last page number
+# Returns:   0 on success or empty range, 1 when merging failed
+createpdf()
+{
+    local start=$1
+    local end=$2
+    local i
+    local page
+    local target
+    local pdfs=()
+
+    for ((i = start; i <= end; i++)); do
+        printf -v page 'scanned%02d.pdf' "$i"
+        pdfs+=("$page")
+    done
+
+    # Happens when a separator sheet is the first page or follows another
+    # separator sheet directly.
+    if (( ${#pdfs[@]} == 0 )); then
+        return 0
+    fi
+
+    printf -v target 'out%02d.pdf' "$start"
+    if ! pdftk "${pdfs[@]}" output "$target"; then
+        echo "ERROR: could not create $target" >&2
+        return 1
+    fi
+    addocr "$target"
+    rm -- "${pdfs[@]}"
+}
+
+# Tell whether a page is a separator sheet.
+# The page is decoded with zbarimg and, if that yields no match, with
+# dmtxread; it is a separator when a decoded line equals SEPARATOR_TEXT.
+# Arguments: $1 - PDF file of the page
+# Outputs:   "true" or "false" on stdout
+detectsplit()
+{
+    local pdf=$1
+
+    if zbarimg --raw --quiet "$pdf" | grep -qxF -- "$SEPARATOR_TEXT"; then
+        echo "true"
+        return
+    fi
+
+    if dmtxread "$pdf" | grep -qxF -- "$SEPARATOR_TEXT"; then
+        echo "true"
+        return
+    fi
+
+    echo "false"
+}
+
+# Convert the scanned pages to PDF and combine the pages between separator
+# sheets into one document each. Separator pages are deleted. Pages are
+# counted from 1 in glob order, which assumes that the files are numbered
+# scanned01, scanned02, ... without gaps.
+# Arguments: $1 - page count (accepted for compatibility, not used)
+# Returns:   0 on success, 1 when a conversion or merge failed
+createmultipdfs()
+{
+    local startcount=1
+    local endcount=1
+    local tiff
+    local pdf
+
+    for tiff in scanned*.tiff; do
+        [[ -e $tiff ]] || continue
+        pdf=${tiff%.tiff}.pdf
+        tifftopdf "$tiff" || return 1
+        if [[ $(detectsplit "$pdf") == "true" ]]; then
+            createpdf "$startcount" "$((endcount - 1))" || return 1
+            startcount=$((endcount + 1))
+            rm -- "$pdf"
+        fi
+        endcount=$((endcount + 1))
+    done
+
+    # Pages after the last separator sheet form the final document.
+    createpdf "$startcount" "$((endcount - 1))"
+}
+
+# Scan the pages and build either one PDF or one PDF per document.
+# Arguments: $1 - page count
+#            $2 - output PDF name, or "multi" to split at separator sheets
+main()
+{
+    if [[ -z $1 ]]; then
+        echo "ERROR: no page count" >&2
+        exit 1
+    fi
+
+    if [[ -z $2 ]]; then
+        echo "ERROR: no pdf file name" >&2
+        exit 1
+    fi
+
+    local pagecount=$1
+    local output=$2
+    local scanargs=(--resolution=300)
+
+    if [[ ! $pagecount =~ ^[0-9]+$ ]]; then
+        echo "ERROR: page count is not a number" >&2
+        exit 1
+    fi
+
+    # When requested, the mode is passed right after the resolution.
+    if [[ -n ${SCANTOPDF_MODE:-} ]]; then
+        scanargs+=("--mode=$SCANTOPDF_MODE")
+    fi
+    scanargs+=(
+        --skip-blank-pages=yes
+        --format=tiff
+        "--batch-count=$pagecount"
+        --batch-increment=1
+        --batch=scanned%02d.tiff
+    )
+
+    if ! scanimage "${scanargs[@]}"; then
+        echo "ERROR: scan error" >&2
+        exit 1
+    fi
+
+    if [[ $output == "multi" ]]; then
+        createmultipdfs "$pagecount"
+        exit
+    fi
+
+    createonepdf "$output"
+}
+
+main "$@"
diff --git a/scantopdfbw b/scantopdfbw
new file mode 100755
--- /dev/null
+++ b/scantopdfbw
@@ -0,0 +1,25 @@
+#!/bin/bash
+#
+# scantopdfbw - scantopdf with the scanner mode set to "BW".
+#
+# Usage: scantopdfbw PAGECOUNT OUTPUT
+#
+# All arguments are passed on to scantopdf, which is looked up next to this
+# script first and on PATH otherwise.
+
+main()
+{
+    local dir
+    local target
+
+    dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd) || exit 1
+    target=$dir/scantopdf
+    if [[ ! -x $target ]] && ! target=$(command -v scantopdf); then
+        echo "ERROR: scantopdf not found" >&2
+        exit 1
+    fi
+
+    SCANTOPDF_MODE=BW exec "$target" "$@"
+}
+
+main "$@"
diff --git a/scantopdfgray b/scantopdfgray
new file mode 100755
--- /dev/null
+++ b/scantopdfgray
@@ -0,0 +1,25 @@
+#!/bin/bash
+#
+# scantopdfgray - scantopdf with the scanner mode set to "Gray".
+#
+# Usage: scantopdfgray PAGECOUNT OUTPUT
+#
+# All arguments are passed on to scantopdf, which is looked up next to this
+# script first and on PATH otherwise.
+
+main()
+{
+    local dir
+    local target
+
+    dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd) || exit 1
+    target=$dir/scantopdf
+    if [[ ! -x $target ]] && ! target=$(command -v scantopdf); then
+        echo "ERROR: scantopdf not found" >&2
+        exit 1
+    fi
+
+    SCANTOPDF_MODE=Gray exec "$target" "$@"
+}
+
+main "$@"
-- 
2.39.5