about | summary | refs | log | tree | commit | diff
path: root/Makefile
diff options
context:
space:
mode:
Diffstat (limited to 'Makefile')
-rw-r--r-- Makefile 52
1 files changed, 47 insertions, 5 deletions
diff --git a/Makefile b/Makefile
index 0091d86..5b03619 100644
--- a/Makefile
+++ b/Makefile
@@ -1,17 +1,58 @@
+SHELL=/bin/bash
+INPUT=2025-07-01T2031_NB_generated.pdf
+#INPUT=2025-07-01T2031_NB_generated.ocr.pdf
+
+# ocrmypdf -l nor --force-ocr --sidecar $(pwd)/sidecar.txt $(pwd)/2025-07-01T2031_NB_generated.printed.pdf $(pwd)/2025-07-01T2031_NB_generated.ocr.pdf
+
all: constance-ring.epub
-stage-1.txt: 2025-07-01T2031_NB_generated.pdf
- pdftotext -layout $< $@
+JPGs:=$(wildcard image/*.jpg)
+TXTs:=$(patsubst image/%.jpg,txt/%.txt,$(JPGs))
+
+page-index.txt: $(INPUT)  # extract the PDF's images into image/; pdfimages' filename listing becomes the index
+	@rm -f page-*.pdf  # NOTE(review): nothing visible here creates page-*.pdf - confirm this cleanup is still needed
+	@mkdir -p image
+	pdfimages -print-filenames -j $< image/page > $@.tmp
+	@mv $@.tmp $@
+
+images: page-index.txt  # alias with no recipe: building it just brings page-index.txt up to date
+ocr: $(TXTs)  # alias with no recipe: building it brings every txt/ file listed in TXTs up to date
+
+.PRECIOUS: image/page-%.jpg
+image/page-%.jpg: page-index.txt
+ @true # nothing to do
+
+.PRECIOUS: txt/page-%.txt
+txt/page-%.txt: image/page-%.jpg  # OCR one page image (tesseract language "nor") into its text file
+	@mkdir -p $(@D)
+	tesseract -l nor $< - > $@.tmp
+	@mv $@.tmp $@
+
+stage-1.txt: $(TXTs)  # concatenate OCR pages 014-328; written via .tmp so a failed cat never leaves a partial target
+	cat txt/page-{014..328}.txt > $@.tmp
+	@mv $@.tmp $@
stage-2.txt: stage-1.txt
@echo $@
- tr '\f' '\n' < $< > $@
+ # tr '\f' '\n' < $< > $@
+ cp $< $@
stage-3.txt: stage-2.txt convert.awk
@echo $@
- awk -f convert.awk $< > $@
+ awk -f convert.awk $< > $@.tmp
+ @mv $@.tmp $@
+
+stage-4.txt: stage-3.txt Makefile  # collapse runs of identical adjacent lines; also rebuilt when the Makefile changes
+	@echo $@
+	uniq $< $@.tmp
+	@mv $@.tmp $@
+
+stage-5.md: stage-4.txt Makefile  # run the text through pandoc, read as markdown; also rebuilt when the Makefile changes
+	@echo $@
+	pandoc -f markdown --output=$@.tmp.md $<
+	@mv $@.tmp.md $@
-constance-ring.md: header.md stage-3.txt
+constance-ring.md: header.md stage-5.md
@echo $@
cat $^ > $@.tmp
mv $@.tmp $@
@@ -21,4 +62,5 @@ constance-ring.epub: constance-ring.md
.PHONY: clean
clean:
+	if [ -r page-index.txt ]; then xargs rm -f < page-index.txt; rm -f page-index.txt; fi  # drop the files listed in the index, then the index itself
@rm -f stage-*.txt