# Build pipeline: scanned PDF -> page images -> OCR text -> cleaned-up
# Markdown -> constance-ring.epub.
#
# Usage:
#   make              build the EPUB (default goal `all`)
#   make images       extract the page images from $(INPUT)
#   make ocr          OCR every page image currently present in image/
#   make spellcheck   run hunspell on stage-6.md
#   make V=1 ...      echo recipe lines (the build is silent by default)

# Recipes (and BOOK_TXTs below) rely on bash brace expansion.
SHELL=/bin/bash

# Source PDF the page images are extracted from.
INPUT=2025-07-01T2031_NB_generated.pdf
#INPUT=2025-07-01T2031_NB_generated.ocr.pdf

# ocrmypdf -l nor --force-ocr --sidecar $(pwd)/sidecar.txt $(pwd)/2025-07-01T2031_NB_generated.printed.pdf $(pwd)/2025-07-01T2031_NB_generated.ocr.pdf

# First and last page image (zero-padded, as in image/page-NNN.jpg) whose
# OCR text is concatenated into stage-1.txt.
FIRST_PAGE := 014
LAST_PAGE := 328

# Disable the built-in implicit rules.
MAKEFLAGS += -r

# Remove a target whose recipe failed, so a partial file is never mistaken
# for an up-to-date one.
.DELETE_ON_ERROR:

.PHONY: all images ocr spellcheck clean

all: constance-ring.epub

# Page images that exist when the Makefile is parsed, and their OCR outputs.
JPGs:=$(wildcard image/*.jpg)
TXTs:=$(patsubst image/%.jpg,txt/%.txt,$(JPGs))

# The OCR files stage-1.txt actually reads.  Unlike TXTs this list does not
# depend on image/ already being populated.
BOOK_TXTs := $(shell echo txt/page-{$(FIRST_PAGE)..$(LAST_PAGE)}.txt)

# Extract the embedded images as image/page-NNN.*; the list of files
# written by pdfimages becomes page-index.txt (also used by `clean`).
page-index.txt: $(INPUT)
	@rm -f page-*.pdf
	@mkdir -p image
	pdfimages -print-filenames -j $< image/page > $@.tmp
	@mv $@.tmp $@

images: page-index.txt

ocr: $(TXTs)

# The page images are produced as a side effect of the page-index.txt rule;
# this rule only tells make where they come from.
.PRECIOUS: image/page-%.jpg
image/page-%.jpg: page-index.txt
	@true

# OCR one page image (Norwegian language data) into a text file.
.PRECIOUS: txt/page-%.txt
txt/page-%.txt: image/page-%.jpg
	@mkdir -p $(@D)
	tesseract -l nor $< - > $@.tmp
	@mv $@.tmp $@

# Concatenate the OCR text of pages FIRST_PAGE..LAST_PAGE.
stage-1.txt: $(TXTs) $(BOOK_TXTs)
	cat txt/page-{$(FIRST_PAGE)..$(LAST_PAGE)}.txt > $@.tmp
	@mv $@.tmp $@

# Currently a plain copy.  A form-feed to newline conversion
# (tr '\f' '\n') used to live here and is disabled.
stage-2.txt: stage-1.txt
	@echo '#' $@
	@cp $< $@

stage-3.txt: stage-2.txt convert.awk
	@echo '#' $@
	@awk -f convert.awk $< > $@.tmp
	@mv $@.tmp $@

# Collapse runs of identical adjacent lines.
stage-4.txt: stage-3.txt Makefile
	@echo '#' $@
	@uniq $< > $@.tmp
	@mv $@.tmp $@

stage-5.txt: stage-4.txt convert2.awk
	@echo '#' $@
	awk -f convert2.awk $< > $@.tmp
	@mv $@.tmp $@

# Round-trip through pandoc; the temporary name ends in .md so that pandoc
# picks the output format from the extension.
stage-6.md: stage-5.txt Makefile
	@echo '#' $@
	@pandoc --from markdown -o $@.tmp.md $<
	@mv $@.tmp.md $@

# This actually updates dict by sorting and removing leading numbers/spaces.
# The normalised list is copied back over dict, which makes dict newer than
# the target; the final touch restores the ordering so this rule (and
# everything depending on it) is not redone on every run.
dict.tmp: dict
	sed "s,^[0-9 ]*,," $< |\
	sort |\
	grep -v "^$$" |\
	uniq > $@ && cp $@ $<
	@touch $@

spellcheck: stage-6.md
	hunspell -p "$(CURDIR)/dict" -d nb_NO,constance-ring $<

# Words hunspell does not recognise, with occurrence counts, rarest first.
spellcheck-words: stage-6.md dict.tmp constance-ring.dic
	@echo hunspell
	hunspell -p "$(CURDIR)/dict" -d nb_NO,constance-ring -l < $< | sort | uniq -c | sort -n > $@.tmp
	@mv $@.tmp $@

# Only the Markdown prerequisites are handed to pandoc; spellcheck-words is
# listed so the word report is refreshed together with the book.
constance-ring.epub: header.md stage-6.md spellcheck-words
	@echo '#' $@
	@pandoc --toc -o $@ $(filter %.md,$^)

# Remove the extracted images (as listed in page-index.txt), the index
# itself and the stage-*.txt intermediates.  rm -f keeps this quiet when an
# image is already gone or the index is empty.
clean:
	if [ -r page-index.txt ]; then xargs rm -f < page-index.txt; rm -f page-index.txt; fi
	@rm -f stage-*.txt

# With V unset this is `.SILENT:` and no recipe line is echoed; with any
# non-empty V it names an unrelated target and normal echoing applies.
$(V).SILENT: