# Recipes run under bash rather than the default /bin/sh.
SHELL := /bin/bash
# Name of the source PDF; the commented-out line selects the .ocr.pdf copy instead.
INPUT := 2025-07-01T2031_NB_generated.pdf
#INPUT := 2025-07-01T2031_NB_generated.ocr.pdf

# Reference command, not run by this Makefile: OCR the printed PDF with ocrmypdf.
# ocrmypdf -l nor --force-ocr --sidecar $(pwd)/sidecar.txt $(pwd)/2025-07-01T2031_NB_generated.printed.pdf $(pwd)/2025-07-01T2031_NB_generated.ocr.pdf

# Disable make's built-in implicit rules; every rule used here is explicit.
MAKEFLAGS += --no-builtin-rules

# Default goal: build the finished e-book.
# Declared phony so that a stray file named `all` cannot mask the target.
.PHONY: all
all: constance-ring.epub

# Page images present when this Makefile is read, and the OCR text file that
# corresponds to each one. $(wildcard) is expanded at parse time, so images
# extracted during the current run are only seen by the next invocation.
JPGs := $(wildcard image/*.jpg)
TXTs := $(JPGs:image/%.jpg=txt/%.txt)

# Extract the images embedded in the source PDF into files named image/page-*
# and record the names pdfimages prints in page-index.txt (`clean` uses that
# list to remove them again).
# The prerequisite is $(INPUT) rather than a hard-coded name, so changing INPUT
# at the top of the file (or on the command line) actually takes effect.
# The index is written through a temp file so a failed run never leaves a
# partial index that looks up to date.
# NOTE(review): nothing visible in this file creates page-*.pdf; the rm below
# is presumably left over from an earlier per-page split -- confirm.
page-index.txt: $(INPUT)
	@rm -f page-*.pdf
	@mkdir -p image
	pdfimages -print-filenames -j $< image/page > $@.tmp
	@mv $@.tmp $@

# Convenience aliases: `make images` extracts the page images and `make ocr`
# OCRs every image that existed when the Makefile was read.
# Neither creates a file of its own name, so both are declared phony.
.PHONY: images ocr
images: page-index.txt
ocr: $(TXTs)

# The page images are written as a side effect of building page-index.txt.
# This rule only records that dependency, so its recipe is a no-op.
# .PRECIOUS keeps make from deleting the images as intermediate files.
.PRECIOUS: image/page-%.jpg
image/page-%.jpg: page-index.txt
	@:

# OCR a single page image with tesseract using the `nor` language data.
# The recognised text is captured from stdout into a temp file and then
# renamed, so an interrupted OCR run never leaves a partial page text behind.
# .PRECIOUS keeps make from deleting the OCR results as intermediate files.
.PRECIOUS: txt/page-%.txt
txt/page-%.txt: image/page-%.jpg
	@mkdir -p $(@D)
	tesseract -l nor $< - > $@.tmp
	@mv $@.tmp $@

# Stage 1: concatenate the OCR text of pages 014 through 328 into one file.
# The {014..328} brace expansion is a bash feature (SHELL is /bin/bash).
# NOTE(review): the page range is hard-coded; presumably it selects the body
# of the book and skips front/back matter -- confirm.
# Written through a temp file like the later stages: if cat fails (for example
# because a page in the range is missing), no truncated stage-1.txt is left
# behind for make to treat as up to date.
stage-1.txt: $(TXTs)
	cat txt/page-{014..328}.txt > $@.tmp
	@mv $@.tmp $@

# Stage 2: currently a plain copy of stage 1.
# Disabled alternative that would translate form feeds into newlines:
#   tr '\f' '\n' < $< > $@
stage-2.txt: stage-1.txt
	@echo '#' $@
	@cp $< $@

# Stage 3: filter the stage-2 text through convert.awk.
# The script is listed as a prerequisite so that editing it re-runs the stage.
# Output goes through a temp file so a failed awk run leaves no partial target.
stage-3.txt: stage-2.txt convert.awk
	@echo '#' $@
	@awk -f $(lastword $^) $< > $@.tmp
	@mv $@.tmp $@

# Stage 4: collapse runs of identical adjacent lines with uniq.
# Makefile is a prerequisite, so editing this file re-runs the stage.
stage-4.txt: stage-3.txt Makefile
	@echo '#' $@
	@uniq < $< > $@.tmp
	@mv $@.tmp $@

# Stage 5: filter the stage-4 text through convert2.awk.
# The script is listed as a prerequisite so that editing it re-runs the stage.
stage-5.txt: stage-4.txt convert2.awk
	@echo '#' $@
	awk -f $(lastword $^) $< > $@.tmp
	@mv $@.tmp $@

# Stage 6: have pandoc read the stage-5 text as Markdown and write it back out.
# No output format is given, so pandoc picks it from the output file name;
# presumably that is why the temp file ends in .tmp.md rather than .tmp.
# Makefile is a prerequisite, so editing this file re-runs the stage.
stage-6.md: stage-5.txt Makefile
	@echo '#' $@
	@pandoc -f markdown --output=$@.tmp.md $<
	@mv $@.tmp.md $@

# Normalise the personal dictionary `dict` in place: strip leading digits and
# spaces from every line, sort, drop empty lines and duplicates, then copy the
# result back over dict. dict.tmp holds the cleaned list and doubles as the
# stamp that tells make the dictionary has been normalised.
# The final touch matters: copying over dict makes dict newer than dict.tmp,
# which would otherwise mark this target (and everything depending on it) as
# out of date on every run.
dict.tmp: dict
	sed "s,^[0-9 ]*,," $< |\
	  sort |\
	  grep -v "^$$" |\
	  uniq > $@ && cp $@ $<
	touch $@

# Run hunspell on the stage-6 Markdown, using the nb_NO and constance-ring
# dictionaries plus `dict` as the personal word list.
# This is a command, not a file, so it is declared phony; otherwise a file
# named `spellcheck` would stop it from ever running.
.PHONY: spellcheck
spellcheck: stage-6.md
	hunspell -p $$(pwd)/dict -d nb_NO,constance-ring $<

# Report of words hunspell rejects in the stage-6 Markdown: `hunspell -l`
# lists them, `sort | uniq -c` counts each distinct word, and `sort -n`
# orders the report by ascending count.
# dict.tmp is a prerequisite so the personal dictionary is normalised first.
spellcheck-words: stage-6.md dict.tmp constance-ring.dic
	@echo hunspell
	hunspell -p $$(pwd)/dict -d nb_NO,constance-ring -l < $< | sort | uniq -c | sort -n > $@.tmp
	@mv $@.tmp $@

# Build the e-book from header.md followed by the stage-6 Markdown, with a
# table of contents. spellcheck-words is a prerequisite only so that the
# misspelled-word report is brought up to date first; it is not passed to pandoc.
constance-ring.epub: header.md stage-6.md spellcheck-words
	@echo '#' $@
	@pandoc --table-of-contents --output=$@ $(filter-out spellcheck-words,$^)

# Remove the extracted page images (as listed in page-index.txt), the index
# itself, and the stage-*.txt files.
# rm -f is used throughout so that an empty index or images that are already
# gone do not produce errors.
# NOTE(review): txt/, stage-6.md, dict.tmp, spellcheck-words and the epub are
# left in place -- confirm that is intended.
.PHONY: clean
clean:
	if [ -r page-index.txt ]; then xargs rm -f < page-index.txt; rm -f page-index.txt; fi
	@rm -f stage-*.txt

# Quiet by default: recipe lines are not echoed unless V is set to a
# non-empty value (for example `make V=1`).
ifeq ($(strip $(V)),)
.SILENT:
endif