# This removes page footers, multiple blank lines and combines paragraphs that
# are split across pages.
#
# A split paragraph is one where there are blank lines or a page footer, *and*
# it starts with a lower case letter.
#
# The script runs one line behind its input: `prev` holds the most recent
# line that has not been written yet, and it is written (or dropped) once the
# following line has been seen.

BEGIN {
    # Prime the look-behind buffer with the first input line; the main rule
    # therefore starts at the second line.
    getline
    prev = $0
    # Set to 1 when a footer has been removed, cleared at the next line that
    # gets past the blank-line filter.
    extra_nl = 0
}

{
    # Two footer shapes are recognised: a line containing optional digits,
    # an em dash and "Amalie Skram. I" / "Amalie Skram. L", and a line that
    # consists only of "Constance Ring" with optional digits on either side.
    a = match($0, /[0-9]* *— *Amalie Skram\. [IL]/)
    b = match($0, /^ *[0-9]* *Constance *Ring *[0-9]* *$/)
    if (a || b) {
        # Flush pending text before the footer is thrown away.
        if (prev != "") {
            print prev
        }
        extra_nl = 1
        # Read the line after the footer and discard it together with the
        # footer itself by blanking the record.
        # NOTE(review): presumably that line is page-break residue -- confirm
        # against the input text.
        getline
        $0 = ""
        prev = ""
    }

    # A blank line that follows a blank line is dropped, which collapses runs
    # of blank lines (and the blanked footer) into nothing here.
    if ($0 != "" || prev != "") {
        if (extra_nl) {
            extra_nl = 0
            # First letter of the line decides whether the text after the
            # footer is a new paragraph (upper case) or the continuation of a
            # split one (lower case). Each match() yields 1 or 0 because both
            # patterns are anchored at the start of the line.
            u = match($0, /^[^A-Za-z]*[A-Z].*/)
            l = match($0, /^[^A-Za-z]*[a-z]/)
            if (u > l) {
                # New paragraph: restore the separating blank line.
                printf "\n"
            }
            # prev was cleared by the footer branch, so this writes nothing
            # and, unlike `print`, adds no newline.
            printf "%s", prev
        } else {
            print prev
        }
    }
    prev = $0
}

END {
    # Flush the last buffered line. This must be `prev`, not `$prev`: with a
    # `$` the value would be used as a field number.
    print prev
}