#!/bin/bash
#
# Builds tv.nrk.no.atom from the "recently sent" listing on tv.nrk.no.
#
# Steps:
#   1. Download the listing and normalise it (dos2unix + xmllint) into
#      tv.nrk.no.html.
#   2. For every <a class="listobject-link"> in the listing, download the
#      linked page into tmp/ and convert it to XHTML (tidy) and to formatted
#      XML (xmllint).
#   3. Transform the listing with ./tv.nrk.no.xsl into tv.nrk.no.atom.
#
# All files are read and written relative to the current directory.
# Requires: curl, md5sum, dos2unix, xmllint, xmlstarlet, tidy.

set -e

readonly LISTING_URL='http://tv.nrk.no/listobjects/recentlysent'
readonly LISTING='tv.nrk.no.html'
readonly FEED='tv.nrk.no.atom'
readonly CACHE_DIR='tmp'

#######################################
# Print the MD5 digest of a file.
# Arguments: $1 - path of the file to hash
# Outputs:   the hex digest on stdout
# Returns:   non-zero (printing nothing) when the file is not readable
#######################################
checksum() {
  [[ -r "$1" ]] || return 1
  md5sum < "$1" | cut -d ' ' -f 1
}

#######################################
# Download the listing, normalise it and install it as $LISTING.
# Exits the whole script with status 0 when the normalised download is
# identical to the stored listing and the feed was already rebuilt from it.
# Globals:   LISTING_URL, LISTING, FEED (all read)
# Outputs:   "Already up to date." on stdout when nothing changed
#######################################
fetch() {
  local -r incoming="${LISTING}.new"
  local -r scratch="${LISTING}.new2"

  # --fail: an HTTP error page must never replace a good listing.
  curl -fsS -o "$incoming" "$LISTING_URL"

  # Normalise BEFORE comparing: the stored listing is the normalised form, so
  # comparing it with the raw download could never report "unchanged".
  dos2unix "$incoming"
  xmllint --html --dropdtd --xmlout "$incoming" 2> /dev/null > "$scratch"
  mv "$scratch" "$incoming"

  # Only skip the run when the feed is newer than the stored listing, i.e. the
  # previous run got as far as writing the feed. Otherwise an aborted run
  # would never be retried while the listing stays the same.
  if [[ -r "$LISTING" && "$FEED" -nt "$LISTING" \
    && "$(checksum "$incoming")" == "$(checksum "$LISTING")" ]]; then
    rm -f "$incoming"
    echo "Already up to date."
    exit 0
  fi

  mv "$incoming" "$LISTING"
}

#######################################
# Download and convert every linked page that is not cached yet.
# Each link yields "<href> <name>", where <name> is the part of the href after
# "http://tv.nrk.no/" with "/" replaced by "_". Links with an empty name or an
# existing cache file are skipped.
# Globals:   LISTING, CACHE_DIR (read)
# Outputs:   "Url: ..." and "Name: ..." lines on stdout per downloaded page
#######################################
fetch_items() {
  local url name page

  xmlstarlet sel \
    -N strings=http://exslt.org/strings \
    -N fn=http://www.w3.org/2005/xpath-functions \
    -t -m '//a[@class="listobject-link"]' \
    -v '@href' -v '" "' \
    -v 'strings:replace(substring-after(normalize-space(@href), "http://tv.nrk.no/"), "/", "_")' -n \
    < "$LISTING" \
    | while read -r url name; do
      if [[ -z "$name" || -r "$CACHE_DIR/$name" ]]; then
        continue
      fi
      page="$CACHE_DIR/$name"

      printf 'Url: %s\n' "$url"
      printf 'Name: %s\n' "$name"

      # Download to a side file first so that an interrupted transfer is not
      # mistaken for a cached page on the next run.
      curl -s -L "$url" -o "${page}.part"
      mv "${page}.part" "$page"

      rm -f "${page}.html" "${page}.xml"

      # tidy reports warnings through a non-zero exit status; its status is
      # deliberately ignored and xmllint below decides whether the result is
      # usable.
      dos2unix < "$page" \
        | tidy -utf8 -asxhtml -quiet -f /dev/null \
          --new-inline-tags "time" \
          --new-blocklevel-tags "article, hgroup, section, header, footer, mark, aside" \
          > "${page}.html" || true

      xmllint --format --xmlout - < "${page}.html" > "${page}.xml"
    done
}

main() {
  fetch
  mkdir -p "$CACHE_DIR"
  fetch_items

  # Write the feed via a side file so a failed transform neither truncates the
  # previous feed nor makes it look newer than the listing.
  xmlstarlet tr ./tv.nrk.no.xsl < "$LISTING" > "${FEED}.new"
  mv "${FEED}.new" "$FEED"
}

main "$@"