warbo-utilities: 2bb96d6443b7da645ea241b53a68c45f06d1df5c
1: { bash, wget, wrap, xidel }:
2:
3: wrap {
4: name = "iplayer";
5: paths = [ bash wget xidel ];
6: script = ''
7: #!${bash}/bin/bash
8: set -e
9: set -o pipefail
10:
function fetchProgrammes {
  # Scrape the listing page at URL $1 and print, on stdout, the absolute URL
  # of every programme link found, followed by the programmes' titles. For
  # example:
  #
  #   http://programme-page-for-click
  #   http://programme-page-for-springwatch
  #
  #   Click - 25th May 2016
  #   Springwatch 2016 Episode 3
  #
  # Progress is reported on stderr, so stdout carries only the scraped data.

  echo "Fetching '$1'" >&2

  # Every programme entry is an <a> whose class contains "content-item__link"
  XPATH='//a[contains(@class,"content-item__link")]'

  # hrefs may be relative, so resolve them against the page's own URL
  LINKEXPR="$XPATH/resolve-uri(@href, \"$1\")"

  # The title text lives in a <div> nested somewhere inside the link
  TITLEEXPR="$XPATH//div[contains(@class,\"content-item__title\")]/text()"

  wget --quiet --output-document=- "$1" |
    xidel -s -e "$LINKEXPR" -e "$TITLEEXPR" -
}
29:
function formattedProgrammes {
  # Print one line per programme listed on the page at URL $1, consisting of
  # the programme's URL, a tab, then its title. For example:
  #
  #   http://programme-page-for-click	Click - 25th May 2016
  #   http://programme-page-for-springwatch	Springwatch 2016 Episode 3

  OUTPUT=$(fetchProgrammes "$1") || fail "Couldn't fetch feed"

  # fetchProgrammes mixes both kinds of line in one stream: titles are the
  # non-empty lines which don't start with "http"; URLs are those which do.
  NAMES=$(echo "$OUTPUT" | grep "^." | grep -v "^http")
  LINKS=$(echo "$OUTPUT" | grep "^http")

  # URLs and titles are paired up purely by position, so the two lists must
  # line up exactly
  assertSameLength "$LINKS" "$NAMES"

  # paste joins corresponding lines with a tab
  paste <(echo "$LINKS") <(echo "$NAMES")
}
44:
function assertSameLength {
  # Abort unless $1 and $2 contain the same number of lines, and that number
  # is greater than 3 (fewer entries than that suggests the scrape went wrong).
  #
  # Exits the current (sub)shell with status 2 when the lengths differ and
  # with status 1 when there are too few entries; diagnostics go to stderr.
  # Note that an empty argument counts as one line, since echo always emits a
  # trailing newline.
  COUNT1=$(echo "$1" | wc -l)
  COUNT2=$(echo "$2" | wc -l)

  echo "Got lists of '$COUNT1' and '$COUNT2' elements" 1>&2
  [[ "$COUNT1" -eq "$COUNT2" ]] || {
    echo -e "Found different length lists. First:\\n$1\\nSecond:\\n$2" 1>&2
    exit 2
  }

  # Exit explicitly rather than via an external helper: this function is
  # called inside a command substitution, where errexit is switched off, so
  # only a real 'exit' is guaranteed to stop the caller from carrying on.
  [[ "$COUNT1" -gt 3 ]] || {
    echo "Only found '$COUNT1' entries? Seems fishy, aborting." 1>&2
    exit 1
  }
}
58:
function listToFeed {
  # Print an RSS 2.0 feed on stdout for the programmes listed on a page.
  #
  #   $1  Title to give the feed's channel
  #   $2  URL of the listing page; also used as the channel's link
  #
  # One <item> is emitted per programme found by formattedProgrammes.

  # Both values end up as XML text content, so both need escaping
  CHANNELTITLE=$(echo "$1" | xmlEscape)
  CHANNELURL=$(echo "$2" | xmlEscape)
  echo '<rss version="2.0">'
  echo '<channel>'
  echo "<title>$CHANNELTITLE</title>"
  echo "<link>$CHANNELURL</link>"

  FORMATTED=$(formattedProgrammes "$2") || fail "Couldn't format listing"

  # Each line is "URL<tab>title"; the title keeps any further tabs
  while read -r LINE
  do
    THISURL=$(echo "$LINE" | cut -f 1)
    THISTTL=$(echo "$LINE" | cut -f 2-)
    writeItem "$THISURL" "$THISTTL"
  done < <(echo "$FORMATTED")

  echo '</channel>'
  echo '</rss>'
}
80:
function xmlEscape {
  # Filter: copy stdin to stdout, replacing the characters which are special
  # in XML text (&, < and >) with their entity references.
  #
  # "&" has to be handled first, otherwise the ampersands introduced by the
  # other two replacements would themselves get escaped. In a sed replacement
  # a bare "&" means "the matched text", hence the backslash before each one.
  #
  # From http://daemonforums.org/showthread.php?t=4054
  sed -e 's~&~\&amp;~g' -e 's~<~\&lt;~g' -e 's~>~\&gt;~g'
}
85:
function firstShown {
  # Print the value of the first "release_date_time" field found in the page
  # at URL $1, with the surrounding quotes removed.
  #
  # Fails if the page can't be fetched or contains no such field.

  # Pause before every fetch (presumably to avoid hammering the server)
  sleep 1
  PAGE=$(wget -O- -q "$1") || fail "Couldn't fetch page $1"
  EXTRACTED=$(echo "$PAGE" | grep -o '"release_date_time":"[^"]*"') ||
    fail "Failed to extract first-shown date"

  # Drop the key (everything up to the first colon), then keep only the first
  # match: grep -o prints every occurrence, and a multi-line result is no use
  # to callers expecting a single date. sed reads its whole input, so the
  # upstream commands can't be killed by a closed pipe under pipefail.
  echo "$EXTRACTED" | cut -d : -f 2- |
    sed -n -e '1{s/"//g;p;}'
}
95:
function findCached {
  # Look through $CACHEDIR (recursively) for a file containing the literal
  # string $1, ignoring files whose names end in ".rss". Prints the path of
  # the first such file, or an empty line when there is none.
  #
  # The result is printed via a final echo so that the function succeeds
  # whether or not anything matched.
  HITS=$(grep -F -r -l "$1" "$CACHEDIR")
  FIRSTHIT=$(echo "$HITS" | grep -v '\.rss$' | head -n1)
  echo "$FIRSTHIT"
}
101:
function writeItem {
  # Print the RSS <item> for the programme with URL $1 and title $2, reusing
  # a previously generated item from $CACHEDIR when one mentions this URL.
  CACHED=$(findCached "$1")

  # Cache hit: replay the stored item as-is
  if [[ -n "$CACHED" ]]
  then
    cat "$CACHED"
    return
  fi

  # Cache miss: build the item, storing a copy as we print it. The file name
  # is the MD5 of the URL plus the title reduced to its lowercase letters.
  HASH=$(echo "$1" | md5sum | cut -d ' ' -f 1)
  NAME=$(echo "$2" | tr '[:upper:]' '[:lower:]' | tr -dc '[:lower:]')
  FILE="$HASH"_"$NAME.xml"
  writeItemReal "$1" "$2" | tee "$CACHEDIR/$FILE"
}
114:
function writeItemReal {
  # Print an RSS <item> element on stdout for the programme whose page is at
  # URL $1 and whose title is $2. The publication date is looked up from the
  # programme's page via firstShown. Progress is logged to stderr.
  echo "Writing item for '$1' '$2'" 1>&2
  SAFEURL=$(echo "$1" | xmlEscape)
  SAFETTL=$(echo "$2" | xmlEscape)

  DATE=$(firstShown "$1")
  echo "Got date '$DATE'" 1>&2
  if PUBDATE=$(date --date="$DATE" --rfc-2822)
  then
    # Looks like a complete date
    true
  else
    # Probably just a year, e.g. for a film
    PUBDATE=$(date --date="1 Jan$DATE" --rfc-2822)
  fi

  echo "<item>"
  echo "<title>$SAFETTL</title>"
  echo "<link>$SAFEURL</link>"
  # The description is HTML, which RSS expects as text rather than as child
  # elements; CDATA lets the markup through without a second round of escaping
  echo "<description><![CDATA[<a href=\"$SAFEURL\">link</a>]]></description>"
  echo "<guid isPermaLink=\"true\">$SAFEURL</guid>"
  echo "<pubDate>$PUBDATE</pubDate>"
  echo "</item>"
}
138:
# Generated <item> elements are cached here, one file per programme, and
# looked up again by findCached on later runs
CACHEDIR="$HOME/.cache/iplayer_feeds"
mkdir -p "$CACHEDIR"

# Usage: iplayer TITLE URL
# Writes the feed for the listing at URL to stdout, dropping every character
# which is neither printable nor a newline
listToFeed "$1" "$2" |
  tr -c -d '[:print:]\n'
143: '';
144: }
Generated by git2html.