warbo-utilities: 2bb96d6443b7da645ea241b53a68c45f06d1df5c

     1: { bash, wget, wrap, xidel }:
     2: 
     3: # A Bash script, wrapped under the name "iplayer", which scrapes a programme
     4: # listing page and prints it to stdout as an RSS 2.0 feed. Its first argument
     5: # is the title to give the feed, its second is the URL of the listing page.
     6: #
     7: # Generated <item> elements are cached in ~/.cache/iplayer_feeds, so the page
     8: # of each programme is only fetched when no cached item mentions its URL.
     9: wrap {
    10:   name = "iplayer";
    11:   paths = [ bash wget xidel ];
    12:   script = ''
    13:     #!${bash}/bin/bash
    14:     set -e
    15:     set -o pipefail
    16: 
    17:     function fail {
    18:       # Print the arguments to stderr as an error message, then exit with
    19:       # code 1. Bash turns off "set -e" inside command substitutions, so
    20:       # errors are propagated out of them explicitly, using the pattern
    21:       # X=$(...) || fail "message"
    22:       echo "$*" 1>&2
    23:       exit 1
    24:     }
    25: 
    26:     function fetchProgrammes {
    27:       # Print the URLs of the programmes found on the page at URL $1,
    28:       # followed by their titles. For example:
    29:       #
    30:       # http://programme-page-for-click
    31:       # http://programme-page-for-springwatch
    32:       #
    33:       # Click - 25th May 2016
    34:       # Springwatch 2016 Episode 3
    35: 
    36:       local XPATH="//a[contains(@class,\"content-item__link\")]"
    37: 
    38:       echo "Fetching '$1'" 1>&2
    39:       wget -q -O- "$1" |
    40:         xidel -s \
    41:           -e "$XPATH/resolve-uri(@href, \"$1\")" \
    42:           -e "$XPATH//div[contains(@class,\"content-item__title\")]/text()" -
    43:     }
    44: 
    45:     function formattedProgrammes {
    46:       # Print one line for each programme on the page at URL $1, made of the
    47:       # programme's URL, a tab, then its title. For example:
    48:       #
    49:       # http://programme-page-for-click<TAB>Click - 25th May 2016
    50:       # http://programme-page-for-springwatch<TAB>Springwatch 2016 Episode 3
    51: 
    52:       local OUTPUT URLS TTLS
    53:       OUTPUT=$(fetchProgrammes "$1") || fail "Couldn't fetch feed"
    54: 
    55:       # URLs are the lines starting with "http", titles are the other
    56:       # non-empty lines. grep fails when nothing matches; we carry on, so
    57:       # that assertSameLength can report the problem.
    58:       URLS=$(echo "$OUTPUT" | grep "^http") || true
    59:       TTLS=$(echo "$OUTPUT" | grep -v "^http" | grep "^.") || true
    60: 
    61:       assertSameLength "$URLS" "$TTLS"
    62: 
    63:       paste <(echo "$URLS") <(echo "$TTLS")
    64:     }
    65: 
    66:     function assertSameLength {
    67:       # Abort unless $1 and $2 contain the same number of lines, and there
    68:       # are more than 3 of them (any fewer is treated as a scraping failure)
    69:       local COUNT1 COUNT2
    70:       COUNT1=$(echo "$1" | wc -l)
    71:       COUNT2=$(echo "$2" | wc -l)
    72: 
    73:       echo "Got lists of '$COUNT1' and '$COUNT2' elements" 1>&2
    74:       [[ "$COUNT1" -eq "$COUNT2" ]] || {
    75:         # printf, unlike "echo -e", leaves backslashes in the lists alone
    76:         printf 'Found different length lists. First:\n%s\nSecond:\n%s\n' \
    77:                "$1" "$2" 1>&2
    78:         exit 2
    79:       }
    80:       [[ "$COUNT1" -gt 3 ]] ||
    81:         fail "Only found '$COUNT1' entries? Seems fishy, aborting."
    82:     }
    83: 
    84:     function listToFeed {
    85:       # Print an RSS 2.0 feed with title $1, containing one <item> for each
    86:       # programme listed on the page at URL $2.
    87:       local CHANNELTTL CHANNELURL FORMATTED LINE THISURL THISTTL
    88:       CHANNELTTL=$(echo "$1" | xmlEscape)
    89:       CHANNELURL=$(echo "$2" | xmlEscape)
    90: 
    91:       # Scrape the listing before printing anything, so that a failure here
    92:       # doesn't leave the start of a feed on stdout
    93:       FORMATTED=$(formattedProgrammes "$2") || fail "Couldn't format listing"
    94: 
    95:       echo '<rss version="2.0">'
    96:       echo   '<channel>'
    97:       echo     "<title>$CHANNELTTL</title>"
    98:       echo     "<link>$CHANNELURL</link>"
    99: 
   100:       # Each line is a URL, a tab, then a title (see formattedProgrammes)
   101:       while read -r LINE
   102:       do
   103:         THISURL=$(echo "$LINE" | cut -f 1)
   104:         THISTTL=$(echo "$LINE" | cut -f 2-)
   105:         writeItem "$THISURL" "$THISTTL"
   106:       done < <(echo "$FORMATTED")
   107: 
   108:       echo   '</channel>'
   109:       echo '</rss>'
   110:     }
   111: 
   112:     function xmlEscape {
   113:       # Copy stdin to stdout, replacing the characters & < > " with their
   114:       # XML entities. Ampersands are replaced first, so that those belonging
   115:       # to the other entities aren't escaped again. In a sed replacement a
   116:       # bare & stands for the matched text, hence the backslashes.
   117:       # Based on http://daemonforums.org/showthread.php?t=4054
   118:       sed -e 's~&~\&amp;~g' \
   119:           -e 's~<~\&lt;~g'  \
   120:           -e 's~>~\&gt;~g'  \
   121:           -e 's~"~\&quot;~g'
   122:     }
   123: 
   124:     function firstShown {
   125:       # Fetch the programme page at URL $1 and print the value of the
   126:       # "release_date_time" field found in it.
   127:       local PAGE EXTRACTED
   128: 
   129:       # Pause before every request (presumably to go easy on the server)
   130:       sleep 1
   131:       PAGE=$(wget -O- -q "$1") || fail "Couldn't fetch page $1"
   132:       EXTRACTED=$(echo "$PAGE" | grep -o '"release_date_time":"[^"]*"') ||
   133:         fail "Failed to extract first-shown date"
   134: 
   135:       # Drop the key (everything up to the first colon), then the quotes
   136:       echo "$EXTRACTED" | cut -d : -f 2- |
   137:                           sed -e 's/"//g'
   138:     }
   139: 
   140:     function findCached {
   141:       # Print the path of a file in $CACHEDIR which mentions URL $1, or an
   142:       # empty line if there is none. Names ending in ".rss" are skipped.
   143:       local SAFEURL MATCHES FOUND
   144: 
   145:       # Cached items contain the escaped form of the URL, so search for that
   146:       SAFEURL=$(echo "$1" | xmlEscape)
   147: 
   148:       # Both greps fail when they match nothing, which just means "no cache"
   149:       MATCHES=$(grep -rlF -e "$SAFEURL" "$CACHEDIR") || true
   150:       FOUND=$(echo "$MATCHES" | grep -v '\.rss$' | head -n1) || true
   151:       echo "$FOUND"
   152:     }
   153: 
   154:     function writeItem {
   155:       # Print the RSS <item> for the programme with URL $1 and title $2. A
   156:       # cached copy is used if there is one; otherwise the item is generated
   157:       # and added to the cache.
   158:       local CACHED HASH NAME FILE ITEM
   159:       CACHED=$(findCached "$1")
   160:       if [[ -n "$CACHED" ]]
   161:       then
   162:         cat "$CACHED"
   163:       else
   164:         # The cache file is named after the MD5 of the URL, followed by the
   165:         # letters of the title in lowercase
   166:         HASH=$(echo "$1" | md5sum | cut -d ' ' -f 1)
   167:         NAME=$(echo "$2" | tr '[:upper:]' '[:lower:]' | tr -dc '[:lower:]')
   168:         FILE="$HASH"_"$NAME".xml
   169: 
   170:         # Generate the whole item before touching the cache, so that a
   171:         # failure can't leave an empty or partial file behind
   172:         ITEM=$(writeItemReal "$1" "$2") || fail "Couldn't write item for '$1'"
   173:         echo "$ITEM" | tee "$CACHEDIR/$FILE"
   174:       fi
   175:     }
   176: 
   177:     function writeItemReal {
   178:       # Print an RSS <item> for the programme with URL $1 and title $2. The
   179:       # item's pubDate is taken from the programme's page (see firstShown).
   180:       local SAFEURL SAFETTL DATE PUBDATE
   181:       echo "Writing item for '$1' '$2'" 1>&2
   182:       SAFEURL=$(echo "$1" | xmlEscape)
   183:       SAFETTL=$(echo "$2" | xmlEscape)
   184: 
   185:       DATE=$(firstShown "$1") || fail "Couldn't find date for '$1'"
   186:       echo "Got date '$DATE'" 1>&2
   187: 
   188:       # Try the date as it is. If that's rejected then it's probably just a
   189:       # year, e.g. for a film, so fall back to the 1st of January.
   190:       PUBDATE=$(date --date="$DATE" --rfc-2822)       ||
   191:       PUBDATE=$(date --date="1 Jan $DATE" --rfc-2822) ||
   192:         fail "Couldn't parse date '$DATE'"
   193: 
   194:       echo "<item>"
   195:       echo   "<title>$SAFETTL</title>"
   196:       echo   "<link>$SAFEURL</link>"
   197:       echo   "<description><a href=\"$SAFEURL\">link</a></description>"
   198:       echo   "<guid isPermaLink=\"true\">$SAFEURL</guid>"
   199:       echo   "<pubDate>$PUBDATE</pubDate>"
   200:       echo "</item>"
   201:     }
   202: 
   203:     [[ "$#" -ge 2 ]] || fail "Usage: iplayer FEED_TITLE LISTING_URL"
   204: 
   205:     CACHEDIR="$HOME/.cache/iplayer_feeds"
   206:     mkdir -p "$CACHEDIR"
   207: 
   208:     # Only printable characters and newlines are let through to stdout
   209:     listToFeed "$1" "$2" | tr -cd '[:print:]\n'
   210:   '';
   211: }

Generated by git2html.