warbo-utilities: 2077ef89852f855cdfa3f43b9a3d4891ba889bb4

     1: {
     2:   bash,
     3:   cacert,
     4:   curl,
     5:   fail,
     6:   html2text,
     7:   python3,
     8:   raw,
     9:   runCommand,
    10:   wget,
    11:   withDeps,
    12:   wrap,
    13:   xidel,
    14:   xmlstarlet,
    15: }:
    16: 
    17: with builtins;
    18: with rec {
    19:   # Wrapper around getBBCContent.py; bbcnews pipes the filtered feed into it.
    20:   getContent = wrap {
    21:     file = raw."getBBCContent.py";
    22:     name = "getBBCContent.py";
    23:     # html2text plus a Python interpreter bundling the three listed packages.
    24:     paths = [
    25:       html2text
    26:       (python3.withPackages (pyPkgs: [
    27:         pyPkgs.beautifulsoup4
    28:         pyPkgs.feedparser
    29:         pyPkgs.PyRSS2Gen
    30:       ]))
    31:     ];
    32:     vars.SSL_CERT_FILE = "${cacert}/etc/ssl/certs/ca-bundle.crt";
    33:   };
    34: 
    35:   bbcnews = wrap { # Script that downloads the BBC News UK RSS feed, deletes items from unwanted sections and pipes the result into getContent
    36:     name = "bbcnews";
    37:     paths = [ # The script calls xmlstarlet and wget by bare name; presumably wrap puts these on PATH -- confirm
    38:       bash
    39:       xmlstarlet
    40:       wget
    41:     ];
    42:     vars = {
    43:       inherit getContent; # Read by the script as "$getContent"; presumably exported as an environment variable by wrap -- confirm
    44:     };
    45:     script = ''
    46:       #!${bash}/bin/bash
    47:       set -e
    48: 
    49:       echo "Fetching BBC News" 1>&2
    50: 
    51:       function stripCrap {
    52:         # Remove item elements whose guid url contains the given text
    53:         xmlstarlet ed -d "//guid[contains(text(),'$1')]/.."
    54:       }
    55: 
    56:       # shellcheck disable=SC2154
    57:       wget -q -O- "http://feeds.bbci.co.uk/news/rss.xml?edition=uk" |
    58:         stripCrap '/sport/'                                         |
    59:         stripCrap '/news/magazine-'                                 |
    60:         stripCrap '/news/entertainment-arts'                        |
    61:         stripCrap '/news/in-pictures'                               |
    62:         stripCrap '/news/av/'                                       |
    63:         "$getContent"
    64:     ''; # NOTE(review): the script sets `set -e` but not `pipefail`, so only the exit status of the last pipeline stage ("$getContent") is checked; confirm that ignoring wget/xmlstarlet failures is intended
    65:   };
    66: 
    67:   tests = attrValues { # List of the test derivations below; handed to withDeps together with bbcnews at the end of the file
    68:     getContent = # Runs the wrapped getContent script with RUN_TESTS and HTML_EXAMPLE set; how the script reacts to them is defined in getBBCContent.py, not visible here
    69:       runCommand "test-get-content"
    70:         {
    71:           inherit getContent;
    72:           buildInputs = [
    73:             fail
    74:             xidel
    75:           ];
    76:           HTML_EXAMPLE = raw."bbcExamplePage.html.gz";
    77:           RUN_TESTS = "1";
    78:         }
    79:         ''
    80:           "$getContent"
    81:           mkdir "$out"
    82:         ''; # The build succeeds only if "$getContent" exits zero and the output directory is then created
    83: 
    84:     noSport = # Fails if any guid line of bbcnews' output contains '/sport/'; succeeds without checking when curl cannot fetch www.bbc.co.uk
    85:       runCommand "no-sport-test"
    86:         {
    87:           inherit bbcnews;
    88:           buildInputs = [ curl ]; # curl is only used for the connectivity probe at the start of the script
    89:         }
    90:         ''
    91:           set -e
    92: 
    93:           if curl -s "http://www.bbc.co.uk" > /dev/null
    94:           then
    95:             echo "Looks like we're online..." 1>&2
    96:           else
    97:             echo "Not online, skipping test" 1>&2
    98:             mkdir "$out"
    99:             exit 0
   100:           fi
   101: 
   102:           if "$bbcnews" | grep guid | grep '/sport/'
   103:           then
   104:             echo "Didn't filter out sport" 1>&2
   105:             exit 1
   106:           fi
   107: 
   108:           echo "Sport was filtered out correctly" 1>&2
   109:           mkdir "$out"
   110:         '';
   111:   };
   112: };
   113: withDeps tests bbcnews # Value of the whole expression: withDeps applied to the test list and the bbcnews script; presumably yields bbcnews with the tests as extra build dependencies -- confirm against withDeps
Generated by git2html.