warbo-utilities: 488168a1fcb7f888d062305a7adcd5884b24b0e1

     1: { bash, coreutils, feed2maildirsimple, libxslt, mkBin, mu, openssl, procps
     2: , python3, raw, scripts, wget, wrap, writeScript, xidel, xmlstarlet }:
     3: with rec {
     4:   cleanUp = wrap {
     5:     name = "clean-up-news";
     6:     paths = [ bash mu xidel ];
     7:     script = ''
     8:       #!${bash}/bin/bash
     9: 
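              # stopMu: ask any running 'mu server' (e.g. mu4e's backend) to exit
              # by sending SIGINT, polling until it has gone; presumably this is
              # so nothing else holds the mu/Xapian database open while messages
              # are removed and re-indexed below.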
    10:       function stopMu {
    11:         while ps auxww | grep 'mu server' | grep -v grep
    12:         do
    13:           pkill -2 -u "$UID" -f 'mu server'
    14:           sleep 1
    15:         done
    16:       }
    17: 
    18:       # Gather up the filenames to remove, so we can delete them all at once
    19:       # without having to keep polling mu
    20:       REMOVALS=()
    21: 
    22:       # Some feeds are high-volume and only interesting for a short time. We
    23:       # clean up their articles 1 month after posting
    24:       for FEED in BBCHeadlines HackerNews XKCD SMBC
    25:       do
    26:         CUTOFF=$(date -d "last month" "+%s")
    27:         while read -r F
    28:         do
    29:           D=$(grep "^Date: " < "$F" | sed -e 's/^Date: //g')
    30:           SECS=$(date -d "$D" "+%s")
    31:           if [[ "$SECS" -lt "$CUTOFF" ]]
    32:           then
    33:             REMOVALS+=("$F")
    34:           fi
    35:         done < <(find "$HOME/Mail/feeds/$FEED/cur" -type f)
    36:       done
    37: 
    38:       # Some feeds may post content that originated a while ago, e.g. the BBC
    39:       # showing films from many years ago. If these are only available for a
    40:       # short time (like iPlayer posts) then we should delete those whose file
    41:       # modification time (rather than posted date) is older than a month
    42:       for FEED in iPlayerComedy iPlayerFilms iPlayerSciNat
    43:       do
    44:         while read -r F
    45:         do
    46:           REMOVALS+=("$F")
    47:         done < <(find "$HOME/Mail/feeds/$FEED/cur" -type f -mtime +30)
    48:       done
    49: 
    50:       # Trim these high-volume feeds to their ~100 newest read messages (at most
    51:       # 20 removed per run). Their RSS only has ~25 posts, so no dupes creep in.
    52:       for FEED in RedditHaskell RedditStallmanWasRight \
    53:                   ScienceBulletin BBCHeadlines HackerNews
    54:       do
    55:         while read -r F
    56:         do
    57:           REMOVALS+=("$F")
    58:         done < <(mu find --fields="l" --sortfield='d' --reverse \
    59:                          maildir:/feeds/"$FEED" not flag:unread |
    60:                  tail -n+100                                    |
    61:                  head -n20                                      )
    62:       done
    63: 
    64:       stopMu
    65:       for F in "''${REMOVALS[@]}"
    66:       do
    67:         rm "$F"
    68:         mu remove "$F"
    69:       done
    70: 
    71:       # Delete old BBC news content (since their RSS only has summaries)
    72:       find /tmp/bbcnews-cached -type f -mtime +30 -exec rm {} \;
    73: 
    74:       # Delete MeetUp events from the past. These may be posted well in advance,
    75:       # so we can't use the file's modification time.
    76:       for F in "$HOME/.cache/meetup"/*
    77:       do
    78:         if ! [[ -s "$F" ]]
    79:         then
    80:           # Delete empty files
    81:           rm -f "$F"
    82:           continue
    83:         fi
    84: 
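                # The cached MeetUp pages seem to store the event time as epoch
                # milliseconds in a <time dateTime="..."> attribute; hence the
                # division by 1000 further down.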
    85:         MILLIS=$(xidel - -s -e '//time/@dateTime' < "$F" | head -n1)
    86:         if [[ -z "$MILLIS" ]]
    87:         then
    88:           # Delete undated events
    89:           rm -f "$F"
    90:           continue
    91:         fi
    92: 
    93:         SECS=$(( MILLIS / 1000 ))
    94:         PAST=$(date -d 'now - 7 days' '+%s')
    95:         (( PAST < SECS )) || rm -f "$F"
    96:         unset MILLIS
    97:         unset SECS
    98:         unset PAST
    99:       done
   100: 
   101:       # Occasionally run a full index, to clean up old messages
   102:       stopMu
   103:       if [[ "$(( RANDOM % 100 ))" -eq 0 ]]
   104:       then
   105:         mu index --maildir="$HOME/Mail"
   106:       else
   107:         mu index --maildir="$HOME/Mail" --lazy-check
   108:       fi
   109:     '';
   110:   };
   111: 
   112:   convert = wrap {
   113:     name = "feeds2maildirs";
   114:     paths = [ (python3.withPackages (p: [ feed2maildirsimple ])) ];
   115:     script = ''
   116:       #!/usr/bin/env python
   117:       # coding: utf-8
   118: 
   119:       import hashlib
   120:       import os
   121:       import random
   122:       import sys
   123: 
   124:       from feed2maildir.converter import Converter
   125:       from feed2maildir.reader    import Reader
   126: 
   127:       def msg(x):
   128:         """Write a string (or the repr of anything else) to stderr."""
   129:         sys.stderr.write((x if isinstance(x, str) else repr(x)) + '\n')
   130:         sys.stderr.flush()
   131: 
   132:       home     = os.environ['HOME']
   133:       rssDir   = home + '/.cache/rss'
   134:       rssFiles = [f for f in os.listdir(rssDir) if f.lower().endswith('.rss')]
   135: 
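              # Each FOO.rss under ~/.cache/rss is converted into the Maildir at
              # ~/Mail/feeds/FOO, with a FOO.hash file recording the contents of
              # the last successfully-converted copy of the feed.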
   136:       for rssFile in rssFiles:
   137:         name    = rssFile[:-4]
   138:         maildir = home + '/Mail/feeds/' + name
   139: 
   140:         try:
   141:           with open(rssDir + '/' + rssFile, 'r') as f:
   142:             data = f.read()
   143:         except Exception as e:
   144:           msg({
   145:             'exception' : e,
   146:             'message'   : 'Failed to read file, skipping',
   147:             'rssFile'   : rssFile,
   148:           })
   149:           continue
   150: 
   151:         # Hash the .rss file to a .hash file and see if it's changed
   152:         hashFile = rssDir + '/' + name + '.hash'
   153:         lastHash = None
   154:         try:
   155:           with open(hashFile, 'r') as f:
   156:             lastHash = f.read().strip()
   157:         except:
   158:           pass
   159: 
   160:         hasher = hashlib.md5()
   161:         hasher.update(data.encode('utf-8'))  # md5 wants bytes, not str
   162:         newHash = hasher.hexdigest()
   163: 
   164:         # Skip most unchanged files; do a few at random to escape erroneous data
   165:         if lastHash == newHash and random.randint(0, len(rssFiles)) > 5:
   166:           msg('Skipping ' + rssFile + ' since its hash has not changed')
   167:           continue
   168: 
   169:         msg('Converting ' + rssFile + ' to Maildir')
   170:         try:
   171:           reader    = Reader(data)
   172:           converter = Converter(maildir, name, strip=True)
   173:           converter.load(reader.feed)
   174:           converter.run()
   175:           with open(hashFile, 'w') as f:
   176:             f.write(newHash)
   177:         except Exception as e:
   178:           msg({
   179:             'exception' : e,
   180:             'message'   : 'Skipping file due to exception in conversion',
   181:             'rssFile'   : rssFile,
   182:           })
   183:     '';
   184:   };
   185: 
   186:   fixRss = mkBin {
   187:     name = "fixRss";
   188:     paths = [ xmlstarlet ];
   189:     script = ''
   190:       #!${bash}/bin/bash
   191: 
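              # Reads a raw RSS document on stdin and writes a normalised version
              # to stdout, with "$1" (the feed name) used as each item's author.
              # Usage sketch (file names here are just illustrative):
              #
              #   fixRss SomeFeed < SomeFeed.raw > SomeFeed.rss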
   192:       # Append an author (the feed name) to each item, then keep only the first
   193:       xmlstarlet ed                         \
   194:         -s //item -t elem -n author         \
   195:         -v "$1"                             \
   196:         -d '//item/author[position() != 1]' |
   197: 
   198:         # Now that all items have an author, set them all to the feed name (to
   199:         # avoid special characters)
   200:         xmlstarlet ed -u "//author" -v "$1" |
   201: 
   202:         # Append today as the items' pubDate, then remove all but the first
   203:         # pubDate (i.e. append today as the pubDate, if none is given)
   204:         xmlstarlet ed                              \
   205:           -s //item -t elem -n pubDate             \
   206:           -v "$(date -d "today 00:00" --rfc-2822)" \
   207:           -d '//item/pubDate[position() != 1]'     |
   208: 
   209:         # Append a placeholder description, then remove all but the first one
   210:         xmlstarlet ed                              \
   211:           -s //item -t elem -n description         \
   212:           -v "No description given"                \
   213:           -d '//item/description[position() != 1]' |
   214: 
   215:         # Append a placeholder link, then remove all but the first link
   216:         xmlstarlet ed                       \
   217:           -s //item -t elem -n link         \
   218:           -v "http://example.com"           \
   219:           -d '//item/link[position() != 1]'
   220:     '';
   221:   };
   222: 
   223:   # Drop unprintable characters but keep newlines; tr expands the \n escape
   224:   # itself, so this snippet stays on one line when spliced into pipelines.
   225:   stripNonAscii = "tr -cd '[:print:]\\n'";
   226: 
   227:   get = "timeout 20 wget -O- -q --no-check-certificate";
   228: 
   229:   getRss = mkBin {
   230:     name = "getRss";
   231:     paths = [ coreutils fixRss wget ];
   232:     script = ''
   233:       #!${bash}/bin/bash
   234:       ${get} "$2" | ${stripNonAscii} | fixRss "$1" > "$1.rss"
   235:     '';
   236:   };
   237: 
   238:   getAtom = mkBin {
   239:     name = "getAtom";
   240:     paths = [ coreutils fixRss (libxslt.bin or libxslt) wget ];
   241:     vars = { xsl = raw."atom2rss-exslt.xsl"; };
   242:     script = ''
   243:       #!${bash}/bin/bash
   244:       ${get} "$2" | ${stripNonAscii} > "$1.atom"
   245:       xsltproc "$xsl" "$1.atom" |
   246:         fixRss "$1" > "$1.rss"
   247:     '';
   248:   };
   249: 
   250:   getYouTube = mkBin {
   251:     name = "getYouTube";
   252:     paths = [ getAtom ];
   253:     script = ''
   254:       #!${bash}/bin/bash
   255:       getAtom "$1" "http://www.youtube.com/feeds/videos.xml?channel_id=$2"
   256:     '';
   257:   };
   258: 
   259:   rss = wrap {
   260:     name = "pull_down_rss";
   261:     paths = [ bash getAtom getRss getYouTube ];
   262:     script = ''
   263:       #!${bash}/bin/bash
   264:       set -e
   265:       [[ -n "$1" ]] || fail "pull_down_rss needs an output directory"
   266:       [[ -e "$1" ]] || fail "Output dir '$1' not found"
   267:       cd "$1"
   268: 
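              # Each input line (the caller pipes ~/.feeds into this script) should
              # hold three tab-separated fields: TYPE, NAME and URL, where TYPE is
              # one of atom, iplayer, rss or youtube; for youtube entries the third
              # field is a channel ID rather than a URL (see getYouTube above).
              # Hypothetical example entries:
              #
              #   rss<TAB>SMBC<TAB>https://example.com/smbc.rss
              #   youtube<TAB>SomeChannel<TAB>SOME_CHANNEL_ID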
   269:       while read -r FEED
   270:       do
   271:         TYPE=$(echo "$FEED" | cut -f1)
   272:         NAME=$(echo "$FEED" | cut -f2)
   273:          URL=$(echo "$FEED" | cut -f3)
   274: 
   275:         echo "Getting feed $NAME" 1>&2
   276:         case "$TYPE" in
   277:           atom)
   278:             getAtom "$NAME" "$URL" || echo "Failed to get $NAME, skipping" 1>&2
   279:             ;;
   280:           iplayer)
   281:             iplayer_feed "$NAME" "$URL" > "$NAME.rss" ||
   282:               echo "Failed to get $NAME, skipping" 1>&2
   283:             ;;
   284:           rss)
   285:             getRss "$NAME" "$URL" || echo "Failed to get $NAME, skipping" 1>&2
   286:             ;;
   287:           youtube)
   288:             getYouTube "$NAME" "$URL" || echo "Failed to get $NAME, skipping" 1>&2
   289:             ;;
   290:           *)
   291:             echo "Can't handle '$FEED', skipping" 1>&2
   292:             ;;
   293:         esac
   294:       done
   295:     '';
   296:   };
   297: };
   298: 
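        # The derivation below is what this file evaluates to. The helper scripts
        # defined above are exposed to it as environment variables ($cleanUp,
        # $convert, $rss, $sysPing) via the 'vars' attribute, which is presumably
        # why shellcheck's SC2154 ("referenced but not assigned") is disabled.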
   299: wrap {
   300:   name = "get-news-start";
   301:   paths = [ bash mu procps ];
   302:   vars = {
   303:     inherit cleanUp convert rss;
   304:     inherit (scripts) sysPing;
   305:   };
   306:   script = ''
   307:     #!${bash}/bin/bash
   308:     set -e
   309: 
   310:     # Grabs RSS feeds and dumps them in ~/.cache, so all of our news is in one
   311:     # format and one place, ready to merge into our mail
   312:     # shellcheck disable=SC2154
   313:     if "$sysPing" -c 1 google.com
   314:     then
   315:       # Update all of our RSS files
   316:       mkdir -p ~/.cache/rss
   317:       cd ~/.cache/rss || fail "Couldn't cd to ~/.cache/rss"
   318: 
   319:       bbcnews > BBCHeadlines.rss ||
   320:         echo "Error getting BBC news, skipping" 1>&2
   321: 
   322:       # Our MeetUp scraper performs a query each time, so limit how often it
   323:       # runs (it does cache event details)
   324:       ACCEPTABLE_MEETUP=$(date -d 'now - 4 hours' '+%s')
   325:       if [[ -e meetup.rss ]]
   326:       then
   327:         LAST_MEETUP=$(date -r meetup.rss '+%s')
   328:       else
   329:         LAST_MEETUP=0
   330:       fi
   331: 
   332:       if (( LAST_MEETUP <= ACCEPTABLE_MEETUP ))
   333:       then
   334:         getmeetup > meetup.rss ||
   335:           echo "Error getting meetup events" 1>&2
   336:       fi
   337: 
   338:       # shellcheck disable=SC2154
   339:       "$rss" ~/.cache/rss < ~/.feeds
   340:     fi
   341: 
   342:     # Now convert our RSS feeds to Maildir
   343:     # shellcheck disable=SC2154
   344:     "$convert"
   345: 
   346:     echo "Cleaning up old news" 1>&2
   347:     # shellcheck disable=SC2154
   348:     "$cleanUp"
   349: 
   350:     # Re-index (after stopping any existing instance, e.g. the server for mu4e)
   351:     pkill -2 -u "$UID" mu
   352:     sleep 1
   353:     mu index --maildir="$HOME/Mail" --lazy-check
   354:   '';
   355: }

Generated by git2html.