warbo-utilities: 488168a1fcb7f888d062305a7adcd5884b24b0e1

     1: { bash, coreutils, feed2maildirsimple, libxslt, mkBin, mu, openssl, procps
     2: , python3, raw, scripts, wget, wrap, writeScript, xidel, xmlstarlet }:
     3: with rec {
     4:   cleanUp = wrap {
     5:     name = "clean-up-news";
     6:     paths = [ bash mu xidel ];
     7:     script = ''
     8:       #!${bash}/bin/bash
     9: 
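              # stopMu: ask any running 'mu server' (e.g. mu4e's backend) to exit
              # by sending SIGINT, polling until it has gone; presumably this is
              # so nothing else holds the mu/Xapian database open while messages
              # are removed and re-indexed below.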
    10:       function stopMu {
    11:         while ps auxww | grep 'mu server' | grep -v grep
    12:         do
    13:           pkill -2 -u "$UID" -f 'mu server'
    14:           sleep 1
    15:         done
    16:       }
    17: 
    18:       # Gather up the filenames to remove, so we can delete them all at once
    19:       # without having to keep polling mu
    20:       REMOVALS=()
    21: 
    22:       # Some feeds are high-volume and only interesting for a short time. We
    23:       # clean up their articles 1 month after posting
    24:       for FEED in BBCHeadlines HackerNews XKCD SMBC
    25:       do
    26:         CUTOFF=$(date -d "last month" "+%s")
    27:         while read -r F
    28:         do
    29:           D=$(grep "^Date: " < "$F" | sed -e 's/^Date: //g')
    30:           SECS=$(date -d "$D" "+%s")
    31:           if [[ "$SECS" -lt "$CUTOFF" ]]
    32:           then
    33:             REMOVALS+=("$F")
    34:           fi
    35:         done < <(find "$HOME/Mail/feeds/$FEED/cur" -type f)
    36:       done
    37: 
    38:       # Some feeds may post content that originated a while ago, e.g. the BBC
    39:       # showing films from many years ago. If these are only available for a
    40:       # short time (like iPlayer posts) then we should delete those whose file
    41:       # modification time (rather than posted date) is older than a month
    42:       for FEED in iPlayerComedy iPlayerFilms iPlayerSciNat
    43:       do
    44:         while read -r F
    45:         do
    46:           REMOVALS+=("$F")
    47:         done < <(find "$HOME/Mail/feeds/$FEED/cur" -type f -mtime +30)
    48:       done
    49: 
    50:       # Trim these high-volume feeds to their ~100 newest read messages (at most
    51:       # 20 removed per run). Their RSS only has ~25 posts, so no dupes creep in.
    52:       for FEED in RedditHaskell RedditStallmanWasRight \
    53:                   ScienceBulletin BBCHeadlines HackerNews
    54:       do
    55:         while read -r F
    56:         do
    57:           REMOVALS+=("$F")
    58:         done < <(mu find --fields="l" --sortfield='d' --reverse \
    59:                          maildir:/feeds/"$FEED" not flag:unread |
    60:                  tail -n+100                                    |
    61:                  head -n20                                      )
    62:       done
    63: 
    64:       stopMu
    65:       for F in "''${REMOVALS[@]}"
    66:       do
    67:         rm "$F"
    68:         mu remove "$F"
    69:       done
    70: 
    71:       # Delete old BBC news content (since their RSS only has summaries)
    72:       find /tmp/bbcnews-cached -type f -mtime +30 -exec rm {} \;
    73: 
    74:       # Delete MeetUp events from the past. These may be posted well in advance,
    75:       # so we can't use the file's modification time.
    76:       for F in "$HOME/.cache/meetup"/*
    77:       do
    78:         if ! [[ -s "$F" ]]
    79:         then
    80:           # Delete empty files
    81:           rm -f "$F"
    82:           continue
    83:         fi
    84: 
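                # The cached MeetUp pages seem to store the event time as epoch
                # milliseconds in a <time dateTime="..."> attribute; hence the
                # division by 1000 further down.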
    85:         MILLIS=$(xidel - -s -e '//time/@dateTime' < "$F" | head -n1)
    86:         if [[ -z "$MILLIS" ]]
    87:         then
    88:           # Delete undated events
    89:           rm -f "$F"
    90:           continue
    91:         fi
    92: 
    93:         SECS=$(( MILLIS / 1000 ))
    94:         PAST=$(date -d 'now - 7 days' '+%s')
    95:         (( PAST < SECS )) || rm -f "$F"
    96:         unset MILLIS
    97:         unset SECS
    98:         unset PAST
    99:       done
   100: 
   101:       # Occasionally run a full index, to clean up old messages
   102:       stopMu
   103:       if [[ "$(( RANDOM % 100 ))" -eq 0 ]]
   104:       then
   105:         mu index --maildir="$HOME/Mail"
   106:       else
   107:         mu index --maildir="$HOME/Mail" --lazy-check
   108:       fi
   109:     '';
   110:   };
   111: 
   112:   convert = wrap {
   113:     name = "feeds2maildirs";
   114:     paths = [ (python3.withPackages (p: [ feed2maildirsimple ])) ];
   115:     script = ''
   116:       #!/usr/bin/env python
   117:       # coding: utf-8
   118: 
   119:       import hashlib
   120:       import os
   121:       import random
   122:       import sys
   123: 
   124:       from feed2maildir.converter import Converter
   125:       from feed2maildir.reader    import Reader
   126: 
   127:       def msg(x):
   128:         """Write a string (or the repr of anything else) to stderr."""
   129:         sys.stderr.write((x if isinstance(x, str) else repr(x)) + '\n')
   130:         sys.stderr.flush()
   131: 
   132:       home     = os.environ['HOME']
   133:       rssDir   = home + '/.cache/rss'
   134:       rssFiles = [f for f in os.listdir(rssDir) if f.lower().endswith('.rss')]
   135: 
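              # Each FOO.rss under ~/.cache/rss is converted into the Maildir at
              # ~/Mail/feeds/FOO, with a FOO.hash file recording the contents of
              # the last successfully-converted copy of the feed.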
   136:       for rssFile in rssFiles:
   137:         name    = rssFile[:-4]
   138:         maildir = home + '/Mail/feeds/' + name
   139: 
   140:         try:
   141:           with open(rssDir + '/' + rssFile, 'r') as f:
   142:             data = f.read()
   143:         except Exception as e:
   144:           msg({
   145:             'exception' : e,
   146:             'message'   : 'Failed to read file, skipping',
   147:             'rssFile'   : rssFile,
   148:           })
   149:           continue
   150: 
   151:         # Hash the .rss file to a .hash file and see if it's changed
   152:         hashFile = rssDir + '/' + name + '.hash'
   153:         lastHash = None
   154:         try:
   155:           with open(hashFile, 'r') as f:
   156:             lastHash = f.read().strip()
   157:         except:
   158:           pass
   159: 
   160:         hasher = hashlib.md5()
   161:         hasher.update(data.encode('utf-8'))  # md5 wants bytes, not str
   162:         newHash = hasher.hexdigest()
   163: 
   164:         # Skip most unchanged files; do a few at random to escape erroneous data
   165:         if lastHash == newHash and random.randint(0, len(rssFiles)) > 5:
   166:           msg('Skipping ' + rssFile + ' since its hash has not changed')
   167:           continue
   168: 
   169:         msg('Converting ' + rssFile + ' to Maildir')
   170:         try:
   171:           reader    = Reader(data)
   172:           converter = Converter(maildir, name, strip=True)
   173:           converter.load(reader.feed)
   174:           converter.run()
   175:           with open(hashFile, 'w') as f:
   176:             f.write(newHash)
   177:         except Exception as e:
   178:           msg({
   179:             'exception' : e,
   180:             'message'   : 'Skipping file due to exception in conversion',
   181:             'rssFile'   : rssFile,
   182:           })
   183:     '';
   184:   };
   185: 
   186:   fixRss = mkBin {
   187:     name = "fixRss";
   188:     paths = [ xmlstarlet ];
   189:     script = ''
   190:       #!${bash}/bin/bash
   191: 
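              # Reads a raw RSS document on stdin and writes a normalised version
              # to stdout, with "$1" (the feed name) used as each item's author.
              # Usage sketch (file names here are just illustrative):
              #
              #   fixRss SomeFeed < SomeFeed.raw > SomeFeed.rss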
   192:       # Append an author (the feed name) to each item, then keep only the first
   193:       xmlstarlet ed                         \
   194:         -s //item -t elem -n author         \
   195:         -v "$1"                             \
   196:         -d '//item/author[position() != 1]' |
   197: 
   198:         # Now that all items have an author, set them all to the feed name (to
   199:         # avoid special characters)
   200:         xmlstarlet ed -u "//author" -v "$1" |
   201: 
   202:         # Append today as the items' pubDate, then remove all but the first
   203:         # pubDate (i.e. append today as the pubDate, if none is given)
   204:         xmlstarlet ed                              \
   205:           -s //item -t elem -n pubDate             \
   206:           -v "$(date -d "today 00:00" --rfc-2822)" \
   207:           -d '//item/pubDate[position() != 1]'     |
   208: 
   209:         # Append a placeholder description, then remove all but the first one
   210:         xmlstarlet ed                              \
   211:           -s //item -t elem -n description         \
   212:           -v "No description given"                \
   213:           -d '//item/description[position() != 1]' |
   214: 
   215:         # Append a placeholder link, then remove all but the first link
   216:         xmlstarlet ed                       \
   217:           -s //item -t elem -n link         \
   218:           -v "http://example.com"           \
   219:           -d '//item/link[position() != 1]'
   220:     '';
   221:   };
   222: 
   223:   # Drop unprintable characters but keep newlines; tr expands the \n escape
   224:   # itself, so this snippet stays on one line when spliced into pipelines.
   225:   stripNonAscii = "tr -cd '[:print:]\\n'";
   226: 
   227:   get = "timeout 20 wget -O- -q --no-check-certificate";
   228: 
   229:   getRss = mkBin {
   230:     name = "getRss";
   231:     paths = [ coreutils fixRss wget ];
   232:     script = ''
   233:       #!${bash}/bin/bash
   234:       ${get} "$2" | ${stripNonAscii} | fixRss "$1" > "$1.rss"
   235:     '';
   236:   };
   237: 
   238:   getAtom = mkBin {
   239:     name = "getAtom";
   240:     paths = [ coreutils fixRss (libxslt.bin or libxslt) wget ];
   241:     vars = { xsl = raw."atom2rss-exslt.xsl"; };
   242:     script = ''
   243:       #!${bash}/bin/bash
   244:       ${get} "$2" | ${stripNonAscii} > "$1.atom"
   245:       xsltproc "$xsl" "$1.atom" |
   246:         fixRss "$1" > "$1.rss"
   247:     '';
   248:   };
   249: 
   250:   getYouTube = mkBin {
   251:     name = "getYouTube";
   252:     paths = [ getAtom ];
   253:     script = ''
   254:       #!${bash}/bin/bash
   255:       getAtom "$1" "http://www.youtube.com/feeds/videos.xml?channel_id=$2"
   256:     '';
   257:   };
   258: 
   259:   rss = wrap {
   260:     name = "pull_down_rss";
   261:     paths = [ bash getAtom getRss getYouTube ];
   262:     script = ''
   263:       #!${bash}/bin/bash
   264:       set -e
   265:       [[ -n "$1" ]] || fail "pull_down_rss needs an output directory"
   266:       [[ -e "$1" ]] || fail "Output dir '$1' not found"
   267:       cd "$1"
   268: 
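              # Each input line (the caller pipes ~/.feeds into this script) should
              # hold three tab-separated fields: TYPE, NAME and URL, where TYPE is
              # one of atom, iplayer, rss or youtube; for youtube entries the third
              # field is a channel ID rather than a URL (see getYouTube above).
              # Hypothetical example entries:
              #
              #   rss<TAB>SMBC<TAB>https://example.com/smbc.rss
              #   youtube<TAB>SomeChannel<TAB>SOME_CHANNEL_ID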
   269:       while read -r FEED
   270:       do
   271:         TYPE=$(echo "$FEED" | cut -f1)
   272:         NAME=$(echo "$FEED" | cut -f2)
   273:          URL=$(echo "$FEED" | cut -f3)
   274: 
   275:         echo "Getting feed $NAME" 1>&2
   276:         case "$TYPE" in
   277:           atom)
   278:             getAtom "$NAME" "$URL" || echo "Failed to get $NAME, skipping" 1>&2
   279:             ;;
   280:           iplayer)
   281:             iplayer_feed "$NAME" "$URL" > "$NAME.rss" ||
   282:               echo "Failed to get $NAME, skipping" 1>&2
   283:             ;;
   284:           rss)
   285:             getRss "$NAME" "$URL" || echo "Failed to get $NAME, skipping" 1>&2
   286:             ;;
   287:           youtube)
   288:             getYouTube "$NAME" "$URL" || echo "Failed to get $NAME, skipping" 1>&2
   289:             ;;
   290:           *)
   291:             echo "Can't handle '$FEED', skipping" 1>&2
   292:             ;;
   293:         esac
   294:       done
   295:     '';
   296:   };
   297: };
   298: 
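        # The derivation below is what this file evaluates to. The helper scripts
        # defined above are exposed to it as environment variables ($cleanUp,
        # $convert, $rss, $sysPing) via the 'vars' attribute, which is presumably
        # why shellcheck's SC2154 ("referenced but not assigned") is disabled.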
   299: wrap {
   300:   name = "get-news-start";
   301:   paths = [ bash mu procps ];
   302:   vars = {
   303:     inherit cleanUp convert rss;
   304:     inherit (scripts) sysPing;
   305:   };
   306:   script = ''
   307:     #!${bash}/bin/bash
   308:     set -e
   309: 
   310:     # Grabs RSS feeds and dumps them in ~/.cache, so all of our news is in one
   311:     # format and one place, ready to merge into our mail
   312:     # shellcheck disable=SC2154
   313:     if "$sysPing" -c 1 google.com
   314:     then
   315:       # Update all of our RSS files
   316:       mkdir -p ~/.cache/rss
   317:       cd ~/.cache/rss || fail "Couldn't cd to ~/.cache/rss"
   318: 
   319:       bbcnews > BBCHeadlines.rss ||
   320:         echo "Error getting BBC news, skipping" 1>&2
   321: 
   322:       # Our MeetUp scraper performs a query each time, so limit how often it
   323:       # runs (it does cache event details)
   324:       ACCEPTABLE_MEETUP=$(date -d 'now - 4 hours' '+%s')
   325:       if [[ -e meetup.rss ]]
   326:       then
   327:         LAST_MEETUP=$(date -r meetup.rss '+%s')
   328:       else
   329:         LAST_MEETUP=0
   330:       fi
   331: 
   332:       if (( LAST_MEETUP <= ACCEPTABLE_MEETUP ))
   333:       then
   334:         getmeetup > meetup.rss ||
   335:           echo "Error getting meetup events" 1>&2
   336:       fi
   337: 
   338:       # shellcheck disable=SC2154
   339:       "$rss" ~/.cache/rss < ~/.feeds
   340:     fi
   341: 
   342:     # Now convert our RSS feeds to Maildir
   343:     # shellcheck disable=SC2154
   344:     "$convert"
   345: 
   346:     echo "Cleaning up old news" 1>&2
   347:     # shellcheck disable=SC2154
   348:     "$cleanUp"
   349: 
   350:     # Re-index (after stopping any existing instance, e.g. the server for mu4e)
   351:     pkill -2 -u "$UID" mu
   352:     sleep 1
   353:     mu index --maildir="$HOME/Mail" --lazy-check
   354:   '';
   355: }

Generated by git2html.