warbo-utilities: 37edf6f3c90459ee8d3ab07dbe86da2768466dc1
{
  bash,
  coreutils,
  feed2maildirsimple,
  libxslt,
  mkBin,
  mu,
  openssl,
  procps,
  python3,
  raw,
  scripts,
  wget,
  wrap,
  writeScript,
  xidel,
  xmlstarlet,
}:
with rec {
  cleanUp = wrap {
    name = "clean-up-news";
    paths = [
      bash
      mu
      xidel
    ];
    script = ''
      #!${bash}/bin/bash

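      # Stop any running 'mu server' (e.g. started by mu4e) by sending SIGINT
      # until no matching processes remain, so that the 'mu remove' and
      # 'mu index' calls below do not clash with it.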
      function stopMu {
        while ps auxww | grep 'mu server' | grep -v grep | grep 'server'
        do
          pkill -2 -u "$UID" -f 'mu server'
          sleep 1
        done
      }

      # Gather up the filenames to remove, so we can delete them all at once
      # without having to keep polling mu
      REMOVALS=()

      # Some feeds are high-volume and only interesting for a short time. We
      # clean up their articles 1 month after posting
      for FEED in BBCHeadlines HackerNews XKCD SMBC
      do
        CUTOFF=$(date -d "last month" "+%s")
        while read -r F
        do
          D=$(grep "^Date: " < "$F" | sed -e 's/^Date: //g')
          SECS=$(date -d "$D" "+%s")
          if [[ "$SECS" -lt "$CUTOFF" ]]
          then
            REMOVALS+=("$F")
          fi
        done < <(find "$HOME/Mail/feeds/$FEED/cur" -type f)
      done

      # Some feeds may post content that originated a while ago, e.g. the BBC
      # showing films from many years ago. If these are only available for a
      # short time (like iPlayer posts) then we should delete those whose file
      # modification time (rather than posted date) is older than a month
      for FEED in iPlayerComedy iPlayerFilms iPlayerSciNat
      do
        while read -r F
        do
          REMOVALS+=("$F")
        done < <(find "$HOME/Mail/feeds/$FEED/cur" -type f -mtime +30)
      done

      # Limit Reddit feeds to 100 messages. They only include about the latest
      # 25 posts, so we shouldn't get any dupes creeping in.
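      # ('mu find' lists the already-read messages newest-first, so we skip the
      # first 99 and queue up to 20 of the older ones for removal on each run.)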
      for FEED in RedditHaskell RedditStallmanWasRight \
                  ScienceBulletin BBCHeadlines HackerNews
      do
        while read -r F
        do
          REMOVALS+=("$F")
        done < <(mu find --fields="l" --sortfield='d' --reverse \
                         maildir:/feeds/"$FEED" not flag:unread |
                 tail -n+100 |
                 head -n20)
      done

      stopMu
      for F in "''${REMOVALS[@]}"
      do
        rm "$F"
        mu remove "$F"
      done

      # Delete old BBC news content (since their RSS only has summaries)
      find /tmp/bbcnews-cached -type f -mtime +30 -exec rm {} \;

      # Delete MeetUp events from the past. These may be posted well in advance,
      # so we can't use the file's modification time.
      for F in "$HOME/.cache/meetup"/*
      do
        if ! [[ -s "$F" ]]
        then
          # Delete empty files
          rm -f "$F"
          continue
        fi

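        # The cached event's <time dateTime="..."> attribute holds a
        # millisecond timestamp (hence the division by 1000 below); keep the
        # file only while that date is less than a week in the past.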
        MILLIS=$(xidel - -s -e '//time/@dateTime' < "$F" | head -n1)
        if [[ -z "$MILLIS" ]]
        then
          # Delete undated events
          rm -f "$F"
          continue
        fi

        SECS=$(( MILLIS / 1000 ))
        PAST=$(date -d 'now - 7 days' '+%s')
        (( PAST < SECS )) || rm -f "$F"
        unset MILLIS
        unset SECS
        unset PAST
      done

      # Occasionally run a full index, to clean up old messages
      stopMu
      if [[ "$(( RANDOM % 100 ))" -eq 0 ]]
      then
        mu index --maildir="$HOME/Mail"
      else
        mu index --maildir="$HOME/Mail" --lazy-check
      fi
    '';
  };

  convert = wrap {
    name = "feeds2maildirs";
    paths = [ (python3.withPackages (p: [ feed2maildirsimple ])) ];
    script = ''
      #!/usr/bin/env python
      # coding: utf-8
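      # Convert each ~/.cache/rss/*.rss file into a Maildir under
      # ~/Mail/feeds/<name>, usually skipping feeds whose content hash has not
      # changed since the last run.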

      import hashlib
      import os
      import random
      import sys

      from feed2maildir.converter import Converter
      from feed2maildir.reader import Reader

      msg = lambda x: (sys.stderr.write(x if type(x) == type("") \
                                        else repr(x) + '\n'),
                       sys.stderr.flush(),
                       None)[-1]

      home = os.environ['HOME']
      rssDir = home + '/.cache/rss'
      rssFiles = [f for f in os.listdir(rssDir) if f.lower().endswith('.rss')]

      for rssFile in rssFiles:
          name = rssFile[:-4]
          maildir = home + '/Mail/feeds/' + name

          try:
              with open(rssDir + '/' + rssFile, 'r') as f:
                  data = f.read()
          except Exception as e:
              msg({
                  'exception' : e,
                  'message' : 'Failed to read file, skipping',
                  'rssFile' : rssFile,
              })
              continue

          # Hash the .rss file to a .hash file and see if it's changed
          hashFile = rssDir + '/' + name + '.hash'
          lastHash = None
          try:
              with open(hashFile, 'r') as f:
                  lastHash = f.read().strip()
          except:
              pass

          hasher = hashlib.md5()
          hasher.update(data.encode('utf-8'))
          newHash = hasher.hexdigest()

          # Skip most unchanged files; do a few at random to escape erroneous data
          if lastHash == newHash and random.randint(0, len(rssFiles)) > 5:
              msg('Skipping ' + rssFile + ' since its hash has not changed')
              continue

          msg('Converting ' + rssFile + ' to Maildir\n')
          try:
              reader = Reader(data)
              converter = Converter(maildir, name, strip=True)
              converter.load(reader.feed)
              converter.run()
              with open(hashFile, 'w') as f:
                  f.write(newHash)
          except Exception as e:
              msg({
                  'exception' : e,
                  'message' : 'Skipping file due to exception in conversion',
                  'rssFile' : rssFile,
              })
    '';
  };

  fixRss = mkBin {
    name = "fixRss";
    paths = [ xmlstarlet ];
    script = ''
      #!${bash}/bin/bash

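      # Normalise an RSS document: read it from stdin and write the result to
      # stdout. "$1" is the feed name, which becomes each item's author; items
      # missing a pubDate, description or link get placeholder values.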
      # Append an author to each item, using the feed name
      xmlstarlet ed \
        -s //item -t elem -n author \
        -v "$1" \
        -d '//item/author[position() != 1]' |

      # Now that all items have an author, set them all to the feed name (to
      # avoid special characters)
      xmlstarlet ed -u "//author" -v "$1" |

      # Append today as the items' pubDate, then remove all but the first
      # pubDate (i.e. append today as the pubDate, if none is given)
      xmlstarlet ed \
        -s //item -t elem -n pubDate \
        -v "$(date -d "today 00:00" --rfc-2822)" \
        -d '//item/pubDate[position() != 1]' |

      # Append a placeholder description, then remove all but the first description
      xmlstarlet ed \
        -s //item -t elem -n description \
        -v "No description given" \
        -d '//item/description[position() != 1]' |

      # Append a placeholder link, then remove all but the first link
      xmlstarlet ed \
        -s //item -t elem -n link \
        -v "http://example.com" \
        -d '//item/link[position() != 1]'
    '';
  };

  stripNonAscii = "tr -cd '[:print:]\\n'";

  get = "timeout 20 wget -O- -q --no-check-certificate";

  getRss = mkBin {
    name = "getRss";
    paths = [
      coreutils
      fixRss
      wget
    ];
    script = ''
      #!${bash}/bin/bash
      ${get} "$2" | ${stripNonAscii} | fixRss "$1" > "$1.rss"
    '';
  };

  getAtom = mkBin {
    name = "getAtom";
    paths = [
      coreutils
      fixRss
      (libxslt.bin or libxslt)
      wget
    ];
    vars = {
      xsl = raw."atom2rss-exslt.xsl";
    };
    script = ''
      #!${bash}/bin/bash
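      # Fetch an Atom feed ("$2"), convert it to RSS with the bundled XSL
      # stylesheet ($xsl), then normalise it with fixRss; "$1" is the feed name.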
      ${get} "$2" | ${stripNonAscii} > "$1.atom"
      xsltproc "$xsl" "$1.atom" |
        fixRss "$1" > "$1.rss"
    '';
  };

  getYouTube = mkBin {
    name = "getYouTube";
    paths = [ getAtom ];
    script = ''
      #!${bash}/bin/bash
      getAtom "$1" "http://www.youtube.com/feeds/videos.xml?channel_id=$2"
    '';
  };

  rss = wrap {
    name = "pull_down_rss";
    paths = [
      bash
      getAtom
      getRss
      getYouTube
    ];
    script = ''
      #!${bash}/bin/bash
      set -e
      [[ -n "$1" ]] || fail "pull_down_rss needs an output directory"
      [[ -e "$1" ]] || fail "Output dir '$1' not found"
      cd "$1"

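      # Each line of stdin describes a feed as tab-separated fields (the
      # default 'cut' delimiter): type, name, URL. An illustrative (not real)
      # entry:
      #
      #   rss <TAB> ExampleFeed <TAB> https://example.com/feed.rss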
      while read -r FEED
      do
        TYPE=$(echo "$FEED" | cut -f1)
        NAME=$(echo "$FEED" | cut -f2)
        URL=$(echo "$FEED" | cut -f3)

        echo "Getting feed $NAME" 1>&2
        case "$TYPE" in
          atom)
            getAtom "$NAME" "$URL" || echo "Failed to get $NAME, skipping"
            ;;
          iplayer)
            iplayer_feed "$NAME" "$URL" > "$NAME.rss" ||
              echo "Failed to get $NAME, skipping"
            ;;
          rss)
            getRss "$NAME" "$URL" || echo "Failed to get $NAME, skipping"
            ;;
          youtube)
            getYouTube "$NAME" "$URL" || echo "Failed to get $NAME, skipping"
            ;;
          *)
            echo "Can't handle '$FEED', skipping" 1>&2
            ;;
        esac
      done
    '';
  };
};

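# The script actually produced by this file: fetch all feeds, convert them to
# Maildir, clean up old messages, then re-index with mu.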
wrap {
  name = "get-news-start";
  paths = [
    bash
    mu
    procps
  ];
  vars = {
    inherit cleanUp convert rss;
    inherit (scripts) sysPing;
  };
  script = ''
    #!${bash}/bin/bash
    set -e

    # Grabs RSS feeds and dumps them in ~/.cache, so all of our news is in one
    # format and one place, ready to merge into our mail
    # shellcheck disable=SC2154
    if "$sysPing" -c 1 google.com
    then
      # Update all of our RSS files
      mkdir -p ~/.cache/rss
      cd ~/.cache/rss || fail "Couldn't cd to ~/.cache/rss"

      bbcnews > BBCHeadlines.rss ||
        echo "Error getting BBC news, skipping" 1>&2

      # Our MeetUp scraper performs a query each time, so limit how often it
      # runs (it does cache event details)
      ACCEPTABLE_MEETUP=$(date -d 'now - 4 hours' '+%s')
      if [[ -e meetup.rss ]]
      then
        LAST_MEETUP=$(date -r meetup.rss '+%s')
      else
        LAST_MEETUP=0
      fi

      if (( LAST_MEETUP <= ACCEPTABLE_MEETUP ))
      then
        getmeetup > meetup.rss ||
          echo "Error getting meetup events" 1>&2
      fi

      # shellcheck disable=SC2154
      "$rss" ~/.cache/rss < ~/.feeds
    fi

    # Now convert our RSS to Maildir
    # shellcheck disable=SC2154
    "$convert"

    echo "Cleaning up old news" 1>&2
    # shellcheck disable=SC2154
    "$cleanUp"

    # Re-index (after stopping any existing instance, e.g. the server for mu4e)
    pkill -2 -u "$UID" mu
    sleep 1
    mu index --maildir="$HOME/Mail" --lazy-check
  '';
}