# warbo-utilities: 488168a1fcb7f888d062305a7adcd5884b24b0e1
{ bash, coreutils, feed2maildirsimple, libxslt, mkBin, mu, openssl, procps
, python3, raw, scripts, wget, wrap, writeScript, xidel, xmlstarlet }:
with rec {
  cleanUp = wrap {
    name = "clean-up-news";
    paths = [ bash mu xidel ];
    script = ''
      #!${bash}/bin/bash

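      # Interrupt any running 'mu server' (e.g. mu4e's) and wait for it to
      # exit, so the index isn't in use while we remove messages or re-index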
      function stopMu {
        while ps auxww | grep 'mu server' | grep -v grep
        do
          pkill -2 -u "$UID" -f 'mu server'
          sleep 1
        done
      }

      # Gather up the filenames to remove, so we can delete them all at once
      # without having to keep polling mu
      REMOVALS=()

      # Some feeds are high-volume and only interesting for a short time. We
      # clean up their articles 1 month after posting
      for FEED in BBCHeadlines HackerNews XKCD SMBC
      do
        CUTOFF=$(date -d "last month" "+%s")
        while read -r F
        do
          D=$(grep "^Date: " < "$F" | sed -e 's/^Date: //g')
          SECS=$(date -d "$D" "+%s")
          if [[ "$SECS" -lt "$CUTOFF" ]]
          then
            REMOVALS+=("$F")
          fi
        done < <(find "$HOME/Mail/feeds/$FEED/cur" -type f)
      done

      # Some feeds may post content that originated a while ago, e.g. the BBC
      # showing films from many years ago. If these are only available for a
      # short time (like iPlayer posts) then we should delete those whose file
      # modification time (rather than posted date) is older than a month
      for FEED in iPlayerComedy iPlayerFilms iPlayerSciNat
      do
        while read -r F
        do
          REMOVALS+=("$F")
        done < <(find "$HOME/Mail/feeds/$FEED/cur" -type f -mtime +30)
      done

      # Limit these high-volume feeds to roughly 100 messages, flagging at
      # most 20 excess (read) messages per run. Each fetch only includes
      # around the latest 25 posts, so we shouldn't get any dupes creeping in.
      for FEED in RedditHaskell RedditStallmanWasRight \
                  ScienceBulletin BBCHeadlines HackerNews
      do
        while read -r F
        do
          REMOVALS+=("$F")
        done < <(mu find --fields="l" --sortfield='d' --reverse \
                    maildir:/feeds/"$FEED" not flag:unread |
                 tail -n+100 |
                 head -n20)
      done

      stopMu
      for F in "''${REMOVALS[@]}"
      do
        rm "$F"
        mu remove "$F"
      done

      # Delete old BBC news content (since their RSS only has summaries)
      find /tmp/bbcnews-cached -type f -mtime +30 -exec rm {} \;

      # Delete MeetUp events from the past. These may be posted well in
      # advance, so we can't use the file's modification time.
      for F in "$HOME/.cache/meetup"/*
      do
        if ! [[ -s "$F" ]]
        then
          # Delete empty files
          rm -f "$F"
          continue
        fi

        MILLIS=$(xidel - -s -e '//time/@dateTime' < "$F" | head -n1)
        if [[ -z "$MILLIS" ]]
        then
          # Delete undated events
          rm -f "$F"
          continue
        fi

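        # The dateTime attribute appears to be milliseconds since the epoch
        # (hence the division by 1000); keep only events from the last week
        # or the future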
        SECS=$(( MILLIS / 1000 ))
        PAST=$(date -d 'now - 7 days' '+%s')
        (( PAST < SECS )) || rm -f "$F"
        unset MILLIS
        unset SECS
        unset PAST
      done

      # Occasionally (roughly 1 run in 100) do a full index, to clean up old
      # messages
      stopMu
      if [[ "$(( RANDOM % 100 ))" -eq 0 ]]
      then
        mu index --maildir="$HOME/Mail"
      else
        mu index --maildir="$HOME/Mail" --lazy-check
      fi
    '';
  };

  convert = wrap {
    name = "feeds2maildirs";
    paths = [ (python3.withPackages (p: [ feed2maildirsimple ])) ];
    script = ''
      #!/usr/bin/env python
      # coding: utf-8

      import hashlib
      import os
      import random
      import sys

      from feed2maildir.converter import Converter
      from feed2maildir.reader import Reader

      # Write a message to stderr; non-strings are shown via repr()
      def msg(x):
          sys.stderr.write((x if isinstance(x, str) else repr(x)) + '\n')
          sys.stderr.flush()

      home = os.environ['HOME']
      rssDir = home + '/.cache/rss'
      rssFiles = [f for f in os.listdir(rssDir) if f.lower().endswith('.rss')]

      for rssFile in rssFiles:
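          # The feed (and maildir) name is the filename minus its '.rss'
          # extension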
          name = rssFile[:-4]
          maildir = home + '/Mail/feeds/' + name

          try:
              with open(rssDir + '/' + rssFile, 'r') as f:
                  data = f.read()
          except Exception as e:
              msg({
                  'exception' : e,
                  'message'   : 'Failed to read file, skipping',
                  'rssFile'   : rssFile,
              })
              continue

          # Hash the .rss file to a .hash file and see if it's changed
          hashFile = rssDir + '/' + name + '.hash'
          lastHash = None
          try:
              with open(hashFile, 'r') as f:
                  lastHash = f.read().strip()
          except OSError:
              pass  # no previous hash recorded

          hasher = hashlib.md5()
          hasher.update(data.encode('utf-8'))  # md5 needs bytes, not str
          newHash = hasher.hexdigest()

          # Skip most unchanged files; do a few at random to escape erroneous
          # data
          if lastHash == newHash and random.randint(0, len(rssFiles)) > 5:
              msg('Skipping ' + rssFile + ' since its hash has not changed')
              continue

          msg('Converting ' + rssFile + ' to Maildir')
          try:
              reader = Reader(data)
              converter = Converter(maildir, name, strip=True)
              converter.load(reader.feed)
              converter.run()
              with open(hashFile, 'w') as f:
                  f.write(newHash)
          except Exception as e:
              msg({
                  'exception' : e,
                  'message'   : 'Skipping file due to exception in conversion',
                  'rssFile'   : rssFile,
              })
    '';
  };

  fixRss = mkBin {
    name = "fixRss";
    paths = [ xmlstarlet ];
    script = ''
      #!${bash}/bin/bash

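      # Usage: fixRss <feed-name>; reads feed XML on stdin and writes the
      # fixed-up RSS on stdout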
      # Append an author to each item, then remove all but the first author
      # (i.e. add the feed name as author wherever none is given)
      xmlstarlet ed \
        -s //item -t elem -n author \
        -v "$1" \
        -d '//item/author[position() != 1]' |

      # Now that all items have an author, set them all to the feed name (to
      # avoid special characters)
      xmlstarlet ed -u "//author" -v "$1" |

      # Append today as the items' pubDate, then remove all but the first
      # pubDate (i.e. append today as the pubDate, if none is given)
      xmlstarlet ed \
        -s //item -t elem -n pubDate \
        -v "$(date -d "today 00:00" --rfc-2822)" \
        -d '//item/pubDate[position() != 1]' |

      # Append a placeholder description, then remove all but the first
      xmlstarlet ed \
        -s //item -t elem -n description \
        -v "No description given" \
        -d '//item/description[position() != 1]' |

      # Append a placeholder link, then remove all but the first
      xmlstarlet ed \
        -s //item -t elem -n link \
        -v "http://example.com" \
        -d '//item/link[position() != 1]'
    '';
  };

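  # Delete (-d) every character that isn't (-c) printable or a newline, so
  # stray control codes can't confuse the XML tools downstream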
  stripNonAscii = "tr -cd '[:print:]\\n'";

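  # Fetch a URL to stdout, quietly, giving up after 20 seconds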
  get = "timeout 20 wget -O- -q --no-check-certificate";

  getRss = mkBin {
    name = "getRss";
    paths = [ coreutils fixRss wget ];
    script = ''
      #!${bash}/bin/bash
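      # Usage: getRss <name> <url>; writes <name>.rss in the current directory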
      ${get} "$2" | ${stripNonAscii} | fixRss "$1" > "$1.rss"
    '';
  };

  getAtom = mkBin {
    name = "getAtom";
    paths = [ coreutils fixRss (libxslt.bin or libxslt) wget ];
    vars = { xsl = raw."atom2rss-exslt.xsl"; };
    script = ''
      #!${bash}/bin/bash
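      # Usage: getAtom <name> <url>; saves the download as <name>.atom, then
      # converts it to <name>.rss via the atom2rss XSLT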
      ${get} "$2" | ${stripNonAscii} > "$1.atom"
      xsltproc "$xsl" "$1.atom" |
        fixRss "$1" > "$1.rss"
    '';
  };

  getYouTube = mkBin {
    name = "getYouTube";
    paths = [ getAtom ];
    script = ''
      #!${bash}/bin/bash
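      # Usage: getYouTube <name> <channel-id>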
      getAtom "$1" "http://www.youtube.com/feeds/videos.xml?channel_id=$2"
    '';
  };

  rss = wrap {
    name = "pull_down_rss";
    paths = [ bash getAtom getRss getYouTube ];
    script = ''
      #!${bash}/bin/bash
      set -e
      [[ -n "$1" ]] || fail "pull_down_rss needs an output directory"
      [[ -e "$1" ]] || fail "Output dir '$1' not found"
      cd "$1"

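      # Each line of stdin should give a tab-separated TYPE, NAME and URL
      # (cut splits on tabs by default); for example, a hypothetical entry:
      #
      #   rss<TAB>HackerNews<TAB>https://news.ycombinator.com/rss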
      while read -r FEED
      do
        TYPE=$(echo "$FEED" | cut -f1)
        NAME=$(echo "$FEED" | cut -f2)
        URL=$(echo "$FEED" | cut -f3)

        echo "Getting feed $NAME" 1>&2
        case "$TYPE" in
          atom)
            getAtom "$NAME" "$URL" || echo "Failed to get $NAME, skipping" 1>&2
            ;;
          iplayer)
            iplayer_feed "$NAME" "$URL" > "$NAME.rss" ||
              echo "Failed to get $NAME, skipping" 1>&2
            ;;
          rss)
            getRss "$NAME" "$URL" || echo "Failed to get $NAME, skipping" 1>&2
            ;;
          youtube)
            getYouTube "$NAME" "$URL" ||
              echo "Failed to get $NAME, skipping" 1>&2
            ;;
          *)
            echo "Can't handle '$FEED', skipping" 1>&2
            ;;
        esac
      done
    '';
  };
};

wrap {
  name = "get-news-start";
  paths = [ bash mu procps ];
  vars = {
    inherit cleanUp convert rss;
    inherit (scripts) sysPing;
  };
  script = ''
    #!${bash}/bin/bash
    set -e

    # Grabs RSS feeds and dumps them in ~/.cache, so all of our news is in one
    # format and one place, ready to merge into our mail
    # shellcheck disable=SC2154
    if "$sysPing" -c 1 google.com
    then
      # Update all of our RSS files
      mkdir -p ~/.cache/rss
      cd ~/.cache/rss || fail "Couldn't cd to ~/.cache/rss"

      bbcnews > BBCHeadlines.rss ||
        echo "Error getting BBC news, skipping" 1>&2

      # Our MeetUp scraper performs a query each time, so limit how often it
      # runs (it does cache event details)
      ACCEPTABLE_MEETUP=$(date -d 'now - 4 hours' '+%s')
      if [[ -e meetup.rss ]]
      then
        LAST_MEETUP=$(date -r meetup.rss '+%s')
      else
        LAST_MEETUP=0
      fi

      if (( LAST_MEETUP <= ACCEPTABLE_MEETUP ))
      then
        getmeetup > meetup.rss ||
          echo "Error getting meetup events" 1>&2
      fi

      # shellcheck disable=SC2154
      "$rss" ~/.cache/rss < ~/.feeds
    fi

    # Now convert our RSS to Maildir
    # shellcheck disable=SC2154
    "$convert"

    echo "Cleaning up old news" 1>&2
    # shellcheck disable=SC2154
    "$cleanUp"

    # Re-index (after stopping any existing instance, e.g. the server for
    # mu4e). pkill may find nothing to kill, which shouldn't trip up 'set -e'.
    pkill -2 -u "$UID" mu || true
    sleep 1
    mu index --maildir="$HOME/Mail" --lazy-check
  '';
}