#!/usr/bin/env python
"""Parse LiveJournal's Atom feed of public posts for 'interesting' articles.
See http://brad.livejournal.com/2143713.html for details.
You can write your own callback handler; the RegexMatcher is
just an example. All your class needs is a scan() method that
takes a dict as an argument. The dict keys are
title, content, link, time
The title and content can be in UTF-8; the time is a unix
timestamp.
- we ignore stuff until we come across an <entry> tag
- store the entry's link href, title, and content (body)
- when we see an </entry>, we call the callback object's scan()
method
"""
import re
import string
import sys
import time
import urllib
import xml.sax.handler
import xml.sax.xmlreader
__author__ = 'Faried Nawaz -- faried@gmail.com -- http://www.hungry.com/~fn/'
__version__ = '20060302'
__license__ = 'This code is in the public domain.'
class LJDocumentHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects entries from LJ's feed.

    For every <entry> element the handler records the href attribute of
    its <link> and the text of its <title> and <content> in a dict with
    the keys 'link', 'title' and 'content'.  When the entry closes, that
    dict is passed to callbackObject.scan().  Elements seen outside an
    <entry> are ignored.  This handler does not set any other key.
    """

    # Elements whose character data is accumulated while inside an entry.
    _TEXT_ELEMENTS = ('title', 'content')
    # Elements whose end tag stops character accumulation.
    _TRACKED_ELEMENTS = ('link', 'title', 'content')

    def __init__(self, outfile, callbackObject):
        """Set up the handler.

        outfile        -- object with write() and flush(); receives the
                          status messages produced by say().
        callbackObject -- object with a scan(dict) method; called once
                          per completed entry.
        """
        xml.sax.handler.ContentHandler.__init__(self)
        self.outfile = outfile
        self.callbackObject = callbackObject
        self.inEntry = False
        self.currElement = None
        self.contents = self._newEntry()
        # Plain int: Python 2 promotes to long on overflow, and the
        # literal also parses on Python 3.
        self.bytesRead = 0

    def _newEntry(self):
        """Return an empty per-entry dict with every key pre-populated."""
        return {'link': '', 'title': '', 'content': ''}

    def say(self, something):
        """Write something to outfile and flush immediately."""
        self.outfile.write(something)
        self.outfile.flush()

    def startDocument(self):
        """Called by the parser at document start; nothing to do."""
        pass

    def endDocument(self):
        """Called by the parser at document end; log a marker line."""
        self.say('document end\n')

    def startElement(self, name, attrs):
        """Track entry boundaries and begin capturing link/title/content."""
        if name == 'entry':
            self.inEntry = True
            # Start from a fresh dict so nothing from the previous entry
            # leaks into this one, and so a callback that keeps the dict
            # it was given does not see it mutated later.
            self.contents = self._newEntry()
        elif not self.inEntry:
            # <link>/<title>/<content> outside an entry are not ours.
            return
        elif name == 'link':
            self.currElement = 'link'
            # A <link> without href must not abort the whole parse;
            # keep whatever link value the entry already has.
            href = attrs.get('href')
            if href is not None:
                self.contents['link'] = href
        elif name in self._TEXT_ELEMENTS:
            self.currElement = name
            self.contents[name] = ''

    def endElement(self, name):
        """Stop capturing on tracked end tags; dispatch on </entry>."""
        if name == 'entry':
            self.inEntry = False
            # Reset before the callback so the handler state stays
            # consistent even if scan() raises.
            self.currElement = None
            self.callbackObject.scan(self.contents)
        elif self.inEntry and name in self._TRACKED_ELEMENTS:
            self.currElement = None

    def characters(self, chars):
        """Count incoming character data and append it to the open field."""
        self.bytesRead += len(chars)
        if self.currElement:
            self.contents[self.currElement] += chars

    def bytes(self):
        """Report the running total of len(chars) seen by characters().

        NOTE(review): this is the length of the strings delivered by the
        parser, which may be characters rather than raw bytes -- confirm
        if an exact byte count matters.
        """
        self.say('total bytes in this session: %ld\n' % (self.bytesRead))
class RegexMatcher(object):
    """Example callback: report entries that match any labelled regex.

    outfile   -- object with write() and flush(); receives match reports.
    regexdict -- mapping of label -> compiled regular expression.
    """

    def __init__(self, outfile, regexdict):
        self.outfile = outfile
        self.regexs = regexdict

    def say(self, something):
        """Emit something on outfile and flush right away."""
        sink = self.outfile
        sink.write(something)
        sink.flush()

    def scan(self, data):
        """Check one entry against every regex and report each hit.

        data is a dict; this method reads its 'title', 'content' and
        'link' keys.  For each regex the title is tried first and the
        content only if the title did not match, so a regex reports at
        most once per entry.
        """
        for label, pattern in self.regexs.items():
            if pattern.search(data['title']):
                template = '%s title match @ %s: %s\n'
            elif pattern.search(data['content']):
                template = '%s body match @ %s: %s\n'
            else:
                continue
            self.say(template % (label, time.ctime(), data['link']))
def r(reg):
    """Compile reg as a case-insensitive regular expression."""
    compiled = re.compile(reg, flags=re.I)
    return compiled
def main():
"""Fire it up."""
# key is a label, value is a compiled regex
regexs = {
'aq' : r(r"([uo]sama )?b[ie]n lad[ie]n|q(u)?a[ie]da|zawahri|taliban|m[ou]lla(h)? [ou]m[ae]r"),
'wat' : r(r"war (against|on) terror|terrorism|terrorist|jihadi"),
'ir' : r(r"\biran"),
'fn' : r(r"faried|fn@hungry") # ah...vanity
}
regexmatcher = RegexMatcher(sys.stdout, regexs)
handler = LJDocumentHandler(sys.stderr, regexmatcher)
while 1:
parser = None
u = urllib.urlopen('http://danga.com:8081/atom-stream.xml')
# u = urllib.urlopen('http://updates.sixapart.com/atom-stream.xml')
while 1:
line = u.readline()
if line.find('') == 0:
# reset or create the parser
parser = xml.sax.make_parser(['IncrementalParser'])
parser.setContentHandler(handler)
elif line.find('') == 0:
# end of entry
parser = None
elif line.find('