#!/usr/local/bin/python """Downloads threads from groups.google.com. don't distribute this! it needs a rewrite and more program logic. written with python 2.2.1. you may need at least 2.2.0 to run this. uses standard python modules. """ import sys, os, os.path, time, getopt, re import urllib, urlparse import rfc822 import htmllib, formatter __version__ = '20020519' __appname__ = sys.argv[0][2:] class MyUrlOpener(urllib.FancyURLopener): """Subclassed FancyURLopener to generate a version string.""" def __init__(self, *args): self.version = __appname__ + '/' + __version__ urllib.FancyURLopener.__init__(self, *args) class Parser(htmllib.HTMLParser): """Subclassed HTMLParser that watches for HTML BASE and Anchor tags. The ``alist'' list attribute holds lists of urls and their descriptions. The inherited ``base'' tag holds the HTML BASE tag, if any. All urls in the ``alist'' attribute are corrected against the HTML BASE tag. """ def __init__(self, verbose=0, url=None): htmllib.HTMLParser.__init__(self, formatter.NullFormatter()) self.verbose = verbose self.alist = [] self.temp_anchor = None self.url = url self.page = None if url != None: self.parse(url) def do_base(self, args): htmllib.HTMLParser.do_base(self, args) if self.verbose > 1: for el in args: if el[0] == 'href': print 'saw a base tag with href:', el[1] break def anchor_bgn(self, href, name, type): htmllib.HTMLParser.anchor_bgn(self, href, name, type) self.temp_anchor = href self.save_bgn() def anchor_end(self): htmllib.HTMLParser.anchor_end(self) if self.temp_anchor: # list of lists because tuples suck. 
self.alist.append([self.temp_anchor, self.save_end()]) self.temp_anchor = None def fix_base(self, url): url_parts = map(None, urlparse.urlsplit(url)) base_parts = map(None, urlparse.urlsplit(self.base)) for i in range(0, 5): if base_parts[i] == '': base_parts[i] = url_parts[i] self.base = urlparse.urlunsplit((base_parts[0], base_parts[1], base_parts[2], base_parts[3], base_parts[4])) def fix_alist(self, url): self.alist = [ [ urlparse.urljoin(self.base, link), data ] for link, data in self.alist ] def parse(self, url): if self.verbose > 0: print "getting page %s..." % url, self.url = url try: self.page = urllib.urlretrieve(url) except: sys.stderr.write("\noops! couldn't get " + url + "\n") sys.exit(1); if self.verbose > 0: print "ok." self.parse_file(self.page[0]) def parse_file(self, file): self.feed(open(file).read()) self.close() if self.base: self.fix_base(self.url) else: self.base = self.url self.fix_alist(self.base) if self.verbose > 1: print 'HTML Anchor tags parsed from document: ' for e in self.alist: print e class FrameParser(Parser): """Parse a groups.google.com thread link. This is a specialized class that looks for HTML FRAME tags. The ``frames'' dictionary attribute holds urls to the left and right frames. """ def __init__(self, verbose=0, url=None): Parser.__init__(self, verbose) self.frames = {} self.url = url if url != None: self.parse(url) def start_frame(self, args): name = '' src = '' for element in args: if element[0] == 'name': name = element[1] elif element[0] == 'src': src = element[1] self.frames[name] = src if self.verbose > 1: print "frame %s is %s" % (name, src) def parse(self, url): Parser.parse(self, url) for key in self.frames: self.frames[key] = urlparse.urljoin(self.base, self.frames[key]) if self.verbose > 1: print 'HTML FRAME tags parsed from document:' for key in self.frames: print "%s: %s" % (key, self.frames[key]) class PageParser(Parser): """This class parses a message and filters HTTP Anchor tags against regexs. 
The attribute ``download_fnames'' contains a list of files with articles in them that we want to save off. """ def __init__(self, verbose=0, parsed=None, url=None, folder=None): Parser.__init__(self, verbose) self.verbose = verbose self.view_regex = re.compile('View this article only') self.download_regex = re.compile('Original Format') self.munge_regex = re.compile('.*(selm.*)') self.download_fnames = [] self.folder = folder if parsed == None: self.url = url self.parse(url) else: self.url = parsed.url self.alist = parsed.alist self.extract_urls() # self.parse_file(parsed.page[0]) def parse_file(self, file): Parser.parse_file(self, file) self.extract_urls() def extract_urls(self): new_list = [] # match against 'View' regex for entry in self.alist: if self.view_regex.match(entry[1]): # munge the url to create the 'Original Format' urls. if self.verbose > 1: print "munging %s to" % entry[0], entry_parts = urlparse.urlsplit(entry[0]) tmp = self.munge_regex.match(entry_parts[3]).group(1) + '&output=gplain' entry[0] = urlparse.urlunsplit((entry_parts[0], entry_parts[1], entry_parts[2], tmp, entry_parts[4])) entry[0] = urllib.unquote(entry[0]) if self.verbose > 1: print entry[0] new_list.append(entry) if new_list != []: self.alist = new_list # self.alist now contains links to articles we want to download. for entry in self.alist: if self.verbose > 1: entry[0] = urllib.unquote(entry[0]) print "grabbing article %s..." % entry[0], self.download_fnames.append(urllib.urlretrieve(entry[0])[0]) if self.verbose > 1: print "ok." return None # match against 'Original' regex for entry in self.alist: if self.download_regex.match(entry[1]): entry[0] = urllib.unquote(entry[0]) new_list.append(entry) if new_list != []: self.alist = new_list # this is an article page. load the url # to get a plain-text message. if self.verbose > 1: print "grabbing article %s..." % new_list[0][0] self.download_fnames.append(urllib.urlretrieve(new_list[0][0])[0]) if self.verbose > 1: print "ok." 
return None def save(self): for fname in self.download_fnames: m = Article(self.verbose, fname) self.folder.save(m) if self.verbose > 0: print 'articles saved.' class Article(rfc822.Message): """This class holds the data for a thread message. It's a simpler layer on top of rfc822.Message. """ def __init__(self, verbose=0, fname=None): rfc822.Message.__init__(self, open(fname)) self.verbose = verbose # must be a better way to do this. def write(self, destfd, mbox=0): if self.verbose > 1: print 'writing a message...', if mbox == 1: # need to fake a 'From ' header. # the header contains: Fromenvsenderdate-in-asctime-formatopt-data rp = self.getaddr('Return-Path')[1] s = self.getaddr('Sender')[1] f = self.getaddr('From')[1] if rp != None: envsender = rp elif s != None: envsender = s elif f != None: envsender = f else: envsender = 'XXX' # date needs to be in an asctime format string (exactly 24 chars long) d = self.getheader('Date') if d == None: d = 'Thu, 31 Dec 1969 16:00:00 -0800' dt = time.asctime(rfc822.parsedate(d)) from_header = 'From ' + envsender + ' ' + dt + '\n' destfd.write(from_header) # write the message self.fp.seek(0) data = self.fp.read() destfd.write(data) if mbox == 1: destfd.write('\n') if self.verbose > 1: print 'ok.' 
class Folder:
    """This handles all folder manipulation.

    An output name ending in '/' selects a maildir, anything else an mbox
    file.  The folder is created, or an existing one reused, as soon as the
    object is constructed.  Note that opening a maildir changes the
    process's current directory to it; save() relies on that.
    """

    def __init__(self, verbose=0, outname='mbox'):
        """Open (creating if necessary) the folder named outname.

        Exits the program with status 1 when the folder cannot be created
        or written to.
        """
        self.verbose = verbose
        self.outname = outname
        self.fd = None
        self.pid = os.getpid()
        # Number of maildir deliveries attempted so far; part of the unique
        # file name, so two messages saved in the same second cannot clash.
        self.deliveries = 0
        if outname.endswith('/'):
            self.outtype = 'maildir'
        else:
            self.outtype = 'mbox'
        # The host name becomes part of maildir file names, where '/' would
        # be taken as a directory separator.
        self.hostname = os.environ.get('HOSTNAME', 'localhost').replace(
            '/', '\\057')
        self.create_or_reuse_folder()

    def create_or_reuse_folder(self):
        """Dispatch to the setup routine for this folder type."""
        if self.outtype == 'maildir':
            self.use_maildir()
        else:
            self.use_mbox()

    def close(self):
        """Close the mbox file; a maildir has nothing to close."""
        if self.verbose > 1:
            print('closing folder %s' % self.outname)
        if self.outtype == 'mbox' and self.fd is not None:
            self.fd.close()
            self.fd = None

    def use_maildir(self):
        """Create or reuse the maildir and change directory into it.

        The new/, cur/ and tmp/ subdirectories are created when missing,
        also in a directory that already existed.
        """
        existed = os.path.exists(self.outname)
        if existed:
            if self.verbose > 0:
                print('folder %s already exists; reusing it...' % self.outname)
            if not os.access(self.outname, os.W_OK):
                sys.stderr.write('error: ' + self.outname +
                                 ' is not writable by me.\n')
                sys.exit(1)
        elif self.verbose > 0:
            sys.stdout.write('creating folder %s... ' % self.outname)
        try:
            if not existed:
                os.mkdir(self.outname)
            os.chdir(self.outname)
            for sub in ('new', 'cur', 'tmp'):
                if not os.path.isdir(sub):
                    os.mkdir(sub)
        except OSError:
            # os.mkdir and os.chdir report failure as OSError, not IOError.
            err = sys.exc_info()[1]
            sys.stderr.write('error(%s): %s\n' % (err.errno, err.strerror))
            sys.exit(1)
        if not existed and self.verbose > 0:
            print('ok.')

    def use_mbox(self):
        """Open the mbox file for appending, creating it if necessary."""
        if os.path.exists(self.outname):
            if self.verbose > 0:
                sys.stdout.write('folder %s already exists; appending to it... '
                                 % self.outname)
            if not os.access(self.outname, os.W_OK):
                sys.stderr.write('error: ' + self.outname +
                                 ' is not writable by me.\n')
                sys.exit(1)
        elif self.verbose > 0:
            sys.stdout.write('creating folder %s... ' % self.outname)
        try:
            self.fd = open(self.outname, 'a')
        except IOError:
            err = sys.exc_info()[1]
            sys.stderr.write('error(%s): %s\n' % (err.errno, err.strerror))
            sys.exit(1)
        if self.verbose > 0:
            print('ok.')

    def save(self, message):
        """Store one message in the folder.

        message must provide write(fileobj, mbox=0).  For a maildir the
        message is written to a new file in tmp/ and then linked into new/;
        for an mbox it is appended with a 'From ' header line.
        """
        if self.outtype != 'maildir':
            message.write(self.fd, mbox=1)
            return
        # Pick a name of the form time.unique.host that is free in both
        # tmp/ and new/.
        while 1:
            self.deliveries += 1
            fname = '%d.P%dQ%d.%s' % (time.time(), self.pid,
                                      self.deliveries, self.hostname)
            tmp_path = os.path.join('tmp', fname)
            new_path = os.path.join('new', fname)
            if not (os.path.exists(tmp_path) or os.path.exists(new_path)):
                break
        out = open(tmp_path, 'w')
        try:
            message.write(out)
        finally:
            out.close()
        # Link first, unlink second: the message is never visible in new/
        # before it is complete, and never lost in between.
        os.link(tmp_path, new_path)
        os.unlink(tmp_path)


def usage():
    """Print the command line synopsis to stderr and exit with status 1."""
    sys.stderr.write('usage: %s [-h] [-v] [-e] [-o mbox|maildir/] url\n'
                     % __appname__)
    sys.stderr.write('usage: %s [--help] [--verbose] [--extra-verbose] '
                     '[--out mbox|maildir/] url\n' % __appname__)
    sys.stderr.write('the url must be a "view with frames" url or a thread '
                     'with one article.\n')
    sys.exit(1)


def handle_opts(argv):
    """Parse the command line.

    argv is the full argument vector including the program name.  Returns
    (args, verbose, outname): the remaining positional arguments (at least
    one), the verbosity level (0, 1 or 2) and the output folder name.
    Prints the usage text and exits on bad options, on --help, or when no
    url was given.
    """
    verbose = 0
    outname = 'mbox'
    try:
        # 'out=' -- the trailing '=' is what tells getopt that --out takes
        # an argument, like the ':' after 'o' in the short option string.
        opts, args = getopt.getopt(
            argv[1:], 'hveo:',
            ['help', 'verbose', 'extra-verbose', 'out='])
    except getopt.GetoptError:
        sys.stderr.write('error: ' + str(sys.exc_info()[1]) + '\n')
        usage()
    for opt, arg in opts:
        if opt in ('--help', '-h'):
            usage()
        elif opt in ('--verbose', '-v'):
            print('%s version %s' % (__appname__, __version__))
            verbose = 1
        elif opt in ('--extra-verbose', '-e'):
            print('%s version %s' % (__appname__, __version__))
            verbose = 2
        elif opt in ('--out', '-o'):
            outname = arg
    if len(args) == 0:
        usage()
    return (args, verbose, outname)


def main(argv):
    """Download the thread named on the command line into the folder.

    Always ends by raising SystemExit; temporary downloads are removed and
    the folder is closed on the way out, also after an error.
    """
    args, verbose, outname = handle_opts(argv)
    url = args[0]
    urllib._urlopener = MyUrlOpener()
    outfolder = Folder(verbose, outname)
    try:
        # 1. load the url
        top_page = FrameParser(verbose, url)
        # If the page has no 'left' frame, the thread has one message.
        # We parse the message for the 'Original Format' link, load that
        # url, save it, and exit.
        if 'left' not in top_page.frames:
            # Strip non-article urls: keep links 7 through 12 of the page.
            # NOTE(review): the offsets presumably match the site's page
            # layout at the time of writing -- confirm before relying on it.
            top_page.alist = top_page.alist[6:12]
            p = PageParser(verbose, parsed=top_page, folder=outfolder)
            p.save()
        else:
            url = top_page.frames['left']
            # NOTE(review): reset() only resets the tokenizer; the links and
            # BASE collected from the frameset page stay in top_page, and
            # the slice below counts them too.
            top_page.reset()
            top_page.parse(url)
            top_page.alist = top_page.alist[6:]
            number = len(top_page.alist)
            if verbose > 0:
                print("%d articles to grab." % number)
            # Only every tenth link is loaded; PageParser then fetches
            # every article linked from that page.
            for count in range(0, number, 10):
                if verbose > 0:
                    print("loading articles %d onwards..." % (count + 1))
                if verbose > 1:
                    print("loading %s" % top_page.alist[count][0])
                p = PageParser(verbose, url=top_page.alist[count][0],
                               folder=outfolder)
                p.save()
                del p
    finally:
        urllib.urlcleanup()
        outfolder.close()
    sys.exit(0)


if __name__ == '__main__':
    main(sys.argv)