#!/usr/bin/env python
"""
Calcium: A script to seed the CoralCDN with pages from various "new links"
feeds.

Calcium reads these feeds through the FeedTree proxy running on localhost, to
ensure prompt discovery of new links (without placing undue stress on the
webserver hosting the feed).

The idea is that these "new links" will (presumably) show up on these feeds
(and therefore be discovered by Calcium) before they become "hot" (and
consequently unavailable, due to the Digg/Reddit/Slashdot effect).

See also:
 * FeedTree - http://feedtree.net
 * CoralCDN - http://coralcdn.org
"""

import sys
# The search path is extended before the remaining imports run; presumably
# some of the modules below live under lib/ -- confirm against the repo.
sys.path.append('lib')

import shelve
import urllib2
import urlparse

from BeautifulSoup import BeautifulSoup
import feedparser

import coralcache
from extractors import *
from utils import *


class Feed:
    """A feed to crawl, paired with the extractor used on its entry links.

    url            -- address of the feed (fetched through the FeedTree proxy).
    link_extractor -- extractor *class*; it is instantiated once here and the
                      instance's get_link(link) is called for each new entry.
    """

    def __init__(self, url, link_extractor=DefaultExtractor):
        self.url = url
        self.link_extractor = link_extractor()


CRAWL_FEEDS = [
    Feed('http://digg.com/rss/indexdig.xml', link_extractor=DiggExtractor),
    Feed('http://reddit.com/new.rss'),
    Feed('http://del.icio.us/rss/popular/'),
]

# Prefix prepended to a feed URL to fetch it via the local FeedTree proxy.
PROXY_PREFIX = 'http://127.0.0.1:8500/cache/'

# Shelve file remembering which feed links have already been handled.
URL_CACHE_FILE = 'feeds.shelf'


def coral_seed(url):
    """Fetch `url` so that it gets seeded, echoing its coralized form.

    Writes '[<coralcache.coralize(url)>]' to stdout, then calls
    urlfetch(url, 1) and discards the response body.  Always returns True;
    fetch errors propagate to the caller.
    """
    sys.stdout.write('[%s]' % coralcache.coralize(url))
    # NOTE(review): the meaning of the second argument is defined in utils;
    # presumably it routes the fetch through Coral -- confirm.
    urlfetch(url, 1)
    return True


def _feedtree_fetch(url):
    """Return the body of `url` as fetched through the FeedTree proxy."""
    return urlfetch(PROXY_PREFIX + url)


def _seed_feed(feedinfo, links_seen):
    """Fetch one feed and seed every entry link not yet in `links_seen`.

    feedinfo   -- a Feed instance.
    links_seen -- mapping (the shelve) keyed by the raw feed link; a link is
                  recorded only after it has been seeded successfully.

    Exceptions are not handled here; main() decides how to report them.
    """
    sys.stdout.write("Fetching from FeedTree: " + feedinfo.url)
    page = _feedtree_fetch(feedinfo.url)
    sys.stdout.write(" (%d b)\n" % len(page))

    doc = feedparser.parse(page)
    print(" - Items: %d" % len(doc.entries))

    for entry in doc.entries:
        link = str(entry.link)
        if link in links_seen:
            continue
        sys.stdout.write(" + Examining feed URL: " + link)
        coral_url = feedinfo.link_extractor.get_link(link)
        sys.stdout.write("\n => Coralizing new URL: %s " % coral_url)
        coral_seed(coral_url)
        sys.stdout.write(" (OK)\n")
        links_seen[link] = True


def main():
    """Crawl every feed in CRAWL_FEEDS and seed the links not seen before.

    An I/O or HTTP error while processing a feed is reported and the crawl
    moves on to the next feed.  The process exits with status 0 on normal
    completion and 1 if interrupted from the keyboard.  The URL cache is
    closed on every exit path, including unexpected exceptions.
    """
    links_seen = shelve.open(URL_CACHE_FILE, 'c')
    exit_status = 0
    try:
        print("Calcium: loaded %d old URLs" % len(links_seen))
        for feedinfo in CRAWL_FEEDS:
            try:
                _seed_feed(feedinfo, links_seen)
            # HTTPError derives from IOError (via URLError), so it must be
            # tested first or this branch can never be reached.
            except urllib2.HTTPError as err:
                print("\nHTTP exception: " + repr(err))
            except IOError as err:
                print("\nIO exception: " + repr(err))
    except KeyboardInterrupt:
        print("Interrupted...")
        exit_status = 1
    finally:
        # Always flush and release the shelf so recorded links are not lost.
        links_seen.close()
    sys.exit(exit_status)


if __name__ == '__main__':
    main()