Recently I had to get some historical data from an RSS feed. It seems that an RSS feed can only output a limited number of recent items. Since in Google Reader we can always scroll back to the previous item (if there is one), my solution here is to use Google Reader as the feed processor.

Actually, I am not the first one to do this: gPowered and GoogleReaderAPI have already made it possible. I extracted the necessary code here and omitted the other lines. As usual, it's Python. [download]

""" wuFetcher
	Usage: python

	Author: Eric You XU, Washington University
	[Free to use for whatever purpose, absolutely NO WARRANTY]
	Kudos to 	gPowered:

import urllib
import urllib2
import re  

login = ''
password = 'wuFetcher2007'
source = 'wuFetcher'

google_url = ''
reader_url = google_url + '/reader'
login_url = ''
get_feed_url = reader_url + '/atom/feed/'

def get_SID():
	header = {'User-agent' : source}
	post_data = urllib.urlencode({ 'Email': login, \
								'Passwd': password, \
								'service': 'reader', \
								'source': source, \
								'continue': google_url, })
	# @see GoogleReaderAPI: Identification

	request = urllib2.Request(login_url, post_data, header)
	try :
		f = urllib2.urlopen(request)
		result =
		print result
		print 'Error logging in'
	return'SID=(\S*)', result).group(1)

def get_results(SID, url, number):
	"""Fetch up to `number` items from the Reader feed endpoint at `url`.

	SID is the session token from get_SID(), passed back to Google as a
	cookie.  Returns the raw Atom response body as a string, or None if
	the HTTP request fails (an error message is printed).
	"""
	header = {'User-agent' : source}
	# Authenticate the request with the SID cookie obtained at login.
	header['Cookie']='Name=SID;SID=%s;;Path=/;Expires=160000000000' % SID
	# The item count is passed as the 'n' query parameter on the URL.
	request = urllib2.Request(url+'?n='+str(number), None, header)
	try :
		f = urllib2.urlopen( request )
		return f.read()
	except urllib2.URLError:
		print 'Error getting data from %s' % url
	return None

if __name__ == "__main__":
	sid= get_SID()
	feed_url= ""
	# replace this url with the rss feed you want to fetch

	number = 10
	# replace this number with number of items you want to fetch

	result = get_results(sid, get_feed_url+feed_url.encode('utf-8'), number)
	f = open(feed_url.split('/')[-1], 'w')