Recently I had to get some historical data from an RSS feed. The problem is that an RSS feed only exposes a limited number of recent items. Since Google Reader lets you scroll back to earlier items (if there are any), my solution is to use Google Reader as the feed processor.

Actually, I am not the first one to do this. gPowered and GoogleReaderAPI have already made it possible. I extracted the necessary code here and omitted the rest. As usual, it's Python. [download]

""" wuFetcher
	Usage: python wufetcher.py

	Author: Eric You XU, Washington University
	[Free to use for whatever purpose, absolutely NO WARRANTY]
	Kudos to	gPowered:			http://blog.gpowered.net/2007/08/google-reader-api-functions.html
				GoogleReaderAPI:	http://code.google.com/p/pyrfeed/wiki/GoogleReaderAPI
"""

import re
import sys
import urllib
import urllib2

login = 'wufetcher@gmail.com'
password = 'wuFetcher2007'
source = 'wuFetcher'

google_url = 'http://www.google.com'
reader_url = google_url + '/reader'
login_url = 'https://www.google.com/accounts/ClientLogin'
get_feed_url = reader_url + '/atom/feed/'

def get_SID():
	header = {'User-agent': source}
	post_data = urllib.urlencode({'Email': login,
								'Passwd': password,
								'service': 'reader',
								'source': source,
								'continue': google_url})
	# @see GoogleReaderAPI: Identification

	request = urllib2.Request(login_url, post_data, header)
	try:
		f = urllib2.urlopen(request)
		result = f.read()
		print result
	except urllib2.URLError:
		print 'Error logging in'
		sys.exit(1)
	# the ClientLogin response contains SID=..., LSID=... and Auth=... lines;
	# we only need the SID
	return re.search(r'SID=(\S*)', result).group(1)

def get_results(SID, url, number):
	header = {'User-agent': source}
	# authenticate by sending the SID as a Google-wide session cookie
	header['Cookie'] = 'Name=SID;SID=%s;Domain=.google.com;Path=/;Expires=160000000000' % SID
	# the item count goes in the query string as 'n'; no POST body is needed
	request = urllib2.Request(url + '?n=' + str(number), None, header)
	try:
		f = urllib2.urlopen(request)
		return f.read()
	except urllib2.URLError:
		print 'Error getting data from %s' % url
	return None

if __name__ == "__main__":
	sid = get_SID()

	feed_url = "http://feeds.feedburner.com/xumathena"
	# replace this url with the rss feed you want to fetch

	number = 10
	# replace this number with the number of items you want to fetch

	result = get_results(sid, get_feed_url + feed_url.encode('utf-8'), number)
	if result is not None:
		# save the Atom output under the last segment of the feed url
		f = open(feed_url.split('/')[-1], 'w')
		f.write(result)
		f.close()
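
The script dumps raw Atom XML. If you just want the entry titles, a few lines of the standard library will do. Below is a minimal sketch, assuming Python 2.5+ for xml.etree and that the script above has already saved the example feed to a file named xumathena (the last segment of the feed url):

import xml.etree.ElementTree as ET

ATOM = '{http://www.w3.org/2005/Atom}'

# walk the saved Atom file and print each entry's title
tree = ET.parse('xumathena')
for entry in tree.getroot().findall(ATOM + 'entry'):
	title = entry.find(ATOM + 'title')
	if title is not None:
		print title.text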
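
One more note: Google Reader caps how many items a single request returns, so for a long history one request may not be enough. The GoogleReaderAPI wiki describes a continuation mechanism: the Atom output carries a gr:continuation token, which you pass back as the c parameter to get the next page. Here is a sketch along those lines (the parameter and tag names are taken from that wiki, so treat them as assumptions rather than verified API), reusing the module's source, urllib2 and re:

def get_all_results(SID, url, per_page=100, max_pages=10):
	# fetch up to max_pages pages of per_page items each, following
	# the gr:continuation token between requests (see GoogleReaderAPI wiki)
	header = {'User-agent': source,
			'Cookie': 'Name=SID;SID=%s;Domain=.google.com;Path=/;Expires=160000000000' % SID}
	pages = []
	continuation = None
	for _ in range(max_pages):
		full_url = url + '?n=%d' % per_page
		if continuation:
			full_url += '&c=' + continuation
		request = urllib2.Request(full_url, None, header)
		pages.append(urllib2.urlopen(request).read())
		# the token for the next page, if any, appears in the feed itself
		match = re.search(r'<gr:continuation>([^<]+)</gr:continuation>', pages[-1])
		if not match:
			break
		continuation = match.group(1)
	return pages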