Creating a Wikipedia watchlist RSS feed with Python and Twill
Wikipedia doesn’t offer an RSS feed for your watchlist, so I created one with Python and Twill. You can run it with wikipedia_rss.py username password.
It’s my first Python script. I had trouble getting it to work in my crontab: I kept getting an “ImportError: No module named …” error. It turned out that cron was running a different Python version than the one /usr/bin/env python resolves to in my shell. Running which python from the command line showed me that my working version could be found at /opt/local/bin/python, so that is the interpreter the script’s first line points at.
#!/opt/local/bin/python
from twill.commands import go, follow, showforms, fv, submit, find, code, show, save_html
import twill, xml.dom.minidom, sys, string, datetime
# Both credentials are required positional arguments: <username> <password>.
if len(sys.argv) < 3:
    print("Please supply username password")
    sys.exit(1)
username, password = sys.argv[1], sys.argv[2]

# Watchlist page that is fetched after logging in.
remote_html = "http://en.wikipedia.org/wiki/Special:Watchlist"
# Local snapshot of that page, re-read and parsed to build the feed.
temp_html = "/Users/petrik/Scripts/tmp/wikipedia.html"
# Destination file of the generated feed.
temp_rss = "/Library/WebServer/Documents/crap/wikipedia.rss"
# Channel-level metadata; rss_link is also the prefix put in front of each
# item's href by handleLink.
rss_title = "Wikipedia watchlist"
rss_link = "http://en.wikipedia.org"
#main methods
def loginWikiPedia(username, password):
    """Log the twill browser session in to the English Wikipedia.

    Opens the Special:Userlogin page, fills the ``wpName`` and
    ``wpPassword`` fields of form "1" and submits through the
    ``wpLoginattempt`` button. The username is echoed to stdout.

    NOTE(review): the login URL is plain http, so the password would be
    sent unencrypted -- consider switching to https.
    """
    login_form = "1"
    print("Logging in with username " + username)
    go("http://en.wikipedia.org/w/index.php?title=Special:Userlogin")
    for field, value in (("wpName", username), ("wpPassword", password)):
        fv(login_form, field, value)
    submit("wpLoginattempt")
def getSavedHtmlPage(filename):
    """Parse the saved page at ``filename`` and return a minidom Document.

    The file must be well-formed XML (XHTML); ``xml.dom.minidom`` raises a
    parse error otherwise.
    """
    # Close the handle deterministically instead of leaving it to the
    # garbage collector.
    with open(filename, 'r') as saved_page:
        markup = saved_page.read()
    return xml.dom.minidom.parseString(markup)
def saveHtmlPage(html_page, filename):
    """Fetch ``html_page`` with twill and save the page to ``filename``."""
    # Navigate first: save_html() is given only the target file name, not a URL.
    go(html_page)
    save_html(filename)
#rss utils
def createRss(html):
    """Build an RSS 2.0 document from the parsed watchlist page.

    ``html`` is the DOM document of the saved page. The channel carries the
    configured ``rss_link`` / ``rss_title``, the page's <title> as its
    description, and one <item> per qualifying <li> (see handleList).

    Returns the complete feed as a string.
    """
    # RFC 822 style date. Use UTC so the literal "+0000" offset is true, and
    # explicit %H:%M:%S rather than the locale-dependent %X.
    pub_date = datetime.datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S +0000')

    parts = [
        '<?xml version="1.0"?>\n',
        '<rss version="2.0">\n',
        '<channel>\n',
        handleTag(rss_link, "link"),
        handleTag(rss_title, "title"),
        handleTag(getTitle(html), "description"),
        handleTag("en", "language"),
        handleTag(pub_date, "pubDate"),
        handleList(html.getElementsByTagName("li")),
        '</channel>\n',
        '</rss>\n',
    ]
    feed = "".join(parts)

    # Text taken from the DOM has its entities decoded, so bare ampersands
    # must be re-escaped. Doing it on the whole document is only safe because
    # the markup generated above contains no entities of its own.
    return feed.replace('&', '&amp;')
def handleTag(content, tag):
    """Return ``content`` wrapped in an XML element named ``tag``.

    ``content`` is converted with ``str()``; the result ends with a newline,
    e.g. ``handleTag("en", "language")`` -> ``"<language>en</language>\\n"``.
    """
    # The closing tag needs its "</" opener to produce well-formed XML.
    return "<" + tag + ">" + str(content) + "</" + tag + ">\n"
def handleLink(node):
    """Return the absolute URL for a list item.

    Reads the ``href`` attribute of the item's second child node and
    prefixes it with ``rss_link``.
    NOTE(review): presumably child 1 is always the entry's <a> element --
    confirm against the page markup.
    """
    anchor = node.childNodes[1]
    href = getAttribute(anchor, "href")
    return rss_link + str(href)
def getTitle(html):
    """Return the text of the first <title> element in ``html``."""
    first_title = html.getElementsByTagName("title")[0]
    return getText(first_title.childNodes)
def handleList(listItems, limit=10):
    """Render up to ``limit`` list items as RSS <item> elements.

    Only <li> nodes with more than one child are rendered; the others are
    skipped and do not count towards the limit. Returns the concatenated
    <item> elements, or "" when nothing qualifies.
    """
    rendered = []
    for li in listItems:
        if len(rendered) >= limit:
            break
        # Logical test, not bitwise `&`: `&` binds tighter than the
        # comparisons, which turned the condition into a chained comparison
        # against (10 & length).
        if li.childNodes.length > 1:
            rendered.append(handleTag(handleItem(li), "item"))
    return "".join(rendered)
def handleItem(item):
    """Render the title, link and description elements for one list item.

    The title is read from child 5 of the item, or from child 7 when
    child 5's text is exactly "m" (see filterContent).
    NOTE(review): "m" is presumably the minor-edit marker -- confirm
    against the page markup.
    """
    start = filterContent(5, item, "m")
    title_node = item.childNodes[start]

    pieces = []
    pieces.append(handleTag(getText(title_node.childNodes), "title"))
    pieces.append(handleTag(handleLink(item), "link"))
    pieces.append(handleTag(handleDescription(item, start), "description"))
    return "".join(pieces)
def handleDescription(item, contentIndex):
    """Join the text of every child located after ``contentIndex``.

    Each child contributes a single space followed by the text of its own
    direct text-node children.
    """
    trailing = [
        child
        for position, child in enumerate(item.childNodes)
        if position > contentIndex
    ]
    return "".join(" " + str(getText(child.childNodes)) for child in trailing)
def filterContent(index, item, pattern):
    """Return ``index``, moved two children forward if it points at ``pattern``.

    The text of ``item.childNodes[index]`` is compared with ``pattern``;
    on an exact match ``index + 2`` is returned, otherwise ``index``.
    """
    marker = getText(item.childNodes[index].childNodes)
    return index + 2 if marker == pattern else index
#xml utils
def getText(nodelist):
    """Concatenate the data of the text nodes found directly in ``nodelist``.

    Non-text nodes are ignored; their descendants are not visited.
    """
    return "".join(
        node.data for node in nodelist if node.nodeType == node.TEXT_NODE
    )
def getAttribute(node, attribute):
    """Return the value of ``attribute`` on ``node``, or None if it is absent."""
    attribute_map = node.attributes
    if attribute in attribute_map.keys():
        return attribute_map.get(attribute).nodeValue
    return None
# Script entry: log in, snapshot the watchlist page, convert it to RSS and
# publish the result to temp_rss.
loginWikiPedia(username, password)
saveHtmlPage(remote_html, temp_html)
rss = createRss(getSavedHtmlPage(temp_html))
print(rss)
# `with` guarantees the feed is flushed and closed before success is reported.
with open(temp_rss, 'w') as f:
    f.write(rss)
print("saved rss to " + temp_rss)
After running the script, the RSS feed can be found at whatever path you set temp_rss to.