Creating a Wikipedia watchlist RSS feed with Python and Twill

Wikipedia doesn’t offer an RSS feed for the watchlist, so I created one with Python and Twill. You can run it with: wikipedia_rss.py username password

It’s my first Python script. I had trouble getting it to work from my crontab: I kept getting “ImportError: No module named …”. It turned out that cron was running a different Python version than the one /usr/bin/env python resolves to in my shell. Running which python from the command line showed me that my working version could be found at /opt/local/bin/python, which is why the script’s first line points there.

#!/opt/local/bin/python

from twill.commands import go, follow, showforms, fv, submit, find, code, show, save_html
import twill, xml.dom.minidom, sys, string, datetime
 
# Both credentials are required positional command-line arguments; without
# them the script prints a usage hint and exits with status 1.
if len(sys.argv) < 3:
    print("Please supply username password")
    sys.exit(1)
username = sys.argv[1]
password = sys.argv[2]

# Watchlist page that is fetched after logging in.
remote_html = "http://en.wikipedia.org/wiki/Special:Watchlist"
# Local file the fetched watchlist page is saved to before it is parsed.
temp_html = "/Users/petrik/Scripts/tmp/wikipedia.html"
# Path the generated feed is written to at the end of the script.
temp_rss = "/Library/WebServer/Documents/crap/wikipedia.rss"
# Text used for the feed's "title" tag in createRss.
rss_title = "Wikipedia watchlist"
# Used for the feed's "link" tag and as the prefix handleLink puts in front
# of each item's href.
rss_link = "http://en.wikipedia.org"


#main methods
def loginWikiPedia(username, password):
    """Log in to the English Wikipedia through the twill commands.

    Opens the Special:Userlogin page, fills the wpName and wpPassword
    fields of form "1" and submits it via the "wpLoginattempt" control.

    username -- value for the wpName field (also echoed to stdout)
    password -- value for the wpPassword field
    """
    login_form = "1"
    print("Logging in with username " + username)
    # NOTE(review): the login page is requested over plain http -- confirm
    # whether the credentials should go over https instead.
    go("http://en.wikipedia.org/w/index.php?title=Special:Userlogin")
    for field_name, field_value in (("wpName", username),
                                    ("wpPassword", password)):
        fv(login_form, field_name, field_value)
    submit("wpLoginattempt")

def getSavedHtmlPage(filename):
    """Parse the saved page at *filename* and return it as a minidom Document.

    filename -- path of the file to parse; it has to be well-formed XML,
                otherwise the underlying expat parser raises an error.
    """
    # Let minidom open the path itself: it closes the file when parsing is
    # done, so no file handle is leaked.
    return xml.dom.minidom.parse(filename)

def saveHtmlPage(html_page, filename):
    """Open *html_page* with twill's go() and store it with save_html().

    html_page -- URL handed to go()
    filename  -- path handed to save_html()
    """
    go(html_page)
    save_html(filename)

#rss utils            
def createRss(html):
    """Build the RSS 2.0 document for a parsed watchlist page.

    html -- DOM document of the watchlist page. Its <title> text becomes the
            channel description and its <li> elements are handed to
            handleList to produce the items.

    Returns the complete feed as one string in which every "&" has been
    escaped as "&amp;".
    """
    # The timestamp is labelled +0000, so it has to be taken in UTC; the
    # clock is spelled out instead of using the locale-dependent %X.
    pub_date = datetime.datetime.utcnow().strftime(
        '%a, %d %b %Y %H:%M:%S +0000')

    r = '<?xml version="1.0"?>\n'
    r += '<rss version="2.0">\n'
    r += "<channel>\n"
    r += handleTag(rss_link, "link")
    r += handleTag(rss_title, "title")
    r += handleTag(getTitle(html), "description")
    r += handleTag("en", "language")
    r += handleTag(pub_date, "pubDate")
    r += handleList(html.getElementsByTagName("li"))
    r += "</channel>\n"
    r += "</rss>\n"
    # Text and attribute values read from the DOM come back with entities
    # decoded, so a literal "&" must be escaped again for the output to be
    # well-formed XML.
    return r.replace('&', '&amp;')

def handleTag(content, tag):
    """Return *content* wrapped in a <tag>...</tag> element plus a newline.

    content -- value placed between the tags; converted with str()
    tag     -- element name, without angle brackets
    """
    # The closing tag is required: an element that is only opened makes the
    # whole feed malformed XML.
    return "<" + tag + ">" + str(content) + "</" + tag + ">\n"

def handleLink(node):
    """Return the link for a watchlist entry.

    The href attribute of the entry's second child (childNodes[1]) is
    converted with str() and appended to rss_link.
    """
    link_node = node.childNodes[1]
    href = getAttribute(link_node, "href")
    return rss_link + str(href)
        
def getTitle(html):
    """Return the text of the first <title> element found in *html*."""
    title_element = html.getElementsByTagName("title")[0]
    return getText(title_element.childNodes)
    
def handleList(listItems):
    """Render at most ten list items as RSS <item> elements.

    listItems -- iterable of DOM nodes; nodes with fewer than two child
                 nodes are skipped and do not count towards the limit.

    Returns the concatenated <item> markup, or "" when nothing qualifies.
    """
    max_items = 10
    items = []
    for li in listItems:
        if len(items) >= max_items:
            break
        # Plain comparison on the child count. Combining the two tests with
        # the bitwise "&" is wrong here: "&" binds tighter than "<" and ">",
        # which turns the check into i < (10 & length) > 1.
        if li.childNodes.length > 1:
            items.append(handleTag(handleItem(li), "item"))
    return "".join(items)

def handleItem(item):
    """Return the title, link and description tags for one list item.

    The title is the text of the child at index 5, or at the index
    filterContent returns instead when that child's text is "m". The
    description is built from the children after the title node.
    """
    title_index = filterContent(5, item, "m")
    title_text = getText(item.childNodes[title_index].childNodes)
    parts = [
        handleTag(title_text, "title"),
        handleTag(handleLink(item), "link"),
        handleTag(handleDescription(item, title_index), "description"),
    ]
    return "".join(parts)

def handleDescription(item, contentIndex):
    """Join the text of every child of *item* located after *contentIndex*.

    Each piece is the direct text of one child node, converted with str()
    and prefixed with a single space.
    """
    pieces = [
        " " + str(getText(child.childNodes))
        for position, child in enumerate(item.childNodes)
        if position > contentIndex
    ]
    return "".join(pieces)

def filterContent(index, item, pattern):
    """Return *index*, moved forward by two when that child's text is *pattern*.

    index   -- position in item.childNodes to inspect
    item    -- DOM node whose children are examined
    pattern -- exact text that marks the child at *index* as one to skip
    """
    candidate_text = getText(item.childNodes[index].childNodes)
    if candidate_text != pattern:
        return index
    return index + 2

#xml utils
def getText(nodelist):
    """Return the concatenated data of the text nodes in *nodelist*.

    Only nodes whose nodeType is TEXT_NODE contribute; other nodes are
    ignored and are not descended into.
    """
    return "".join(
        node.data for node in nodelist if node.nodeType == node.TEXT_NODE
    )

def getAttribute(node, attribute):
    """Return the value of *attribute* on *node*, or None if it is not set."""
    attr_node = node.attributes.get(attribute)
    if attr_node is None:
        return None
    return attr_node.nodeValue


# Script body: log in, save the watchlist page locally, convert it to RSS,
# print the feed and write it to temp_rss.
loginWikiPedia(username, password)
saveHtmlPage(remote_html, temp_html)
rss = createRss(getSavedHtmlPage(temp_html))
print(rss)

# The with-block closes the file once the feed is written. A bare "f.close"
# without parentheses only looks the method up and closes nothing.
with open(temp_rss, 'w') as f:
    f.write(rss)
print("saved rss to " + temp_rss)
After running the script, the RSS feed can be found at whatever path you defined as temp_rss.

admin