Use BeautifulSoup
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

from bs4 import BeautifulSoup as BS

# Fetch the raw bytes of the job board. closing() guarantees the connection
# is released even if read() raises.
with closing(urlopen("http://www.python.org/community/jobs/")) as response:
    text = response.read()

# Pass the undecoded bytes to BeautifulSoup so it detects the page's encoding
# itself, instead of forcing a 'gbk' decode with errors ignored (which
# silently drops every byte sequence that is not valid GBK).
# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BS(text, 'html.parser')

# A set eliminates duplicate entries.
jobs = set()

# soup('h2') is shorthand for soup.find_all('h2'): a list of all h2 elements.
for header in soup('h2'):
    # header('a', 'reference') is shorthand for header.find_all('a', 'reference'):
    # a list of the <a> elements inside this header whose CSS class is
    # "reference".
    links = header('a', 'reference')
    if not links:
        continue
    link = links[0]
    jobs.add('%s (%s)' % (link.string, link['href']))

# Print the names in sorted order, ignoring case.
print('\n'.join(sorted(jobs, key=lambda s: s.lower())))