Saturday, November 28, 2009

Web-scraping web comics

A template for web-scraping comics, using Piled Higher & Deeper as an example (in Python; if on Windows, you'll need wget from Cygwin):

import urllib2
import re
import os
import sys

url = "http://www.phdcomics.com/comics/archive.php?comicid="

# Loop over the comic IDs given as command-line arguments (start and end)
for id in range(int(sys.argv[1]), int(sys.argv[2])):
    print str(id) + "..."
    try:
        # Fetch the archive page for this comic ID
        page = urllib2.urlopen(url + str(id)).read()
        # Pull the comic image's URL out of the page source
        img = re.search('img src=(http://www.phdcomics.com/comics/archive/.+?.gif)', page).group(1)
        # Hand the download off to wget; -nc skips files already on disk
        os.system('wget -nc ' + img)
    except urllib2.HTTPError:
        print "No page_id " + str(id)
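If you'd rather skip the wget dependency entirely (handy on Windows without Cygwin), here is a minimal Python 3 sketch of the same loop. It assumes the archive URL pattern and .gif image naming are unchanged, and the filename handling is only illustrative:

# Minimal Python 3 sketch of the same scraper, downloading images
# directly instead of shelling out to wget.
import os
import re
import sys
import urllib.error
import urllib.request

URL = "http://www.phdcomics.com/comics/archive.php?comicid="

for comic_id in range(int(sys.argv[1]), int(sys.argv[2])):
    print(comic_id, "...")
    try:
        # Fetch the archive page and decode it (latin-1 never raises)
        page = urllib.request.urlopen(URL + str(comic_id)).read().decode("latin-1")
        # Assumes the page still embeds the comic as a .gif under /comics/archive/
        match = re.search(r'img src=(http://www\.phdcomics\.com/comics/archive/.+?\.gif)', page)
        if match is None:
            print("No image found on page", comic_id)
            continue
        img_url = match.group(1)
        filename = os.path.basename(img_url)
        if os.path.exists(filename):
            continue  # mimic wget -nc: don't re-download existing files
        urllib.request.urlretrieve(img_url, filename)
    except urllib.error.HTTPError:
        print("No page_id", comic_id)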