@@ -32,7 +32,7 @@
 # Connect to the db and create the tables if they don't already exist
 connection = sqlite.connect(dbname)
 cursor = connection.cursor()
-cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (id INTEGER, parent INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256) )')
+cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (crawlid INTEGER, parentid INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256) )')
 cursor.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256))')
 cursor.execute('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TEXT )')
 connection.commit()
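The renamed columns mean any later INSERTs and SELECTs against crawl_index have to say crawlid and parentid rather than id and parent. A minimal sketch of working with the renamed schema, assuming the sqlite binding is Python's standard sqlite3 module and using a throwaway database name (both are assumptions for illustration, not taken from the original code):

import sqlite3 as sqlite

connection = sqlite.connect("example_crawl.db")  # hypothetical db name for illustration
cursor = connection.cursor()
cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (crawlid INTEGER, parentid INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256) )')
# Record one crawled page; a parentid of 0 marks a root page in this sketch.
cursor.execute("INSERT INTO crawl_index VALUES ((?), (?), (?), (?), (?))",
               (1, 0, "http://example.com/", "Example Domain", "example, domain"))
connection.commit()
# Look the page up again by its crawlid.
cursor.execute("SELECT url, title FROM crawl_index WHERE crawlid = (?)", (1,))
row = cursor.fetchone()
connection.close()
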
@@ -69,7 +69,7 @@ def run(self):
6969
7070 # if theres nothing in the que, then set the status to done and exit
7171 if crawling == None :
72- cursor .execute ("INSERT INTO status VALUES ((?), (? ))" , (0 , "datetime('now')" ))
72+ cursor .execute ("INSERT INTO status VALUES ((?), datetime('now' ))" , (0 ,))
7373 connection .commit ()
7474 sys .exit ("Done!" )
7575 # Crawl the link
@@ -95,7 +95,11 @@ def crawl(self, crawling):
         del crawled[:]
         try:
             # Load the link
-            response = urllib2.urlopen(curl)
+            request = urllib2.Request(curl)
+            request.add_header("User-Agent", "PyCrawler")
+            opener = urllib2.build_opener()
+            response = opener.open(request).read()
+
         except:
             # If it doesn't load, skip this url
             return
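The change above replaces a bare urllib2.urlopen call with a Request that carries a User-Agent header, so servers see the crawler identify itself as PyCrawler. The same pattern in isolation, as a sketch (the fetch helper and the example URL are illustrative, not part of the original code):

import urllib2

def fetch(url, agent="PyCrawler"):
    # Build a request that sends a User-Agent header, as in the change above.
    request = urllib2.Request(url)
    request.add_header("User-Agent", agent)
    opener = urllib2.build_opener()
    try:
        # Return the response body as a string, like the crawler's `response`.
        return opener.open(request).read()
    except urllib2.URLError:
        # Mirror the crawler's behaviour of skipping URLs that fail to load.
        return None

page = fetch("http://example.com/")

urllib2 is the Python 2 module the crawler already uses; on Python 3 the equivalent classes live in urllib.request.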