Skip to content

Commit 426482e

Browse files
author
unknown
committed
Updated command line args and sqlite table creation. Added id and parent fields to both queue and crawl_index table. Added status table to keep track of whether the crawler has started and finished.
1 parent e2251a6 commit 426482e

File tree

1 file changed

+23
-6
lines changed

1 file changed

+23
-6
lines changed

PyCrawler.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,32 @@
1212
except ImportError:
1313
print "Continuing without psyco JIT compilation!"
1414

15+
"""
16+
The program should take 3 arguments
17+
1) database file name
18+
2) start url
19+
3) crawl depth
20+
Start out by checking to see if the args are there and
21+
set them to their variables
22+
"""
23+
if len(sys.argv) < 4:
24+
sys.exit("Not enough arguments!")
25+
else:
26+
dbname = sys.argv[1]
27+
starturl = sys.argv[2]
28+
crawldepth = sys.argv[3]
29+
30+
1531
# Connect to the db and create the tables if they don't already exist
16-
connection = sqlite.connect('crawl.db')
32+
# Open the database file named by the first command-line argument (dbname).
connection = sqlite.connect(dbname)
1733
cursor = connection.cursor()
18-
cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index ( url VARCHAR(256) PRIMARY KEY, title VARCHAR(256), keywords VARCHAR(256) )')
19-
cursor.execute('CREATE TABLE IF NOT EXISTS queue ( url VARCHAR(256) PRIMARY KEY )')
34+
cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (id INTEGER, parent INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256) )')
35+
# A SQLite table may declare only one PRIMARY KEY: id is the key, url is kept UNIQUE so the queue still rejects duplicate URLs.
cursor.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, url VARCHAR(256) UNIQUE )')
36+
cursor.execute('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TIMESTAMP )')
2037
connection.commit()
21-
38+
"""
2239
# Check for a start point
23-
if len(argv) < 2:
40+
if len(sys.argv) < 2:
2441
print "No starting point! Checking existing queue"
2542
cursor.execute("SELECT * FROM queue LIMIT 1")
2643
c = cursor.fetchone()
@@ -33,7 +50,7 @@
3350
connection.commit()
3451
except:
3552
pass
36-
53+
"""
3754
# Compile keyword and link regex expressions
3855
# Raw string so that \s reaches the regex engine without relying on an invalid string escape.
keywordregex = re.compile(r'<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
3956
# The opening-quote class accepts only ' or "; a '|' inside [...] is a literal pipe, not alternation.
linkregex = re.compile(r'<a\s*href=[\'"](.*?)[\'"].*?>')

0 commit comments

Comments
 (0)