
Commit c07b5dd

Restructured the crawler. Works with the new tables, crawls to the specified depth limit, and keeps current and parent ids so the GUI can display the nesting properly.
1 parent 426482e commit c07b5dd
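
The point of keeping current and parent ids is that every crawl_index row records its own id plus the id of the page it was discovered from, so a GUI can rebuild the nesting from that table alone. A rough sketch of how such a tree could be reconstructed, assuming the crawl_index schema added in this commit; build_tree, print_tree and the database filename are illustrative names, not part of the project:

import sqlite3
from collections import defaultdict

def build_tree(db='crawler.db'):
    # Group crawl_index rows by the id of the page they were found on.
    rows = sqlite3.connect(db).execute('SELECT id, parent, url, title FROM crawl_index')
    children = defaultdict(list)
    for rid, parent, url, title in rows:
        children[parent].append((rid, url, title))
    return children

def print_tree(children, parent=0, indent=0):
    # The start url is queued with parent id 0, so 0 is the root of the tree.
    for rid, url, title in children.get(parent, []):
        print('  ' * indent + (title or url))
        print_tree(children, rid, indent + 1)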


PyCrawler.py

Lines changed: 52 additions & 45 deletions
@@ -32,9 +32,10 @@
 connection = sqlite.connect(db)
 cursor = connection.cursor()
 cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (id INTEGER, parent INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256) )')
-cursor.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, url VARCHAR(256) PRIMARY KEY )')
-cursor.execute('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TIMESTAMP )')
+cursor.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256) PRIMARY KEY )')
+cursor.execute('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TEXT )')
 connection.commit()
+
 """
 # Check for a start point
 if len(sys.argv) < 2:
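
One caveat on the new queue schema: SQLite allows only one PRIMARY KEY per table, so declaring it on both id and url makes the CREATE TABLE fail with an OperationalError. A minimal sketch of the apparent intent (auto-assigned id, no duplicate urls), keeping the committed column order; this is an assumption about intent, not the committed code:

# sketch only -- swaps the second PRIMARY KEY for a UNIQUE constraint so the
# statement is valid SQLite, while keeping id as the auto-assigned row id
cursor.execute('CREATE TABLE IF NOT EXISTS queue '
               '(id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, '
               'url VARCHAR(256) UNIQUE)')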
@@ -50,12 +51,21 @@
 connection.commit()
 except:
 pass
-"""
+"""
+
 # Compile keyword and link regex expressions
 keywordregex = re.compile('<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
 linkregex = re.compile('<a\s*href=[\'|"](.*?)[\'"].*?>')
 crawled = []
 
+# set crawling status and stick starting url into the queue
+cursor.execute("INSERT INTO status VALUES ((?), (?))", (1, "datetime('now')"))
+cursor.execute("INSERT INTO queue VALUES ((?), (?), (?))", (None, 0, 0, staturl))
+connection.commit()
+
+
+# insert starting url into queue
+
 class threader ( threading.Thread ):
 # Main run method to run
 def run(self):
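
Two small issues in the added seeding block: binding the string "datetime('now')" as a parameter stores that literal text in the status table rather than a timestamp, and the queue INSERT supplies four values for three placeholders, which the sqlite bindings reject. A sketch of what the seeding presumably intends, reusing the script's cursor, connection and staturl:

# sketch -- let SQLite evaluate datetime('now') and match placeholders to values
cursor.execute("INSERT INTO status VALUES (?, datetime('now'))", (1,))
cursor.execute("INSERT INTO queue VALUES (?, ?, ?, ?)", (None, 0, 0, staturl))
connection.commit()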
@@ -64,28 +74,42 @@ def run(self):
 # Get the first item from the queue
 cursor.execute("SELECT * FROM queue LIMIT 1")
 crawling = cursor.fetchone()
-crawling = crawling[0]
 # Remove the item from the queue
-cursor.execute("DELETE FROM queue WHERE url = (?)", (crawling, ))
+cursor.execute("DELETE FROM queue WHERE id = (?)", (crawling[0], ))
 connection.commit()
 print crawling
 except KeyError:
 raise StopIteration
+
+# if theres nothing in the que, then set the status to done and exit
+if crawling == None:
+cursor.execute("INSERT INTO status VALUES ((?), (?))", (0, "datetime('now')"))
+connection.commit()
+sys.exit("Done!")
 # Crawl the link
 self.crawl(crawling)
+
 
 def crawl(self, crawling):
+# crawler id
+cid = crawling[0]
+# parent id. 0 if start url
+pid = crawling[1]
+# current depth
+curdepth = crawling[2]
+# crawling url
+curl = crawling[3]
 # Split the link into its sections
-url = urlparse.urlparse(crawling)
+url = urlparse.urlparse(curl)
 try:
 # Add the link to the already crawled list
-crawled.append(crawling)
+crawled.append(curl)
 except MemoryError:
 # If the crawled array is too big, deleted it and start over
 del crawled[:]
 try:
 # Load the link
-response = urllib2.urlopen(crawling)
+response = urllib2.urlopen(curl)
 except:
 # If it doesn't load, kill the function
 return
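
Since cursor.fetchone() returns None when the queue is empty, indexing crawling[0] in the DELETE raises a TypeError before the new empty-queue check runs, and the except KeyError branch does not catch it. A sketch of one ordering that checks for the empty queue first, assuming the (id, parent, depth, url) row layout this commit introduces:

# sketch of the dequeue step inside run()
cursor.execute("SELECT * FROM queue LIMIT 1")
crawling = cursor.fetchone()
if crawling is None:
    # Nothing left to crawl: record the finished status and stop.
    cursor.execute("INSERT INTO status VALUES (?, datetime('now'))", (0,))
    connection.commit()
    sys.exit("Done!")
# Claim the row by its id, then crawl it.
cursor.execute("DELETE FROM queue WHERE id = ?", (crawling[0],))
connection.commit()
self.crawl(crawling)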
@@ -96,26 +120,6 @@ def crawl(self, crawling):
 soup = BeautifulSoup(msg)
 # find the title
 title = soup.find('title' limit=1)
-# Find the title of the page
-#startPos = msg.find('<title>')
-#if startPos != -1:
-# endPos = msg.find('</title>', startPos+7)
-# if endPos != -1:
-# title = msg[startPos+7:endPos]
-# Get the keywords
-
-# Find the meta keywords tag
-#metalist = soup.findall("meta")
-#keywordsmeta = None
-# search through all the meta tags for the keywords tag
-#for meta in metalist:
-# if "keywords" is in the string
-# if meta.find("keywords") != -1:
-# keywordsmeta = meta
-# break
-#if keywordsmeta != None:
-# bs = BeautifulSoup(keywordsmeta)
-# bs.find(text=re.compile('content\=[\'|"].*?[\'|"]'))
 
 keywordlist = keywordregex.findall(msg)
 if len(keywordlist) > 0:
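
As committed, soup.find('title' limit=1) is missing the comma between its arguments, so the line is a syntax error; find() already returns only the first match, and it returns a tag object rather than its text. A sketch of the title lookup under those assumptions, with a purely illustrative fallback to the page url when no title tag is found:

# sketch -- find() returns the first <title> tag or None
title_tag = soup.find('title')
title = title_tag.string if title_tag else curl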
@@ -128,29 +132,32 @@ def crawl(self, crawling):
 keywordlist.replace("'", "\'")
 
 # queue up the links
-queue_links(links)
+queue_links(links, cid, curdepth)
 
 try:
 # Put now crawled link into the db
-cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?) )", (crawling, title, keywordlist))
+cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?), (?), (?) )", (cid, pid, curl, title, keywordlist))
 connection.commit()
 except:
 pass
-def queue_links(links):
-# Read the links and inser them into the queue
-for link in (links.pop(0) for _ in xrange(len(links))):
-if link.startswith('/'):
-link = 'http://' + url[1] + link
-elif link.startswith('#'):
-link = 'http://' + url[1] + url[2] + link
-elif not link.startswith('http'):
-link = 'http://' + url[1] + '/' + link
-if link.decode('utf-8') not in crawled:
-try:
-cursor.execute("INSERT INTO queue VALUES ( (?) )", (link, ))
-connection.commit()
-except:
-continue
+def queue_links(self, links, cid, curdepth):
+if curdepth < crawldepth:
+# Read the links and insert them into the queue
+for link in (links.pop(0) for _ in xrange(len(links))):
+if link.startswith('/'):
+link = 'http://' + url[1] + link
+elif link.startswith('#'):
+link = 'http://' + url[1] + url[2] + link
+elif not link.startswith('http'):
+link = 'http://' + url[1] + '/' + link
+if link.decode('utf-8') not in crawled:
+try:
+cursor.execute("INSERT INTO queue VALUES ( (?), (?), (?), (?) )", (None, cid, curdepth+1, link))
+connection.commit()
+except:
+continue
+else:
+pass
 if __name__ == '__main__':
 # Run main loop
 threader().run()
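
The new queue_links takes a self parameter but is still called as a plain function with three arguments, and it reads url and crawldepth from the enclosing scope. A sketch of the depth-limited enqueue with those dependencies passed in explicitly; the signature is illustrative, crawldepth is assumed to be the module-level depth limit this commit refers to, and cursor, connection and crawled are the script's module-level objects:

def queue_links(links, cid, curdepth, base, crawldepth):
    # sketch -- stop enqueuing once the configured depth limit is reached
    if curdepth >= crawldepth:
        return
    for link in links:
        # Resolve relative links against the page just crawled (base is the
        # urlparse result the script keeps in `url`).
        if link.startswith('/'):
            link = 'http://' + base[1] + link
        elif link.startswith('#'):
            link = 'http://' + base[1] + base[2] + link
        elif not link.startswith('http'):
            link = 'http://' + base[1] + '/' + link
        if link not in crawled:
            try:
                # Child rows record the id of the page they came from and a
                # depth one greater than the current page.
                cursor.execute("INSERT INTO queue VALUES (?, ?, ?, ?)",
                               (None, cid, curdepth + 1, link))
                connection.commit()
            except Exception:
                # Duplicate urls (or other insert errors) are simply skipped.
                continue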
