connection = sqlite.connect(db)
cursor = connection.cursor()
cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (id INTEGER, parent INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256) )')
cursor.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256) UNIQUE )')
cursor.execute('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TEXT )')
connection.commit()

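# The three tables work together: queue holds urls still to be visited (along with
# the id of the page they were found on and their depth), crawl_index stores pages
# that have already been crawled, and status records when a crawl started and stopped.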
3839"""
3940# Check for a start point
4041if len(sys.argv) < 2:
5051 connection.commit()
5152 except:
5253 pass
53- """
54+ """
55+
# Compile keyword and link regex expressions
keywordregex = re.compile('<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
linkregex = re.compile('<a\s*href=[\'|"](.*?)[\'"].*?>')
crawled = []

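# For illustration, the patterns above are aimed at markup of roughly this shape
# (hypothetical snippets, not taken from any real page):
#   keywordregex matches:  <meta name="keywords" content="python, crawler" />
#                          and captures  python, crawler
#   linkregex matches:     <a href="http://example.com/about">About</a>
#                          and captures  http://example.com/about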
# Set the crawling status and stick the starting url into the queue
cursor.execute("INSERT INTO status VALUES ((?), datetime('now'))", (1,))
cursor.execute("INSERT INTO queue VALUES ((?), (?), (?), (?))", (None, 0, 0, staturl))
connection.commit()

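# As a quick illustration, the most recent status row can be read back from the same
# database (for example from another process) with something like:
#   SELECT s, t FROM status ORDER BY t DESC LIMIT 1;
# where s = 1 marks a crawl in progress and s = 0 marks a finished crawl.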
class threader(threading.Thread):
    # Main run method
    def run(self):
        while 1:
            try:
                # Get the first item from the queue
                cursor.execute("SELECT * FROM queue LIMIT 1")
                crawling = cursor.fetchone()
                # If there's nothing left in the queue, set the status to done and exit
                if crawling == None:
                    cursor.execute("INSERT INTO status VALUES ((?), datetime('now'))", (0,))
                    connection.commit()
                    sys.exit("Done!")
                # Remove the item from the queue
                cursor.execute("DELETE FROM queue WHERE id = (?)", (crawling[0], ))
                connection.commit()
                print crawling
            except KeyError:
                raise StopIteration
            # Crawl the link
            self.crawl(crawling)

    def crawl(self, crawling):
        # crawler id
        cid = crawling[0]
        # parent id. 0 if start url
        pid = crawling[1]
        # current depth
        curdepth = crawling[2]
        # crawling url
        curl = crawling[3]
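        # For illustration, a queue row unpacked above might look like
        # (42, 7, 2, 'http://example.com/page') -- hypothetical (id, parent, depth, url)
        # values in the same order as the queue table's columns.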
        # Split the link into its sections
        url = urlparse.urlparse(curl)
        try:
            # Add the link to the already crawled list
            crawled.append(curl)
        except MemoryError:
            # If the crawled array is too big, delete it and start over
            del crawled[:]
        try:
            # Load the link
            response = urllib2.urlopen(curl)
        except:
            # If it doesn't load, kill the function
            return
        # Read the page contents
        msg = response.read()
        soup = BeautifulSoup(msg)
        # find the title
        title = soup.find('title')
        if title != None:
            title = title.string
        keywordlist = keywordregex.findall(msg)
        if len(keywordlist) > 0:
            keywordlist = keywordlist[0]
        else:
            keywordlist = ""
        # Pull all of the page's links out with the link regex
        links = linkregex.findall(msg)
        keywordlist.replace("'", "\'")

        # queue up the links
        self.queue_links(url, links, cid, curdepth)

        try:
            # Put now crawled link into the db
            cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?), (?), (?) )", (cid, pid, curl, title, keywordlist))
            connection.commit()
        except:
            pass
    def queue_links(self, url, links, cid, curdepth):
        if curdepth < crawldepth:
            # Read the links and insert them into the queue
            for link in (links.pop(0) for _ in xrange(len(links))):
                # Turn relative links into absolute ones using the parsed parent url
                # (see the urljoin-based sketch after this class for a more general approach)
                if link.startswith('/'):
                    link = 'http://' + url[1] + link
                elif link.startswith('#'):
                    link = 'http://' + url[1] + url[2] + link
                elif not link.startswith('http'):
                    link = 'http://' + url[1] + '/' + link
                if link.decode('utf-8') not in crawled:
                    try:
                        cursor.execute("INSERT INTO queue VALUES ( (?), (?), (?), (?) )", (None, cid, curdepth + 1, link))
                        connection.commit()
                    except:
                        continue
        else:
            pass
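# The manual prefixing in queue_links only covers a few relative-link shapes. As an
# illustrative sketch (not called by the crawler), the same resolution can be done
# with urlparse.urljoin, which also copes with '..' segments and scheme-relative
# links; `base` here would be the url of the page the link was found on, e.g. curl.
def resolve_link(base, link):
    # urljoin returns `link` unchanged when it is already an absolute url
    return urlparse.urljoin(base, link)

# Example: resolve_link('http://example.com/docs/index.html', '../about') gives
# 'http://example.com/about' (hypothetical urls, shown for illustration only).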
if __name__ == '__main__':
    # Run main loop
    threader().run()
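# A rough way to try the script (assuming the module-level settings defined earlier
# in the file -- the sqlite path `db`, the start url `staturl` and `crawldepth` --
# point at real values, and that this file is saved as, say, PyCrawler.py):
#   $ python PyCrawler.py
# Crawled pages can then be inspected straight from the database, for example:
#   $ sqlite3 <db file> "SELECT id, parent, url, title FROM crawl_index"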